Index: head/sys/kern/subr_bus.c
===================================================================
--- head/sys/kern/subr_bus.c	(revision 145952)
+++ head/sys/kern/subr_bus.c	(revision 145953)
@@ -1,4009 +1,4010 @@
 /*-
  * Copyright (c) 1997,1998,2003 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bus.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/filio.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 #include <sys/queue.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/bus.h>
 
 #include <machine/stdarg.h>
 
 #include <vm/uma.h>
 
 SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
 SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Used to attach drivers to devclasses.
  */
 typedef struct driverlink *driverlink_t;
 /* One entry in a devclass's list of drivers; wraps a single driver. */
 struct driverlink {
 	kobj_class_t	driver;		/* the driver this entry refers to */
 	TAILQ_ENTRY(driverlink) link;	/* list of drivers in devclass */
 };
 
 /*
  * Forward declarations
  */
 /* Tail-queue head types for lists of devclasses, driver links and devices. */
 typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
 typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
 typedef TAILQ_HEAD(device_list, device) device_list_t;
 
 /*
  * A device class: a named group of devices, indexed by unit number,
  * together with the drivers registered against it.
  */
 struct devclass {
 	TAILQ_ENTRY(devclass) link;	/* membership in the devclasses list */
 	devclass_t	parent;		/* parent in devclass hierarchy */
 	driver_list_t	drivers;     /* bus devclasses store drivers for bus */
 	char		*name;		/* class name */
 	device_t	*devices;	/* array of devices indexed by unit */
 	int		maxunit;	/* size of devices array */
 
 	/* State for this class's sysctl node; see devclass_sysctl_init(). */
 	struct sysctl_ctx_list sysctl_ctx;
 	struct sysctl_oid *sysctl_tree;	/* NULL until the node is created */
 };
 
 /**
  * @brief Implementation of device.
  */
 struct device {
 	/*
 	 * A device is a kernel object. The first field must be the
 	 * current ops table for the object.
 	 */
 	KOBJ_FIELDS;
 
 	/*
 	 * Device hierarchy.
 	 */
 	TAILQ_ENTRY(device)	link;	/**< list of devices in parent */
 	TAILQ_ENTRY(device)	devlink; /**< global device list membership */
 	device_t	parent;		/**< parent of this device  */
 	device_list_t	children;	/**< list of child devices */
 
 	/*
 	 * Details of this device.
 	 */
 	driver_t	*driver;	/**< current driver */
 	devclass_t	devclass;	/**< current device class */
 	int		unit;		/**< current unit number */
 	char*		nameunit;	/**< name+unit e.g. foodev0 */
 	char*		desc;		/**< driver specific description */
 	int		busy;		/**< count of calls to device_busy() */
 	device_state_t	state;		/**< current device state  */
 	u_int32_t	devflags;	/**< api level flags for device_get_flags() */
 	u_short		flags;		/**< internal device flags  */
 /* Bit values for the 'flags' field above. */
 #define	DF_ENABLED	1		/* device should be probed/attached */
 #define	DF_FIXEDCLASS	2		/* devclass specified at create time */
 #define	DF_WILDCARD	4		/* unit was originally wildcard */
 #define	DF_DESCMALLOCED	8		/* description was malloced */
 #define	DF_QUIET	16		/* don't print verbose attach message */
 #define	DF_DONENOMATCH	32		/* don't execute DEVICE_NOMATCH again */
 #define	DF_EXTERNALSOFTC 64		/* softc not allocated by us */
 #define	DF_REBID	128		/* Can rebid after attach */
 	u_char	order;			/**< order from device_add_child_ordered() */
 	u_char	pad;			/**< presumably explicit padding -- confirm */
 	void	*ivars;			/**< instance variables  */
 	void	*softc;			/**< current driver's variables  */
 
 	struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables  */
 	struct sysctl_oid *sysctl_tree;	/**< state for sysctl variables */
 };
 
 /* Malloc types used by the bus code; M_BUS_SC is the one described as softc. */
 static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
 static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
 
 #ifdef BUS_DEBUG
 
 /* Run-time switch for the PDEBUG() trace output; on by default. */
 static int bus_debug = 1;
 TUNABLE_INT("bus.debug", &bus_debug);
 SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
     "Debug bus code");
 
 /*
  * PDEBUG takes a parenthesised printf argument list, e.g.
  * PDEBUG(("looking for %s", name)), and prefixes the output with the
  * calling function and line.  NOTE(review): it expands to a bare
  * 'if (...) { ... }', so it is not safe as the body of an unbraced
  * if/else.
  */
 #define PDEBUG(a)	if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
 /* NULL-tolerant name helpers for use inside PDEBUG() arguments. */
 #define DEVICENAME(d)	((d)? device_get_name(d): "no device")
 #define DRIVERNAME(d)	((d)? d->name : "no driver")
 #define DEVCLANAME(d)	((d)? d->name : "no devclass")
 
 /**
  * Produce the indenting, indent*2 spaces plus a '.' ahead of that to
  * prevent syslog from deleting initial spaces
  */
 #define indentprintf(p)	do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf("  "); printf p ; } while (0)
 
 /* Debug dump routines; only declared when BUS_DEBUG is defined. */
 static void print_device_short(device_t dev, int indent);
 static void print_device(device_t dev, int indent);
 void print_device_tree_short(device_t dev, int indent);
 void print_device_tree(device_t dev, int indent);
 static void print_driver_short(driver_t *driver, int indent);
 static void print_driver(driver_t *driver, int indent);
 static void print_driver_list(driver_list_t drivers, int indent);
 static void print_devclass_short(devclass_t dc, int indent);
 static void print_devclass(devclass_t dc, int indent);
 void print_devclass_list_short(void);
 void print_devclass_list(void);
 
 #else
 /* Make the compiler ignore the function calls */
 #define PDEBUG(a)			/* nop */
 #define DEVICENAME(d)			/* nop */
 #define DRIVERNAME(d)			/* nop */
 #define DEVCLANAME(d)			/* nop */
 
 /* Without BUS_DEBUG the dump routines expand to nothing. */
 #define print_device_short(d,i)		/* nop */
 #define print_device(d,i)		/* nop */
 #define print_device_tree_short(d,i)	/* nop */
 #define print_device_tree(d,i)		/* nop */
 #define print_driver_short(d,i)		/* nop */
 #define print_driver(d,i)		/* nop */
 #define print_driver_list(d,i)		/* nop */
 #define print_devclass_short(d,i)	/* nop */
 #define print_devclass(d,i)		/* nop */
 #define print_devclass_list_short()	/* nop */
 #define print_devclass_list()		/* nop */
 #endif
 
 /*
  * dev sysctl tree
  */
 
 /* Selector values passed as arg2 to devclass_sysctl_handler(). */
 enum {
 	DEVCLASS_SYSCTL_PARENT,		/* report the parent class name */
 };
 
 static int
 devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	devclass_t dc = (devclass_t)arg1;
 	const char *value;
 
 	switch (arg2) {
 	case DEVCLASS_SYSCTL_PARENT:
 		value = dc->parent ? dc->parent->name : "";
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (SYSCTL_OUT(req, value, strlen(value)));
 }
 
 /*
  * Create the sysctl node for a devclass under the static "dev" tree,
  * along with its "%parent" leaf.  Does nothing if the node already
  * exists.
  */
 static void
 devclass_sysctl_init(devclass_t dc)
 {
 	struct sysctl_ctx_list *ctx;
 
 	if (dc->sysctl_tree != NULL)
 		return;		/* already initialised */
 
 	ctx = &dc->sysctl_ctx;
 	sysctl_ctx_init(ctx);
 	dc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_dev),
 	    OID_AUTO, dc->name, CTLFLAG_RD, 0, "");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
 	    "%parent", CTLFLAG_RD, dc, DEVCLASS_SYSCTL_PARENT,
 	    devclass_sysctl_handler, "A", "parent class");
 }
 
 /* Selector values passed as arg2 to device_sysctl_handler(). */
 enum {
 	DEVICE_SYSCTL_DESC,		/* device description string */
 	DEVICE_SYSCTL_DRIVER,		/* name of the current driver */
 	DEVICE_SYSCTL_LOCATION,		/* bus_child_location_str() output */
 	DEVICE_SYSCTL_PNPINFO,		/* bus_child_pnpinfo_str() output */
 	DEVICE_SYSCTL_PARENT,		/* nameunit of the parent device */
 };
 
 /*
  * Sysctl handler for the per-device string nodes.  arg1 carries the
  * device and arg2 selects which attribute to report; an unknown
  * selector yields EINVAL.  The location and pnpinfo strings are
  * produced by the bus into a temporary zeroed buffer that is released
  * once the value has been copied out.
  */
 static int
 device_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	enum { STRBUF_LEN = 1024 };
 	device_t dev = (device_t)arg1;
 	const char *str;
 	char *tmp = NULL;
 	int rv;
 
 	switch (arg2) {
 	case DEVICE_SYSCTL_DESC:
 		str = (dev->desc != NULL) ? dev->desc : "";
 		break;
 	case DEVICE_SYSCTL_DRIVER:
 		str = (dev->driver != NULL) ? dev->driver->name : "";
 		break;
 	case DEVICE_SYSCTL_LOCATION:
 		tmp = malloc(STRBUF_LEN, M_BUS, M_WAITOK | M_ZERO);
 		str = tmp;
 		bus_child_location_str(dev, tmp, STRBUF_LEN);
 		break;
 	case DEVICE_SYSCTL_PNPINFO:
 		tmp = malloc(STRBUF_LEN, M_BUS, M_WAITOK | M_ZERO);
 		str = tmp;
 		bus_child_pnpinfo_str(dev, tmp, STRBUF_LEN);
 		break;
 	case DEVICE_SYSCTL_PARENT:
 		str = (dev->parent != NULL) ? dev->parent->nameunit : "";
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	rv = SYSCTL_OUT(req, str, strlen(str));
 	if (tmp != NULL)
 		free(tmp, M_BUS);
 	return (rv);
 }
 
 /*
  * Create the sysctl node for a device beneath its devclass node, and
  * populate it with the %desc, %driver, %location, %pnpinfo and %parent
  * leaves.  The node is named after the part of nameunit that follows
  * the devclass name.  Does nothing if the node already exists.
  */
 static void
 device_sysctl_init(device_t dev)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid_list *kids;
 	devclass_t dc = dev->devclass;
 
 	if (dev->sysctl_tree != NULL)
 		return;
 
 	/* The class node must exist before a child can be hung off it. */
 	devclass_sysctl_init(dc);
 
 	ctx = &dev->sysctl_ctx;
 	sysctl_ctx_init(ctx);
 	dev->sysctl_tree = SYSCTL_ADD_NODE(ctx,
 	    SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
 	    dev->nameunit + strlen(dc->name), CTLFLAG_RD, 0, "");
 
 	kids = SYSCTL_CHILDREN(dev->sysctl_tree);
 	SYSCTL_ADD_PROC(ctx, kids, OID_AUTO, "%desc", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
 	    "device description");
 	SYSCTL_ADD_PROC(ctx, kids, OID_AUTO, "%driver", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
 	    "device driver name");
 	SYSCTL_ADD_PROC(ctx, kids, OID_AUTO, "%location", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
 	    "device location relative to parent");
 	SYSCTL_ADD_PROC(ctx, kids, OID_AUTO, "%pnpinfo", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
 	    "device identification");
 	SYSCTL_ADD_PROC(ctx, kids, OID_AUTO, "%parent", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
 	    "parent device");
 }
 
 /*
  * Tear down the sysctl context created by device_sysctl_init(), if any,
  * and mark the device as having no sysctl node.
  */
 static void
 device_sysctl_fini(device_t dev)
 {
 	if (dev->sysctl_tree != NULL) {
 		sysctl_ctx_free(&dev->sysctl_ctx);
 		dev->sysctl_tree = NULL;
 	}
 }
 
 /*
  * /dev/devctl implementation
  */
 
 /*
  * This design allows only one reader for /dev/devctl.  This is not desirable
  * in the long run, but will get a lot of hair out of this implementation.
  * Maybe we should make this device a clonable device.
  *
  * Also note: we specifically do not attach a device to the device_t tree
  * to avoid potential chicken and egg problems.  One could argue that all
  * of this belongs to the root node.  One could also further argue that the
  * sysctl interface that we have now might more properly be an ioctl
  * interface, but at this stage of the game, I'm not inclined to rock that
  * boat.
  *
  * I'm also not sure that the SIGIO support is done correctly or not, as
  * I copied it from a driver that had SIGIO support that likely hasn't been
  * tested since 3.4 or 2.2.8!
  */
 
 /* Handler backing the hw.bus.devctl_disable sysctl. */
 static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
 /* When non-zero, devaddq() does not queue device events. */
 static int devctl_disable = 0;
 TUNABLE_INT("hw.bus.devctl_disable", &devctl_disable);
 SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     sysctl_devctl_disable, "I", "devctl disable");
 
 /* Entry points of the devctl character device. */
 static d_open_t		devopen;
 static d_close_t	devclose;
 static d_read_t		devread;
 static d_ioctl_t	devioctl;
 static d_poll_t		devpoll;
 
 /* Character device switch for "devctl"; there is no write entry point. */
 static struct cdevsw dev_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	devopen,
 	.d_close =	devclose,
 	.d_read =	devread,
 	.d_ioctl =	devioctl,
 	.d_poll =	devpoll,
 	.d_name =	"devctl",
 };
 
 /* One queued event awaiting delivery through devread(). */
 struct dev_event_info
 {
 	char *dei_data;		/* NUL-terminated event text, M_BUS memory */
 	TAILQ_ENTRY(dev_event_info) dei_link;	/* linkage in devsoftc.devq */
 };
 
 /* Head type for the queue of pending events. */
 TAILQ_HEAD(devq, dev_event_info);
 
 /* State of the single devctl device instance. */
 static struct dev_softc
 {
 	int	inuse;		/* set while the device is open */
 	int	nonblock;	/* reads return EAGAIN instead of sleeping */
 	struct mtx mtx;		/* held around accesses to devq */
 	struct cv cv;		/* broadcast when an event is queued or on close */
 	struct selinfo sel;	/* poll/select bookkeeping */
 	struct devq devq;	/* pending events, oldest first */
 	struct proc *async_proc; /* process sent SIGIO on new events, or NULL */
 } devsoftc;
 
 /* The cdev returned by make_dev() for "devctl". */
 static struct cdev *devctl_dev;
 
 /*
  * Create the "devctl" device node (root:wheel, mode 0600) and set up
  * the mutex, condition variable and event queue in devsoftc.
  */
 static void
 devinit(void)
 {
 	devctl_dev = make_dev(&dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
 	    "devctl");
 	/* NOTE(review): the node is created before the lock is initialised. */
 	mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
 	cv_init(&devsoftc.cv, "dev cv");
 	TAILQ_INIT(&devsoftc.devq);
 }
 
 /*
  * Open handler for the devctl device.  Only one opener is permitted at
  * a time; a second open fails with EBUSY.  A successful open marks the
  * device busy and resets the non-blocking and async state.
  */
 static int
 devopen(struct cdev *dev, int oflags, int devtype, d_thread_t *td)
 {
 	if (devsoftc.inuse != 0)
 		return (EBUSY);
 
 	/* move to init */
 	devsoftc.inuse = 1;
 	devsoftc.nonblock = 0;
 	devsoftc.async_proc = NULL;
 
 	return (0);
 }
 
 /*
  * Close handler for the devctl device.  Marks the device free again,
  * wakes any thread sleeping in devread(), and forgets the process that
  * asked for SIGIO so that devctl_queue_data() does not signal a process
  * that no longer has the device open.
  */
 static int
 devclose(struct cdev *dev, int fflag, int devtype, d_thread_t *td)
 {
 	devsoftc.inuse = 0;
 	mtx_lock(&devsoftc.mtx);
 	cv_broadcast(&devsoftc.cv);
 	mtx_unlock(&devsoftc.mtx);
 	/* Drop the stale SIGIO target; devopen() only resets it on reopen. */
 	devsoftc.async_proc = NULL;
 
 	return (0);
 }
 
 /*
  * The read channel for this device is used to report changes to
  * userland in realtime.  We are required to free the data as well as
  * the n1 object because we allocate them separately.  Also note that
  * we return one record at a time.  If you try to read this device a
  * character at a time, you will loose the rest of the data.  Listening
  * programs are expected to cope.
  */
 static int
 devread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct dev_event_info *n1;
 	int rv;
 
 	mtx_lock(&devsoftc.mtx);
 	while (TAILQ_EMPTY(&devsoftc.devq)) {
 		if (devsoftc.nonblock) {
 			mtx_unlock(&devsoftc.mtx);
 			return (EAGAIN);
 		}
 		rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
 		if (rv) {
 			/*
 			 * Need to translate ERESTART to EINTR here? -- jake
 			 */
 			mtx_unlock(&devsoftc.mtx);
 			return (rv);
 		}
 	}
 	n1 = TAILQ_FIRST(&devsoftc.devq);
 	TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
 	mtx_unlock(&devsoftc.mtx);
 	rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
 	free(n1->dei_data, M_BUS);
 	free(n1, M_BUS);
 	return (rv);
 }
 
 /*
  * Ioctl handler for the devctl device.  FIONBIO switches non-blocking
  * reads on or off, and FIOASYNC records (or clears) the calling process
  * in devsoftc.async_proc.  Every other request, including FIOCLEX,
  * FIONCLEX, FIONREAD, FIOSETOWN and FIOGETOWN, is unsupported and
  * returns ENOTTY.
  */
 static	int
 devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, d_thread_t *td)
 {
 	int enable;
 
 	if (cmd != FIONBIO && cmd != FIOASYNC)
 		return (ENOTTY);
 
 	/* Both supported requests carry an int flag as their argument. */
 	enable = *(int*)data;
 	if (cmd == FIONBIO)
 		devsoftc.nonblock = (enable != 0);
 	else
 		devsoftc.async_proc = (enable != 0) ? td->td_proc : NULL;
 	return (0);
 }
 
 /*
  * Poll handler for the devctl device.  Reports the requested read
  * events as ready when at least one record is queued; otherwise the
  * thread is registered with selrecord() for a later wakeup.
  */
 static	int
 devpoll(struct cdev *dev, int events, d_thread_t *td)
 {
 	const int rdmask = POLLIN | POLLRDNORM;
 	int	ready;
 
 	ready = 0;
 	mtx_lock(&devsoftc.mtx);
 	if ((events & rdmask) != 0) {
 		if (TAILQ_EMPTY(&devsoftc.devq))
 			selrecord(td, &devsoftc.sel);
 		else
 			ready = events & rdmask;
 	}
 	mtx_unlock(&devsoftc.mtx);
 
 	return (ready);
 }
 
 /**
  * @brief Queue data to be read from the devctl device
  *
  * Generic interface to queue data to the devctl device.  It is
  * assumed that @p data is properly formatted.  It is further assumed
  * that @p data is allocated using the M_BUS malloc type.
  */
 void
 devctl_queue_data(char *data)
 {
 	struct dev_event_info *n1 = NULL;
 	struct proc *p;
 
 	n1 = malloc(sizeof(*n1), M_BUS, M_NOWAIT);
 	if (n1 == NULL)
 		return;
 	n1->dei_data = data;
 	mtx_lock(&devsoftc.mtx);
 	TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
 	cv_broadcast(&devsoftc.cv);
 	mtx_unlock(&devsoftc.mtx);
 	selwakeup(&devsoftc.sel);
 	p = devsoftc.async_proc;
 	if (p != NULL) {
 		PROC_LOCK(p);
 		psignal(p, SIGIO);
 		PROC_UNLOCK(p);
 	}
 }
 
 /**
  * @brief Send a 'notification' to userland, using standard ways
  *
  * Formats a record of the form
  * "!system=<system> subsystem=<subsystem> type=<type> <data>\n"
  * into a freshly allocated M_BUS buffer and passes it to
  * devctl_queue_data(), which takes ownership of the buffer.  The
  * notification is silently dropped if any mandatory argument is NULL
  * or if the buffer cannot be allocated.
  *
  * @param system	originating system; must not be NULL
  * @param subsystem	originating subsystem; must not be NULL
  * @param type		event type; must not be NULL
  * @param data		optional extra text, or NULL for none
  */
 void
 devctl_notify(const char *system, const char *subsystem, const char *type,
     const char *data)
 {
 	int len = 0;
 	char *msg;
 
 	if (system == NULL)
 		return;		/* BOGUS!  Must specify system. */
 	if (subsystem == NULL)
 		return;		/* BOGUS!  Must specify subsystem. */
 	if (type == NULL)
 		return;		/* BOGUS!  Must specify type. */
 	/*
 	 * Treat a missing data string as empty.  The buffer is sized for
 	 * zero data bytes in that case, so handing NULL to "%s" would
 	 * print a placeholder that overflows the computed length and
 	 * truncates the record, losing its trailing newline.
 	 */
 	if (data == NULL)
 		data = "";
 	len += strlen(" system=") + strlen(system);
 	len += strlen(" subsystem=") + strlen(subsystem);
 	len += strlen(" type=") + strlen(type);
 	/* add in the data message plus newline. */
 	len += strlen(data);
 	len += 3;	/* '!', '\n', and NUL */
 	msg = malloc(len, M_BUS, M_NOWAIT);
 	if (msg == NULL)
 		return;		/* Drop it on the floor */
 	snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n", system,
 	    subsystem, type, data);
 	devctl_queue_data(msg);
 }
 
 /*
  * Common routine for building and queueing a device event.  The record
  * has the form "<type><what> at <location> <pnpinfo> on <parent>\n",
  * where the location and pnp strings come from the device's bus and
  * the parent is the parent's nameunit, or "." for a device without a
  * parent.
  *
  * The record buffer is allocated here and handed to
  * devctl_queue_data(); it is only freed here when an allocation fails
  * part-way through.  Nothing is sent while devctl is disabled, but
  * records are queued even when nobody is listening, to avoid races
  * relating to startup and restart of listening applications.
  *
  * This is most useful for devices; other consumers should call
  * devctl_queue_data() directly.
  */
 static void
 devaddq(const char *type, const char *what, device_t dev)
 {
 	enum { EVBUF_LEN = 1024 };
 	device_t parent;
 	const char *parstr;
 	char *data, *loc, *pnp;
 
 	if (devctl_disable)
 		return;
 
 	loc = NULL;
 	pnp = NULL;
 	data = malloc(EVBUF_LEN, M_BUS, M_NOWAIT);
 	if (data == NULL)
 		goto bad;
 
 	/* Bus-specific location of this device. */
 	loc = malloc(EVBUF_LEN, M_BUS, M_NOWAIT);
 	if (loc == NULL)
 		goto bad;
 	loc[0] = '\0';
 	bus_child_location_str(dev, loc, EVBUF_LEN);
 
 	/* Bus-specific pnp info of this device. */
 	pnp = malloc(EVBUF_LEN, M_BUS, M_NOWAIT);
 	if (pnp == NULL)
 		goto bad;
 	pnp[0] = '\0';
 	bus_child_pnpinfo_str(dev, pnp, EVBUF_LEN);
 
 	/* Name of the parent, or "." if high enough in the tree. */
 	parent = device_get_parent(dev);
 	parstr = (parent == NULL) ? "." : device_get_nameunit(parent);
 
 	/* String it all together; the scratch buffers are then done with. */
 	snprintf(data, EVBUF_LEN, "%s%s at %s %s on %s\n", type, what, loc,
 	    pnp, parstr);
 	free(loc, M_BUS);
 	free(pnp, M_BUS);
 	devctl_queue_data(data);
 	return;
 
 bad:
 	free(pnp, M_BUS);
 	free(loc, M_BUS);
 	free(data, M_BUS);
 }
 
 /*
  * A device was added to the tree.  We are called just after it
  * successfully attaches (that is, probe and attach succeeded for this
  * device).  No call is made if a device is merely parented into the
  * tree.  See devnomatch if probe fails.  If attach fails, no
  * notification is sent (but maybe we should have a different message
  * for this).
  *
  * Queues a "+" event whose subject is "<nameunit> <pnpinfo>".  If
  * either scratch buffer cannot be allocated the event is dropped.
  */
 static void
 devadded(device_t dev)
 {
 	char *pnp, *tmp;
 
 	tmp = NULL;
 	pnp = malloc(1024, M_BUS, M_NOWAIT);
 	if (pnp != NULL)
 		tmp = malloc(1024, M_BUS, M_NOWAIT);
 
 	if (pnp != NULL && tmp != NULL) {
 		pnp[0] = '\0';
 		bus_child_pnpinfo_str(dev, pnp, 1024);
 		snprintf(tmp, 1024, "%s %s", device_get_nameunit(dev), pnp);
 		devaddq("+", tmp, dev);
 	}
 
 	if (pnp != NULL)
 		free(pnp, M_BUS);
 	if (tmp != NULL)
 		free(tmp, M_BUS);
 }
 
 /*
  * A device was removed from the tree.  We are called just before this
  * happens.
  *
  * Queues a "-" event whose subject is "<nameunit> <pnpinfo>".  If
  * either scratch buffer cannot be allocated the event is dropped.
  */
 static void
 devremoved(device_t dev)
 {
 	char *pnp, *tmp;
 
 	tmp = NULL;
 	pnp = malloc(1024, M_BUS, M_NOWAIT);
 	if (pnp != NULL)
 		tmp = malloc(1024, M_BUS, M_NOWAIT);
 
 	if (pnp != NULL && tmp != NULL) {
 		pnp[0] = '\0';
 		bus_child_pnpinfo_str(dev, pnp, 1024);
 		snprintf(tmp, 1024, "%s %s", device_get_nameunit(dev), pnp);
 		devaddq("-", tmp, dev);
 	}
 
 	if (pnp != NULL)
 		free(pnp, M_BUS);
 	if (tmp != NULL)
 		free(tmp, M_BUS);
 }
 
 /*
  * Called when there's no match for this device.  This is only called
  * the first time that no match happens, so we don't keep getting this
  * message.  Should that prove to be undesirable, we can change it.
  * This is called when all drivers that can attach to a given bus
  * decline to accept this device.  Other errors may not be detected.
  *
  * Queues a "?" event with an empty subject string.
  */
 static void
 devnomatch(device_t dev)
 {
 	devaddq("?", "", dev);
 }
 
 static int
 sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
 {
 	struct dev_event_info *n1;
 	int dis, error;
 
 	dis = devctl_disable;
 	error = sysctl_handle_int(oidp, &dis, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	mtx_lock(&devsoftc.mtx);
 	devctl_disable = dis;
 	if (dis) {
 		while (!TAILQ_EMPTY(&devsoftc.devq)) {
 			n1 = TAILQ_FIRST(&devsoftc.devq);
 			TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
 			free(n1->dei_data, M_BUS);
 			free(n1, M_BUS);
 		}
 	}
 	mtx_unlock(&devsoftc.mtx);
 	return (0);
 }
 
 /* End of /dev/devctl code */
 
 /* List of devices; presumably linked through device.devlink -- confirm. */
 TAILQ_HEAD(,device)	bus_data_devices;
 /* Generation counter; NOTE(review): presumably bumped by bus_data_generation_update(). */
 static int bus_data_generation = 1;
 
 /* Method table containing only the terminating entry. */
 kobj_method_t null_methods[] = {
 	{ 0, 0 }
 };
 
 /* The "null" class, built on the empty method table above. */
 DEFINE_CLASS(null, null_methods, 0);
 
 /*
  * Devclass implementation
  */
 
 /* List of all known devclasses; see devclass_find_internal(). */
 static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
 
 
 /**
  * @internal
  * @brief Find or create a device class
  *
  * Looks up the devclass called @p classname.  If none exists and
  * @p create is non-zero, a new one is allocated (name stored in the
  * same allocation), added to the global list, and the bus data
  * generation is updated.
  *
  * If @p parentname is non-NULL and the resulting devclass has no
  * parent yet, its parent is set to the existing devclass of that name
  * (which may be @c NULL if there is none).
  *
  * @param classname	the devclass name to find or create
  * @param parentname	the parent devclass name or @c NULL
  * @param create	non-zero to create a devclass
  *
  * @returns		the devclass, or @c NULL if @p classname is
  *			@c NULL, the class does not exist and was not
  *			created, or the allocation failed
  */
 static devclass_t
 devclass_find_internal(const char *classname, const char *parentname,
 		       int create)
 {
 	devclass_t dc;
 
 	PDEBUG(("looking for %s", classname));
 	if (classname == NULL)
 		return (NULL);
 
 	TAILQ_FOREACH(dc, &devclasses, link) {
 		if (strcmp(dc->name, classname) == 0)
 			break;
 	}
 
 	if (dc == NULL) {
 		if (!create)
 			return (NULL);
 
 		PDEBUG(("creating %s", classname));
 		dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
 		    M_BUS, M_NOWAIT|M_ZERO);
 		if (dc == NULL)
 			return (NULL);
 		dc->parent = NULL;
 		/* The name lives directly behind the structure. */
 		dc->name = (char*) (dc + 1);
 		strcpy(dc->name, classname);
 		TAILQ_INIT(&dc->drivers);
 		TAILQ_INSERT_TAIL(&devclasses, dc, link);
 
 		bus_data_generation_update();
 	}
 
 	if (parentname != NULL && dc->parent == NULL)
 		dc->parent = devclass_find_internal(parentname, 0, FALSE);
 
 	return (dc);
 }
 
 /**
  * @brief Create a device class
  *
  * If a device class with the name @p classname exists, return it,
  * otherwise create and return a new device class.
  *
  * @param classname	the devclass name to find or create
  */
 devclass_t
 devclass_create(const char *classname)
 {
 	return (devclass_find_internal(classname, 0, TRUE));
 }
 
 /**
  * @brief Find a device class
  *
  * If a device class with the name @p classname exists, return it,
  * otherwise return @c NULL.
  *
  * @param classname	the devclass name to find
  */
 devclass_t
 devclass_find(const char *classname)
 {
 	return (devclass_find_internal(classname, 0, FALSE));
 }
 
 /**
  * @brief Add a device driver to a device class
  *
  * Add a device driver to a devclass. This is normally called
  * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
  * all devices in the devclass will be called to allow them to attempt
  * to re-probe any unmatched children.
  *
  * @param dc		the devclass to edit
  * @param driver	the driver to register
  *
  * @retval 0		success
  * @retval ENOMEM	the driver link could not be allocated
  */
 int
 devclass_add_driver(devclass_t dc, driver_t *driver)
 {
 	driverlink_t dl;
 	int unit;
 
 	PDEBUG(("%s", DRIVERNAME(driver)));
 
 	dl = malloc(sizeof(*dl), M_BUS, M_NOWAIT|M_ZERO);
 	if (dl == NULL)
 		return (ENOMEM);
 
 	/*
 	 * Compile the driver's methods.  The reference taken further down
 	 * keeps the class from being freed when the last instance goes,
 	 * which lets us use static methods safely and avoids a double-free
 	 * in devclass_delete_driver.
 	 */
 	kobj_class_compile((kobj_class_t) driver);
 
 	/* The devclass the driver itself implements must exist too. */
 	devclass_find_internal(driver->name, 0, TRUE);
 
 	dl->driver = driver;
 	TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
 	driver->refs++;		/* XXX: kobj_mtx */
 
 	/* Tell every existing bus in this class about the new driver. */
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] == NULL)
 			continue;
 		BUS_DRIVER_ADDED(dc->devices[unit], driver);
 	}
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Delete a device driver from a device class
  *
  * Delete a device driver from a devclass. This is normally called
  * automatically by DRIVER_MODULE().
  *
  * If the driver is currently attached to any devices,
  * devclass_delete_driver() will first attempt to detach from each
  * device. If one of the detach calls fails, the driver will not be
  * deleted.
  *
  * @param busclass	the devclass to remove the driver from
  * @param driver	the driver to unregister
  *
  * @retval 0		success, or the driver's own devclass does not
  *			exist
  * @retval ENOENT	the driver is not registered with @p busclass
  * @retval other	the error returned by a failed device_detach()
  */
 int
 devclass_delete_driver(devclass_t busclass, driver_t *driver)
 {
 	devclass_t dc = devclass_find(driver->name);
 	driverlink_t dl;
 	device_t child;
 	int error, unit;
 
 	PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
 
 	if (dc == NULL)
 		return (0);
 
 	/* Locate the driver's link in the bus' list of drivers. */
 	TAILQ_FOREACH(dl, &busclass->drivers, link) {
 		if (dl->driver == driver)
 			break;
 	}
 	if (dl == NULL) {
 		PDEBUG(("%s not found in %s list", driver->name,
 		    busclass->name));
 		return (ENOENT);
 	}
 
 	/*
 	 * Disassociate from any devices.  Walk the devices in the
 	 * driver's own devclass and detach those that use this driver
 	 * and whose parent belongs to the devclass being edited.
 	 *
 	 * A driver can be in multiple devclasses, so devices that are
 	 * not children of devices in the affected devclass are left
 	 * alone.
 	 */
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		child = dc->devices[unit];
 		if (child == NULL)
 			continue;
 		if (child->driver != driver || child->parent == NULL ||
 		    child->parent->devclass != busclass)
 			continue;
 		error = device_detach(child);
 		if (error != 0)
 			return (error);
 		device_set_driver(child, NULL);
 	}
 
 	TAILQ_REMOVE(&busclass->drivers, dl, link);
 	free(dl, M_BUS);
 
 	/* XXX: kobj_mtx */
 	if (--driver->refs == 0)
 		kobj_class_free((kobj_class_t) driver);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Quiesces a set of device drivers from a device class
  *
  * Quiesce a device driver from a devclass. This is normally called
  * automatically by DRIVER_MODULE().
  *
  * If the driver is currently attached to any devices,
  * devclass_quiesce_driver() will first attempt to quiesce each
  * device.
  *
  * @param busclass	the devclass the driver is registered with
  * @param driver	the driver to quiesce
  *
  * @retval 0		success, or the driver's own devclass does not
  *			exist
  * @retval ENOENT	the driver is not registered with @p busclass
  * @retval other	the error returned by a failed device_quiesce()
  */
 int
 devclass_quiesce_driver(devclass_t busclass, driver_t *driver)
 {
 	devclass_t dc = devclass_find(driver->name);
 	driverlink_t dl;
 	device_t child;
 	int error, unit;
 
 	PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
 
 	if (dc == NULL)
 		return (0);
 
 	/* Locate the driver's link in the bus' list of drivers. */
 	TAILQ_FOREACH(dl, &busclass->drivers, link) {
 		if (dl->driver == driver)
 			break;
 	}
 	if (dl == NULL) {
 		PDEBUG(("%s not found in %s list", driver->name,
 		    busclass->name));
 		return (ENOENT);
 	}
 
 	/*
 	 * Quiesce all devices.  Walk the devices in the driver's own
 	 * devclass and quiesce those that use this driver and whose
 	 * parent belongs to the devclass being quiesced.
 	 *
 	 * A driver can be in multiple devclasses, so devices that are
 	 * not children of devices in the affected devclass are left
 	 * alone.
 	 */
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		child = dc->devices[unit];
 		if (child == NULL)
 			continue;
 		if (child->driver != driver || child->parent == NULL ||
 		    child->parent->devclass != busclass)
 			continue;
 		error = device_quiesce(child);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief Find the driver link for a named driver in a devclass
  *
  * @param dc		the devclass whose driver list is searched
  * @param classname	the driver name to look for
  *
  * @returns		the first link whose driver has that name, or
  *			@c NULL if there is none
  */
 static driverlink_t
 devclass_find_driver_internal(devclass_t dc, const char *classname)
 {
 	driverlink_t dl;
 
 	PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
 
 	TAILQ_FOREACH(dl, &dc->drivers, link) {
 		if (strcmp(dl->driver->name, classname) == 0)
 			break;
 	}
 
 	if (dl == NULL) {
 		PDEBUG(("not found"));
 	}
 	return (dl);
 }
 
 /**
  * @brief Search a devclass for a driver
  *
  * Searches the devclass's list of drivers for the first driver whose
  * name is @p classname.
  *
  * @param dc		the devclass to search
  * @param classname	the driver name to search for
  *
  * @returns		the matching driver, or @c NULL if there is no
  *			driver of that name
  */
 kobj_class_t
 devclass_find_driver(devclass_t dc, const char *classname)
 {
 	driverlink_t dl = devclass_find_driver_internal(dc, classname);
 
 	return (dl != NULL ? dl->driver : NULL);
 }
 
 /**
  * @brief Return the name of the devclass
  *
  * @param dc		the devclass to examine; must not be @c NULL
  *
  * @returns		the devclass's stored name pointer (not a copy)
  */
 const char *
 devclass_get_name(devclass_t dc)
 {
 	return (dc->name);
 }
 
 /**
  * @brief Find a device given a unit number
  *
  * A @c NULL devclass or a unit outside the range [0, maxunit) is
  * treated as "no such device".
  *
  * @param dc		the devclass to search
  * @param unit		the unit number to search for
  *
  * @returns		the device with the given unit number or @c
  *			NULL if there is no such device
  */
 device_t
 devclass_get_device(devclass_t dc, int unit)
 {
 	if (dc != NULL && unit >= 0 && unit < dc->maxunit)
 		return (dc->devices[unit]);
 	return (NULL);
 }
 
 /**
  * @brief Find the softc field of a device given a unit number
  *
  * @param dc		the devclass to search
  * @param unit		the unit number to search for
  *
  * @returns		the softc field of the device with the given
  *			unit number or @c NULL if there is no such
  *			device
  */
 void *
 devclass_get_softc(devclass_t dc, int unit)
 {
 	device_t dev = devclass_get_device(dc, unit);
 
 	return (dev != NULL ? device_get_softc(dev) : NULL);
 }
 
 /**
  * @brief Get a list of devices in the devclass
  *
  * Allocates an array holding every device currently present in the
  * devclass and returns it in @p *devlistp, with the number of entries
  * in @p *devcountp. The caller should free the array using
  * @c free(p, M_TEMP).
  *
  * @param dc		the devclass to examine
  * @param devlistp	points at location for array pointer return
  *			value
  * @param devcountp	points at location for array size return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
 {
 	device_t *list;
 	int n, unit;
 
 	n = devclass_get_count(dc);
 	list = malloc(n * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
 	if (list == NULL)
 		return (ENOMEM);
 
 	/* Pack the occupied unit slots into the front of the array. */
 	n = 0;
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] != NULL)
 			list[n++] = dc->devices[unit];
 	}
 
 	*devlistp = list;
 	*devcountp = n;
 
 	return (0);
 }
 
 /**
  * @brief Get a list of drivers in the devclass
  *
  * Allocates an array of pointers to every driver registered with the
  * devclass and returns it in @p *listp, with the number of entries in
  * @p *countp. The caller should free the array using
  * @c free(p, M_TEMP).
  *
  * @param dc		the devclass to examine
  * @param listp		gives location for array pointer return value
  * @param countp	gives location for number of array elements
  *			return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
 {
 	driverlink_t dl;
 	driver_t **list;
 	int n;
 
 	/* First pass: size the array. */
 	n = 0;
 	TAILQ_FOREACH(dl, &dc->drivers, link)
 		n++;
 	list = malloc(n * sizeof(driver_t *), M_TEMP, M_NOWAIT);
 	if (list == NULL)
 		return (ENOMEM);
 
 	/* Second pass: fill it in list order. */
 	n = 0;
 	TAILQ_FOREACH(dl, &dc->drivers, link)
 		list[n++] = dl->driver;
 
 	*listp = list;
 	*countp = n;
 
 	return (0);
 }
 
 /**
  * @brief Get the number of devices in a devclass
  *
  * @param dc		the devclass to examine
  */
 int
 devclass_get_count(devclass_t dc)
 {
 	int unit, used;
 
 	/* Walk every slot of the unit table and tally the occupied ones. */
 	used = 0;
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] != NULL)
 			used++;
 	}
 	return (used);
 }
 
 /**
  * @brief Get the maximum unit number used in a devclass
  *
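  * Note that this is the current size of the devclass's unit table.  It
  * is always greater than the highest currently-allocated unit, but
  * because the table grows in rounded-up steps it may exceed that unit
  * by more than one.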
  *
  * @param dc		the devclass to examine
  */
 int
 devclass_get_maxunit(devclass_t dc)
 {
 	int limit;
 
 	/* maxunit bounds the valid indices of dc->devices[]. */
 	limit = dc->maxunit;
 	return (limit);
 }
 
 /**
  * @brief Find a free unit number in a devclass
  *
  * This function searches for the first unused unit number greater
  * than or equal to @p unit.
  *
  * @param dc		the devclass to examine
  * @param unit		the first unit number to check
  */
 int
 devclass_find_free_unit(devclass_t dc, int unit)
 {
 	/* Without a devclass there is nothing for the unit to collide with. */
 	if (dc == NULL)
 		return (unit);
 
 	/* Units at or beyond maxunit are outside the table, hence unused. */
 	for (; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] == NULL)
 			break;
 	}
 	return (unit);
 }
 
 /**
  * @brief Set the parent of a devclass
  *
  * The parent class is normally initialised automatically by
  * DRIVER_MODULE().
  *
  * @param dc		the devclass to edit
  * @param pdc		the new parent devclass
  */
 void
 devclass_set_parent(devclass_t dc, devclass_t pdc)
 {
 	/*
 	 * Only the link is replaced; nothing is done with the previous
 	 * parent.  device_probe_child() follows this link when it walks up
 	 * the devclass chain looking for drivers.
 	 */
 	dc->parent = pdc;
 }
 
 /**
  * @brief Get the parent of a devclass
  *
  * @param dc		the devclass to examine
  */
 devclass_t
 devclass_get_parent(devclass_t dc)
 {
 	devclass_t pdc;
 
 	/* NULL when no parent devclass has been set. */
 	pdc = dc->parent;
 	return (pdc);
 }
 
 struct sysctl_ctx_list *
 devclass_get_sysctl_ctx(devclass_t dc)
 {
 	struct sysctl_ctx_list *ctx;
 
 	/* The context is embedded in the devclass; hand out its address. */
 	ctx = &dc->sysctl_ctx;
 	return (ctx);
 }
 
 struct sysctl_oid *
 devclass_get_sysctl_tree(devclass_t dc)
 {
 	struct sysctl_oid *tree;
 
 	/* Return the sysctl node pointer stored in the devclass. */
 	tree = dc->sysctl_tree;
 	return (tree);
 }
 
 /**
  * @internal
  * @brief Allocate a unit number
  *
  * On entry, @p *unitp is the desired unit number (or @c -1 if any
  * will do). The allocated unit number is returned in @p *unitp.
  *
  * @param dc		the devclass to allocate from
  * @param unitp		points at the location for the allocated unit
  *			number
  *
  * @retval 0		success
  * @retval EEXIST	the requested unit number is already allocated
  * @retval ENOMEM	memory allocation failure
  */
 static int
 devclass_alloc_unit(devclass_t dc, int *unitp)
 {
 	device_t *table;
 	int slots, want;
 
 	want = *unitp;
 	PDEBUG(("unit %d in devclass %s", want, DEVCLANAME(dc)));
 
 	/* XXX imp XXX */
 	if (want == -1) {
 		/* Unwired device: take the lowest unit that is not in use. */
 		for (want = 0; want < dc->maxunit; want++) {
 			if (dc->devices[want] == NULL)
 				break;
 		}
 	} else if (want >= 0 && want < dc->maxunit &&
 	    dc->devices[want] != NULL) {
 		/* Wired unit number, but the slot is already occupied. */
 		if (bootverbose)
 			printf("%s: %s%d already exists; skipping it\n",
 			    dc->name, dc->name, *unitp);
 		return (EEXIST);
 	}
 
 	/*
 	 * The chosen unit lies past the end of the table: grow the table
 	 * so that it covers every unit up to and including this one.  The
 	 * old entries are copied over and the new tail is zeroed.
 	 */
 	if (want >= dc->maxunit) {
 		slots = roundup((want + 1), MINALLOCSIZE / sizeof(device_t));
 		table = malloc(sizeof(device_t) * slots, M_BUS, M_NOWAIT);
 		if (table == NULL)
 			return (ENOMEM);
 		bcopy(dc->devices, table, sizeof(device_t) * dc->maxunit);
 		bzero(table + dc->maxunit,
 		    sizeof(device_t) * (slots - dc->maxunit));
 		if (dc->devices)
 			free(dc->devices, M_BUS);
 		dc->devices = table;
 		dc->maxunit = slots;
 	}
 	PDEBUG(("now: unit %d in devclass %s", want, DEVCLANAME(dc)));
 
 	*unitp = want;
 	return (0);
 }
 
 /**
  * @internal
  * @brief Add a device to a devclass
  *
  * A unit number is allocated for the device (using the device's
  * preferred unit number if any) and the device is registered in the
  * devclass. This allows the device to be looked up by its unit
  * number, e.g. by decoding a dev_t minor number.
  *
  * @param dc		the devclass to add to
  * @param dev		the device to add
  *
  * @retval 0		success
  * @retval EEXIST	the requested unit number is already allocated
  * @retval ENOMEM	memory allocation failure
  */
 static int
 devclass_add_device(devclass_t dc, device_t dev)
 {
 	char *nameunit;
 	int buflen, error, unit;
 
 	PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
 
 	/*
 	 * Settle the unit number before sizing the name buffer.  On entry
 	 * dev->unit may still be the -1 wildcard; sizing the buffer from
 	 * that value leaves room for only two digits, so the name of any
 	 * unit >= 100 would be silently truncated.  Work on a copy so that
 	 * dev->unit is left untouched if anything below fails.
 	 */
 	unit = dev->unit;
 	if ((error = devclass_alloc_unit(dc, &unit)) != 0)
 		return (error);
 
 	/* Length of "<name><unit>", plus one for the terminating NUL. */
 	buflen = snprintf(NULL, 0, "%s%d", dc->name, unit);
 	if (buflen < 0)
 		return (ENOMEM);
 	buflen++;
 	nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
 	if (nameunit == NULL)
 		return (ENOMEM);
 	snprintf(nameunit, buflen, "%s%d", dc->name, unit);
 
 	/* Everything succeeded: publish the device in the devclass. */
 	dev->unit = unit;
 	dev->nameunit = nameunit;
 	dc->devices[unit] = dev;
 	dev->devclass = dc;
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief Delete a device from a devclass
  *
  * The device is removed from the devclass's device list and its unit
  * number is freed.
  *
  * @param dc		the devclass to delete from
  * @param dev		the device to delete
  *
  * @retval 0		success
  */
 static int
 devclass_delete_device(devclass_t dc, device_t dev)
 {
 	/* Nothing to undo if either side of the link is missing. */
 	if (dc == NULL || dev == NULL)
 		return (0);
 
 	PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
 
 	/* The device must be registered in this devclass under its unit. */
 	if (!(dev->devclass == dc && dc->devices[dev->unit] == dev))
 		panic("devclass_delete_device: inconsistent device class");
 
 	/* Release the unit slot; a wildcard device reverts to "any unit". */
 	dc->devices[dev->unit] = NULL;
 	if ((dev->flags & DF_WILDCARD) != 0)
 		dev->unit = -1;
 
 	free(dev->nameunit, M_BUS);
 	dev->nameunit = NULL;
 	dev->devclass = NULL;
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief Make a new device and add it as a child of @p parent
  *
  * @param parent	the parent of the new device
  * @param name		the devclass name of the new device or @c NULL
  *			to leave the devclass unspecified
  * @param unit		the unit number of the new device or @c -1 to
  *			leave the unit number unspecified
  *
  * @returns the new device
  */
 static device_t
 make_device(device_t parent, const char *name, int unit)
 {
 	devclass_t cls;
 	device_t newdev;
 
 	PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
 
 	/* A named device must resolve to a devclass before anything else. */
 	cls = NULL;
 	if (name != NULL) {
 		cls = devclass_find_internal(name, 0, TRUE);
 		if (cls == NULL) {
 			printf("make_device: can't find device class %s\n",
 			    name);
 			return (NULL);
 		}
 	}
 
 	newdev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
 	if (newdev == NULL)
 		return (NULL);
 
 	/* Start out enabled, driverless and not present. */
 	kobj_init((kobj_t) newdev, &null_class);
 	TAILQ_INIT(&newdev->children);
 	newdev->parent = parent;
 	newdev->driver = NULL;
 	newdev->devclass = NULL;
 	newdev->unit = unit;
 	newdev->nameunit = NULL;
 	newdev->desc = NULL;
 	newdev->ivars = NULL;
 	newdev->softc = NULL;
 	newdev->busy = 0;
 	newdev->devflags = 0;
 	newdev->order = 0;
 	newdev->state = DS_NOTPRESENT;
 	newdev->flags = DF_ENABLED;
 	if (unit == -1)
 		newdev->flags |= DF_WILDCARD;
 
 	/* A device created with a class name is pinned to that devclass. */
 	if (name != NULL) {
 		newdev->flags |= DF_FIXEDCLASS;
 		if (devclass_add_device(cls, newdev) != 0) {
 			kobj_delete((kobj_t) newdev, M_BUS);
 			return (NULL);
 		}
 	}
 
 	TAILQ_INSERT_TAIL(&bus_data_devices, newdev, devlink);
 	bus_data_generation_update();
 
 	return (newdev);
 }
 
 /**
  * @internal
  * @brief Print a description of a device.
  */
 static int
 device_print_child(device_t dev, device_t child)
 {
 	/* Only a successfully probed child is described by its bus. */
 	if (!device_is_alive(child))
 		return (device_printf(child, " not found\n"));
 	return (BUS_PRINT_CHILD(dev, child));
 }
 
 /**
  * @brief Create a new device
  *
  * This creates a new device and adds it as a child of an existing
  * parent device. The new device will be added after the last existing
  * child with order zero.
  * 
  * @param dev		the device which will be the parent of the
  *			new child device
  * @param name		devclass name for new device or @c NULL if not
  *			specified
  * @param unit		unit number for new device or @c -1 if not
  *			specified
  * 
  * @returns		the new device
  */
 device_t
 device_add_child(device_t dev, const char *name, int unit)
 {
 	device_t child;
 
 	/* Same as the ordered variant with an order of zero. */
 	child = device_add_child_ordered(dev, 0, name, unit);
 	return (child);
 }
 
 /**
  * @brief Create a new device
  *
  * This creates a new device and adds it as a child of an existing
  * parent device. The new device will be added after the last existing
  * child with the same order.
  * 
  * @param dev		the device which will be the parent of the
  *			new child device
  * @param order		a value which is used to partially sort the
  *			children of @p dev - devices created using
  *			lower values of @p order appear first in @p
  *			dev's list of children
  * @param name		devclass name for new device or @c NULL if not
  *			specified
  * @param unit		unit number for new device or @c -1 if not
  *			specified
  * 
  * @returns		the new device
  */
 device_t
 device_add_child_ordered(device_t dev, int order, const char *name, int unit)
 {
 	device_t newdev, sibling;
 
 	PDEBUG(("%s at %s with order %d as unit %d",
 	    name, DEVICENAME(dev), order, unit));
 
 	newdev = make_device(dev, name, unit);
 	if (newdev == NULL)
 		return (NULL);
 	newdev->order = order;
 
 	/* Skip every existing child whose order is not above the new one. */
 	sibling = TAILQ_FIRST(&dev->children);
 	while (sibling != NULL && sibling->order <= order)
 		sibling = TAILQ_NEXT(sibling, link);
 
 	if (sibling == NULL) {
 		/* No child sorts after the new one: append it. */
 		TAILQ_INSERT_TAIL(&dev->children, newdev, link);
 	} else {
 		/* 'sibling' is the first child with a strictly higher order. */
 		TAILQ_INSERT_BEFORE(sibling, newdev, link);
 	}
 
 	bus_data_generation_update();
 	return (newdev);
 }
 
 /**
  * @brief Delete a device
  *
  * This function deletes a device along with all of its children. If
  * the device currently has a driver attached to it, the device is
  * detached first using device_detach().
  * 
  * @param dev		the parent device
  * @param child		the device to delete
  *
  * @retval 0		success
  * @retval non-zero	a unix error code describing the error
  */
 int
 device_delete_child(device_t dev, device_t child)
 {
 	device_t descendant;
 	int rc;
 
 	PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
 
 	/* Tear the subtree down first; give up at the first failure. */
 	for (;;) {
 		descendant = TAILQ_FIRST(&child->children);
 		if (descendant == NULL)
 			break;
 		rc = device_delete_child(child, descendant);
 		if (rc != 0)
 			return (rc);
 	}
 
 	rc = device_detach(child);
 	if (rc != 0)
 		return (rc);
 
 	/* Unregister the child everywhere it is linked, then free it. */
 	if (child->devclass != NULL)
 		devclass_delete_device(child->devclass, child);
 	TAILQ_REMOVE(&dev->children, child, link);
 	TAILQ_REMOVE(&bus_data_devices, child, devlink);
 	kobj_delete((kobj_t) child, M_BUS);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Find a device given a unit number
  *
  * This is similar to devclass_get_device() but only searches for
  * devices which have @p dev as a parent.
  *
  * @param dev		the parent device to search
  * @param classname	the name of the devclass to search in
  * @param unit		the unit number to search for.  If the unit is -1,
  *			return the first child of @p dev which has name
  *			@p classname (that is, the one with the lowest unit.)
  *
  * @returns		the device with the given unit number or @c
  *			NULL if there is no such device
  */
 device_t
 device_find_child(device_t dev, const char *classname, int unit)
 {
 	devclass_t dc;
 	device_t cand;
 
 	dc = devclass_find(classname);
 	if (dc == NULL)
 		return (NULL);
 
 	if (unit == -1) {
 		/* Wildcard: the lowest unit in the class parented by dev. */
 		for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
 			cand = devclass_get_device(dc, unit);
 			if (cand != NULL && cand->parent == dev)
 				return (cand);
 		}
 		return (NULL);
 	}
 
 	/* Specific unit: it only counts if dev is its parent. */
 	cand = devclass_get_device(dc, unit);
 	if (cand == NULL || cand->parent != dev)
 		return (NULL);
 	return (cand);
 }
 
 /**
  * @internal
  */
 static driverlink_t
 first_matching_driver(devclass_t dc, device_t dev)
 {
 	/* A device without a devclass may be offered to any driver in dc. */
 	if (dev->devclass == NULL)
 		return (TAILQ_FIRST(&dc->drivers));
 
 	/* Otherwise look the driver up by the device's devclass name. */
 	return (devclass_find_driver_internal(dc, dev->devclass->name));
 }
 
 /**
  * @internal
  */
 static driverlink_t
 next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
 {
 	driverlink_t dl;
 
 	dl = TAILQ_NEXT(last, link);
 	if (dev->devclass == NULL)
 		return (dl);
 
 	/* Skip drivers whose name differs from the device's devclass name. */
 	while (dl != NULL &&
 	    strcmp(dev->devclass->name, dl->driver->name) != 0)
 		dl = TAILQ_NEXT(dl, link);
 	return (dl);
 }
 
 /**
  * @internal
  * @brief Select the best driver for @p child from @p dev's devclass chain
  *
  * Each candidate driver in the devclass of @p dev, and then in each
  * parent devclass in turn, has its DEVICE_PROBE() method called on
  * @p child. A return of 0 wins outright, a negative return is a bid
  * (values closer to zero are preferred) and a positive return is an
  * error that rules the driver out. The winning driver is set on
  * @p child and the child is moved to DS_ALIVE.
  *
  * @param dev		the parent device; it must have a devclass
  * @param child	the device to find a driver for
  *
  * @retval 0		a driver was selected, or @p child was already
  *			probed and is not marked DF_REBID
  * @retval ENXIO	no driver claimed the device
  * @retval non-zero	the error from device_detach() when a previously
  *			attached driver could not be removed
  */
 static int
 device_probe_child(device_t dev, device_t child)
 {
 	devclass_t dc;
 	driverlink_t best = 0;
 	driverlink_t dl;
 	int result, pri = 0;
 	int hasclass = (child->devclass != 0);
 
 	GIANT_REQUIRED;
 
 	dc = dev->devclass;
 	if (!dc)
 		panic("device_probe_child: parent device has no devclass");
 
 	/*
 	 * If the state is already probed, then return.  However, don't
 	 * return if we can rebid this object.
 	 */
 	if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0)
 		return (0);
 
 	for (; dc; dc = dc->parent) {
 		for (dl = first_matching_driver(dc, child);
 		     dl;
 		     dl = next_matching_driver(dc, child, dl)) {
 			PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
 			device_set_driver(child, dl->driver);
 			if (!hasclass)
 				device_set_devclass(child, dl->driver->name);
 
 			/* Fetch any flags for the device before probing. */
 			resource_int_value(dl->driver->name, child->unit,
 			    "flags", &child->devflags);
 
 			result = DEVICE_PROBE(child);
 
 			/* Reset flags and devclass before the next probe. */
 			child->devflags = 0;
 			if (!hasclass)
 				device_set_devclass(child, 0);
 
 			/*
 			 * If the driver returns SUCCESS, there can be
 			 * no higher match for this device.
 			 */
 			if (result == 0) {
 				best = dl;
 				pri = 0;
 				break;
 			}
 
 			/*
 			 * The driver returned an error so it
 			 * certainly doesn't match.
 			 */
 			if (result > 0) {
 				device_set_driver(child, 0);
 				continue;
 			}
 
 			/*
 			 * A priority lower than SUCCESS, remember the
 			 * best matching driver. Initialise the value
 			 * of pri for the first match.
 			 */
 			if (best == 0 || result > pri) {
 				best = dl;
 				pri = result;
 				continue;
 			}
 		}
 		/*
 		 * If we have an unambiguous match in this devclass,
 		 * don't look in the parent.
 		 */
 		if (best && pri == 0)
 			break;
 	}
 
 	/*
 	 * If we found a driver, change state and initialise the devclass.
 	 */
 	/* XXX What happens if we rebid and got no best? */
 	if (best) {
 		/*
 		 * If this device was attached, and we were asked to
 		 * rescan, and it is a different driver, then we have
 		 * to detach the old driver and reattach this new one.
 		 * Note, we don't have to check for DF_REBID here
 		 * because if the state is > DS_ALIVE, we know it must
 		 * be.
 		 *
 		 * This assumes that all DF_REBID drivers can have
 		 * their probe routine called at any time and that
 		 * they are idempotent as well as completely benign in
 		 * normal operations.
 		 *
 		 * We also have to make sure that the detach
 		 * succeeded, otherwise we fail the operation (or
 		 * maybe it should just fail silently?  I'm torn).
 		 *
 		 * It is the child's old driver that has to go, so the
 		 * child is what gets detached here, not the parent bus.
 		 */
 		if (child->state > DS_ALIVE && best->driver != child->driver)
 			if ((result = device_detach(child)) != 0)
 				return (result);
 
 		/* Set the winning driver, devclass, and flags. */
 		if (!child->devclass)
 			device_set_devclass(child, best->driver->name);
 		device_set_driver(child, best->driver);
 		resource_int_value(best->driver->name, child->unit,
 		    "flags", &child->devflags);
 
 		if (pri < 0) {
 			/*
 			 * A bit bogus. Call the probe method again to make
 			 * sure that we have the right description.
 			 */
 			DEVICE_PROBE(child);
 #if 0
 			child->flags |= DF_REBID;
 #endif
 		} else
 			child->flags &= ~DF_REBID;
 		child->state = DS_ALIVE;
 
 		bus_data_generation_update();
 		return (0);
 	}
 
 	return (ENXIO);
 }
 
 /**
  * @brief Return the parent of a device
  */
 device_t
 device_get_parent(device_t dev)
 {
 	device_t parent;
 
 	/* The parent recorded when the device was created. */
 	parent = dev->parent;
 	return (parent);
 }
 
 /**
  * @brief Get a list of children of a device
  *
  * An array containing a list of all the children of the given device
  * is allocated and returned in @p *devlistp. The number of devices
  * in the array is returned in @p *devcountp. The caller should free
  * the array using @c free(p, M_TEMP).
  *
  * @param dev		the device to examine
  * @param devlistp	points at location for array pointer return
  *			value
  * @param devcountp	points at location for array size return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 device_get_children(device_t dev, device_t **devlistp, int *devcountp)
 {
 	device_t *kids;
 	device_t child;
 	int nkids;
 
 	/* First pass: count the children so the array can be sized. */
 	nkids = 0;
 	TAILQ_FOREACH(child, &dev->children, link)
 		nkids++;
 
 	kids = malloc(nkids * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
 	if (kids == NULL)
 		return (ENOMEM);
 
 	/* Second pass: copy the children out in list order. */
 	nkids = 0;
 	TAILQ_FOREACH(child, &dev->children, link)
 		kids[nkids++] = child;
 
 	*devcountp = nkids;
 	*devlistp = kids;
 	return (0);
 }
 
 /**
  * @brief Return the current driver for the device or @c NULL if there
  * is no driver currently attached
  */
 driver_t *
 device_get_driver(device_t dev)
 {
 	driver_t *drv;
 
 	/* NULL while no driver is set on the device. */
 	drv = dev->driver;
 	return (drv);
 }
 
 /**
  * @brief Return the current devclass for the device or @c NULL if
  * there is none.
  */
 devclass_t
 device_get_devclass(device_t dev)
 {
 	devclass_t dc;
 
 	/* NULL while the device is not registered in any devclass. */
 	dc = dev->devclass;
 	return (dc);
 }
 
 /**
  * @brief Return the name of the device's devclass or @c NULL if there
  * is none.
  */
 const char *
 device_get_name(device_t dev)
 {
 	/* A NULL device, or one without a devclass, has no name. */
 	if (dev == NULL || dev->devclass == NULL)
 		return (NULL);
 	return (devclass_get_name(dev->devclass));
 }
 
 /**
  * @brief Return a string containing the device's devclass name
  * followed by an ascii representation of the device's unit number
  * (e.g. @c "foo2").
  */
 const char *
 device_get_nameunit(device_t dev)
 {
 	const char *label;
 
 	/* Built by devclass_add_device(); NULL outside any devclass. */
 	label = dev->nameunit;
 	return (label);
 }
 
 /**
  * @brief Return the device's unit number.
  */
 int
 device_get_unit(device_t dev)
 {
 	int unit;
 
 	/* May be -1 for a wildcard device that has no unit assigned. */
 	unit = dev->unit;
 	return (unit);
 }
 
 /**
  * @brief Return the device's description string
  */
 const char *
 device_get_desc(device_t dev)
 {
 	const char *text;
 
 	/* NULL when no description has been set. */
 	text = dev->desc;
 	return (text);
 }
 
 /**
  * @brief Return the device's flags
  */
 u_int32_t
 device_get_flags(device_t dev)
 {
 	u_int32_t devflags;
 
 	/* These are the "flags" hint values, not the internal DF_* bits. */
 	devflags = dev->devflags;
 	return (devflags);
 }
 
 struct sysctl_ctx_list *
 device_get_sysctl_ctx(device_t dev)
 {
 	struct sysctl_ctx_list *ctx;
 
 	/* The context is embedded in the device; hand out its address. */
 	ctx = &dev->sysctl_ctx;
 	return (ctx);
 }
 
 struct sysctl_oid *
 device_get_sysctl_tree(device_t dev)
 {
 	struct sysctl_oid *tree;
 
 	/* Return the sysctl node pointer stored in the device. */
 	tree = dev->sysctl_tree;
 	return (tree);
 }
 
 /**
  * @brief Print the name of the device followed by a colon and a space
  *
  * @returns the number of characters printed
  */
 int
 device_print_prettyname(device_t dev)
 {
 	const char *classname;
 
 	classname = device_get_name(dev);
 	if (classname != NULL)
 		return (printf("%s%d: ", classname, device_get_unit(dev)));
 
 	/* No devclass (or no device at all): use a placeholder prefix. */
 	return (printf("unknown: "));
 }
 
 /**
  * @brief Print the name of the device followed by a colon, a space
  * and the result of calling vprintf() with the value of @p fmt and
  * the following arguments.
  *
  * @returns the number of characters printed
  */
 int
 device_printf(device_t dev, const char * fmt, ...)
 {
 	va_list args;
 	int nbody, nprefix;
 
 	/* Emit the "name<unit>: " prefix, then the caller's message. */
 	nprefix = device_print_prettyname(dev);
 
 	va_start(args, fmt);
 	nbody = vprintf(fmt, args);
 	va_end(args);
 
 	return (nprefix + nbody);
 }
 
 /**
  * @internal
  */
 static void
 device_set_desc_internal(device_t dev, const char* desc, int copy)
 {
 	/* Release a description this code allocated earlier. */
 	if ((dev->flags & DF_DESCMALLOCED) && dev->desc != NULL) {
 		free(dev->desc, M_BUS);
 		dev->desc = NULL;
 		dev->flags &= ~DF_DESCMALLOCED;
 	}
 
 	if (!copy || desc == NULL) {
 		/*
 		 * Store the caller's pointer as-is.  Going through
 		 * uintptr_t avoids a -Wcast-qual warning.
 		 */
 		dev->desc = (char *)(uintptr_t) desc;
 	} else {
 		/* Private copy; if the allocation fails desc ends up NULL. */
 		dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
 		if (dev->desc != NULL) {
 			strcpy(dev->desc, desc);
 			dev->flags |= DF_DESCMALLOCED;
 		}
 	}
 
 	bus_data_generation_update();
 }
 
 /**
  * @brief Set the device's description
  *
  * The value of @c desc should be a string constant that will not
  * change (at least until the description is changed in a subsequent
  * call to device_set_desc() or device_set_desc_copy()).
  */
 void
 device_set_desc(device_t dev, const char* desc)
 {
 	/* copy == FALSE: the pointer itself is stored, not a duplicate. */
 	device_set_desc_internal(dev, desc, FALSE);
 }
 
 /**
  * @brief Set the device's description
  *
  * The string pointed to by @c desc is copied. Use this function if
  * the device description is generated, (e.g. with sprintf()).
  */
 void
 device_set_desc_copy(device_t dev, const char* desc)
 {
 	/*
 	 * copy == TRUE: a private copy is allocated.  The allocation is
 	 * M_NOWAIT, and if it fails the description is left NULL.
 	 */
 	device_set_desc_internal(dev, desc, TRUE);
 }
 
 /**
  * @brief Set the device's flags
  */
 void
 device_set_flags(device_t dev, u_int32_t flags)
 {
 	/*
 	 * This is the value device_get_flags() returns.  Note that
 	 * device_probe_child() overwrites it while probing drivers.
 	 */
 	dev->devflags = flags;
 }
 
 /**
  * @brief Return the device's softc field
  *
  * The softc is allocated and zeroed when a driver is attached, based
  * on the size field of the driver.
  */
 void *
 device_get_softc(device_t dev)
 {
 	void *sc;
 
 	/* NULL until a driver with a softc (or an external one) is set. */
 	sc = dev->softc;
 	return (sc);
 }
 
 /**
  * @brief Set the device's softc field
  *
  * Most drivers do not need to use this since the softc is allocated
  * automatically when the driver is attached.
  */
 void
 device_set_softc(device_t dev, void *softc)
 {
 	/* Free the old softc only when it is not caller-supplied. */
 	if (dev->softc != NULL && (dev->flags & DF_EXTERNALSOFTC) == 0)
 		free(dev->softc, M_BUS_SC);
 
 	/* A non-NULL replacement is marked as externally owned. */
 	dev->softc = softc;
 	if (softc == NULL)
 		dev->flags &= ~DF_EXTERNALSOFTC;
 	else
 		dev->flags |= DF_EXTERNALSOFTC;
 }
 
 /**
  * @brief Get the device's ivars field
  *
  * The ivars field is used by the parent device to store per-device
  * state (e.g. the physical location of the device or a list of
  * resources).
  */
 void *
 device_get_ivars(device_t dev)
 {
 	void *ivars;
 
 	KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
 
 	/* Opaque to this layer; just hand the stored pointer back. */
 	ivars = dev->ivars;
 	return (ivars);
 }
 
 /**
  * @brief Set the device's ivars field
  */
 void
 device_set_ivars(device_t dev, void * ivars)
 {
 
 	/* A NULL device is a caller bug; catch it in KASSERT-enabled kernels. */
 	KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
 	/* The previous ivars pointer is simply overwritten, never freed here. */
 	dev->ivars = ivars;
 }
 
 /**
  * @brief Return the device's state
  */
 device_state_t
 device_get_state(device_t dev)
 {
 	device_state_t state;
 
 	/* One of the DS_* values tracked by the probe/attach code. */
 	state = dev->state;
 	return (state);
 }
 
 /**
  * @brief Set the DF_ENABLED flag for the device
  */
 void
 device_enable(device_t dev)
 {
 	/* device_probe_and_attach() skips devices without DF_ENABLED. */
 	dev->flags = dev->flags | DF_ENABLED;
 }
 
 /**
  * @brief Clear the DF_ENABLED flag for the device
  */
 void
 device_disable(device_t dev)
 {
 	/* With DF_ENABLED clear, device_probe_and_attach() does nothing. */
 	dev->flags = dev->flags & ~DF_ENABLED;
 }
 
 /**
  * @brief Increment the busy counter for the device
  */
 void
 device_busy(device_t dev)
 {
 	if (dev->state < DS_ATTACHED)
 		panic("device_busy: called for unattached device");
 
 	/* The first reference on this device also pins its parent. */
 	if (dev->parent != NULL && dev->busy == 0)
 		device_busy(dev->parent);
 
 	dev->state = DS_BUSY;
 	dev->busy++;
 }
 
 /**
  * @brief Decrement the busy counter for the device
  */
 void
 device_unbusy(device_t dev)
 {
 	if (dev->state != DS_BUSY)
 		panic("device_unbusy: called for non-busy device %s",
 		    device_get_nameunit(dev));
 
 	/* Other references remain: the device stays busy. */
 	if (--dev->busy != 0)
 		return;
 
 	/* Last reference gone: release the parent and drop to attached. */
 	if (dev->parent != NULL)
 		device_unbusy(dev->parent);
 	dev->state = DS_ATTACHED;
 }
 
 /**
  * @brief Set the DF_QUIET flag for the device
  */
 void
 device_quiet(device_t dev)
 {
 	/* device_attach() does not print quiet devices. */
 	dev->flags = dev->flags | DF_QUIET;
 }
 
 /**
  * @brief Clear the DF_QUIET flag for the device
  */
 void
 device_verbose(device_t dev)
 {
 	/* Undo device_quiet(): device_attach() prints the device again. */
 	dev->flags = dev->flags & ~DF_QUIET;
 }
 
 /**
  * @brief Return non-zero if the DF_QUIET flag is set on the device
  */
 int
 device_is_quiet(device_t dev)
 {
 	/* Normalise the flag test to exactly 0 or 1. */
 	if ((dev->flags & DF_QUIET) == 0)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the DF_ENABLED flag is set on the device
  */
 int
 device_is_enabled(device_t dev)
 {
 	/* Normalise the flag test to exactly 0 or 1. */
 	if ((dev->flags & DF_ENABLED) == 0)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the device was successfully probed
  */
 int
 device_is_alive(device_t dev)
 {
 	/* DS_ALIVE and every later state count as probed. */
 	if (dev->state < DS_ALIVE)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the device currently has a driver
  * attached to it
  */
 int
 device_is_attached(device_t dev)
 {
 	/* DS_ATTACHED and every later state count as attached. */
 	if (dev->state < DS_ATTACHED)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Set the devclass of a device
  * @see devclass_add_device().
  */
 int
 device_set_devclass(device_t dev, const char *classname)
 {
 	devclass_t newdc;
 	int rc;
 
 	/* A NULL class name means "remove the device from its devclass". */
 	if (classname == NULL) {
 		if (dev->devclass != NULL)
 			devclass_delete_device(dev->devclass, dev);
 		return (0);
 	}
 
 	/* Moving between classes requires clearing the old one first. */
 	if (dev->devclass != NULL) {
 		printf("device_set_devclass: device class already set\n");
 		return (EINVAL);
 	}
 
 	newdc = devclass_find_internal(classname, 0, TRUE);
 	if (newdc == NULL)
 		return (ENOMEM);
 
 	/* The generation is bumped whether or not the add succeeded. */
 	rc = devclass_add_device(newdc, dev);
 	bus_data_generation_update();
 
 	return (rc);
 }
 
 /**
  * @brief Set the driver of a device
  *
  * @retval 0		success
  * @retval EBUSY	the device already has a driver attached
  * @retval ENOMEM	a memory allocation failure occurred
  */
 int
 device_set_driver(device_t dev, driver_t *driver)
 {
 	/* The driver of an attached device cannot be swapped out. */
 	if (dev->state >= DS_ATTACHED)
 		return (EBUSY);
 
 	/* Nothing to do when the requested driver is already set. */
 	if (dev->driver == driver)
 		return (0);
 
 	/* Discard the old driver's softc unless the caller owns it. */
 	if (dev->softc != NULL && (dev->flags & DF_EXTERNALSOFTC) == 0) {
 		free(dev->softc, M_BUS_SC);
 		dev->softc = NULL;
 	}
 	kobj_delete((kobj_t) dev, 0);
 	dev->driver = driver;
 
 	if (driver == NULL) {
 		/* No driver: fall back to the null class. */
 		kobj_init((kobj_t) dev, &null_class);
 		bus_data_generation_update();
 		return (0);
 	}
 
 	kobj_init((kobj_t) dev, (kobj_class_t) driver);
 	if ((dev->flags & DF_EXTERNALSOFTC) == 0 && driver->size > 0) {
 		dev->softc = malloc(driver->size, M_BUS_SC,
 		    M_NOWAIT | M_ZERO);
 		if (dev->softc == NULL) {
 			/* Roll back to the driverless state. */
 			kobj_delete((kobj_t) dev, 0);
 			kobj_init((kobj_t) dev, &null_class);
 			dev->driver = NULL;
 			return (ENOMEM);
 		}
 	}
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Probe a device and attach a driver if possible
  *
  * This function is the core of the device autoconfiguration
  * system. Its purpose is to select a suitable driver for a device and
  * then call that driver to initialise the hardware appropriately. The
  * driver is selected by calling the DEVICE_PROBE() method of a set of
  * candidate drivers and then choosing the driver which returned the
  * best value. This driver is then attached to the device using
  * device_attach().
  *
  * The set of suitable drivers is taken from the list of drivers in
  * the parent device's devclass. If the device was originally created
  * with a specific class name (see device_add_child()), only drivers
  * with that name are probed, otherwise all drivers in the devclass
  * are probed. If no drivers return successful probe values in the
  * parent devclass, the search continues in the parent of that
  * devclass (see devclass_get_parent()) if any.
  *
  * @param dev		the device to initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_probe_and_attach(device_t dev)
 {
 	int rc;
 
 	GIANT_REQUIRED;
 
 	/* Already probed, and not eligible for a rebid: nothing to do. */
 	if ((dev->flags & DF_REBID) == 0 && dev->state >= DS_ALIVE)
 		return (0);
 
 	/* Disabled devices are skipped; say so when booting verbosely. */
 	if ((dev->flags & DF_ENABLED) == 0) {
 		if (bootverbose && device_get_name(dev) != NULL) {
 			device_print_prettyname(dev);
 			printf("not probed (disabled)\n");
 		}
 		return (0);
 	}
 
 	rc = device_probe_child(dev->parent, dev);
 	if (rc == 0)
 		return (device_attach(dev));
 
 	/* No driver matched: report it once per device. */
 	if ((dev->flags & DF_DONENOMATCH) == 0) {
 		BUS_PROBE_NOMATCH(dev->parent, dev);
 		devnomatch(dev);
 		dev->flags |= DF_DONENOMATCH;
 	}
 	return (rc);
 }
 
 /**
  * @brief Attach a device driver to a device
  *
  * This function is a wrapper around the DEVICE_ATTACH() driver
  * method. In addition to calling DEVICE_ATTACH(), it initialises the
  * device's sysctl tree, optionally prints a description of the device
  * and queues a notification event for user-based device management
  * services.
  *
  * Normally this function is only called internally from
  * device_probe_and_attach().
  *
  * @param dev		the device to initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_attach(device_t dev)
 {
 	int error;
 
 	device_sysctl_init(dev);
 	if (!device_is_quiet(dev))
 		device_print_child(dev->parent, dev);
 	if ((error = DEVICE_ATTACH(dev)) != 0) {
 		printf("device_attach: %s%d attach returned %d\n",
 		    dev->driver->name, dev->unit, error);
 		/*
 		 * Undo the devclass chosen by device_probe_child().  A
 		 * device created with a fixed class keeps it; any other
 		 * device must be released so that its unit is freed and a
 		 * later probe is not restricted to this driver's class.
 		 * This mirrors the test used in device_detach().
 		 */
 		if (!(dev->flags & DF_FIXEDCLASS))
 			device_set_devclass(dev, NULL);
 		device_set_driver(dev, NULL);
 		device_sysctl_fini(dev);
 		dev->state = DS_NOTPRESENT;
 		return (error);
 	}
 	dev->state = DS_ATTACHED;
 	/* Queue the attach notification only once the state is final. */
 	devadded(dev);
 	return (0);
 }
 
 /**
  * @brief Detach a driver from a device
  *
  * This function is a wrapper around the DEVICE_DETACH() driver
  * method. If the call to DEVICE_DETACH() succeeds, it calls
  * BUS_CHILD_DETACHED() for the parent of @p dev, queues a
  * notification event for user-based device management services and
  * cleans up the device's sysctl tree.
  *
  * @param dev		the device to un-initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_detach(device_t dev)
 {
 	int rc;
 
 	GIANT_REQUIRED;
 
 	PDEBUG(("%s", DEVICENAME(dev)));
 
 	/* A busy device refuses; an unattached one has nothing to undo. */
 	if (dev->state == DS_BUSY)
 		return (EBUSY);
 	if (dev->state != DS_ATTACHED)
 		return (0);
 
 	rc = DEVICE_DETACH(dev);
 	if (rc != 0)
 		return (rc);
 
 	/* The driver let go: notify listeners and the parent bus. */
 	devremoved(dev);
 	device_printf(dev, "detached\n");
 	if (dev->parent != NULL)
 		BUS_CHILD_DETACHED(dev->parent, dev);
 
 	/* Only a device created with a fixed class keeps its devclass. */
 	if ((dev->flags & DF_FIXEDCLASS) == 0)
 		devclass_delete_device(dev->devclass, dev);
 
 	dev->state = DS_NOTPRESENT;
 	device_set_driver(dev, NULL);
 	device_set_desc(dev, NULL);
 	device_sysctl_fini(dev);
 
 	return (0);
 }
 
 /**
  * @brief Tells a driver to quiesce itself.
  *
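  * This function is a wrapper around the DEVICE_QUIESCE() driver
  * method. The method is only called for a device in the attached
  * state: a busy device yields @c EBUSY and a device in any other
  * state yields 0 without the method being called.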
  *
  * @param dev		the device to quiesce
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_quiesce(device_t dev)
 {
 
 	PDEBUG(("%s", DEVICENAME(dev)));
 
 	/* Only an attached, non-busy device reaches the driver method. */
 	if (dev->state == DS_ATTACHED)
 		return (DEVICE_QUIESCE(dev));
 	return (dev->state == DS_BUSY ? EBUSY : 0);
 }
 
 /**
  * @brief Notify a device of system shutdown
  *
  * This function calls the DEVICE_SHUTDOWN() driver method if the
  * device currently has an attached driver.
  *
  * @returns the value returned by DEVICE_SHUTDOWN()
  */
 int
 device_shutdown(device_t dev)
 {
 	/* Devices without an attached driver have nothing to shut down. */
 	if (dev->state >= DS_ATTACHED)
 		return (DEVICE_SHUTDOWN(dev));
 	return (0);
 }
 
 /**
  * @brief Set the unit number of a device
  *
  * This function can be used to override the unit number used for a
  * device (e.g. to wire a device to a pre-configured unit number).
  * The device is removed from its devclass and added back under the
  * new unit. The device must already belong to a devclass.
  *
  * @param dev		the device to renumber
  * @param unit		the new unit number, or @c -1 to have the
  *			lowest free unit allocated
  *
  * @retval 0		success, including the case where @p unit is
  *			already the device's unit
  * @retval EBUSY	another device in the devclass already uses
  *			@p unit
  * @retval EINVAL	@p unit is below -1
  * @retval non-zero	the error from adding the device back to its
  *			devclass; the device is then left without a
  *			devclass
  */
 int
 device_set_unit(device_t dev, int unit)
 {
 	devclass_t dc;
 	int err;
 
 	/* Renumbering to the current unit is a no-op, not a conflict. */
 	if (unit == dev->unit)
 		return (0);
 
 	/* Below -1 is neither a unit nor the wildcard value. */
 	if (unit < -1)
 		return (EINVAL);
 
 	dc = device_get_devclass(dev);
 	/* Only index the unit table with values that lie inside it. */
 	if (unit >= 0 && unit < dc->maxunit && dc->devices[unit])
 		return (EBUSY);
 	err = devclass_delete_device(dc, dev);
 	if (err)
 		return (err);
 	dev->unit = unit;
 	err = devclass_add_device(dc, dev);
 	if (err)
 		return (err);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /*======================================*/
 /*
  * Some useful method implementations to make life easier for bus drivers.
  */
 
 /**
  * @brief Initialise a resource list.
  *
  * @param rl		the resource list to initialise
  */
 void
 resource_list_init(struct resource_list *rl)
 {
 	/* A resource list is an STAILQ head; start it out empty. */
 	STAILQ_INIT(rl);
 }
 
 /**
  * @brief Reclaim memory used by a resource list.
  *
  * This function frees the memory for all resource entries on the list
  * (if any).
  *
  * @param rl		the resource list to free		
  */
 void
 resource_list_free(struct resource_list *rl)
 {
 	struct resource_list_entry *entry;
 
 	/* Pop entries off the head until the list is empty. */
 	for (;;) {
 		entry = STAILQ_FIRST(rl);
 		if (entry == NULL)
 			break;
 		/* An entry that still holds a resource must not be freed. */
 		if (entry->res != NULL)
 			panic("resource_list_free: resource entry is busy");
 		STAILQ_REMOVE_HEAD(rl, link);
 		free(entry, M_BUS);
 	}
 }
 
 /**
  * @brief Add a resource entry under the first free rid.
  *
  * The rid is chosen by probing rid 0, 1, 2, ... with
  * resource_list_find() until one is found that has no entry of the
  * given @p type. The entry is then created with resource_list_add()
  * using @p start, @p end and @p count.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param start		the start address of the resource
  * @param end		the end address of the resource
  * @param count		XXX end-start+1
  *
  * @returns		the rid that was chosen for the new entry
  */
 int
 resource_list_add_next(struct resource_list *rl, int type, u_long start,
     u_long end, u_long count)
 {
 	int next_rid;
 
 	for (next_rid = 0;
 	    resource_list_find(rl, type, next_rid) != NULL;
 	    next_rid++)
 		continue;
 	resource_list_add(rl, type, next_rid, start, end, count);
 	return (next_rid);
 }
 
 /**
  * @brief Add or modify a resource entry.
  *
  * When the list already holds an entry with the same @p type and
  * @p rid, its range is overwritten with @p start, @p end and
  * @p count; the function panics if that entry currently has a
  * resource attached. Otherwise a new entry is allocated, appended to
  * the list and filled in. The function panics if the allocation fails.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  * @param start		the start address of the resource
  * @param end		the end address of the resource
  * @param count		XXX end-start+1
  *
  * @returns		the entry that was updated or created
  */
 struct resource_list_entry *
 resource_list_add(struct resource_list *rl, int type, int rid,
     u_long start, u_long end, u_long count)
 {
 	struct resource_list_entry *entry;
 
 	entry = resource_list_find(rl, type, rid);
 	if (entry != NULL) {
 		/* An existing entry may only be edited while it is idle. */
 		if (entry->res != NULL)
 			panic("resource_list_add: resource entry is busy");
 	} else {
 		entry = malloc(sizeof(struct resource_list_entry), M_BUS,
 		    M_NOWAIT);
 		if (entry == NULL)
 			panic("resource_list_add: can't record entry");
 		STAILQ_INSERT_TAIL(rl, entry, link);
 		entry->type = type;
 		entry->rid = rid;
 		entry->res = NULL;
 	}
 
 	entry->start = start;
 	entry->end = end;
 	entry->count = count;
 	return (entry);
 }
 
 /**
  * @brief Find a resource entry by type and rid.
  *
  * The list is walked from the head and the first entry whose type and
  * rid both match is returned.
  *
  * @param rl		the resource list to search
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  *
  * @returns the resource entry pointer or NULL if there is no such
  * entry.
  */
 struct resource_list_entry *
 resource_list_find(struct resource_list *rl, int type, int rid)
 {
 	struct resource_list_entry *entry;
 
 	STAILQ_FOREACH(entry, rl, link) {
 		if (entry->type != type)
 			continue;
 		if (entry->rid == rid)
 			return (entry);
 	}
 	return (NULL);
 }
 
 /**
  * @brief Delete a resource entry.
  *
  * Nothing happens if the list has no entry for @p type and @p rid.
  * The function panics if the entry still has a resource attached.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  */
 void
 resource_list_delete(struct resource_list *rl, int type, int rid)
 {
 	struct resource_list_entry *entry;
 
 	entry = resource_list_find(rl, type, rid);
 	if (entry == NULL)
 		return;
 
 	if (entry->res != NULL)
 		panic("resource_list_delete: resource has not been released");
 	STAILQ_REMOVE(rl, entry, resource_list_entry, link);
 	free(entry, M_BUS);
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE()
  *
  * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
  * and passing the allocation up to the parent of @p bus. This assumes
  * that the first entry of @c device_get_ivars(child) is a struct
  * resource_list. This also handles 'passthrough' allocations where a
  * child is a remote descendant of bus by passing the allocation up to
  * the parent of bus.
  *
  * Typically, a bus driver would store a list of child resources
  * somewhere in the child device's ivars (see device_get_ivars()) and
  * its implementation of BUS_ALLOC_RESOURCE() would find that list and
  * then call resource_list_alloc() to perform the allocation.
  *
  * For a direct child the function panics if the matching list entry
  * already has a resource attached. When both @p start and @p end carry
  * their "any" values, the range recorded in the entry is used instead.
  * After a successful allocation the entry records the resource and
  * its actual range.
  *
  * @param rl		the resource list to allocate from
  * @param bus		the parent device of @p child
  * @param child		the device which is requesting an allocation
  * @param type		the type of resource to allocate
  * @param rid		a pointer to the resource identifier
  * @param start		hint at the start of the resource range - pass
  *			@c 0UL for any start address
  * @param end		hint at the end of the resource range - pass
  *			@c ~0UL for any end address
  * @param count		hint at the size of range required - pass @c 1
  *			for any size
  * @param flags		any extra flags to control the resource
  *			allocation - see @c RF_XXX flags in
  *			<sys/rman.h> for details
  *
  * @returns		the resource which was allocated or @c NULL if no
  *			resource could be allocated
  */
 struct resource *
 resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
     int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
 {
 	struct resource_list_entry *entry;
 	struct resource *res;
 
 	/* Not our direct child: hand the request to our parent untouched. */
 	if (device_get_parent(child) != bus) {
 		return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
 		    type, rid, start, end, count, flags));
 	}
 
 	entry = resource_list_find(rl, type, *rid);
 	if (entry == NULL)
 		return (NULL);		/* no resource of that type/rid */
 	if (entry->res != NULL)
 		panic("resource_list_alloc: resource entry is busy");
 
 	/* A fully default request takes its range from the list entry. */
 	if (start == 0UL && end == ~0UL) {
 		start = entry->start;
 		count = ulmax(count, entry->count);
 		end = ulmax(entry->end, start + count - 1);
 	}
 
 	res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
 	    type, rid, start, end, count, flags);
 	entry->res = res;
 
 	/* On success, record the range that was actually handed out. */
 	if (res != NULL) {
 		entry->start = rman_get_start(res);
 		entry->end = rman_get_end(res);
 		entry->count = count;
 	}
 
 	return (res);
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE()
  *
  * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
  * used with resource_list_alloc().
  *
  * A release on behalf of a device that is not a direct child of
  * @p bus is passed straight to the parent of @p bus. For a direct
  * child the function panics if the list has no entry for @p type and
  * @p rid or if that entry has no resource attached. The entry is only
  * marked free once the parent has released the resource successfully.
  *
  * @param rl		the resource list which was allocated from
  * @param bus		the parent device of @p child
  * @param child		the device which is requesting a release
  * @param type		the type of the resource being released
  * @param rid		the resource identifier
  * @param res		the resource to release
  *
  * @retval 0		success
  * @retval non-zero	a standard unix error code indicating what
  *			error condition prevented the operation
  */
 int
 resource_list_release(struct resource_list *rl, device_t bus, device_t child,
     int type, int rid, struct resource *res)
 {
 	struct resource_list_entry *entry;
 	int error;
 
 	/* Not our direct child: hand the request to our parent untouched. */
 	if (device_get_parent(child) != bus) {
 		return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
 		    type, rid, res));
 	}
 
 	entry = resource_list_find(rl, type, rid);
 	if (entry == NULL)
 		panic("resource_list_release: can't find resource");
 	if (entry->res == NULL)
 		panic("resource_list_release: resource entry is not busy");
 
 	error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
 	    type, rid, res);
 	if (error == 0)
 		entry->res = NULL;
 	return (error);
 }
 
 /**
  * @brief Print a description of resources in a resource list
  *
  * Print all resources of a specified type, for use in BUS_PRINT_CHILD().
  * The name is printed if at least one resource of the given type is
  * available; further resources are separated by commas. Each resource
  * is printed as its start value and, when it spans more than one
  * unit, a dash followed by its last value (start + count - 1).
  *
  * @param rl		the resource list to print
  * @param name		the name of @p type, e.g. @c "memory"
  * @param type		the type of resource entry to print
  * @param format	printf(9) format string used to print each
  *			start and end value
  *
  * @returns		the number of characters printed
  */
 int
 resource_list_print_type(struct resource_list *rl, const char *name, int type,
     const char *format)
 {
 	struct resource_list_entry *entry;
 	int nchars = 0;
 	int nmatched = 0;
 
 	STAILQ_FOREACH(entry, rl, link) {
 		if (entry->type != type)
 			continue;
 
 		/* The name introduces the first match, a comma the rest. */
 		nchars += (nmatched == 0) ?
 		    printf(" %s ", name) : printf(",");
 		nmatched++;
 
 		nchars += printf(format, entry->start);
 		if (entry->count <= 1)
 			continue;
 		nchars += printf("-");
 		nchars += printf(format, entry->start + entry->count - 1);
 	}
 	return (nchars);
 }
 
 /**
  * @brief Releases all the resources in a list.
  *
  * @param rl		The resource list to purge.
  * 
  * @returns		nothing
  */
 void
 resource_list_purge(struct resource_list *rl)
 {
 	struct resource_list_entry *rle;
 
 	STAILQ_FOREACH(rle, rl, link) {
 		if (rle->res)
 			bus_release_resource(rman_get_device(rle->res),
 			    rle->type, rle->rid, rle->res);
 		STAILQ_REMOVE_HEAD(rl, link);
 		free(rle, M_BUS);
 	}
 }
 
 /**
  * @brief Helper function for implementing DEVICE_PROBE()
  *
  * This function can be used to help implement the DEVICE_PROBE() for
  * a bus (i.e. a device which has other devices attached to it). It
  * calls the DEVICE_IDENTIFY() method of each driver in the device's
  * devclass.
  *
  * @param dev		the bus device being probed
  *
  * @returns		always 0
  */
 int
 bus_generic_probe(device_t dev)
 {
 	driverlink_t link_entry;
 	devclass_t class;
 
 	class = dev->devclass;
 	TAILQ_FOREACH(link_entry, &class->drivers, link) {
 		DEVICE_IDENTIFY(link_entry->driver, dev);
 	}
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_ATTACH()
  *
  * This function can be used to help implement the DEVICE_ATTACH() for
  * a bus. It calls device_probe_and_attach() for each of the device's
  * children.
  *
  * @param dev		the bus device whose children are attached
  *
  * @returns		always 0; the result of each
  *			device_probe_and_attach() call is ignored
  */
 int
 bus_generic_attach(device_t dev)
 {
 	device_t child;
 
 	TAILQ_FOREACH(child, &dev->children, link) {
 		device_probe_and_attach(child);
 	}
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_DETACH()
  *
  * This function can be used to help implement the DEVICE_DETACH() for
  * a bus. It calls device_detach() for each of the device's
  * children and stops at the first child that fails to detach.
  *
  * @param dev		the bus device being detached
  *
  * @retval 0		every child was detached
  * @retval EBUSY	@p dev is not in the attached state
  * @retval non-zero	the error returned by device_detach() for the
  *			first child that failed
  */
 int
 bus_generic_detach(device_t dev)
 {
 	device_t child;
 	int error;
 
 	if (dev->state != DS_ATTACHED)
 		return (EBUSY);
 
 	TAILQ_FOREACH(child, &dev->children, link) {
 		error = device_detach(child);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_SHUTDOWN()
  *
  * This function can be used to help implement the DEVICE_SHUTDOWN()
  * for a bus. It calls device_shutdown() for each of the device's
  * children.
  *
  * @param dev		the bus device being shut down
  *
  * @returns		always 0; the result of each device_shutdown()
  *			call is ignored
  */
 int
 bus_generic_shutdown(device_t dev)
 {
 	device_t child;
 
 	TAILQ_FOREACH(child, &dev->children, link) {
 		device_shutdown(child);
 	}
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_SUSPEND()
  *
  * This function can be used to help implement the DEVICE_SUSPEND()
  * for a bus. It calls DEVICE_SUSPEND() for each of the device's
  * children. If any call to DEVICE_SUSPEND() fails, the suspend
  * operation is aborted and any devices which were suspended are
  * resumed immediately by calling their DEVICE_RESUME() methods.
  *
  * @param dev		the bus device being suspended
  *
  * @retval 0		every child was suspended
  * @retval non-zero	the error returned by the child that failed
  */
 int
 bus_generic_suspend(device_t dev)
 {
 	device_t	child, undo;
 	int		error;
 
 	TAILQ_FOREACH(child, &dev->children, link) {
 		error = DEVICE_SUSPEND(child);
 		if (error == 0)
 			continue;
 
 		/* Resume every child that precedes the one that failed. */
 		TAILQ_FOREACH(undo, &dev->children, link) {
 			if (undo == child)
 				break;
 			DEVICE_RESUME(undo);
 		}
 		return (error);
 	}
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_RESUME()
  *
  * This function can be used to help implement the DEVICE_RESUME() for
  * a bus. It calls DEVICE_RESUME() on each of the device's children.
  *
  * @param dev		the bus device being resumed
  *
  * @returns		always 0; the result of each DEVICE_RESUME()
  *			call is ignored
  */
 int
 bus_generic_resume(device_t dev)
 {
 	device_t	child;
 
 	TAILQ_FOREACH(child, &dev->children, link) {
 		DEVICE_RESUME(child);
 		/* if resume fails, there's nothing we can usefully do... */
 	}
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function prints the first part of the ascii representation of
  * @p child, including its name, unit and description (if any - see
  * device_set_desc()). A child with a description is printed through
  * device_printf() with the description in angle brackets; a child
  * without one is printed as its name and unit only.
  *
  * @param dev		the parent device (not used by this function)
  * @param child		the device to describe
  *
  * @returns the number of characters printed
  */
 int
 bus_print_child_header(device_t dev, device_t child)
 {
 	if (device_get_desc(child) == NULL)
 		return (printf("%s", device_get_nameunit(child)));
 
 	return (device_printf(child, "<%s>", device_get_desc(child)));
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function prints the last part of the ascii representation of
  * @p child, which consists of the string @c " on " followed by the
  * name and unit of the @p dev and a newline.
  *
  * @param dev		the parent device whose name and unit are printed
  * @param child		the child device (not used by this function)
  *
  * @returns the number of characters printed
  */
 int
 bus_print_child_footer(device_t dev, device_t child)
 {
 	return (printf(" on %s\n", device_get_nameunit(dev)));
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function simply calls bus_print_child_header() followed by
  * bus_print_child_footer().
  *
  * @param dev		the parent device
  * @param child		the child device to describe
  *
  * @returns the number of characters printed
  */
 int
 bus_generic_print_child(device_t dev, device_t child)
 {
 	int	nheader, nfooter;
 
 	nheader = bus_print_child_header(dev, child);
 	nfooter = bus_print_child_footer(dev, child);
 
 	return (nheader + nfooter);
 }
 
 /**
  * @brief Stub function for implementing BUS_READ_IVAR().
  *
  * None of the arguments are examined and @p result is never written.
  *
  * @returns ENOENT
  */
 int
 bus_generic_read_ivar(device_t dev, device_t child, int index,
     uintptr_t * result)
 {
 	return (ENOENT);
 }
 
 /**
  * @brief Stub function for implementing BUS_WRITE_IVAR().
  *
  * None of the arguments are examined and nothing is stored.
  *
  * @returns ENOENT
  */
 int
 bus_generic_write_ivar(device_t dev, device_t child, int index,
     uintptr_t value)
 {
 	return (ENOENT);
 }
 
 /**
  * @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
  *
  * Neither argument is examined. The bus_generic_rl_*() helpers in this
  * file fail or do nothing when BUS_GET_RESOURCE_LIST() returns NULL.
  *
  * @returns NULL
  */
 struct resource_list *
 bus_generic_get_resource_list(device_t dev, device_t child)
 {
 	return (NULL);
 }
 
 /**
  * @brief Helper function for implementing BUS_DRIVER_ADDED().
  *
  * This implementation of BUS_DRIVER_ADDED() simply calls the driver's
  * DEVICE_IDENTIFY() method to allow it to add new children to the bus
  * and then calls device_probe_and_attach() for each child that is
  * either in the DS_NOTPRESENT state or has the DF_REBID flag set.
  *
  * @param dev		the bus device
  * @param driver	the driver that was added
  */
 void
 bus_generic_driver_added(device_t dev, driver_t *driver)
 {
 	device_t child;
 
 	DEVICE_IDENTIFY(driver, dev);
 	TAILQ_FOREACH(child, &dev->children, link) {
 		/* Skip children that are present and not open to rebidding. */
 		if (child->state != DS_NOTPRESENT &&
 		    (child->flags & DF_REBID) == 0)
 			continue;
 		device_probe_and_attach(child);
 	}
 }
 
 /**
  * @brief Helper function for implementing BUS_SETUP_INTR().
  *
  * This simple implementation of BUS_SETUP_INTR() simply calls the
  * BUS_SETUP_INTR() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
     int flags, driver_intr_t *intr, void *arg, void **cookiep)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
 	    intr, arg, cookiep));
 }
 
 /**
  * @brief Helper function for implementing BUS_TEARDOWN_INTR().
  *
  * This simple implementation of BUS_TEARDOWN_INTR() simply calls the
  * BUS_TEARDOWN_INTR() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
     void *cookie)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
  *
  * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
  * BUS_ALLOC_RESOURCE() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or NULL if @p dev has no
  *			parent
  */
 struct resource *
 bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
     u_long start, u_long end, u_long count, u_int flags)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (NULL);
 
 	return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
 	    start, end, count, flags));
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
  *
  * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
  * BUS_RELEASE_RESOURCE() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
     struct resource *r)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
  *
  * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
  * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
     struct resource *r)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
  *
  * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
  * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev, passing
  * every other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_deactivate_resource(device_t dev, device_t child, int type,
     int rid, struct resource *r)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_CONFIG_INTR().
  *
  * This simple implementation of BUS_CONFIG_INTR() simply calls the
  * BUS_CONFIG_INTR() method of the parent of @p dev, passing every
  * other argument through unchanged.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	/* Nobody above us in the hierarchy is left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
 }
 
 /**
  * @brief Helper function for implementing BUS_GET_RESOURCE().
  *
  * This implementation of BUS_GET_RESOURCE() uses the
  * resource_list_find() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * search.
  *
  * @param startp	if not NULL, receives the start of the entry
  * @param countp	if not NULL, receives the count of the entry
  *
  * @retval 0		the entry was found
  * @retval EINVAL	BUS_GET_RESOURCE_LIST() returned no list
  * @retval ENOENT	the list has no entry for @p type and @p rid
  */
 int
 bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
     u_long *startp, u_long *countp)
 {
 	struct resource_list_entry *entry;
 	struct resource_list *list;
 
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list == NULL)
 		return (EINVAL);
 
 	entry = resource_list_find(list, type, rid);
 	if (entry == NULL)
 		return (ENOENT);
 
 	if (startp != NULL)
 		*startp = entry->start;
 	if (countp != NULL)
 		*countp = entry->count;
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_SET_RESOURCE().
  *
  * This implementation of BUS_SET_RESOURCE() uses the
  * resource_list_add() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * edit. The end of the range is recorded as @p start + @p count - 1.
  *
  * @retval 0		the entry was added or updated
  * @retval EINVAL	BUS_GET_RESOURCE_LIST() returned no list
  */
 int
 bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
     u_long start, u_long count)
 {
 	struct resource_list *list;
 	u_long last;
 
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list == NULL)
 		return (EINVAL);
 
 	last = start + count - 1;
 	resource_list_add(list, type, rid, start, last, count);
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_DELETE_RESOURCE().
  *
  * This implementation of BUS_DELETE_RESOURCE() uses the
  * resource_list_delete() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * edit and does nothing when no list is returned.
  */
 void
 bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
 {
 	struct resource_list *list;
 
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list != NULL)
 		resource_list_delete(list, type, rid);
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
  *
  * This implementation of BUS_RELEASE_RESOURCE() uses the
  * resource_list_release() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
  *
  * @returns		the result of resource_list_release(), or EINVAL
  *			if BUS_GET_RESOURCE_LIST() returned no list
  */
 int
 bus_generic_rl_release_resource(device_t dev, device_t child, int type,
     int rid, struct resource *r)
 {
 	struct resource_list *list;
 
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list == NULL)
 		return (EINVAL);
 
 	return (resource_list_release(list, dev, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
  *
  * This implementation of BUS_ALLOC_RESOURCE() uses the
  * resource_list_alloc() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
  *
  * @returns		the result of resource_list_alloc(), or NULL if
  *			BUS_GET_RESOURCE_LIST() returned no list
  */
 struct resource *
 bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
     int *rid, u_long start, u_long end, u_long count, u_int flags)
 {
 	struct resource_list *list;
 
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list == NULL)
 		return (NULL);
 
 	return (resource_list_alloc(list, dev, child, type, rid,
 	    start, end, count, flags));
 }
 
 /**
  * @brief Helper function for implementing BUS_CHILD_PRESENT().
  *
  * This simple implementation of BUS_CHILD_PRESENT() simply calls the
  * BUS_CHILD_PRESENT() method of the parent of @p dev.
  *
  * Note that @p dev itself, not @p child, is passed as the child
  * argument of that call, so the question asked of the parent is
  * whether @p dev is present; @p child is not used.
  *
  * @returns		the value returned by the parent's
  *			BUS_CHILD_PRESENT() method
  */
 int
 bus_generic_child_present(device_t dev, device_t child)
 {
 	return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
 }
 
 /*
  * Some convenience functions to make it easier for drivers to use the
  * resource-management functions.  All these really do is hide the
  * indirection through the parent's method table, making for slightly
  * less-wordy code.  In the future, it might make sense for this code
  * to maintain some sort of a list of resources allocated by each device.
  */
 
 /**
  * @brief Wrapper function for BUS_ALLOC_RESOURCE().
  *
  * This function simply calls the BUS_ALLOC_RESOURCE() method of the
  * parent of @p dev, with @p dev as the requesting child.
  *
  * @returns		the parent's result, or a null pointer if @p dev
  *			has no parent
  */
 struct resource *
 bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
     u_long count, u_int flags)
 {
 	device_t parent = dev->parent;
 
 	if (parent == NULL)
 		return (NULL);
 	return (BUS_ALLOC_RESOURCE(parent, dev, type, rid, start, end,
 	    count, flags));
 }
 
 /**
  * @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
  *
  * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
  * parent of @p dev, with @p dev as the requesting child.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	if (parent == NULL)
 		return (EINVAL);
 	return (BUS_ACTIVATE_RESOURCE(parent, dev, type, rid, r));
 }
 
 /**
  * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
  *
  * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
  * parent of @p dev, with @p dev as the requesting child.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	if (parent == NULL)
 		return (EINVAL);
 	return (BUS_DEACTIVATE_RESOURCE(parent, dev, type, rid, r));
 }
 
 /**
  * @brief Wrapper function for BUS_RELEASE_RESOURCE().
  *
  * This function simply calls the BUS_RELEASE_RESOURCE() method of the
  * parent of @p dev, with @p dev as the requesting child.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_release_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	if (parent == NULL)
 		return (EINVAL);
 	return (BUS_RELEASE_RESOURCE(parent, dev, type, rid, r));
 }
 
 /**
  * @brief Wrapper function for BUS_SETUP_INTR().
  *
  * This function calls the BUS_SETUP_INTR() method of the parent of
  * @p dev. Before the call, INTR_MPSAFE is stripped from @p flags when
  * the flags (ignoring INTR_ENTROPY) are exactly
  * INTR_TYPE_NET | INTR_MPSAFE and debug_mpsafenet is zero. After a
  * successful call, a tag describing the handler's locking mode is
  * printed for the device.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_setup_intr(device_t dev, struct resource *r, int flags,
     driver_intr_t handler, void *arg, void **cookiep)
 {
 	int error;
 
 	if (dev->parent == 0)
 		return (EINVAL);
 
 	if ((flags &~ INTR_ENTROPY) == (INTR_TYPE_NET | INTR_MPSAFE) &&
 	    !debug_mpsafenet)
 		flags &= ~INTR_MPSAFE;
 
 	error = BUS_SETUP_INTR(dev->parent, dev, r, flags,
 	    handler, arg, cookiep);
 	if (error != 0)
 		return (error);
 
 	/* Report how the handler that was just installed will run. */
 	if (!(flags & (INTR_MPSAFE | INTR_FAST)))
 		device_printf(dev, "[GIANT-LOCKED]\n");
 	if (bootverbose && (flags & INTR_MPSAFE))
 		device_printf(dev, "[MPSAFE]\n");
 	if (flags & INTR_FAST)
 		device_printf(dev, "[FAST]\n");
 
 	return (0);
 }
 
 /**
  * @brief Wrapper function for BUS_TEARDOWN_INTR().
  *
  * This function simply calls the BUS_TEARDOWN_INTR() method of the
  * parent of @p dev, with @p dev as the requesting child.
  *
  * @returns		the parent's result, or EINVAL if @p dev has no
  *			parent
  */
 int
 bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
 {
 	device_t parent = dev->parent;
 
 	if (parent == NULL)
 		return (EINVAL);
 	return (BUS_TEARDOWN_INTR(parent, dev, r, cookie));
 }
 
 /**
  * @brief Wrapper function for BUS_SET_RESOURCE().
  *
  * This function simply calls the BUS_SET_RESOURCE() method of the
  * parent of @p dev, with @p dev as the child. Unlike
  * bus_alloc_resource() and its siblings, no check is made here for
  * @p dev having no parent.
  *
  * @returns		the value returned by BUS_SET_RESOURCE()
  */
 int
 bus_set_resource(device_t dev, int type, int rid,
     u_long start, u_long count)
 {
 	return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    start, count));
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE().
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev, with @p dev as the child. Unlike
  * bus_alloc_resource() and its siblings, no check is made here for
  * @p dev having no parent.
  *
  * @returns		the value returned by BUS_GET_RESOURCE()
  */
 int
 bus_get_resource(device_t dev, int type, int rid,
     u_long *startp, u_long *countp)
 {
 	return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    startp, countp));
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE() returning the start.
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev and returns the start value.
  *
  * @returns		the start value of the resource, or 0 if
  *			BUS_GET_RESOURCE() reported an error
  */
 u_long
 bus_get_resource_start(device_t dev, int type, int rid)
 {
 	u_long start, count;
 
 	if (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    &start, &count) != 0)
 		return (0);
 	return (start);
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE() returning the count.
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev and returns the count value.
  *
  * @returns		the count value of the resource, or 0 if
  *			BUS_GET_RESOURCE() reported an error
  */
 u_long
 bus_get_resource_count(device_t dev, int type, int rid)
 {
 	u_long start, count;
 
 	if (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    &start, &count) != 0)
 		return (0);
 	return (count);
 }
 
 /**
  * @brief Wrapper function for BUS_DELETE_RESOURCE().
  *
  * This function simply calls the BUS_DELETE_RESOURCE() method of the
  * parent of @p dev, with @p dev as the child. No check is made here
  * for @p dev having no parent.
  *
  * @param dev		the device whose resource entry is deleted
  * @param type		the resource type
  * @param rid		the resource identifier
  */
 void
 bus_delete_resource(device_t dev, int type, int rid)
 {
 	BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_PRESENT().
  *
  * This function simply calls the BUS_CHILD_PRESENT() method of the
  * parent of @p child. No check is made here for @p child having no
  * parent.
  *
  * @param child	the device to ask about
  *
  * @returns		the value returned by BUS_CHILD_PRESENT()
  */
 int
 bus_child_present(device_t child)
 {
 	return (BUS_CHILD_PRESENT(device_get_parent(child), child));
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
  *
  * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
  * parent of @p child. If @p child has no parent, @p buf is set to the
  * empty string instead.
  *
  * @param child	the device to describe
  * @param buf		the buffer that receives the string
  * @param buflen	the size of @p buf
  *
  * @returns		the parent's result, or 0 if @p child has no
  *			parent
  */
 int
 bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
 {
 	device_t bus;
 
 	bus = device_get_parent(child);
 	if (bus != NULL)
 		return (BUS_CHILD_PNPINFO_STR(bus, child, buf, buflen));
 
 	*buf = '\0';
 	return (0);
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_LOCATION_STR().
  *
  * This function simply calls the BUS_CHILD_LOCATION_STR() method of the
  * parent of @p child. If @p child has no parent, @p buf is set to the
  * empty string instead.
  *
  * @param child	the device to describe
  * @param buf		the buffer that receives the string
  * @param buflen	the size of @p buf
  *
  * @returns		the parent's result, or 0 if @p child has no
  *			parent
  */
 int
 bus_child_location_str(device_t child, char *buf, size_t buflen)
 {
 	device_t bus;
 
 	bus = device_get_parent(child);
 	if (bus != NULL)
 		return (BUS_CHILD_LOCATION_STR(bus, child, buf, buflen));
 
 	*buf = '\0';
 	return (0);
 }
 
 /*
  * BUS_PRINT_CHILD() method for the root bus: print the standard child
  * header followed by a newline.  Returns the number of characters
  * printed.
  */
 static int
 root_print_child(device_t dev, device_t child)
 {
 	int	nheader, nnewline;
 
 	nheader = bus_print_child_header(dev, child);
 	nnewline = printf("\n");
 
 	return (nheader + nnewline);
 }
 
 /*
  * BUS_SETUP_INTR() method for the root bus.
  *
  * The parameter list mirrors the other BUS_SETUP_INTR() implementations
  * in this file (see bus_generic_setup_intr()) so that the function
  * reached through the method table has the same signature as the
  * method it implements.  None of the arguments are used; the function
  * never returns.
  */
 static int
 root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
     driver_intr_t *intr, void *arg, void **cookiep)
 {
 	/*
 	 * If an interrupt mapping gets to here something bad has happened.
 	 */
 	panic("root_setup_intr");
 }
 
 /*
  * If we get here, assume that the device is permanent and really is
  * present in the system.  Removable bus drivers are expected to intercept
  * this call long before it gets here.  We return -1 so that drivers that
  * really care can check vs -1 or some ERRNO returned higher in the food
  * chain.
  */
 static int
 root_child_present(device_t dev, device_t child)
 {
 	return (-1);
 }
 
 /*
  * Method table for the root bus driver.  Device shutdown/suspend/resume
  * and ivar access use the generic helpers; child printing, interrupt
  * setup and child presence use the root_*() functions above.  The
  * table ends with an all-zero entry.
  */
 static kobj_method_t root_methods[] = {
 	/* Device interface */
 	KOBJMETHOD(device_shutdown,	bus_generic_shutdown),
 	KOBJMETHOD(device_suspend,	bus_generic_suspend),
 	KOBJMETHOD(device_resume,	bus_generic_resume),
 
 	/* Bus interface */
 	KOBJMETHOD(bus_print_child,	root_print_child),
 	KOBJMETHOD(bus_read_ivar,	bus_generic_read_ivar),
 	KOBJMETHOD(bus_write_ivar,	bus_generic_write_ivar),
 	KOBJMETHOD(bus_setup_intr,	root_setup_intr),
 	KOBJMETHOD(bus_child_present,	root_child_present),
 
 	{ 0, 0 }
 };
 
 /*
  * Driver description for the root bus: its name, its method table and
  * a size of 1.
  * NOTE(review): the third initializer is presumably the softc size
  * field of driver_t; confirm against the driver_t definition.
  */
 static driver_t root_driver = {
 	"root",
 	root_methods,
 	1,			/* no softc */
 };
 
 /* The root bus device; created by root_bus_module_handler() on MOD_LOAD. */
 device_t	root_bus;
 /* The "root" devclass; looked up by root_bus_module_handler() on MOD_LOAD. */
 devclass_t	root_devclass;
 
 /*
  * Module event handler for the root bus.
  *
  * MOD_LOAD initialises the global device list, compiles the root
  * driver class, creates the "root" device, marks it attached and then
  * runs devinit().  MOD_SHUTDOWN shuts the root bus down through
  * device_shutdown().  Every other event is rejected with EOPNOTSUPP.
  */
 static int
 root_bus_module_handler(module_t mod, int what, void *arg)
 {
 	int error = 0;
 
 	switch (what) {
 	case MOD_LOAD:
 		TAILQ_INIT(&bus_data_devices);
 		kobj_class_compile((kobj_class_t) &root_driver);
 		root_bus = make_device(NULL, "root", 0);
 		root_bus->desc = "System root bus";
 		kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
 		root_bus->driver = &root_driver;
 		root_bus->state = DS_ATTACHED;
 		root_devclass = devclass_find_internal("root", 0, FALSE);
 		devinit();
 		break;
 
 	case MOD_SHUTDOWN:
 		device_shutdown(root_bus);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Module description that registers root_bus_module_handler() under the
  * name "rootbus", declared at SI_SUB_DRIVERS / SI_ORDER_FIRST.
  */
 static moduledata_t root_bus_mod = {
 	"rootbus",
 	root_bus_module_handler,
 	0
 };
 DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
 
 /**
  * @brief Automatically configure devices
  *
  * This function begins the autoconfiguration process by calling
  * device_probe_and_attach() for each child of the @c root0 device.
  * The result of each device_probe_and_attach() call is ignored.
  */
 void
 root_bus_configure(void)
 {
 	device_t dev;
 
 	PDEBUG(("."));
 
 	TAILQ_FOREACH(dev, &root_bus->children, link) {
 		device_probe_and_attach(dev);
 	}
 }
 
 /**
  * @brief Module handler for registering device drivers
  *
  * This module handler is used to automatically register device
  * drivers when modules are loaded. If @p what is MOD_LOAD, it calls
  * devclass_add_driver() for the driver described by the
  * driver_module_data structure pointed to by @p arg. MOD_UNLOAD and
  * MOD_QUIESCE call devclass_delete_driver() and
  * devclass_quiesce_driver() respectively. If the module data carries a
  * chained event handler, it is called as well: before the driver is
  * added on MOD_LOAD, and after a successful delete or quiesce on
  * MOD_UNLOAD and MOD_QUIESCE.
  *
  * @param mod		the module the event is for
  * @param what		the module event (MOD_LOAD, MOD_UNLOAD,
  *			MOD_QUIESCE)
  * @param arg		pointer to the struct driver_module_data
  *
  * @retval 0		success
  * @retval EOPNOTSUPP	@p what is not one of the handled events
  * @retval non-zero	the error returned by the devclass operation or
  *			by the chained event handler
  */
 int
 driver_module_handler(module_t mod, int what, void *arg)
 {
 	int error;
 	struct driver_module_data *dmd;
 	devclass_t bus_devclass;
 	kobj_class_t driver;
 
 	dmd = (struct driver_module_data *)arg;
 	/* Look up the devclass of the bus this driver attaches to. */
 	bus_devclass = devclass_find_internal(dmd->dmd_busname, 0, TRUE);
 	error = 0;
 
 	switch (what) {
 	case MOD_LOAD:
 		/*
 		 * NOTE(review): the result of the chained handler is
 		 * overwritten by the devclass_add_driver() call below, so
 		 * a failure reported by it is not propagated on MOD_LOAD.
 		 * Confirm whether that is intended.
 		 */
 		if (dmd->dmd_chainevh)
 			error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
 
 		driver = dmd->dmd_driver;
 		PDEBUG(("Loading module: driver %s on bus %s",
 		    DRIVERNAME(driver), dmd->dmd_busname));
 		error = devclass_add_driver(bus_devclass, driver);
 		if (error)
 			break;
 
 		/*
 		 * If the driver has any base classes, make the
 		 * devclass inherit from the devclass of the driver's
 		 * first base class. This will allow the system to
 		 * search for drivers in both devclasses for children
 		 * of a device using this driver.
 		 */
 		if (driver->baseclasses) {
 			const char *parentname;
 			parentname = driver->baseclasses[0]->name;
 			*dmd->dmd_devclass =
 				devclass_find_internal(driver->name,
 				    parentname, TRUE);
 		} else {
 			*dmd->dmd_devclass =
 				devclass_find_internal(driver->name, 0, TRUE);
 		}
 		break;
 
 	case MOD_UNLOAD:
 		PDEBUG(("Unloading module: driver %s from bus %s",
 		    DRIVERNAME(dmd->dmd_driver),
 		    dmd->dmd_busname));
 		error = devclass_delete_driver(bus_devclass,
 		    dmd->dmd_driver);
 
 		/* Only notify the chained handler once the driver is gone. */
 		if (!error && dmd->dmd_chainevh)
 			error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
 		break;
 	case MOD_QUIESCE:
 		PDEBUG(("Quiesce module: driver %s from bus %s",
 		    DRIVERNAME(dmd->dmd_driver),
 		    dmd->dmd_busname));
 		error = devclass_quiesce_driver(bus_devclass,
 		    dmd->dmd_driver);
 
 		/* Only notify the chained handler if the quiesce succeeded. */
 		if (!error && dmd->dmd_chainevh)
 			error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 #ifdef BUS_DEBUG
 
 /* the _short versions avoid iteration by not calling anything that prints
  * more than oneliners. I love oneliners.
  */
 
 /*
  * Print a one-line summary of dev at the given indent level: its unit,
  * description, whether it has a parent and children, its flag bits,
  * whether ivars and softc are set, and its busy count.  Does nothing
  * if dev is NULL.
  */
 static void
 print_device_short(device_t dev, int indent)
 {
 	if (!dev)
 		return;
 
 	indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
 	    dev->unit, dev->desc,
 	    (dev->parent? "":"no "),
 	    (TAILQ_EMPTY(&dev->children)? "no ":""),
 	    (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
 	    (dev->flags&DF_FIXEDCLASS? "fixed,":""),
 	    (dev->flags&DF_WILDCARD? "wildcard,":""),
 	    (dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
 	    (dev->flags&DF_REBID? "rebiddable,":""),
 	    (dev->ivars? "":"no "),
 	    (dev->softc? "":"no "),
 	    dev->busy));
 }
 
 /*
  * Print the one-line summary of `dev', followed by labelled one-line
  * summaries of its parent, driver and devclass, each one level deeper.
  * A NULL device prints nothing.
  */
 static void
 print_device(device_t dev, int indent)
 {
 	int deeper;
 
 	if (dev == NULL)
 		return;
 
 	deeper = indent + 1;
 	print_device_short(dev, indent);
 
 	indentprintf(("Parent:\n"));
 	print_device_short(dev->parent, deeper);
 
 	indentprintf(("Driver:\n"));
 	print_driver_short(dev->driver, deeper);
 
 	indentprintf(("Devclass:\n"));
 	print_devclass_short(dev->devclass, deeper);
 }
 
 /*
  * Print the one-line summary of `dev' and then, recursively, of every
  * device below it, indenting one extra level per generation.
  */
 void
 print_device_tree_short(device_t dev, int indent)
 {
 	device_t kid;
 
 	if (dev == NULL)
 		return;
 
 	print_device_short(dev, indent);
 
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		print_device_tree_short(kid, indent + 1);
 }
 
 /*
  * Print the full description of `dev' (see print_device()) and then,
  * recursively, of every device below it, indenting one extra level per
  * generation.
  */
 void
 print_device_tree(device_t dev, int indent)
 {
 	device_t kid;
 
 	if (dev == NULL)
 		return;
 
 	print_device(dev, indent);
 
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		print_device_tree(kid, indent + 1);
 }
 
 /*
  * Print a driver's name and softc size on one line.  A NULL driver
  * prints nothing.
  */
 static void
 print_driver_short(driver_t *driver, int indent)
 {
 	if (driver != NULL) {
 		indentprintf(("driver %s: softc size = %zd\n",
 		    driver->name, driver->size));
 	}
 }
 
 /*
  * Full description of a driver.  At present this is just the one-line
  * summary produced by print_driver_short().
  */
 static void
 print_driver(driver_t *driver, int indent)
 {
 	if (driver != NULL)
 		print_driver_short(driver, indent);
 }
 
 
 /*
  * Print every driver on a driver list, one print_driver() call per
  * list entry.  The list head is received by value.
  */
 static void
 print_driver_list(driver_list_t drivers, int indent)
 {
 	driverlink_t dl;
 
 	for (dl = TAILQ_FIRST(&drivers); dl != NULL;
 	    dl = TAILQ_NEXT(dl, link))
 		print_driver(dl->driver, indent);
 }
 
 /*
  * Print a devclass's name and its maxunit value on one line.  A NULL
  * devclass prints nothing.
  */
 static void
 print_devclass_short(devclass_t dc, int indent)
 {
 	if (dc != NULL) {
 		indentprintf(("devclass %s: max units = %d\n",
 		    dc->name, dc->maxunit));
 	}
 }
 
 /*
  * Print a devclass in full: its one-line summary, then its drivers, then
  * every populated slot of its device table, each one level deeper.
  * A NULL devclass prints nothing.
  */
 static void
 print_devclass(devclass_t dc, int indent)
 {
 	int unit;
 
 	if (dc == NULL)
 		return;
 
 	print_devclass_short(dc, indent);
 
 	indentprintf(("Drivers:\n"));
 	print_driver_list(dc->drivers, indent + 1);
 
 	indentprintf(("Devices:\n"));
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		/* Skip empty slots in the device table. */
 		if (dc->devices[unit] == NULL)
 			continue;
 		print_device(dc->devices[unit], indent + 1);
 	}
 }
 
 void
 print_devclass_list_short(void)
 {
 	devclass_t dc;
 
 	printf("Short listing of devclasses, drivers & devices:\n");
 	TAILQ_FOREACH(dc, &devclasses, link) {
 		print_devclass_short(dc, 0);
 	}
 }
 
 void
 print_devclass_list(void)
 {
 	devclass_t dc;
 
 	printf("Full listing of devclasses, drivers & devices:\n");
 	TAILQ_FOREACH(dc, &devclasses, link) {
 		print_devclass(dc, 0);
 	}
 }
 
 #endif
 
 /*
  * User-space access to the device tree.
  *
  * We implement a small set of nodes:
  *
  * hw.bus.info			Reads a struct u_businfo holding the interface
  *				version and the current generation count.
  * hw.bus.devices		Reads the entire device tree in flat space.
  * hw.bus.rman			Resource manager interface
  *
  * We might like to add the ability to scan devclasses and/or drivers to
  * determine what else is currently loaded/available.
  */
 
 static int
 sysctl_bus(SYSCTL_HANDLER_ARGS)
 {
 	struct u_businfo	ubus;
 
 	ubus.ub_version = BUS_USER_VERSION;
 	ubus.ub_generation = bus_data_generation;
 
 	return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
 }
 SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
     "bus-related data");
 
 /*
  * Sysctl handler for hw.bus.devices: copies out one struct u_device
  * describing a single entry of the global bus_data_devices list.
  *
  * The trailing OID components select the entry:
  *	name[0]	generation count the caller believes is current
  *	name[1]	zero-based index into bus_data_devices
  *
  * Returns EINVAL if there are not exactly two components or the
  * generation does not match, ENOENT if the index is past the end of the
  * list, otherwise the result of SYSCTL_OUT().
  */
 static int
 sysctl_devices(SYSCTL_HANDLER_ARGS)
 {
 	int			*name = (int *)arg1;
 	u_int			namelen = arg2;
 	int			index;
 	struct device		*dev;
 	struct u_device		udev;	/* XXX this is a bit big */
 	int			error;
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	/* A non-zero return means the caller's generation does not match. */
 	if (bus_data_generation_check(name[0]))
 		return (EINVAL);
 
 	index = name[1];
 
 	/*
 	 * Scan the list of devices, looking for the requested index.
 	 * If the list runs out first, dev is left NULL.
 	 */
 	TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
 		if (index-- == 0)
 			break;
 	}
 	if (dev == NULL)
 		return (ENOENT);
 
 	/*
 	 * Populate the return array.  The structure is zeroed first so that
 	 * no bytes left unset below (string tails, any padding) carry stale
 	 * stack contents out through SYSCTL_OUT().
 	 */
+	bzero(&udev, sizeof(udev));
 	udev.dv_handle = (uintptr_t)dev;
 	udev.dv_parent = (uintptr_t)dev->parent;
 	if (dev->nameunit == NULL)
 		udev.dv_name[0] = '\0';
 	else
 		strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
 
 	if (dev->desc == NULL)
 		udev.dv_desc[0] = '\0';
 	else
 		strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
 	if (dev->driver == NULL || dev->driver->name == NULL)
 		udev.dv_drivername[0] = '\0';
 	else
 		strlcpy(udev.dv_drivername, dev->driver->name,
 		    sizeof(udev.dv_drivername));
 	/*
 	 * Start with empty strings; the bus_child_*_str() calls below are
 	 * given the buffers to fill in (their return values are ignored).
 	 */
 	udev.dv_pnpinfo[0] = '\0';
 	udev.dv_location[0] = '\0';
 	bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
 	bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
 	udev.dv_devflags = dev->devflags;
 	udev.dv_flags = dev->flags;
 	udev.dv_state = dev->state;
 	error = SYSCTL_OUT(req, &udev, sizeof(udev));
 	return (error);
 }
 
 SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
     "system device tree");
 
 /*
  * Compare a caller-supplied generation number with the current
  * bus_data_generation.  Returns 0 when they match and 1 when they differ.
  */
 int
 bus_data_generation_check(int generation)
 {
 	/* XXX generate optimised lists here? */
 	return (generation == bus_data_generation ? 0 : 1);
 }
 
 /*
  * Advance bus_data_generation by one.
  */
 void
 bus_data_generation_update(void)
 {
 
 	bus_data_generation += 1;
 }
 
 /*
  * Release resource `r' on behalf of `dev', using the resource's own rid
  * (rman_get_rid()) for the bus_release_resource() call.  A NULL resource
  * is accepted and reported as success (0).
  */
 int
 bus_free_resource(device_t dev, int type, struct resource *r)
 {
 	int rid;
 
 	if (r != NULL) {
 		rid = rman_get_rid(r);
 		return (bus_release_resource(dev, type, rid, r));
 	}
 	return (0);
 }
Index: head/sys/kern/subr_rman.c
===================================================================
--- head/sys/kern/subr_rman.c	(revision 145952)
+++ head/sys/kern/subr_rman.c	(revision 145953)
@@ -1,808 +1,810 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The kernel resource manager.  This code is responsible for keeping track
  * of hardware resources which are apportioned out to various drivers.
  * It does not actually assign those resources, and it is not expected
  * that end-device drivers will call into this code directly.  Rather,
  * the code which implements the buses that those devices are attached to,
  * and the code which manages CPU resources, will call this code, and the
  * end-device drivers will make upcalls to that code to actually perform
  * the allocation.
  *
  * There are two sorts of resources managed by this code.  The first is
  * the more familiar array (RMAN_ARRAY) type; resources in this class
  * consist of a sequence of individually-allocatable objects which have
  * been numbered in some well-defined order.  Most of the resources
  * are of this type, as it is the most familiar.  The second type is
  * called a gauge (RMAN_GAUGE), and models fungible resources (i.e.,
  * resources in which each instance is indistinguishable from every
  * other instance).  The principal anticipated application of gauges
  * is in the context of power consumption, where a bus may have a specific
  * power budget which all attached devices share.  RMAN_GAUGE is not
  * implemented yet.
  *
  * For array resources, we make one simplifying assumption: two clients
  * sharing the same resource must use the same range of indices.  That
  * is to say, sharing of overlapping-but-not-identical regions is not
  * permitted.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define __RMAN_RESOURCE_VISIBLE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>		/* XXX debugging */
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/sysctl.h>
 
 int     rman_debug = 0;
 TUNABLE_INT("debug.rman_debug", &rman_debug);
 SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RW,
     &rman_debug, 0, "rman debug");
 
 #define DPRINTF(params) if (rman_debug) printf params
 
 static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager");
 
 struct	rman_head rman_head;
 static	struct mtx rman_mtx; /* mutex to protect rman_head */
 static	int int_rman_activate_resource(struct rman *rm, struct resource *r,
 				       struct resource **whohas);
 static	int int_rman_deactivate_resource(struct resource *r);
 static	int int_rman_release_resource(struct rman *rm, struct resource *r);
 
 /*
  * Prepare a resource manager for use.
  *
  * On the first call the global list of managers (rman_head) and its mutex
  * are initialised.  Then rm's region list is initialised, a mutex is
  * allocated and initialised for it, and rm is appended to rman_head.
  *
  * Returns 0 on success or ENOMEM if the mutex cannot be allocated.
  * Panics if rm->rm_type is RMAN_UNINIT or RMAN_GAUGE.
  */
 int
 rman_init(struct rman *rm)
 {
 	static int global_setup_done;
 
 	if (!global_setup_done) {
 		global_setup_done = 1;
 		TAILQ_INIT(&rman_head);
 		mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF);
 	}
 
 	if (rm->rm_type == RMAN_UNINIT)
 		panic("rman_init");
 	else if (rm->rm_type == RMAN_GAUGE)
 		panic("implement RMAN_GAUGE");
 
 	TAILQ_INIT(&rm->rm_list);
 	rm->rm_mtx = malloc(sizeof(*rm->rm_mtx), M_RMAN, M_NOWAIT | M_ZERO);
 	if (rm->rm_mtx == NULL)
 		return (ENOMEM);
 	mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF);
 
 	/* Publish the manager on the global list. */
 	mtx_lock(&rman_mtx);
 	TAILQ_INSERT_TAIL(&rman_head, rm, rm_link);
 	mtx_unlock(&rman_mtx);
 
 	return (0);
 }
 
 /*
  * Hand the range [start, end] to resource manager `rm'.
  *
  * A new, zeroed region record is allocated and inserted into rm's list
  * in front of the first existing region whose end is not below `start',
  * or at the tail if there is none.  Returns 0 on success or ENOMEM if the
  * record cannot be allocated.
  *
  * NB: nothing here checks for overlap, so this interface is not robust
  * against programming errors which add multiple copies of the same
  * region.
  */
 int
 rman_manage_region(struct rman *rm, u_long start, u_long end)
 {
 	struct resource *region, *pos;
 
 	DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
 	    rm->rm_descr, start, end));
 	region = malloc(sizeof(*region), M_RMAN, M_NOWAIT | M_ZERO);
 	if (region == NULL)
 		return (ENOMEM);
 	region->r_rm = rm;
 	region->r_start = start;
 	region->r_end = end;
 
 	mtx_lock(rm->rm_mtx);
 
 	/* Find the insertion point; pos is NULL if we ran off the end. */
 	TAILQ_FOREACH(pos, &rm->rm_list, r_link) {
 		if (pos->r_end >= region->r_start)
 			break;
 	}
 
 	if (pos != NULL) {
 		TAILQ_INSERT_BEFORE(pos, region, r_link);
 	} else {
 		TAILQ_INSERT_TAIL(&rm->rm_list, region, r_link);
 	}
 
 	mtx_unlock(rm->rm_mtx);
 
 	return (0);
 }
 
 int
 rman_fini(struct rman *rm)
 {
 	struct resource *r;
 
 	mtx_lock(rm->rm_mtx);
 	TAILQ_FOREACH(r, &rm->rm_list, r_link) {
 		if (r->r_flags & RF_ALLOCATED) {
 			mtx_unlock(rm->rm_mtx);
 			return EBUSY;
 		}
 	}
 
 	/*
 	 * There really should only be one of these if we are in this
 	 * state and the code is working properly, but it can't hurt.
 	 */
 	while (!TAILQ_EMPTY(&rm->rm_list)) {
 		r = TAILQ_FIRST(&rm->rm_list);
 		TAILQ_REMOVE(&rm->rm_list, r, r_link);
 		free(r, M_RMAN);
 	}
 	mtx_unlock(rm->rm_mtx);
 	mtx_lock(&rman_mtx);
 	TAILQ_REMOVE(&rman_head, rm, rm_link);
 	mtx_unlock(&rman_mtx);
 	mtx_destroy(rm->rm_mtx);
 	free(rm->rm_mtx, M_RMAN);
 
 	return 0;
 }
 
 /*
  * Reserve `count' units from resource manager `rm' somewhere inside
  * [start, end] on behalf of `dev'.
  *
  * The search runs in two passes, both under rm->rm_mtx:
  *
  *  1. Unallocated regions.  The candidate start is rounded up to the
  *     alignment mask derived from RF_ALIGNMENT(flags) and pushed forward
  *     when the range would straddle a `bound'-sized boundary.  If the
  *     region is exactly `count' long it is handed out as is; otherwise it
  *     is split in two or three pieces and the piece holding the request
  *     is returned.
  *  2. Only if `flags' contains RF_SHAREABLE or RF_TIMESHARE: a region
  *     whose flags include all requested flags, which lies inside
  *     [start, end], is exactly `count' long and satisfies the alignment
  *     and boundary masks.  A new record is created and linked onto that
  *     region's share list.
  *
  * RF_ACTIVE is stripped from `flags' before the search; if it was set,
  * the reserved resource is activated before returning, and on activation
  * failure it is released again and the call fails.
  *
  * Returns the reserved resource, or 0 (NULL) if nothing suitable was
  * found, an allocation failed, or activation failed.
  */
 struct resource *
 rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end,
 		      u_long count, u_long bound,  u_int flags,
 		      struct device *dev)
 {
 	u_int	want_activate;
 	struct	resource *r, *s, *rv;
 	u_long	rstart, rend, amask, bmask;
 
 	rv = 0;
 
 	DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length "
 	       "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count,
 	       flags, dev == NULL ? "<null>" : device_get_nameunit(dev)));
 	/* Remember the activation request, but search without RF_ACTIVE. */
 	want_activate = (flags & RF_ACTIVE);
 	flags &= ~RF_ACTIVE;
 
 	mtx_lock(rm->rm_mtx);
 
 	/* Skip every region that ends before the requested range begins. */
 	for (r = TAILQ_FIRST(&rm->rm_list); 
 	     r && r->r_end < start;
 	     r = TAILQ_NEXT(r, r_link))
 		;
 
 	if (r == NULL) {
 		DPRINTF(("could not find a region\n"));
 		goto out;
 	}
 
 	amask = (1ul << RF_ALIGNMENT(flags)) - 1;
 	/* If bound is 0, bmask will also be 0 */
 	bmask = ~(bound - 1);
 	/*
 	 * First try to find an acceptable totally-unshared region.
 	 */
 	for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
 		DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end));
 		/* Stop once a request starting at this region cannot fit. */
 		if (s->r_start + count - 1 > end) {
 			DPRINTF(("s->r_start (%#lx) + count - 1> end (%#lx)\n",
 			    s->r_start, end));
 			break;
 		}
 		if (s->r_flags & RF_ALLOCATED) {
 			DPRINTF(("region is allocated\n"));
 			continue;
 		}
 		rstart = ulmax(s->r_start, start);
 		/*
 		 * Try to find a region by adjusting to boundary and alignment
 		 * until both conditions are satisfied. This is not an optimal
 		 * algorithm, but in most cases it isn't really bad, either.
 		 */
 		do {
 			rstart = (rstart + amask) & ~amask;
 			if (((rstart ^ (rstart + count - 1)) & bmask) != 0)
 				rstart += bound - (rstart & ~bmask);
 		} while ((rstart & amask) != 0 && rstart < end &&
 		    rstart < s->r_end);
 		rend = ulmin(s->r_end, ulmax(rstart + count - 1, end));
 		if (rstart > rend) {
 			DPRINTF(("adjusted start exceeds end\n"));
 			continue;
 		}
 		DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
 		       rstart, rend, (rend - rstart + 1), count));
 
 		if ((rend - rstart + 1) >= count) {
 			DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n",
 			       rstart, rend, (rend - rstart + 1)));
 			/* Exact fit: reuse the existing record, no split. */
 			if ((s->r_end - s->r_start + 1) == count) {
 				DPRINTF(("candidate region is entire chunk\n"));
 				rv = s;
 				rv->r_flags |= RF_ALLOCATED | flags;
 				rv->r_dev = dev;
 				goto out;
 			}
 
 			/*
 			 * If s->r_start < rstart and
 			 *    s->r_end > rstart + count - 1, then
 			 * we need to split the region into three pieces
 			 * (the middle one will get returned to the user).
 			 * Otherwise, we are allocating at either the
 			 * beginning or the end of s, so we only need to
 			 * split it in two.  The first case requires
 			 * two new allocations; the second requires but one.
 			 */
 			rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
 			if (rv == 0)
 				goto out;
 			rv->r_start = rstart;
 			rv->r_end = rstart + count - 1;
 			rv->r_flags = flags | RF_ALLOCATED;
 			rv->r_dev = dev;
 			rv->r_rm = rm;
 			
 			if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
 				DPRINTF(("splitting region in three parts: "
 				       "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
 				       s->r_start, rv->r_start - 1,
 				       rv->r_start, rv->r_end,
 				       rv->r_end + 1, s->r_end));
 				/*
 				 * We are allocating in the middle.  `r' is
 				 * reused here for the new trailing piece; `s'
 				 * shrinks to become the leading piece.
 				 */
 				r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO);
 				if (r == 0) {
 					free(rv, M_RMAN);
 					rv = 0;
 					goto out;
 				}
 				r->r_start = rv->r_end + 1;
 				r->r_end = s->r_end;
 				r->r_flags = s->r_flags;
 				r->r_rm = rm;
 				s->r_end = rv->r_start - 1;
 				TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
 						     r_link);
 				TAILQ_INSERT_AFTER(&rm->rm_list, rv, r,
 						     r_link);
 			} else if (s->r_start == rv->r_start) {
 				DPRINTF(("allocating from the beginning\n"));
 				/*
 				 * We are allocating at the beginning.
 				 */
 				s->r_start = rv->r_end + 1;
 				TAILQ_INSERT_BEFORE(s, rv, r_link);
 			} else {
 				DPRINTF(("allocating at the end\n"));
 				/*
 				 * We are allocating at the end.
 				 */
 				s->r_end = rv->r_start - 1;
 				TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
 						     r_link);
 			}
 			goto out;
 		}
 	}
 
 	/*
 	 * Now find an acceptable shared region, if the client's requirements
 	 * allow sharing.  By our implementation restriction, a candidate
 	 * region must match exactly by both size and sharing type in order
 	 * to be considered compatible with the client's request.  (The
 	 * former restriction could probably be lifted without too much
 	 * additional work, but this does not seem warranted.)
 	 */
 	DPRINTF(("no unshared regions found\n"));
 	if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0)
 		goto out;
 
 	for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
 		if (s->r_start > end)
 			break;
 		if ((s->r_flags & flags) != flags)
 			continue;
 		/*
 		 * NOTE(review): rstart and rend are assigned here but not
 		 * read again in this loop or after it.
 		 */
 		rstart = ulmax(s->r_start, start);
 		rend = ulmin(s->r_end, ulmax(start + count - 1, end));
 		if (s->r_start >= start && s->r_end <= end
 		    && (s->r_end - s->r_start + 1) == count &&
 		    (s->r_start & amask) == 0 &&
 		    ((s->r_start ^ s->r_end) & bmask) == 0) {
 			rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
 			if (rv == 0)
 				goto out;
 			rv->r_start = s->r_start;
 			rv->r_end = s->r_end;
 			rv->r_flags = s->r_flags & 
 				(RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE);
 			rv->r_dev = dev;
 			rv->r_rm = rm;
 			/*
 			 * First additional sharer: create the share list and
 			 * put the existing holder on it, marked as the one
 			 * that is linked on rm_list.
 			 */
 			if (s->r_sharehead == 0) {
 				s->r_sharehead = malloc(sizeof *s->r_sharehead,
 						M_RMAN, M_NOWAIT | M_ZERO);
 				if (s->r_sharehead == 0) {
 					free(rv, M_RMAN);
 					rv = 0;
 					goto out;
 				}
 				LIST_INIT(s->r_sharehead);
 				LIST_INSERT_HEAD(s->r_sharehead, s, 
 						 r_sharelink);
 				s->r_flags |= RF_FIRSTSHARE;
 			}
 			rv->r_sharehead = s->r_sharehead;
 			LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink);
 			goto out;
 		}
 	}
 
 	/*
 	 * We couldn't find anything.
 	 */
 out:
 	/*
 	 * If the user specified RF_ACTIVE in the initial flags,
 	 * which is reflected in `want_activate', we attempt to atomically
 	 * activate the resource.  If this fails, we release the resource
 	 * and indicate overall failure.  (This behavior probably doesn't
 	 * make sense for RF_TIMESHARE-type resources.)
 	 */
 	if (rv && want_activate) {
 		struct resource *whohas;
 		if (int_rman_activate_resource(rm, rv, &whohas)) {
 			int_rman_release_resource(rm, rv);
 			rv = 0;
 		}
 	}
 			
 	mtx_unlock(rm->rm_mtx);
 	return (rv);
 }
 
 /*
  * Reserve a resource with no boundary constraint: forwards to
  * rman_reserve_resource_bound() with a `bound' of 0.
  */
 struct resource *
 rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count,
 		      u_int flags, struct device *dev)
 {
 	struct resource *res;
 
 	res = rman_reserve_resource_bound(rm, start, end, count, 0, flags, dev);
 	return (res);
 }
 
 static int
 int_rman_activate_resource(struct rman *rm, struct resource *r,
 			   struct resource **whohas)
 {
 	struct resource *s;
 	int ok;
 
 	/*
 	 * If we are not timesharing, then there is nothing much to do.
 	 * If we already have the resource, then there is nothing at all to do.
 	 * If we are not on a sharing list with anybody else, then there is
 	 * little to do.
 	 */
 	if ((r->r_flags & RF_TIMESHARE) == 0
 	    || (r->r_flags & RF_ACTIVE) != 0
 	    || r->r_sharehead == 0) {
 		r->r_flags |= RF_ACTIVE;
 		return 0;
 	}
 
 	ok = 1;
 	for (s = LIST_FIRST(r->r_sharehead); s && ok;
 	     s = LIST_NEXT(s, r_sharelink)) {
 		if ((s->r_flags & RF_ACTIVE) != 0) {
 			ok = 0;
 			*whohas = s;
 		}
 	}
 	if (ok) {
 		r->r_flags |= RF_ACTIVE;
 		return 0;
 	}
 	return EBUSY;
 }
 
 int
 rman_activate_resource(struct resource *r)
 {
 	int rv;
 	struct resource *whohas;
 	struct rman *rm;
 
 	rm = r->r_rm;
 	mtx_lock(rm->rm_mtx);
 	rv = int_rman_activate_resource(rm, r, &whohas);
 	mtx_unlock(rm->rm_mtx);
 	return rv;
 }
 
 /*
  * Activate resource `r', sleeping until it becomes available.
  *
  * Loops trying int_rman_activate_resource().  While that reports EBUSY,
  * the current holder is flagged RF_WANTED and the caller sleeps on the
  * share-list head with priority `pri' and timeout `timo'; the matching
  * wakeup() is issued by int_rman_deactivate_resource().
  *
  * Locking contract (asymmetric):
  *  - when activation returns anything other than EBUSY (including
  *    success), that value is returned WITH rm->rm_mtx STILL HELD;
  *  - when msleep() returns non-zero, the mutex is dropped and that value
  *    is returned.
  *
  * Panics if the resource is busy but has no share list.
  */
 int
 rman_await_resource(struct resource *r, int pri, int timo)
 {
 	int	rv;
 	struct	resource *whohas;
 	struct	rman *rm;
 
 	rm = r->r_rm;
 	mtx_lock(rm->rm_mtx);
 	for (;;) {
 		rv = int_rman_activate_resource(rm, r, &whohas);
 		if (rv != EBUSY)
 			return (rv);	/* returns with mutex held */
 
 		if (r->r_sharehead == 0)
 			panic("rman_await_resource");
 		/* whohas was filled in by the EBUSY return above. */
 		whohas->r_flags |= RF_WANTED;
 		rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo);
 		if (rv) {
 			mtx_unlock(rm->rm_mtx);
 			return (rv);
 		}
 	}
 }
 
 /*
  * Clear RF_ACTIVE on `r'.  If RF_WANTED was set, clear it as well and
  * wake any thread sleeping on the resource's share-list head.
  * Always returns 0.  Takes no lock itself.
  */
 static int
 int_rman_deactivate_resource(struct resource *r)
 {
 	int had_waiter;
 
 	had_waiter = (r->r_flags & RF_WANTED) != 0;
 	r->r_flags &= ~(RF_ACTIVE | RF_WANTED);
 	if (had_waiter)
 		wakeup(r->r_sharehead);
 
 	return (0);
 }
 
 /*
  * Locked wrapper around int_rman_deactivate_resource(): deactivates `r'
  * while holding the owning manager's mutex.  Always returns 0.
  */
 int
 rman_deactivate_resource(struct resource *r)
 {
 	struct rman *owner = r->r_rm;
 
 	mtx_lock(owner->rm_mtx);
 	(void)int_rman_deactivate_resource(r);
 	mtx_unlock(owner->rm_mtx);
 
 	return (0);
 }
 
 /*
  * Give resource `r' back to manager `rm'.
  *
  * The resource is deactivated first if it is active.  Then:
  *  - if it is on a share list, it is unlinked from that list (handing
  *    its place on rm_list to another sharer when it carried
  *    RF_FIRSTSHARE), the share list is freed once only one sharer is
  *    left, and the record is freed;
  *  - otherwise it is merged with whichever neighbours on rm_list are
  *    unallocated and exactly adjacent, and the record is freed; if it can
  *    be merged with neither, the record stays on the list with
  *    RF_ALLOCATED cleared and nothing is freed.
  *
  * Always returns 0.  Takes no lock itself; the callers visible in this
  * file hold rm->rm_mtx.
  */
 static int
 int_rman_release_resource(struct rman *rm, struct resource *r)
 {
 	struct	resource *s, *t;
 
 	if (r->r_flags & RF_ACTIVE)
 		int_rman_deactivate_resource(r);
 
 	/*
 	 * Check for a sharing list first.  If there is one, then we don't
 	 * have to think as hard.
 	 */
 	if (r->r_sharehead) {
 		/*
 		 * If a sharing list exists, then we know there are at
 		 * least two sharers.
 		 *
 		 * If we are the sharer linked on the main list (rm_list),
 		 * appoint someone else.
 		 */
 		LIST_REMOVE(r, r_sharelink);
 		/* s is a remaining sharer; relies on the invariant above. */
 		s = LIST_FIRST(r->r_sharehead);
 		if (r->r_flags & RF_FIRSTSHARE) {
 			s->r_flags |= RF_FIRSTSHARE;
 			TAILQ_INSERT_BEFORE(r, s, r_link);
 			TAILQ_REMOVE(&rm->rm_list, r, r_link);
 		}
 
 		/*
 		 * Make sure that the sharing list goes away completely
 		 * if the resource is no longer being shared at all.
 		 */
 		if (LIST_NEXT(s, r_sharelink) == 0) {
 			free(s->r_sharehead, M_RMAN);
 			s->r_sharehead = 0;
 			s->r_flags &= ~RF_FIRSTSHARE;
 		}
 		goto out;
 	}
 
 	/*
 	 * Look at the adjacent resources in the list and see if our
 	 * segment can be merged with any of them.  If either of the
 	 * resources is allocated or is not exactly adjacent then they
 	 * cannot be merged with our segment.  After these checks, s and t
 	 * are non-NULL only for neighbours that can be merged.
 	 */
 	s = TAILQ_PREV(r, resource_head, r_link);
 	if (s != NULL && ((s->r_flags & RF_ALLOCATED) != 0 ||
 	    s->r_end + 1 != r->r_start))
 		s = NULL;
 	t = TAILQ_NEXT(r, r_link);
 	if (t != NULL && ((t->r_flags & RF_ALLOCATED) != 0 ||
 	    r->r_end + 1 != t->r_start))
 		t = NULL;
 
 	if (s != NULL && t != NULL) {
 		/*
 		 * Merge all three segments: s grows to cover r and t.
 		 */
 		s->r_end = t->r_end;
 		TAILQ_REMOVE(&rm->rm_list, r, r_link);
 		TAILQ_REMOVE(&rm->rm_list, t, r_link);
 		free(t, M_RMAN);
 	} else if (s != NULL) {
 		/*
 		 * Merge previous segment with ours.
 		 */
 		s->r_end = r->r_end;
 		TAILQ_REMOVE(&rm->rm_list, r, r_link);
 	} else if (t != NULL) {
 		/*
 		 * Merge next segment with ours.
 		 */
 		t->r_start = r->r_start;
 		TAILQ_REMOVE(&rm->rm_list, r, r_link);
 	} else {
 		/*
 		 * At this point, we know there is nothing we
 		 * can potentially merge with, because on each
 		 * side, there is either nothing there or what is
 		 * there is still allocated.  In that case, we don't
 		 * want to remove r from the list; we simply want to
 		 * change it to an unallocated region and return
 		 * without freeing anything.
 		 */
 		r->r_flags &= ~RF_ALLOCATED;
 		return 0;
 	}
 
 out:
 	/* r is no longer on any list here; release its record. */
 	free(r, M_RMAN);
 	return 0;
 }
 
 int
 rman_release_resource(struct resource *r)
 {
 	int	rv;
 	struct	rman *rm = r->r_rm;
 
 	mtx_lock(rm->rm_mtx);
 	rv = int_rman_release_resource(rm, r);
 	mtx_unlock(rm->rm_mtx);
 	return (rv);
 }
 
 /*
  * Convert a size into the alignment flag value for the smallest power of
  * two that is not below it, i.e. RF_ALIGNMENT_LOG2(ceil(log2(size))).
  * A size of 0 or 1 yields RF_ALIGNMENT_LOG2(0).
  */
 uint32_t
 rman_make_alignment_flags(uint32_t size)
 {
 	int	i;
 
 	/*
 	 * Find the highest bit set, and add one if more than one bit
 	 * set.  We're effectively computing the ceil(log2(size)) here.
 	 *
 	 * The shifted constant is unsigned: shifting a signed 1 left by 31
 	 * moves a bit into the sign position, which is undefined behaviour.
 	 */
 	for (i = 31; i > 0; i--)
 		if (((uint32_t)1 << i) & size)
 			break;
 	/* Any bit besides bit i means size is not a power of two. */
 	if (~((uint32_t)1 << i) & size)
 		i++;
 
 	return(RF_ALIGNMENT_LOG2(i));
 }
 
 /* Return the first value of the range covered by `r' (its r_start). */
 u_long
 rman_get_start(struct resource *r)
 {
 	u_long first;
 
 	first = r->r_start;
 	return (first);
 }
 
 /* Return the last value of the range covered by `r' (its r_end). */
 u_long
 rman_get_end(struct resource *r)
 {
 	u_long last;
 
 	last = r->r_end;
 	return (last);
 }
 
 /*
  * Return the number of units covered by `r'.  Both ends of the range are
  * inclusive, hence the +1.
  */
 u_long
 rman_get_size(struct resource *r)
 {
 	u_long span;
 
 	span = r->r_end - r->r_start;
 	return (span + 1);
 }
 
 /* Return the flag word (r_flags) of `r'. */
 u_int
 rman_get_flags(struct resource *r)
 {
 	u_int flags;
 
 	flags = r->r_flags;
 	return (flags);
 }
 
 /* Store `v' in the r_virtual field of `r'. */
 void
 rman_set_virtual(struct resource *r, void *v)
 {
 
 	r->r_virtual = v;
 }
 
 /* Return the pointer stored in the r_virtual field of `r'. */
 void *
 rman_get_virtual(struct resource *r)
 {
 	void *va;
 
 	va = r->r_virtual;
 	return (va);
 }
 
 /* Store bus space tag `t' in the r_bustag field of `r'. */
 void
 rman_set_bustag(struct resource *r, bus_space_tag_t t)
 {
 
 	r->r_bustag = t;
 }
 
 /* Return the bus space tag stored in the r_bustag field of `r'. */
 bus_space_tag_t
 rman_get_bustag(struct resource *r)
 {
 	bus_space_tag_t tag;
 
 	tag = r->r_bustag;
 	return (tag);
 }
 
 /* Store bus space handle `h' in the r_bushandle field of `r'. */
 void
 rman_set_bushandle(struct resource *r, bus_space_handle_t h)
 {
 
 	r->r_bushandle = h;
 }
 
 /* Return the bus space handle stored in the r_bushandle field of `r'. */
 bus_space_handle_t
 rman_get_bushandle(struct resource *r)
 {
 	bus_space_handle_t handle;
 
 	handle = r->r_bushandle;
 	return (handle);
 }
 
 /* Store `rid' in the r_rid field of `r'. */
 void
 rman_set_rid(struct resource *r, int rid)
 {
 
 	r->r_rid = rid;
 }
 
 /*
  * Overwrite the r_start field of `r' with `start'.  Only the field is
  * written; the manager's region list is not touched.
  */
 void
 rman_set_start(struct resource *r, u_long start)
 {
 
 	r->r_start = start;
 }
 
 /*
  * Overwrite the r_end field of `r' with `end'.  Only the field is
  * written; the manager's region list is not touched.
  */
 void
 rman_set_end(struct resource *r, u_long end)
 {
 
 	r->r_end = end;
 }
 
 /* Return the value stored in the r_rid field of `r'. */
 int
 rman_get_rid(struct resource *r)
 {
 	int rid;
 
 	rid = r->r_rid;
 	return (rid);
 }
 
 /* Return the device recorded in the r_dev field of `r'. */
 struct device *
 rman_get_device(struct resource *r)
 {
 	struct device *owner;
 
 	owner = r->r_dev;
 	return (owner);
 }
 
 /* Record `dev' in the r_dev field of `r'. */
 void
 rman_set_device(struct resource *r, struct device *dev)
 {
 
 	r->r_dev = dev;
 }
 
 /*
  * Sysctl interface for scanning the resource lists (hw.bus.rman).
  *
  * The trailing OID components are:
  *	name[0]	generation count the caller believes is current
  *	name[1]	zero-based index into the list of resource managers
  *	name[2]	zero-based index into that manager's resource list, or
  *		-1 to request a description of the manager itself
  *
  * Copies out a struct u_rman (index -1) or a struct u_resource.
  * Returns EINVAL if there are not exactly three components or the
  * generation does not match, ENOENT if either index is past the end of
  * its list, otherwise the result of SYSCTL_OUT().
  *
  * NOTE(review): rman_head and rm_list are walked here without taking
  * rman_mtx or rm->rm_mtx -- confirm that callers are serialized some
  * other way.
  */
 static int
 sysctl_rman(SYSCTL_HANDLER_ARGS)
 {
 	int			*name = (int *)arg1;
 	u_int			namelen = arg2;
 	int			rman_idx, res_idx;
 	struct rman		*rm;
 	struct resource		*res;
 	struct u_rman		urm;
 	struct u_resource	ures;
 	int			error;
 
 	if (namelen != 3)
 		return (EINVAL);
 
 	/* A non-zero return means the caller's generation does not match. */
 	if (bus_data_generation_check(name[0]))
 		return (EINVAL);
 	rman_idx = name[1];
 	res_idx = name[2];
 
 	/*
 	 * Find the indexed resource manager.  If the list runs out first,
 	 * rm is left NULL.
 	 */
 	TAILQ_FOREACH(rm, &rman_head, rm_link) {
 		if (rman_idx-- == 0)
 			break;
 	}
 	if (rm == NULL)
 		return (ENOENT);
 
 	/*
 	 * If the resource index is -1, we want details on the
 	 * resource manager.  The structure is zeroed first so that bytes
 	 * not set below do not carry stale stack contents out.
 	 */
 	if (res_idx == -1) {
+		bzero(&urm, sizeof(urm));
 		urm.rm_handle = (uintptr_t)rm;
 		strlcpy(urm.rm_descr, rm->rm_descr, RM_TEXTLEN);
 		urm.rm_start = rm->rm_start;
 		urm.rm_size = rm->rm_end - rm->rm_start + 1;
 		urm.rm_type = rm->rm_type;
 
 		error = SYSCTL_OUT(req, &urm, sizeof(urm));
 		return (error);
 	}
 
 	/*
 	 * Find the indexed resource and return it.  As above, the outgoing
 	 * structure is zeroed before it is filled in.
 	 */
 	TAILQ_FOREACH(res, &rm->rm_list, r_link) {
 		if (res_idx-- == 0) {
+			bzero(&ures, sizeof(ures));
 			ures.r_handle = (uintptr_t)res;
 			ures.r_parent = (uintptr_t)res->r_rm;
 			ures.r_device = (uintptr_t)res->r_dev;
 			/*
 			 * Device name: "<name><unit>" when the device has a
 			 * name, "nomatch" when it has none, and empty when
 			 * the resource has no device.
 			 */
 			if (res->r_dev != NULL) {
 				if (device_get_name(res->r_dev) != NULL) {
 					snprintf(ures.r_devname, RM_TEXTLEN,
 					    "%s%d",
 					    device_get_name(res->r_dev),
 					    device_get_unit(res->r_dev));
 				} else {
 					strlcpy(ures.r_devname, "nomatch",
 					    RM_TEXTLEN);
 				}
 			} else {
 				ures.r_devname[0] = '\0';
 			}
 			ures.r_start = res->r_start;
 			ures.r_size = res->r_end - res->r_start + 1;
 			ures.r_flags = res->r_flags;
 
 			error = SYSCTL_OUT(req, &ures, sizeof(ures));
 			return (error);
 		}
 	}
 	return (ENOENT);
 }
 
 SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
     "kernel resource manager");
 
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c	(revision 145952)
+++ head/sys/kern/vfs_subr.c	(revision 145953)
@@ -1,3508 +1,3511 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/fcntl.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/reboot.h>
 #include <sys/sleepqueue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
 
 static void	delmntque(struct vnode *vp);
 static void	insmntque(struct vnode *vp, struct mount *mp);
 static void	vlruvp(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vbusy(struct vnode *vp);
 static void	vdropl(struct vnode *vp);
 static void	vinactive(struct vnode *, struct thread *);
 static void	v_incr_usecount(struct vnode *, int);
 static void	vfree(struct vnode *);
 static void	vfreehead(struct vnode *);
 static void	vnlru_free(int);
 static void	vdestroy(struct vnode *);
 
 /*
  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
  * build.  Without mpsafevm the buffer cache can not run Giant free.
  */
 #if defined(__alpha__) || defined(__amd64__) || defined(__i386__)
 int mpsafe_vfs = 1;
 #else
 int mpsafe_vfs;
 #endif
 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
     "MPSAFE VFS");
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, never decreased.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[9] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT,
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * Free vnode target.  Free vnodes may simply be files which have been stat'd
  * but not read.  This is somewhat common, and a small cache of such files
  * should be kept to avoid recreation costs.
  */
 static u_long wantfreevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 /* Number of vnodes in the free list. */
 static u_long freevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  */
 int	nfs_mount_type = -1;
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, mounted block devices
  * are delayed only about half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  (It is normally not used in
  * getnewvnode(), as wantfreevnodes is normally nonzero.)
  *
  * XXX desiredvnodes is historical cruft and should not exist.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
     &desiredvnodes, 0, "Maximum number of vnodes");
 /*
  * kern.minvnodes is a legacy name for the wantfreevnodes variable.  That
  * variable is a u_long, so it has to be exported with the long-sized
  * sysctl handler (exactly as vfs.wantfreevnodes is); SYSCTL_INT would
  * read and write only part of the variable wherever long is wider than
  * int.
  */
 SYSCTL_LONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 /* Counts the passes in which vnlru_proc() reclaimed nothing. */
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 /* Hook for calling soft updates. */
 int (*softdep_process_worklist_hook)(struct mount *);
 
 /*
  * Macros to control when a vnode is freed and recycled.  All require
  * the vnode interlock.
  */
 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
 
 
 /*
  * Initialize the vnode management data structures.
  */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	100000
 #endif
 /*
  * Boot-time initializer for vnode management, run through the SYSINIT
  * registration that follows the function.
  *
  * It computes desiredvnodes and wantfreevnodes, initializes the mntid,
  * vnode free list and syncer mutexes, creates the UMA zones used for
  * vnodes and their poll information, and allocates the syncer work queue
  * table.  The argument is unused.
  */
 static void
 vntblinit(void *dummy __unused)
 {
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and
 	 * the kernel's heap size.  Specifically, desiredvnodes scales
 	 * in proportion to the physical memory size until two fifths
 	 * of the kernel's heap size is consumed by vnodes and vm
 	 * objects.
 	 */
 	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	/* Clamp to the compile-time ceiling. */
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %d -> %d\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	/* The free-list target starts out as a quarter of desiredvnodes. */
 	wantfreevnodes = desiredvnodes / 4; 
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 		&syncer_mask);
 	/*
 	 * Re-derive the queue count from the mask hashinit() handed back,
 	 * so that it matches the table actually allocated.
 	 */
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Interlock is not released on failure.
  *
  * mp       - mount point to mark busy.
  * flags    - only LK_NOWAIT is examined here: do not sleep if an unmount
  *            is in progress.
  * interlkp - optional mutex held by the caller; may be NULL.
  * td       - thread passed to lockmgr().
  *
  * Returns 0 with a shared hold on mp->mnt_lock and with interlkp (if any)
  * released.  Returns ENOENT, with interlkp (if any) held, when the mount
  * point has MNTK_UNMOUNT set; unless LK_NOWAIT was given, the caller has
  * by then slept until woken on mp.
  */
 int
 vfs_busy(mp, flags, interlkp, td)
 	struct mount *mp;
 	int flags;
 	struct mtx *interlkp;
 	struct thread *td;
 {
 	int lkflags;
 
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT) {
 			/* Caller's interlock was never touched on this path. */
 			MNT_IUNLOCK(mp);
 			return (ENOENT);
 		}
 		/* Do not hold the caller's interlock across the sleep. */
 		if (interlkp)
 			mtx_unlock(interlkp);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		/*
 		 * PDROP is passed and the mount interlock is not touched
 		 * again after the sleep.
 		 */
 		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
 		if (interlkp)
 			mtx_lock(interlkp);
 		return (ENOENT);
 	}
 	if (interlkp)
 		mtx_unlock(interlkp);
 	/* LK_INTERLOCK hands the mount interlock over to lockmgr(). */
 	lkflags = LK_SHARED | LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
 		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
  * Undo a successful vfs_busy(): release the hold on the mount point's
  * mnt_lock on behalf of thread td.
  */
 void
 vfs_unbusy(struct mount *mp, struct thread *td)
 {
 
 	(void)lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 }
 
 /*
  * Find the mount point whose filesystem identifier equals *fsid.
  *
  * The mount list is walked under mountlist_mtx.  Returns the matching
  * mount point, or NULL if there is none.  The mutex is dropped before
  * returning and nothing else is done to the mount point here, so the
  * pointer comes back without any hold on it.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *cand, *found;
 
 	found = NULL;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(cand, &mountlist, mnt_list) {
 		if (cand->mnt_stat.f_fsid.val[0] != fsid->val[0])
 			continue;
 		if (cand->mnt_stat.f_fsid.val[1] != fsid->val[1])
 			continue;
 		found = cand;
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (found);
 }
 
 /*
  * Check whether a thread may use privileged mount options on mp.
  *
  * Returns 0 when mp is a user mount (MNT_USER) whose credential has the
  * same uid as the calling thread; otherwise returns whatever suser(td)
  * returns.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	/* The owner of a user mount needs no further check. */
 	if ((mp->mnt_flag & MNT_USER) != 0 &&
 	    mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 	return (suser(td));
 }
 
 /*
  * Assign a new, unique fsid to mp.
  *
  * val[1] is the filesystem type number.  val[0] is a fake device number
  * built with makedev() from major 255 and a minor that combines the low
  * eight bits of the type number with a 16-bit counter.  val[0] is what
  * stat() reports as the device, so we try to keep it unique, and (less
  * hard) unique mod 2^16 for emulators limited to 16-bit device numbers:
  * val[0] is unique for the first 2^16 calls and unique mod 2^16 for the
  * first 2^8 calls.
  *
  * Several mounts may be in progress at once.  The counter is static and
  * only advanced under mntid_mtx, so each search begins one past where the
  * previous one stopped; that is both a small optimization and a guard
  * against handing the same fsid to two mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
 	fsid_t cand;
 	int typenum, typebits;
 
 	mtx_lock(&mntid_mtx);
 	typenum = mp->mnt_vfc->vfc_typenum;
 	typebits = (typenum & 0xFF) << 24;
 	cand.val[1] = typenum;
 	/* Step the counter until no mounted filesystem uses the candidate. */
 	do {
 		cand.val[0] = makedev(255,
 		    typebits | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 	} while (vfs_getvfs(&cand) != NULL);
 	mp->mnt_stat.f_fsid.val[0] = cand.val[0];
 	mp->mnt_stat.f_fsid.val[1] = cand.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Store the current time in *tsp, using the clock source selected by the
  * timestamp_precision knob.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 	int precision;
 
 	/* Sample the knob once; it is writable through sysctl. */
 	precision = timestamp_precision;
 	if (precision == TSP_SEC) {
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 	} else if (precision == TSP_HZ) {
 		getnanotime(tsp);
 	} else if (precision == TSP_USEC) {
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 	} else {
 		/* TSP_NSEC, and any value outside the known range. */
 		nanotime(tsp);
 	}
 }
 
 /*
  * Reset a vattr to its "nothing specified" state: the type becomes VNON,
  * va_vaflags is cleared, and the remaining attributes assigned below are
  * set to VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* Fields that have their own "unset" value. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Identity, ownership and mode. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_flags = VNOVAL;
 	vap->va_gen = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
  * At most one tenth of the mount point's vnode list, plus one, is
  * examined per call.  Returns the number of vnodes that passed the
  * eligibility test and were locked for reclamation.
  */
 static int
 vlrureclaim(struct mount *mp)
 {
 	struct vnode *vp;
 	int done;
 	int trigger;
 	int usevnodes;
 	int count;
 
 	/*
 	 * Calculate the trigger point, don't allow user
 	 * screwups to blow us up.   This prevents us from
 	 * recycling vnodes with lots of resident pages.  We
 	 * aren't trying to free memory, we are trying to
 	 * free vnodes.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes <= 0)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 
 	done = 0;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	count = mp->mnt_nvnodelistsize / 10 + 1;
 	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
 		/* Rotate the head to the tail so each pass sees a new vnode. */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 
 		if (vp->v_type != VNON &&
 		    vp->v_type != VBAD &&
 		    VI_TRYLOCK(vp)) {
 			/* critical path opt */
 			/*
 			 * Eligible: no name cache entries sourced from it,
 			 * no users, and fewer resident pages than trigger.
 			 */
 			if (LIST_EMPTY(&(vp)->v_cache_src) &&
 			    !(vp)->v_usecount &&
 			    (vp->v_object == NULL ||
 			    vp->v_object->resident_page_count < trigger)) {
 				struct thread *td;
 
 				td = curthread;
 				/*
 				 * The mount interlock is released while the
 				 * vnode lock is taken and the vnode reclaimed.
 				 */
 				MNT_IUNLOCK(mp);
 				VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE, td);
 				if ((vp->v_iflag & VI_DOOMED) == 0)
 					vgone(vp);
 				VOP_UNLOCK(vp, 0, td);
 				/* Counted even if the vnode was already doomed. */
 				done++;
 				MNT_ILOCK(mp);
 			} else
 				VI_UNLOCK(vp);
 		}
 		--count;
 	}
 	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	return done;
 }
 
 /*
  * Attempt to keep the free list at wantfreevnodes length.
  *
  * Makes up to "count" attempts to take a vnode off the head of
  * vnode_free_list, recycle it and destroy it.  Every attempt consumes
  * one unit of count, including attempts that skip the vnode.
  *
  * Must be called with vnode_free_list_mtx held.  The mutex is dropped
  * around vtryrecycle() and vdestroy() and is held again on return.
  */
 static void
 vnlru_free(int count)
 {
 	struct vnode *vp;
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	for (; count > 0; count--) {
 		vp = TAILQ_FIRST(&vnode_free_list);
 		/*
 		 * The list can be modified while the free_list_mtx
 		 * has been dropped and vp could be NULL here.
 		 */
 		if (!vp)
 			break;
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		/*
 		 * Don't recycle if we can't get the interlock.
 		 */
 		if (!VI_TRYLOCK(vp)) {
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			continue;
 		}
 		/* Skip (and requeue at the tail) vnodes that are held. */
 		if (!VCANRECYCLE(vp)) {
 			VI_UNLOCK(vp);
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			continue;
 		}
 		/* The vnode is off the free list from here on. */
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
 		mtx_unlock(&vnode_free_list_mtx);
 		/* vtryrecycle() is entered with the vnode interlock held. */
 		if (vtryrecycle(vp) != 0) {
 			mtx_lock(&vnode_free_list_mtx);
 			continue;
 		}
 		vdestroy(vp);
 		mtx_lock(&vnode_free_list_mtx);
 		numvnodes--;
 	}
 }
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Main loop of the vnlru kernel process.  Never returns.
  *
  * Each pass trims the free list down towards wantfreevnodes.  While the
  * number of vnodes is at or below nine tenths of desiredvnodes it wakes
  * anybody sleeping on vnlruproc_sig and sleeps for up to one second.
  * Above that level it walks the mount list and calls vlrureclaim() on
  * every mount point it can mark busy without waiting.
  */
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	int done;
 	struct proc *p = vnlruproc;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);
 
 	mtx_lock(&Giant);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
 	for (;;) {
 		kthread_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
 		if (freevnodes > wantfreevnodes)
 			vnlru_free(freevnodes - wantfreevnodes);
 		if (numvnodes <= desiredvnodes * 9 / 10) {
 			/*
 			 * Enough headroom: release threads waiting in
 			 * getnewvnode() and nap.  PDROP is passed, so the
 			 * next pass takes the free list mutex afresh.
 			 */
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			/*
 			 * On failure vfs_busy() leaves mountlist_mtx held;
 			 * on success it has released it, which is why it
 			 * is taken again after vlrureclaim().
 			 */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			done += vlrureclaim(mp);
 			mtx_lock(&mountlist_mtx);
 			/* Fetch the successor before letting go of mp. */
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
 				printf("vnlru process getting nowhere..\n");
 			else if (vnlru_nowhere == 5)
 				printf("vnlru process messages stopped.\n");
 #endif
 			/* Nothing was reclaimed: count it and back off. */
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		}
 	}
 }
 
 /*
  * Descriptor for the vnlru kernel process: its name, its entry point and
  * the address of the vnlruproc pointer (presumably filled in with the new
  * process when it is created -- vnlru_proc() reads it on entry).  The
  * SYSINIT below passes this descriptor to kproc_start().
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 static void
 vdestroy(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 #ifdef MAC
 	mac_destroy_vnode(vp);
 #endif
 	if (vp->v_pollinfo != NULL) {
 		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
 		mtx_destroy(&vp->v_pollinfo->vpi_lock);
 		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
 	}
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	uma_zfree(vnode_zone, vp);
 }
 
 /*
  * Check to see if a free vnode can be recycled, and recycle it if so.
  *
  * Must be called with the vnode interlock held.  On every return path
  * both the interlock and the vnode lock have been released.
  *
  * Returns 0 when the vnode has been cleaned and is not on the free list,
  * EWOULDBLOCK when the vnode lock could not be taken without waiting,
  * and EBUSY when a write could not be started on its filesystem or the
  * vnode is held.  On failure the vnode is handed to vfree() if
  * VSHOULDFREE() says so.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct thread *td = curthread;
 	struct mount *vnmp;
 	int error;
 
 	ASSERT_VI_LOCKED(vp, "vtryrecycle");
 	error = 0;
 	vnmp = NULL;
 	/*
 	 * This vnode may be found and locked via some other list; if so
 	 * we can't recycle it yet.
 	 */
 	if (VOP_LOCK(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0) {
 		/* LK_INTERLOCK was passed, so the interlock is taken again. */
 		VI_LOCK(vp);
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		VI_UNLOCK(vp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		/* Make sure the err path does not call vn_finished_write(). */
 		vnmp = NULL;
 		error = EBUSY;
 		VI_LOCK(vp);
 		goto err;
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with DOOMED via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_holdcnt) {
 		error = EBUSY;
 		goto err;
 	}
 	if ((vp->v_iflag & VI_DOOMED) == 0) {
 		vp->v_iflag |= VI_DOOMED;
 		vgonel(vp, td);
 		/* The interlock is taken again after vgonel(). */
 		VI_LOCK(vp);
 	}
 	/*
 	 * If someone ref'd the vnode while we were cleaning, we have to
 	 * free it once the last ref is dropped.
 	 */
 	if (vp->v_holdcnt) {
 		error = EBUSY;
 		goto err;
 	}
 	if (vp->v_iflag & VI_FREE)
 		vbusy(vp);
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(vnmp);
 	return (0);
 err:
 	/* Entered with the interlock and the vnode lock both held. */
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp, 0, td);
 	if (vnmp != NULL)
 		vn_finished_write(vnmp);
 	return (error);
 }
 
 /*
  * Allocate and initialize a new vnode.
  *
  * The vnode is not taken from the free list: it is allocated zeroed
  * from vnode_zone.  Before allocating, the caller's context is used to
  * recycle one free vnode if the free list is over its target, and the
  * caller sleeps for up to one second (after waking vnlru) if numvnodes
  * exceeds desiredvnodes.
  *
  * tag  - stored in v_tag and passed to lockinit() for the vnode lock.
  * mp   - mount point to attach the vnode to; may be NULL, in which case
  *        the vnode is on no mount list and bo_bsize is left zero.
  * vops - operations vector stored in v_op.
  * vpp  - receives the new vnode.
  *
  * Always returns 0; the ENFILE path is compiled out.
  */
 int
 getnewvnode(tag, mp, vops, vpp)
 	const char *tag;
 	struct mount *mp;
 	struct vop_vector *vops;
 	struct vnode **vpp;
 {
 	struct vnode *vp = NULL;
 	struct bufobj *bo;
 
 	mtx_lock(&vnode_free_list_mtx);
 	/*
 	 * Lend our context to reclaim vnodes if they've exceeded the max.
 	 */
 	if (freevnodes > wantfreevnodes)
 		vnlru_free(1);
 	/*
 	 * Wait for available vnodes.
 	 */
 	if (numvnodes > desiredvnodes) {
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		/* Bounded by hz; allocation proceeds afterwards regardless. */
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 		if (numvnodes > desiredvnodes) {
 			mtx_unlock(&vnode_free_list_mtx);
 			return (ENFILE);
 		}
 #endif
 	}
 	numvnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 	/*
 	 * Setup locks.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/*
 	 * By default, don't allow shared locks unless filesystems
 	 * opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
 	/*
 	 * Initialize bufobj.
 	 */
 	bo = &vp->v_bufobj;
 	bo->__bo_vnode = vp;
 	/* The bufobj shares the vnode interlock as its mutex. */
 	bo->bo_mtx = &vp->v_interlock;
 	bo->bo_ops = &buf_ops_bio;
 	bo->bo_private = vp;
 	TAILQ_INIT(&bo->bo_clean.bv_hd);
 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
 	/*
 	 * Initialize namecache.
 	 */
 	LIST_INIT(&vp->v_cache_src);
 	TAILQ_INIT(&vp->v_cache_dst);
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp, 1);
 	vp->v_data = 0;
 #ifdef MAC
 	mac_init_vnode(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_associate_vnode_singlelabel(mp, vp);
 	else if (mp == NULL)
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	/* v_mount is NULL in a freshly zeroed vnode, so this returns early. */
 	delmntque(vp);
 	if (mp != NULL) {
 		insmntque(vp, mp);
 		bo->bo_bsize = mp->mnt_stat.f_iosize;
 	}
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Detach a vnode from the vnode list of the mount point it currently
  * belongs to.  Does nothing if the vnode has no mount point.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *mp;
 
 	mp = vp->v_mount;
 	if (mp != NULL) {
 		/* List and count are both changed under the mount interlock. */
 		MNT_ILOCK(mp);
 		vp->v_mount = NULL;
 		VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 		    ("bad mount point vnode list size"));
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		mp->mnt_nvnodelistsize--;
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Attach a vnode to mount point mp: record mp in the vnode and append
  * the vnode to mp's vnode list.  mp must not be NULL.
  */
 static void
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	vp->v_mount = mp;
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	/* vp->v_mount is mp at this point, so lock through mp directly. */
 	MNT_ILOCK(mp);
 	mp->mnt_nvnodelistsize++;
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  *
  * bo       - buffer object to flush.
  * flags    - V_SAVE: wait for writes in progress and sync dirty buffers
  *            before invalidating.  V_NORMAL / V_ALT are passed on to
  *            flushbuflist(), which uses them to skip buffers.
  * td       - thread passed to BO_SYNC().
  * slpflag, slptimeo - passed to bufobj_wwait() and flushbuflist().
  *
  * Returns 0 on success, or the error reported by bufobj_wwait(),
  * BO_SYNC() or flushbuflist().  The bufobj lock is taken and released
  * internally; it is not held on return.
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			/* BO_SYNC() is called without the bufobj lock. */
 			BO_UNLOCK(bo);
 			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	/* Repeat while flushbuflist() asks for another pass (EAGAIN). */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0)
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		/* The return value is deliberately not checked here. */
 		bufobj_wwait(bo, 0, 0);
 		BO_UNLOCK(bo);
 		if (bo->bo_object != NULL) {
 			VM_OBJECT_LOCK(bo->bo_object);
 			vm_object_pip_wait(bo->bo_object, "bovlbx");
 			VM_OBJECT_UNLOCK(bo->bo_object);
 		}
 		BO_LOCK(bo);
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL) {
 		VM_OBJECT_LOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 		VM_OBJECT_UNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	/* A full flush (neither V_ALT nor V_NORMAL) must leave both lists empty. */
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.  This is
  * bufobj_invalbuf() applied to the vnode's embedded buffer object; the
  * vnode must be locked by the caller.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
 {
 	struct bufobj *bo;
 
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 	bo = &vp->v_bufobj;
 	return (bufobj_invalbuf(bo, flags, td, slpflag, slptimeo));
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * bufv     - the clean or dirty buffer list of bo.
  * flags    - V_NORMAL skips buffers marked BX_ALTDATA, V_ALT skips those
  *            that are not; V_SAVE writes delayed-write buffers instead
  *            of discarding them.
  * bo       - buffer object owning the list.
  * slpflag, slptimeo - passed to BUF_TIMELOCK().
  *
  * Called with bo locked and returns with bo locked, but the lock is
  * dropped while a buffer is being handled.
  *
  * Returns 0 when no eligible buffer was found, EAGAIN when at least one
  * buffer was handled (the caller is expected to call again), or another
  * error from BUF_TIMELOCK().
  */
 static int
 flushbuflist(bufv, flags, bo, slpflag, slptimeo)
 	struct bufv *bufv;
 	int flags;
 	struct bufobj *bo;
 	int slpflag, slptimeo;
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 
 	ASSERT_BO_LOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 			continue;
 		}
 		/* From here on the caller must come back for another pass. */
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			/* ENOLCK is reported to the caller as EAGAIN. */
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 	            ("wrong b_bufobj %p should be %p", bp->b_bufobj, bo));
 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
 			BUF_UNLOCK(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);
 		}
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		/* Otherwise discard the buffer. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 	}
 	return (retval);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  *
  * vp      - vnode being truncated; must be locked by the caller.
  * cred    - not used in the body.
  * td      - not used in the body.
  * length  - new length of the file in bytes.
  * blksize - block size used to convert length into a logical block
  *           number.
  *
  * Every buffer on the clean and dirty lists whose logical block number
  * is at or beyond the first block past the new length is invalidated and
  * released.  If length is non-zero, dirty buffers with a non-positive
  * logical block number are then written out asynchronously (presumably
  * filesystem metadata such as indirect blocks -- confirm against the
  * filesystems that call this).  Finally the routine waits for pending
  * writes and tells the VM pager about the new size.
  *
  * Always returns 0.
  */
 int
 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
 	/*
 	 * A logical block number: it is derived from an off_t and compared
 	 * with b_lblkno, so it must be a daddr_t and not an int, which
 	 * could truncate for very large files.
 	 */
 	daddr_t trunclbn;
 	struct bufobj *bo;
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 restart:
 	/* Every "goto restart" arrives here without the vnode interlock. */
 	VI_LOCK(vp);
 	bo = &vp->v_bufobj;
 	anyfreed = 1;
 	/* Keep sweeping both lists until a full pass frees nothing. */
 	for (;anyfreed;) {
 		anyfreed = 0;
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			/* LK_INTERLOCK: the vnode interlock is given up here. */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 
 			/*
 			 * The saved successor may have changed while the
 			 * interlock was not held; start over if it no longer
 			 * looks like a clean buffer of this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI))) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 			/*
 			 * Same check as above, for the dirty list: the
 			 * successor must still be a delayed-write buffer of
 			 * this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0)) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			/* Only non-positive block numbers are written here. */
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK) {
 				goto restart;
 			}
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			/* The list may have changed; rescan from its head. */
 			VI_LOCK(vp);
 			goto restartsync;
 		}
 	}
 
 	bufobj_wwait(bo, 0, 0);
 	VI_UNLOCK(vp);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * buf_splay() - splay tree core for the clean/dirty list of buffers in
  * 		 a vnode.
  *
  *	NOTE: We have to deal with the special case of a background bitmap
  *	buffer, a situation where two buffers will have the same logical
  *	block offset.  We want (1) only the foreground buffer to be accessed
  *	in a lookup and (2) must differentiate between the foreground and
  *	background buffer in the splay tree algorithm because the splay
  *	tree cannot normally handle multiple entities with the same 'index'.
  *	We accomplish this by adding differentiating flags to the splay tree's
  *	numerical domain.
  *
  *	The key is the pair (lblkno, xflags & BX_BKGRDMARKER).  Returns NULL
  *	when 'root' is NULL; otherwise returns the new root of the
  *	restructured tree, which is the node with an equal key if one was
  *	reached, or else the last node visited on the search path.  The
  *	caller is responsible for storing the returned root.
  */
 static
 struct buf *
 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 {
 	/*
 	 * 'dummy' is a scratch header: dummy.b_right collects the tree of
 	 * nodes smaller than the key and dummy.b_left the tree of nodes
 	 * larger than the key while we descend.
 	 */
 	struct buf dummy;
 	struct buf *lefttreemax, *righttreemin, *y;
 
 	if (root == NULL)
 		return (NULL);
 	lefttreemax = righttreemin = &dummy;
 	for (;;) {
 		if (lblkno < root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			/* Key sorts before root: descend to the left. */
 			if ((y = root->b_left) == NULL)
 				break;
 			if (lblkno < y->b_lblkno) {
 				/* Rotate right. */
 				root->b_left = y->b_right;
 				y->b_right = root;
 				root = y;
 				if ((y = root->b_left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->b_left = root;
 			righttreemin = root;
 		} else if (lblkno > root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 			/* Key sorts after root: descend to the right. */
 			if ((y = root->b_right) == NULL)
 				break;
 			if (lblkno > y->b_lblkno) {
 				/* Rotate left. */
 				root->b_right = y->b_left;
 				y->b_left = root;
 				root = y;
 				if ((y = root->b_right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->b_right = root;
 			lefttreemax = root;
 		} else {
 			/* Exact key match. */
 			break;
 		}
 		root = y;
 	}
 	/* Assemble the new root. */
 	lefttreemax->b_right = root->b_left;
 	righttreemin->b_left = root->b_right;
 	root->b_left = dummy.b_right;
 	root->b_right = dummy.b_left;
 	return (root);
 }
 
 /*
  * Remove a buffer from whichever of its bufobj's lists (dirty or clean)
  * its b_xflags say it is on: unlink it from the splay tree and the
  * sorted tail queue, decrement the list count and clear both
  * BX_VNDIRTY and BX_VNCLEAN.  The bufobj lock must be held (asserted).
  */
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_LOCKED(bp->b_bufobj);
 	if (bp->b_xflags & BX_VNDIRTY) 
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
 	/* Bring bp to the root of its tree unless it is already there. */
 	if (bp != bv->bv_root) {
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 		KASSERT(root == bp, ("splay lookup failed in remove"));
 	}
 	if (bp->b_left == NULL) {
 		/* No left subtree: the right subtree becomes the tree. */
 		root = bp->b_right;
 	} else {
 		/*
 		 * Splay the left subtree on bp's key and hang bp's right
 		 * subtree off the resulting root.
 		 */
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 		root->b_right = bp->b_right;
 	}
 	bv->bv_root = root;
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list using a
  * splay tree algorithm.
  *
  * 'xflags' is OR-ed into bp->b_xflags and selects the list: the dirty
  * list when BX_VNDIRTY is set, the clean list otherwise.  The buffer
  * becomes the new root of that list's tree and is linked into the tail
  * queue next to the previous root so the queue stays in key order.
  * The bufobj lock must be held (asserted).
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	ASSERT_BO_LOCKED(bo);
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY)
 		bv = &bo->bo_dirty;
 	else
 		bv = &bo->bo_clean;
 
 	/* Splay on bp's key so the root ends up adjacent to bp's position. */
 	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 	if (root == NULL) {
 		/* Empty tree: bp is the only node. */
 		bp->b_left = NULL;
 		bp->b_right = NULL;
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 	} else if (bp->b_lblkno < root->b_lblkno ||
 	    (bp->b_lblkno == root->b_lblkno &&
 	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 		/* bp sorts before the old root: old root goes to bp's right. */
 		bp->b_left = root->b_left;
 		bp->b_right = root;
 		root->b_left = NULL;
 		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
 	} else {
 		/* bp sorts after the old root: old root goes to bp's left. */
 		bp->b_right = root->b_right;
 		bp->b_left = root;
 		root->b_right = NULL;
 		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
 	}
 	bv->bv_cnt++;
 	bv->bv_root = bp;
 }
 
 /*
  * Find the buffer for logical block 'lblkno' in a bufobj, ignoring the
  * shadow buffers (BX_BKGRDMARKER) used by background bitmap writes.
  * Returns the buffer, or NULL when neither list holds one.  The bufobj
  * lock must be held.
  *
  * The block may live on either the clean or the dirty list and we do
  * not know which, so both sorted trees may have to be searched.
  *
  * During a "make buildworld" the desired buffer is found at one of
  * the roots more than 60% of the time.  Both roots are therefore
  * examined before any splay is performed, which avoids a needless
  * splay of the first tree when the hit is at the second tree's root.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *cand;
 
 	ASSERT_BO_LOCKED(bo);
 
 	/* Fast path: is the wanted buffer already at a root? */
 	cand = bo->bo_clean.bv_root;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 	cand = bo->bo_dirty.bv_root;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 
 	/* Slow path: splay the clean tree first ... */
 	cand = bo->bo_clean.bv_root;
 	if (cand != NULL) {
 		cand = buf_splay(lblkno, 0, cand);
 		bo->bo_clean.bv_root = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 
 	/* ... and then the dirty tree. */
 	cand = bo->bo_dirty.bv_root;
 	if (cand != NULL) {
 		cand = buf_splay(lblkno, 0, cand);
 		bo->bo_dirty.bv_root = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 
 	return (NULL);
 }
 
 /*
  * Associate a buffer with a vnode.
  *
  * The buffer must not already belong to a vnode or sit on a clean/dirty
  * list, and the vnode interlock must be held (all three are asserted).
  * A hold reference is taken on the vnode, the buffer's b_vp and
  * b_bufobj are pointed at it, and the buffer is put on the vnode's
  * clean list.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	ASSERT_VI_LOCKED(vp, "bgetvp");
 	vholdl(vp);
 	bp->b_vp = vp;
 	bp->b_bufobj = &vp->v_bufobj;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  *
  * Takes the bufobj lock, removes the buffer from its clean/dirty list
  * (panicking if it is on neither), takes the bufobj off the syncer
  * worklist if it has no dirty buffers left, drops the hold reference on
  * the vnode and clears the buffer's b_vp and b_bufobj.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("brelvp: Buffer %p not on queue.", bp);
 	/* Last dirty buffer gone: the syncer no longer needs to visit us. */
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
  		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	/*
 	 * NOTE(review): vdropl() is called while BO_LOCK(bo) is held; this
 	 * presumably relies on the bufobj lock being the vnode interlock --
 	 * confirm.
 	 */
 	vdropl(vp);
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	BO_UNLOCK(bo);
 }
 
 /*
  * Add an item to the syncer work queue.
  *
  * Places 'bo' at the head of the worklist slot that lies 'delay' slots
  * ahead of the current one, with 'delay' clamped to syncer_maxdelay - 2.
  * A bufobj that is already queued is first removed from its present
  * slot; otherwise it is flagged BO_ONWORKLST and the worklist length is
  * incremented.  The bufobj lock must be held (asserted); sync_mtx is
  * taken internally.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int slot;
 
 	ASSERT_BO_LOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if (bo->bo_flag & BO_ONWORKLST)
 		LIST_REMOVE(bo, bo_synclist);
 	else {
 		bo->bo_flag |= BO_ONWORKLST;
  		syncer_worklist_len++;
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	/* The slot index wraps around via syncer_mask. */
 	slot = (syncer_delayno + delay) & syncer_mask;
 
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 /*
  * Sysctl handler that reports the syncer worklist length, not counting
  * the sync_vnode_count entries.  The value is sampled under sync_mtx
  * and copied out after the mutex has been released.
  */
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 
 	return (SYSCTL_OUT(req, &len, sizeof(len)));
 }
 
 /* Read-only integer sysctl "worklist_len" under the _vfs node. */
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 /* Process pointer of the syncer daemon, filled in through up_kp below. */
 struct  proc *updateproc;
 static void sched_sync(void);
 /* Kernel process descriptor: name, entry point and where to store the proc. */
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 /* Start the "syncer" kernel process via kproc_start() during system init. */
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 /*
  * Try to lazily fsync the vnode behind a worklist bufobj on behalf of
  * the syncer.
  *
  * Called with sync_mtx held and returns with it held, although it is
  * dropped while the vnode is being synced.  Returns 1 when the vnode
  * was skipped (its vnode lock is held, its interlock could not be
  * taken without blocking, or a write could not be started without
  * waiting) and 0 after VOP_FSYNC() has been attempted.  The result of
  * VOP_FSYNC() itself is ignored.
  */
 static int
 sync_vnode(struct bufobj *bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	vp = bo->__bo_vnode; 	/* XXX */
 	/* Never block here: skip vnodes that are busy right now. */
 	if (VOP_ISLOCKED(vp, NULL) != 0)
 		return (1);
 	if (VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		/* Undo the hold and restore the caller's lock state. */
 		vdrop(vp);
 		mtx_lock(&sync_mtx);
 		return (1);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VI_LOCK(vp);
 	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(bo, syncdelay);
 	}
 	vdropl(vp);
 	VI_UNLOCK(vp);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Body of the "syncer" kernel process; it never returns.  Each pass
  * advances syncer_delayno by one slot, runs sync_vnode() on every
  * bufobj queued in the slot just left, and moves the ones that could
  * not be synced to the next slot.  It then runs the soft updates hook,
  * honours rushjob requests by looping again immediately, and otherwise
  * sleeps.  During shutdown (syncer_state != SYNCER_RUNNING) empty slots
  * are skipped, progress is printed, and the loop passes through the
  * SHUTTING_DOWN and FINAL_DELAY states before suspending itself.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next;
 	struct synclist *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
 	static int dummychan;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int first_printf;
 	int error;
 
 	mtx_lock(&Giant);
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_second;
 
 	/* Have syncer_shutdown() called before the final sync at shutdown. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	for (;;) {
 		mtx_lock(&sync_mtx);
 		/* The final delay has run its course: allow suspension. */
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kthread_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		/* Worklist entries other than the sync_vnode_count ones. */
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		/* While shutting down, report progress at most once a second. */
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_second) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_second;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since it was
 			 * emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		/*
 		 * Drain the slot.  A synced bufobj has either been requeued
 		 * to a later slot or taken off the worklist; one that could
 		 * not be synced (error == 1) is pushed to the next slot.
 		 */
 		while ((bo = LIST_FIRST(slp)) != NULL) {
 			error = sync_vnode(bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		mtx_unlock(&sync_mtx);
 
 		/*
 		 * Do soft update processing.
 		 */
 		if (softdep_process_worklist_hook != NULL)
 			(*softdep_process_worklist_hook)(NULL);
 
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		mtx_lock(&sync_mtx);
 		if (rushjob > 0) {
 			rushjob -= 1;
 			mtx_unlock(&sync_mtx);
 			continue;
 		}
 		mtx_unlock(&sync_mtx);
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING)
 			tsleep(&dummychan, PPAUSE, "syncfnl",
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_second == starttime)
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Ask the syncer daemon to work faster.
  *
  * The syncer is pulled out of its lbolt sleep and, provided rushjob is
  * still below half of syncdelay, one more rush request is queued.  The
  * cap keeps the syncer from running at more than twice its normal
  * pace, which could otherwise let it take over the cpu.
  *
  * Returns 1 if the request was queued and 0 if the cap was reached.
  */
 int
 speedup_syncer()
 {
 	struct thread *syncer_td;
 	int accepted;
 
 	syncer_td = FIRST_THREAD_IN_PROC(updateproc);
 	sleepq_remove(syncer_td, &lbolt);
 
 	accepted = 0;
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob++;
 		stat_rush_requests++;
 		accepted = 1;
 	}
 	mtx_unlock(&sync_mtx);
 
 	return (accepted);
 }
 
 /*
  * Tell the syncer to speed up its work and run through its work
  * list several times, then tell it to shut down.
  *
  * Registered by sched_sync() as a shutdown_pre_sync event handler with
  * the syncer's process as 'arg'.  Does nothing when 'howto' contains
  * RB_NOSYNC.  Otherwise it wakes the syncer from its lbolt sleep,
  * switches it to SYNCER_SHUTTING_DOWN, clears rushjob and hands over
  * to kproc_shutdown().
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 
 	if (howto & RB_NOSYNC)
 		return;
 	td = FIRST_THREAD_IN_PROC(updateproc);
 	sleepq_remove(td, &lbolt);
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
 	kproc_shutdown(arg, howto);
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  *
  * As written here, the buffer is taken off its current list and put
  * back on the dirty or the clean list of the same bufobj according to
  * whether B_DELWRI is set.  The bufobj is added to the syncer worklist
  * when it gains a dirty buffer and is not yet queued, and removed from
  * it when no dirty buffers remain.  Panics for B_PAGING buffers and for
  * buffers that are on neither list.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	++reassignbufcalls;
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	VI_LOCK(vp);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("reassignbuf: Buffer %p not on queue.", bp);
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			/* The sync delay depends on the kind of vnode. */
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		/* No dirty buffers left: leave the syncer worklist. */
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
  			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 	VI_UNLOCK(vp);
 }
 
 /*
  * Adjust a vnode's use count and hold count together by 'delta'
  * (which may be negative).  For a character-device vnode with a device
  * attached, the device's si_usecount is adjusted by the same amount
  * under the device lock.
  *
  * NOTE(review): no vnode lock is taken here; the callers visible in
  * this file hold the vnode interlock -- confirm for all callers.
  */
 static void
 v_incr_usecount(struct vnode *vp, int delta)
 {
 
 	vp->v_usecount += delta;
 	vp->v_holdcnt += delta;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount += delta;
 		dev_unlock();
 	}
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it. The vnode lock bit is set if the
  * vnode is being eliminated in vgone. The process is awakened
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new filesystem type).
  *
  * 'flags' are lock flags passed on to vn_lock().  If LK_INTERLOCK is
  * set the caller already holds the vnode interlock; otherwise it is
  * taken here.  Returns 0 on success, EBUSY when an inactive call is
  * owed and LK_NOWAIT was given, or the error from vn_lock(), in which
  * case the reference taken here has been dropped again.
  */
 int
 vget(vp, flags, td)
 	struct vnode *vp;
 	int flags;
 	struct thread *td;
 {
 	int oweinact;
 	int oldflags;
 	int error;
 
 	error = 0;
 	oldflags = flags;	/* remember the lock type the caller asked for */
 	oweinact = 0;
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	/*
 	 * If the inactive call was deferred because vput() was called
 	 * with a shared lock, we have to do it here before another thread
 	 * gets a reference to data that should be dead.
 	 */
 	if (vp->v_iflag & VI_OWEINACT) {
 		if (flags & LK_NOWAIT) {
 			VI_UNLOCK(vp);
 			return (EBUSY);
 		}
 		/* vinactive() needs the vnode locked exclusively. */
 		flags &= ~LK_TYPE_MASK;
 		flags |= LK_EXCLUSIVE;
 		oweinact = 1;
 	}
 	v_incr_usecount(vp, 1);
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
 		VI_LOCK(vp);
 		/*
 		 * must expand vrele here because we do not want
 		 * to call VOP_INACTIVE if the reference count
 		 * drops back to zero since it was never really
 		 * active.
 		 */
 		v_incr_usecount(vp, -1);
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		else
 			vlruvp(vp);
 		VI_UNLOCK(vp);
 		return (error);
 	}
 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
 		panic("vget: vn_lock failed to return ENOENT\n");
 	if (oweinact) {
 		VI_LOCK(vp);
 		/* Re-check under the interlock before running it. */
 		if (vp->v_iflag & VI_OWEINACT)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 		/*
 		 * The caller requested no lock type, so drop the
 		 * exclusive lock that was taken only for vinactive().
 		 */
 		if ((oldflags & LK_TYPE_MASK) == 0)
 			VOP_UNLOCK(vp, 0, td);
 	}
 	return (0);
 }
 
 /*
  * Increase the reference count of a vnode.
  *
  * Takes the vnode interlock around the update, so the caller must not
  * already hold it.  v_incr_usecount() raises the hold count as well.
  */
 void
 vref(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	v_incr_usecount(vp, 1);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Return reference count of a vnode.
  *
  * The results of this call are only guaranteed when some mechanism other
  * than the VI lock is used to stop other processes from gaining references
  * to the vnode.  This may be the case if the caller holds the only reference.
  * This is also useful when stale data is acceptable as race conditions may
  * be accounted for by some other means.
  *
  * The count is read under the vnode interlock, which is released again
  * before returning.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 	int usecnt;
 
 	VI_LOCK(vp);
 	usecnt = vp->v_usecount;
 	VI_UNLOCK(vp);
 
 	return (usecnt);
 }
 
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  *
  * Drops one use reference.  If other references remain, or this is the
  * reference held while an inactive call is already in progress, the
  * count is simply decremented.  Otherwise the vnode is locked
  * exclusively, vinactive() is run and the vnode is freed or passed to
  * vlruvp().  Panics if the use count is not positive.
  */
 void
 vrele(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 
 	VI_LOCK(vp);
 
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vrele: missed vn_close"));
 
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		v_incr_usecount(vp, -1);
 		VI_UNLOCK(vp);
 
 		return;
 	}
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
 		VI_UNLOCK(vp);
 		panic("vrele: negative ref cnt");
 	}
 	v_incr_usecount(vp, -1);
 	/*
 	 * We must call VOP_INACTIVE with the node locked. Mark
 	 * as VI_DOINGINACT to avoid recursion.
 	 */
 	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
 		VI_LOCK(vp);
 		vinactive(vp, td);
 		VOP_UNLOCK(vp, 0, td);
 	} else
 		VI_LOCK(vp);	/* lock failed; still need the interlock below */
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	else
 		vlruvp(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Release an already locked vnode.  This gives the same effects as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-acquiring the lock (as vrele() acquires the lock internally.)
  *
  * The caller must hold the vnode lock (asserted).  When the last use
  * reference goes away the vnode is flagged VI_OWEINACT; if the lock is
  * not held exclusively an upgrade is attempted without waiting, and if
  * that fails the inactive call stays owed.
  */
 void
 vput(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 	int error;
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 	ASSERT_VOP_LOCKED(vp, "vput");
 	VI_LOCK(vp);
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vput: missed vn_close"));
 	error = 0;
 
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		/* Not the last reference: drop it and unlock the vnode. */
 		v_incr_usecount(vp, -1);
 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
 		return;
 	}
 
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 	v_incr_usecount(vp, -1);
 	vp->v_iflag |= VI_OWEINACT;
 	if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
 		error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
 		VI_LOCK(vp);
 		/* Upgrade failed: leave VI_OWEINACT set for a later caller. */
 		if (error)
 			goto done;
 	}
 	if (vp->v_iflag & VI_OWEINACT)
 		vinactive(vp, td);
 	VOP_UNLOCK(vp, 0, td);
 done:
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	else
 		vlruvp(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Somebody doesn't want the vnode recycled.
  *
  * Takes a hold reference via vholdl() with the vnode interlock
  * acquired and released around it.
  */
 void
 vhold(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Take a hold reference on a vnode; vhold() calls this with the vnode
  * interlock already held.  Increments v_holdcnt and calls vbusy() when
  * VSHOULDBUSY() says so.
  */
 void
 vholdl(struct vnode *vp)
 {
 
 	vp->v_holdcnt++;
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 }
 
 /*
  * Note that there is one less who cares about this vnode.  vdrop() is the
  * opposite of vhold().
  *
  * Releases a hold reference via vdropl() with the vnode interlock
  * acquired and released around it.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vdropl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Release a hold reference on a vnode; vdrop() calls this with the
  * vnode interlock already held.  Panics if the hold count is not
  * positive.  After the decrement the vnode is handed to vfree() when
  * VSHOULDFREE() says so, and to vlruvp() otherwise.
  */
 static void
 vdropl(struct vnode *vp)
 {
 
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
 	vp->v_holdcnt--;
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	else
 		vlruvp(vp);
 }
 
 /*
  * Run VOP_INACTIVE() on a vnode.
  *
  * Entered with both the vnode lock and the vnode interlock held (both
  * asserted) and returns the same way; the interlock is dropped around
  * the VOP_INACTIVE() call itself.  VI_DOINGINACT is set for the
  * duration of the call, and it must not already be set on entry.  On
  * return both VI_DOINGINACT and VI_OWEINACT are cleared.
  */
 static void
 vinactive(struct vnode *vp, struct thread *td)
 {
 	ASSERT_VOP_LOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	vp->v_iflag |= VI_DOINGINACT;
 	VI_UNLOCK(vp);
 	VOP_INACTIVE(vp, td);
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~(VI_DOINGINACT|VI_OWEINACT);
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  *
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error from
  * VFS_ROOT() when the root vnode cannot be obtained.
  */
 #ifdef DIAGNOSTIC
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 int
 vflush(mp, rootrefs, flags, td)
 	struct mount *mp;
 	int rootrefs;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp, *nvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
 			return (error);
 		vput(rootvp);
 
 	}
 	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH(vp, mp, nvp) {
 
 		VI_LOCK(vp);
 		MNT_IUNLOCK(mp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 		if (error) {
 			/* Could not lock this vnode: rescan the mount's list. */
 			MNT_ILOCK(mp);
 			goto loop;
 		}
 		/*
 		 * Skip over vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 			VI_LOCK(vp);
 
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, LK_INTERLOCK, td);
 				MNT_ILOCK(mp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 */
 		if (vp->v_usecount == 0) {
 			vgonel(vp, td);
 			VOP_UNLOCK(vp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If FORCECLOSE is set, forcibly close the vnode. For block
 		 * or character devices, revert to an anonymous device. For
 		 * all other files, just kill them.
 		 */
 		if (flags & FORCECLOSE) {
 			VNASSERT(vp->v_type != VCHR && vp->v_type != VBLK, vp,
 			    ("device VNODE %p is FORCECLOSED", vp));
 			vgonel(vp, td);
 			VOP_UNLOCK(vp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/* Still in use and not forced: count it as busy. */
 		VOP_UNLOCK(vp, 0, td);
 #ifdef DIAGNOSTIC
 		if (busyprt)
 			vprint("vflush: busy vnode", vp);
 #endif
 		VI_UNLOCK(vp);
 		MNT_ILOCK(mp);
 		busy++;
 	}
 	MNT_IUNLOCK(mp);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp, 0, td);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Success: release the caller's rootrefs references on the root. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * This moves a now (likely recyclable) vnode to the end of the
  * mountlist.  XXX However, it is temporarily disabled until we
  * can clean up ffs_sync() and friends, which have loop restart
  * conditions which this code causes to operate O(N^2).
  *
  * With the body compiled out by "#if 0" this function currently does
  * nothing.
  */
 static void
 vlruvp(struct vnode *vp)
 {
 #if 0
 	struct mount *mp;
 
 	if ((mp = vp->v_mount) != NULL) {
 		MNT_ILOCK(mp);
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		MNT_IUNLOCK(mp);
 	}
 #endif
 }
 
 /*
  * Recycle a vnode that nobody is using.
  *
  * The caller must hold the vnode lock.  A vnode with no use references
  * that is not already doomed is handed to vgonel(), which consumes the
  * interlock taken here, and 1 is returned.  In every other case the
  * interlock is released again and 0 is returned.
  */
 int
 vrecycle(struct vnode *vp, struct thread *td)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vrecycle");
 	VI_LOCK(vp);
 	/* Still referenced, or already on its way out: leave it alone. */
 	if (vp->v_usecount != 0 || (vp->v_iflag & VI_DOOMED) != 0) {
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	vgonel(vp, td);
 	return (1);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  */
 void
 vgone(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 	ASSERT_VOP_LOCKED(vp, "vgone");
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		VI_UNLOCK(vp);
 		return;
 	}
 	vgonel(vp, td);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * Entered with both the vnode lock and the vnode interlock held (both
  * asserted); the interlock is released before returning.  Marks the
  * vnode VI_DOOMED, flushes or discards its buffers, closes and
  * deactivates it if it was in use, reclaims it, removes it from its
  * mount's vnode list and the name cache, and finally turns it into a
  * dead vnode (dead_vnodeops, type VBAD).
  */
 void
 vgonel(struct vnode *vp, struct thread *td)
 {
 	int oweinact;
 	int active;
 	int doomed;
 
 	ASSERT_VOP_LOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 
 	/*
 	 * Check to see if the vnode is in use. If so we have to reference it
 	 * before we clean it out so that its count cannot fall to zero and
 	 * generate a race against ourselves to recycle it.
 	 */
 	if ((active = vp->v_usecount))
 		v_incr_usecount(vp, 1);
 
 	/*
 	 * See if we're already doomed, if so, this is coming from a
 	 * successful vtryrecycle();
 	 */
 	doomed = (vp->v_iflag & VI_DOOMED);
 	vp->v_iflag |= VI_DOOMED;
 	oweinact = (vp->v_iflag & VI_OWEINACT);
 	VI_UNLOCK(vp);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
 	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
 		vinvalbuf(vp, 0, td, 0, 0);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (oweinact || active) {
 		VI_LOCK(vp);
 		/* Skip it if an inactive call is already in progress. */
 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vgone: cannot reclaim");
 
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
 
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge(vp);
 	VI_LOCK(vp);
 	if (active) {
 		/* Drop the extra reference taken at the top. */
 		v_incr_usecount(vp, -1);
 		VNASSERT(vp->v_usecount >= 0, vp, ("vgone: bad ref count"));
 	}
 	/*
 	 * Done with purge, reset to the standard lock and
 	 * notify sleepers of the grim news.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_tag = "none";
 	vp->v_type = VBAD;
 
 	/*
 	 * If it is on the freelist and not already at the head,
 	 * move it to the head of the list. The test of the
 	 * VDOOMED flag and the reference count of zero is because
 	 * it will be removed from the free list by vnlru_free,
 	 * but will not have its reference count incremented until
 	 * after calling vgone. If the reference count were
 	 * incremented first, vgone would (incorrectly) try to
 	 * close the previous instance of the underlying object.
 	 */
 	if (vp->v_holdcnt == 0 && !doomed)
 		vfreehead(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Calculate the total number of references to a special device.
  *
  * Returns the si_usecount of the device attached to the vnode, read
  * under the device lock.  vp->v_rdev is dereferenced without a NULL
  * check, so the vnode must have a device attached.
  */
 int
 vcount(vp)
 	struct vnode *vp;
 {
 	int count;
 
 	dev_lock();
 	count = vp->v_rdev->si_usecount;
 	dev_unlock();
 	return (count);
 }
 
 /*
  * Same as above, but using the struct cdev * as argument.
  *
  * Returns dev->si_usecount, read under the device lock.
  */
 int
 count_dev(dev)
 	struct cdev *dev;
 {
 	int count;
 
 	dev_lock();
 	count = dev->si_usecount;
 	dev_unlock();
 	return(count);
 }
 
 /*
  * Print out a description of a vnode.
  */
 /* Printable names, indexed by vp->v_type in vn_printf() below. */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 
 /*
  * Print the caller's printf-style message followed by the vnode's
  * address, tag, type, reference counts, selected flags, VM object
  * summary (if any) and lock state.  If the vnode has private data,
  * VOP_PRINT() is called last to print the filesystem-specific part.
  */
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[96];
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
 	/*
 	 * Each flag name is appended with a leading '|'; printing from
 	 * buf + 1 skips the first separator.  buf[1] is cleared as well so
 	 * that buf + 1 is an empty string when no flag is set.
 	 */
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_vflag & VV_ROOT)
 		strcat(buf, "|VV_ROOT");
 	if (vp->v_vflag & VV_TEXT)
 		strcat(buf, "|VV_TEXT");
 	if (vp->v_vflag & VV_SYSTEM)
 		strcat(buf, "|VV_SYSTEM");
 	if (vp->v_iflag & VI_DOOMED)
 		strcat(buf, "|VI_DOOMED");
 	if (vp->v_iflag & VI_FREE)
 		strcat(buf, "|VI_FREE");
 	printf("    flags (%s)\n", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL)
 		printf("    v_object %p ref %d pages %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	printf("\n");
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 #include <ddb/ddb.h>
 /*
  * DDB "show lockedvnods" command: walk every mount point and print
  * each vnode on its vnode list whose vnode lock is currently held.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * We are running inside the debugger, so none of the usual locks
 	 * for these lists can be taken.  The lists may therefore be seen
 	 * in an inconsistent state and a bad pointer may be followed;
 	 * there is nothing we can do about that here.
 	 */
 	printf("Locked vnodes\n");
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (VOP_ISLOCKED(vp, NULL))
 				vprint("", vp);
 		}
 	}
 }
 #endif
 
 /*
  * Translate a kernel struct vfsconf into the exported struct xvfsconf.
  * Only the fields assigned here are written; the destination is not
  * cleared by this routine.
  */
 static void
 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 {
 
 	/*
 	 * Pointer members are unused in userland; they are kept, and
 	 * nulled, only to preserve binary compatibility.
 	 */
 	xvfsp->vfc_vfsops = NULL;
 	xvfsp->vfc_next = NULL;
 
 	/* Copy the descriptive fields across unchanged. */
 	xvfsp->vfc_flags = vfsp->vfc_flags;
 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 }
 
 /*
  * Top level filesystem related information gathering.
  *
  * Sysctl handler that copies out one struct xvfsconf for every entry on
  * the vfsconf list.  Returns 0, or the first error from SYSCTL_OUT().
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 	int error;
 
 	error = 0;
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		/*
 		 * The whole structure is copied out, so clear it first:
 		 * bytes vfsconf2x() does not write (such as the tail of
 		 * vfc_name) would otherwise be whatever was on the stack.
 		 */
+		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 /*
  * Handler for the deprecated vfs.generic sysctl node.  Logs a warning on
  * every call, then serves VFS_MAXTYPENUM and VFS_CONF requests; a
  * one-element name is forwarded to sysctl_ovfs_conf().
  * Returns ENOTDIR for a name of the wrong length and EOPNOTSUPP for an
  * unknown request or filesystem type number.
  */
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	/* Step back one element so name[1] is the first int at arg1. */
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 
 	printf("WARNING: userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		/* name[2] selects the filesystem by type number. */
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		/* Clear first so unwritten bytes are not copied out as-is. */
+		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 	}
 	return (EOPNOTSUPP);
 }
 
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
 	vfs_sysctl, "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 /*
  * Copy out one struct ovfsconf for every entry on the vfsconf list.
  * Returns 0, or the first error from SYSCTL_OUT().
  */
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		/*
 		 * The whole structure is copied out; clear it so bytes not
 		 * assigned below (e.g. past the end of the name string) are
 		 * zero rather than stale stack contents.
 		 */
+		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  *
  * With no output buffer (req->oldptr NULL) only a size estimate is
  * returned.  Otherwise a zeroed array of struct xvnode is filled with
  * one entry per vnode found on each mount that could be busied, and the
  * filled part is copied out.
  *
  * The array holds maxn entries; filling stops once n reaches maxn.
  * (The bound must be an entry count: comparing n with the byte length
  * would let the loop run past the end of the allocation.)
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct thread *td = req->td;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, maxn, n;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	maxn = numvnodes + KINFO_VNODESLOP;	/* capacity in entries */
 	len = maxn * sizeof *xvn;		/* capacity in bytes */
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		/* Skip mounts that cannot be busied without waiting. */
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (n == maxn)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 			xvn[n].xv_id = 0;	/* XXX compat */
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			/*
 			 * Type-specific field.  A vnode that is skipped
 			 * (continue) leaves n unchanged, so its slot is
 			 * reused by the next vnode.
 			 */
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp, td);
 		if (n == maxn)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,xvnode", "");
 #endif
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  */
 void
 vfs_unmountall()
 {
 	struct mount *mp;
 	struct thread *td;
 	int error;
 
 	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
 	td = curthread;
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while(!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		error = dounmount(mp, MNT_FORCE, td);
 		if (error) {
 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
 			printf("unmount of %s failed (",
 			    mp->mnt_stat.f_mntonname);
 			if (error == EBUSY)
 				printf("BUSY)\n");
 			else
 				printf("%d)\n", error);
 		} else {
 			/* The unmount has removed mp from the mountlist */
 		}
 	}
 }
 
 /*
  * Perform msync on all vnodes under a mount point.
  * The mount point must be locked.
  *
  * Only vnodes flagged VI_OBJDIRTY are cleaned.  With flags == MNT_WAIT
  * pages are cleaned with OBJPC_SYNC and locked vnodes are processed too;
  * with any other value OBJPC_NOSYNC is used and vnodes for which
  * VOP_ISLOCKED() is non-zero are skipped.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
 	int tries;
 
 	/* Number of times the scan may be restarted before giving up. */
 	tries = 5;
 	MNT_ILOCK(mp);
 loop:
 	TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
 		/*
 		 * The vnode no longer belongs to this mount: restart from
 		 * the head while retries remain, otherwise stop.
 		 */
 		if (vp->v_mount != mp) {
 			if (--tries > 0)
 				goto loop;
 			break;
 		}
 
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_OBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/*
 			 * Drop the mount interlock before vget(); the vnode
 			 * interlock is handed to vget() via LK_INTERLOCK.
 			 */
 			MNT_IUNLOCK(mp);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					MNT_ILOCK(mp);
 					continue;
 				}
 
 				obj = vp->v_object;
 				if (obj != NULL) {
 					VM_OBJECT_LOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 					VM_OBJECT_UNLOCK(obj);
 				}
 				vput(vp);
 			}
 			MNT_ILOCK(mp);
 			/*
 			 * The list was unlocked for a while.  If vp's
 			 * successor is no longer the one saved in nvp,
 			 * restart the scan (or give up when out of retries).
 			 */
 			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
 				if (--tries > 0)
 					goto loop;
 				break;
 			}
 		} else
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Put a vnode on the free list so that it may be recycled.
  * Vnodes flagged VI_AGE or VI_DOOMED go to the front of the list,
  * all others to the back.  The caller holds the vnode interlock.
  */
 static void
 vfree(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, "vfree");
 	mtx_lock(&vnode_free_list_mtx);
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
 	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
 	if ((vp->v_iflag & (VI_AGE|VI_DOOMED)) == 0) {
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	} else {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	}
 	freevnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	/* Flag updates happen after the free list mutex is dropped. */
 	vp->v_iflag &= ~(VI_AGE|VI_DOOMED);
 	vp->v_iflag |= VI_FREE;
 }
 
 /*
  * Place a vnode at the front of the free list.  A vnode that is
  * already free is unlinked first; one that is not is marked VI_FREE
  * and counted in freevnodes.
  */
 static void
 vfreehead(struct vnode *vp)
 {
 	mtx_lock(&vnode_free_list_mtx);
 	if ((vp->v_iflag & VI_FREE) == 0) {
 		/* Newly freed: account for it. */
 		vp->v_iflag |= VI_FREE;
 		freevnodes++;
 	} else {
 		/* Already on the list: unlink before re-inserting. */
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	}
 	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * Opposite of vfree() - mark a vnode as in use.
  *
  * Removes the vnode from the free list, decrements freevnodes and
  * clears VI_FREE and VI_AGE.  The vnode interlock must be held and the
  * vnode must currently be flagged VI_FREE (both are asserted).
  */
 static void
 vbusy(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, "vbusy");
 	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
 
 	/* List and counter are protected by vnode_free_list_mtx. */
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	freevnodes--;
 	mtx_unlock(&vnode_free_list_mtx);
 
 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
 }
 
 /*
  * Initalize per-vnode helper structure to hold poll-related state.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *vi;
 
 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
 	if (vp->v_pollinfo != NULL) {
 		uma_zfree(vnodepoll_zone, vi);
 		return;
 	}
 	vp->v_pollinfo = vi;
 	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note,
 	    &vp->v_pollinfo->vpi_lock);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(vp, td, events)
 	struct vnode *vp;
 	struct thread *td;
 	short events;
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return events;
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return 0;
 }
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /*
  * Operations vector for syncer vnodes.  Only the entries below are
  * provided explicitly; vop_bypass is set to VOP_EOPNOTSUPP
  * (presumably the fallback for every other operation -- confirm against
  * the vop_vector dispatch code).
  */
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close (nullop) */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  *
  * On success the vnode is queued on the syncer worklist, counted in
  * sync_vnode_count, stored in mp->mnt_syncer, and 0 is returned.
  * If getnewvnode() fails, mp->mnt_syncer is set to NULL and its error
  * is returned.
  */
 int
 vfs_allocate_syncvnode(mp)
 	struct mount *mp;
 {
 	struct vnode *vp;
 	/* Scatter state; static, so it persists from one call to the next. */
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		/*
 		 * Halve the starting offset and the stride; once the offset
 		 * reaches zero, reset both from syncer_maxdelay.
 		 */
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	VI_LOCK(vp);
 	/* Guard the modulo against a zero syncdelay. */
 	vn_syncer_add_to_worklist(&vp->v_bufobj,
 	    syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Acts only when a_waitfor is MNT_LAZY.  Requeues the syncer vnode,
  * then, if the mount can be busied and a write can be started without
  * waiting, runs vfs_msync() and VFS_SYNC() on it.  Returns 0 when
  * nothing was done, otherwise the result of VFS_SYNC().
  */
 static int
 sync_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct thread *td = ap->a_td;
 	int error, asyncflag;
 	struct bufobj *bo;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
 	/* If the mount cannot be busied right now, skip this round. */
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 	/* Likewise if a write cannot be started without waiting. */
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp, td);
 		return (0);
 	}
 	/* Clear MNT_ASYNC for the duration of the sync, then restore it. */
 	asyncflag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY, td);
 	if (asyncflag)
 		mp->mnt_flag |= MNT_ASYNC;
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced.
  * Dispose of it by calling vgone(); always returns 0.
  */
 static int
 sync_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vgone(ap->a_vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned:
  * detach it from its mount and, if it is queued, from the syncer
  * worklist.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  */
 static int
 sync_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct bufobj *bo = &syncvp->v_bufobj;
 
 	VI_LOCK(syncvp);
 	syncvp->v_mount->mnt_syncer = NULL;
 	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 		/* Unlink from the worklist and fix up both counters. */
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		sync_vnode_count--;
 		mtx_unlock(&sync_mtx);
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	VI_UNLOCK(syncvp);
 
 	return (0);
 }
 
 /*
  * Check whether a vnode represents a disk device.
  *
  * Returns non-zero when it does.  When errp is not NULL the reason is
  * stored there: 0 for a disk, ENOTBLK for a vnode that is not VCHR or
  * whose driver lacks D_DISK, ENXIO when there is no device or no
  * device switch.
  */
 int
 vn_isdisk(vp, errp)
 	struct vnode *vp;
 	int *errp;
 {
 	int error;
 
 	dev_lock();
 	if (vp->v_type != VCHR)
 		error = ENOTBLK;
 	else if (vp->v_rdev == NULL || vp->v_rdev->si_devsw == NULL)
 		error = ENXIO;
 	else if ((vp->v_rdev->si_devsw->d_flags & D_DISK) == 0)
 		error = ENOTBLK;
 	else
 		error = 0;
 	dev_unlock();
 
 	if (errp != NULL)
 		*errp = error;
 	return (error == 0);
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, credentials,
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
  *
  * Exactly one permission class is consulted: owner if the credential's
  * uid matches, else group if the credential is a member of file_gid,
  * else other.  If that class does not grant everything requested, the
  * request can still succeed through privilege.  On failure the result
  * is EPERM when VADMIN was requested and EACCES otherwise.
  */
 int
 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
 	enum vtype type;
 	mode_t file_mode;
 	uid_t file_uid;
 	gid_t file_gid;
 	mode_t acc_mode;
 	struct ucred *cred;
 	int *privused;
 {
 	mode_t dac_granted;
 #ifdef CAPABILITIES
 	mode_t cap_granted;
 #endif
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 */
 
 	if (privused != NULL)
 		*privused = 0;
 
 	dac_granted = 0;
 
 	/* Check the owner. */
 	if (cred->cr_uid == file_uid) {
 		/* Only the owner class is ever granted VADMIN here. */
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((acc_mode & dac_granted) == acc_mode)
 			return (0);
 
 		/* The owner does not fall back to group or other bits. */
 		goto privcheck;
 	}
 
 	/* Otherwise, check the groups (first match) */
 	if (groupmember(file_gid, cred)) {
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((acc_mode & dac_granted) == acc_mode)
 			return (0);
 
 		/* A group member does not fall back to the other bits. */
 		goto privcheck;
 	}
 
 	/* Otherwise, check everyone else. */
 	if (file_mode & S_IXOTH)
 		dac_granted |= VEXEC;
 	if (file_mode & S_IROTH)
 		dac_granted |= VREAD;
 	if (file_mode & S_IWOTH)
 		dac_granted |= (VWRITE | VAPPEND);
 	if ((acc_mode & dac_granted) == acc_mode)
 		return (0);
 
 privcheck:
 	/* A zero return from suser_cred() grants the whole request. */
 	if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 
 #ifdef CAPABILITIES
 	/*
 	 * Build a capability mask to determine if the set of capabilities
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.
 	 * For each capability, if the capability is required, bitwise
 	 * or the request type onto the cap_granted mask.
 	 */
 	cap_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
 		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
 		 */
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	} else {
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	}
 
 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 		cap_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
 		cap_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
 		cap_granted |= VADMIN;
 
 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 #endif
 
 	/* Denied: administrative requests fail with EPERM, others EACCES. */
 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Credential check for extended attribute access, based on the
  * requesting credential and the attribute namespace.
  * Returns 0 when access is allowed, otherwise an errno.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace,
     struct ucred *cred, struct thread *td, int access)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly
 	 * manipulate system attributes.
 	 *
 	 * XXX What capability should apply here?
 	 * Probably CAP_SYS_SETFFLAG.
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		/* Potentially should be: return (EPERM); */
 		return (suser_cred(cred, 0));
 	}
 
 	/* User attributes follow the ordinary access check on the vnode. */
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, access, cred, td));
 
 	/* Any other namespace is refused. */
 	return (EPERM);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  */
 #define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
 
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
 #endif
 
 /*
  * Report a vnode locking violation.  What happens is selected by the
  * debug.vfs_badlock_* knobs: an optional backtrace (KDB kernels only),
  * an optional message of the form "<str>: <vp> <msg>", and an optional
  * drop into the debugger.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_print)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb)
 		kdb_enter("lock violation");
 }
 
 /*
  * Complain through vfs_badlock() when mtx_owned() says the vnode
  * interlock is not held.  Does nothing if vfs_badlock_mutex is zero.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex || mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() when mtx_owned() says the vnode
  * interlock is held.  Does nothing if vfs_badlock_mutex is zero.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex || !mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() when VOP_ISLOCKED() reports the vnode
  * as unlocked.  NULL vnodes and those matching IGNORE_LOCK() are not
  * checked.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 
 	if (vp == NULL || IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp, NULL) == 0)
 		vfs_badlock("is not locked but should be", str, vp);
 }
 
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 
 #if 0
 /*
  * Complain unless VOP_ISLOCKED() for curthread returns LK_EXCLOTHER.
  * Currently compiled out by the enclosing #if 0.
  */
 void
 assert_vop_elocked_other(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 		vfs_badlock("is not exclusive locked by another thread",
 		    str, vp);
 }
 
 /*
  * Complain unless VOP_ISLOCKED() for curthread returns LK_SHARED.
  * Currently compiled out by the enclosing #if 0.
  */
 void
 assert_vop_slocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 		vfs_badlock("is not locked shared but should be", str, vp);
 }
 #endif /* 0 */
 
 /*
  * Lock-state checks made before VOP_RENAME.  ap points to a
  * struct vop_rename_args.
  *
  * No vnode interlock may be held on any of the four vnodes.  The source
  * directory and source vnode must be unlocked (unless they are the same
  * vnode as the corresponding target), and the target directory and
  * target vnode (when present) must be locked.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	/* The vnode tested here is fvp, so the message must name fvp. */
 	if (a->a_tvp != a->a_fvp)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 }
 
 /*
  * Check made before VOP_STRATEGY: the buffer being handed down must
  * be locked (BUF_REFCNT() of at least 1), unless it is a cluster
  * container.  ap points to a struct vop_strategy_args.
  */
 void
 vop_strategy_pre(void *ap)
 {
 	struct vop_strategy_args *a = ap;
 	struct buf *bp = a->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	if (BUF_REFCNT(bp) >= 1)
 		return;
 
 	/* Violation: report and/or enter the debugger as configured. */
 	if (vfs_badlock_print)
 		printf(
 		    "VOP_STRATEGY: bp is not locked but should be\n");
 	if (vfs_badlock_ddb)
 		kdb_enter("lock violation");
 }
 
 /*
  * Check made before VOP_LOOKUP: the directory vnode must be locked
  * and its interlock must not be held.  ap points to a
  * struct vop_lookup_args.
  */
 void
 vop_lookup_pre(void *ap)
 {
 	struct vop_lookup_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 }
 
 /*
  * Check made after VOP_LOOKUP returned rc: the directory vnode must
  * still be locked with its interlock not held, and when the lookup
  * succeeded (rc == 0) the resulting vnode must be locked too.
  */
 void
 vop_lookup_post(void *ap, int rc)
 {
 	struct vop_lookup_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 	struct vnode *vp = *(a->a_vpp);
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 
 	if (rc == 0)
 		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
 }
 
 /*
  * Check made before VOP_LOCK: the vnode interlock must be held
  * exactly when the caller passed LK_INTERLOCK.
  */
 void
 vop_lock_pre(void *ap)
 {
 	struct vop_lock_args *a;
 
 	a = ap;
 	if ((a->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 	else
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 }
 
 /*
  * Check made after VOP_LOCK returned rc: the interlock must not be
  * held, and on success (rc == 0) the vnode must be locked.
  */
 void
 vop_lock_post(void *ap, int rc)
 {
 	struct vop_lock_args *a;
 
 	a = ap;
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	if (!rc)
 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 }
 
 /*
  * Check made before VOP_UNLOCK: the vnode must be locked, and the
  * interlock must be held if the caller passed LK_INTERLOCK.
  */
 void
 vop_unlock_pre(void *ap)
 {
 	struct vop_unlock_args *a;
 
 	a = ap;
 	if ((a->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 }
 
 /*
  * Check made after VOP_UNLOCK: if the caller passed LK_INTERLOCK the
  * interlock must no longer be held.  rc is not examined.
  */
 void
 vop_unlock_post(void *ap, int rc)
 {
 	struct vop_unlock_args *a;
 
 	a = ap;
 	if ((a->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
 }
 #endif /* DEBUG_VFS_LOCKS */
 
 /* Knote list used by the filesystem event filter and its signaller. */
 static struct knlist fs_knlist;
 
 /*
  * Initialize fs_knlist.  Registered as a SYSINIT below; arg is unused.
  */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init(&fs_knlist, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post a filesystem event to every knote on fs_knlist, passing `event'
  * as the hint.  Neither fsid nor data is used here.
  */
 void
 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 /*
  * Filter operations for filesystem events: attach, detach and event
  * handlers, in that order after the leading 0.
  * NOTE(review): the meaning of the leading 0 depends on the layout of
  * struct filterops, which is not visible here -- confirm.
  */
 struct filterops fs_filtops =
 	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
 
 /*
  * Attach a knote to the filesystem event list.  EV_CLEAR is always set
  * on the knote.  Always returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /*
  * Detach a knote from the filesystem event list.
  */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event filter: accumulate the hint bits into kn_fflags and report the
  * knote as active (non-zero return) while any bit is set.
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Handler for the vfs.ctl sysctl ("Sysctl by fsid").  Reads a
  * struct vfsidctl from the request, looks up the mount by its fsid and
  * passes the operation on to that filesystem through VFS_SYSCTL().
  *
  * Returns EINVAL for a version other than VFS_CTL_VERS1 or a filesystem
  * type name that is neither "*" nor the mount's type, and ENOENT when
  * no mount has the given fsid.
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	int error;
 	struct mount *mp;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	/*
 	 * NOTE(review): vc_fstypename was just copied in from the request
 	 * and nothing here forces NUL termination before strcmp() --
 	 * confirm this is safe.
 	 */
 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 		return (EINVAL);
 	}
 	VCTLTOREQ(&vc, req);
 	return (VFS_SYSCTL(mp, vc.vc_op, req));
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
         NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
Index: head/sys/net/if_mib.c
===================================================================
--- head/sys/net/if_mib.c	(revision 145952)
+++ head/sys/net/if_mib.c	(revision 145953)
@@ -1,143 +1,144 @@
 /*-
  * Copyright 1996 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_mib.h>
 
 /*
  * A sysctl(3) MIB for generic interface information.  This information
  * is exported in the net.link.generic branch, which has the following
  * structure:
  *
  * net.link.generic	.system			- system-wide control variables
  *						  and statistics (node)
  *			.ifdata.<ifindex>.general
  *						- what's in `struct ifdata'
  *						  plus some other info
  *			.ifdata.<ifindex>.linkspecific
  *						- a link-type-specific data
  *						  structure (as might be used
  *						  by an SNMP agent)
  *
  * Perhaps someday we will make addresses accessible via this interface
  * as well (then there will be four such...).  The reason that the
  * index comes before the last element in the name is because it
  * seems more orthogonal that way, particularly with the possibility
  * of other per-interface data living down here as well (e.g., integrated
  * services stuff).
  */
 
 SYSCTL_DECL(_net_link_generic);
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0,
 	    "Variables global to all interfaces");
 SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD,
 	   &if_index, 0, "Number of configured interfaces");
 
 /*
  * Handler for the net.link.generic.ifdata node.  The remaining name
  * must have exactly two components: name[0] is the interface index and
  * name[1] selects IFDATA_GENERAL or IFDATA_LINKSPECIFIC.
  *
  * The current data is always copied out; if the request also carries a
  * new value (req->newptr set) it is then copied in and applied.
  * Returns EINVAL for a wrong name length, ENOENT for an unknown
  * interface or selector, or an error from SYSCTL_OUT()/SYSCTL_IN().
  */
 static int
 sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
 {
 	int *name = (int *)arg1;
 	int error;
 	u_int namelen = arg2;
 	struct ifnet *ifp;
 	struct ifmibdata ifmd;
 
 	if (namelen != 2)
 		return EINVAL;
 
 	if (name[0] <= 0 || name[0] > if_index ||
 	    ifaddr_byindex(name[0]) == NULL)
 		return ENOENT;
 
 	ifp = ifaddr_byindex(name[0])->ifa_ifp;
 
 	switch(name[1]) {
 	default:
 		return ENOENT;
 
 	case IFDATA_GENERAL:
 		/*
 		 * The whole structure is copied out below, so clear it
 		 * first; bytes not assigned here would otherwise hold stale
 		 * stack contents.
 		 */
+		bzero(&ifmd, sizeof(ifmd));
 		strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name));
 
 #define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld
 		COPY(pcount);
 		COPY(flags);
 		COPY(data);
 #undef COPY
 		ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
 		ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
 		ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops;
 
 		error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
 		/* Read-only request: nothing more to do. */
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, &ifmd, sizeof ifmd);
 		if (error)
 			return error;
 
 		/*
 		 * Overwrite these fields of the incoming data with the
 		 * interface's current values, so that copying if_data back
 		 * below leaves them unchanged.
 		 */
 #define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld
 		DONTCOPY(type);
 		DONTCOPY(physical);
 		DONTCOPY(addrlen);
 		DONTCOPY(hdrlen);
 		DONTCOPY(mtu);
 		DONTCOPY(metric);
 		DONTCOPY(baudrate);
 #undef DONTCOPY
 		/* Apply the new data and send-queue settings. */
 #define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld
 		COPY(data);
 		ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen;
 		ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops;
 #undef COPY
 		break;
 
 	case IFDATA_LINKSPECIFIC:
 		/* Copy out, and optionally replace, the link-specific MIB. */
 		error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error)
 			return error;
 		
 	}
 	return 0;
 }
 
 SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW,
 	    sysctl_ifdata, "Interface table");
 
Index: head/sys/netinet/ip_divert.c
===================================================================
--- head/sys/netinet/ip_divert.c	(revision 145952)
+++ head/sys/netinet/ip_divert.c	(revision 145953)
@@ -1,717 +1,718 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #if !defined(KLD_MODULE)
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_mac.h"
 #ifndef INET
 #error "IPDIVERT requires INET."
 #endif
 #ifndef IPFIREWALL
 #error "IPDIVERT requires IPFIREWALL"
 #endif
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mac.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_divert.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 
 /*
  * Divert sockets
  */
 
 /*
  * Default send and receive buffer sizes for a divert socket: enough
  * space to hold a full IP packet (65536 bytes) plus 100 bytes of slack.
  * They seed div_sendspace/div_recvspace, which div_attach() hands to
  * soreserve().
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Divert sockets work in conjunction with ipfw, see the divert(4)
  * manpage for features.
  * Internally, packets selected by ipfw in ip_input() or ip_output(),
  * and never diverted before, are passed to the input queue of the
  * divert socket with a given 'divert_port' number (as specified in
  * the matching ipfw rule), and they are tagged with a 16 bit cookie
  * (representing the rule number of the matching ipfw rule), which
  * is passed to process reading from the socket.
  *
  * Packets written to the divert socket are again tagged with a cookie
  * (usually the same as above) and a destination address.
  * If the destination address is INADDR_ANY then the packet is
  * treated as outgoing and sent to ip_output(), otherwise it is
  * treated as incoming and sent to ip_input().
  * In both cases, the packet is tagged with the cookie.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * ipfw processing will start at the rule number after the one
  * written in the cookie (so, tagging a packet with a cookie of 0
  * will cause it to be effectively considered as a standard packet).
  */
 
 /*
  * Internal variables.
  *
  * divcb is the list of divert PCBs and divcbinfo the pcbinfo that
  * carries its lock, hash tables and UMA zone; both are set up by
  * div_init().  div_sendspace/div_recvspace are the buffer reservations
  * requested by div_attach().
  */
 static struct inpcbhead divcb;
 static struct inpcbinfo divcbinfo;
 
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 /*
  * Initialize divert connection block queue.
  */
 void
 div_init(void)
 {
 	INP_INFO_LOCK_INIT(&divcbinfo, "div");
 	LIST_INIT(&divcb);
 	divcbinfo.listhead = &divcb;
 	/*
 	 * XXX We don't use the hash list for divert IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask);
 	divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask);
 	divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(divcbinfo.ipi_zone, maxsockets);
 }
 
 /*
  * Protocol input entry point for IPPROTO_DIVERT.  That number is not
  * part of the real IP protocol space, so nothing should ever arrive
  * here; if something does, discard it and count it as "no protocol".
  */
 void
 div_input(struct mbuf *m, int off)
 {
 	/* 'off' is unused: the packet is dropped unconditionally. */
 	m_freem(m);
 	ipstat.ips_noproto++;
 }
 
 /*
  * Divert a packet by passing it up to the divert socket bound to the
  * port recorded in the packet's PACKET_TAG_DIVERT tag.
  *
  * A sockaddr_in describing the packet's origin is built and appended,
  * together with the mbuf chain, to the matching socket's receive
  * buffer:
  *   sin_port - the cookie from the divert tag (matching rule),
  *   sin_addr - first AF_INET address of the receive interface, filled
  *              in only when 'incoming' is non-zero,
  *   sin_zero - name of the receive interface, whenever rcvif is set.
  *
  * On the paths visible here the mbuf is either queued or freed
  * (m_pullup() is expected to free the chain itself when it fails).
  * If no socket accepts the packet, ips_noproto is bumped and
  * ips_delivered is decremented.
  */
 static void
 divert_packet(struct mbuf *m, int incoming)
 {
 	struct ip *ip;
 	struct inpcb *inp;
 	struct socket *sa;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
 	struct m_tag *mtag;
 
 	/* Without a divert tag there is no port or cookie to work with. */
 	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
 	if (mtag == NULL) {
 		printf("%s: no divert tag\n", __func__);
 		m_freem(m);
 		return;
 	}
 	/* Assure header */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == 0)
 		return;
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Delayed checksums are currently not compatible with divert.
 	 * ip_len is flipped to host order around in_delayed_cksum() and
 	 * restored afterwards.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		ip->ip_len = ntohs(ip->ip_len);
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		ip->ip_len = htons(ip->ip_len);
 	}
 
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	bzero(&divsrc, sizeof(divsrc));
 	divsrc.sin_len = sizeof(divsrc);
 	divsrc.sin_family = AF_INET;
 	divsrc.sin_port = divert_cookie(mtag);	/* record matching rule */
 	if (incoming) {
 		struct ifaddr *ifa;
 
 		/* Sanity check */
 		M_ASSERTPKTHDR(m);
 
 		/* Find IP address for receive interface */
 		TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr == NULL)
 				continue;
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in the sin_zero array.
 		 * XXX This needs to be moved to a different sockaddr
 		 * type for divert, e.g. sockaddr_div with multiple
 		 * fields like sockaddr_dl. Presently we have only 7
 		 * bytes but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */
 		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 		    sizeof(divsrc.sin_zero));
 	}
 
 	/* Put packet on socket queue, if any */
 	sa = NULL;
 	nport = htons((u_int16_t)divert_info(mtag));
 	INP_INFO_RLOCK(&divcbinfo);
 	LIST_FOREACH(inp, &divcb, inp_list) {
 		INP_LOCK(inp);
 		/* XXX why does only one socket match? */
 		if (inp->inp_lport == nport) {
 			sa = inp->inp_socket;
 			SOCKBUF_LOCK(&sa->so_rcv);
 			/*
 			 * On success the wakeup call is entered with the
 			 * sockbuf lock still held and no explicit unlock
 			 * follows; on failure the lock is dropped here.
 			 */
 			if (sbappendaddr_locked(&sa->so_rcv,
 			    (struct sockaddr *)&divsrc, m,
 			    (struct mbuf *)0) == 0) {
 				SOCKBUF_UNLOCK(&sa->so_rcv);
 				sa = NULL;	/* force mbuf reclaim below */
 			} else
 				sorwakeup_locked(sa);
 			INP_UNLOCK(inp);
 			break;
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&divcbinfo);
 	/* Nobody took the packet: free it and fix up the statistics. */
 	if (sa == NULL) {
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
         }
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  *
  * so      - the divert socket the packet was written to.
  * m       - the packet; consumed on every path visible here (handed to
  *           ip_output()/ip_input() or freed).
  * sin     - optional address supplied by the writer: sin_port becomes
  *           the divert cookie, sin_zero may name the receive interface,
  *           sin_addr selects output (0) versus input (non-zero).
  * control - freed unused when present.
  *
  * Returns 0, ENOBUFS (no divert tag could be allocated), EINVAL (bad
  * header length/options combination or ip_len larger than the packet),
  * EADDRNOTAVAIL (no interface owns sin_addr), or the ip_output() result.
  */
 static int
 div_output(struct socket *so, struct mbuf *m,
 	struct sockaddr_in *sin, struct mbuf *control)
 {
 	struct m_tag *mtag;
 	struct divert_tag *dt;
 	int error = 0;
 
 	m->m_pkthdr.rcvif = NULL;
 
 	if (control)
 		m_freem(control);		/* XXX */
 
 	/* Reuse an existing divert tag or attach a fresh zeroed one. */
 	if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) {
 		mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag),
 		    M_NOWAIT | M_ZERO);
 		if (mtag == NULL) {
 			error = ENOBUFS;
 			goto cantsend;
 		}
 		dt = (struct divert_tag *)(mtag+1);
 		m_tag_prepend(m, mtag);
 	} else
 		dt = (struct divert_tag *)(mtag+1);
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		int i;
 
 		dt->cookie = sin->sin_port;
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 */
 		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 			;
 		/* Only a non-empty, NUL-terminated name is looked up. */
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		struct ip *const ip = mtod(m, struct ip *);
 		struct inpcb *inp;
 
 		dt->info |= IP_FW_DIVERT_OUTPUT_FLAG;
 		INP_INFO_WLOCK(&divcbinfo);
 		inp = sotoinpcb(so);
 		INP_LOCK(inp);
 		/*
 		 * Don't allow both user specified and setsockopt options,
 		 * and don't allow packet length sizes that will crash
 		 */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
 		     ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			error = EINVAL;
 			m_freem(m);
 		} else {
 			/* Convert fields to host order for ip_output() */
 			ip->ip_len = ntohs(ip->ip_len);
 			ip->ip_off = ntohs(ip->ip_off);
 
 			/* Send packet to output processing */
 			ipstat.ips_rawout++;			/* XXX */
 
 #ifdef MAC
 			mac_create_mbuf_from_inpcb(inp, m);
 #endif
 			error = ip_output(m,
 				    inp->inp_options, NULL,
 				    ((so->so_options & SO_DONTROUTE) ?
 				    IP_ROUTETOIF : 0) |
 				    IP_ALLOWBROADCAST | IP_RAWOUTPUT,
 				    inp->inp_moptions, NULL);
 		}
 		INP_UNLOCK(inp);
 		INP_INFO_WUNLOCK(&divcbinfo);
 	} else {
 		dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG;
 		if (m->m_pkthdr.rcvif == NULL) {
 			/*
 			 * No luck with the name, check by IP address.
 			 * Clear the port and the ifname to make sure
 			 * there are no distractions for ifa_ifwithaddr.
 			 */
 			struct	ifaddr *ifa;
 
 			bzero(sin->sin_zero, sizeof(sin->sin_zero));
 			sin->sin_port = 0;
 			ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 			if (ifa == NULL) {
 				error = EADDRNOTAVAIL;
 				goto cantsend;
 			}
 			m->m_pkthdr.rcvif = ifa->ifa_ifp;
 		}
 #ifdef MAC
 		SOCK_LOCK(so);
 		mac_create_mbuf_from_socket(so, m);
 		SOCK_UNLOCK(so);
 #endif
 		/* Send packet to input processing */
 		ip_input(m);
 	}
 
 	return error;
 
 	/* Error exit for paths that still own the mbuf. */
 cantsend:
 	m_freem(m);
 	return error;
 }
 
 /*
  * Attach a protocol control block to a new divert socket.
  *
  * Fails with EINVAL if the socket already has a PCB, with the suser()
  * error if a thread is supplied and it is not privileged, or with the
  * soreserve()/in_pcballoc() error.  On success the new PCB is marked
  * IPv4 with header-included semantics and 0 is returned.
  */
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *pcb;
 	int rc;
 
 	INP_INFO_WLOCK(&divcbinfo);
 	if (sotoinpcb(so) != 0) {
 		rc = EINVAL;
 		goto unlock;
 	}
 	if (td) {
 		rc = suser(td);
 		if (rc != 0)
 			goto unlock;
 	}
 	rc = soreserve(so, div_sendspace, div_recvspace);
 	if (rc)
 		goto unlock;
 	rc = in_pcballoc(so, &divcbinfo, "divinp");
 	if (rc)
 		goto unlock;
 
 	/* Take the PCB lock before letting go of the pcbinfo lock. */
 	pcb = (struct inpcb *)so->so_pcb;
 	INP_LOCK(pcb);
 	INP_INFO_WUNLOCK(&divcbinfo);
 	pcb->inp_ip_p = proto;
 	pcb->inp_vflag |= INP_IPV4;
 	pcb->inp_flags |= INP_HDRINCL;
 	INP_UNLOCK(pcb);
 	return 0;
 
 unlock:
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return rc;
 }
 
 /*
  * Detach and release the PCB of a divert socket.
  * Returns EINVAL if the socket has no PCB, 0 otherwise.
  */
 static int
 div_detach(struct socket *so)
 {
 	struct inpcb *pcb;
 	int rc;
 
 	INP_INFO_WLOCK(&divcbinfo);
 	pcb = sotoinpcb(so);
 	if (pcb != 0) {
 		/* The PCB is handed to in_pcbdetach() locked. */
 		INP_LOCK(pcb);
 		in_pcbdetach(pcb);
 		rc = 0;
 	} else
 		rc = EINVAL;
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return rc;
 }
 
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK(&divcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&divcbinfo);
 		return EINVAL;
 	}
 	/* in_pcbbind assumes that nam is a sockaddr_in
 	 * and in_pcbbind requires a valid address. Since divert
 	 * sockets don't we need to make sure the address is
 	 * filled in properly.
 	 * XXX -- divert should not be abusing in_pcbind
 	 * and should probably have its own family.
 	 */
 	if (nam->sa_family != AF_INET)
 		error = EAFNOSUPPORT;
 	else {
 		((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 		INP_LOCK(inp);
 		error = in_pcbbind(inp, nam, td->td_ucred);
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return error;
 }
 
 /*
  * Mark a divert socket as unable to send any more data.
  * Returns EINVAL if the socket has no PCB, 0 otherwise.
  */
 static int
 div_shutdown(struct socket *so)
 {
 	struct inpcb *pcb;
 
 	INP_INFO_RLOCK(&divcbinfo);
 	pcb = sotoinpcb(so);
 	if (pcb != 0) {
 		/* Hold the PCB lock across socantsendmore() only. */
 		INP_LOCK(pcb);
 		INP_INFO_RUNLOCK(&divcbinfo);
 		socantsendmore(so);
 		INP_UNLOCK(pcb);
 		return 0;
 	}
 	INP_INFO_RUNLOCK(&divcbinfo);
 	return EINVAL;
 }
 
 /*
  * pru_send handler for divert sockets: make sure the first mbuf holds
  * a complete IP header, then hand the packet to div_output().
  *
  * Both 'm' and 'control' are consumed on every path.  Returns EINVAL
  * (and counts ips_toosmall) when the header cannot be pulled up,
  * otherwise the div_output() result.  'flags' and 'td' are unused.
  */
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 	/* Packet must have a header (but that's about it) */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
 		ipstat.ips_toosmall++;
 		/*
 		 * m_pullup() has already freed the data chain, so only
 		 * the control mbuf is left to release; div_output(),
 		 * which normally frees it, is not reached on this path.
 		 */
 		if (control)
 			m_freem(control);
 		return EINVAL;
 	}
 
 	/* Send packet */
 	return div_output(so, m, (struct sockaddr_in *)nam, control);
 }
 
 /*
  * Control input handler for divert.  It only validates its arguments
  * and takes no action: non-AF_INET addresses, INADDR_ANY and redirect
  * commands return early, and nothing follows for the remaining cases.
  * 'vip' is unused.
  */
 void
 div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_addr faddr;
 
 	/* Check the family before viewing 'sa' as a sockaddr_in. */
 	if (sa->sa_family != AF_INET)
 		return;
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (faddr.s_addr == INADDR_ANY)
 		return;
 	if (PRC_IS_REDIRECT(cmd))
 		return;
 }
 
 /*
  * Sysctl handler exporting the list of divert PCBs.
  *
  * With no output buffer it only reports an estimated size.  Otherwise
  * the output is a struct xinpgen header, one struct xinpcb per PCB that
  * existed at the sampled generation and that the caller may see, and a
  * trailing struct xinpgen carrying the then-current generation counts
  * so the caller can detect changes made during the export.
  *
  * Returns EPERM on an attempt to write, the sysctl_wire_old_buffer()
  * or SYSCTL_OUT() error on failure, 0 otherwise.
  */
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		/* Size estimate with n/8 extra entries of headroom. */
 		n = divcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&divcbinfo);
 	gencnt = divcbinfo.ipi_gencnt;
 	n = divcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&divcbinfo);
 
 	error = sysctl_wire_old_buffer(req,
 	    2 * sizeof(xig) + n*sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	/* An M_WAITOK allocation does not return NULL, so no check. */
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 
 	/* Collect up to n PCBs that are old enough and visible. */
 	INP_INFO_RLOCK(&divcbinfo);
 	for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
 			inp_list[i++] = inp;
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&divcbinfo);
 	n = i;
 
 	/*
 	 * Stop at the first copy-out failure so that a later successful
 	 * SYSCTL_OUT() cannot overwrite the error.
 	 */
 	error = 0;
 	for (i = 0; i < n && error == 0; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
 			/* Zero first so unassigned bytes are not exported. */
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&divcbinfo);
 		xig.xig_gen = divcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = divcbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&divcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /*
  * This is the wrapper function for in_setsockaddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 div_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setsockaddr(so, nam, &divcbinfo));
 }
 
 /*
  * This is the wrapper function for in_setpeeraddr. We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 div_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setpeeraddr(so, nam, &divcbinfo));
 }
 
 /*
  * Sysctl tree for divert: a "divert" node under _net_inet and, below
  * it, the read-only "pcblist" entry served by div_pcblist().
  */
 #ifdef SYSCTL_NODE
 SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT");
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
 	    div_pcblist, "S,xinpcb", "List of active divert sockets");
 #endif
 
 /*
  * User-request dispatch table for divert sockets.  Entries not named
  * here are left at their zero-initialized value.
  */
 struct pr_usrreqs div_usrreqs = {
 	.pru_attach =		div_attach,
 	.pru_bind =		div_bind,
 	.pru_control =		in_control,
 	.pru_detach =		div_detach,
 	.pru_peeraddr =		div_peeraddr,
 	.pru_send =		div_send,
 	.pru_shutdown =		div_shutdown,
 	.pru_sockaddr =		div_sockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel
 };
 
 /*
  * Protocol switch entry for divert, registered by div_modevent().
  * The initializer is positional, so the meaning of each slot depends
  * on the member order of struct protosw (declared elsewhere -- check
  * there before editing).  It wires up div_input, div_ctlinput,
  * ip_ctloutput, div_init and the div_usrreqs table for a SOCK_RAW
  * socket of protocol IPPROTO_DIVERT.
  */
 struct protosw div_protosw = {
   SOCK_RAW,	NULL,		IPPROTO_DIVERT,	PR_ATOMIC|PR_ADDR,
   div_input,	NULL,		div_ctlinput,	ip_ctloutput,
   NULL,
   div_init,	NULL,		NULL,		NULL,
   &div_usrreqs
 };
 
 /*
  * Module event handler for ipdivert.
  *
  * MOD_LOAD    registers the divert protocol and, only if that worked,
  *             installs the divert_packet() hook in ip_divert_ptr.
  * MOD_QUIESCE always refuses with EPERM so that only a forced unload
  *             proceeds.
  * MOD_UNLOAD  fails with EBUSY while any divert PCB exists; otherwise
  *             removes the hook, unregisters the protocol and destroys
  *             the pcbinfo lock and PCB zone.
  * Any other event yields EOPNOTSUPP.
  */
 static int
 div_modevent(module_t mod, int type, void *unused)
 {
 	int err = 0;
 	int n;
 
 	switch (type) {
 	case MOD_LOAD:
 		/*
 		 * Protocol will be initialized by pf_proto_register().
 		 * We don't have to register ip_protox because we are not
 		 * a true IP protocol that goes over the wire.
 		 */
 		err = pf_proto_register(PF_INET, &div_protosw);
 		/*
 		 * Do not publish the hook when registration failed:
 		 * divert_packet() would then be reachable without the
 		 * protocol having been registered.
 		 */
 		if (err != 0)
 			break;
 		ip_divert_ptr = divert_packet;
 		break;
 	case MOD_QUIESCE:
 		/*
 		 * IPDIVERT may normally not be unloaded because of the
 		 * potential race conditions.  Tell kldunload we can't be
 		 * unloaded unless the unload is forced.
 		 */
 		err = EPERM;
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * Forced unload.
 		 *
 		 * Module ipdivert can only be unloaded if no sockets are
 		 * connected.  Maybe this can be changed later to forcefully
 		 * disconnect any open sockets.
 		 *
 		 * XXXRW: Note that there is a slight race here, as a new
 		 * socket open request could be spinning on the lock and then
 		 * we destroy the lock.
 		 */
 		INP_INFO_WLOCK(&divcbinfo);
 		n = divcbinfo.ipi_count;
 		if (n != 0) {
 			err = EBUSY;
 			INP_INFO_WUNLOCK(&divcbinfo);
 			break;
 		}
 		ip_divert_ptr = NULL;
 		err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
 		INP_INFO_WUNLOCK(&divcbinfo);
 		INP_INFO_LOCK_DESTROY(&divcbinfo);
 		uma_zdestroy(divcbinfo.ipi_zone);
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return err;
 }
 
 /* Module glue: name, event handler and (unused) private data. */
 static moduledata_t ipdivertmod = {
         "ipdivert",
         div_modevent,
         0
 };
 
 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 /* The dependency on ipfw belongs to this module, ipdivert. */
 MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
 MODULE_VERSION(ipdivert, 1);
Index: head/sys/netinet/raw_ip.c
===================================================================
--- head/sys/netinet/raw_ip.c	(revision 145952)
+++ head/sys/netinet/raw_ip.c	(revision 145953)
@@ -1,914 +1,915 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
  * $FreeBSD$
  */
 
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_mroute.h>
 
 #include <netinet/ip_fw.h>
 #include <netinet/ip_dummynet.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 /* List of raw IP PCBs and its pcbinfo; both are set up by rip_init(). */
 struct	inpcbhead ripcb;
 struct	inpcbinfo ripcbinfo;
 
 /*
  * Control hooks for ipfw and dummynet.  rip_ctloutput() forwards the
  * IP_FW_* and IP_DUMMYNET_* socket options through them and answers
  * ENOPROTOOPT while a hook is NULL.
  */
 ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
 ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
 
 /*
  * Hooks for multicast routing.  They all default to NULL,
  * so leave them uninitialized and rely on BSS being set to 0.
  */
 
 /* The socket used to communicate with the multicast routing daemon.  */
 struct socket  *ip_mrouter;
 
 /*
  * The various mrouter and rsvp functions.  Of these, rip_ctloutput()
  * calls ip_mrouter_set/ip_mrouter_get for the MRT_* options and
  * ip_rsvp_vif for IP_RSVP_VIF_ON/OFF, testing each for NULL first.
  */
 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip_mrouter_done)(void);
 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 		   struct ip_moptions *);
 int (*mrt_ioctl)(int, caddr_t);
 int (*legal_vif_num)(int);
 u_long (*ip_mcast_src)(int);
 
 void (*rsvp_input_p)(struct mbuf *m, int off);
 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
 void (*ip_rsvp_force_done)(struct socket *);
 
 /*
  * Nominal space allocated to a raw ip socket.
  */
 #define	RIPSNDQ		8192
 #define	RIPRCVQ		8192
 
 /*
  * Raw interface to IP protocol.
  */
 
 /*
  * Initialize raw connection block q.
  */
 void
 rip_init()
 {
 	INP_INFO_LOCK_INIT(&ripcbinfo, "rip");
 	LIST_INIT(&ripcb);
 	ripcbinfo.listhead = &ripcb;
 	/*
 	 * XXX We don't use the hash list for raw IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
 	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
 	ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
 }
 
 /*
  * Source address template handed to the socket buffer by raw_append().
  * A single shared instance: rip_input() stores the packet's source
  * address into sin_addr before delivering.
  * NOTE(review): rip_input() writes it while holding only the pcbinfo
  * read lock -- confirm that is sufficient if inputs can run in parallel.
  */
 static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
 
 /*
  * Deliver one packet to the raw socket behind PCB 'last', subject to
  * the IPsec and MAC policy checks compiled into the kernel.
  *
  * last - destination PCB; the caller must hold its lock.
  * ip   - IP header of the packet, used for control-message generation.
  * n    - the packet; always consumed (queued on the socket or freed).
  *
  * Returns non-zero if a policy check rejected the packet and 0
  * otherwise, including when the receive buffer was full and the
  * packet was dropped.
  */
 static int
 raw_append(struct inpcb *last, struct ip *ip, struct mbuf *n)
 {
 	int policyfail = 0;
 
 	INP_LOCK_ASSERT(last);
 
 #if defined(IPSEC) || defined(FAST_IPSEC)
 	/* check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, last)) {
 		policyfail = 1;
 #ifdef IPSEC
 		ipsecstat.in_polvio++;
 #endif /*IPSEC*/
 		/* do not inject data to pcb */
 	}
 #endif /*IPSEC || FAST_IPSEC*/
 #ifdef MAC
 	if (!policyfail && mac_check_inpcb_deliver(last, n) != 0)
 		policyfail = 1;
 #endif
 	if (!policyfail) {
 		struct mbuf *opts = NULL;
 		struct socket *so;
 
 		so = last->inp_socket;
 		/* Build control data only if the socket asked for any. */
 		if ((last->inp_flags & INP_CONTROLOPTS) ||
 		    (so->so_options & SO_TIMESTAMP))
 			ip_savecontrol(last, &opts, ip, n);
 		SOCKBUF_LOCK(&so->so_rcv);
 		/*
 		 * On success the wakeup call is entered with the sockbuf
 		 * lock still held and no explicit unlock follows; on
 		 * failure the lock is dropped here after freeing.
 		 */
 		if (sbappendaddr_locked(&so->so_rcv,
 		    (struct sockaddr *)&ripsrc, n, opts) == 0) {
 			/* should notify about lost packet */
 			m_freem(n);
 			if (opts)
 				m_freem(opts);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 		} else
 			sorwakeup_locked(so);
 	} else
 		m_freem(n);
 	return policyfail;
 }
 
 /*
  * Input routine for raw IP: hand the packet to every raw socket whose
  * PCB matches it.
  *
  * A PCB matches when its protocol is 0 or equal to the packet's, it is
  * an IPv4 PCB (checked under INET6 only), its local and foreign
  * addresses are unset or equal to the packet's destination and source,
  * and, for a jailed socket, the jail's address equals the destination.
  *
  * Delivery lags one match behind the scan: each further match causes a
  * copy to be queued on the previous one ('last'), and the original
  * mbuf goes to the final match.  With no match the packet is freed and
  * counted in ips_noproto.  'off' is unused.
  */
 void
 rip_input(struct mbuf *m, int off)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	int proto = ip->ip_p;
 	struct inpcb *inp, *last;
 
 	INP_INFO_RLOCK(&ripcbinfo);
 	/* Source address reported to receivers; see raw_append(). */
 	ripsrc.sin_addr = ip->ip_src;
 	last = NULL;
 	LIST_FOREACH(inp, &ripcb, inp_list) {
 		INP_LOCK(inp);
 		if (inp->inp_ip_p && inp->inp_ip_p != proto) {
 			/*
 			 * Shared "no match" exit, also reached by goto
 			 * from the tests below: unlock and move on.
 			 */
 	docontinue:
 			INP_UNLOCK(inp);
 			continue;
 		}
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			goto docontinue;
 #endif
 		if (inp->inp_laddr.s_addr &&
 		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 			goto docontinue;
 		if (inp->inp_faddr.s_addr &&
 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 			goto docontinue;
 		if (jailed(inp->inp_socket->so_cred))
 			if (htonl(prison_getip(inp->inp_socket->so_cred)) !=
 			    ip->ip_dst.s_addr)
 				goto docontinue;
 		/*
 		 * Match.  The previous match (still locked) gets a copy;
 		 * this PCB stays locked and becomes the new 'last'.
 		 */
 		if (last) {
 			struct mbuf *n;
 
 			n = m_copy(m, 0, (int)M_COPYALL);
 			if (n != NULL)
 				(void) raw_append(last, ip, n);
 			/* XXX count dropped packet */
 			INP_UNLOCK(last);
 		}
 		last = inp;
 	}
 	/* The final match receives the original mbuf. */
 	if (last != NULL) {
 		if (raw_append(last, ip, m) != 0)
 			ipstat.ips_delivered--;
 		INP_UNLOCK(last);
 	} else {
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
 	}
 	INP_INFO_RUNLOCK(&ripcbinfo);
 }
 
 /*
  * Generate IP header and pass packet to ip_output.
  * Tack on options user may have setup with control call.
  *
  * m   - payload, or a complete IP packet when the socket has
  *       INP_HDRINCL set; consumed on every path.
  * so  - the sending raw socket.
  * dst - destination address, stored as-is into ip_dst when the header
  *       is built here.
  *
  * Returns EMSGSIZE if the packet would exceed IP_MAXPACKET, ENOBUFS if
  * no room for the header could be obtained, EPERM if a jailed sender
  * supplied a source address other than the jail's, EINVAL for an
  * inconsistent user-supplied header, otherwise the ip_output() result.
  */
 int
 rip_output(struct mbuf *m, struct socket *so, u_long dst)
 {
 	struct ip *ip;
 	int error;
 	struct inpcb *inp = sotoinpcb(so);
 	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
 	    IP_ALLOWBROADCAST;
 
 	/*
 	 * If the user handed us a complete IP packet, use it.
 	 * Otherwise, allocate an mbuf for a header and fill it in.
 	 */
 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
 		if (m == NULL)
 			return(ENOBUFS);
 
 		/* The PCB lock is held from here until after ip_output(). */
 		INP_LOCK(inp);
 		ip = mtod(m, struct ip *);
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_off = 0;
 		ip->ip_p = inp->inp_ip_p;
 		ip->ip_len = m->m_pkthdr.len;
 		/* A jailed sender always uses the jail's address. */
 		if (jailed(inp->inp_socket->so_cred))
 			ip->ip_src.s_addr =
 			    htonl(prison_getip(inp->inp_socket->so_cred));
 		else
 			ip->ip_src = inp->inp_laddr;
 		ip->ip_dst.s_addr = dst;
 		ip->ip_ttl = inp->inp_ip_ttl;
 	} else {
 		if (m->m_pkthdr.len > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		INP_LOCK(inp);
 		ip = mtod(m, struct ip *);
 		/* A jailed sender may not choose another source address. */
 		if (jailed(inp->inp_socket->so_cred)) {
 			if (ip->ip_src.s_addr !=
 			    htonl(prison_getip(inp->inp_socket->so_cred))) {
 				INP_UNLOCK(inp);
 				m_freem(m);
 				return (EPERM);
 			}
 		}
 		/* don't allow both user specified and setsockopt options,
 		   and don't allow packet length sizes that will crash */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2))
 		     && inp->inp_options)
 		    || (ip->ip_len > m->m_pkthdr.len)
 		    || (ip->ip_len < (ip->ip_hl << 2))) {
 			INP_UNLOCK(inp);
 			m_freem(m);
 			return EINVAL;
 		}
 		/* Assign an ID only when the user left it at zero. */
 		if (ip->ip_id == 0)
 			ip->ip_id = ip_newid();
 		/* XXX prevent ip_output from overwriting header fields */
 		flags |= IP_RAWOUTPUT;
 		ipstat.ips_rawout++;
 	}
 
 	if (inp->inp_flags & INP_ONESBCAST)
 		flags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_create_mbuf_from_inpcb(inp, m);
 #endif
 
 	error = ip_output(m, inp->inp_options, NULL, flags,
 	    inp->inp_moptions, inp);
 	INP_UNLOCK(inp);
 	return error;
 }
 
 /*
  * Raw IP socket option processing.
  *
  * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
  * only be created by a privileged process, and as such, socket option
  * operations to manage system properties on any raw socket were allowed to
  * take place without explicit additional access control checks.  However,
  * raw sockets can now also be created in jail(), and therefore explicit
  * checks are now required.  Likewise, raw sockets can be used by a process
  * after it gives up privilege, so some caution is required.  For options
  * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
  * performed in ip_ctloutput() and therefore no check occurs here.
  * Unilaterally checking suser() here breaks normal IP socket option
  * operations on raw sockets.
  *
  * When adding new socket options here, make sure to add access control
  * checks here as necessary.
  *
  * Dispatch summary: IP_HDRINCL is handled locally; the IP_FW_*,
  * IP_DUMMYNET_*, IP_RSVP_* and MRT_* options require suser() and are
  * forwarded to the corresponding hook or helper; everything else goes
  * to ip_ctloutput().
  *
  * Returns EINVAL for a level other than IPPROTO_IP, the suser() error
  * for unprivileged callers of privileged options, ENOPROTOOPT,
  * EOPNOTSUPP or EINVAL when the relevant hook is not installed,
  * otherwise the result of the handler invoked.
  */
 int
 rip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	if (sopt->sopt_level != IPPROTO_IP)
 		return (EINVAL);
 
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			/* Reports the raw flag bit, not a 0/1 value. */
 			optval = inp->inp_flags & INP_HDRINCL;
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case IP_FW_ADD:	/* ADD actually returns the body... */
 		case IP_FW_GET:
 		case IP_FW_TABLE_GETSIZE:
 		case IP_FW_TABLE_LIST:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_fw_ctl_ptr != NULL)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET_GET:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break ;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			/* The hook is NULL until something installs it. */
 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
 				EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			if (optval)
 				inp->inp_flags |= INP_HDRINCL;
 			else
 				inp->inp_flags &= ~INP_HDRINCL;
 			break;
 
 		case IP_FW_ADD:
 		case IP_FW_DEL:
 		case IP_FW_FLUSH:
 		case IP_FW_ZERO:
 		case IP_FW_RESETLOG:
 		case IP_FW_TABLE_ADD:
 		case IP_FW_TABLE_DEL:
 		case IP_FW_TABLE_FLUSH:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_fw_ctl_ptr != NULL)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET_CONFIGURE:
 		case IP_DUMMYNET_DEL:
 		case IP_DUMMYNET_FLUSH:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT ;
 			break ;
 
 		case IP_RSVP_ON:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_init(so);
 			break;
 
 		case IP_RSVP_OFF:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_done();
 			break;
 
 		case IP_RSVP_VIF_ON:
 		case IP_RSVP_VIF_OFF:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_vif ?
 				ip_rsvp_vif(so, sopt) : EINVAL;
 			break;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
 					EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	/* Any sopt_dir other than GET/SET falls through with error 0. */
 	return (error);
 }
 
 /*
  * This function exists solely to receive the PRC_IFDOWN messages which
  * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
  * and calls in_ifadown() to remove all routes corresponding to that address.
  * It also receives the PRC_IFUP messages from if_up() and reinstalls the
  * interface routes.
  */
 void
 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_ifaddr *ia;
 	struct ifnet *ifp;
 	int rtflags;
 
 	if (cmd == PRC_IFDOWN) {
 		/*
 		 * Act on the first address entry whose ifa_addr is sa and
 		 * which still has its interface route installed.
 		 */
 		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr != sa ||
 			    (ia->ia_flags & IFA_ROUTE) == 0)
 				continue;
 			/* in_ifscrub kills the interface route. */
 			in_ifscrub(ia->ia_ifp, ia);
 			/*
 			 * in_ifadown gets rid of all the rest of the routes.
 			 * This is not quite the right thing to do, but at
 			 * least if we are running a routing process they
 			 * will come back.
 			 */
 			in_ifadown(&ia->ia_ifa, 0);
 			break;
 		}
 		return;
 	}
 
 	/* Any command other than PRC_IFUP is ignored. */
 	if (cmd != PRC_IFUP)
 		return;
 
 	TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 		if (ia->ia_ifa.ifa_addr == sa)
 			break;
 	}
 	/* Nothing to do without a match or if the route already exists. */
 	if (ia == NULL || (ia->ia_flags & IFA_ROUTE) != 0)
 		return;
 
 	/* Loopback and point-to-point interfaces get a host route. */
 	ifp = ia->ia_ifa.ifa_ifp;
 	rtflags = RTF_UP;
 	if ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0)
 		rtflags |= RTF_HOST;
 
 	if (rtinit(&ia->ia_ifa, RTM_ADD, rtflags) == 0)
 		ia->ia_flags |= IFA_ROUTE;
 }
 
 /*
  * Send and receive buffer sizes handed to soreserve() for each new raw
  * socket in rip_attach().  Both are exported read/write under the
  * _net_inet_raw sysctl node.
  *
  * NOTE(review): the variables are u_long but are exported with SYSCTL_INT;
  * where long is wider than int the sysctl covers only part of the object.
  * Confirm whether SYSCTL_ULONG was intended before changing it, since that
  * alters the size userland sees.
  */
 u_long	rip_sendspace = RIPSNDQ;
 u_long	rip_recvspace = RIPRCVQ;
 
 SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
 SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
 
 static int
 rip_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	/* XXX why not lower? */
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp) {
 		/* XXX counter, printf */
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	if (jailed(td->td_ucred) && !jail_allow_raw_sockets) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return (EPERM);
 	}
 	if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL)) != 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	if (proto >= IPPROTO_MAX || proto < 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EPROTONOSUPPORT;
 	}
 
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	error = in_pcballoc(so, &ripcbinfo, "rawinp");
 	if (error) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	INP_LOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = ip_defttl;
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static void
 rip_pcbdetach(struct socket *so, struct inpcb *inp)
 {
 	INP_INFO_WLOCK_ASSERT(&ripcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	if (so == ip_mrouter && ip_mrouter_done)
 		ip_mrouter_done();
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
 	if (so == ip_rsvpd)
 		ip_rsvp_done();
 	in_pcbdetach(inp);
 }
 
 static int
 rip_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		/* XXX counter, printf */
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	rip_pcbdetach(so, inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_abort(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;	/* ??? possible? panic instead? */
 	}
 	INP_LOCK(inp);
 	soisdisconnected(so);
 	if (so->so_state & SS_NOFDREF)
 		rip_pcbdetach(so, inp);
 	else
 		INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 /*
  * Disconnect is an abort for a connected socket, and ENOTCONN otherwise.
  */
 static int
 rip_disconnect(struct socket *so)
 {
 	return ((so->so_state & SS_ISCONNECTED) != 0 ?
 	    rip_abort(so) : ENOTCONN);
 }
 
 static int
 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return EINVAL;
 
 	if (jailed(td->td_ucred)) {
 		if (addr->sin_addr.s_addr == INADDR_ANY)
 			addr->sin_addr.s_addr =
 			    htonl(prison_getip(td->td_ucred));
 		if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr)
 			return (EADDRNOTAVAIL);
 	}
 
 	if (TAILQ_EMPTY(&ifnet) ||
 	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
 	    (addr->sin_addr.s_addr &&
 	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
 		return EADDRNOTAVAIL;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	inp->inp_laddr = addr->sin_addr;
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return EINVAL;
 	if (TAILQ_EMPTY(&ifnet))
 		return EADDRNOTAVAIL;
 	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
 		return EAFNOSUPPORT;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	inp->inp_faddr = addr->sin_addr;
 	soisconnected(so);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_RUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	INP_INFO_RUNLOCK(&ripcbinfo);
 	socantsendmore(so);
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static int
 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	u_long dst;
 	int ret;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			INP_INFO_WUNLOCK(&ripcbinfo);
 			m_freem(m);
 			return EISCONN;
 		}
 		dst = inp->inp_faddr.s_addr;
 	} else {
 		if (nam == NULL) {
 			INP_INFO_WUNLOCK(&ripcbinfo);
 			m_freem(m);
 			return ENOTCONN;
 		}
 		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
 	}
 	ret = rip_output(m, so, dst);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return ret;
 }
 
 /*
  * Sysctl handler that exports the list of raw IP pcbs.
  *
  * Output layout: a leading struct xinpgen, one struct xinpcb per visible
  * pcb, and a trailing struct xinpgen with the generation and count as of
  * the end of the request.  A size-only probe (req->oldptr == 0) just
  * reports an estimate with some slack; any attempt to write returns EPERM.
  *
  * Returns 0 on success or the first error reported by SYSCTL_OUT().
  */
 static int
 rip_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = ripcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&ripcbinfo);
 	gencnt = ripcbinfo.ipi_gencnt;
 	n = ripcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&ripcbinfo);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 	
 	/*
 	 * Snapshot pointers to at most n pcbs that are no newer than the
 	 * generation sampled above and that the caller is allowed to see.
 	 */
 	INP_INFO_RLOCK(&ripcbinfo);
 	for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) {
 			/* XXX held references? */
 			inp_list[i++] = inp;
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&ripcbinfo);
 	n = i;
 
 	/*
 	 * Copy the entries out.  Stop at the first failure so that a later
 	 * successful SYSCTL_OUT() cannot overwrite the error and make a
 	 * truncated listing look complete.
 	 *
 	 * NOTE(review): the pcbs are read here with no lock held (see the
 	 * XXX above) -- confirm their lifetime is guaranteed some other way.
 	 */
 	error = 0;
 	for (i = 0; error == 0 && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&ripcbinfo);
 		xig.xig_gen = ripcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = ripcbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&ripcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /*
  * This is the wrapper function for in_setsockaddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 rip_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setsockaddr(so, nam, &ripcbinfo));
 }
 
 /*
  * This is the wrapper function for in_setpeeraddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 rip_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setpeeraddr(so, nam, &ripcbinfo));
 }
 
 
 /* Read-only pcblist node under _net_inet_raw, served by rip_pcblist(). */
 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
 	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
 
 /*
  * User-request dispatch table for raw IP sockets.  Entries not named here
  * are left to the designated-initializer default (zero).
  */
 struct pr_usrreqs rip_usrreqs = {
 	.pru_abort =		rip_abort,
 	.pru_attach =		rip_attach,
 	.pru_bind =		rip_bind,
 	.pru_connect =		rip_connect,
 	.pru_control =		in_control,
 	.pru_detach =		rip_detach,
 	.pru_disconnect =	rip_disconnect,
 	.pru_peeraddr =		rip_peeraddr,
 	.pru_send =		rip_send,
 	.pru_shutdown =		rip_shutdown,
 	.pru_sockaddr =		rip_sockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel
 };
Index: head/sys/netinet/udp_usrreq.c
===================================================================
--- head/sys/netinet/udp_usrreq.c	(revision 145952)
+++ head/sys/netinet/udp_usrreq.c	(revision 145953)
@@ -1,1131 +1,1132 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  * $FreeBSD$
  */
 
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 /* Checksum knob: defaults to 1 unless the kernel defines COMPAT_42. */
 #ifndef	COMPAT_42
 static int	udpcksum = 1;
 #else
 static int	udpcksum = 0;		/* XXX */
 #endif
 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
 		&udpcksum, 0, "");
 
 /* When nonzero, udp_input() logs datagrams for which no pcb is found. */
 int	log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &log_in_vain, 0, "Log all incoming UDP packets");
 
 /* When nonzero, udp_input() drops such datagrams without an ICMP reply. */
 static int	blackhole = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
 	&blackhole, 0, "Do not send port unreachables for refused connects");
 
 /*
  * When nonzero, udp_input() delivers multicast/broadcast datagrams only to
  * sockets whose multicast options list a matching group and interface
  * (sockets without multicast options are not filtered).
  */
 static int	strict_mcast_mship = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, strict_mcast_mship, CTLFLAG_RW,
 	&strict_mcast_mship, 0, "Only send multicast to member sockets");
 
 /* Global list of UDP pcbs and the pcbinfo that indexes and locks it. */
 struct	inpcbhead udb;		/* from udp_var.h */
 #define	udb6	udb  /* for KAME src sync over BSD*'s */
 struct	inpcbinfo udbinfo;
 
 /* Bucket count passed to hashinit() for both hash tables in udp_init(). */
 #ifndef UDBHASHSIZE
 #define UDBHASHSIZE 16
 #endif
 
 struct	udpstat udpstat;	/* from udp_var.h */
 SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
     &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 /* Forward declarations for file-local helpers. */
 static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
 		int off, struct sockaddr_in *udp_in);
 
 static int udp_detach(struct socket *so);
 static	int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		struct mbuf *, struct thread *);
 
 void
 udp_init()
 {
 	INP_INFO_LOCK_INIT(&udbinfo, "udp");
 	LIST_INIT(&udb);
 	udbinfo.listhead = &udb;
 	udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask);
 	udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB,
 					&udbinfo.porthashmask);
 	udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(udbinfo.ipi_zone, maxsockets);
 }
 
 /*
  * Input routine for UDP over IPv4.
  *
  * m is the received packet and off the length of its IP header.  The
  * datagram is validated (destination port, length, checksum) and then
  * either appended to every matching socket (multicast/broadcast) or to
  * the single pcb returned by the hash lookup (unicast).  The mbuf chain
  * is always consumed: handed to udp_append() or icmp_error(), or freed.
  */
 void
 udp_input(m, off)
 	register struct mbuf *m;
 	int off;
 {
 	int iphlen = off;
 	register struct ip *ip;
 	register struct udphdr *uh;
 	register struct inpcb *inp;
 	struct mbuf *opts = 0;
 	int len;
 	struct ip save_ip;
 	struct sockaddr_in udp_in;
 
 	udpstat.udps_ipackets++;
 
 	/*
 	 * Strip IP options, if any; should skip this,
 	 * make available to user, and use on returned packets,
 	 * but we don't yet have a way to check the checksum
 	 * with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m, (struct mbuf *)0);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
 			udpstat.udps_hdrops++;
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 
 	/* destination port of 0 is illegal, based on RFC768. */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.
 	 * Stuff source address and datagram in user buffer.
 	 */
 	bzero(&udp_in, sizeof(udp_in));
 	udp_in.sin_len = sizeof(udp_in);
 	udp_in.sin_family = AF_INET;
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 
 	/*
 	 * Make mbuf data length reflect UDP length.
 	 * If not enough data to reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	if (ip->ip_len != len) {
 		if (len > ip->ip_len || len < sizeof(struct udphdr)) {
 			udpstat.udps_badlen++;
 			goto badunlocked;
 		}
 		/* Negative argument: excess bytes are trimmed from the tail. */
 		m_adj(m, len - ip->ip_len);
 		/* ip->ip_len = len; */
 	}
 	/*
 	 * Save a copy of the IP header in case we want restore it
 	 * for sending an ICMP error message in response.
 	 *
 	 * NOTE(review): save_ip is only filled when blackhole is zero and
 	 * only read below after a second test of blackhole; this relies on
 	 * the sysctl not changing in between -- confirm that is acceptable.
 	 */
 	if (!blackhole)
 		save_ip = *ip;
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			/* A checksum value is already available in the mbuf. */
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh->uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh->uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
 			uh->uh_sum ^= 0xffff;
 		} else {
 			/*
 			 * Software path: temporarily overlay the IP header
 			 * with a pseudo-header (first 9 bytes zeroed, length
 			 * set), checksum, then restore the saved bytes.
 			 */
 			char b[9];
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
 			uh->uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		/* Either path leaves zero in uh_sum for a good checksum. */
 		if (uh->uh_sum) {
 			udpstat.udps_badsum++;
 			m_freem(m);
 			return;
 		}
 	} else
 		udpstat.udps_nosum++;
 
 	/* Held across the pcb search and delivery below. */
 	INP_INFO_RLOCK(&udbinfo);
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 		struct inpcb *last;
 		/*
 		 * Deliver a multicast or broadcast datagram to *all* sockets
 		 * for which the local and remote addresses and ports match
 		 * those of the incoming datagram.  This allows more than
 		 * one process to receive multi/broadcasts on the same port.
 		 * (This really ought to be done for unicast datagrams as
 		 * well, but that would cause problems with existing
 		 * applications that open both address-specific sockets and
 		 * a wildcard socket listening to the same port -- they would
 		 * end up receiving duplicates of every unicast datagram.
 		 * Those applications open the multiple sockets to overcome an
 		 * inadequacy of the UDP socket interface, but for backwards
 		 * compatibility we avoid the problem here rather than
 		 * fixing the interface.  Maybe 4.5BSD will remedy this?)
 		 */
 
 		/*
 		 * Locate pcb(s) for datagram.
 		 * (Algorithm copied from raw_intr().)
 		 *
 		 * "last" trails one match behind: each earlier match gets a
 		 * copy of the datagram, and the final match receives the
 		 * original mbuf after the loop.
 		 */
 		last = NULL;
 		LIST_FOREACH(inp, &udb, inp_list) {
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
 				if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 					continue;
 			}
 			if (inp->inp_faddr.s_addr != INADDR_ANY) {
 				if (inp->inp_faddr.s_addr !=
 				    ip->ip_src.s_addr ||
 				    inp->inp_fport != uh->uh_sport)
 					continue;
 			}
 			INP_LOCK(inp);
 
 			/*
 			 * Check multicast packets to make sure they are only
 			 * sent to sockets with multicast memberships for the
 			 * packet's destination address and arrival interface
 			 */
 #define MSHIP(_inp, n) ((_inp)->inp_moptions->imo_membership[(n)])
 #define NMSHIPS(_inp) ((_inp)->inp_moptions->imo_num_memberships)
 			if (strict_mcast_mship && inp->inp_moptions != NULL) {
 				int mship, foundmship = 0;
 
 				for (mship = 0; mship < NMSHIPS(inp); mship++) {
 					if (MSHIP(inp, mship)->inm_addr.s_addr
 					    == ip->ip_dst.s_addr &&
 					    MSHIP(inp, mship)->inm_ifp
 					    == m->m_pkthdr.rcvif) {
 						foundmship = 1;
 						break;
 					}
 				}
 				if (foundmship == 0) {
 					INP_UNLOCK(inp);
 					continue;
 				}
 			}
 #undef NMSHIPS
 #undef MSHIP
 			if (last != NULL) {
 				struct mbuf *n;
 
 				/* A failed copy silently skips that socket. */
 				n = m_copy(m, 0, M_COPYALL);
 				if (n != NULL)
 					udp_append(last, ip, n,
 						   iphlen +
 						   sizeof(struct udphdr),
 						   &udp_in);
 				INP_UNLOCK(last);
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids searching
 			 * through all pcbs in the common case of a non-shared
 			 * port.  It assumes that an application will never
 			 * clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.
 			 * (No need to send an ICMP Port Unreachable
 			 * for a broadcast or multicast datagram.)
 			 */
 			udpstat.udps_noportbcast++;
 			goto badheadlocked;
 		}
 		udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
 		    &udp_in);
 		INP_UNLOCK(last);
 		INP_INFO_RUNLOCK(&udbinfo);
 		return;
 	}
 	/*
 	 * Locate pcb for datagram.
 	 */
 	inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
 	    ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);
 	if (inp == NULL) {
 		if (log_in_vain) {
 			char buf[4*sizeof "123"];
 
 			/*
 			 * The destination is copied out first; presumably
 			 * inet_ntoa() reuses one static buffer -- confirm.
 			 */
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		udpstat.udps_noport++;
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			udpstat.udps_noportbcast++;
 			goto badheadlocked;
 		}
 		if (blackhole)
 			goto badheadlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badheadlocked;
 		/* Restore the saved IP header before building the ICMP error. */
 		*ip = save_ip;
 		ip->ip_len += iphlen;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		INP_INFO_RUNLOCK(&udbinfo);
 		return;
 	}
 	INP_LOCK(inp);
 	udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
 	INP_UNLOCK(inp);
 	INP_INFO_RUNLOCK(&udbinfo);
 	return;
 
 	/*
 	 * NOTE(review): every goto to badheadlocked above is taken with
 	 * inp == NULL (failed lookup, or a list walk that ran to its end
 	 * without a match -- assuming LIST_FOREACH leaves inp NULL then),
 	 * so the unlock below looks purely defensive.  Likewise opts is
 	 * never assigned in this function, so its free is never taken.
 	 */
 badheadlocked:
 	if (inp)
 		INP_UNLOCK(inp);
 	INP_INFO_RUNLOCK(&udbinfo);
 badunlocked:
 	m_freem(m);
 	if (opts)
 		m_freem(opts);
 	return;
 }
 
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  */
 static void
 udp_append(last, ip, n, off, udp_in)
 	struct inpcb *last;
 	struct ip *ip;
 	struct mbuf *n;
 	int off;
 	struct sockaddr_in *udp_in;
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *opts = 0;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 
 	/* The caller must hold the pcb lock. */
 	INP_LOCK_ASSERT(last);
 
 	/* Policy checks: a rejected datagram is freed and dropped here. */
 #if defined(IPSEC) || defined(FAST_IPSEC)
 	/* check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, last)) {
 #ifdef IPSEC
 		ipsecstat.in_polvio++;
 #endif /*IPSEC*/
 		m_freem(n);
 		return;
 	}
 #endif /*IPSEC || FAST_IPSEC*/
 #ifdef MAC
 	if (mac_check_inpcb_deliver(last, n) != 0) {
 		m_freem(n);
 		return;
 	}
 #endif
 	/* Collect control data (into opts) if the socket asked for any. */
 	if (last->inp_flags & INP_CONTROLOPTS ||
 	    last->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (last->inp_vflag & INP_IPV6) {
 			int savedflags;
 
 			/*
 			 * INP_UNMAPPABLEOPTS is masked off only for the
 			 * duration of the ip6_savecontrol() call.
 			 */
 			savedflags = last->inp_flags;
 			last->inp_flags &= ~INP_UNMAPPABLEOPTS;
 			ip6_savecontrol(last, n, &opts);
 			last->inp_flags = savedflags;
 		} else
 #endif
 		/* Body of the dangling "else" above when INET6 is defined. */
 		ip_savecontrol(last, &opts, ip, n);
 	}
 #ifdef INET6
 	if (last->inp_vflag & INP_IPV6) {
 		/* Hand an IPv6 socket a v4-mapped source address. */
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif
 	/* Body of the dangling "else" above when INET6 is defined. */
 	append_sa = (struct sockaddr *)udp_in;
 	/* Strip off bytes (IP and UDP headers at the call sites here). */
 	m_adj(n, off);
 
 	so = last->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		/* Append failed: drop data and control, count it. */
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		udpstat.udps_fullsock++;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 	} else
 		/*
 		 * No explicit unlock on this path; presumably
 		 * sorwakeup_locked() releases the sockbuf lock -- confirm.
 		 */
 		sorwakeup_locked(so);
 }
 
 /*
  * Notify a udp user of an asynchronous error;
  * just wake up so that he can collect error status.
  */
 struct inpcb *
 udp_notify(inp, errno)
 	register struct inpcb *inp;
 	int errno;
 {
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return inp;
 }
 
 void
 udp_ctlinput(cmd, sa, vip)
 	int cmd;
 	struct sockaddr *sa;
 	void *vip;
 {
 	struct ip *ip = vip;
 	struct udphdr *uh;
 	struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	int s;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	if (PRC_IS_REDIRECT(cmd))
 		return;
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = 0;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip) {
 		s = splnet();
 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		INP_INFO_RLOCK(&udbinfo);
 		inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport,
 		    ip->ip_src, uh->uh_sport, 0, NULL);
 		if (inp != NULL) {
 			INP_LOCK(inp);
 			if (inp->inp_socket != NULL) {
 				(*notify)(inp, inetctlerrmap[cmd]);
 			}
 			INP_UNLOCK(inp);
 		}
 		INP_INFO_RUNLOCK(&udbinfo);
 		splx(s);
 	} else
 		in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 
 /*
  * Sysctl handler that exports the list of UDP pcbs.
  *
  * Output layout: a leading struct xinpgen, one struct xinpcb per visible
  * pcb, and a trailing struct xinpgen with the generation and count as of
  * the end of the request.  A size-only probe (req->oldptr == 0) just
  * reports an estimate with some slack; any attempt to write returns EPERM.
  *
  * Returns 0 on success, the sysctl_wire_old_buffer() error, or the first
  * error reported by SYSCTL_OUT().
  */
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n, s;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = udbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	gencnt = udbinfo.ipi_gencnt;
 	n = udbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 
 	/*
 	 * Snapshot pointers to at most n pcbs that are no newer than the
 	 * generation sampled above and that the caller is allowed to see.
 	 */
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
 			inp_list[i++] = inp;
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 	n = i;
 
 	/*
 	 * Copy the entries out.  Stop at the first failure so that a later
 	 * successful SYSCTL_OUT() cannot overwrite the error and make a
 	 * truncated listing look complete.
 	 *
 	 * NOTE(review): the pcbs are read here with no lock held -- confirm
 	 * their lifetime is guaranteed some other way.
 	 */
 	error = 0;
 	for (i = 0; error == 0 && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		s = splnet();
 		INP_INFO_RLOCK(&udbinfo);
 		xig.xig_gen = udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = udbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&udbinfo);
 		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
 	    udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 /*
  * Sysctl handler that returns the credentials (struct xucred) of the
  * socket owning a UDP connection.  The caller writes two sockaddr_in
  * structures; addrs[1] is passed to the pcb lookup as the foreign
  * address/port and addrs[0] as the local address/port.
  *
  * Requires privilege (jailed superuser allowed).  Returns ENOENT when no
  * pcb with a socket matches, or the cr_canseesocket() error when the
  * caller may not see that socket.
  */
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error, s;
 
 	error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL);
 	if (error != 0)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error != 0)
 		return (error);
 
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
 	if (inp != NULL && inp->inp_socket != NULL) {
 		error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
 		if (error == 0)
 			cru2x(inp->inp_socket->so_cred, &xuc);
 	} else {
 		error = ENOENT;
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 
 	/* xuc is only valid, and only copied out, on success. */
 	if (error != 0)
 		return (error);
 	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
 
 /*
  * Build and transmit one UDP datagram on behalf of an inpcb.
  *
  * inp     - protocol control block of the sending socket.
  * m       - mbuf chain holding the payload.  It is freed here on every
  *           error path and passed to ip_output() on success.
  * addr    - optional destination.  When non-NULL the pcb must not
  *           already have a foreign address (EISCONN otherwise).
  * control - optional control mbuf, always freed here.  Messages whose
  *           level is not IPPROTO_IP are skipped; at IPPROTO_IP only
  *           IP_SENDSRCADDR is accepted (ENOPROTOOPT otherwise).
  * td      - calling thread; its credential is used for the bind and
  *           connect setup and for the jail checks.
  *
  * Returns 0 or an errno value: EMSGSIZE, EINVAL, ENOPROTOOPT, EISCONN,
  * ENOTCONN, EAGAIN, ENOBUFS, or whatever in_pcbbind_setup(),
  * in_pcbconnect_setup() or ip_output() return.
  */
 static int
 udp_output(inp, m, addr, control, td)
 	register struct inpcb *inp;
 	struct mbuf *m;
 	struct sockaddr *addr;
 	struct mbuf *control;
 	struct thread *td;
 {
 	register struct udpiphdr *ui;
 	register int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct sockaddr_in *sin, src;
 	int error = 0;
 	int ipflags;
 	u_short fport, lport;
 	int unlock_udbinfo;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front what inpcb locks we will
 	 * need.  Do any work to decide what is needed up front before
 	 * acquiring locks.
 	 */
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return EMSGSIZE;
 	}
 
 	/* INADDR_ANY in src means "no IP_SENDSRCADDR was supplied". */
 	src.sin_addr.s_addr = INADDR_ANY;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information
 		 * is stored in a single mbuf.
 		 */
 		if (control->m_next) {
 			m_freem(control);
 			m_freem(m);
 			return EINVAL;
 		}
 		/*
 		 * Walk the control messages in place: each iteration
 		 * advances m_data/m_len past the message just examined.
 		 */
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 ||
 			    cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr = *(struct in_addr *)CMSG_DATA(cm);
 				break;
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			/* The breaks above only leave the switch. */
 			if (error)
 				break;
 		}
 		m_freem(control);
 	}
 	if (error) {
 		m_freem(m);
 		return error;
 	}
 
 	/*
 	 * The pcbinfo write lock is taken only when a source address or
 	 * a destination was supplied, i.e. when the bind/connect setup
 	 * routines below will be called.
 	 */
 	if (src.sin_addr.s_addr != INADDR_ANY ||
 	    addr != NULL) {
 		INP_INFO_WLOCK(&udbinfo);
 		unlock_udbinfo = 1;
 	} else
 		unlock_udbinfo = 0;
 	INP_LOCK(inp);
 
 #ifdef MAC
 	mac_create_mbuf_from_inpcb(inp, m);
 #endif
 
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_addr.s_addr != INADDR_ANY) {
 		/* A per-datagram source requires an already bound port. */
 		if (lport == 0) {
 			error = EINVAL;
 			goto release;
 		}
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		if (error)
 			goto release;
 	}
 
 	if (addr) {
 		sin = (struct sockaddr_in *)addr;
 		if (jailed(td->td_ucred))
 			prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 		error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport,
 		    &faddr.s_addr, &fport, NULL, td->td_ucred);
 		if (error)
 			goto release;
 
 		/* Commit the local port if newly assigned. */
 		if (inp->inp_laddr.s_addr == INADDR_ANY &&
 		    inp->inp_lport == 0) {
 			/*
 			 * Remember addr if jailed, to prevent rebinding.
 			 */
 			if (jailed(td->td_ucred))
 				inp->inp_laddr = laddr;
 			inp->inp_lport = lport;
 			if (in_pcbinshash(inp) != 0) {
 				inp->inp_lport = 0;
 				error = EAGAIN;
 				goto release;
 			}
 			inp->inp_flags |= INP_ANONPORT;
 		}
 	} else {
 		/* No explicit destination: use the connected peer. */
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediately slide the data pointer forward
 	 * again, since we won't use the link-layer space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header
 	 * and addresses and length put into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_pr = IPPROTO_UDP;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 
 	/* Translate socket options and pcb flags into ip_output() flags. */
 	ipflags = 0;
 	if (inp->inp_socket->so_options & SO_DONTROUTE)
 		ipflags |= IP_ROUTETOIF;
 	if (inp->inp_socket->so_options & SO_BROADCAST)
 		ipflags |= IP_ALLOWBROADCAST;
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	if (udpcksum) {
 		/*
 		 * With INP_ONESBCAST the pseudo-header sum is computed
 		 * against INADDR_BROADCAST rather than ui_dst.
 		 */
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	} else {
 		ui->ui_sum = 0;
 	}
 	((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = inp->inp_ip_tos;	/* XXX */
 	udpstat.udps_opackets++;
 
 	/*
 	 * The pcbinfo lock is dropped before ip_output(); the inpcb lock
 	 * stays held across the call.
 	 */
 	if (unlock_udbinfo)
 		INP_INFO_WUNLOCK(&udbinfo);
 	error = ip_output(m, inp->inp_options, NULL, ipflags,
 	    inp->inp_moptions, inp);
 	INP_UNLOCK(inp);
 	return (error);
 
 	/* Error exit: drop whichever locks were taken and free the packet. */
 release:
 	INP_UNLOCK(inp);
 	if (unlock_udbinfo)
 		INP_INFO_WUNLOCK(&udbinfo);
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Send-side reservation passed to soreserve() by udp_attach().
  * NOTE(review): declared u_long but exported with SYSCTL_INT -- confirm
  * this is intended where long is wider than int.
  */
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 /*
  * Receive-side reservation passed to soreserve() by udp_attach():
  * 40 1K datagrams, each with room for one socket address.
  * NOTE(review): same u_long / SYSCTL_INT mismatch as udp_sendspace.
  */
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );
 SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 static int
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;	/* ??? possible? panic instead? */
 	}
 	INP_LOCK(inp);
 	soisdisconnected(so);
 	s = splnet();
 	in_pcbdetach(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	splx(s);
 	return 0;
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp != 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return error;
 	}
 	s = splnet();
 	error = in_pcballoc(so, &udbinfo, "udpinp");
 	splx(s);
 	if (error) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return error;
 	}
 
 	inp = (struct inpcb *)so->so_pcb;
 	INP_LOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_ttl = ip_defttl;
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	s = splnet();
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	splx(s);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	return error;
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 	struct sockaddr_in *sin;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_UNLOCK(inp);
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EISCONN;
 	}
 	s = splnet();
 	sin = (struct sockaddr_in *)nam;
 	if (jailed(td->td_ucred))
 		prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
 	error = in_pcbconnect(inp, nam, td->td_ucred);
 	splx(s);
 	if (error == 0)
 		soisconnected(so);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	return error;
 }
 
 static int
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	s = splnet();
 	in_pcbdetach(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	splx(s);
 	return 0;
 }
 
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		INP_UNLOCK(inp);
 		return ENOTCONN;
 	}
 
 	s = splnet();
 	in_pcbdisconnect(inp);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	splx(s);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	return 0;
 }
 
 /*
  * pru_send handler: hand the request to udp_output() together with the
  * socket's pcb.  The flags argument is not used.
  */
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 	    struct mbuf *control, struct thread *td)
 {
 
 	return (udp_output(sotoinpcb(so), m, addr, control, td));
 }
 
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_RUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	INP_INFO_RUNLOCK(&udbinfo);
 	socantsendmore(so);
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 /*
  * Wrapper around in_setsockaddr().  The UDP pcbinfo is passed down so
  * that in_setsockaddr() does the locking itself; it is not done here
  * because in_setsockaddr() will call malloc and might block.
  */
 static int
 udp_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	int rc;
 
 	rc = in_setsockaddr(so, nam, &udbinfo);
 	return (rc);
 }
 
 /*
  * Wrapper around in_setpeeraddr().  The UDP pcbinfo is passed down so
  * that in_setpeeraddr() does the locking itself.
  */
 static int
 udp_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	int rc;
 
 	rc = in_setpeeraddr(so, nam, &udbinfo);
 	return (rc);
 }
 
 /*
  * User-request dispatch table for UDP sockets: each pru_* slot named
  * here points at its handler.  Slots not named are zero-initialized by
  * the designated-initializer rules.
  */
 struct pr_usrreqs udp_usrreqs = {
 	.pru_abort =		udp_abort,
 	.pru_attach =		udp_attach,
 	.pru_bind =		udp_bind,
 	.pru_connect =		udp_connect,
 	.pru_control =		in_control,
 	.pru_detach =		udp_detach,
 	.pru_disconnect =	udp_disconnect,
 	.pru_peeraddr =		udp_peeraddr,
 	.pru_send =		udp_send,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		udp_sockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel
 };
Index: stable/4/sys/kern/vfs_subr.c
===================================================================
--- stable/4/sys/kern/vfs_subr.c	(revision 145952)
+++ stable/4/sys/kern/vfs_subr.c	(revision 145953)
@@ -1,3135 +1,3136 @@
 /*
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  * $FreeBSD$
  */
 
 /*
  * External virtual filesystem routines
  */
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <machine/limits.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_zone.h>
 
 /* Malloc type for export host address structures. */
 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
 
 static void	insmntque __P((struct vnode *vp, struct mount *mp));
 static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
 /*
  * Number of vnodes allocated so far; incremented by getnewvnode() each
  * time it takes a fresh vnode from the zone.
  * NOTE(review): declared unsigned long but exported with SYSCTL_INT --
  * confirm this is intended where long is wider than int.
  */
 static unsigned long	numvnodes;
 static void	vlruvp(struct vnode *vp);
 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables between vnode types and S_IF* file-type values.
  * NOTE(review): presumably indexed via the IFTOVT()/VTTOIF() macros --
  * verify against their definitions.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[9] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT,
 };
 
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
 
 /*
  * getnewvnode() only tries to reuse a free-list vnode once freevnodes
  * has reached wantfreevnodes (and numvnodes has reached minvnodes).
  * NOTE(review): both counters are u_long but exported with SYSCTL_INT.
  */
 static u_long wantfreevnodes = 25;
 SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 static u_long freevnodes = 0;
 SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /* reassignbuf counters and knobs, exported under the vfs sysctl tree. */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 static int reassignbufloops;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
 static int reassignbufsortgood;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
 static int reassignbufsortbad;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
 static int reassignbufmethod = 1;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
 /*
  * Debugging knob consulted by getnewvnode() when a free vnode still has
  * name-cache entries hanging off it (see the comments there).
  */
 static int nameileafonly = 0;
 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
 #ifdef ENABLE_VFS_IOOPT
 int vfs_ioopt = 0;
 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
 #endif
 
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
 struct simplelock mountlist_slock;	/* held while walking mountlist */
 struct simplelock mntvnode_slock;	/* held while editing per-mount vnode lists */
 int	nfs_mount_type = -1;
 #ifndef NULL_SIMPLELOCKS
 static struct simplelock mntid_slock;	/* taken by vfs_getnewfsid() */
 static struct simplelock vnode_free_list_slock;	/* taken around free-list scans */
 static struct simplelock spechash_slock;
 #endif
 struct nfs_public nfs_pub;	/* publicly exported FS */
 static vm_zone_t vnode_zone;	/* zone created in vntblinit(), used by getnewvnode() */
 
 /*
  * The workitem queue.
  *
  * NOTE(review): the *delay variables below are time_t but exported with
  * SYSCTL_INT -- confirm this is intended where time_t is wider than int.
  */
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 time_t syncdelay = 30;		/* max time to delay syncing data */
 time_t filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 time_t dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 time_t metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;			/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 static int syncer_delayno = 0;
 static long syncer_mask; 		/* filled in by hashinit() in vntblinit() */
 LIST_HEAD(synclist, vnode);
 /* Table allocated by hashinit() in vntblinit(). */
 static struct synclist *syncer_workitem_pending;
 
 /*
  * Vnode count targets, both computed in vntblinit().  getnewvnode()
  * sleeps while the number of in-use vnodes exceeds desiredvnodes and
  * does not reuse free-list vnodes until numvnodes reaches minvnodes.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 
     &desiredvnodes, 0, "Maximum number of vnodes");
 static int minvnodes;
 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 
     &minvnodes, 0, "Minimum number of vnodes");
 /* Bumped by vnlru_proc() each time a full pass reclaims nothing. */
 static int vnlru_nowhere = 0;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
     "Number of times the vnlru process ran without success");
 
 static void	vfs_free_addrlist __P((struct netexport *nep));
 static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
 static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
 				       struct export_args *argp));
 
 /*
  * Initialize the vnode management data structures: the vnode count
  * targets, the simple locks, the free list, the vnode zone and the
  * syncer work-item hash table.
  */
 void
 vntblinit()
 {
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and
 	 * the kernel's heap size.  Specifically, desiredvnodes scales
 	 * in proportion to the physical memory size until two fifths
 	 * of the kernel's heap size is consumed by vnodes and vm
 	 * objects.
 	 */
 	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	minvnodes = desiredvnodes / 4;
 	simple_lock_init(&mntvnode_slock);
 	simple_lock_init(&mntid_slock);
 	simple_lock_init(&spechash_slock);
 	TAILQ_INIT(&vnode_free_list);
 	simple_lock_init(&vnode_free_list_slock);
 	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
 	/*
 	 * Initialize the filesystem syncer.  hashinit() fills in
 	 * syncer_mask; syncer_maxdelay is then reset to mask + 1.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 
 		&syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 }
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Interlock is not released on failure.
  */
 int
 vfs_busy(mp, flags, interlkp, p)
 	struct mount *mp;
 	int flags;
 	struct simplelock *interlkp;
 	struct proc *p;
 {
 	int lkflags;
 
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT)
 			return (ENOENT);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		if (interlkp) {
 			simple_unlock(interlkp);
 		}
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
 		if (interlkp) {
 			simple_lock(interlkp);
 		}
 		return (ENOENT);
 	}
 	lkflags = LK_SHARED | LK_NOPAUSE;
 	if (interlkp)
 		lkflags |= LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
 		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
  * Release the lock on a mount point that was taken by vfs_busy().
  */
 void
 vfs_unbusy(struct mount *mp, struct proc *p)
 {
 
 	(void)lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
 }
 
 /*
  * Lookup a filesystem type, and if found allocate and initialize
  * a mount structure for it.
  *
  * Devname is usually updated by mount(8) after booting.
  */
 int
 vfs_rootmountalloc(fstypename, devname, mpp)
 	char *fstypename;
 	char *devname;
 	struct mount **mpp;
 {
 	struct proc *p = curproc;	/* XXX */
 	struct vfsconf *vfsp;
 	struct mount *mp;
 
 	if (fstypename == NULL)
 		return (ENODEV);
 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 		if (!strcmp(vfsp->vfc_name, fstypename))
 			break;
 	if (vfsp == NULL)
 		return (ENODEV);
 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
 	bzero((char *)mp, (u_long)sizeof(struct mount));
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
 	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	TAILQ_INIT(&mp->mnt_reservedvnlist);
 	mp->mnt_nvnodelistsize = 0;
 	mp->mnt_vfc = vfsp;
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_flag = MNT_RDONLY;
 	mp->mnt_vnodecovered = NULLVP;
 	vfsp->vfc_refcount++;
 	mp->mnt_iosize_max = DFLTPHYS;
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_stat.f_mntonname[0] = '/';
 	mp->mnt_stat.f_mntonname[1] = 0;
 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
 	*mpp = mp;
 	return (0);
 }
 
 /*
  * Find an appropriate filesystem to use for the root.  If a mountroot
  * routine has been preselected, use it; otherwise walk the list of
  * known filesystems and try each one that has a mountroot routine
  * until one works or all have been tried.
  *
  * Compiled out (notdef).
  */
 #ifdef notdef	/* XXX JH */
 int
 lite2_vfs_mountroot()
 {
 	extern int (*lite2_mountroot) __P((void));
 	struct vfsconf *conf;
 	int rc;
 
 	if (lite2_mountroot != NULL)
 		return ((*lite2_mountroot)());
 	for (conf = vfsconf; conf != NULL; conf = conf->vfc_next) {
 		if (conf->vfc_mountroot != NULL) {
 			rc = (*conf->vfc_mountroot)();
 			if (rc == 0)
 				return (0);
 			printf("%s_mountroot failed: %d\n", conf->vfc_name, rc);
 		}
 	}
 	return (ENODEV);
 }
 #endif
 
 /*
  * Look up a mount point by filesystem identifier.  Returns the mount
  * whose f_fsid matches both words of *fsid, or NULL if there is none.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *found;
 
 	simple_lock(&mountlist_slock);
 	/* TAILQ_FOREACH leaves "found" NULL when the list is exhausted. */
 	TAILQ_FOREACH(found, &mountlist, mnt_list) {
 		if (found->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 		    found->mnt_stat.f_fsid.val[1] == fsid->val[1])
 			break;
 	}
 	simple_unlock(&mountlist_slock);
 	return (found);
 }
 
 /*
  * Assign a new unique fsid to a mount.  Try to make its val[0] unique,
  * since this value will be used to create fake device numbers for
  * stat().  Also try (but not so hard) to make its val[0] unique mod
  * 2^16, since some emulators only support 16-bit device numbers.  We
  * end up with unique val[0]'s for the first 2^16 calls and unique
  * val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.
  * Starting the search one past where the previous search terminated is
  * both a micro-optimization and a defense against returning the same
  * fsid to different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
 	fsid_t candidate;
 	int typenum, typebits;
 
 	simple_lock(&mntid_slock);
 	typenum = mp->mnt_vfc->vfc_typenum;
 	candidate.val[1] = typenum;
 	typebits = (typenum & 0xFF) << 24;
 	/* Keep generating candidates until one is not already mounted. */
 	do {
 		candidate.val[0] = makeudev(255,
 		    typebits | ((mntid_base & 0xFF00) << 8) |
 		    (mntid_base & 0xFF));
 		mntid_base++;
 	} while (vfs_getvfs(&candidate) != NULL);
 	mp->mnt_stat.f_fsid.val[0] = candidate.val[0];
 	mp->mnt_stat.f_fsid.val[1] = candidate.val[1];
 	simple_unlock(&mntid_slock);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  *
  * The enumerators below name values 0..3 in that order; the default is
  * TSP_SEC.  vfs_timestamp() treats any other value like TSP_NSEC.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Get a current timestamp.
  */
 void
 vfs_timestamp(tsp)
 	struct timespec *tsp;
 {
 	struct timeval tv;
 
 	switch (timestamp_precision) {
 	case TSP_SEC:
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 		break;
 	case TSP_HZ:
 		getnanotime(tsp);
 		break;
 	case TSP_USEC:
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 		break;
 	case TSP_NSEC:
 	default:
 		nanotime(tsp);
 		break;
 	}
 }
 
 /*
  * Reset a vattr: the type becomes VNON, va_vaflags becomes 0 and every
  * other attribute is set to VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	vap->va_type = VNON;
 
 	/* Identity. */
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_gen = VNOVAL;
 
 	/* Ownership, permissions and link count. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_flags = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 
 	vap->va_vaflags = 0;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desireable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
 vlrureclaim(struct mount *mp)
 {
 	struct vnode *vp;
 	int done;
 	int trigger;
 	int usevnodes;
 	int count;
 
 	/*
 	 * Calculate the trigger point, don't allow user
 	 * screwups to blow us up.   This prevents us from
 	 * recycling vnodes with lots of resident pages.  We
 	 * aren't trying to free memory, we are trying to
 	 * free vnodes.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes <= 0)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 
 	done = 0;
 	simple_lock(&mntvnode_slock);
 	count = mp->mnt_nvnodelistsize / 10 + 1;
 	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 
 		if (vp->v_type != VNON &&
 		    vp->v_type != VBAD &&
 		    VMIGHTFREE(vp) &&		/* critical path opt */
 		    (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) &&
 		    simple_lock_try(&vp->v_interlock)
 		) {
 			simple_unlock(&mntvnode_slock);
 			if (VMIGHTFREE(vp)) {
 				vgonel(vp, curproc);
 				done++;
 			} else {
 				simple_unlock(&vp->v_interlock);
 			}
 			simple_lock(&mntvnode_slock);
 		}
 		--count;
 	}
 	simple_unlock(&mntvnode_slock);
 	return done;
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of file system code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Body of the vnlru kernel process.  While the number of in-use vnodes
  * is at or below nine tenths of desiredvnodes it clears vnlruproc_sig,
  * wakes any waiters and naps; otherwise it runs vlrureclaim() over
  * every mount it can busy without waiting.  Never returns.
  */
 static void 
 vnlru_proc(void)
 {
 	struct proc *self = vnlruproc;
 	struct mount *mp, *next;
 	int freed;
 	int spl;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, self,
 	    SHUTDOWN_PRI_FIRST);   
 
 	spl = splbio();
 	for (;;) {
 		kproc_suspend_loop(self);
 		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
 			/* Nothing to do: release waiters and nap. */
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			tsleep(vnlruproc, PVFS, "vlruwt", hz);
 			continue;
 		}
 
 		freed = 0;
 		simple_lock(&mountlist_slock);
 		mp = TAILQ_FIRST(&mountlist);
 		while (mp != NULL) {
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, self) == 0) {
 				freed += vlrureclaim(mp);
 				simple_lock(&mountlist_slock);
 				next = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp, self);
 			} else {
 				next = TAILQ_NEXT(mp, mnt_list);
 			}
 			mp = next;
 		}
 		simple_unlock(&mountlist_slock);
 
 		if (freed == 0) {
 			/* A whole pass reclaimed nothing; back off longer. */
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		}
 	}
 	splx(spl);
 }
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: process
  * name, entry point, and where to store the resulting proc pointer.
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 extern vop_t **dead_vnodeop_p;
 
 /*
  * Return the next vnode from the free list, or a freshly allocated one
  * when nothing on the free list can be reused.
  *
  * tag, vops - stored in the vnode's v_tag and v_op.
  * mp        - mount point the vnode is queued on via insmntque().
  * vpp       - receives the vnode, which is returned with v_usecount
  *             set to 1, v_type VNON and v_data cleared.
  *
  * May sleep waiting for the vnlru process while too many vnodes are in
  * use.  Always returns 0.
  */
 int
 getnewvnode(tag, mp, vops, vpp)
 	enum vtagtype tag;
 	struct mount *mp;
 	vop_t **vops;
 	struct vnode **vpp;
 {
 	int s;
 	struct proc *p = curproc;	/* XXX */
 	struct vnode *vp = NULL;
 	vm_object_t object;
 
 	s = splbio();
 
 	/*
 	 * Try to reuse vnodes if we hit the max.  This situation only
 	 * occurs in certain large-memory (2G+) situations.  We cannot
 	 * attempt to directly reclaim vnodes due to nasty recursion
 	 * problems.
 	 */
 	while (numvnodes - freevnodes > desiredvnodes) {
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
 	}
 
 
 	/*
 	 * Attempt to reuse a vnode already on the free list, allocating
 	 * a new vnode if we can't find one or if we have not reached a
 	 * good minimum for good LRU performance.
 	 */
 	simple_lock(&vnode_free_list_slock);
 	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
 		int count;
 
 		/*
 		 * Examine at most freevnodes candidates.  A rejected
 		 * candidate is moved to the tail of the free list and vp
 		 * is reset to NULL; an accepted one leaves the loop with
 		 * its interlock held.
 		 */
 		for (count = 0; count < freevnodes; count++) {
 			vp = TAILQ_FIRST(&vnode_free_list);
 			if (vp == NULL || vp->v_usecount)
 				panic("getnewvnode: free vnode isn't");
 
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 			/*
 			 * Skip vnodes whose VM object still has resident
 			 * pages or references, and vnodes whose interlock
 			 * cannot be taken without waiting.
 			 */
 			if ((VOP_GETVOBJECT(vp, &object) == 0 &&
 			    (object->resident_page_count || object->ref_count)) ||
 			    !simple_lock_try(&vp->v_interlock)) {
 				TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 				vp = NULL;
 				continue;
 			}
 			if (LIST_FIRST(&vp->v_cache_src)) {
 				/*
 				 * note: nameileafonly sysctl is temporary,
 				 * for debugging only, and will eventually be
 				 * removed.
 				 */
 				if (nameileafonly > 0) {
 					/*
 					 * Do not reuse namei-cached directory
 					 * vnodes that have cached
 					 * subdirectories.
 					 */
 					if (cache_leaf_test(vp) < 0) {
 						simple_unlock(&vp->v_interlock);
 						TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 						vp = NULL;
 						continue;
 					}
 				} else if (nameileafonly < 0 || 
 					    vmiodirenable == 0) {
 					/*
 					 * Do not reuse namei-cached directory
 					 * vnodes if nameileafonly is -1 or
 					 * if VMIO backing for directories is
 					 * turned off (otherwise we reuse them
 					 * too quickly).
 					 */
 					simple_unlock(&vp->v_interlock);
 					TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 					vp = NULL;
 					continue;
 				}
 			}
 			break;
 		}
 	}
 
 	if (vp) {
 		/*
 		 * Reuse: take the vnode off the free count, purge its
 		 * name-cache entries and clean it out (vgonel() unless
 		 * it is already VBAD), then reset the recycled fields.
 		 */
 		vp->v_flag |= VDOOMED;
 		vp->v_flag &= ~VFREE;
 		freevnodes--;
 		simple_unlock(&vnode_free_list_slock);
 		cache_purge(vp);
 		vp->v_lease = NULL;
 		if (vp->v_type != VBAD) {
 			vgonel(vp, p);
 		} else {
 			simple_unlock(&vp->v_interlock);
 		}
 
 #ifdef INVARIANTS
 		{
 			int s;
 
 			if (vp->v_data)
 				panic("cleaned vnode isn't");
 			s = splbio();
 			if (vp->v_numoutput)
 				panic("Clean vnode has pending I/O's");
 			splx(s);
 		}
 #endif
 		vp->v_flag = 0;
 		vp->v_lastw = 0;
 		vp->v_lasta = 0;
 		vp->v_cstart = 0;
 		vp->v_clen = 0;
 		vp->v_socket = 0;
 		vp->v_writecount = 0;	/* XXX */
 	} else {
 		/* Nothing reusable: allocate and zero a new vnode. */
 		simple_unlock(&vnode_free_list_slock);
 		vp = (struct vnode *) zalloc(vnode_zone);
 		bzero((char *) vp, sizeof *vp);
 		simple_lock_init(&vp->v_interlock);
 		vp->v_dd = vp;
 		cache_purge(vp);
 		LIST_INIT(&vp->v_cache_src);
 		TAILQ_INIT(&vp->v_cache_dst);
 		numvnodes++;
 	}
 
 	/* Common initialization for both the reused and the new vnode. */
 	TAILQ_INIT(&vp->v_cleanblkhd);
 	TAILQ_INIT(&vp->v_dirtyblkhd);
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	insmntque(vp, mp);
 	*vpp = vp;
 	vp->v_usecount = 1;
 	vp->v_data = 0;
 	splx(s);
 
 	vfs_object_create(vp, p, p->p_ucred);
 	return (0);
 }
 
 /*
  * Move a vnode from one mount queue to another.  The vnode is removed
  * from the list of its current mount, if any, and appended to the list
  * of mp, unless mp is NULL, in which case it ends up on no list.
  */
 static void
 insmntque(struct vnode *vp, struct mount *mp)
 {
 	struct mount *oldmp;
 
 	simple_lock(&mntvnode_slock);
 
 	/* Delete from old mount point vnode list, if on one. */
 	oldmp = vp->v_mount;
 	if (oldmp != NULL) {
 		KASSERT(oldmp->mnt_nvnodelistsize > 0,
 			("bad mount point vnode list size"));
 		TAILQ_REMOVE(&oldmp->mnt_nvnodelist, vp, v_nmntvnodes);
 		oldmp->mnt_nvnodelistsize--;
 	}
 
 	/* Insert into list of vnodes for the new mount point, if available. */
 	vp->v_mount = mp;
 	if (mp != NULL) {
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		mp->mnt_nvnodelistsize++;
 	}
 	simple_unlock(&mntvnode_slock);
 }
 
 /*
  * Update outstanding I/O count and do wakeup if requested.
  */
 void
 vwakeup(bp)
 	register struct buf *bp;
 {
 	register struct vnode *vp;
 
 	bp->b_flags &= ~B_WRITEINPROG;
 	if ((vp = bp->b_vp)) {
 		vp->v_numoutput--;
 		if (vp->v_numoutput < 0)
 			panic("vwakeup: neg numoutput");
 		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
 			vp->v_flag &= ~VBWAIT;
 			wakeup((caddr_t) &vp->v_numoutput);
 		}
 	}
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  *
  *	vp	 - vnode whose clean and dirty buffer lists are emptied
  *	flags	 - V_SAVE: write dirty data out (VOP_FSYNC, delayed writes)
  *		   instead of throwing it away
  *	cred, p	 - passed through to VOP_FSYNC
  *	slpflag, slptimeo - passed to tsleep()/BUF_TIMELOCK() while waiting
  *
  * Returns 0 on success, or the error from tsleep(), VOP_FSYNC() or
  * BUF_TIMELOCK().  Panics if buffers are still attached at the end.
  */
 int
 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 	register struct vnode *vp;
 	int flags;
 	struct ucred *cred;
 	struct proc *p;
 	int slpflag, slptimeo;
 {
 	register struct buf *bp;
 	struct buf *nbp, *blist;
 	int s, error;
 	vm_object_t object;
 
 	/*
 	 * When saving, first let writes in progress drain and then push
 	 * the dirty list out synchronously.
 	 */
 	if (flags & V_SAVE) {
 		s = splbio();
 		while (vp->v_numoutput) {
 			vp->v_flag |= VBWAIT;
 			error = tsleep((caddr_t)&vp->v_numoutput,
 			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
 			if (error) {
 				splx(s);
 				return (error);
 			}
 		}
 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 			splx(s);
 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
 				return (error);
 			s = splbio();
 			if (vp->v_numoutput > 0 ||
 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
 				panic("vinvalbuf: dirty bufs");
 		}
 		splx(s);
   	}
 	s = splbio();
 	/*
 	 * Repeatedly walk the clean list (or the dirty list once the clean
 	 * list is empty) until both are empty.  The inner walk is abandoned
 	 * and restarted from the list head whenever we had to sleep for a
 	 * buffer lock or started a write.
 	 */
 	for (;;) {
 		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
 		if (!blist)
 			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
 		if (!blist)
 			break;
 
 		for (bp = blist; bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 				/*
 				 * Buffer is busy: wait for it.  ENOLCK means
 				 * rescan from the top; any other result is
 				 * handed back to the caller.
 				 */
 				error = BUF_TIMELOCK(bp,
 				    LK_EXCLUSIVE | LK_SLEEPFAIL,
 				    "vinvalbuf", slpflag, slptimeo);
 				if (error == ENOLCK)
 					break;
 				splx(s);
 				return (error);
 			}
 			/*
 			 * XXX Since there are no node locks for NFS, I
 			 * believe there is a slight chance that a delayed
 			 * write will occur while sleeping just above, so
 			 * check for it.  Note that vfs_bio_awrite expects
 			 * buffers to reside on a queue, while VOP_BWRITE and
 			 * brelse do not.
 			 */
 			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 				(flags & V_SAVE)) {
 
 				if (bp->b_vp == vp) {
 					if (bp->b_flags & B_CLUSTEROK) {
 						BUF_UNLOCK(bp);
 						vfs_bio_awrite(bp);
 					} else {
 						bremfree(bp);
 						bp->b_flags |= B_ASYNC;
 						VOP_BWRITE(bp->b_vp, bp);
 					}
 				} else {
 					bremfree(bp);
 					(void) VOP_BWRITE(bp->b_vp, bp);
 				}
 				break;
 			}
 			/* Not being saved: invalidate and release it. */
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 		}
 	}
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		while (vp->v_numoutput > 0) {
 			vp->v_flag |= VBWAIT;
 			tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
 		}
 		if (VOP_GETVOBJECT(vp, &object) == 0) {
 			while (object->paging_in_progress)
 				vm_object_pip_sleep(object, "vnvlbx");
 		}
 	} while (vp->v_numoutput > 0);
 
 	splx(s);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	simple_lock(&vp->v_interlock);
 	if (VOP_GETVOBJECT(vp, &object) == 0) {
 		vm_object_page_remove(object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 	}
 	simple_unlock(&vp->v_interlock);
 
 	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
 		panic("vinvalbuf: flush failed");
 	return (0);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  *
  *	vp	- vnode being truncated
  *	cred, p	- not referenced by this function
  *	length	- new length of the file in bytes
  *	blksize	- block size used to turn `length' into a logical block
  *
  * Buffers on either list whose logical block is at or beyond the new
  * end are invalidated.  When the new length is non-zero, delayed-write
  * buffers with a negative logical block number are written out as well.
  * Always returns 0.
  */
 int
 vtruncbuf(vp, cred, p, length, blksize)
 	register struct vnode *vp;
 	struct ucred *cred;
 	struct proc *p;
 	off_t length;
 	int blksize;
 {
 	register struct buf *bp;
 	struct buf *nbp;
 	int s, anyfreed;
 	int trunclbn;
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	s = splbio();
 restart:
 	/* Keep sweeping both lists until a full pass frees nothing. */
 	anyfreed = 1;
 	for (;anyfreed;) {
 		anyfreed = 0;
 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					/* Busy: wait for it, then rescan. */
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					bp->b_flags |= (B_INVAL | B_RELBUF);
 					bp->b_flags &= ~B_ASYNC;
 					brelse(bp);
 					anyfreed = 1;
 				}
 				/*
 				 * The saved successor is only usable if it
 				 * is still a clean buffer of this vnode.
 				 */
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI))) {
 					goto restart;
 				}
 			}
 		}
 
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					/* Busy: wait for it, then rescan. */
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					bp->b_flags |= (B_INVAL | B_RELBUF);
 					bp->b_flags &= ~B_ASYNC;
 					brelse(bp);
 					anyfreed = 1;
 				}
 				/*
 				 * The saved successor is only usable if it
 				 * is still a dirty buffer of this vnode.
 				 */
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI) == 0)) {
 					goto restart;
 				}
 			}
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		/*
 		 * Write out delayed-write buffers with negative logical
 		 * block numbers, rescanning the dirty list after each one.
 		 */
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					if (bp->b_vp == vp) {
 						bp->b_flags |= B_ASYNC;
 					} else {
 						bp->b_flags &= ~B_ASYNC;
 					}
 					VOP_BWRITE(bp->b_vp, bp);
 				}
 				goto restartsync;
 			}
 
 		}
 	}
 
 	/* Let every write that is still in flight finish. */
 	while (vp->v_numoutput > 0) {
 		vp->v_flag |= VBWAIT;
 		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
 	}
 
 	splx(s);
 
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * Attach a buffer to a vnode.  The buffer must not already belong to a
  * vnode.  A hold is taken on the vnode, the buffer's device is derived
  * from the vnode, and the buffer is appended to the vnode's clean list.
  */
 void
 bgetvp(vp, bp)
 	register struct vnode *vp;
 	register struct buf *bp;
 {
 	int ipl;
 
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 
 	vhold(vp);
 	bp->b_vp = vp;
 	bp->b_dev = vn_todev(vp);
 
 	/* Mark the buffer clean and queue it with bio interrupts blocked. */
 	ipl = splbio();
 	bp->b_xflags = (bp->b_xflags | BX_VNCLEAN) & ~BX_VNDIRTY;
 	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 	splx(ipl);
 }
 
 /*
  * Detach a buffer from its vnode: unlink it from whichever of the vnode's
  * buffer lists it is on, take the vnode off the syncer worklist if that
  * left it without dirty buffers, and drop the hold taken on the vnode.
  */
 void
 brelvp(bp)
 	register struct buf *bp;
 {
 	struct vnode *vp;
 	struct buflists *bufq;
 	int ipl;
 
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	vp = bp->b_vp;
 	ipl = splbio();
 
 	/* Unlink from the dirty or the clean list, if on either. */
 	if ((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != 0) {
 		bufq = (bp->b_xflags & BX_VNDIRTY) ?
 		    &vp->v_dirtyblkhd : &vp->v_cleanblkhd;
 		TAILQ_REMOVE(bufq, bp, b_vnbufs);
 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 	}
 
 	/* No dirty buffers left: the syncer need not visit this vnode. */
 	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 		vp->v_flag &= ~VONWORKLST;
 		LIST_REMOVE(vp, v_synclist);
 	}
 	splx(ipl);
 
 	bp->b_vp = NULL;
 	vdrop(vp);
 }
 
 /*
  * The workitem queue.
  * 
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, writes to block devices
  * that have a filesystem mounted on them are delayed only about half
  * the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 
 /*
  * Schedule a vnode on the syncer worklist `delay' slots ahead of the slot
  * the syncer will process next.  A vnode that is already on the worklist
  * is moved.  The delay is capped at syncer_maxdelay - 2.
  */
 static void
 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 {
 	int ipl, idx;
 
 	ipl = splbio();
 
 	/* Leave the old slot first when rescheduling. */
 	if ((vp->v_flag & VONWORKLST) != 0) {
 		LIST_REMOVE(vp, v_synclist);
 	}
 
 	idx = delay;
 	if (idx > syncer_maxdelay - 2)
 		idx = syncer_maxdelay - 2;
 	idx = (syncer_delayno + idx) & syncer_mask;
 
 	LIST_INSERT_HEAD(&syncer_workitem_pending[idx], vp, v_synclist);
 	vp->v_flag |= VONWORKLST;
 
 	splx(ipl);
 }
 
 /*
  * Process pointer of the syncer.  sched_sync() reads it to identify its
  * own process and speedup_syncer() uses it to make the syncer runnable.
  */
 struct  proc *updateproc;
 static void sched_sync __P((void));
 /*
  * Descriptor handed to kproc_start: the name "syncer", the entry point
  * sched_sync and the address of updateproc (presumably where the new
  * process pointer gets stored -- confirm against kproc_start).
  */
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 /* Register kproc_start(&up_kp) at SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST. */
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Each pass of the loop takes the worklist slot selected by
  * syncer_delayno, advances syncer_delayno (wrapping at syncer_maxdelay),
  * fsyncs the vnodes found in that slot, runs the soft updates hook if
  * one is installed, and then sleeps on lbolt unless rushjob asks for
  * another pass right away or the pass did not finish within the same
  * second it started in.  The loop never terminates.
  */
 void 
 sched_sync(void)
 {
 	struct synclist *slp;
 	struct vnode *vp;
 	long starttime;
 	int s;
 	struct proc *p = updateproc;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
 	    SHUTDOWN_PRI_LAST);   
 
 	for (;;) {
 		kproc_suspend_loop(p);
 
 		starttime = time_second;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 */
 		s = splbio();
 		slp = &syncer_workitem_pending[syncer_delayno];
 		syncer_delayno += 1;
 		if (syncer_delayno == syncer_maxdelay)
 			syncer_delayno = 0;
 		splx(s);
 
 		while ((vp = LIST_FIRST(slp)) != NULL) {
 			/* Only fsync vnodes that nobody has locked. */
 			if (VOP_ISLOCKED(vp, NULL) == 0) {
 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
 				VOP_UNLOCK(vp, 0, p);
 			}
 			s = splbio();
 			/*
 			 * If the vnode is still at the head of this slot,
 			 * it has to be moved by hand or the loop would
 			 * never advance past it.
 			 */
 			if (LIST_FIRST(slp) == vp) {
 				/*
 				 * Note: v_tag VT_VFS vps can remain on the
 				 * worklist too with no dirty blocks, but 
 				 * since sync_fsync() moves it to a different 
 				 * slot we are safe.
 				 */
 				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
 				    !vn_isdisk(vp, NULL))
 					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
 				/*
 				 * Put us back on the worklist.  The worklist
 				 * routine will remove us from our current
 				 * position and then add us back in at a later
 				 * position.
 				 */
 				vn_syncer_add_to_worklist(vp, syncdelay);
 			}
 			splx(s);
 		}
 
 		/*
 		 * Do soft update processing.
 		 */
 		if (bioops.io_sync)
 			(*bioops.io_sync)(NULL);
 
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (time_second == starttime)
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Ask the syncer daemon to hurry up.  A syncer sleeping on lbolt is made
  * runnable right away.  One more rush pass is then requested unless
  * rushjob has already reached half of syncdelay; the cap keeps the
  * syncer from taking over the cpu.
  *
  * Returns 1 if an additional pass was requested, 0 if the cap was hit.
  */
 int
 speedup_syncer()
 {
 	int ipl;
 
 	ipl = splhigh();
 	if (updateproc->p_wchan == &lbolt)
 		setrunnable(updateproc);
 	splx(ipl);
 
 	if (rushjob >= syncdelay / 2)
 		return(0);
 
 	rushjob += 1;
 	stat_rush_requests += 1;
 	return (1);
 }
 
 /*
  * Point a p-buffer at a vnode.
  *
  * The buffer is flagged B_PAGING to show that the association is only
  * partial: the bp is neither linked onto one of the vnode's buffer lists
  * nor counted as a hold on the vnode.
  */
 void
 pbgetvp(vp, bp)
 	register struct vnode *vp;
 	register struct buf *bp;
 {
 
 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
 
 	bp->b_flags |= B_PAGING;
 	bp->b_dev = vn_todev(vp);
 	bp->b_vp = vp;
 }
 
 /*
  * Undo pbgetvp(): forget the vnode and clear B_PAGING.  Panics if the
  * buffer has a successor on a vnode buffer list, which a p-buffer should
  * never have.
  */
 void
 pbrelvp(bp)
 	register struct buf *bp;
 {
 
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 
 	/* XXX REMOVE ME */
 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL)
 		panic("relpbuf(): b_vp was probably reassignbuf()d %p %x",
 		    bp, (int)bp->b_flags);
 
 	bp->b_flags &= ~B_PAGING;
 	bp->b_vp = NULL;
 }
 
 /*
  * Retarget a p-buffer at another vnode.  Only valid for buffers flagged
  * B_PAGING; anything else panics.
  */
 void
 pbreassignbuf(bp, newvp)
 	struct buf *bp;
 	struct vnode *newvp;
 {
 	if (!(bp->b_flags & B_PAGING))
 		panic("pbreassignbuf() on non phys bp %p", bp);
 
 	bp->b_vp = newvp;
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  *
  *	bp	- buffer to move; must not be a B_PAGING buffer
  *	newvp	- vnode that is to own the buffer; a NULL value is reported
  *		  on the console and otherwise ignored
  *
  * The buffer is taken off the list it is currently on and put on newvp's
  * dirty list (when B_DELWRI is set) or clean list.  Hold counts follow
  * the buffer when the owning vnode changes.  A vnode that gains its
  * first dirty buffer is scheduled with the syncer; a vnode that is on
  * the syncer worklist but has no dirty buffers is taken off it.
  */
 void
 reassignbuf(bp, newvp)
 	register struct buf *bp;
 	register struct vnode *newvp;
 {
 	struct buflists *listheadp;
 	int delay;
 	int s;
 
 	if (newvp == NULL) {
 		/* Terminate the line so later console output stays legible. */
 		printf("reassignbuf: NULL\n");
 		return;
 	}
 	++reassignbufcalls;
 
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	s = splbio();
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 		if (bp->b_xflags & BX_VNDIRTY)
 			listheadp = &bp->b_vp->v_dirtyblkhd;
 		else
 			listheadp = &bp->b_vp->v_cleanblkhd;
 		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 		if (bp->b_vp != newvp) {
 			vdrop(bp->b_vp);
 			bp->b_vp = NULL;	/* for clarification */
 		}
 	}
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		struct buf *tbp;
 
 		listheadp = &newvp->v_dirtyblkhd;
 		if ((newvp->v_flag & VONWORKLST) == 0) {
 			/*
 			 * First dirty buffer: pick the syncer delay from the
 			 * vnode type.  Directories use dirdelay, devices
 			 * with a filesystem mounted on them use metadelay,
 			 * everything else uses filedelay.
 			 */
 			switch (newvp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 			case VBLK:
 				if (newvp->v_specmountpoint != NULL) {
 					delay = metadelay;
 					break;
 				}
 				/* fall through */
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(newvp, delay);
 		}
 		bp->b_xflags |= BX_VNDIRTY;
 		/*
 		 * Keep the dirty list roughly sorted by logical block, with
 		 * negative (meta-data) block numbers at the tail.
 		 */
 		tbp = TAILQ_FIRST(listheadp);
 		if (tbp == NULL ||
 		    bp->b_lblkno == 0 ||
 		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
 		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
 			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
 			++reassignbufsortgood;
 		} else if (bp->b_lblkno < 0) {
 			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
 			++reassignbufsortgood;
 		} else if (reassignbufmethod == 1) {
 			/*
 			 * New sorting algorithm, only handle sequential case,
 			 * otherwise append to end (but before metadata)
 			 */
 			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
 			    (tbp->b_xflags & BX_VNDIRTY)) {
 				/*
 				 * Found the best place to insert the buffer
 				 */
 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 				++reassignbufsortgood;
 			} else {
 				/*
 				 * Missed, append to end, but before meta-data.
 				 * We know that the head buffer in the list is
 				 * not meta-data due to prior conditionals.
 				 *
 				 * Indirect effects:  NFS second stage write
 				 * tends to wind up here, giving maximum
 				 * distance between the unstable write and the
 				 * commit rpc.
 				 */
 				tbp = TAILQ_LAST(listheadp, buflists);
 				while (tbp && tbp->b_lblkno < 0)
 					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 				++reassignbufsortbad;
 			}
 		} else {
 			/*
 			 * Old sorting algorithm, scan queue and insert
 			 */
 			struct buf *ttbp;
 			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
 			    (ttbp->b_lblkno < bp->b_lblkno)) {
 				++reassignbufloops;
 				tbp = ttbp;
 			}
 			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 		}
 	} else {
 		bp->b_xflags |= BX_VNCLEAN;
 		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
 		/* A vnode without dirty buffers has no business on the worklist. */
 		if ((newvp->v_flag & VONWORKLST) &&
 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
 			newvp->v_flag &= ~VONWORKLST;
 			LIST_REMOVE(newvp, v_synclist);
 		}
 	}
 	/* Take a hold on the new owner if ownership actually changed. */
 	if (bp->b_vp != newvp) {
 		bp->b_vp = newvp;
 		vhold(bp->b_vp);
 	}
 	splx(s);
 }
 
 /*
  * Build a VBLK vnode for the given block device and register it as an
  * alias of that device.  Used for mounting the root file system.
  *
  * On success 0 is returned and *vpp is the new vnode.  On failure *vpp
  * is NULLVP and the return value is ENXIO (for NODEV) or the error from
  * getnewvnode().
  */
 int
 bdevvp(dev, vpp)
 	dev_t dev;
 	struct vnode **vpp;
 {
 	struct vnode *nvp;
 	int error;
 
 	if (dev == NODEV) {
 		*vpp = NULLVP;
 		return (ENXIO);
 	}
 
 	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	nvp->v_type = VBLK;
 	addalias(nvp, dev);
 	*vpp = nvp;
 	return (0);
 }
 
 /*
  * Add vnode to the alias list hung off the dev_t, given the device as a
  * udev_t.
  *
  * The alias list exists because several vnodes can reference the same
  * physical device, so a single vnode's v_usecount does not tell how many
  * users the device has; the counts of all aliases have to be added up,
  * which is what vcount() does.
  *
  * Panics when the vnode is neither VBLK nor VCHR.
  */
 void
 addaliasu(nvp, nvp_rdev)
 	struct vnode *nvp;
 	udev_t nvp_rdev;
 {
 	int isblk;
 
 	if (!(nvp->v_type == VBLK || nvp->v_type == VCHR))
 		panic("addaliasu on non-special vnode");
 
 	isblk = (nvp->v_type == VBLK) ? 1 : 0;
 	addalias(nvp, udev2dev(nvp_rdev, isblk));
 }
 
 /*
  * Bind a special vnode to its device: remember the dev_t in v_rdev and
  * push the vnode onto the device's alias list under spechash_slock.
  * Panics when the vnode is neither VBLK nor VCHR.
  */
 void
 addalias(nvp, dev)
 	struct vnode *nvp;
 	dev_t dev;
 {
 
 	if (!(nvp->v_type == VBLK || nvp->v_type == VCHR))
 		panic("addalias on non-special vnode");
 
 	nvp->v_rdev = dev;
 
 	simple_lock(&spechash_slock);
 	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
 	simple_unlock(&spechash_slock);
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it. The vnode lock bit is set if the
  * vnode is being eliminated in vgone. The process is awakened
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new file system type).
  *
  *	vp	- vnode to reference
  *	flags	- lock flags; with LK_INTERLOCK the caller already holds
  *		  vp->v_interlock, otherwise it is taken here.  The vnode
  *		  lock is only acquired when a bit in LK_TYPE_MASK is set.
  *	p	- process on whose behalf the vnode lock is taken
  *
  * Returns 0 on success, ENOENT if the vnode was being cleaned out by
  * another process, or the error from vn_lock().  The interlock is not
  * released here when vn_lock() is called and succeeds; vn_lock() is
  * handed LK_INTERLOCK for that.
  */
 int
 vget(vp, flags, p)
 	register struct vnode *vp;
 	int flags;
 	struct proc *p;
 {
 	int error;
 
 	/*
 	 * If the vnode is in the process of being cleaned out for
 	 * another use, we wait for the cleaning to finish and then
 	 * return failure. Cleaning is determined by checking that
 	 * the VXLOCK flag is set.
 	 */
 	if ((flags & LK_INTERLOCK) == 0) {
 		simple_lock(&vp->v_interlock);
 	}
 	if (vp->v_flag & VXLOCK) {
 		/* The process doing the cleaning itself is let through. */
 		if (vp->v_vxproc == curproc) {
 #if 0
 			/* this can now occur in normal operation */
 			log(LOG_INFO, "VXLOCK interlock avoided\n");
 #endif
 		} else {
 			vp->v_flag |= VXWANT;
 			simple_unlock(&vp->v_interlock);
 			tsleep((caddr_t)vp, PINOD, "vget", 0);
 			return (ENOENT);
 		}
 	}
 
 	vp->v_usecount++;
 
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	if (flags & LK_TYPE_MASK) {
 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
 			/*
 			 * must expand vrele here because we do not want
 			 * to call VOP_INACTIVE if the reference count
 			 * drops back to zero since it was never really
 			 * active. We must remove it from the free list
 			 * before sleeping so that multiple processes do
 			 * not try to recycle it.
 			 */
 			simple_lock(&vp->v_interlock);
 			vp->v_usecount--;
 			if (VSHOULDFREE(vp))
 				vfree(vp);
 			else
 				vlruvp(vp);
 			simple_unlock(&vp->v_interlock);
 		}
 		return (error);
 	}
 	simple_unlock(&vp->v_interlock);
 	return (0);
 }
 
 /*
  * Take one more use reference on a vnode.  The count is bumped under the
  * vnode interlock.
  */
 void
 vref(struct vnode *vp)
 {
 
 	simple_lock(&vp->v_interlock);
 	vp->v_usecount += 1;
 	simple_unlock(&vp->v_interlock);
 }
 
 /*
  * Release one use reference on an unlocked vnode.
  *
  * When the last reference goes away the vnode is locked, VOP_INACTIVE is
  * run on it, and it is then either put on the free list or passed to
  * vlruvp().  Releasing a vnode whose count is already below one panics.
  */
 void
 vrele(vp)
 	struct vnode *vp;
 {
 	struct proc *p = curproc;	/* XXX */
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 
 	simple_lock(&vp->v_interlock);
 
 	/* There has to be a reference to give up. */
 	if (vp->v_usecount < 1) {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 		simple_unlock(&vp->v_interlock);
 #endif
 		panic("vrele: negative ref cnt");
 	}
 
 	vp->v_usecount--;
 	if (vp->v_usecount > 0) {
 		/* Other users remain; nothing else to do. */
 		simple_unlock(&vp->v_interlock);
 		return;
 	}
 
 	/*
 	 * That was the last reference.  VOP_INACTIVE has to be called with
 	 * the node locked and the caller of vrele does not hold the lock,
 	 * so take it here (handing over the interlock).
 	 */
 	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
 		VOP_INACTIVE(vp, p);
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	else
 		vlruvp(vp);
 }
 
 /*
  * Release one use reference on a vnode that the caller holds locked.
  *
  * While other references remain the vnode is just unlocked.  When the
  * last reference goes away VOP_INACTIVE is run on the still locked
  * vnode, which is then either put on the free list or passed to
  * vlruvp().  Releasing a vnode whose count is already below one panics.
  */
 void
 vput(vp)
 	struct vnode *vp;
 {
 	struct proc *p = curproc;	/* XXX */
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 
 	simple_lock(&vp->v_interlock);
 
 	/* There has to be a reference to give up. */
 	if (vp->v_usecount < 1) {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 
 	vp->v_usecount--;
 	if (vp->v_usecount > 0) {
 		/* Other users remain; drop the lock and the interlock. */
 		VOP_UNLOCK(vp, LK_INTERLOCK, p);
 		return;
 	}
 
 	/*
 	 * That was the last reference.  VOP_INACTIVE has to be called with
 	 * the node locked; for vput it already is, so only the interlock
 	 * needs to be released first.
 	 */
 	simple_unlock(&vp->v_interlock);
 	VOP_INACTIVE(vp, p);
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 	else
 		vlruvp(vp);
 }
 
 /*
  * Register interest in a vnode so that it does not get recycled.  The
  * hold count is raised and, if VSHOULDBUSY() then says so, the vnode is
  * passed to vbusy().
  */
 void
 vhold(vp)
 	register struct vnode *vp;
 {
 	int ipl;
 
 	ipl = splbio();
 	vp->v_holdcnt += 1;
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	splx(ipl);
 }
 
 /*
  * Give up a hold obtained with vhold().  Panics if there is no hold to
  * give up.  The vnode is passed to vfree() when VSHOULDFREE() says so
  * after the count was lowered.
  */
 void
 vdrop(vp)
 	register struct vnode *vp;
 {
 	int ipl;
 
 	ipl = splbio();
 
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt");
 	vp->v_holdcnt -= 1;
 
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 
 	splx(ipl);
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  */
 #ifdef DIAGNOSTIC
 /*
  * DIAGNOSTIC-only knob, exported read/write as sysctl debug.busyprt.
  * When non-zero, vflush() calls vprint() on every vnode it finds busy.
  */
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 /*
  * vflush: walk mp's vnode list and get rid of its vnodes.
  *
  *	mp	 - mount point whose vnodes are flushed
  *	rootrefs - number of references the caller holds on the root vnode
  *	flags	 - any of SKIPSYSTEM, WRITECLOSE, FORCECLOSE
  *
  * Returns 0 on success, EBUSY if vnodes that are in use were left
  * behind, or the error from VFS_ROOT().
  */
 int
 vflush(mp, rootrefs, flags)
 	struct mount *mp;
 	int rootrefs;
 	int flags;
 {
 	struct proc *p = curproc;	/* XXX */
 	struct vnode *vp, *nvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
 			return (error);
 		vput(rootvp);
 	}
 	simple_lock(&mntvnode_slock);
 loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
 		/*
 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
 		 * Start over if it has (it won't be on the list anymore).
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 		simple_lock(&vp->v_interlock);
 		/*
 		 * Skip over a vnodes marked VSYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
 			simple_unlock(&vp->v_interlock);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing. 
 		 */
 		if ((flags & WRITECLOSE) &&
 		    (vp->v_type == VNON ||
 		    (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0 &&
 		    vattr.va_nlink > 0)) &&
 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 			simple_unlock(&vp->v_interlock);
 			continue;
 		}
 
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 */
 		if (vp->v_usecount == 0) {
 			/* mntvnode_slock is dropped around vgonel(). */
 			simple_unlock(&mntvnode_slock);
 			vgonel(vp, p);
 			simple_lock(&mntvnode_slock);
 			continue;
 		}
 
 		/*
 		 * If FORCECLOSE is set, forcibly close the vnode. For block
 		 * or character devices, revert to an anonymous device. For
 		 * all other files, just kill them.
 		 */
 		if (flags & FORCECLOSE) {
 			simple_unlock(&mntvnode_slock);
 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
 				vgonel(vp, p);
 			} else {
 				vclean(vp, 0, p);
 				vp->v_op = spec_vnodeop_p;
 				insmntque(vp, (struct mount *) 0);
 			}
 			simple_lock(&mntvnode_slock);
 			continue;
 		}
 		/* In use and not forced: count it as busy and leave it. */
 #ifdef DIAGNOSTIC
 		if (busyprt)
 			vprint("vflush: busy vnode", vp);
 #endif
 		simple_unlock(&vp->v_interlock);
 		busy++;
 	}
 	simple_unlock(&mntvnode_slock);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		simple_lock(&rootvp->v_interlock);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			vgonel(rootvp, p);
 			busy = 0;
 		} else
 			simple_unlock(&rootvp->v_interlock);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Success: give back the caller's references on the root vnode. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * We do not want to recycle the vnode too quickly.
  *
  * XXX we can't move vp's around the nvnodelist without really screwing
  * up the efficiency of filesystem SYNC and friends.  This code is 
  * disabled until we fix the syncing code's scanning algorithm.
  *
  * As compiled, this function does nothing: the whole body sits inside
  * #if 0.  The disabled code would move the vnode to the tail of its
  * mount point's vnode list under mntvnode_slock.
  */
 static void
 vlruvp(struct vnode *vp)
 {
 #if 0
 	struct mount *mp;
 
 	if ((mp = vp->v_mount) != NULL) {
 		simple_lock(&mntvnode_slock);
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		simple_unlock(&mntvnode_slock);
 	}
 #endif
 }
 
 /*
  * Disassociate the underlying file system from a vnode.
  *
  *	vp	- vnode to clean; the visible callers enter with
  *		  vp->v_interlock held, and it is handed to VOP_LOCK via
  *		  LK_INTERLOCK
  *	flags	- DOCLOSE: also VOP_CLOSE the vnode if it is in use
  *	p	- process doing the cleaning
  *
  * The vnode is marked VXLOCK for the duration, its buffers are flushed
  * (or discarded if they cannot be flushed), it is reclaimed from the
  * filesystem and finally switched over to the dead vnode operations.
  * Anybody who set VXWANT in the meantime is woken up at the end.
  */
 static void
 vclean(vp, flags, p)
 	struct vnode *vp;
 	int flags;
 	struct proc *p;
 {
 	int active;
 
 	/*
 	 * Check to see if the vnode is in use. If so we have to reference it
 	 * before we clean it out so that its count cannot fall to zero and
 	 * generate a race against ourselves to recycle it.
 	 */
 	if ((active = vp->v_usecount))
 		vp->v_usecount++;
 
 	/*
 	 * Prevent the vnode from being recycled or brought into use while we
 	 * clean it out.
 	 */
 	if (vp->v_flag & VXLOCK)
 		panic("vclean: deadlock");
 	vp->v_flag |= VXLOCK;
 	vp->v_vxproc = curproc;
 	/*
 	 * Even if the count is zero, the VOP_INACTIVE routine may still
 	 * have the object locked while it cleans it out. The VOP_LOCK
 	 * ensures that the VOP_INACTIVE routine is done with its work.
 	 * For active vnodes, it ensures that no other activity can
 	 * occur while the underlying object is being cleaned out.
 	 */
 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
 
 	/*
 	 * Clean out any buffers associated with the vnode.  Try to write
 	 * dirty data back first.  If that fails (e.g. VOP_FSYNC reports an
 	 * error) the buffers are still attached, and reclaiming the vnode
 	 * with them in place would leave them pointing at a dead vnode, so
 	 * fall back to throwing them away.
 	 */
 	if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
 		vinvalbuf(vp, 0, NOCRED, p, 0, 0);
 
 	VOP_DESTROYVOBJECT(vp);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed. Note that the
 	 * VOP_INACTIVE will unlock the vnode.
 	 */
 	if (active) {
 		if (flags & DOCLOSE)
 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
 		VOP_INACTIVE(vp, p);
 	} else {
 		/*
 		 * Any other processes trying to obtain this lock must first
 		 * wait for VXLOCK to clear, then call the new lock operation.
 		 */
 		VOP_UNLOCK(vp, 0, p);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, p))
 		panic("vclean: cannot reclaim");
 
 	if (active) {
 		/*
 		 * Inline copy of vrele() since VOP_INACTIVE
 		 * has already been called.
 		 */
 		simple_lock(&vp->v_interlock);
 		if (--vp->v_usecount <= 0) {
 #ifdef DIAGNOSTIC
 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 				vprint("vclean: bad ref count", vp);
 				panic("vclean: ref cnt");
 			}
 #endif
 			vfree(vp);
 		}
 		simple_unlock(&vp->v_interlock);
 	}
 
 	cache_purge(vp);
 	vp->v_vnlock = NULL;
 
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 
 	/*
 	 * Done with purge, notify sleepers of the grim news.
 	 */
 	vp->v_op = dead_vnodeop_p;
 	vn_pollgone(vp);
 	vp->v_tag = VT_NON;
 	vp->v_flag &= ~VXLOCK;
 	vp->v_vxproc = NULL;
 	if (vp->v_flag & VXWANT) {
 		vp->v_flag &= ~VXWANT;
 		wakeup((caddr_t) vp);
 	}
 }
 
 /*
  * Eliminate all activity associated with the requested vnode
  * and with all vnodes aliased to the requested vnode.
  */
 int
 vop_revoke(ap)
 	struct vop_revoke_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *vp, *vq;
 	dev_t dev;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
 
 	vp = ap->a_vp;
 	/*
 	 * If a vgone (or vclean) is already in progress,
 	 * wait until it is done and return.
 	 */
 	if (vp->v_flag & VXLOCK) {
 		vp->v_flag |= VXWANT;
 		simple_unlock(&vp->v_interlock);
 		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
 		return (0);
 	}
 	dev = vp->v_rdev;
 	for (;;) {
 		simple_lock(&spechash_slock);
 		vq = SLIST_FIRST(&dev->si_hlist);
 		simple_unlock(&spechash_slock);
 		if (!vq)
 			break;
 		vgone(vq);
 	}
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  * Release the passed interlock if the vnode will be recycled.
  */
 int
 vrecycle(vp, inter_lkp, p)
 	struct vnode *vp;
 	struct simplelock *inter_lkp;
 	struct proc *p;
 {
 
 	simple_lock(&vp->v_interlock);
 	if (vp->v_usecount == 0) {
 		if (inter_lkp) {
 			simple_unlock(inter_lkp);
 		}
 		vgonel(vp, p);
 		return (1);
 	}
 	simple_unlock(&vp->v_interlock);
 	return (0);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  */
 void
 vgone(vp)
 	register struct vnode *vp;
 {
 	struct proc *p = curproc;	/* XXX */
 
 	simple_lock(&vp->v_interlock);
 	vgonel(vp, p);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  *	vp	- vnode to eliminate; vp->v_interlock is held on entry and
  *		  has been released by the time the function returns
  *	p	- process passed on to vclean()
  *
  * Cleans the vnode out (closing it if needed), takes it off its mount
  * point's vnode list and off the device alias list, moves it to the head
  * of the free list when it is unused and not VDOOMED, and marks it VBAD.
  * If another vgone/vclean is already under way the function only waits
  * for it to finish.
  */
 void
 vgonel(vp, p)
 	struct vnode *vp;
 	struct proc *p;
 {
 	int s;
 
 	/*
 	 * If a vgone (or vclean) is already in progress,
 	 * wait until it is done and return.
 	 */
 	if (vp->v_flag & VXLOCK) {
 		vp->v_flag |= VXWANT;
 		simple_unlock(&vp->v_interlock);
 		tsleep((caddr_t)vp, PINOD, "vgone", 0);
 		return;
 	}
 
 	/*
 	 * Clean out the filesystem specific data.
 	 */
 	vclean(vp, DOCLOSE, p);
 	simple_lock(&vp->v_interlock);
 
 	/*
 	 * Delete from old mount point vnode list, if on one.
 	 */
 	if (vp->v_mount != NULL)
 		insmntque(vp, (struct mount *)0);
 	/*
 	 * If special device, remove it from special device alias list
 	 * if it is on one.
 	 */
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
 		simple_lock(&spechash_slock);
 		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
 		freedev(vp->v_rdev);
 		simple_unlock(&spechash_slock);
 		vp->v_rdev = NULL;
 	}
 
 	/*
 	 * If it is on the freelist and not already at the head,
 	 * move it to the head of the list. The test of the
 	 * VDOOMED flag and the reference count of zero is because
 	 * it will be removed from the free list by getnewvnode,
 	 * but will not have its reference count incremented until
 	 * after calling vgone. If the reference count were
 	 * incremented first, vgone would (incorrectly) try to
 	 * close the previous instance of the underlying object.
 	 */
 	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
 		s = splbio();
 		simple_lock(&vnode_free_list_slock);
 		/* freevnodes only grows when the vnode was not free before. */
 		if (vp->v_flag & VFREE)
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		else
 			freevnodes++;
 		vp->v_flag |= VFREE;
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 		simple_unlock(&vnode_free_list_slock);
 		splx(s);
 	}
 
 	vp->v_type = VBAD;
 	simple_unlock(&vp->v_interlock);
 }
 
 /*
  * Lookup a vnode by device number.
  *
  * Scans the alias list hanging off `dev' for the first vnode whose type
  * equals `type'.  On a hit the vnode is stored through `vpp' and 1 is
  * returned; otherwise `*vpp' is left alone and 0 is returned.
  */
 int
 vfinddev(dev, type, vpp)
 	dev_t dev;
 	enum vtype type;
 	struct vnode **vpp;
 {
 	struct vnode *cand;
 	int found;
 
 	found = 0;
 	simple_lock(&spechash_slock);
 	SLIST_FOREACH(cand, &dev->si_hlist, v_specnext) {
 		if (cand->v_type != type)
 			continue;
 		*vpp = cand;
 		found = 1;
 		break;
 	}
 	simple_unlock(&spechash_slock);
 	return (found);
 }
 
 /*
  * Calculate the total number of references to a special device.
  */
 int
 vcount(vp)
 	struct vnode *vp;
 {
 	struct vnode *vq;
 	int count;
 
 	count = 0;
 	simple_lock(&spechash_slock);
 	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
 		count += vq->v_usecount;
 	simple_unlock(&spechash_slock);
 	return (count);
 }
 
 /*
  * Same as vcount(), but using the dev_t as argument.
  * A device with no vnodes on its alias list has a count of zero.
  */
 
 int
 count_dev(dev)
 	dev_t dev;
 {
 	struct vnode *first = SLIST_FIRST(&dev->si_hlist);
 
 	return (first != NULL ? vcount(first) : 0);
 }
 
 /*
  * Print out a description of a vnode.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 
 /*
  * Helper for vprint(): append "|NAME" to the local flag buffer when the
  * flag NAME is set on the local vnode pointer.  The name is produced by
  * stringifying the macro argument, so it is spelled exactly as written.
  */
 #define	VPRINT_FLAG(flag) do {						\
 	if (vp->v_flag & (flag))					\
 		strcat(buf, "|" #flag);					\
 } while (0)
 
 /*
  * Print a one-line summary of a vnode (optionally prefixed by `label'):
  * its address, type, use/write/hold counts and the names of the flags
  * that are set, followed by the filesystem's own VOP_PRINT() output when
  * the vnode carries private data.
  */
 void
 vprint(label, vp)
 	char *label;
 	struct vnode *vp;
 {
 	char buf[96];
 
 	if (label == NULL)
 		printf("%p: ", (void *)vp);
 	else
 		printf("%s: %p: ", label, (void *)vp);
 	printf("type %s, usecount %d, writecount %d, refcount %d,",
 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
 	    vp->v_holdcnt);
 
 	/* Collect the set flags as "|A|B|C"; the leading '|' is skipped below. */
 	buf[0] = '\0';
 	VPRINT_FLAG(VROOT);
 	VPRINT_FLAG(VTEXT);
 	VPRINT_FLAG(VSYSTEM);
 	VPRINT_FLAG(VXLOCK);
 	VPRINT_FLAG(VXWANT);
 	VPRINT_FLAG(VBWAIT);
 	VPRINT_FLAG(VDOOMED);
 	VPRINT_FLAG(VFREE);
 	VPRINT_FLAG(VOBJBUF);
 	if (buf[0] != '\0')
 		printf(" flags (%s)", buf + 1);
 
 	if (vp->v_data != NULL) {
 		printf("\n\t");
 		VOP_PRINT(vp);
 	} else {
 		printf("\n");
 	}
 }
 #undef VPRINT_FLAG
 
 #ifdef DDB
 #include <ddb/ddb.h>
 /*
  * List all of the locked vnodes in the system.
  * Called when debugging the kernel.
  *
  * Walks the mount list.  Mount points that cannot be busied without
  * waiting are skipped; on every other mount point each vnode for which
  * VOP_ISLOCKED() reports true is printed with vprint().
  */
 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
 {
 	struct proc *p = curproc;	/* XXX */
 	struct mount *mp, *nmp;
 	struct vnode *vp;
 
 	printf("Locked vnodes\n");
 	simple_lock(&mountlist_slock);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 			/* Could not busy this mount point; try the next one. */
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (VOP_ISLOCKED(vp, NULL))
 				vprint((char *)0, vp);
 		}
 		/*
 		 * NOTE(review): the list lock is taken again here, so a
 		 * successful vfs_busy() presumably dropped it -- confirm
 		 * against vfs_busy().  The successor is fetched under the
 		 * lock before the mount point is unbusied.
 		 */
 		simple_lock(&mountlist_slock);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, p);
 	}
 	simple_unlock(&mountlist_slock);
 }
 #endif
 
 /*
  * Top level filesystem related information gathering.
  *
  * Handler for the vfs.generic sysctl node.  A request with no further
  * name components is treated as the old-style "list every filesystem
  * type" query and forwarded to sysctl_ovfs_conf(); otherwise the first
  * component selects VFS_MAXTYPENUM or VFS_CONF.  Returns 0 or an errno
  * value (ENOTDIR for a wrong name length, EOPNOTSUPP for an unknown
  * selector or filesystem type).
  */
 static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	/*
 	 * Step back one element so that name[0] refers to the component
 	 * preceding arg1 and name[1] is the first component in arg1;
 	 * namelen is adjusted to match.
 	 */
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 #ifdef notyet
 	/* all sysctl names at this level are at least name and field */
 	if (namelen < 2)
 		return (ENOTDIR);		/* overloaded */
 	if (name[0] != VFS_GENERIC) {
 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 			if (vfsp->vfc_typenum == name[0])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
 		    oldp, oldlenp, newp, newlen, p));
 	}
 #endif
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		/* Exactly one component: report maxvfsconf. */
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		/* name[2] is the type number of the filesystem to report. */
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
 	}
 	return (EOPNOTSUPP);
 }
 
 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
 	"Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 /*
  * Old-style filesystem configuration query: copy out one struct ovfsconf
  * for every entry on the vfsconf list, in list order.  Returns 0, or the
  * first error reported by SYSCTL_OUT().
  */
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 		/*
 		 * Start from an all-zero record so that anything not
 		 * assigned below (unset fields, padding, the tail of the
 		 * name buffer) is not copied out with old stack contents.
 		 */
+		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 
 #if 0
 #define KINFO_VNODESLOP	10
 /*
  * Dump vnode list (via sysctl).
  * Copyout address of vnode followed by vnode.
  *
  * This handler is compiled out (#if 0).  When no output buffer is given
  * it only reports a size estimate; otherwise it walks every mount point
  * that can be busied without waiting and copies out each vnode on it.
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;	/* XXX */
 	struct mount *mp, *nmp;
 	struct vnode *nvp, *vp;
 	int error;
 
 #define VPTRSZ	sizeof (struct vnode *)
 #define VNODESZ	sizeof (struct vnode)
 
 	req->lock = 0;
 	if (!req->oldptr) /* Make an estimate */
 		return (SYSCTL_OUT(req, 0,
 			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
 
 	simple_lock(&mountlist_slock);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 again:
 		simple_lock(&mntvnode_slock);
 		for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		     vp != NULL;
 		     vp = nvp) {
 			/*
 			 * Check that the vp is still associated with
 			 * this filesystem.  RACE: could have been
 			 * recycled onto the same filesystem.
 			 */
 			if (vp->v_mount != mp) {
 				simple_unlock(&mntvnode_slock);
 				goto again;
 			}
 			nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 			/* The list lock is dropped around the copy-out. */
 			simple_unlock(&mntvnode_slock);
 			/*
 			 * NOTE(review): this error return leaves the mount
 			 * point busied -- worth fixing if this code is ever
 			 * re-enabled.
 			 */
 			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
 			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
 				return (error);
 			simple_lock(&mntvnode_slock);
 		}
 		simple_unlock(&mntvnode_slock);
 		simple_lock(&mountlist_slock);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, p);
 	}
 	simple_unlock(&mountlist_slock);
 
 	return (0);
 }
 #endif
 
 /*
  * XXX
  * Exporting the vnode list on large systems causes them to crash.
  * Exporting the vnode list on medium systems causes sysctl to coredump.
  *
  * For that reason the kern.vnode registration below is disabled too.
  */
 #if 0
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,vnode", "");
 #endif
 
 /*
  * Check to see if a filesystem is mounted on a block device.
  * Returns EBUSY when the vnode has a mount point recorded on it,
  * 0 otherwise.
  */
 int
 vfs_mountedon(vp)
 	struct vnode *vp;
 {
 
 	return (vp->v_specmountpoint != NULL ? EBUSY : 0);
 }
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  */
 void
 vfs_unmountall()
 {
 	struct mount *mp;
 	struct proc *p;
 	int error;
 
 	if (curproc != NULL)
 		p = curproc;
 	else
 		p = initproc;	/* XXX XXX should this be proc0? */
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while(!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		error = dounmount(mp, MNT_FORCE, p);
 		if (error) {
 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
 			printf("unmount of %s failed (",
 			    mp->mnt_stat.f_mntonname);
 			if (error == EBUSY)
 				printf("BUSY)\n");
 			else
 				printf("%d)\n", error);
 		} else {
 			/* The unmount has removed mp from the mountlist */
 		}
 	}
 }
 
 /*
  * Build hash lists of net addresses and hang them off the mount point.
  * Called by ufs_mount() to set up the lists of export addresses.
  *
  * A request without an address (ex_addrlen == 0) installs the default
  * export entry and fails with EPERM if one is already present.
  * Otherwise a netcred carrying the address (and optional mask) copied in
  * from the caller is added to the radix tree of the address family,
  * creating that tree on first use.
  *
  * Returns 0 on success or an errno value: EINVAL for an over-long
  * address or mask or an out-of-range address family, ENOBUFS when no
  * radix tree can be attached for the family, EPERM when the entry
  * already exists, or the error from copyin().
  */
 static int
 vfs_hang_addrlist(mp, nep, argp)
 	struct mount *mp;
 	struct netexport *nep;
 	struct export_args *argp;
 {
 	register struct netcred *np;
 	register struct radix_node_head *rnh;
 	register int i;
 	struct radix_node *rn;
 	struct sockaddr *saddr, *smask = 0;
 	struct domain *dom;
 	int error;
 
 	if (argp->ex_addrlen == 0) {
 		if (mp->mnt_flag & MNT_DEFEXPORTED)
 			return (EPERM);
 		np = &nep->ne_defexported;
 		np->netc_exflags = argp->ex_flags;
 		np->netc_anon = argp->ex_anon;
 		np->netc_anon.cr_ref = 1;
 		mp->mnt_flag |= MNT_DEFEXPORTED;
 		return (0);
 	}
 
 	/*
 	 * Both lengths come from the caller and size the allocation
 	 * below, so bound the mask the same way as the address.
 	 */
 	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
 		return (EINVAL);
 
 	/* One allocation: the netcred, then the address, then the mask. */
 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
 	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
 	bzero((caddr_t) np, i);
 	saddr = (struct sockaddr *) (np + 1);
 	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
 		goto out;
 	/* Never trust an embedded length larger than what was copied in. */
 	if (saddr->sa_len > argp->ex_addrlen)
 		saddr->sa_len = argp->ex_addrlen;
 	if (argp->ex_masklen) {
 		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
 		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
 		if (error)
 			goto out;
 		if (smask->sa_len > argp->ex_masklen)
 			smask->sa_len = argp->ex_masklen;
 	}
 	i = saddr->sa_family;
 	/*
 	 * The family was copied in from the caller as well; refuse values
 	 * that would index outside ne_rtable[], which is walked as
 	 * [0 .. AF_MAX] by vfs_free_addrlist().
 	 */
 	if (i < 0 || i > AF_MAX) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((rnh = nep->ne_rtable[i]) == 0) {
 		/*
 		 * Seems silly to initialize every AF when most are not used,
 		 * do so on demand here
 		 */
 		for (dom = domains; dom; dom = dom->dom_next)
 			if (dom->dom_family == i && dom->dom_rtattach) {
 				dom->dom_rtattach((void **) &nep->ne_rtable[i],
 				    dom->dom_rtoffset);
 				break;
 			}
 		if ((rnh = nep->ne_rtable[i]) == 0) {
 			error = ENOBUFS;
 			goto out;
 		}
 	}
 	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
 	    np->netc_rnodes);
 	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
 		error = EPERM;
 		goto out;
 	}
 	np->netc_exflags = argp->ex_flags;
 	np->netc_anon = argp->ex_anon;
 	np->netc_anon.cr_ref = 1;
 	return (0);
 out:
 	free(np, M_NETADDR);
 	return (error);
 }
 
 /*
  * Tree-walk callback used by vfs_free_addrlist(): unlink one entry from
  * the radix tree passed in `w' and release its storage.  Always
  * returns 0.
  */
 /* ARGSUSED */
 static int
 vfs_free_netcred(rn, w)
 	struct radix_node *rn;
 	void *w;
 {
 	struct radix_node_head *head;
 
 	head = (struct radix_node_head *) w;
 	(*head->rnh_deladdr) (rn->rn_key, rn->rn_mask, head);
 	free((caddr_t) rn, M_NETADDR);
 	return (0);
 }
 
 /*
  * Free the net address hash lists that are hanging off the mount points.
  */
 static void
 vfs_free_addrlist(nep)
 	struct netexport *nep;
 {
 	register int i;
 	register struct radix_node_head *rnh;
 
 	for (i = 0; i <= AF_MAX; i++)
 		if ((rnh = nep->ne_rtable[i])) {
 			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
 			    (caddr_t) rnh);
 			free((caddr_t) rnh, M_RTABLE);
 			nep->ne_rtable[i] = 0;
 		}
 }
 
 int
 vfs_export(mp, nep, argp)
 	struct mount *mp;
 	struct netexport *nep;
 	struct export_args *argp;
 {
 	int error;
 
 	if (argp->ex_flags & MNT_DELEXPORT) {
 		if (mp->mnt_flag & MNT_EXPUBLIC) {
 			vfs_setpublicfs(NULL, NULL, NULL);
 			mp->mnt_flag &= ~MNT_EXPUBLIC;
 		}
 		vfs_free_addrlist(nep);
 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
 	}
 	if (argp->ex_flags & MNT_EXPORTED) {
 		if (argp->ex_flags & MNT_EXPUBLIC) {
 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
 				return (error);
 			mp->mnt_flag |= MNT_EXPUBLIC;
 		}
 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
 			return (error);
 		mp->mnt_flag |= MNT_EXPORTED;
 	}
 	return (0);
 }
 
 
 /*
  * Set the publicly exported filesystem (WebNFS). Currently, only
  * one public filesystem is possible in the spec (RFC 2054 and 2055)
  */
 int
 vfs_setpublicfs(mp, nep, argp)
 	struct mount *mp;
 	struct netexport *nep;
 	struct export_args *argp;
 {
 	int error;
 	struct vnode *rvp;
 	char *cp;
 
 	/*
 	 * mp == NULL -> invalidate the current info, the FS is
 	 * no longer exported. May be called from either vfs_export
 	 * or unmount, so check if it hasn't already been done.
 	 */
 	if (mp == NULL) {
 		if (nfs_pub.np_valid) {
 			nfs_pub.np_valid = 0;
 			if (nfs_pub.np_index != NULL) {
 				FREE(nfs_pub.np_index, M_TEMP);
 				nfs_pub.np_index = NULL;
 			}
 		}
 		return (0);
 	}
 
 	/*
 	 * Only one allowed at a time.
 	 */
 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
 		return (EBUSY);
 
 	/*
 	 * Get real filehandle for root of exported FS.
 	 */
 	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
 
 	if ((error = VFS_ROOT(mp, &rvp)))
 		return (error);
 
 	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
 		return (error);
 
 	vput(rvp);
 
 	/*
 	 * If an indexfile was specified, pull it in.
 	 */
 	if (argp->ex_indexfile != NULL) {
 		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
 		    M_WAITOK);
 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
 		    MAXNAMLEN, (size_t *)0);
 		if (!error) {
 			/*
 			 * Check for illegal filenames.
 			 */
 			for (cp = nfs_pub.np_index; *cp; cp++) {
 				if (*cp == '/') {
 					error = EINVAL;
 					break;
 				}
 			}
 		}
 		if (error) {
 			FREE(nfs_pub.np_index, M_TEMP);
 			return (error);
 		}
 	}
 
 	nfs_pub.np_mount = mp;
 	nfs_pub.np_valid = 1;
 	return (0);
 }
 
 struct netcred *
 vfs_export_lookup(mp, nep, nam)
 	register struct mount *mp;
 	struct netexport *nep;
 	struct sockaddr *nam;
 {
 	register struct netcred *np;
 	register struct radix_node_head *rnh;
 	struct sockaddr *saddr;
 
 	np = NULL;
 	if (mp->mnt_flag & MNT_EXPORTED) {
 		/*
 		 * Lookup in the export list first.
 		 */
 		if (nam != NULL) {
 			saddr = nam;
 			rnh = nep->ne_rtable[saddr->sa_family];
 			if (rnh != NULL) {
 				np = (struct netcred *)
 					(*rnh->rnh_matchaddr)((caddr_t)saddr,
 							      rnh);
 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
 					np = NULL;
 			}
 		}
 		/*
 		 * If no address match, use the default if it exists.
 		 */
 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
 			np = &nep->ne_defexported;
 	}
 	return (np);
 }
 
 /*
  * perform msync on all vnodes under a mount point
  * the mount point must be locked.
  *
  * `flags' is MNT_WAIT for a synchronous clean; with any other value the
  * pages are cleaned without waiting and vnodes that are currently locked
  * are skipped.  The scan restarts from the head of the list whenever the
  * list is seen to have changed underneath it, at most a few times
  * (`tries'), after which it gives up.
  */
 void
 vfs_msync(struct mount *mp, int flags) 
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
 	int tries;
 
 	tries = 5;
 	simple_lock(&mntvnode_slock);
 loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
 		/* The vnode moved to another mount point: rescan or give up. */
 		if (vp->v_mount != mp) {
 			if (--tries > 0)
 				goto loop;
 			break;
 		}
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
 			continue;
 
 		/*
 		 * There could be hundreds of thousands of vnodes, we cannot
 		 * afford to do anything heavy-weight until we have a fairly
 		 * good indication that there is something to do.
 		 */
 		if ((vp->v_flag & VOBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/* The list lock is dropped while the vnode is cleaned. */
 			simple_unlock(&mntvnode_slock);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
 					vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
 				}
 				vput(vp);
 			}
 			simple_lock(&mntvnode_slock);
 			/*
 			 * The successor saved before the lock was dropped
 			 * is stale if the list changed: rescan or give up.
 			 */
 			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
 				if (--tries > 0)
 					goto loop;
 				break;
 			}
 		}
 	}
 	simple_unlock(&mntvnode_slock);
 }
 
 /*
  * Create the VM object needed for VMIO and mmap support.  This
  * is done for all VREG files in the system.  Some filesystems might
  * afford the additional metadata buffering capability of the
  * VMIO code by making the device node be VMIO mode also.
  *
  * vp must be locked when vfs_object_create is called.
  *
  * This is a thin front end for VOP_CREATEVOBJECT(); its result is
  * returned unchanged.
  */
 int
 vfs_object_create(vp, p, cred)
 	struct vnode *vp;
 	struct proc *p;
 	struct ucred *cred;
 {
 	int error;
 
 	/* Note the argument order: the VOP takes (vp, cred, p). */
 	error = VOP_CREATEVOBJECT(vp, cred, p);
 	return (error);
 }
 
 /*
  * Put a vnode on the free list and mark it VFREE.  A vnode flagged VAGE
  * is inserted at the head of the list, any other at the tail; VAGE is
  * cleared either way.  The vnode must not already be free.
  */
 void
 vfree(vp)
 	struct vnode *vp;
 {
 	int ipl;
 
 	ipl = splbio();
 	simple_lock(&vnode_free_list_slock);
 	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
 	if ((vp->v_flag & VAGE) == 0) {
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	} else {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	}
 	freevnodes++;
 	simple_unlock(&vnode_free_list_slock);
 
 	/* The flag updates happen after the list lock has been dropped. */
 	vp->v_flag &= ~VAGE;
 	vp->v_flag |= VFREE;
 	splx(ipl);
 }
 
 /*
  * Take a vnode off the free list: the counterpart of vfree().  The vnode
  * must currently be marked VFREE; on return both VFREE and VAGE are
  * clear and the free vnode count has been decremented.
  */
 void
 vbusy(vp)
 	struct vnode *vp;
 {
 	int s;
 
 	s = splbio();
 	simple_lock(&vnode_free_list_slock);
 	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	freevnodes--;
 	simple_unlock(&vnode_free_list_slock);
 	/* The flag update happens after the list lock has been dropped. */
 	vp->v_flag &= ~(VFREE|VAGE);
 	splx(s);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  *
  * Returns the subset of `events' that was already pending (and consumes
  * it), or 0 after registering the caller for a later wakeup.
  */
 int
 vn_pollrecord(vp, p, events)
 	struct vnode *vp;
 	struct proc *p;
 	short events;
 {
 	short ready;
 
 	ready = 0;
 	simple_lock(&vp->v_pollinfo.vpi_lock);
 	if (vp->v_pollinfo.vpi_revents & events) {
 		/*
 		 * Take only the events that were asked for.  Anything
 		 * else stays pending for the other process which
 		 * presumably had requested it (otherwise it would never
 		 * have been recorded).
 		 */
 		ready = events & vp->v_pollinfo.vpi_revents;
 		vp->v_pollinfo.vpi_revents &= ~ready;
 	} else {
 		/* Nothing pending yet: remember who is waiting for what. */
 		vp->v_pollinfo.vpi_events |= events;
 		selrecord(p, &vp->v_pollinfo.vpi_selinfo);
 	}
 	simple_unlock(&vp->v_pollinfo.vpi_lock);
 	return (ready);
 }
 
 /*
  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
  * it is possible for us to miss an event due to race conditions, but
  * that condition is expected to be rare, so for the moment it is the
  * preferred interface.
  *
  * Events nobody has registered interest in are dropped without any
  * wakeup.
  */
 void
 vn_pollevent(vp, events)
 	struct vnode *vp;
 	short events;
 {
 
 	simple_lock(&vp->v_pollinfo.vpi_lock);
 	if ((vp->v_pollinfo.vpi_events & events) == 0) {
 		simple_unlock(&vp->v_pollinfo.vpi_lock);
 		return;
 	}
 
 	/*
 	 * vpi_events is cleared completely so that selwakeup() is not
 	 * called twice if two events are posted before the polling
 	 * process(es) is awakened.  This also ensures that at most one
 	 * selwakeup() is taken if the polling process is no longer
 	 * interested.  However, it does mean that only one event can be
 	 * noticed at a time.  (Perhaps only those event bits which are
 	 * noted should be cleared?) XXX
 	 */
 	vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
 	vp->v_pollinfo.vpi_revents |= events;
 	selwakeup(&vp->v_pollinfo.vpi_selinfo);
 	simple_unlock(&vp->v_pollinfo.vpi_lock);
 }
 
 /*
  * Wake up anyone polling on vp because it is being revoked.
  * This depends on dead_poll() returning POLLHUP for correct
  * behavior.
  *
  * Nothing is done unless some interest has been registered; when it
  * has, the registration is cleared and the pollers are woken.
  */
 void
 vn_pollgone(vp)
 	struct vnode *vp;
 {
 
 	simple_lock(&vp->v_pollinfo.vpi_lock);
 	if (vp->v_pollinfo.vpi_events == 0) {
 		simple_unlock(&vp->v_pollinfo.vpi_lock);
 		return;
 	}
 	vp->v_pollinfo.vpi_events = 0;
 	selwakeup(&vp->v_pollinfo.vpi_selinfo);
 	simple_unlock(&vp->v_pollinfo.vpi_lock);
 }
 
 
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  *
  * The declarations below build the vnode operations vector used by
  * syncer vnodes.  fsync, inactive, reclaim and print have real
  * implementations in this file; close, lock, unlock and islocked are
  * mapped onto generic routines by the macros; every other operation
  * falls through to vop_eopnotsupp.
  */
 #define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
 static int	sync_fsync __P((struct  vop_fsync_args *));
 static int	sync_inactive __P((struct  vop_inactive_args *));
 static int	sync_reclaim  __P((struct  vop_reclaim_args *));
 #define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
 #define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
 static int	sync_print __P((struct vop_print_args *));
 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
 
 /* Filled in from sync_vnodeop_entries through sync_vnodeop_opv_desc. */
 static vop_t **sync_vnodeop_p;
 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
 	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
 	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
 	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
 	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc sync_vnodeop_opv_desc =
 	{ &sync_vnodeop_p, sync_vnodeop_entries };
 
 VNODEOP_SET(sync_vnodeop_opv_desc);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  */
 int
 vfs_allocate_syncvnode(mp)
 	struct mount *mp;
 {
 	struct vnode *vp;
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Only MNT_LAZY requests do any work.  The syncer vnode is re-queued on
  * the worklist, and if the mount point can be busied without waiting its
  * dirty VM pages are pushed with vfs_msync() and the filesystem is
  * synced lazily with MNT_ASYNC temporarily cleared.  Always returns 0.
  */
 static int
 sync_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct proc *a_p;
 	} */ *ap;
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct proc *p = ap->a_p;
 	int asyncflag;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	vn_syncer_add_to_worklist(syncvp, syncdelay);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 *
 	 * NOTE(review): the list lock is released by hand only on the
 	 * failure path, so a successful vfs_busy() presumably drops it
 	 * itself -- confirm against vfs_busy().
 	 */
 	simple_lock(&mountlist_slock);
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
 		simple_unlock(&mountlist_slock);
 		return (0);
 	}
 	/* Remember MNT_ASYNC, clear it for the sync, then put it back. */
 	asyncflag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	vfs_msync(mp, MNT_NOWAIT);
 	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
 	if (asyncflag)
 		mp->mnt_flag |= MNT_ASYNC;
 	vfs_unbusy(mp, p);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer referenced.
  *
  * Dispose of it straight away with vgone().  Always returns 0.
  */
 static int
 sync_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct proc *a_p;
 	} */ *ap;
 {
 
 	vgone(ap->a_vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  *
  * Modifications to the worklist must be protected at splbio().
  *
  * Detaches the vnode from its mount point's mnt_syncer slot and, if it
  * is still queued, from the syncer worklist.  Always returns 0.
  */
 static int
 sync_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *syncvp;
 	int ipl;
 
 	syncvp = ap->a_vp;
 	ipl = splbio();
 	syncvp->v_mount->mnt_syncer = NULL;
 	if ((syncvp->v_flag & VONWORKLST) != 0) {
 		LIST_REMOVE(syncvp, v_synclist);
 		syncvp->v_flag &= ~VONWORKLST;
 	}
 	splx(ipl);
 
 	return (0);
 }
 
 /*
  * Print out a syncer vnode: a fixed tag, followed by the state of its
  * lock when it has one.  Always returns 0.
  */
 static int
 sync_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *syncvp;
 
 	syncvp = ap->a_vp;
 	printf("syncer vnode");
 	if (syncvp->v_vnlock != NULL) {
 		lockmgr_printinfo(syncvp->v_vnlock);
 	}
 	printf("\n");
 	return (0);
 }
 
 /*
  * extract the dev_t from a VBLK or VCHR
  * (any other vnode type yields NODEV)
  */
 dev_t
 vn_todev(vp)
 	struct vnode *vp;
 {
 	switch (vp->v_type) {
 	case VBLK:
 	case VCHR:
 		return (vp->v_rdev);
 	default:
 		return (NODEV);
 	}
 }
 
 /*
  * Check if vnode represents a disk device
  */
 int
 vn_isdisk(vp, errp)
 	struct vnode *vp;
 	int *errp;
 {
 	if (vp->v_type != VBLK && vp->v_type != VCHR) {
 		if (errp != NULL)
 			*errp = ENOTBLK;
 		return (0);
 	}
 	if (vp->v_rdev == NULL) {
 		if (errp != NULL)
 			*errp = ENXIO;
 		return (0);
 	}
 	if (!devsw(vp->v_rdev)) {
 		if (errp != NULL)
 			*errp = ENXIO;
 		return (0);
 	}
 	if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
 		if (errp != NULL)
 			*errp = ENOTBLK;
 		return (0);
 	}
 	if (errp != NULL)
 		*errp = 0;
 	return (1);
 }
 
 /*
  * Release the resources a name lookup left behind in `ndp'.
  *
  * Each step below runs unless the matching NDF_NO_* bit is set in
  * `flags' and only when the lookup flags (or the pointer itself) show
  * there is something to release: the pathname buffer, the lock and the
  * reference on the parent directory vnode, the lock and the reference
  * on the result vnode, and the reference on the start directory.
  * Pointers whose reference is dropped are reset to NULL.
  */
 void
 NDFREE(ndp, flags)
      struct nameidata *ndp;
      const uint flags;
 {
 	if (!(flags & NDF_NO_FREE_PNBUF) &&
 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
 		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 	/*
 	 * The parent is unlocked here only when it is a different vnode
 	 * from the result; when they are the same vnode the unlock is
 	 * left to the NDF_NO_VP_UNLOCK step further down.
 	 */
 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
 	    ndp->ni_dvp != ndp->ni_vp)
 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
 	if (!(flags & NDF_NO_DVP_RELE) &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
 		vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 	if (!(flags & NDF_NO_VP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
 	if (!(flags & NDF_NO_VP_RELE) &&
 	    ndp->ni_vp) {
 		vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
Index: stable/4/sys/net/if_mib.c
===================================================================
--- stable/4/sys/net/if_mib.c	(revision 145952)
+++ stable/4/sys/net/if_mib.c	(revision 145953)
@@ -1,150 +1,151 @@
 /*
  * Copyright 1996 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_mib.h>
 
 /*
  * A sysctl(3) MIB for generic interface information.  This information
  * is exported in the net.link.generic branch, which has the following
  * structure:
  *
  * net.link.generic	.system			- system-wide control variables
  *						  and statistics (node)
  *			.ifdata.<ifindex>.general
  *						- what's in `struct ifdata'
  *						  plus some other info
  *			.ifdata.<ifindex>.linkspecific
  *						- a link-type-specific data
  *						  structure (as might be used
  *						  by an SNMP agent)
  *
  * Perhaps someday we will make addresses accessible via this interface
  * as well (then there will be four such...).  The reason that the
  * index comes before the last element in the name is because it
  * seems more orthogonal that way, particularly with the possibility
  * of other per-interface data living down here as well (e.g., integrated
  * services stuff).
  */
 
 /* net.link.generic.system and its read-only interface count. */
 SYSCTL_DECL(_net_link_generic);
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0,
 	    "Variables global to all interfaces");
 SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD,
 	   &if_index, 0, "Number of configured interfaces");
 
 /*
  * Handler for net.link.generic.ifdata.<ifindex>.<what>.
  *
  * name[0] is the 1-based interface index and name[1] selects
  * IFDATA_GENERAL (a struct ifmibdata assembled from the interface) or
  * IFDATA_LINKSPECIFIC (the interface's link MIB blob).  Both selectors
  * copy the current value out first and, when the request carries new
  * data, read it back in.
  *
  * Returns 0 or an errno value: EINVAL for a wrong name length, ENOENT
  * for an unknown interface or selector, ENAMETOOLONG when the interface
  * name does not fit in ifmd_name, or the error from SYSCTL_OUT() or
  * SYSCTL_IN().
  */
 static int
 sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
 {
 	int *name = (int *)arg1;
 	int error, ifnlen;
 	u_int namelen = arg2;
 	struct ifnet *ifp;
 	char workbuf[64];
 	struct ifmibdata ifmd;
 
 	if (namelen != 2)
 		return EINVAL;
 
 	if (name[0] <= 0 || name[0] > if_index ||
 	    ifnet_addrs[name[0] - 1] == NULL)
 		return ENOENT;
 
 	ifp = ifnet_addrs[name[0] - 1]->ifa_ifp;
 
 	switch(name[1]) {
 	default:
 		return ENOENT;
 
 	case IFDATA_GENERAL:
 		/*
 		 * Start from an all-zero record so that anything not
 		 * assigned below (padding, the tail of the name buffer)
 		 * is not copied out with old stack contents.
 		 */
+		bzero(&ifmd, sizeof(ifmd));
 		ifnlen = snprintf(workbuf, sizeof(workbuf),
 		    "%s%d", ifp->if_name, ifp->if_unit);
 		if(ifnlen + 1 > sizeof ifmd.ifmd_name) {
 			return ENAMETOOLONG;
 		} else {
 			strcpy(ifmd.ifmd_name, workbuf);
 		}
 
 #define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld
 		COPY(pcount);
 		COPY(flags);
 		COPY(data);
 #undef COPY
 		ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
 		ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
 		ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops;
 
 		error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, &ifmd, sizeof ifmd);
 		if (error)
 			return error;
 
 		/*
 		 * DONTCOPY overwrites the listed fields of the incoming
 		 * record with the interface's current values, so the
 		 * wholesale COPY(data) below leaves them unchanged.
 		 */
 #define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld
 		DONTCOPY(type);
 		DONTCOPY(physical);
 		DONTCOPY(addrlen);
 		DONTCOPY(hdrlen);
 		DONTCOPY(mtu);
 		DONTCOPY(metric);
 		DONTCOPY(baudrate);
 #undef DONTCOPY
 #define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld
 		COPY(data);
 		ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen;
 		ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops;
 #undef COPY
 		break;
 
 	case IFDATA_LINKSPECIFIC:
 		error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error)
 			return error;
 		
 		/* Last case: control simply leaves the switch. */
 	}
 	return 0;
 }
 
 SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW,
 	    sysctl_ifdata, "Interface table");
 
Index: stable/4/sys/netinet/ip_divert.c
===================================================================
--- stable/4/sys/netinet/ip_divert.c	(revision 145952)
+++ stable/4/sys/netinet/ip_divert.c	(revision 145953)
@@ -1,518 +1,519 @@
 /*
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_ipsec.h"
 
 #ifndef INET
 #error "IPDIVERT requires INET."
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <vm/vm_zone.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 
 /*
  * Divert sockets
  */
 
 /*
  * Socket buffer reservations for divert sockets: enough space to hold
  * a full IP packet (64K) plus 100 bytes of slack.
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Divert sockets work in conjunction with ipfw, see the divert(4)
  * manpage for features.
  * Internally, packets selected by ipfw in ip_input() or ip_output(),
  * and never diverted before, are passed to the input queue of the
  * divert socket with a given 'divert_port' number (as specified in
  * the matching ipfw rule), and they are tagged with a 16 bit cookie
  * (representing the rule number of the matching ipfw rule), which
  * is passed to process reading from the socket.
  *
  * Packets written to the divert socket are again tagged with a cookie
  * (usually the same as above) and a destination address.
  * If the destination address is INADDR_ANY then the packet is
  * treated as outgoing and sent to ip_output(), otherwise it is
  * treated as incoming and sent to ip_input().
  * In both cases, the packet is tagged with the cookie.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * ipfw processing will start at the rule number after the one
  * written in the cookie (so, tagging a packet with a cookie of 0
  * will cause it to be effectively considered as a standard packet).
  */
 
 /* Internal variables: list of divert PCBs and its bookkeeping record */
 static struct inpcbhead divcb;
 static struct inpcbinfo divcbinfo;
 
 /* Buffer sizes handed to soreserve() when a divert socket is attached */
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 /*
  * Optimization: have this preinitialized.  Only the length and family
  * are fixed; divert_packet() rewrites the port, address and sin_zero
  * fields for every packet it delivers.
  */
 static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET };
 
 /*
  * Initialize divert connection block queue.
  */
 void
 div_init(void)
 {
 	LIST_INIT(&divcb);
 	divcbinfo.listhead = &divcb;
 	/*
 	 * XXX We don't use the hash list for divert IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask);
 	divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask);
 	divcbinfo.ipi_zone = zinit("divcb", sizeof(struct inpcb),
 				   maxsockets, ZONE_INTERRUPT, 0);
 }
 
 /*
  * IPPROTO_DIVERT is not a real IP protocol, so nothing carrying that
  * protocol number may enter the system from the outside: the packet
  * is discarded and accounted as "no protocol".
  */
 void
 div_input(struct mbuf *m, int off, int proto)
 {
 	m_freem(m);
 	ipstat.ips_noproto++;
 }
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
  *
  * Fills in the shared source address 'divsrc' (rule number in sin_port,
  * receive interface address in sin_addr, receive interface name in
  * sin_zero) and appends the mbuf chain to the receive buffer of the
  * divert socket bound to 'port'.
  *
  *   m         packet to divert; consumed on every path
  *   incoming  non-zero if the packet was received (not locally sent)
  *   port      divert port in host byte order; must be non-zero
  *   rule      number of the matching rule, reported via sin_port
  */
 void
 divert_packet(struct mbuf *m, int incoming, int port, int rule)
 {
 	struct ip *ip;
 	struct inpcb *inp;
 	struct socket *sa;
 	u_int16_t nport;
 
 	/* Sanity check */
 	KASSERT(port != 0, ("%s: port=0", __FUNCTION__));
 
 	divsrc.sin_port = rule;		/* record matching rule */
 
 	/* Assure header: m_pullup() returning 0 means the chain is gone */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == 0)
 		return;
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	divsrc.sin_addr.s_addr = 0;
 	if (incoming) {
 		struct ifaddr *ifa;
 
 		/* Sanity check */
 		KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __FUNCTION__));
 
 		/* Find IP address for receive interface (first AF_INET one) */
 		TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr == NULL)
 				continue;
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero));
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in there in the
 		 * sin_zero array. XXX This needs to be moved to a
 		 * different sockaddr type for divert, e.g.
 		 * sockaddr_div with multiple fields like
 		 * sockaddr_dl. Presently we have only 7 bytes
 		 * but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */
 		snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero),
 			"%s%d", m->m_pkthdr.rcvif->if_name,
 			m->m_pkthdr.rcvif->if_unit);
 	}
 
 	/*
 	 * Put packet on socket queue, if any.  The scan does not stop at
 	 * the first match, so if several PCBs carry the same local port
 	 * the last one in list order receives the packet.
 	 */
 	sa = NULL;
 	nport = htons((u_int16_t)port);
 	LIST_FOREACH(inp, &divcb, inp_list) {
 		if (inp->inp_lport == nport)
 			sa = inp->inp_socket;
 	}
 	if (sa) {
 		/* A zero return means the append failed; drop the packet */
 		if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc,
 				m, (struct mbuf *)0) == 0)
 			m_freem(m);
 		else
 			sorwakeup(sa);
 	} else {
 		/*
 		 * Nobody is listening on this port.
 		 * NOTE(review): the ips_delivered decrement presumably
 		 * undoes an increment made by the caller - confirm.
 		 */
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
         }
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  */
 static int
 div_output(struct socket *so, struct mbuf *m,
 	struct sockaddr_in *sin, struct mbuf *control)
 {
 	int error = 0;
 	struct m_hdr divert_tag;
 
 	/*
 	 * Prepare the tag for divert info. Note that a packet
 	 * with a 0 tag in mh_data is effectively untagged,
 	 * so we could optimize that case.
 	 */
 	divert_tag.mh_type = MT_TAG;
 	divert_tag.mh_flags = PACKET_TAG_DIVERT;
 	divert_tag.mh_next = m;
 	divert_tag.mh_data = 0;		/* the matching rule # */
 	m->m_pkthdr.rcvif = NULL;	/* XXX is it necessary ? */
 
 	if (control)
 		m_freem(control);		/* XXX */
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		int i;
 
 		divert_tag.mh_data = (caddr_t)(int)sin->sin_port;
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 */
 		for (i = 0; sin->sin_zero[i] && i < sizeof(sin->sin_zero); i++)
 			;
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		struct inpcb *const inp = sotoinpcb(so);
 		struct ip *const ip = mtod(m, struct ip *);
 
 		/*
 		 * Don't allow both user specified and setsockopt options,
 		 * and don't allow packet length sizes that will crash
 		 */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
 		     ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			error = EINVAL;
 			goto cantsend;
 		}
 
 		/* Convert fields to host order for ip_output() */
 		ip->ip_len = ntohs(ip->ip_len);
 		ip->ip_off = ntohs(ip->ip_off);
 
 		/* Send packet to output processing */
 		ipstat.ips_rawout++;			/* XXX */
 		error = ip_output((struct mbuf *)&divert_tag,
 			    inp->inp_options, &inp->inp_route,
 			    (so->so_options & SO_DONTROUTE) |
 			    IP_ALLOWBROADCAST | IP_RAWOUTPUT,
 			    inp->inp_moptions, NULL);
 	} else {
 		if (m->m_pkthdr.rcvif == NULL) {
 			/*
 			 * No luck with the name, check by IP address.
 			 * Clear the port and the ifname to make sure
 			 * there are no distractions for ifa_ifwithaddr.
 			 */
 			struct	ifaddr *ifa;
 
 			bzero(sin->sin_zero, sizeof(sin->sin_zero));
 			sin->sin_port = 0;
 			ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 			if (ifa == NULL) {
 				error = EADDRNOTAVAIL;
 				goto cantsend;
 			}
 			m->m_pkthdr.rcvif = ifa->ifa_ifp;
 		}
 		/* Send packet to input processing */
 		ip_input((struct mbuf *)&divert_tag);
 	}
 
 	return error;
 
 cantsend:
 	m_freem(m);
 	return error;
 }
 
 static int
 div_attach(struct socket *so, int proto, struct proc *p)
 {
 	struct inpcb *inp;
 	int error, s;
 
 	inp  = sotoinpcb(so);
 	if (inp)
 		panic("div_attach");
 	if (p && (error = suser(p)) != 0)
 		return error;
 
 	error = soreserve(so, div_sendspace, div_recvspace);
 	if (error)
 		return error;
 	s = splnet();
 	error = in_pcballoc(so, &divcbinfo, p);
 	splx(s);
 	if (error)
 		return error;
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_flags |= INP_HDRINCL;
 	return 0;
 }
 
 static int
 div_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		panic("div_detach");
 	in_pcbdetach(inp);
 	return 0;
 }
 
 /*
  * Bind a divert socket to a port.
  *
  * in_pcbbind() assumes that nam is a sockaddr_in and requires a valid
  * address.  Divert sockets have no use for the address, so it is
  * forced to INADDR_ANY before in_pcbbind() gets to see it.
  * XXX -- divert should not be abusing in_pcbbind and should probably
  * have its own address family.
  *
  * Returns EAFNOSUPPORT for anything but AF_INET, otherwise the result
  * of in_pcbbind().
  */
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
 {
 	struct inpcb *pcb;
 	int err, ipl;
 
 	ipl = splnet();
 	pcb = sotoinpcb(so);
 	if (nam->sa_family == AF_INET) {
 		((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 		err = in_pcbbind(pcb, nam, p);
 	} else {
 		err = EAFNOSUPPORT;
 	}
 	splx(ipl);
 
 	return err;
 }
 
 /* Mark the divert socket as unable to send any more; never fails. */
 static int
 div_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 	return (0);
 }
 
 /*
  * pru_send handler for divert sockets.
  *
  * Makes sure the first mbuf holds at least an IP header and hands the
  * packet to div_output().  Both 'm' and 'control' are consumed on
  * every path: div_output() disposes of them on the normal path, and
  * the short-packet path below must do so itself.
  *
  * Returns EINVAL when the packet is too small to contain an IP header,
  * otherwise the result of div_output().
  */
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct proc *p)
 {
 	/* Packet must have a header (but that's about it) */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
 		ipstat.ips_toosmall++;
 		/*
 		 * m_pullup() already released the chain when it failed,
 		 * so only the ancillary data is left to free here.
 		 */
 		if (control)
 			m_freem(control);
 		return EINVAL;
 	}
 
 	/* Send packet */
 	return div_output(so, m, (struct sockaddr_in *)nam, control);
 }
 
 /*
  * Sysctl handler exporting the list of divert PCBs.
  *
  * Output layout: a struct xinpgen header, one struct xinpcb per PCB
  * that existed when the request started (and that passes
  * prison_xinpcb() for the requesting process), and a trailing
  * struct xinpgen carrying the current generation counts so the caller
  * can tell whether the list changed underneath it.
  *
  * With no output buffer only a size estimate is returned.  The node is
  * read-only: supplying new data yields EPERM.
  */
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n, s;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		/* Estimate: two headers plus room for 1/8 more PCBs */
 		n = divcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	s = splnet();
 	gencnt = divcbinfo.ipi_gencnt;
 	n = divcbinfo.ipi_count;
 	splx(s);
 
 	/*
 	 * Clear the header before filling it in, as is done for each
 	 * xinpcb below, so that no uninitialized stack bytes are copied
 	 * out to the caller.
 	 */
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 	
 	/* Snapshot the PCB pointers while the list cannot change */
 	s = splnet();
 	for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
 			inp_list[i++] = inp;
 	}
 	splx(s);
 	n = i;
 
 	/*
 	 * Copy the records out.  Stop at the first failure so that a
 	 * later, successful SYSCTL_OUT cannot hide the error.
 	 */
 	error = 0;
 	for (i = 0; !error && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		s = splnet();
 		xig.xig_gen = divcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = divcbinfo.ipi_count;
 		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /* Read-only node under _net_inet_divert, served by div_pcblist() */
 SYSCTL_DECL(_net_inet_divert);
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
 	    div_pcblist, "S,xinpcb", "List of active divert sockets");
 
 /*
  * User-request switch for divert sockets.  The initializer is
  * positional, so each entry's meaning is fixed by the member order of
  * struct pr_usrreqs (declared elsewhere - check it before reordering).
  * Divert-specific handlers are div_attach, div_bind, div_detach,
  * div_send and div_shutdown; two slots are left NULL and the rest use
  * the generic *_notsupp, in_* and so* routines.
  */
 struct pr_usrreqs div_usrreqs = {
 	NULL, pru_accept_notsupp, div_attach, div_bind,
 	pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach,
 	NULL, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp,
 	pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown,
 	in_setsockaddr, sosend, soreceive, sopoll
 };
Index: stable/4/sys/netinet/raw_ip.c
===================================================================
--- stable/4/sys/netinet/raw_ip.c	(revision 145952)
+++ stable/4/sys/netinet/raw_ip.c	(revision 145953)
@@ -1,725 +1,726 @@
 /*
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
  * $FreeBSD$
  */
 
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_random_ip_id.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm_zone.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #define _IP_VHL
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_mroute.h>
 
 #include <netinet/ip_fw.h>
 #include <netinet/ip_dummynet.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 /* List of raw IP PCBs and the bookkeeping record describing it */
 struct	inpcbhead ripcb;
 struct	inpcbinfo ripcbinfo;
 
 /*
  * Control hooks for ipfw and dummynet; rip_ctloutput() calls through
  * them only when IPFW_LOADED / DUMMYNET_LOADED say it may.
  */
 ip_fw_ctl_t *ip_fw_ctl_ptr;
 ip_dn_ctl_t *ip_dn_ctl_ptr;
 
 /*
  * Hooks for multicast routing. They all default to NULL,
  * so leave them uninitialized and rely on BSS being set to 0.
  */
 
 /* The socket used to communicate with the multicast routing daemon.  */
 struct socket  *ip_mrouter;
 
 /* The various mrouter and rsvp functions */
 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip_mrouter_done)(void);
 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 		struct ip_moptions *);
 int (*mrt_ioctl)(int, caddr_t);
 int (*legal_vif_num)(int);
 u_long (*ip_mcast_src)(int);
 
 void (*rsvp_input_p)(struct mbuf *m, int off, int proto);
 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
 void (*ip_rsvp_force_done)(struct socket *);
 
 /*
  * Nominal space allocated to a raw ip socket (initial values of
  * rip_sendspace and rip_recvspace).
  */
 #define	RIPSNDQ		8192
 #define	RIPRCVQ		8192
 
 /*
  * Raw interface to IP protocol.
  */
 
 /*
  * Initialize raw connection block queue.
  */
 void
 rip_init(void)
 {
 	LIST_INIT(&ripcb);
 	ripcbinfo.listhead = &ripcb;
 	/*
 	 * XXX We don't use the hash list for raw IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
 	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
 	ripcbinfo.ipi_zone = zinit("ripcb", sizeof(struct inpcb),
 				   maxsockets, ZONE_INTERRUPT, 0);
 }
 
 /*
  * Source address reported to raw socket readers.  Length and family
  * are fixed here; rip_input() overwrites sin_addr for every packet.
  * XXX Because this shared object is modified in rip_input, it must be
  * fixed when we want to make this code SMP-friendly.
  */
 static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
 
 /*
  * Deliver a received IP packet to every matching raw socket.
  *
  * A PCB matches when its protocol is zero or equals 'proto', and its
  * local / foreign addresses are either INADDR_ANY or equal the
  * packet's destination / source.  Every match except the last gets a
  * copy made with m_copypacket(); the last match receives the original
  * chain, which saves one copy.  If nothing matches, the packet is
  * freed and counted as "no protocol".
  *
  *   m      the packet, starting at the IP header; always consumed
  *   off    unused here
  *   proto  IP protocol number used for matching
  */
 void
 rip_input(struct mbuf *m, int off, int proto)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct inpcb *inp;
 	struct inpcb *last = NULL;
 	struct mbuf *opts = NULL;
 
 	ripsrc.sin_addr = ip->ip_src;
 	LIST_FOREACH(inp, &ripcb, inp_list) {
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_ip_p && inp->inp_ip_p != proto)
 			continue;
 		if (inp->inp_laddr.s_addr != INADDR_ANY &&
 		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 			continue;
 		if (inp->inp_faddr.s_addr != INADDR_ANY &&
 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 			continue;
 		if (last) {
 			/*
 			 * Another match follows 'last': hand 'last' a copy.
 			 * The copy may fail (M_DONTWAIT), in which case that
 			 * socket simply misses this packet.
 			 */
 			struct mbuf *n = m_copypacket(m, M_DONTWAIT);
 
 #ifdef IPSEC
 			/* check AH/ESP integrity. */
 			if (n && ipsec4_in_reject_so(n, last->inp_socket)) {
 				m_freem(n);
 				ipsecstat.in_polvio++;
 				/* do not inject data to pcb */
 			} else
 #endif /*IPSEC*/
 #ifdef FAST_IPSEC
 			/*
 			 * check AH/ESP integrity.  As in the IPSEC case
 			 * above, never pass a failed (NULL) copy to the
 			 * policy check.
 			 */
 			if (n && ipsec4_in_reject(n, last)) {
 				m_freem(n);
 				/* do not inject data to pcb */
 			} else
 #endif /*FAST_IPSEC*/
 			if (n) {
 				if (last->inp_flags & INP_CONTROLOPTS ||
 				    last->inp_socket->so_options & SO_TIMESTAMP)
 				    ip_savecontrol(last, &opts, ip, n);
 				if (sbappendaddr(&last->inp_socket->so_rcv,
 				    (struct sockaddr *)&ripsrc, n,
 				    opts) == 0) {
 					/* should notify about lost packet */
 					m_freem(n);
 					if (opts)
 					    m_freem(opts);
 				} else
 					sorwakeup(last->inp_socket);
 				/* opts was either queued or freed above */
 				opts = 0;
 			}
 		}
 		last = inp;
 	}
 	/* The final match (if any) gets the original chain */
 #ifdef IPSEC
 	/* check AH/ESP integrity. */
 	if (last && ipsec4_in_reject_so(m, last->inp_socket)) {
 		m_freem(m);
 		ipsecstat.in_polvio++;
 		ipstat.ips_delivered--;
 		/* do not inject data to pcb */
 	} else
 #endif /*IPSEC*/
 #ifdef FAST_IPSEC
 	/* check AH/ESP integrity. */
 	if (last && ipsec4_in_reject(m, last)) {
 		m_freem(m);
 		ipstat.ips_delivered--;
 		/* do not inject data to pcb */
 	} else
 #endif /*FAST_IPSEC*/
 	if (last) {
 		if (last->inp_flags & INP_CONTROLOPTS ||
 		    last->inp_socket->so_options & SO_TIMESTAMP)
 			ip_savecontrol(last, &opts, ip, m);
 		if (sbappendaddr(&last->inp_socket->so_rcv,
 		    (struct sockaddr *)&ripsrc, m, opts) == 0) {
 			m_freem(m);
 			if (opts)
 			    m_freem(opts);
 		} else
 			sorwakeup(last->inp_socket);
 	} else {
 		/* No raw socket wanted this packet */
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
 	}
 }
 
 /*
  * Generate IP header and pass packet to ip_output().
  * Tack on options user may have setup with control call.
  *
  *   m    payload, or a complete IP packet if INP_HDRINCL is set on the
  *        socket's PCB; consumed on every path
  *   so   sending raw socket
  *   dst  destination address, stored as-is into ip_dst
  *
  * Returns EMSGSIZE for packets exceeding IP_MAXPACKET, ENOBUFS when
  * the header mbuf cannot be obtained, EINVAL for an inconsistent
  * user-supplied header, otherwise the result of ip_output().
  */
 int
 rip_output(struct mbuf *m, struct socket *so, u_long dst)
 {
 	struct ip *ip;
 	struct inpcb *inp = sotoinpcb(so);
 	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
 
 	/*
 	 * If the user handed us a complete IP packet, use it.
 	 * Otherwise, allocate an mbuf for a header and fill it in.
 	 */
 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		M_PREPEND(m, sizeof(struct ip), M_WAIT);
 		if (m == NULL)
 			return(ENOBUFS);
 		/* Header fields come from the PCB; ip_len is the new total */
 		ip = mtod(m, struct ip *);
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_off = 0;
 		ip->ip_p = inp->inp_ip_p;
 		ip->ip_len = m->m_pkthdr.len;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst.s_addr = dst;
 		ip->ip_ttl = inp->inp_ip_ttl;
 	} else {
 		if (m->m_pkthdr.len > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		ip = mtod(m, struct ip *);
 		/* don't allow both user specified and setsockopt options,
 		   and don't allow packet length sizes that will crash */
 		/*
 		 * NOTE(review): ip_len is compared without byte swapping,
 		 * so the caller's header presumably carries it in host
 		 * order - confirm against the raw socket documentation.
 		 */
 		if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
 		     && inp->inp_options)
 		    || (ip->ip_len > m->m_pkthdr.len)
 		    || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
 			m_freem(m);
 			return EINVAL;
 		}
 		/* An ip_id of zero asks the kernel to pick one */
 		if (ip->ip_id == 0)
 #ifdef RANDOM_IP_ID
 			ip->ip_id = ip_randomid();
 #else
 			ip->ip_id = htons(ip_id++);
 #endif
 		/* XXX prevent ip_output from overwriting header fields */
 		flags |= IP_RAWOUTPUT;
 		ipstat.ips_rawout++;
 	}
 
 	if (inp->inp_flags & INP_ONESBCAST)
 		flags |= IP_SENDONES;
 
 	return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
 			  inp->inp_moptions, inp));
 }
 
 /*
  * Raw IP socket option processing.
  *
  * Only IPPROTO_IP level options are accepted (EINVAL otherwise).
  * IP_HDRINCL is handled here; firewall, dummynet, RSVP and multicast
  * routing options are forwarded to their hooks, which may be absent;
  * everything else falls through to ip_ctloutput().
  *
  * Returns 0 or an errno value; ENOPROTOOPT when ipfw/dummynet is not
  * loaded, EOPNOTSUPP when the multicast routing hook is not set.
  */
 int
 rip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	if (sopt->sopt_level != IPPROTO_IP)
 		return (EINVAL);
 
 	error = 0;
 
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			/* Reports the raw flag bit, not a normalized 0/1 */
 			optval = inp->inp_flags & INP_HDRINCL;
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case IP_FW_ADD: /* ADD actually returns the body... */
 		case IP_FW_GET:
 		case IP_FW_TABLE_GETSIZE:
 		case IP_FW_TABLE_LIST:
 			if (IPFW_LOADED)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET_GET:
 			if (DUMMYNET_LOADED)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break ;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			/* Hook is NULL until multicast routing provides it */
 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
 				EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			/* Any non-zero value turns header inclusion on */
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			if (optval)
 				inp->inp_flags |= INP_HDRINCL;
 			else
 				inp->inp_flags &= ~INP_HDRINCL;
 			break;
 
 		case IP_FW_ADD:
 		case IP_FW_DEL:
 		case IP_FW_FLUSH:
 		case IP_FW_ZERO:
 		case IP_FW_RESETLOG:
 		case IP_FW_TABLE_ADD:
 		case IP_FW_TABLE_DEL:
 		case IP_FW_TABLE_FLUSH:
 			if (IPFW_LOADED)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET_CONFIGURE:
 		case IP_DUMMYNET_DEL:
 		case IP_DUMMYNET_FLUSH:
 			if (DUMMYNET_LOADED)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT ;
 			break ;
 
 		case IP_RSVP_ON:
 			error = ip_rsvp_init(so);
 			break;
 
 		case IP_RSVP_OFF:
 			error = ip_rsvp_done();
 			break;
 
 		case IP_RSVP_VIF_ON:
 		case IP_RSVP_VIF_OFF:
 			/* Unlike the MRT_* case, a missing hook is EINVAL */
 			error = ip_rsvp_vif ?
 				ip_rsvp_vif(so, sopt) : EINVAL;
 			break;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
 					EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * This function exists solely to receive the PRC_IFDOWN messages which
  * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
  * and calls in_ifadown() to remove all routes corresponding to that address.
  * It also receives the PRC_IFUP messages from if_up() and reinstalls the
  * interface routes.
  *
  *   cmd  PRC_* command; anything but PRC_IFDOWN / PRC_IFUP is ignored
  *   sa   address to look for; matched by pointer identity, not by value
  *   vip  unused here
  */
 void
 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_ifaddr *ia;
 	struct ifnet *ifp;
 	int err;
 	int flags;
 
 	switch (cmd) {
 	case PRC_IFDOWN:
 		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa
 			    && (ia->ia_flags & IFA_ROUTE)) {
 				/*
 				 * in_ifscrub kills the interface route.
 				 */
 				in_ifscrub(ia->ia_ifp, ia);
 				/*
 				 * in_ifadown gets rid of all the rest of
 				 * the routes.  This is not quite the right
 				 * thing to do, but at least if we are running
 				 * a routing process they will come back.
 				 */
 				in_ifadown(&ia->ia_ifa, 0);
 				break;
 			}
 		}
 		break;
 
 	case PRC_IFUP:
 		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa)
 				break;
 		}
 		/* Nothing to do if unknown or the route is already there */
 		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
 			return;
 		flags = RTF_UP;
 		ifp = ia->ia_ifa.ifa_ifp;
 
 		/* Loopback and point-to-point interfaces get host routes */
 		if ((ifp->if_flags & IFF_LOOPBACK)
 		    || (ifp->if_flags & IFF_POINTOPOINT))
 			flags |= RTF_HOST;
 
 		/* Remember the route only if it was actually installed */
 		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
 		if (err == 0)
 			ia->ia_flags |= IFA_ROUTE;
 		break;
 	}
 }
 
 /*
  * Buffer sizes handed to soreserve() by rip_attach(); tunable at run
  * time through the two sysctl knobs below.
  * NOTE(review): the variables are u_long but are exported with
  * SYSCTL_INT - verify this is intended where long is wider than int.
  */
 u_long	rip_sendspace = RIPSNDQ;
 u_long	rip_recvspace = RIPRCVQ;
 
 SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
 SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
 
 static int
 rip_attach(struct socket *so, int proto, struct proc *p)
 {
 	struct inpcb *inp;
 	int error, s;
 
 	inp = sotoinpcb(so);
 	if (inp)
 		panic("rip_attach");
 	if (p && (error = suser(p)) != 0)
 		return error;
 
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error)
 		return error;
 	s = splnet();
 	error = in_pcballoc(so, &ripcbinfo, p);
 	splx(s);
 	if (error)
 		return error;
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = ip_defttl;
 	return 0;
 }
 
 static int
 rip_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		panic("rip_detach");
 	if (so == ip_mrouter && ip_mrouter_done)
 		ip_mrouter_done();
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
 	if (so == ip_rsvpd)
 		ip_rsvp_done();
 	in_pcbdetach(inp);
 	return 0;
 }
 
 /*
  * Abort a raw socket: mark it disconnected and, if no file descriptor
  * references it any more (SS_NOFDREF), detach it right away.
  */
 static int
 rip_abort(struct socket *so)
 {
 	soisdisconnected(so);
 	return (so->so_state & SS_NOFDREF) ? rip_detach(so) : 0;
 }
 
 /*
  * Disconnect a connected raw socket via rip_abort(); an unconnected
  * socket yields ENOTCONN.
  */
 static int
 rip_disconnect(struct socket *so)
 {
 	if (so->so_state & SS_ISCONNECTED)
 		return rip_abort(so);
 
 	return ENOTCONN;
 }
 
 /*
  * Set the local address of a raw socket.
  *
  * Returns EINVAL if nam is not sized like a sockaddr_in, and
  * EADDRNOTAVAIL if there are no interfaces at all, the family is
  * neither AF_INET nor AF_IMPLINK, or a specific (non-INADDR_ANY)
  * address does not belong to any interface.
  */
 static int
 rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct inpcb *pcb;
 
 	if (nam->sa_len != sizeof(*sin))
 		return EINVAL;
 
 	if (TAILQ_EMPTY(&ifnet))
 		return EADDRNOTAVAIL;
 	if (sin->sin_family != AF_INET && sin->sin_family != AF_IMPLINK)
 		return EADDRNOTAVAIL;
 	if (sin->sin_addr.s_addr != INADDR_ANY &&
 	    ifa_ifwithaddr((struct sockaddr *)sin) == 0)
 		return EADDRNOTAVAIL;
 
 	pcb = sotoinpcb(so);
 	pcb->inp_laddr = sin->sin_addr;
 	return 0;
 }
 
 /*
  * Set the foreign address of a raw socket and mark it connected.
  *
  * Returns EINVAL if nam is not sized like a sockaddr_in,
  * EADDRNOTAVAIL if there are no interfaces at all, and EAFNOSUPPORT
  * if the family is neither AF_INET nor AF_IMPLINK.
  */
 static int
 rip_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct inpcb *pcb;
 
 	if (nam->sa_len != sizeof(*sin))
 		return EINVAL;
 	if (TAILQ_EMPTY(&ifnet))
 		return EADDRNOTAVAIL;
 
 	switch (sin->sin_family) {
 	case AF_INET:
 	case AF_IMPLINK:
 		break;
 	default:
 		return EAFNOSUPPORT;
 	}
 
 	pcb = sotoinpcb(so);
 	pcb->inp_faddr = sin->sin_addr;
 	soisconnected(so);
 	return 0;
 }
 
 /* Mark the raw socket as unable to send any more; never fails. */
 static int
 rip_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 	return (0);
 }
 
 /*
  * pru_send handler for raw IP sockets.
  *
  * Picks the destination - the connected peer, or the address in 'nam'
  * for an unconnected socket - and hands the packet to rip_output().
  * Both 'm' and 'control' are consumed on every path.  Raw IP has no
  * use for ancillary data, so 'control' is released up front instead
  * of being leaked.
  *
  * Returns EISCONN if an address is given on a connected socket,
  * ENOTCONN if none is given on an unconnected one, otherwise the
  * result of rip_output().
  */
 static int
 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct proc *p)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	u_long dst;
 
 	/* Ancillary data is not interpreted here; just dispose of it */
 	if (control)
 		m_freem(control);
 
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			m_freem(m);
 			return EISCONN;
 		}
 		dst = inp->inp_faddr.s_addr;
 	} else {
 		if (nam == NULL) {
 			m_freem(m);
 			return ENOTCONN;
 		}
 		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
 	}
 	return rip_output(m, so, dst);
 }
 
 /*
  * Sysctl handler exporting the list of raw IP PCBs.
  *
  * Output layout: a struct xinpgen header, one struct xinpcb per PCB
  * that existed when the request started, and a trailing struct xinpgen
  * carrying the current generation counts so the caller can tell
  * whether the list changed underneath it.
  *
  * With no output buffer only a size estimate is returned.  The node is
  * read-only: supplying new data yields EPERM.
  */
 static int
 rip_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n, s;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		/* Estimate: two headers plus room for 1/8 more PCBs */
 		n = ripcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	s = splnet();
 	gencnt = ripcbinfo.ipi_gencnt;
 	n = ripcbinfo.ipi_count;
 	splx(s);
 
 	/*
 	 * Clear the header before filling it in, as is done for each
 	 * xinpcb below, so that no uninitialized stack bytes are copied
 	 * out to the caller.
 	 */
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 	
 	/* Snapshot the PCB pointers while the list cannot change */
 	s = splnet();
 	for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		if (inp->inp_gencnt <= gencnt)
 			inp_list[i++] = inp;
 	}
 	splx(s);
 	n = i;
 
 	/*
 	 * Copy the records out.  Stop at the first failure so that a
 	 * later, successful SYSCTL_OUT cannot hide the error.
 	 */
 	error = 0;
 	for (i = 0; !error && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		s = splnet();
 		xig.xig_gen = ripcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = ripcbinfo.ipi_count;
 		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /* Read-only node under _net_inet_raw, served by rip_pcblist() */
 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
 	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
 
 /*
  * User-request switch for raw IP sockets.  The initializer is
  * positional, so each entry's meaning is fixed by the member order of
  * struct pr_usrreqs (declared elsewhere - check it before reordering).
  * Raw-IP-specific handlers are the rip_* functions; the remaining
  * slots use the generic *_notsupp, in_* and so* routines.
  */
 struct pr_usrreqs rip_usrreqs = {
 	rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
 	pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
 	pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp,
 	pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
 	in_setsockaddr, sosend, soreceive, sopoll
 };
Index: stable/4/sys/netinet/udp_usrreq.c
===================================================================
--- stable/4/sys/netinet/udp_usrreq.c	(revision 145952)
+++ stable/4/sys/netinet/udp_usrreq.c	(revision 145953)
@@ -1,949 +1,950 @@
 /*
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  * $FreeBSD$
  */
 
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <vm/vm_zone.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 /*
  * udpcksum: when non-zero, udp_output() fills in the UDP checksum of
  * outgoing datagrams; otherwise the checksum field is sent as zero.
  * Defaults to on unless the kernel is built with COMPAT_42.
  */
 #ifndef	COMPAT_42
 static int	udpcksum = 1;
 #else
 static int	udpcksum = 0;		/* XXX */
 #endif
 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
 		&udpcksum, 0, "");
 
 /*
  * log_in_vain: when non-zero, udp_input() logs every unicast datagram
  * for which no matching pcb is found.
  */
 int	log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, 
     &log_in_vain, 0, "Log all incoming UDP packets");
 
 /*
  * blackhole: when non-zero, udp_input() silently drops datagrams that
  * match no pcb instead of answering with ICMP port unreachable.
  */
 static int	blackhole = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
 	&blackhole, 0, "Do not send port unreachables for refused connects");
 
 /*
  * udb is the list of all UDP pcbs; udbinfo carries the list head, the
  * hash tables and the allocation zone that udp_init() sets up.
  */
 struct	inpcbhead udb;		/* from udp_var.h */
 #define	udb6	udb  /* for KAME src sync over BSD*'s */
 struct	inpcbinfo udbinfo;
 
 /* Size passed to hashinit() for both pcb hash tables in udp_init(). */
 #ifndef UDBHASHSIZE
 #define UDBHASHSIZE 16
 #endif
 
 struct	udpstat udpstat;	/* from udp_var.h */
 SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
     &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 /*
  * Scratch source-address buffers shared by udp_input() and udp_append().
  * udp_in receives the sender's port and address for each datagram.
  * udp_in6 and udp_ip6 hold the IPv4-mapped IPv6 forms; their *_init_done
  * bits let udp_append() build them at most once per datagram when that
  * datagram is delivered to several pcbs.
  */
 static struct	sockaddr_in udp_in = { sizeof(udp_in), AF_INET };
 #ifdef INET6
 struct udp_in6 {
 	struct sockaddr_in6	uin6_sin;
 	u_char			uin6_init_done : 1;
 } udp_in6 = {
 	{ sizeof(udp_in6.uin6_sin), AF_INET6 },
 	0
 };
 struct udp_ip6 {
 	struct ip6_hdr		uip6_ip6;
 	u_char			uip6_init_done : 1;
 } udp_ip6;
 #endif /* INET6 */
 
 /*
  * Forward declarations for the file-local helpers defined below.
  */
 
 /* Deliver one copy of a datagram to a single pcb's receive buffer. */
 static void udp_append __P((struct inpcb *last, struct ip *ip,
 			    struct mbuf *n, int off));
 #ifdef INET6
 /* Build an IPv6 header image from an IPv4 header. */
 static void ip_2_ip6_hdr __P((struct ip6_hdr *ip6, struct ip *ip));
 #endif
 
 static int udp_detach __P((struct socket *so));
 /* Build and transmit one datagram; called from udp_send(). */
 static	int udp_output __P((struct inpcb *, struct mbuf *, struct sockaddr *,
 			    struct mbuf *, struct proc *));
 
 void
 udp_init()
 {
 	LIST_INIT(&udb);
 	udbinfo.listhead = &udb;
 	udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask);
 	udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB,
 					&udbinfo.porthashmask);
 	udbinfo.ipi_zone = zinit("udpcb", sizeof(struct inpcb), maxsockets,
 				 ZONE_INTERRUPT, 0);
 }
 
 /*
  * Input routine for UDP datagrams received over IPv4.
  *
  *	m	received packet, IP header first
  *	off	length of the IP header, options included
  *	proto	not referenced by this function
  *
  * After validating the length and (if present) the checksum, the
  * datagram is delivered in one of two ways: multicast and broadcast
  * datagrams are copied to every matching pcb on the udb list, anything
  * else goes to the single pcb returned by in_pcblookup_hash().  When no
  * pcb matches a unicast datagram an ICMP port unreachable is generated
  * unless suppressed by the blackhole knob or rate limiting.
  *
  * The function returns immediately if m_pullup() fails; on every other
  * path the chain is appended to a socket buffer, passed to icmp_error(),
  * or freed.
  */
 void
 udp_input(m, off, proto)
 	register struct mbuf *m;
 	int off, proto;
 {
 	int iphlen = off;
 	register struct ip *ip;
 	register struct udphdr *uh;
 	register struct inpcb *inp;
 	struct mbuf *opts = 0;
 	int len;
 	struct ip save_ip;
 	struct sockaddr *append_sa;
 
 	udpstat.udps_ipackets++;
 
 	/*
 	 * Strip IP options, if any; should skip this,
 	 * make available to user, and use on returned packets,
 	 * but we don't yet have a way to check the checksum
 	 * with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m, (struct mbuf *)0);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
 			udpstat.udps_hdrops++;
 			return;
 		}
 		/* m_pullup() may have returned a different mbuf. */
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 
 	/* destination port of 0 is illegal, based on RFC768. */
 	if (uh->uh_dport == 0)
 		goto bad;
 
 	/*
 	 * Make mbuf data length reflect UDP length.
 	 * If not enough data to reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	if (ip->ip_len != len) {
 		if (len > ip->ip_len || len < sizeof(struct udphdr)) {
 			udpstat.udps_badlen++;
 			goto bad;
 		}
 		/* Negative count: trim the excess from the tail. */
 		m_adj(m, len - ip->ip_len);
 		/* ip->ip_len = len; */
 	}
 	/*
 	 * Save a copy of the IP header in case we want restore it
 	 * for sending an ICMP error message in response.
 	 */
 	save_ip = *ip;
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			/*
 			 * The checksum data was already computed and left
 			 * in csum_data; fold in the pseudo header here
 			 * unless CSUM_PSEUDO_HDR says that was done too.
 			 */
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh->uh_sum = m->m_pkthdr.csum_data;
 			else
 	                	uh->uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
 			uh->uh_sum ^= 0xffff;
 		} else {
 			/*
 			 * Software checksum: temporarily overlay the IP
 			 * header with the pseudo header (saving and then
 			 * restoring the 9 bytes that get zeroed).
 			 */
 			char b[9];
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
 			uh->uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		/* Either way a non-zero result means a bad checksum. */
 		if (uh->uh_sum) {
 			udpstat.udps_badsum++;
 			m_freem(m);
 			return;
 		}
 	} else
 		udpstat.udps_nosum++;
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 		struct inpcb *last;
 		/*
 		 * Deliver a multicast or broadcast datagram to *all* sockets
 		 * for which the local and remote addresses and ports match
 		 * those of the incoming datagram.  This allows more than
 		 * one process to receive multi/broadcasts on the same port.
 		 * (This really ought to be done for unicast datagrams as
 		 * well, but that would cause problems with existing
 		 * applications that open both address-specific sockets and
 		 * a wildcard socket listening to the same port -- they would
 		 * end up receiving duplicates of every unicast datagram.
 		 * Those applications open the multiple sockets to overcome an
 		 * inadequacy of the UDP socket interface, but for backwards
 		 * compatibility we avoid the problem here rather than
 		 * fixing the interface.  Maybe 4.5BSD will remedy this?)
 		 */
 
 		/*
 		 * Construct sockaddr format source address.
 		 */
 		udp_in.sin_port = uh->uh_sport;
 		udp_in.sin_addr = ip->ip_src;
 		/*
 		 * Locate pcb(s) for datagram.
 		 * (Algorithm copied from raw_intr().)
 		 */
 		last = NULL;
 #ifdef INET6
 		/* Force udp_append() to rebuild the IPv6 forms for this packet. */
 		udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0;
 #endif
 		LIST_FOREACH(inp, &udb, inp_list) {
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
 				if (inp->inp_laddr.s_addr !=
 				    ip->ip_dst.s_addr)
 					continue;
 			}
 			if (inp->inp_faddr.s_addr != INADDR_ANY) {
 				if (inp->inp_faddr.s_addr !=
 				    ip->ip_src.s_addr ||
 				    inp->inp_fport != uh->uh_sport)
 					continue;
 			}
 
 			/*
 			 * Delivery lags one match behind: the previous
 			 * match gets a copy here, so that the final match
 			 * can be given the original chain after the loop.
 			 */
 			if (last != NULL) {
 				struct mbuf *n;
 
 #ifdef IPSEC
 				/* check AH/ESP integrity. */
 				if (ipsec4_in_reject_so(m, last->inp_socket))
 					ipsecstat.in_polvio++;
 					/* do not inject data to pcb */
 				else
 #endif /*IPSEC*/
 #ifdef FAST_IPSEC
 				/* check AH/ESP integrity. */
 				if (ipsec4_in_reject(m, last))
 					;
 				else
 #endif /*FAST_IPSEC*/
 				if ((n = m_copy(m, 0, M_COPYALL)) != NULL)
 					udp_append(last, ip, n,
 						   iphlen +
 						   sizeof(struct udphdr));
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids searching
 			 * through all pcbs in the common case of a non-shared
 			 * port.  It assumes that an application will never
 			 * clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.
 			 * (No need to send an ICMP Port Unreachable
 			 * for a broadcast or multicast datagram.)
 			 */
 			udpstat.udps_noportbcast++;
 			goto bad;
 		}
 #ifdef IPSEC
 		/* check AH/ESP integrity. */
 		if (ipsec4_in_reject_so(m, last->inp_socket)) {
 			ipsecstat.in_polvio++;
 			goto bad;
 		}
 #endif /*IPSEC*/
 #ifdef FAST_IPSEC
 		/* check AH/ESP integrity. */
 		if (ipsec4_in_reject(m, last))
 			goto bad;
 #endif /*FAST_IPSEC*/
 		/* The last match receives the original chain, not a copy. */
 		udp_append(last, ip, m, iphlen + sizeof(struct udphdr));
 		return;
 	}
 	/*
 	 * Locate pcb for datagram.
 	 */
 	inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
 	    ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);
 	if (inp == NULL) {
 		if (log_in_vain) {
 			/* Room for a dotted quad: 4 * sizeof "123" == 16. */
 			char buf[4*sizeof "123"];
 
 			/*
 			 * The first inet_ntoa() result is copied aside
 			 * before inet_ntoa() is called a second time.
 			 */
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		udpstat.udps_noport++;
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			udpstat.udps_noportbcast++;
 			goto bad;
 		}
 		if (blackhole)
 			goto bad;
 #ifdef ICMP_BANDLIM
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto bad;
 #endif
 		/* Put back the header saved before checksumming. */
 		*ip = save_ip;
 		ip->ip_len += iphlen;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		return;
 	}
 #ifdef IPSEC
 	if (ipsec4_in_reject_so(m, inp->inp_socket)) {
 		ipsecstat.in_polvio++;
 		goto bad;
 	}
 #endif /*IPSEC*/
 #ifdef FAST_IPSEC
 	if (ipsec4_in_reject(m, inp))
 		goto bad;
 #endif /*FAST_IPSEC*/
 
 	/*
 	 * Construct sockaddr format source address.
 	 * Stuff source address and datagram in user buffer.
 	 */
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 	if (inp->inp_flags & INP_CONTROLOPTS
 	    || inp->inp_socket->so_options & SO_TIMESTAMP) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6) {
 			int savedflags;
 
 			/*
 			 * Collect control data through the IPv6 path using
 			 * a converted header, with INP_UNMAPPABLEOPTS
 			 * masked off for the duration of the call.
 			 */
 			ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip);
 			savedflags = inp->inp_flags;
 			inp->inp_flags &= ~INP_UNMAPPABLEOPTS;
 			ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m);
 			inp->inp_flags = savedflags;
 		} else
 #endif
 		ip_savecontrol(inp, &opts, ip, m);
 	}
 	/* Strip the IP and UDP headers, leaving only the payload. */
  	m_adj(m, iphlen + sizeof(struct udphdr));
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif
 	append_sa = (struct sockaddr *)&udp_in;
 	if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) {
 		udpstat.udps_fullsock++;
 		goto bad;
 	}
 	sorwakeup(inp->inp_socket);
 	return;
 bad:
 	/* Common drop path: release the datagram and any control data. */
 	m_freem(m);
 	if (opts)
 		m_freem(opts);
 	return;
 }
 
 #ifdef INET6
 /*
  * Fill *ip6 with an IPv6 header image derived from the IPv4 header *ip.
  * The structure is zeroed first; length, next-header and hop limit are
  * taken from ip_len, ip_p and ip_ttl.  Both addresses get
  * IPV6_ADDR_INT32_SMP in word 2 and the IPv4 address in word 3
  * (presumably the IPv4-mapped form -- confirm against the definition
  * of IPV6_ADDR_INT32_SMP).
  */
 static void
 ip_2_ip6_hdr(ip6, ip)
 	struct ip6_hdr *ip6;
 	struct ip *ip;
 {
 	struct in6_addr *src6 = &ip6->ip6_src;
 	struct in6_addr *dst6 = &ip6->ip6_dst;
 
 	bzero(ip6, sizeof(*ip6));
 
 	/* Fixed header fields. */
 	ip6->ip6_vfc = IPV6_VERSION;
 	ip6->ip6_plen = ip->ip_len;
 	ip6->ip6_nxt = ip->ip_p;
 	ip6->ip6_hlim = ip->ip_ttl;
 
 	/* Source and destination addresses. */
 	dst6->s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 	src6->s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 	src6->s6_addr32[3] = ip->ip_src.s_addr;
 	dst6->s6_addr32[3] = ip->ip_dst.s_addr;
 }
 #endif
 
 /*
  * subroutine of udp_input(), mainly for source code readability.
  * caller must properly init udp_ip6 and udp_in6 beforehand.
  *
  *	last	pcb whose socket receives the datagram
  *	ip	IPv4 header of the datagram
  *	n	mbuf chain to deliver, headers still in place
  *	off	number of leading bytes (IP + UDP header) to strip
  *
  * The chain n is always consumed: it is either queued on the socket's
  * receive buffer or freed here when sbappendaddr() fails.  The source
  * address is taken from the file-scope udp_in / udp_in6 buffers.
  */
 static void
 udp_append(last, ip, n, off)
 	struct inpcb *last;
 	struct ip *ip;
 	struct mbuf *n;
 	int off;
 {
 	struct sockaddr *append_sa;
 	struct mbuf *opts = 0;
 
 	/* Gather control data only if the socket asked for some. */
 	if (last->inp_flags & INP_CONTROLOPTS ||
 	    last->inp_socket->so_options & SO_TIMESTAMP) {
 #ifdef INET6
 		if (last->inp_vflag & INP_IPV6) {
 			int savedflags;
 
 			/* Convert the header once per datagram, then reuse. */
 			if (udp_ip6.uip6_init_done == 0) {
 				ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip);
 				udp_ip6.uip6_init_done = 1;
 			}
 			/* INP_UNMAPPABLEOPTS is masked only across this call. */
 			savedflags = last->inp_flags;
 			last->inp_flags &= ~INP_UNMAPPABLEOPTS;
 			ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n);
 			last->inp_flags = savedflags;
 		} else
 #endif
 		ip_savecontrol(last, &opts, ip, n);
 	}
 #ifdef INET6
 	if (last->inp_vflag & INP_IPV6) {
 		/* Likewise, map the source address once per datagram. */
 		if (udp_in6.uin6_init_done == 0) {
 			in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin);
 			udp_in6.uin6_init_done = 1;
 		}
 		append_sa = (struct sockaddr *)&udp_in6.uin6_sin;
 	} else
 #endif
 	append_sa = (struct sockaddr *)&udp_in;
 	/* Drop the headers so only the payload is queued. */
 	m_adj(n, off);
 	if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) {
 		/* Not queued: we still own the mbufs and must free them. */
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		udpstat.udps_fullsock++;
 	} else
 		sorwakeup(last->inp_socket);
 }
 
 /*
  * Notify a udp user of an asynchronous error;
  * just wake up so that he can collect error status.
  */
 void
 udp_notify(inp, errno)
 	register struct inpcb *inp;
 	int errno;
 {
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 }
 
 /*
  * Control input for UDP.
  *
  *	cmd	PRC_* request code
  *	sa	address the request refers to; ignored unless it is a
  *		non-wildcard AF_INET address
  *	vip	IP header of the offending datagram, or NULL
  *
  * Redirects are passed to in_rtchange(), everything else to
  * udp_notify().  When an IP header is available only the pcb matching
  * that datagram is notified; otherwise every pcb talking to the
  * address is notified through in_pcbnotifyall().
  */
 void
 udp_ctlinput(cmd, sa, vip)
 	int cmd;
 	struct sockaddr *sa;
 	void *vip;
 {
 	void (*notify) __P((struct inpcb *, int)) = udp_notify;
 	struct ip *ip = vip;
 	struct in_addr faddr;
 	struct udphdr *uh;
 	struct inpcb *inp;
 	int s;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET)
 		return;
 	if (faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (PRC_IS_REDIRECT(cmd)) {
 		notify = in_rtchange;
 		ip = 0;
 	} else if (cmd == PRC_HOSTDEAD) {
 		ip = 0;
 	} else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) {
 		/* Unknown request, or one that maps to no error. */
 		return;
 	}
 
 	/* No datagram to match against: notify every pcb for faddr. */
 	if (ip == 0) {
 		in_pcbnotifyall(&udb, faddr, inetctlerrmap[cmd], notify);
 		return;
 	}
 
 	/* Otherwise find the one pcb the offending datagram came from. */
 	s = splnet();
 	uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport,
 	    ip->ip_src, uh->uh_sport, 0, NULL);
 	if (inp != NULL && inp->inp_socket != NULL)
 		(*notify)(inp, inetctlerrmap[cmd]);
 	splx(s);
 }
 
 /*
  * Sysctl handler exporting the list of UDP pcbs.
  *
  * The output is a struct xinpgen header, one struct xinpcb per pcb that
  * is visible to the caller, and a trailing struct xinpgen carrying the
  * generation counts observed at the end, so that the caller can detect
  * a change that happened while the list was being copied out.
  *
  * A request without an output buffer only gets a size estimate (with
  * some slack for growth).  Attempts to write the node fail with EPERM.
  * Returns 0 or the first error reported by SYSCTL_OUT().
  */
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n, s;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = udbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	s = splnet();
 	gencnt = udbinfo.ipi_gencnt;
 	n = udbinfo.ipi_count;
 	splx(s);
 
 	/*
 	 * Clear the header before filling it in so that no stale stack
 	 * contents (e.g. structure padding) are copied out to userland.
 	 * The same buffer is reused for the trailer below.
 	 */
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 	
 	/*
 	 * Snapshot the pcbs that existed at generation gencnt and that
 	 * prison_xinpcb() does not hide from the requesting process.
 	 */
 	s = splnet();
 	for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
 			inp_list[i++] = inp;
 	}
 	splx(s);
 	n = i;
 
 	/*
 	 * Copy the entries out, stopping at the first failure so that a
 	 * later successful copy cannot mask the error.
 	 */
 	error = 0;
 	for (i = 0; i < n && error == 0; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		s = splnet();
 		xig.xig_gen = udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = udbinfo.ipi_count;
 		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
 	    udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error, s;
 
 	error = suser(req->p);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	s = splnet();
 	inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 				addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
 	if (inp == NULL || inp->inp_socket == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 	error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred));
 out:
 	splx(s);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
     0, 0, udp_getcred, "S,ucred", "Get the ucred of a UDP connection");
 
 /*
  * Build and send one UDP datagram.
  *
  *	inp	pcb of the sending socket
  *	m	payload; ownership passes to this function
  *	addr	destination for an unconnected send, or NULL to use the
  *		pcb's connected peer
  *	control	control data; not interpreted, simply freed
  *	p	sending process, used for prison address rewriting and
  *		for the temporary connect
  *
  * With a destination address the pcb is connected for the duration of
  * the send and restored to its previous state before returning, on the
  * failure paths as well as on success.
  *
  * Returns 0 or an errno: EMSGSIZE (datagram too large), EISCONN (addr
  * given on a connected pcb), ENOTCONN (no addr and not connected),
  * ENOBUFS (no room for the headers), or whatever in_pcbconnect() or
  * ip_output() report.
  */
 static int
 udp_output(inp, m, addr, control, p)
 	register struct inpcb *inp;
 	struct mbuf *m;
 	struct sockaddr *addr;
 	struct mbuf *control;
 	struct proc *p;
 {
 	register struct udpiphdr *ui;
 	register int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct sockaddr_in *sin;
 	int s = 0, error = 0;
 	int ipflags;
 
 	if (control)
 		m_freem(control);		/* XXX */
 
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		error = EMSGSIZE;
 		goto release;
 	}
 
 	if (addr) {
 		sin = (struct sockaddr_in *)addr;
 		prison_remote_ip(p, 0, &sin->sin_addr.s_addr);
 		/* Remember the local address so it can be restored later. */
 		laddr = inp->inp_laddr;
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 		/*
 		 * Must block input while temporarily connected.
 		 */
 		s = splnet();
 		error = in_pcbconnect(inp, addr, p);
 		if (error) {
 			splx(s);
 			goto release;
 		}
 	} else {
 		if (inp->inp_faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 	/*
 	 * Calculate data length and get a mbuf
 	 * for UDP and IP headers.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT);
 	if (m == 0) {
 		error = ENOBUFS;
 		if (addr) {
 			/*
 			 * Undo the temporary connect exactly as the
 			 * normal exit path does; otherwise the pcb would
 			 * stay connected to this destination.
 			 */
 			in_pcbdisconnect(inp);
 			inp->inp_laddr = laddr;	/* XXX rehash? */
 			splx(s);
 		}
 		goto release;
 	}
 
 	/*
 	 * Fill in mbuf with extended UDP header
 	 * and addresses and length put into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_pr = IPPROTO_UDP;
 	ui->ui_src = inp->inp_laddr;
 	ui->ui_dst = inp->inp_faddr;
 	ui->ui_sport = inp->inp_lport;
 	ui->ui_dport = inp->inp_fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 
 	ipflags = inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST);
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	if (udpcksum) {
 		/*
 		 * Only the pseudo-header sum is computed here; CSUM_UDP
 		 * and the offset left in csum_data request that the
 		 * checksum be completed further down the output path.
 		 */
 		faddr = ui->ui_dst;
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	} else {
 		ui->ui_sum = 0;
 	}
 	((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = inp->inp_ip_tos;	/* XXX */
 	udpstat.udps_opackets++;
 
 	error = ip_output(m, inp->inp_options, &inp->inp_route, ipflags,
 	    inp->inp_moptions, inp);
 
 	if (addr) {
 		/* End of the temporary connect set up above. */
 		in_pcbdisconnect(inp);
 		inp->inp_laddr = laddr;	/* XXX rehash? */
 		splx(s);
 	}
 	return (error);
 
 release:
 	/* m is NULL here when M_PREPEND failed. */
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Default socket buffer reservations, passed to soreserve() by
  * udp_attach() and adjustable through the sysctls below.
  *
  * NOTE(review): both variables are u_long but are exported with
  * SYSCTL_INT; confirm this is intended on platforms where long is
  * wider than int.
  */
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 					/* (udp_recvspace: 40 1K datagrams) */
 SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 /* Room for 40 datagrams of 1K each plus one source address apiece. */
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );
 SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum incoming UDP datagram size");
 
 static int
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;	/* ??? possible? panic instead? */
 	soisdisconnected(so);
 	s = splnet();
 	in_pcbdetach(inp);
 	splx(s);
 	return 0;
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct proc *p)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	inp = sotoinpcb(so);
 	if (inp != 0)
 		return EINVAL;
 
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error)
 		return error;
 	s = splnet();
 	error = in_pcballoc(so, &udbinfo, p);
 	splx(s);
 	if (error)
 		return error;
 
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_ttl = ip_defttl;
 	return 0;
 }
 
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;
 	s = splnet();
 	error = in_pcbbind(inp, nam, p);
 	splx(s);
 	return error;
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
 {
 	struct inpcb *inp;
 	int s, error;
 	struct sockaddr_in *sin;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;
 	if (inp->inp_faddr.s_addr != INADDR_ANY)
 		return EISCONN;
 	error = 0;
 	s = splnet();
 	if (inp->inp_laddr.s_addr == INADDR_ANY && p->p_prison != NULL)
 		error = in_pcbbind(inp, NULL, p);
 	if (error == 0) {
 		sin = (struct sockaddr_in *)nam;
 		prison_remote_ip(p, 0, &sin->sin_addr.s_addr);
 		error = in_pcbconnect(inp, nam, p);
 	}
 	splx(s);
 	if (error == 0)
 		soisconnected(so);
 	return error;
 }
 
 static int
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;
 	s = splnet();
 	in_pcbdetach(inp);
 	splx(s);
 	return 0;
 }
 
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;
 	if (inp->inp_faddr.s_addr == INADDR_ANY)
 		return ENOTCONN;
 
 	s = splnet();
 	in_pcbdisconnect(inp);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	splx(s);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	return 0;
 }
 
 /*
  * Send entry point for UDP sockets: hand the datagram to udp_output().
  *
  * Both mbuf chains are consumed on every path.  udp_output() releases
  * m and control itself, so the early exit taken when the socket has no
  * pcb must free both as well; previously control was leaked there.
  * Returns EINVAL without a pcb, otherwise the result of udp_output().
  */
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 	    struct mbuf *control, struct proc *p)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		m_freem(m);
 		if (control)
 			m_freem(control);
 		return EINVAL;
 	}
 	return udp_output(inp, m, addr, control, p);
 }
 
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	if (inp == 0)
 		return EINVAL;
 	socantsendmore(so);
 	return 0;
 }
 
 /*
  * User-request dispatch table for UDP sockets.  The initializer is
  * positional, so the entries must stay in the member order of
  * struct pr_usrreqs.  The pru_*_notsupp entries presumably reject
  * requests that UDP does not implement -- confirm against their
  * definitions.
  */
 struct pr_usrreqs udp_usrreqs = {
 	udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, 
 	pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, 
 	pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, 
 	pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown,
 	in_setsockaddr, sosend, soreceive, sopoll
 };
 
Index: stable/5/sys/kern/subr_bus.c
===================================================================
--- stable/5/sys/kern/subr_bus.c	(revision 145952)
+++ stable/5/sys/kern/subr_bus.c	(revision 145953)
@@ -1,3918 +1,3921 @@
 /*-
  * Copyright (c) 1997,1998,2003 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bus.h"
 
 #define __RMAN_RESOURCE_VISIBLE
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/filio.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 #include <sys/queue.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/bus.h>
 
 #include <machine/stdarg.h>
 
 #include <vm/uma.h>
 
 SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
 SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Used to attach drivers to devclasses.
  */
 typedef struct driverlink *driverlink_t;
 struct driverlink {
 	kobj_class_t	driver;
 	TAILQ_ENTRY(driverlink) link;	/* list of drivers in devclass */
 };
 
 /*
  * Forward declarations
  */
 typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
 typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
 typedef TAILQ_HEAD(device_list, device) device_list_t;
 
 /*
  * A device class: a named table of devices indexed by unit number, plus
  * the list of drivers registered against the class.
  */
 struct devclass {
 	TAILQ_ENTRY(devclass) link;	/* entry on the global devclasses list */
 	devclass_t	parent;		/* parent in devclass hierarchy */
 	driver_list_t	drivers;     /* bus devclasses store drivers for bus */
 	char		*name;		/* class name; stored right after this struct */
 	device_t	*devices;	/* array of devices indexed by unit */
 	int		maxunit;	/* size of devices array */
 
 	struct sysctl_ctx_list sysctl_ctx;	/* context for this class's sysctl nodes */
 	struct sysctl_oid *sysctl_tree;	/* dev.<name> node; NULL until devclass_sysctl_init() */
 };
 
 /**
  * @brief Implementation of device.
  */
 struct device {
 	/*
 	 * A device is a kernel object. The first field must be the
 	 * current ops table for the object.
 	 */
 	KOBJ_FIELDS;
 
 	/*
 	 * Device hierarchy.
 	 */
 	TAILQ_ENTRY(device)	link;	/**< list of devices in parent */
 	TAILQ_ENTRY(device)	devlink; /**< global device list membership */
 	device_t	parent;		/**< parent of this device  */
 	device_list_t	children;	/**< list of child devices */
 
 	/*
 	 * Details of this device.
 	 */
 	driver_t	*driver;	/**< current driver */
 	devclass_t	devclass;	/**< current device class */
 	int		unit;		/**< current unit number */
 	char*		nameunit;	/**< name+unit e.g. foodev0 */
 	char*		desc;		/**< driver specific description */
 	int		busy;		/**< count of calls to device_busy() */
 	device_state_t	state;		/**< current device state  */
 	u_int32_t	devflags;	/**< api level flags for device_get_flags() */
 	u_short		flags;		/**< internal device flags  */
 #define	DF_ENABLED	1		/* device should be probed/attached */
 #define	DF_FIXEDCLASS	2		/* devclass specified at create time */
 #define	DF_WILDCARD	4		/* unit was originally wildcard */
 #define	DF_DESCMALLOCED	8		/* description was malloced */
 #define	DF_QUIET	16		/* don't print verbose attach message */
 #define	DF_DONENOMATCH	32		/* don't execute DEVICE_NOMATCH again */
 #define	DF_EXTERNALSOFTC 64		/* softc not allocated by us */
 	u_char	order;			/**< order from device_add_child_ordered() */
 	u_char	pad;
 	void	*ivars;			/**< instance variables  */
 	void	*softc;			/**< current driver's variables  */
 
 	struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables  */
 	struct sysctl_oid *sysctl_tree;	/**< state for sysctl variables */
 };
 
 static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
 static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
 
 #ifdef BUS_DEBUG
 
 static int bus_debug = 1;
 TUNABLE_INT("bus.debug", &bus_debug);
 SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
     "Debug bus code");
 
 /*
  * Debug helpers.  PDEBUG takes a parenthesised printf argument list and
  * prefixes the output with the calling function and line.  The *NAME
  * macros map a possibly-NULL pointer to a printable name; the argument is
  * parenthesised everywhere so that expression arguments expand correctly.
  */
 #define PDEBUG(a)	if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
 #define DEVICENAME(d)	((d)? device_get_name(d): "no device")
 #define DRIVERNAME(d)	((d)? (d)->name : "no driver")
 #define DEVCLANAME(d)	((d)? (d)->name : "no devclass")
 
 /**
  * Produce the indenting, indent*2 spaces plus a '.' ahead of that to
  * prevent syslog from deleting initial spaces
  */
 #define indentprintf(p)	do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf("  "); printf p ; } while (0)
 
 static void print_device_short(device_t dev, int indent);
 static void print_device(device_t dev, int indent);
 void print_device_tree_short(device_t dev, int indent);
 void print_device_tree(device_t dev, int indent);
 static void print_driver_short(driver_t *driver, int indent);
 static void print_driver(driver_t *driver, int indent);
 static void print_driver_list(driver_list_t drivers, int indent);
 static void print_devclass_short(devclass_t dc, int indent);
 static void print_devclass(devclass_t dc, int indent);
 void print_devclass_list_short(void);
 void print_devclass_list(void);
 
 #else
 /* Make the compiler ignore the function calls */
 #define PDEBUG(a)			/* nop */
 #define DEVICENAME(d)			/* nop */
 #define DRIVERNAME(d)			/* nop */
 #define DEVCLANAME(d)			/* nop */
 
 #define print_device_short(d,i)		/* nop */
 #define print_device(d,i)		/* nop */
 #define print_device_tree_short(d,i)	/* nop */
 #define print_device_tree(d,i)		/* nop */
 #define print_driver_short(d,i)		/* nop */
 #define print_driver(d,i)		/* nop */
 #define print_driver_list(d,i)		/* nop */
 #define print_devclass_short(d,i)	/* nop */
 #define print_devclass(d,i)		/* nop */
 #define print_devclass_list_short()	/* nop */
 #define print_devclass_list()		/* nop */
 #endif
 
 /*
  * dev sysctl tree
  */
 
 enum {
 	DEVCLASS_SYSCTL_PARENT,
 };
 
 /*
  * Sysctl handler for the per-devclass string leaves.
  *
  * arg1 is the devclass and arg2 selects the value to report; the only
  * selector is DEVCLASS_SYSCTL_PARENT, which yields the parent class name
  * (or an empty string when there is no parent).  Unknown selectors return
  * EINVAL.  The string is copied out without its terminating NUL.
  */
 static int
 devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	devclass_t dc = (devclass_t)arg1;
 	const char *value;
 
 	switch (arg2) {
 	case DEVCLASS_SYSCTL_PARENT:
 		value = dc->parent ? dc->parent->name : "";
 		break;
 	default:
 		return (EINVAL);
 	}
 	/* No case allocates a temporary buffer, so there is nothing to free. */
 	return (SYSCTL_OUT(req, value, strlen(value)));
 }
 
 /*
  * Create the dev.<classname> sysctl node for a devclass together with its
  * read-only "%parent" string leaf.  Does nothing when the node already
  * exists.
  */
 static void
 devclass_sysctl_init(devclass_t dc)
 {
 
 	/* sysctl_tree is only ever set below, so non-NULL means "done". */
 	if (dc->sysctl_tree != NULL)
 		return;
 	sysctl_ctx_init(&dc->sysctl_ctx);
 	dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name,
 	    CTLFLAG_RD, 0, "");
 	/* The leaf is served by devclass_sysctl_handler with dc as arg1. */
 	SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree),
 	    OID_AUTO, "%parent", CTLFLAG_RD,
 	    dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A",
 	    "parent class");
 }
 
 enum {
 	DEVICE_SYSCTL_DESC,
 	DEVICE_SYSCTL_DRIVER,
 	DEVICE_SYSCTL_LOCATION,
 	DEVICE_SYSCTL_PNPINFO,
 	DEVICE_SYSCTL_PARENT,
 };
 
 /*
  * Sysctl handler for the per-device string leaves.
  *
  * arg1 is the device, arg2 one of the DEVICE_SYSCTL_* selectors.  The
  * location and pnpinfo strings are built in a temporary 1024-byte buffer
  * that is released once the value has been copied out; the other values
  * point straight at data owned by the device.  Unknown selectors return
  * EINVAL.
  */
 static int
 device_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	device_t dev = (device_t)arg1;
 	const char *out;
 	char *scratch = NULL;
 	int rv;
 
 	switch (arg2) {
 	case DEVICE_SYSCTL_DESC:
 		out = (dev->desc != NULL) ? dev->desc : "";
 		break;
 	case DEVICE_SYSCTL_DRIVER:
 		out = (dev->driver != NULL) ? dev->driver->name : "";
 		break;
 	case DEVICE_SYSCTL_LOCATION:
 		scratch = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
 		out = scratch;
 		bus_child_location_str(dev, scratch, 1024);
 		break;
 	case DEVICE_SYSCTL_PNPINFO:
 		scratch = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
 		out = scratch;
 		bus_child_pnpinfo_str(dev, scratch, 1024);
 		break;
 	case DEVICE_SYSCTL_PARENT:
 		out = (dev->parent != NULL) ? dev->parent->nameunit : "";
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	rv = SYSCTL_OUT(req, out, strlen(out));
 	if (scratch != NULL)
 		free(scratch, M_BUS);
 	return (rv);
 }
 
 /*
  * Create the sysctl node for a device underneath its devclass node, and
  * populate it with the read-only string leaves %desc, %driver, %location,
  * %pnpinfo and %parent (all served by device_sysctl_handler).  Does
  * nothing when the node already exists.
  */
 static void
 device_sysctl_init(device_t dev)
 {
 	devclass_t dc = dev->devclass;
 
 	if (dev->sysctl_tree != NULL)
 		return;
 	/* Make sure the parent dev.<classname> node exists first. */
 	devclass_sysctl_init(dc);
 	sysctl_ctx_init(&dev->sysctl_ctx);
 	/*
 	 * The node is named by what follows the class name in nameunit.
 	 * NOTE(review): this assumes nameunit starts with dc->name, leaving
 	 * just the unit number -- confirm against where nameunit is built.
 	 */
 	dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
 	    SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
 	    dev->nameunit + strlen(dc->name),
 	    CTLFLAG_RD, 0, "");
 	SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
 	    OID_AUTO, "%desc", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
 	    "device description");
 	SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
 	    OID_AUTO, "%driver", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
 	    "device driver name");
 	SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
 	    OID_AUTO, "%location", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
 	    "device location relative to parent");
 	SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
 	    OID_AUTO, "%pnpinfo", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
 	    "device identification");
 	SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
 	    OID_AUTO, "%parent", CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
 	    "parent device");
 }
 
 /*
  * Release a device's sysctl context and mark its node as gone, so that a
  * later device_sysctl_init() recreates it.  Harmless when no node exists.
  */
 static void
 device_sysctl_fini(device_t dev)
 {
 	if (dev->sysctl_tree != NULL) {
 		sysctl_ctx_free(&dev->sysctl_ctx);
 		dev->sysctl_tree = NULL;
 	}
 }
 
 /*
  * /dev/devctl implementation
  */
 
 /*
  * This design allows only one reader for /dev/devctl.  This is not desirable
  * in the long run, but will get a lot of hair out of this implementation.
  * Maybe we should make this device a clonable device.
  *
  * Also note: we specifically do not attach a device to the device_t tree
  * to avoid potential chicken and egg problems.  One could argue that all
  * of this belongs to the root node.  One could also further argue that the
  * sysctl interface that we have now might more properly be an ioctl
  * interface, but at this stage of the game, I'm not inclined to rock that
  * boat.
  *
  * I'm also not sure that the SIGIO support is done correctly or not, as
  * I copied it from a driver that had SIGIO support that likely hasn't been
  * tested since 3.4 or 2.2.8!
  */
 
 static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
 static int devctl_disable = 0;
 TUNABLE_INT("hw.bus.devctl_disable", &devctl_disable);
 SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     sysctl_devctl_disable, "I", "devctl disable");
 
 static d_open_t		devopen;
 static d_close_t	devclose;
 static d_read_t		devread;
 static d_ioctl_t	devioctl;
 static d_poll_t		devpoll;
 
 /*
  * Character-device switch for the "devctl" device: wires up the open,
  * close, read, ioctl and poll entry points defined below.
  */
 #define CDEV_MAJOR 173
 static struct cdevsw dev_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	devopen,
 	.d_close =	devclose,
 	.d_read =	devread,
 	.d_ioctl =	devioctl,
 	.d_poll =	devpoll,
 	.d_name =	"devctl",
 	.d_maj =	CDEV_MAJOR,
 };
 
 struct dev_event_info
 {
 	char *dei_data;
 	TAILQ_ENTRY(dev_event_info) dei_link;
 };
 
 TAILQ_HEAD(devq, dev_event_info);
 
 /*
  * State of the single /dev/devctl instance.
  */
 static struct dev_softc
 {
 	int	inuse;		/* non-zero while open (set in devopen, cleared in devclose) */
 	int	nonblock;	/* FIONBIO: devread returns EAGAIN instead of sleeping */
 	struct mtx mtx;		/* held around accesses to devq */
 	struct cv cv;		/* readers wait here; broadcast on enqueue and on close */
 	struct selinfo sel;	/* poll/select bookkeeping for devpoll */
 	struct devq devq;	/* pending events, oldest first */
 	struct proc *async_proc;	/* FIOASYNC: process sent SIGIO on enqueue, or NULL */
 } devsoftc;
 
 static struct cdev *devctl_dev;
 
 /*
  * Set up /dev/devctl: create the device node (root:wheel, mode 0600) and
  * initialise the softc's mutex, condition variable and event queue.
  *
  * NOTE(review): the node is created before the mutex, cv and queue are
  * initialised -- confirm nothing can open the device in between.
  */
 static void
 devinit(void)
 {
 	devctl_dev = make_dev(&dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
 	    "devctl");
 	mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
 	cv_init(&devsoftc.cv, "dev cv");
 	TAILQ_INIT(&devsoftc.devq);
 }
 
 /*
  * Open /dev/devctl.  Only one opener is allowed at a time: a second open
  * fails with EBUSY.  A successful open resets the non-blocking and SIGIO
  * settings left behind by any previous opener.
  */
 static int
 devopen(struct cdev *dev, int oflags, int devtype, d_thread_t *td)
 {
 	if (devsoftc.inuse != 0)
 		return (EBUSY);
 
 	/* move to init */
 	devsoftc.inuse = 1;
 	devsoftc.nonblock = 0;
 	devsoftc.async_proc = NULL;
 
 	return (0);
 }
 
 /*
  * Close /dev/devctl: mark the device free for the next opener, forget the
  * SIGIO target and wake any thread still sleeping in devread().
  */
 static int
 devclose(struct cdev *dev, int fflag, int devtype, d_thread_t *td)
 {
 	devsoftc.inuse = 0;
 	/*
 	 * Drop the FIOASYNC registration.  Events keep being queued while
 	 * nobody has the device open, and devctl_queue_data() would
 	 * otherwise go on signalling a process pointer that may no longer
 	 * be valid.
 	 */
 	devsoftc.async_proc = NULL;
 	mtx_lock(&devsoftc.mtx);
 	cv_broadcast(&devsoftc.cv);
 	mtx_unlock(&devsoftc.mtx);
 
 	return (0);
 }
 
 /*
  * The read channel for this device is used to report changes to
  * userland in realtime.  We are required to free the data as well as
  * the n1 object because we allocate them separately.  Also note that
  * we return one record at a time.  If you try to read this device a
  * character at a time, you will lose the rest of the data.  Listening
  * programs are expected to cope.
  */
 static int
 devread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct dev_event_info *n1;
 	int rv;
 
 	mtx_lock(&devsoftc.mtx);
 	while (TAILQ_EMPTY(&devsoftc.devq)) {
 		if (devsoftc.nonblock) {
 			mtx_unlock(&devsoftc.mtx);
 			return (EAGAIN);
 		}
 		rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
 		if (rv) {
 			/*
 			 * Need to translate ERESTART to EINTR here? -- jake
 			 */
 			mtx_unlock(&devsoftc.mtx);
 			return (rv);
 		}
 	}
 	n1 = TAILQ_FIRST(&devsoftc.devq);
 	TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
 	mtx_unlock(&devsoftc.mtx);
 	rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
 	free(n1->dei_data, M_BUS);
 	free(n1, M_BUS);
 	return (rv);
 }
 
 /*
  * ioctl entry point for /dev/devctl.  FIONBIO toggles non-blocking reads
  * and FIOASYNC registers (or clears) the calling process as the SIGIO
  * target; every other command is answered with ENOTTY.
  */
 static	int
 devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, d_thread_t *td)
 {
 	int enable;
 
 	switch (cmd) {
 	case FIONBIO:
 		enable = *(int *)data;
 		devsoftc.nonblock = enable ? 1 : 0;
 		return (0);
 
 	case FIOASYNC:
 		enable = *(int *)data;
 		devsoftc.async_proc = enable ? td->td_proc : NULL;
 		return (0);
 
 	default:
 		/*
 		 * (un)Support for other fcntl() calls: FIOCLEX, FIONCLEX,
 		 * FIONREAD, FIOSETOWN and FIOGETOWN all end up here too.
 		 */
 		break;
 	}
 	return (ENOTTY);
 }
 
 /*
  * poll entry point for /dev/devctl.  Readability is reported when at
  * least one event is queued; otherwise the caller is recorded for a later
  * selwakeup().  Only POLLIN and POLLRDNORM are considered.
  */
 static	int
 devpoll(struct cdev *dev, int events, d_thread_t *td)
 {
 	const int wanted = events & (POLLIN | POLLRDNORM);
 	int	ready = 0;
 
 	mtx_lock(&devsoftc.mtx);
 	if (wanted != 0) {
 		if (TAILQ_EMPTY(&devsoftc.devq))
 			selrecord(td, &devsoftc.sel);
 		else
 			ready = wanted;
 	}
 	mtx_unlock(&devsoftc.mtx);
 
 	return (ready);
 }
 
 /**
  * @brief Queue data to be read from the devctl device
  *
  * Generic interface to queue data to the devctl device.  It is
  * assumed that @p data is properly formatted.  It is further assumed
  * that @p data is allocated using the M_BUS malloc type.
  */
 void
 devctl_queue_data(char *data)
 {
 	struct dev_event_info *n1 = NULL;
 	struct proc *p;
 
 	n1 = malloc(sizeof(*n1), M_BUS, M_NOWAIT);
 	if (n1 == NULL)
 		return;
 	n1->dei_data = data;
 	mtx_lock(&devsoftc.mtx);
 	TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
 	cv_broadcast(&devsoftc.cv);
 	mtx_unlock(&devsoftc.mtx);
 	selwakeup(&devsoftc.sel);
 	p = devsoftc.async_proc;
 	if (p != NULL) {
 		PROC_LOCK(p);
 		psignal(p, SIGIO);
 		PROC_UNLOCK(p);
 	}
 }
 
 /**
  * @brief Send a 'notification' to userland, using standard ways
  *
  * Formats "!system=<system> subsystem=<subsystem> type=<type>[ <data>]\n"
  * into a freshly allocated M_BUS buffer and hands it to
  * devctl_queue_data().  The notification is silently dropped when
  * @p system, @p subsystem or @p type is @c NULL, or when the allocation
  * fails.
  *
  * @param system	the originating system name
  * @param subsystem	the originating subsystem name
  * @param type		the event type
  * @param data		optional extra text appended after the type, or
  *			@c NULL for none
  */
 void
 devctl_notify(const char *system, const char *subsystem, const char *type,
     const char *data)
 {
 	int len = 0;
 	char *msg;
 
 	if (system == NULL)
 		return;		/* BOGUS!  Must specify system. */
 	if (subsystem == NULL)
 		return;		/* BOGUS!  Must specify subsystem. */
 	if (type == NULL)
 		return;		/* BOGUS!  Must specify type. */
 	len += strlen(" system=") + strlen(system);
 	len += strlen(" subsystem=") + strlen(subsystem);
 	len += strlen(" type=") + strlen(type);
 	/* add in the data message plus newline. */
 	if (data != NULL)
 		len += strlen(data);
 	len += 3;	/* '!', '\n', and NUL */
 	msg = malloc(len, M_BUS, M_NOWAIT);
 	if (msg == NULL)
 		return;		/* Drop it on the floor */
 	/*
 	 * Never hand a NULL pointer to %s: no room was reserved for it
 	 * above, so the record would be truncated and lose its newline.
 	 */
 	if (data != NULL)
 		snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n",
 		    system, subsystem, type, data);
 	else
 		snprintf(msg, len, "!system=%s subsystem=%s type=%s\n",
 		    system, subsystem, type);
 	devctl_queue_data(msg);
 }
 
 /*
  * Common routine that tries to make sending messages as easy as possible.
  * We allocate memory for the data, copy strings into that, but do not
  * free it unless there's an error.  The dequeue part of the driver should
  * free the data.  We don't send data when the device is disabled.  We do
  * send data, even when we have no listeners, because we wish to avoid
  * races relating to startup and restart of listening applications.
  */
 static void
 devaddq(const char *type, const char *what, device_t dev)
 {
 	char *msg;
 	char *where = NULL;
 	const char *parent_name;
 
 	if (devctl_disable)
 		return;
 
 	/* Both buffers are needed; give up if either cannot be had. */
 	msg = malloc(1024, M_BUS, M_NOWAIT);
 	if (msg != NULL)
 		where = malloc(1024, M_BUS, M_NOWAIT);
 	if (where == NULL) {
 		free(msg, M_BUS);
 		return;
 	}
 
 	where[0] = '\0';
 	bus_child_location_str(dev, where, 1024);
 	parent_name = (device_get_parent(dev) == NULL) ?
 	    "." /* Or '/' ? */ :
 	    device_get_nameunit(device_get_parent(dev));
 	snprintf(msg, 1024, "%s%s at %s on %s\n", type, what, where,
 	    parent_name);
 	free(where, M_BUS);
 
 	/* The record now belongs to the queue. */
 	devctl_queue_data(msg);
 }
 
 /*
  * A device was added to the tree.  We are called just after it successfully
  * attaches (that is, probe and attach success for this device).  No call
  * is made if a device is merely parented into the tree.  See devnomatch
  * if probe fails.  If attach fails, no notification is sent (but maybe
  * we should have a different message for this).
  */
 static void
 devadded(device_t dev)
 {
 	char *pnpinfo;
 	char *what;
 
 	pnpinfo = malloc(1024, M_BUS, M_NOWAIT);
 	if (pnpinfo == NULL)
 		return;
 	what = malloc(1024, M_BUS, M_NOWAIT);
 	if (what == NULL) {
 		free(pnpinfo, M_BUS);
 		return;
 	}
 
 	/* "<nameunit> <pnpinfo>" is what gets reported after the '+'. */
 	pnpinfo[0] = '\0';
 	bus_child_pnpinfo_str(dev, pnpinfo, 1024);
 	snprintf(what, 1024, "%s %s", device_get_nameunit(dev), pnpinfo);
 	devaddq("+", what, dev);
 
 	free(pnpinfo, M_BUS);
 	free(what, M_BUS);
 }
 
 /*
  * A device was removed from the tree.  We are called just before this
  * happens.
  */
 static void
 devremoved(device_t dev)
 {
 	char *pnpinfo;
 	char *what;
 
 	pnpinfo = malloc(1024, M_BUS, M_NOWAIT);
 	if (pnpinfo == NULL)
 		return;
 	what = malloc(1024, M_BUS, M_NOWAIT);
 	if (what == NULL) {
 		free(pnpinfo, M_BUS);
 		return;
 	}
 
 	/* "<nameunit> <pnpinfo>" is what gets reported after the '-'. */
 	pnpinfo[0] = '\0';
 	bus_child_pnpinfo_str(dev, pnpinfo, 1024);
 	snprintf(what, 1024, "%s %s", device_get_nameunit(dev), pnpinfo);
 	devaddq("-", what, dev);
 
 	free(pnpinfo, M_BUS);
 	free(what, M_BUS);
 }
 
 /*
  * Called when there's no match for this device.  This is only called
  * the first time that no match happens, so we don't keep getting this
  * message.  Should that prove to be undesirable, we can change it.
  * This is called when all drivers that can attach to a given bus
  * decline to accept this device.  Other errors may not be detected.
  */
 static void
 devnomatch(device_t dev)
 {
 	char *pnpinfo = malloc(1024, M_BUS, M_NOWAIT);
 
 	if (pnpinfo != NULL) {
 		/* Only the pnp information is reported after the '?'. */
 		pnpinfo[0] = '\0';
 		bus_child_pnpinfo_str(dev, pnpinfo, 1024);
 		devaddq("?", pnpinfo, dev);
 		free(pnpinfo, M_BUS);
 	}
 }
 
 static int
 sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
 {
 	struct dev_event_info *n1;
 	int dis, error;
 
 	dis = devctl_disable;
 	error = sysctl_handle_int(oidp, &dis, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	mtx_lock(&devsoftc.mtx);
 	devctl_disable = dis;
 	if (dis) {
 		while (!TAILQ_EMPTY(&devsoftc.devq)) {
 			n1 = TAILQ_FIRST(&devsoftc.devq);
 			TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
 			free(n1->dei_data, M_BUS);
 			free(n1, M_BUS);
 		}
 	}
 	mtx_unlock(&devsoftc.mtx);
 	return (0);
 }
 
 /* End of /dev/devctl code */
 
 TAILQ_HEAD(,device)	bus_data_devices;
 static int bus_data_generation = 1;
 
 kobj_method_t null_methods[] = {
 	{ 0, 0 }
 };
 
 DEFINE_CLASS(null, null_methods, 0);
 
 /*
  * Devclass implementation
  */
 
 static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
 
 
 /**
  * @internal
  * @brief Find or create a device class
  *
  * If a device class with the name @p classname exists, return it,
  * otherwise if @p create is non-zero create and return a new device
  * class.
  *
  * If @p parentname is non-NULL, the parent of the devclass is set to
  * the devclass of that name.
  *
  * @param classname	the devclass name to find or create
  * @param parentname	the parent devclass name or @c NULL
  * @param create	non-zero to create a devclass
  */
 static devclass_t
 devclass_find_internal(const char *classname, const char *parentname,
 		       int create)
 {
 	devclass_t dc;
 
 	PDEBUG(("looking for %s", classname));
 	if (classname == NULL)
 		return (NULL);
 
 	/* dc is left NULL when the list holds no class of that name. */
 	TAILQ_FOREACH(dc, &devclasses, link) {
 		if (strcmp(dc->name, classname) == 0)
 			break;
 	}
 
 	if (dc == NULL && create) {
 		PDEBUG(("creating %s", classname));
 		/* One allocation holds the devclass and, after it, its name. */
 		dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
 		    M_BUS, M_NOWAIT|M_ZERO);
 		if (dc == NULL)
 			return (NULL);
 		dc->parent = NULL;
 		dc->name = (char *)(dc + 1);
 		strcpy(dc->name, classname);
 		TAILQ_INIT(&dc->drivers);
 		TAILQ_INSERT_TAIL(&devclasses, dc, link);
 
 		bus_data_generation_update();
 	}
 
 	/* Hook up the parent only if none has been recorded yet. */
 	if (dc != NULL && parentname != NULL && dc->parent == NULL)
 		dc->parent = devclass_find_internal(parentname, NULL, FALSE);
 
 	return (dc);
 }
 
 /**
  * @brief Create a device class
  *
  * If a device class with the name @p classname exists, return it,
  * otherwise create and return a new device class.
  *
  * @param classname	the devclass name to find or create
  */
 devclass_t
 devclass_create(const char *classname)
 {
 	return (devclass_find_internal(classname, 0, TRUE));
 }
 
 /**
  * @brief Find a device class
  *
  * If a device class with the name @p classname exists, return it,
  * otherwise return @c NULL.
  *
  * @param classname	the devclass name to find
  */
 devclass_t
 devclass_find(const char *classname)
 {
 	return (devclass_find_internal(classname, 0, FALSE));
 }
 
 /**
  * @brief Add a device driver to a device class
  *
  * Add a device driver to a devclass. This is normally called
  * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
  * all devices in the devclass will be called to allow them to attempt
  * to re-probe any unmatched children.
  *
  * @param dc		the devclass to edit
  * @param driver	the driver to register
  */
 int
 devclass_add_driver(devclass_t dc, driver_t *driver)
 {
 	driverlink_t dl;
 	int i;
 
 	PDEBUG(("%s", DRIVERNAME(driver)));
 
 	/* The link is allocated first so ENOMEM is reported before any side effect. */
 	dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
 	if (!dl)
 		return (ENOMEM);
 
 	/*
 	 * Compile the driver's methods. Also increase the reference count
 	 * so that the class doesn't get freed when the last instance
 	 * goes. This means we can safely use static methods and avoids a
 	 * double-free in devclass_delete_driver.
 	 */
 	kobj_class_compile((kobj_class_t) driver);
 
 	/*
 	 * Make sure the devclass which the driver is implementing exists.
 	 */
 	/* NOTE(review): the result is not checked, so a failed creation goes unnoticed here. */
 	devclass_find_internal(driver->name, 0, TRUE);
 
 	dl->driver = driver;
 	TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
 	/* This is the reference that devclass_delete_driver() drops again. */
 	driver->refs++;
 
 	/*
 	 * Call BUS_DRIVER_ADDED for any existing busses in this class.
 	 */
 	for (i = 0; i < dc->maxunit; i++)
 		if (dc->devices[i])
 			BUS_DRIVER_ADDED(dc->devices[i], driver);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Delete a device driver from a device class
  *
  * Delete a device driver from a devclass. This is normally called
  * automatically by DRIVER_MODULE().
  *
  * If the driver is currently attached to any devices,
  * devclass_delete_driver() will first attempt to detach from each
  * device. If one of the detach calls fails, the driver will not be
  * deleted.
  *
  * @param busclass	the devclass to edit
  * @param driver	the driver to unregister
  */
 int
 devclass_delete_driver(devclass_t busclass, driver_t *driver)
 {
 	/* dc is the devclass named after the driver; busclass holds the driver list being edited. */
 	devclass_t dc = devclass_find(driver->name);
 	driverlink_t dl;
 	device_t dev;
 	int i;
 	int error;
 
 	PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
 
 	/* No devclass of that name: report success without touching busclass. */
 	if (!dc)
 		return (0);
 
 	/*
 	 * Find the link structure in the bus' list of drivers.
 	 */
 	TAILQ_FOREACH(dl, &busclass->drivers, link) {
 		if (dl->driver == driver)
 			break;
 	}
 
 	if (!dl) {
 		PDEBUG(("%s not found in %s list", driver->name,
 		    busclass->name));
 		return (ENOENT);
 	}
 
 	/*
 	 * Disassociate from any devices.  We iterate through all the
 	 * devices in the devclass of the driver and detach any which are
 	 * using the driver and which have a parent in the devclass which
 	 * we are deleting from.
 	 *
 	 * Note that since a driver can be in multiple devclasses, we
 	 * should not detach devices which are not children of devices in
 	 * the affected devclass.
 	 */
 	for (i = 0; i < dc->maxunit; i++) {
 		if (dc->devices[i]) {
 			dev = dc->devices[i];
 			if (dev->driver == driver && dev->parent &&
 			    dev->parent->devclass == busclass) {
 				/*
 				 * A failing detach aborts the whole operation;
 				 * devices detached on earlier iterations stay
 				 * detached and the driver stays registered.
 				 */
 				if ((error = device_detach(dev)) != 0)
 					return (error);
 				device_set_driver(dev, NULL);
 			}
 		}
 	}
 
 	TAILQ_REMOVE(&busclass->drivers, dl, link);
 	free(dl, M_BUS);
 
 	/* Drop the reference taken in devclass_add_driver(); the last one frees the compiled class. */
 	driver->refs--;
 	if (driver->refs == 0)
 		kobj_class_free((kobj_class_t) driver);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @internal
  */
 static driverlink_t
 devclass_find_driver_internal(devclass_t dc, const char *classname)
 {
 	driverlink_t dl;
 
 	PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
 
 	/* dl ends up NULL when no driver in the list carries that name. */
 	TAILQ_FOREACH(dl, &dc->drivers, link) {
 		if (strcmp(dl->driver->name, classname) == 0)
 			break;
 	}
 
 	if (dl == NULL) {
 		PDEBUG(("not found"));
 	}
 	return (dl);
 }
 
 /**
  * @brief Search a devclass for a driver
  *
  * This function searches the devclass's list of drivers and returns
  * the first driver whose name is @p classname or @c NULL if there is
  * no driver of that name.
  *
  * @param dc		the devclass to search
  * @param classname	the driver name to search for
  */
 kobj_class_t
 devclass_find_driver(devclass_t dc, const char *classname)
 {
 	driverlink_t dl = devclass_find_driver_internal(dc, classname);
 
 	/* Unwrap the list link into the driver it refers to. */
 	if (dl == NULL)
 		return (NULL);
 	return (dl->driver);
 }
 
 /**
  * @brief Return the name of the devclass
  */
 const char *
 devclass_get_name(devclass_t dc)
 {
 	/* Hands back the devclass's own string; no copy is made. */
 	return (dc->name);
 }
 
 /**
  * @brief Find a device given a unit number
  *
  * @param dc		the devclass to search
  * @param unit		the unit number to search for
  * 
  * @returns		the device with the given unit number or @c
  *			NULL if there is no such device
  */
 device_t
 devclass_get_device(devclass_t dc, int unit)
 {
 	/* Only index the table for a real devclass and an in-range unit. */
 	if (dc != NULL && unit >= 0 && unit < dc->maxunit)
 		return (dc->devices[unit]);
 	return (NULL);
 }
 
 /**
  * @brief Find the softc field of a device given a unit number
  *
  * @param dc		the devclass to search
  * @param unit		the unit number to search for
  * 
  * @returns		the softc field of the device with the given
  *			unit number or @c NULL if there is no such
  *			device
  */
 void *
 devclass_get_softc(devclass_t dc, int unit)
 {
 	device_t dev = devclass_get_device(dc, unit);
 
 	if (dev != NULL)
 		return (device_get_softc(dev));
 	return (NULL);
 }
 
 /**
  * @brief Get a list of devices in the devclass
  *
  * An array containing a list of all the devices in the given devclass
  * is allocated and returned in @p *devlistp. The number of devices
  * in the array is returned in @p *devcountp. The caller should free
  * the array using @c free(p, M_TEMP).
  *
  * @param dc		the devclass to examine
  * @param devlistp	points at location for array pointer return
  *			value
  * @param devcountp	points at location for array size return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
 {
 	device_t *devs;
 	int found, unit;
 
 	/* Size the array for exactly the units that are populated. */
 	found = devclass_get_count(dc);
 	devs = malloc(found * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
 	if (devs == NULL)
 		return (ENOMEM);
 
 	/* Pack the populated slots of the unit table into the array. */
 	found = 0;
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] == NULL)
 			continue;
 		devs[found++] = dc->devices[unit];
 	}
 
 	*devlistp = devs;
 	*devcountp = found;
 
 	return (0);
 }
 
 /**
  * @brief Get a list of drivers in the devclass
  *
  * An array containing a list of pointers to all the drivers in the
  * given devclass is allocated and returned in @p *listp.  The number
  * of drivers in the array is returned in @p *countp. The caller should
  * free the array using @c free(p, M_TEMP).
  *
  * @param dc		the devclass to examine
  * @param listp		gives location for array pointer return value
  * @param countp	gives location for number of array elements
  *			return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
 {
 	driverlink_t dl;
 	driver_t **drivers;
 	int n = 0;
 
 	/* First pass: count the links so the array can be sized. */
 	TAILQ_FOREACH(dl, &dc->drivers, link)
 		n++;
 	drivers = malloc(n * sizeof(driver_t *), M_TEMP, M_NOWAIT);
 	if (drivers == NULL)
 		return (ENOMEM);
 
 	/* Second pass: copy the driver pointers in list order. */
 	n = 0;
 	TAILQ_FOREACH(dl, &dc->drivers, link)
 		drivers[n++] = dl->driver;
 
 	*listp = drivers;
 	*countp = n;
 
 	return (0);
 }
 
 /**
  * @brief Get the number of devices in a devclass
  *
  * @param dc		the devclass to examine
  */
 int
 devclass_get_count(devclass_t dc)
 {
 	int unit;
 	int populated = 0;
 
 	/* Count the non-empty slots of the unit table. */
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		if (dc->devices[unit] != NULL)
 			populated++;
 	}
 	return (populated);
 }
 
 /**
  * @brief Get the maximum unit number used in a devclass
  *
  * Note that this is one greater than the highest currently-allocated
  * unit.
  *
  * @param dc		the devclass to examine
  */
 int
 devclass_get_maxunit(devclass_t dc)
 {
 	/* maxunit is the size of the devices[] table, not a count of devices. */
 	return (dc->maxunit);
 }
 
 /**
  * @brief Find a free unit number in a devclass
  *
  * This function searches for the first unused unit number greater
  * than or equal to @p unit.
  *
  * @param dc		the devclass to examine
  * @param unit		the first unit number to check
  */
 int
 devclass_find_free_unit(devclass_t dc, int unit)
 {
 	if (dc != NULL) {
 		/* Step past occupied slots; stop at a hole or the table end. */
 		for (; unit < dc->maxunit; unit++) {
 			if (dc->devices[unit] == NULL)
 				break;
 		}
 	}
 	return (unit);
 }
 
 /**
  * @brief Set the parent of a devclass
  *
  * The parent class is normally initialised automatically by
  * DRIVER_MODULE().
  *
  * @param dc		the devclass to edit
  * @param pdc		the new parent devclass
  */
 void
 devclass_set_parent(devclass_t dc, devclass_t pdc)
 {
 	/* Plain pointer store: any previous parent is simply overwritten. */
 	dc->parent = pdc;
 }
 
 /**
  * @brief Get the parent of a devclass
  *
  * @param dc		the devclass to examine
  */
 devclass_t
 devclass_get_parent(devclass_t dc)
 {
 	/* NULL for a class that has not been given a parent. */
 	return (dc->parent);
 }
 
 /**
  * @brief Get the sysctl context of a devclass
  *
  * Returns a pointer to the context embedded in the devclass itself.
  *
  * @param dc		the devclass to examine
  */
 struct sysctl_ctx_list *
 devclass_get_sysctl_ctx(devclass_t dc)
 {
 	return (&dc->sysctl_ctx);
 }
 
 /**
  * @brief Get the sysctl node of a devclass
  *
  * The node is created by devclass_sysctl_init(); until then this returns
  * @c NULL.
  *
  * @param dc		the devclass to examine
  */
 struct sysctl_oid *
 devclass_get_sysctl_tree(devclass_t dc)
 {
 	return (dc->sysctl_tree);
 }
 
 /**
  * @internal
  * @brief Allocate a unit number
  *
  * On entry, @p *unitp is the desired unit number (or @c -1 if any
  * will do). The allocated unit number is returned in @p *unitp.
  *
  * @param dc		the devclass to allocate from
  * @param unitp		points at the location for the allocated unit
  *			number
  *
  * @retval 0		success
  * @retval EEXIST	the requested unit number is already allocated
  * @retval ENOMEM	memory allocation failure
  */
 static int
 devclass_alloc_unit(devclass_t dc, int *unitp)
 {
 	int unit = *unitp;
 
 	PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
 
 	/* If we were given a wired unit number, check for existing device */
 	/* XXX imp XXX */
 	/*
 	 * NOTE(review): a negative request other than -1 is neither rejected
 	 * nor matched against the table; it falls through and is returned
 	 * unchanged -- confirm callers never pass one.
 	 */
 	if (unit != -1) {
 		if (unit >= 0 && unit < dc->maxunit &&
 		    dc->devices[unit] != NULL) {
 			if (bootverbose)
 				printf("%s: %s%d already exists; skipping it\n",
 				    dc->name, dc->name, *unitp);
 			return (EEXIST);
 		}
 	} else {
 		/* Unwired device, find the next available slot for it */
 		unit = 0;
 		while (unit < dc->maxunit && dc->devices[unit] != NULL)
 			unit++;
 	}
 
 	/*
 	 * We've selected a unit beyond the length of the table, so let's
 	 * extend the table to make room for all units up to and including
 	 * this one.
 	 */
 	if (unit >= dc->maxunit) {
 		device_t *newlist;
 		int newsize;
 
 		/* Grow in multiples of MINALLOCSIZE / sizeof(device_t) entries. */
 		newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
 		newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
 		if (!newlist)
 			return (ENOMEM);
 		/* Carry the old entries over and clear the newly added tail. */
 		bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit);
 		bzero(newlist + dc->maxunit,
 		    sizeof(device_t) * (newsize - dc->maxunit));
 		if (dc->devices)
 			free(dc->devices, M_BUS);
 		dc->devices = newlist;
 		dc->maxunit = newsize;
 	}
 	PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
 
 	/* The slot is only selected here; storing the device is left to the caller. */
 	*unitp = unit;
 	return (0);
 }
 
 /**
  * @internal
  * @brief Add a device to a devclass
  *
  * A unit number is allocated for the device (using the device's
  * preferred unit number if any) and the device is registered in the
  * devclass. This allows the device to be looked up by its unit
  * number, e.g. by decoding a dev_t minor number.
  *
  * The "name + unit" string is sized from the unit number that was
  * actually allocated. Sizing it from the requested unit would be too
  * small for a wildcard request (@c -1, two characters) that ends up
  * with a unit of three or more digits, truncating the name.
  *
  * On failure @p dev is left unmodified.
  *
  * @param dc		the devclass to add to
  * @param dev		the device to add
  *
  * @retval 0		success
  * @retval EEXIST	the requested unit number is already allocated
  * @retval ENOMEM	memory allocation failure
  */
 static int
 devclass_add_device(devclass_t dc, device_t dev)
 {
 	char *nameunit;
 	int buflen, error, unit;
 
 	PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
 
 	/*
 	 * Pick the unit first. devclass_alloc_unit() only grows the
 	 * table and does not claim the slot, so bailing out afterwards
 	 * leaves nothing to undo.
 	 */
 	unit = dev->unit;
 	if ((error = devclass_alloc_unit(dc, &unit)) != 0)
 		return (error);
 
 	/* Length of "<name><unit>" plus one byte for the terminator. */
 	buflen = snprintf(NULL, 0, "%s%d", dc->name, unit);
 	if (buflen < 0)
 		return (ENOMEM);
 	buflen++;
 	nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
 	if (nameunit == NULL)
 		return (ENOMEM);
 	snprintf(nameunit, buflen, "%s%d", dc->name, unit);
 
 	/* Everything succeeded; publish the device in the devclass. */
 	dev->unit = unit;
 	dev->nameunit = nameunit;
 	dev->devclass = dc;
 	dc->devices[unit] = dev;
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief Delete a device from a devclass
  *
  * The device is removed from the devclass's device list, its
  * name/unit string is freed and, for devices flagged DF_WILDCARD, the
  * unit number is reset to @c -1. Passing a @c NULL devclass or device
  * is a no-op. The function panics if @p dev is not registered in
  * @p dc.
  *
  * @param dc		the devclass to delete from
  * @param dev		the device to delete
  *
  * @retval 0		success
  */
 static int
 devclass_delete_device(devclass_t dc, device_t dev)
 {
 	if (dc == NULL || dev == NULL)
 		return (0);
 
 	PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
 
 	if (dev->devclass != dc || dc->devices[dev->unit] != dev)
 		panic("devclass_delete_device: inconsistent device class");
 
 	/* Release the table slot before the unit number is forgotten. */
 	dc->devices[dev->unit] = NULL;
 	dev->devclass = NULL;
 	if ((dev->flags & DF_WILDCARD) != 0)
 		dev->unit = -1;
 
 	free(dev->nameunit, M_BUS);
 	dev->nameunit = NULL;
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief Make a new device and add it as a child of @p parent
  *
  * The new device is appended to the global device list but is not
  * linked into @p parent's list of children; that is left to the
  * caller.
  *
  * @param parent	the parent of the new device
  * @param name		the devclass name of the new device or @c NULL
  *			to leave the devclass unspecified
  * @param unit		the unit number of the new device or @c -1 to
  *			leave the unit number unspecified
  *
  * @returns the new device, or @c NULL if the devclass lookup, the
  *	    memory allocation or the devclass registration failed
  */
 static device_t
 make_device(device_t parent, const char *name, int unit)
 {
 	devclass_t dc = NULL;
 	device_t dev;
 
 	PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
 
 	if (name != NULL) {
 		dc = devclass_find_internal(name, 0, TRUE);
 		if (dc == NULL) {
 			printf("make_device: can't find device class %s\n",
 			    name);
 			return (NULL);
 		}
 	}
 
 	dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
 	if (dev == NULL)
 		return (NULL);
 
 	/* Tree linkage and method table. */
 	dev->parent = parent;
 	TAILQ_INIT(&dev->children);
 	kobj_init((kobj_t) dev, &null_class);
 
 	/* Start out with no driver, class, name or description. */
 	dev->driver = NULL;
 	dev->devclass = NULL;
 	dev->unit = unit;
 	dev->nameunit = NULL;
 	dev->desc = NULL;
 	dev->busy = 0;
 	dev->devflags = 0;
 	dev->order = 0;
 
 	/* A unit of -1 means "any unit will do". */
 	dev->flags = (unit == -1) ? (DF_ENABLED | DF_WILDCARD) : DF_ENABLED;
 
 	if (name != NULL) {
 		dev->flags |= DF_FIXEDCLASS;
 		if (devclass_add_device(dc, dev) != 0) {
 			kobj_delete((kobj_t) dev, M_BUS);
 			return (NULL);
 		}
 	}
 	dev->ivars = NULL;
 	dev->softc = NULL;
 
 	dev->state = DS_NOTPRESENT;
 
 	TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
 	bus_data_generation_update();
 
 	return (dev);
 }
 
 /**
  * @internal
  * @brief Print a description of a device.
  *
  * A device which is alive is described by its parent's
  * BUS_PRINT_CHILD() method; any other device is reported as
  * " not found".
  *
  * @returns the value returned by whichever print call was made
  */
 static int
 device_print_child(device_t dev, device_t child)
 {
 	if (!device_is_alive(child))
 		return (device_printf(child, " not found\n"));
 
 	return (BUS_PRINT_CHILD(dev, child));
 }
 
 /**
  * @brief Create a new device
  *
  * This creates a new device and adds it as a child of an existing
  * parent device. The new device will be added after the last existing
  * child with order zero.
  * 
  * @param dev		the device which will be the parent of the
  *			new child device
  * @param name		devclass name for new device or @c NULL if not
  *			specified
  * @param unit		unit number for new device or @c -1 if not
  *			specified
  * 
  * @returns		the new device
  */
 device_t
 device_add_child(device_t dev, const char *name, int unit)
 {
 	/* Thin wrapper: same as device_add_child_ordered() with order 0. */
 	return (device_add_child_ordered(dev, 0, name, unit));
 }
 
 /**
  * @brief Create a new device
  *
  * This creates a new device and adds it as a child of an existing
  * parent device. The new device will be added after the last existing
  * child with the same order.
  * 
  * @param dev		the device which will be the parent of the
  *			new child device
  * @param order		a value which is used to partially sort the
  *			children of @p dev - devices created using
  *			lower values of @p order appear first in @p
  *			dev's list of children
  * @param name		devclass name for new device or @c NULL if not
  *			specified
  * @param unit		unit number for new device or @c -1 if not
  *			specified
  * 
  * @returns		the new device
  */
 device_t
 device_add_child_ordered(device_t dev, int order, const char *name, int unit)
 {
 	device_t child, successor;
 
 	PDEBUG(("%s at %s with order %d as unit %d",
 	    name, DEVICENAME(dev), order, unit));
 
 	child = make_device(dev, name, unit);
 	if (child == NULL)
 		return (NULL);
 	child->order = order;
 
 	/*
 	 * Insert in front of the first existing child whose order is
 	 * strictly greater, so that children of equal order keep their
 	 * creation order.
 	 */
 	TAILQ_FOREACH(successor, &dev->children, link) {
 		if (successor->order > order) {
 			TAILQ_INSERT_BEFORE(successor, child, link);
 			break;
 		}
 	}
 
 	/*
 	 * The loop ran to completion: no existing child sorts after the
 	 * new one, so it goes at the tail of the list.
 	 */
 	if (successor == NULL)
 		TAILQ_INSERT_TAIL(&dev->children, child, link);
 
 	bus_data_generation_update();
 	return (child);
 }
 
 /**
  * @brief Delete a device
  *
  * This function deletes a device along with all of its children. If
  * the device currently has a driver attached to it, the device is
  * detached first using device_detach(). Deletion stops at the first
  * error, leaving any descendants that were already deleted gone.
  *
  * @param dev		the parent device
  * @param child		the device to delete
  *
  * @retval 0		success
  * @retval non-zero	a unix error code describing the error
  */
 int
 device_delete_child(device_t dev, device_t child)
 {
 	device_t grandchild;
 	int error;
 
 	PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
 
 	/* Depth first: get rid of the child's own children. */
 	while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) {
 		error = device_delete_child(child, grandchild);
 		if (error != 0)
 			return (error);
 	}
 
 	error = device_detach(child);
 	if (error != 0)
 		return (error);
 
 	if (child->devclass != NULL)
 		devclass_delete_device(child->devclass, child);
 
 	/* Unlink from the parent and from the global device list. */
 	TAILQ_REMOVE(&dev->children, child, link);
 	TAILQ_REMOVE(&bus_data_devices, child, devlink);
 	kobj_delete((kobj_t) child, M_BUS);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Find a device given a unit number
  *
  * This is similar to devclass_get_device() but only searches for
  * devices which have @p dev as a parent.
  *
  * @param dev		the parent device to search
  * @param classname	the name of the devclass to look in
  * @param unit		the unit number to search for.  If the unit is -1,
  *			return the first child of @p dev which has name
  *			@p classname (that is, the one with the lowest unit.)
  *
  * @returns		the device with the given unit number or @c
  *			NULL if there is no such device
  */
 device_t
 device_find_child(device_t dev, const char *classname, int unit)
 {
 	devclass_t dc;
 	device_t child;
 
 	dc = devclass_find(classname);
 	if (dc == NULL)
 		return (NULL);
 
 	if (unit != -1) {
 		/* Exact unit requested: it must also belong to dev. */
 		child = devclass_get_device(dc, unit);
 		if (child != NULL && child->parent == dev)
 			return (child);
 		return (NULL);
 	}
 
 	/* Wildcard: lowest unit in the class whose parent is dev. */
 	for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
 		child = devclass_get_device(dc, unit);
 		if (child != NULL && child->parent == dev)
 			return (child);
 	}
 	return (NULL);
 }
 
 /**
  * @internal
  * @brief Return the first driver in @p dc that may be probed for @p dev
  *
  * A device without a devclass starts at the head of the driver list
  * of @p dc. A device with a devclass is restricted to the driver
  * found by devclass_find_driver_internal() under that class name.
  */
 static driverlink_t
 first_matching_driver(devclass_t dc, device_t dev)
 {
 	if (dev->devclass == NULL)
 		return (TAILQ_FIRST(&dc->drivers));
 
 	return (devclass_find_driver_internal(dc, dev->devclass->name));
 }
 
 /**
  * @internal
  * @brief Return the driver after @p last that may be probed for @p dev
  *
  * A device without a devclass simply gets the next list entry. A
  * device with a devclass skips forward to the next driver whose name
  * equals the name of that devclass.
  *
  * @returns the next candidate, or @c NULL when the list is exhausted
  */
 static driverlink_t
 next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
 {
 	driverlink_t dl;
 
 	dl = TAILQ_NEXT(last, link);
 	if (dev->devclass == NULL)
 		return (dl);
 
 	while (dl != NULL &&
 	    strcmp(dev->devclass->name, dl->driver->name) != 0)
 		dl = TAILQ_NEXT(dl, link);
 	return (dl);
 }
 
 /**
  * @internal
  * @brief Choose a driver for @p child from @p dev's devclass chain
  *
  * Walks the devclass of @p dev and then each parent devclass, calling
  * DEVICE_PROBE() with every candidate driver. A probe result of zero
  * ends the search at once; a positive result is an error and rules
  * the driver out; among negative results the largest value (closest
  * to zero) wins. When a driver is chosen it is set on @p child
  * together with its devclass (if @p child had none) and the "flags"
  * hint, and the state of @p child becomes DS_ALIVE.
  *
  * Panics if @p dev has no devclass.
  *
  * @param dev		the parent device
  * @param child		the device to find a driver for
  *
  * @retval 0		a driver was chosen, or @p child was already
  *			DS_ALIVE
  * @retval ENXIO	no driver matched
  */
 static int
 device_probe_child(device_t dev, device_t child)
 {
 	devclass_t dc;
 	driverlink_t best = 0;
 	driverlink_t dl;
 	int result, pri = 0;
 	/* Remember whether the child arrived with a devclass of its own. */
 	int hasclass = (child->devclass != 0);
 
 	dc = dev->devclass;
 	if (!dc)
 		panic("device_probe_child: parent device has no devclass");
 
 	if (child->state == DS_ALIVE)
 		return (0);
 
 	for (; dc; dc = dc->parent) {
 		for (dl = first_matching_driver(dc, child);
 		     dl;
 		     dl = next_matching_driver(dc, child, dl)) {
 			PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
 			device_set_driver(child, dl->driver);
 			/*
 			 * Give a classless child the driver's class for
 			 * the duration of this probe only.
 			 */
 			if (!hasclass)
 				device_set_devclass(child, dl->driver->name);
 
 			/* Fetch any flags for the device before probing. */
 			resource_int_value(dl->driver->name, child->unit,
 			    "flags", &child->devflags);
 
 			result = DEVICE_PROBE(child);
 
 			/* Reset flags and devclass before the next probe. */
 			child->devflags = 0;
 			if (!hasclass)
 				device_set_devclass(child, 0);
 
 			/*
 			 * If the driver returns SUCCESS, there can be
 			 * no higher match for this device.
 			 */
 			if (result == 0) {
 				best = dl;
 				pri = 0;
 				break;
 			}
 
 			/*
 			 * The driver returned an error so it
 			 * certainly doesn't match.
 			 */
 			if (result > 0) {
 				device_set_driver(child, 0);
 				continue;
 			}
 
 			/*
 			 * A priority lower than SUCCESS, remember the
 			 * best matching driver. Initialise the value
 			 * of pri for the first match.
 			 */
 			if (best == 0 || result > pri) {
 				best = dl;
 				pri = result;
 				continue;
 			}
 		}
 		/*
 		 * If we have an unambiguous match in this devclass,
 		 * don't look in the parent.
 		 */
 		if (best && pri == 0)
 			break;
 	}
 
 	/*
 	 * If we found a driver, change state and initialise the devclass.
 	 */
 	if (best) {
 		/* Set the winning driver, devclass, and flags. */
 		if (!child->devclass)
 			device_set_devclass(child, best->driver->name);
 		device_set_driver(child, best->driver);
 		resource_int_value(best->driver->name, child->unit,
 		    "flags", &child->devflags);
 
 		if (pri < 0) {
 			/*
 			 * A bit bogus. Call the probe method again to make
 			 * sure that we have the right description.
 			 */
 			DEVICE_PROBE(child);
 		}
 		child->state = DS_ALIVE;
 
 		bus_data_generation_update();
 		return (0);
 	}
 
 	return (ENXIO);
 }
 
 /**
  * @brief Return the parent of a device
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c parent pointer
  */
 device_t
 device_get_parent(device_t dev)
 {
 	return (dev->parent);
 }
 
 /**
  * @brief Get a list of children of a device
  *
  * An array containing a list of all the children of the given device
  * is allocated and returned in @p *devlistp. The number of devices
  * in the array is returned in @p *devcountp. The caller should free
  * the array using @c free(p, M_TEMP).
  *
  * @param dev		the device to examine
  * @param devlistp	points at location for array pointer return
  *			value
  * @param devcountp	points at location for array size return value
  *
  * @retval 0		success
  * @retval ENOMEM	the array allocation failed
  */
 int
 device_get_children(device_t dev, device_t **devlistp, int *devcountp)
 {
 	device_t child, *list;
 	int count, filled;
 
 	/* First pass: size the array. */
 	count = 0;
 	TAILQ_FOREACH(child, &dev->children, link)
 		count++;
 
 	list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
 	if (list == NULL)
 		return (ENOMEM);
 
 	/* Second pass: copy the child pointers in list order. */
 	filled = 0;
 	TAILQ_FOREACH(child, &dev->children, link)
 		list[filled++] = child;
 
 	*devlistp = list;
 	*devcountp = filled;
 
 	return (0);
 }
 
 /**
  * @brief Return the current driver for the device or @c NULL if there
  * is no driver currently attached
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c driver pointer
  */
 driver_t *
 device_get_driver(device_t dev)
 {
 	return (dev->driver);
 }
 
 /**
  * @brief Return the current devclass for the device or @c NULL if
  * there is none.
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c devclass pointer
  */
 devclass_t
 device_get_devclass(device_t dev)
 {
 	return (dev->devclass);
 }
 
 /**
  * @brief Return the name of the device's devclass or @c NULL if there
  * is none.
  *
  * A @c NULL @p dev is accepted and also yields @c NULL.
  */
 const char *
 device_get_name(device_t dev)
 {
 	if (dev == NULL || dev->devclass == NULL)
 		return (NULL);
 
 	return (devclass_get_name(dev->devclass));
 }
 
 /**
  * @brief Return a string containing the device's devclass name
  * followed by an ascii representation of the device's unit number
  * (e.g. @c "foo2").
  *
  * The string is owned by the device: it is allocated by
  * devclass_add_device() and freed by devclass_delete_device(), which
  * also resets the pointer to @c NULL.
  *
  * @param dev		the device to examine; must not be @c NULL
  */
 const char *
 device_get_nameunit(device_t dev)
 {
 	return (dev->nameunit);
 }
 
 /**
  * @brief Return the device's unit number.
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c unit field
  */
 int
 device_get_unit(device_t dev)
 {
 	return (dev->unit);
 }
 
 /**
  * @brief Return the device's description string
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c desc pointer, as
  *			last stored by device_set_desc() or
  *			device_set_desc_copy()
  */
 const char *
 device_get_desc(device_t dev)
 {
 	return (dev->desc);
 }
 
 /**
  * @brief Return the device's flags
  *
  * These are the values held in @c devflags (loaded from the "flags"
  * hint by device_probe_child() or stored by device_set_flags()), not
  * the internal DF_* bits kept in the @c flags field.
  *
  * @param dev		the device to examine; must not be @c NULL
  */
 u_int32_t
 device_get_flags(device_t dev)
 {
 	return (dev->devflags);
 }
 
 /**
  * @brief Get the sysctl context of a device
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the address of the @c sysctl_ctx member embedded
  *			in @p dev
  */
 struct sysctl_ctx_list *
 device_get_sysctl_ctx(device_t dev)
 {
 	return (&dev->sysctl_ctx);
 }
 
 /**
  * @brief Get the sysctl tree node of a device
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c sysctl_tree pointer
  */
 struct sysctl_oid *
 device_get_sysctl_tree(device_t dev)
 {
 	return (dev->sysctl_tree);
 }
 
 /**
  * @brief Print the name of the device followed by a colon and a space
  *
  * A device without a devclass name is printed as "unknown: ".
  *
  * @returns the number of characters printed
  */
 int
 device_print_prettyname(device_t dev)
 {
 	const char *name;
 
 	name = device_get_name(dev);
 	if (name != NULL)
 		return (printf("%s%d: ", name, device_get_unit(dev)));
 
 	return (printf("unknown: "));
 }
 
 /**
  * @brief Print the name of the device followed by a colon, a space
  * and the result of calling vprintf() with the value of @p fmt and
  * the following arguments.
  *
  * @returns the number of characters printed, prefix included
  */
 int
 device_printf(device_t dev, const char * fmt, ...)
 {
 	va_list ap;
 	int prefix_len, body_len;
 
 	prefix_len = device_print_prettyname(dev);
 
 	va_start(ap, fmt);
 	body_len = vprintf(fmt, ap);
 	va_end(ap);
 
 	return (prefix_len + body_len);
 }
 
 /**
  * @internal
  * @brief Replace the description of @p dev
  *
  * Any previous description that was allocated here is freed first.
  * With @p copy set and a non-NULL @p desc a private copy is stored;
  * if that allocation fails the description is left @c NULL.
  * Otherwise the caller's pointer (possibly @c NULL) is stored as is.
  */
 static void
 device_set_desc_internal(device_t dev, const char* desc, int copy)
 {
 	/* Drop the old description if this code owns it. */
 	if (dev->desc != NULL && (dev->flags & DF_DESCMALLOCED) != 0) {
 		free(dev->desc, M_BUS);
 		dev->flags &= ~DF_DESCMALLOCED;
 		dev->desc = NULL;
 	}
 
 	if (!copy || desc == NULL) {
 		/* Avoid a -Wcast-qual warning */
 		dev->desc = (char *)(uintptr_t) desc;
 	} else {
 		dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
 		if (dev->desc != NULL) {
 			strcpy(dev->desc, desc);
 			dev->flags |= DF_DESCMALLOCED;
 		}
 	}
 
 	bus_data_generation_update();
 }
 
 /**
  * @brief Set the device's description
  *
  * The value of @c desc should be a string constant that will not
  * change (at least until the description is changed in a subsequent
  * call to device_set_desc() or device_set_desc_copy()).
  *
  * The pointer itself is stored; no copy is made. Passing @c NULL
  * clears the description.
  *
  * @param dev		the device to edit
  * @param desc		the new description, or @c NULL
  */
 void
 device_set_desc(device_t dev, const char* desc)
 {
 	device_set_desc_internal(dev, desc, FALSE);
 }
 
 /**
  * @brief Set the device's description
  *
  * The string pointed to by @c desc is copied. Use this function if
  * the device description is generated, (e.g. with sprintf()).
  *
  * The copy is allocated without sleeping; if that allocation fails
  * the device is left with no description and no error is reported.
  *
  * @param dev		the device to edit
  * @param desc		the description to copy, or @c NULL
  */
 void
 device_set_desc_copy(device_t dev, const char* desc)
 {
 	device_set_desc_internal(dev, desc, TRUE);
 }
 
 /**
  * @brief Set the device's flags
  *
  * This overwrites @c devflags, the value returned by
  * device_get_flags(); the internal DF_* bits are not affected.
  *
  * @param dev		the device to edit; must not be @c NULL
  * @param flags		the new flags value
  */
 void
 device_set_flags(device_t dev, u_int32_t flags)
 {
 	dev->devflags = flags;
 }
 
 /**
  * @brief Return the device's softc field
  *
  * The softc is allocated and zeroed when a driver is attached, based
  * on the size field of the driver.
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c softc pointer, which
  *			may be @c NULL
  */
 void *
 device_get_softc(device_t dev)
 {
 	return (dev->softc);
 }
 
 /**
  * @brief Set the device's softc field
  *
  * Most drivers do not need to use this since the softc is allocated
  * automatically when the driver is attached.
  *
  * A previous softc is freed unless it was itself supplied through
  * this function. A non-NULL @p softc marks the device as having an
  * external softc; @c NULL clears that mark.
  */
 void
 device_set_softc(device_t dev, void *softc)
 {
 	/* Only release storage that the bus code allocated itself. */
 	if (dev->softc != NULL && (dev->flags & DF_EXTERNALSOFTC) == 0)
 		free(dev->softc, M_BUS_SC);
 
 	dev->softc = softc;
 	if (softc != NULL)
 		dev->flags |= DF_EXTERNALSOFTC;
 	else
 		dev->flags &= ~DF_EXTERNALSOFTC;
 }
 
 /**
  * @brief Get the device's ivars field
  *
  * The ivars field is used by the parent device to store per-device
  * state (e.g. the physical location of the device or a list of
  * resources).
  *
  * @param dev		the device to examine; a @c NULL value trips
  *			the KASSERT below
  *
  * @returns		the value of the device's @c ivars pointer
  */
 void *
 device_get_ivars(device_t dev)
 {
 
 	KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
 	return (dev->ivars);
 }
 
 /**
  * @brief Set the device's ivars field
  *
  * The pointer is stored as is; the previous value is simply
  * overwritten and is not freed here.
  *
  * @param dev		the device to edit; a @c NULL value trips the
  *			KASSERT below
  * @param ivars		the new ivars pointer
  */
 void
 device_set_ivars(device_t dev, void * ivars)
 {
 
 	KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
 	dev->ivars = ivars;
 }
 
 /**
  * @brief Return the device's state
  *
  * @param dev		the device to examine; must not be @c NULL
  *
  * @returns		the value of the device's @c state field (one of
  *			the DS_* values)
  */
 device_state_t
 device_get_state(device_t dev)
 {
 	return (dev->state);
 }
 
 /**
  * @brief Set the DF_ENABLED flag for the device
  *
  * device_probe_and_attach() only probes devices which have this
  * flag set.
  *
  * @param dev		the device to edit; must not be @c NULL
  */
 void
 device_enable(device_t dev)
 {
 	dev->flags |= DF_ENABLED;
 }
 
 /**
  * @brief Clear the DF_ENABLED flag for the device
  *
  * device_probe_and_attach() skips devices which do not have this
  * flag set.
  *
  * @param dev		the device to edit; must not be @c NULL
  */
 void
 device_disable(device_t dev)
 {
 	dev->flags &= ~DF_ENABLED;
 }
 
 /**
  * @brief Increment the busy counter for the device
  *
  * The first busy reference on a device also takes a busy reference
  * on its parent, and the device's state becomes DS_BUSY. Panics if
  * the device is not attached.
  */
 void
 device_busy(device_t dev)
 {
 	if (dev->state < DS_ATTACHED)
 		panic("device_busy: called for unattached device");
 
 	/* Going from idle to busy pins the parent as well. */
 	if (dev->parent != NULL && dev->busy == 0)
 		device_busy(dev->parent);
 
 	dev->busy++;
 	dev->state = DS_BUSY;
 }
 
 /**
  * @brief Decrement the busy counter for the device
  *
  * When the counter reaches zero the parent's busy reference is
  * dropped and the device's state returns to DS_ATTACHED. Panics if
  * the device is not in state DS_BUSY.
  */
 void
 device_unbusy(device_t dev)
 {
 	if (dev->state != DS_BUSY)
 		panic("device_unbusy: called for non-busy device");
 
 	dev->busy--;
 	if (dev->busy != 0)
 		return;
 
 	/* Last reference gone: release the parent and go idle. */
 	if (dev->parent != NULL)
 		device_unbusy(dev->parent);
 	dev->state = DS_ATTACHED;
 }
 
 /**
  * @brief Set the DF_QUIET flag for the device
  *
  * device_attach() does not print the device's description while
  * this flag is set.
  *
  * @param dev		the device to edit; must not be @c NULL
  */
 void
 device_quiet(device_t dev)
 {
 	dev->flags |= DF_QUIET;
 }
 
 /**
  * @brief Clear the DF_QUIET flag for the device
  *
  * With the flag clear, device_attach() prints the device's
  * description before attaching.
  *
  * @param dev		the device to edit; must not be @c NULL
  */
 void
 device_verbose(device_t dev)
 {
 	dev->flags &= ~DF_QUIET;
 }
 
 /**
  * @brief Return non-zero if the DF_QUIET flag is set on the device
  *
  * @returns 1 when the flag is set, 0 otherwise
  */
 int
 device_is_quiet(device_t dev)
 {
 	if ((dev->flags & DF_QUIET) == 0)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the DF_ENABLED flag is set on the device
  *
  * @returns 1 when the flag is set, 0 otherwise
  */
 int
 device_is_enabled(device_t dev)
 {
 	if ((dev->flags & DF_ENABLED) == 0)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the device was successfully probed
  *
  * @returns 1 when the device's state is DS_ALIVE or later, 0 otherwise
  */
 int
 device_is_alive(device_t dev)
 {
 	if (dev->state < DS_ALIVE)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Return non-zero if the device currently has a driver
  * attached to it
  *
  * @returns 1 when the device's state is DS_ATTACHED or later, 0
  *	    otherwise
  */
 int
 device_is_attached(device_t dev)
 {
 	if (dev->state < DS_ATTACHED)
 		return (0);
 	return (1);
 }
 
 /**
  * @brief Set the devclass of a device
  * @see devclass_add_device().
  *
  * A @c NULL @p classname removes the device from its current
  * devclass, if it has one.
  *
  * @param dev		the device to edit
  * @param classname	the name of the new devclass, or @c NULL
  *
  * @retval 0		success
  * @retval EINVAL	the device already has a devclass
  * @retval ENOMEM	the devclass lookup failed
  * @retval non-zero	an error returned by devclass_add_device()
  */
 int
 device_set_devclass(device_t dev, const char *classname)
 {
 	devclass_t dc;
 	int error;
 
 	/* No name: just leave whatever class the device is in. */
 	if (classname == NULL) {
 		if (dev->devclass != NULL)
 			devclass_delete_device(dev->devclass, dev);
 		return (0);
 	}
 
 	if (dev->devclass != NULL) {
 		printf("device_set_devclass: device class already set\n");
 		return (EINVAL);
 	}
 
 	dc = devclass_find_internal(classname, 0, TRUE);
 	if (dc == NULL)
 		return (ENOMEM);
 
 	error = devclass_add_device(dc, dev);
 
 	bus_data_generation_update();
 	return (error);
 }
 
 /**
  * @brief Set the driver of a device
  *
  * @retval 0		success
  * @retval EBUSY	the device already has a driver attached
  * @retval ENOMEM	a memory allocation failure occurred
  */
 int
 device_set_driver(device_t dev, driver_t *driver)
 {
 	if (dev->state >= DS_ATTACHED)
 		return (EBUSY);
 
 	/* Nothing to do when the driver does not change. */
 	if (dev->driver == driver)
 		return (0);
 
 	/* Throw away the old driver's softc unless it is external. */
 	if (dev->softc != NULL && (dev->flags & DF_EXTERNALSOFTC) == 0) {
 		free(dev->softc, M_BUS_SC);
 		dev->softc = NULL;
 	}
 	kobj_delete((kobj_t) dev, 0);
 	dev->driver = driver;
 
 	if (driver == NULL) {
 		kobj_init((kobj_t) dev, &null_class);
 	} else {
 		kobj_init((kobj_t) dev, (kobj_class_t) driver);
 		if ((dev->flags & DF_EXTERNALSOFTC) == 0 && driver->size > 0) {
 			dev->softc = malloc(driver->size, M_BUS_SC,
 			    M_NOWAIT | M_ZERO);
 			if (dev->softc == NULL) {
 				/* Fall back to the driverless state. */
 				kobj_delete((kobj_t) dev, 0);
 				kobj_init((kobj_t) dev, &null_class);
 				dev->driver = NULL;
 				return (ENOMEM);
 			}
 		}
 	}
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /**
  * @brief Probe a device and attach a driver if possible
  *
  * This function is the core of the device autoconfiguration
  * system. Its purpose is to select a suitable driver for a device and
  * then call that driver to initialise the hardware appropriately. The
  * driver is selected by calling the DEVICE_PROBE() method of a set of
  * candidate drivers and then choosing the driver which returned the
  * best value. This driver is then attached to the device using
  * device_attach().
  *
  * The set of suitable drivers is taken from the list of drivers in
  * the parent device's devclass. If the device was originally created
  * with a specific class name (see device_add_child()), only drivers
  * with that name are probed, otherwise all drivers in the devclass
  * are probed. If no drivers return successful probe values in the
  * parent devclass, the search continues in the parent of that
  * devclass (see devclass_get_parent()) if any.
  *
  * @param dev		the device to initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_probe_and_attach(device_t dev)
 {
 	int error;
 
 	/* Already probed: nothing more to do. */
 	if (dev->state >= DS_ALIVE)
 		return (0);
 
 	if ((dev->flags & DF_ENABLED) == 0) {
 		if (bootverbose) {
 			device_print_prettyname(dev);
 			printf("not probed (disabled)\n");
 		}
 		return (0);
 	}
 
 	error = device_probe_child(dev->parent, dev);
 	if (error == 0)
 		return (device_attach(dev));
 
 	/* Report a failed match only once per device. */
 	if ((dev->flags & DF_DONENOMATCH) == 0) {
 		BUS_PROBE_NOMATCH(dev->parent, dev);
 		devnomatch(dev);
 		dev->flags |= DF_DONENOMATCH;
 	}
 	return (error);
 }
 
 /**
  * @brief Attach a device driver to a device
  *
  * This function is a wrapper around the DEVICE_ATTACH() driver
  * method. In addition to calling DEVICE_ATTACH(), it initialises the
  * device's sysctl tree, optionally prints a description of the device
  * and queues a notification event for user-based device management
  * services.
  *
  * Normally this function is only called internally from
  * device_probe_and_attach().
  *
  * @param dev		the device to initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_attach(device_t dev)
 {
 	int error;
 
 	device_sysctl_init(dev);
 	if (!device_is_quiet(dev))
 		device_print_child(dev->parent, dev);
 	if ((error = DEVICE_ATTACH(dev)) != 0) {
 		printf("device_attach: %s%d attach returned %d\n",
 		    dev->driver->name, dev->unit, error);
 		/*
 		 * Unset the class that device_probe_child() assigned.
 		 * Only a class fixed at creation time (DF_FIXEDCLASS) is
 		 * kept, exactly as device_detach() does. The previous
 		 * test (devclass == 0) could only fire when there was
 		 * nothing to unset, so a device whose class came from
 		 * probing stayed registered after a failed attach.
 		 */
 		if (!(dev->flags & DF_FIXEDCLASS))
 			devclass_delete_device(dev->devclass, dev);
 		device_set_driver(dev, NULL);
 		device_sysctl_fini(dev);
 		dev->state = DS_NOTPRESENT;
 		return (error);
 	}
 	/* Attach succeeded: publish the new state and notify listeners. */
 	dev->state = DS_ATTACHED;
 	devadded(dev);
 	return (0);
 }
 
 /**
  * @brief Detach a driver from a device
  *
  * This function is a wrapper around the DEVICE_DETACH() driver
  * method. If the call to DEVICE_DETACH() succeeds, it calls
  * BUS_CHILD_DETACHED() for the parent of @p dev, queues a
  * notification event for user-based device management services and
  * cleans up the device's sysctl tree.
  *
  * @param dev		the device to un-initialise
  *
  * @retval 0		success
  * @retval ENXIO	no driver was found
  * @retval ENOMEM	memory allocation failure
  * @retval non-zero	some other unix error code
  */
 int
 device_detach(device_t dev)
 {
 	int error;
 
 	PDEBUG(("%s", DEVICENAME(dev)));
 
 	/* Only an attached, idle device can be detached. */
 	switch (dev->state) {
 	case DS_BUSY:
 		return (EBUSY);
 	case DS_ATTACHED:
 		break;
 	default:
 		return (0);
 	}
 
 	error = DEVICE_DETACH(dev);
 	if (error != 0)
 		return (error);
 
 	devremoved(dev);
 	device_printf(dev, "detached\n");
 	if (dev->parent != NULL)
 		BUS_CHILD_DETACHED(dev->parent, dev);
 
 	/* A class that was not fixed at creation goes away again. */
 	if ((dev->flags & DF_FIXEDCLASS) == 0)
 		devclass_delete_device(dev->devclass, dev);
 
 	dev->state = DS_NOTPRESENT;
 	device_set_driver(dev, NULL);
 	device_set_desc(dev, NULL);
 	device_sysctl_fini(dev);
 
 	return (0);
 }
 
 /**
  * @brief Notify a device of system shutdown
  *
  * This function calls the DEVICE_SHUTDOWN() driver method if the
  * device currently has an attached driver.
  *
  * @returns the value returned by DEVICE_SHUTDOWN(), or 0 if the
  *	    device is not attached
  */
 int
 device_shutdown(device_t dev)
 {
 	if (dev->state >= DS_ATTACHED)
 		return (DEVICE_SHUTDOWN(dev));
 
 	return (0);
 }
 
 /**
  * @brief Set the unit number of a device
  *
  * This function can be used to override the unit number used for a
  * device (e.g. to wire a device to a pre-configured unit number).
  *
  * The device is removed from its devclass and added again under the
  * new unit. If the re-add fails, the device is left without a
  * devclass. The device must already belong to a devclass.
  *
  * @param dev		the device to renumber
  * @param unit		the new unit number
  *
  * @retval 0		success
  * @retval EBUSY	the unit number is already in use in the
  *			device's devclass
  * @retval non-zero	an error returned while re-adding the device
  */
 int
 device_set_unit(device_t dev, int unit)
 {
 	devclass_t dc;
 	int err;
 
 	dc = device_get_devclass(dev);
 	/*
 	 * Only index the table with a unit that lies inside it; a
 	 * negative unit (e.g. the -1 wildcard) must not be used as an
 	 * array index.
 	 */
 	if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL)
 		return (EBUSY);
 	err = devclass_delete_device(dc, dev);
 	if (err)
 		return (err);
 	dev->unit = unit;
 	err = devclass_add_device(dc, dev);
 	if (err)
 		return (err);
 
 	bus_data_generation_update();
 	return (0);
 }
 
 /*======================================*/
 /*
  * Some useful method implementations to make life easier for bus drivers.
  */
 
 /**
  * @brief Initialise a resource list.
  *
  * The list is set up as an empty singly-linked list; nothing is
  * allocated.
  *
  * @param rl		the resource list to initialise
  */
 void
 resource_list_init(struct resource_list *rl)
 {
 	SLIST_INIT(rl);
 }
 
 /**
  * @brief Reclaim memory used by a resource list.
  *
  * This function frees the memory for all resource entries on the list
  * (if any).
  *
  * @param rl		the resource list to free		
  */
 void
 resource_list_free(struct resource_list *rl)
 {
 	struct resource_list_entry *rle;
 
 	/* Pop entries off the head until the list is empty. */
 	for (;;) {
 		rle = SLIST_FIRST(rl);
 		if (rle == NULL)
 			break;
 		/* An entry that still holds a resource must not be freed. */
 		if (rle->res != NULL)
 			panic("resource_list_free: resource entry is busy");
 		SLIST_REMOVE_HEAD(rl, link);
 		free(rle, M_BUS);
 	}
 }
 
 /**
  * @brief Add a resource entry.
  *
  * This function adds a resource entry using the given @p type, @p
  * start, @p end and @p count values. A rid value is chosen by
  * searching sequentially for the first unused rid starting at zero.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param start		the start address of the resource
  * @param end		the end address of the resource
  * @param count		XXX end-start+1
  */
 int
 resource_list_add_next(struct resource_list *rl, int type, u_long start,
     u_long end, u_long count)
 {
 	int rid;
 
 	/* Lowest rid, counting up from zero, not yet used for this type. */
 	for (rid = 0; resource_list_find(rl, type, rid) != NULL; rid++)
 		continue;
 
 	resource_list_add(rl, type, rid, start, end, count);
 	return (rid);
 }
 
 /**
  * @brief Add or modify a resource entry.
  *
  * If an existing entry exists with the same type and rid, it will be
  * modified using the given values of @p start, @p end and @p
  * count. If no entry exists, a new one will be created using the
  * given values.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  * @param start		the start address of the resource
  * @param end		the end address of the resource
  * @param count		XXX end-start+1
  */
 void
 resource_list_add(struct resource_list *rl, int type, int rid,
     u_long start, u_long end, u_long count)
 {
 	struct resource_list_entry *rle;
 
 	rle = resource_list_find(rl, type, rid);
 	if (rle == NULL) {
 		/* No entry yet for this type/rid: create an idle one. */
 		rle = malloc(sizeof(struct resource_list_entry), M_BUS,
 		    M_NOWAIT);
 		if (rle == NULL)
 			panic("resource_list_add: can't record entry");
 		rle->type = type;
 		rle->rid = rid;
 		rle->res = NULL;
 		SLIST_INSERT_HEAD(rl, rle, link);
 	} else if (rle->res != NULL) {
 		/* An existing entry may only be changed while unallocated. */
 		panic("resource_list_add: resource entry is busy");
 	}
 
 	rle->start = start;
 	rle->end = end;
 	rle->count = count;
 }
 
 /**
  * @brief Find a resource entry by type and rid.
  *
  * @param rl		the resource list to search
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  *
  * @returns the resource entry pointer or NULL if there is no such
  * entry.
  */
 struct resource_list_entry *
 resource_list_find(struct resource_list *rl, int type, int rid)
 {
 	struct resource_list_entry *rle;
 
 	/* rle is NULL after the loop unless a match broke out of it. */
 	SLIST_FOREACH(rle, rl, link) {
 		if (rle->rid == rid && rle->type == type)
 			break;
 	}
 	return (rle);
 }
 
 /**
  * @brief Delete a resource entry.
  *
  * @param rl		the resource list to edit
  * @param type		the resource entry type (e.g. SYS_RES_MEMORY)
  * @param rid		the resource identifier
  */
 void
 resource_list_delete(struct resource_list *rl, int type, int rid)
 {
 	struct resource_list_entry *rle;
 
 	rle = resource_list_find(rl, type, rid);
 	if (rle == NULL)
 		return;
 
 	/* The resource must have been released before its entry goes. */
 	if (rle->res != NULL)
 		panic("resource_list_delete: resource has not been released");
 	SLIST_REMOVE(rl, rle, resource_list_entry, link);
 	free(rle, M_BUS);
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE()
  *
  * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
  * and passing the allocation up to the parent of @p bus. This assumes
  * that the first entry of @c device_get_ivars(child) is a struct
  * resource_list. This also handles 'passthrough' allocations where a
  * child is a remote descendant of bus by passing the allocation up to
  * the parent of bus.
  *
  * Typically, a bus driver would store a list of child resources
  * somewhere in the child device's ivars (see device_get_ivars()) and
  * its implementation of BUS_ALLOC_RESOURCE() would find that list and
  * then call resource_list_alloc() to perform the allocation.
  *
  * @param rl		the resource list to allocate from
  * @param bus		the parent device of @p child
  * @param child		the device which is requesting an allocation
  * @param type		the type of resource to allocate
  * @param rid		a pointer to the resource identifier
  * @param start		hint at the start of the resource range - pass
  *			@c 0UL for any start address
  * @param end		hint at the end of the resource range - pass
  *			@c ~0UL for any end address
  * @param count		hint at the size of range required - pass @c 1
  *			for any size
  * @param flags		any extra flags to control the resource
  *			allocation - see @c RF_XXX flags in
  *			<sys/rman.h> for details
  * 
  * @returns		the resource which was allocated or @c NULL if no
  *			resource could be allocated
  */
 struct resource *
 resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
     int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
 {
 	struct resource_list_entry *rle;
 
 	/* Not a direct child of bus: pass the request up unchanged. */
 	if (device_get_parent(child) != bus) {
 		return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
 		    type, rid, start, end, count, flags));
 	}
 
 	rle = resource_list_find(rl, type, *rid);
 	if (rle == NULL)
 		return (NULL);		/* no resource of that type/rid */
 
 	if (rle->res != NULL)
 		panic("resource_list_alloc: resource entry is busy");
 
 	/*
 	 * A request for the default range (0 .. ~0) is filled in from
 	 * the values recorded in the list entry.
 	 */
 	if (start == 0UL && end == ~0UL) {
 		start = rle->start;
 		count = ulmax(count, rle->count);
 		end = ulmax(rle->end, start + count - 1);
 	}
 
 	rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
 	    type, rid, start, end, count, flags);
 
 	/* Record the range that was actually allocated. */
 	if (rle->res != NULL) {
 		rle->start = rman_get_start(rle->res);
 		rle->end = rman_get_end(rle->res);
 		rle->count = count;
 	}
 
 	return (rle->res);
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE()
  * 
  * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
  * used with resource_list_alloc().
  * 
  * @param rl		the resource list which was allocated from
  * @param bus		the parent device of @p child
  * @param child		the device which is requesting a release
  * @param type		the type of resource to release
  * @param rid		the resource identifier
  * @param res		the resource to release
  * 
  * @retval 0		success
  * @retval non-zero	a standard unix error code indicating what
  *			error condition prevented the operation
  */
 int
 resource_list_release(struct resource_list *rl, device_t bus, device_t child,
     int type, int rid, struct resource *res)
 {
 	struct resource_list_entry *entry;
 	int rv;
 
 	/* Requests from non-direct children are simply passed upward. */
 	if (device_get_parent(child) != bus)
 		return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
 		    type, rid, res));
 
 	entry = resource_list_find(rl, type, rid);
 	if (entry == NULL)
 		panic("resource_list_release: can't find resource");
 	if (entry->res == NULL)
 		panic("resource_list_release: resource entry is not busy");
 
 	rv = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
 	    type, rid, res);
 	/* The entry is only marked free once the parent call succeeded. */
 	if (rv == 0)
 		entry->res = NULL;
 	return (rv);
 }
 
 /**
  * @brief Print a description of resources in a resource list
  *
  * Print all resources of a specified type, for use in BUS_PRINT_CHILD().
  * The name is printed if at least one resource of the given type is available.
  * The format is used to print resource start and end.
  *
  * @param rl		the resource list to print
  * @param name		the name of @p type, e.g. @c "memory"
  * @param type		the type of resource entry to print
  * @param format	printf(9) format string to print resource
  *			start and end values
  * 
  * @returns		the number of characters printed
  */
 int
 resource_list_print_type(struct resource_list *rl, const char *name, int type,
     const char *format)
 {
 	struct resource_list_entry *entry;
 	int matches = 0;
 	int total = 0;
 
 	SLIST_FOREACH(entry, rl, link) {
 		if (entry->type != type)
 			continue;
 		/* First match is introduced by the name, later ones by a comma. */
 		if (matches == 0)
 			total += printf(" %s ", name);
 		else
 			total += printf(",");
 		matches++;
 		total += printf(format, entry->start);
 		/* Ranges longer than one unit are shown as "start-last". */
 		if (entry->count > 1) {
 			total += printf("-");
 			total += printf(format,
 			    entry->start + entry->count - 1);
 		}
 	}
 	return (total);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_PROBE()
  *
  * This function can be used to help implement the DEVICE_PROBE() for
  * a bus (i.e. a device which has other devices attached to it). It
  * calls the DEVICE_IDENTIFY() method of each driver in the device's
  * devclass.
  */
 int
 bus_generic_probe(device_t dev)
 {
 	driverlink_t entry;
 
 	/* Run DEVICE_IDENTIFY() for every driver linked into dev's devclass. */
 	for (entry = TAILQ_FIRST(&dev->devclass->drivers); entry != NULL;
 	    entry = TAILQ_NEXT(entry, link))
 		DEVICE_IDENTIFY(entry->driver, dev);
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_ATTACH()
  *
  * This function can be used to help implement the DEVICE_ATTACH() for
  * a bus. It calls device_probe_and_attach() for each of the device's
  * children.
  */
 int
 bus_generic_attach(device_t dev)
 {
 	device_t kid;
 
 	/* The result of each probe/attach attempt is not examined. */
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		device_probe_and_attach(kid);
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_DETACH()
  *
  * This function can be used to help implement the DEVICE_DETACH() for
  * a bus. It calls device_detach() for each of the device's
  * children.
  */
 int
 bus_generic_detach(device_t dev)
 {
 	device_t kid;
 	int rv;
 
 	/* A bus that is not in the attached state refuses to detach. */
 	if (dev->state != DS_ATTACHED)
 		return (EBUSY);
 
 	/* Stop at, and report, the first child that fails to detach. */
 	TAILQ_FOREACH(kid, &dev->children, link) {
 		rv = device_detach(kid);
 		if (rv != 0)
 			return (rv);
 	}
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_SHUTDOWN()
  *
  * This function can be used to help implement the DEVICE_SHUTDOWN()
  * for a bus. It calls device_shutdown() for each of the device's
  * children.
  */
 int
 bus_generic_shutdown(device_t dev)
 {
 	device_t kid;
 
 	/* Every child is shut down; individual results are not examined. */
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		device_shutdown(kid);
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_SUSPEND()
  *
  * This function can be used to help implement the DEVICE_SUSPEND()
  * for a bus. It calls DEVICE_SUSPEND() for each of the device's
  * children. If any call to DEVICE_SUSPEND() fails, the suspend
  * operation is aborted and any devices which were suspended are
  * resumed immediately by calling their DEVICE_RESUME() methods.
  */
 int
 bus_generic_suspend(device_t dev)
 {
 	device_t	cur, undo;
 	int		rv;
 
 	TAILQ_FOREACH(cur, &dev->children, link) {
 		rv = DEVICE_SUSPEND(cur);
 		if (rv == 0)
 			continue;
 
 		/* Roll back: resume every child that precedes the failed one. */
 		undo = TAILQ_FIRST(&dev->children);
 		while (undo != NULL && undo != cur) {
 			DEVICE_RESUME(undo);
 			undo = TAILQ_NEXT(undo, link);
 		}
 		return (rv);
 	}
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing DEVICE_RESUME()
  *
  * This function can be used to help implement the DEVICE_RESUME() for
  * a bus. It calls DEVICE_RESUME() on each of the device's children.
  */
 int
 bus_generic_resume(device_t dev)
 {
 	device_t	kid;
 
 	/* If a resume fails there is nothing useful to do, so carry on. */
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		DEVICE_RESUME(kid);
 
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function prints the first part of the ascii representation of
  * @p child, including its name, unit and description (if any - see
  * device_set_desc()).
  *
  * @returns the number of characters printed
  */
 int
 bus_print_child_header(device_t dev, device_t child)
 {
 	/* Without a description only the name and unit are printed. */
 	if (device_get_desc(child) == NULL)
 		return (printf("%s", device_get_nameunit(child)));
 
 	return (device_printf(child, "<%s>", device_get_desc(child)));
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function prints the last part of the ascii representation of
  * @p child, which consists of the string @c " on " followed by the
  * name and unit of the @p dev.
  *
  * @returns the number of characters printed
  */
 int
 bus_print_child_footer(device_t dev, device_t child)
 {
 	int printed;
 
 	/* Only the bus (dev) is named here; child is not used. */
 	printed = printf(" on %s\n", device_get_nameunit(dev));
 	return (printed);
 }
 
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
  * This function simply calls bus_print_child_header() followed by
  * bus_print_child_footer().
  *
  * @returns the number of characters printed
  */
 int
 bus_generic_print_child(device_t dev, device_t child)
 {
 	int header_len, footer_len;
 
 	/* Header first, then footer; the character counts are summed. */
 	header_len = bus_print_child_header(dev, child);
 	footer_len = bus_print_child_footer(dev, child);
 	return (header_len + footer_len);
 }
 
 /**
  * @brief Stub function for implementing BUS_READ_IVAR().
  * 
  * @returns ENOENT
  */
 int
 bus_generic_read_ivar(device_t dev, device_t child, int index,
     uintptr_t *result)
 {
 	/* Nothing is looked up; *result is left untouched. */
 	return (ENOENT);
 }
 
 /**
  * @brief Stub function for implementing BUS_WRITE_IVAR().
  * 
  * @returns ENOENT
  */
 int
 bus_generic_write_ivar(device_t dev, device_t child, int index,
     uintptr_t value)
 {
 	/* Nothing is stored; the value is discarded. */
 	return (ENOENT);
 }
 
 /**
  * @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
  * 
  * @returns NULL
  */
 struct resource_list *
 bus_generic_get_resource_list(device_t dev, device_t child)
 {
 	/* This stub never supplies a resource list. */
 	return (NULL);
 }
 
 /**
  * @brief Helper function for implementing BUS_DRIVER_ADDED().
  *
  * This implementation of BUS_DRIVER_ADDED() simply calls the driver's
  * DEVICE_IDENTIFY() method to allow it to add new children to the bus
  * and then calls device_probe_and_attach() for each unattached child.
  */
 void
 bus_generic_driver_added(device_t dev, driver_t *driver)
 {
 	device_t kid;
 
 	/* The new driver's identify method runs before any child is probed. */
 	DEVICE_IDENTIFY(driver, dev);
 
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link)) {
 		/* Only children still in the DS_NOTPRESENT state are probed. */
 		if (kid->state != DS_NOTPRESENT)
 			continue;
 		device_probe_and_attach(kid);
 	}
 }
 
 /**
  * @brief Helper function for implementing BUS_SETUP_INTR().
  *
  * This simple implementation of BUS_SETUP_INTR() simply calls the
  * BUS_SETUP_INTR() method of the parent of @p dev.
  */
 int
 bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
     int flags, driver_intr_t *intr, void *arg, void **cookiep)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_SETUP_INTR(dev->parent, child, irq, flags, intr, arg,
 	    cookiep));
 }
 
 /**
  * @brief Helper function for implementing BUS_TEARDOWN_INTR().
  *
  * This simple implementation of BUS_TEARDOWN_INTR() simply calls the
  * BUS_TEARDOWN_INTR() method of the parent of @p dev.
  */
 int
 bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
     void *cookie)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
  *
  * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
  * BUS_ALLOC_RESOURCE() method of the parent of @p dev.
  */
 struct resource *
 bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
     u_long start, u_long end, u_long count, u_int flags)
 {
 	/* With no parent the allocation cannot be satisfied. */
 	if (dev->parent == NULL)
 		return (NULL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end,
 	    count, flags));
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
  *
  * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
  * BUS_RELEASE_RESOURCE() method of the parent of @p dev.
  */
 int
 bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
     struct resource *r)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
  *
  * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
  * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev.
  */
 int
 bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
     struct resource *r)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
  *
  * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
  * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev.
  */
 int
 bus_generic_deactivate_resource(device_t dev, device_t child, int type,
     int rid, struct resource *r)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass it one level up, still on behalf of child. */
 	return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_CONFIG_INTR().
  *
  * This simple implementation of BUS_CONFIG_INTR() simply calls the
  * BUS_CONFIG_INTR() method of the parent of @p dev.
  */
 int
 bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	/* With no parent there is nobody left to handle the request. */
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/* Otherwise pass the same irq/trigger/polarity one level up. */
 	return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
 }
 
 /**
  * @brief Helper function for implementing BUS_GET_RESOURCE().
  *
  * This implementation of BUS_GET_RESOURCE() uses the
  * resource_list_find() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * search.
  */
 int
 bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
     u_long *startp, u_long *countp)
 {
 	struct resource_list *list;
 	struct resource_list_entry *entry;
 
 	/* EINVAL: the bus keeps no resource list for this child. */
 	list = BUS_GET_RESOURCE_LIST(dev, child);
 	if (list == NULL)
 		return (EINVAL);
 
 	/* ENOENT: the list has no entry for this type/rid pair. */
 	entry = resource_list_find(list, type, rid);
 	if (entry == NULL)
 		return (ENOENT);
 
 	/* Either output pointer may be NULL, in which case it is skipped. */
 	if (startp != NULL)
 		*startp = entry->start;
 	if (countp != NULL)
 		*countp = entry->count;
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_SET_RESOURCE().
  *
  * This implementation of BUS_SET_RESOURCE() uses the
  * resource_list_add() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * edit.
  */
 int
 bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
     u_long start, u_long count)
 {
 	struct resource_list *list = BUS_GET_RESOURCE_LIST(dev, child);
 	u_long last;
 
 	if (list == NULL)
 		return (EINVAL);
 
 	/* The range end handed to the list is start + count - 1. */
 	last = start + count - 1;
 	resource_list_add(list, type, rid, start, last, count);
 	return (0);
 }
 
 /**
  * @brief Helper function for implementing BUS_DELETE_RESOURCE().
  *
  * This implementation of BUS_DELETE_RESOURCE() uses the
  * resource_list_delete() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
  * edit.
  */
 void
 bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
 {
 	struct resource_list *list = BUS_GET_RESOURCE_LIST(dev, child);
 
 	/* Silently do nothing when the bus keeps no list for this child. */
 	if (list != NULL)
 		resource_list_delete(list, type, rid);
 }
 
 /**
  * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
  *
  * This implementation of BUS_RELEASE_RESOURCE() uses the
  * resource_list_release() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
  */
 int
 bus_generic_rl_release_resource(device_t dev, device_t child, int type,
     int rid, struct resource *r)
 {
 	struct resource_list *list = BUS_GET_RESOURCE_LIST(dev, child);
 
 	/* EINVAL: the bus keeps no resource list for this child. */
 	if (list == NULL)
 		return (EINVAL);
 
 	return (resource_list_release(list, dev, child, type, rid, r));
 }
 
 /**
  * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
  *
  * This implementation of BUS_ALLOC_RESOURCE() uses the
  * resource_list_alloc() function to do most of the work. It calls
  * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
  */
 struct resource *
 bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
     int *rid, u_long start, u_long end, u_long count, u_int flags)
 {
 	struct resource_list *list = BUS_GET_RESOURCE_LIST(dev, child);
 
 	/* No list for this child means nothing can be allocated. */
 	if (list == NULL)
 		return (NULL);
 
 	return (resource_list_alloc(list, dev, child, type, rid, start, end,
 	    count, flags));
 }
 
 /**
  * @brief Helper function for implementing BUS_CHILD_PRESENT().
  *
  * This simple implementation of BUS_CHILD_PRESENT() simply calls the
  * BUS_CHILD_PRESENT() method of the parent of @p dev.
  */
 int
 bus_generic_child_present(device_t dev, device_t child)
 {
 	device_t parent = device_get_parent(dev);
 
 	/* The question passed upward is about dev itself, not about child. */
 	return (BUS_CHILD_PRESENT(parent, dev));
 }
 
 /*
  * Some convenience functions to make it easier for drivers to use the
  * resource-management functions.  All these really do is hide the
  * indirection through the parent's method table, making for slightly
  * less-wordy code.  In the future, it might make sense for this code
  * to maintain some sort of a list of resources allocated by each device.
  */
 
 /**
  * @brief Wrapper function for BUS_ALLOC_RESOURCE().
  *
  * This function simply calls the BUS_ALLOC_RESOURCE() method of the
  * parent of @p dev.
  */
 struct resource *
 bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
     u_long count, u_int flags)
 {
 	device_t parent = dev->parent;
 
 	/* A device without a parent has nobody to allocate from. */
 	if (parent != NULL)
 		return (BUS_ALLOC_RESOURCE(parent, dev, type, rid, start,
 		    end, count, flags));
 	return (NULL);
 }
 
 /**
  * @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
  *
  * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
  * parent of @p dev.
  */
 int
 bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	/* A device without a parent gets EINVAL. */
 	if (parent != NULL)
 		return (BUS_ACTIVATE_RESOURCE(parent, dev, type, rid, r));
 	return (EINVAL);
 }
 
 /**
  * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
  *
  * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
  * parent of @p dev.
  */
 int
 bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	/* A device without a parent gets EINVAL. */
 	if (parent != NULL)
 		return (BUS_DEACTIVATE_RESOURCE(parent, dev, type, rid, r));
 	return (EINVAL);
 }
 
 /**
  * @brief Wrapper function for BUS_RELEASE_RESOURCE().
  *
  * This function simply calls the BUS_RELEASE_RESOURCE() method of the
  * parent of @p dev.
  */
 int
 bus_release_resource(device_t dev, int type, int rid, struct resource *r)
 {
 	device_t parent = dev->parent;
 
 	/* A device without a parent gets EINVAL. */
 	if (parent != NULL)
 		return (BUS_RELEASE_RESOURCE(parent, dev, type, rid, r));
 	return (EINVAL);
 }
 
 /**
  * @brief Wrapper function for BUS_SETUP_INTR().
  *
  * This function simply calls the BUS_SETUP_INTR() method of the
  * parent of @p dev.
  */
 int
 bus_setup_intr(device_t dev, struct resource *r, int flags,
     driver_intr_t handler, void *arg, void **cookiep)
 {
 	int rv;
 
 	if (dev->parent == NULL)
 		return (EINVAL);
 
 	/*
 	 * A request for exactly INTR_TYPE_NET | INTR_MPSAFE (ignoring
 	 * INTR_ENTROPY) loses INTR_MPSAFE while debug_mpsafenet is zero.
 	 */
 	if (!debug_mpsafenet &&
 	    (flags & ~INTR_ENTROPY) == (INTR_TYPE_NET | INTR_MPSAFE))
 		flags &= ~INTR_MPSAFE;
 
 	rv = BUS_SETUP_INTR(dev->parent, dev, r, flags, handler, arg,
 	    cookiep);
 	if (rv != 0 || !bootverbose)
 		return (rv);
 
 	/* Verbose boots report which flags the handler was installed with. */
 	if ((flags & (INTR_MPSAFE | INTR_FAST)) == 0)
 		device_printf(dev, "[GIANT-LOCKED]\n");
 	if (flags & INTR_MPSAFE)
 		device_printf(dev, "[MPSAFE]\n");
 	if (flags & INTR_FAST)
 		device_printf(dev, "[FAST]\n");
 	return (rv);
 }
 
 /**
  * @brief Wrapper function for BUS_TEARDOWN_INTR().
  *
  * This function simply calls the BUS_TEARDOWN_INTR() method of the
  * parent of @p dev.
  */
 int
 bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
 {
 	device_t parent = dev->parent;
 
 	/* A device without a parent gets EINVAL. */
 	if (parent != NULL)
 		return (BUS_TEARDOWN_INTR(parent, dev, r, cookie));
 	return (EINVAL);
 }
 
 /**
  * @brief Wrapper function for BUS_SET_RESOURCE().
  *
  * This function simply calls the BUS_SET_RESOURCE() method of the
  * parent of @p dev.
  */
 int
 bus_set_resource(device_t dev, int type, int rid,
     u_long start, u_long count)
 {
 	device_t parent = device_get_parent(dev);
 
 	/* The parent is used as returned; it is not checked for NULL here. */
 	return (BUS_SET_RESOURCE(parent, dev, type, rid, start, count));
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE().
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev.
  */
 int
 bus_get_resource(device_t dev, int type, int rid,
     u_long *startp, u_long *countp)
 {
 	device_t parent = device_get_parent(dev);
 
 	/* The parent is used as returned; it is not checked for NULL here. */
 	return (BUS_GET_RESOURCE(parent, dev, type, rid, startp, countp));
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE().
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev and returns the start value.
  */
 u_long
 bus_get_resource_start(device_t dev, int type, int rid)
 {
 	u_long start, count;
 
 	/* A failed lookup is reported as 0. */
 	if (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    &start, &count) != 0)
 		return (0);
 	return (start);
 }
 
 /**
  * @brief Wrapper function for BUS_GET_RESOURCE().
  *
  * This function simply calls the BUS_GET_RESOURCE() method of the
  * parent of @p dev and returns the count value.
  */
 u_long
 bus_get_resource_count(device_t dev, int type, int rid)
 {
 	u_long start, count;
 
 	/* A failed lookup is reported as 0. */
 	if (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
 	    &start, &count) != 0)
 		return (0);
 	return (count);
 }
 
 /**
  * @brief Wrapper function for BUS_DELETE_RESOURCE().
  *
  * This function simply calls the BUS_DELETE_RESOURCE() method of the
  * parent of @p dev.
  */
 void
 bus_delete_resource(device_t dev, int type, int rid)
 {
 	device_t parent = device_get_parent(dev);
 
 	/* The parent is used as returned; it is not checked for NULL here. */
 	BUS_DELETE_RESOURCE(parent, dev, type, rid);
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_PRESENT().
  *
  * This function simply calls the BUS_CHILD_PRESENT() method of the
  * parent of @p child.
  */
 int
 bus_child_present(device_t child)
 {
 	device_t parent = device_get_parent(child);
 
 	/* The parent is used as returned; it is not checked for NULL here. */
 	return (BUS_CHILD_PRESENT(parent, child));
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
  *
  * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
  * parent of @p child.
  */
 int
 bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
 {
 	device_t bus = device_get_parent(child);
 
 	if (bus != NULL)
 		return (BUS_CHILD_PNPINFO_STR(bus, child, buf, buflen));
 
 	/* No parent to ask: return an empty string and report success. */
 	buf[0] = '\0';
 	return (0);
 }
 
 /**
  * @brief Wrapper function for BUS_CHILD_LOCATION_STR().
  *
  * This function simply calls the BUS_CHILD_LOCATION_STR() method of the
  * parent of @p child.
  */
 int
 bus_child_location_str(device_t child, char *buf, size_t buflen)
 {
 	device_t bus = device_get_parent(child);
 
 	if (bus != NULL)
 		return (BUS_CHILD_LOCATION_STR(bus, child, buf, buflen));
 
 	/* No parent to ask: return an empty string and report success. */
 	buf[0] = '\0';
 	return (0);
 }
 
 /*
  * Print a child of the root bus: the usual header followed by a bare
  * newline.  Returns the number of characters printed.
  */
 static int
 root_print_child(device_t dev, device_t child)
 {
 	int header_len, newline_len;
 
 	header_len = bus_print_child_header(dev, child);
 	newline_len = printf("\n");
 	return (header_len + newline_len);
 }
 
 static int
 root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg,
     void **cookiep)
 {
 	/*
 	 * No bus below the root claimed this interrupt request, which means
 	 * something bad has happened; do not try to continue.
 	 */
 	panic("root_setup_intr");
 }
 
 /*
  * If we get here, assume that the device is permanent and really is
  * present in the system.  Removable bus drivers are expected to intercept
  * this call long before it gets here.  We return -1 so that drivers that
  * really care can check vs -1 or some ERRNO returned higher in the food
  * chain.
  */
 static int
 root_child_present(device_t dev, device_t child)
 {
 	const int present = -1;
 
 	/* Neither argument is inspected; the answer is always -1. */
 	return (present);
 }
 
 /*
  * Method table for the root bus driver.  Shutdown, suspend and resume
  * use the generic bus helpers; child printing, interrupt setup and the
  * child-present query use the root_* functions defined in this file.
  */
 static kobj_method_t root_methods[] = {
 	/* Device interface */
 	KOBJMETHOD(device_shutdown,	bus_generic_shutdown),
 	KOBJMETHOD(device_suspend,	bus_generic_suspend),
 	KOBJMETHOD(device_resume,	bus_generic_resume),
 
 	/* Bus interface */
 	KOBJMETHOD(bus_print_child,	root_print_child),
 	KOBJMETHOD(bus_read_ivar,	bus_generic_read_ivar),
 	KOBJMETHOD(bus_write_ivar,	bus_generic_write_ivar),
 	KOBJMETHOD(bus_setup_intr,	root_setup_intr),
 	KOBJMETHOD(bus_child_present,	root_child_present),
 
 	/* All-zero entry closing the table. */
 	{ 0, 0 }
 };
 
 /*
  * Driver description for the root bus: its name, its method table and a
  * size of 1.  root_bus_module_handler() compiles this class and
  * initialises root_bus with it.
  */
 static driver_t root_driver = {
 	"root",
 	root_methods,
 	1,			/* no softc */
 };
 
 /*
  * The root device and the "root" devclass.  Both are assigned on
  * MOD_LOAD in root_bus_module_handler().
  */
 device_t	root_bus;
 devclass_t	root_devclass;
 
 /*
  * Module event handler for the root bus.  MOD_LOAD creates and
  * initialises root_bus, MOD_SHUTDOWN shuts it down, and every other
  * event is answered with EOPNOTSUPP.
  */
 static int
 root_bus_module_handler(module_t mod, int what, void* arg)
 {
 	if (what == MOD_LOAD) {
 		TAILQ_INIT(&bus_data_devices);
 		kobj_class_compile((kobj_class_t) &root_driver);
 		root_bus = make_device(NULL, "root", 0);
 		root_bus->desc = "System root bus";
 		kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
 		root_bus->driver = &root_driver;
 		/* The root bus is marked attached without any probe step. */
 		root_bus->state = DS_ATTACHED;
 		root_devclass = devclass_find_internal("root", 0, FALSE);
 		devinit();
 		return (0);
 	}
 
 	if (what == MOD_SHUTDOWN) {
 		device_shutdown(root_bus);
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description tying the name "rootbus" to
  * root_bus_module_handler(); the third field is left as 0.
  */
 static moduledata_t root_bus_mod = {
 	"rootbus",
 	root_bus_module_handler,
 	0
 };
 /* Declared with subsystem SI_SUB_DRIVERS and order SI_ORDER_FIRST. */
 DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
 
 /**
  * @brief Automatically configure devices
  *
  * This function begins the autoconfiguration process by calling
  * device_probe_and_attach() for each child of the @c root0 device.
  */ 
 void
 root_bus_configure(void)
 {
 	device_t kid;
 
 	PDEBUG(("."));
 
 	/* Probe and attach each immediate child of root_bus in list order. */
 	for (kid = TAILQ_FIRST(&root_bus->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		device_probe_and_attach(kid);
 }
 
 /**
  * @brief Module handler for registering device drivers
  *
  * This module handler is used to automatically register device
  * drivers when modules are loaded. If @p what is MOD_LOAD, it calls
  * devclass_add_driver() for the driver described by the
  * driver_module_data structure pointed to by @p arg
  */
 int
 driver_module_handler(module_t mod, int what, void *arg)
 {
 	struct driver_module_data *data;
 	devclass_t busclass;
 	kobj_class_t drv;
 	const char *base;
 	int rv;
 
 	data = (struct driver_module_data *)arg;
 	/* Resolved for every event type, including ones rejected below. */
 	busclass = devclass_find_internal(data->dmd_busname, 0, TRUE);
 
 	if (what == MOD_LOAD) {
 		/*
 		 * NOTE(review): the chained handler's return value is
 		 * overwritten by devclass_add_driver() just below, so a
 		 * failure there is not reported -- confirm this is intended.
 		 */
 		if (data->dmd_chainevh)
 			rv = data->dmd_chainevh(mod, what, data->dmd_chainarg);
 
 		drv = data->dmd_driver;
 		PDEBUG(("Loading module: driver %s on bus %s",
 		    DRIVERNAME(drv), data->dmd_busname));
 		rv = devclass_add_driver(busclass, drv);
 		if (rv != 0)
 			return (rv);
 
 		/*
 		 * If the driver has any base classes, make the
 		 * devclass inherit from the devclass of the driver's
 		 * first base class. This will allow the system to
 		 * search for drivers in both devclasses for children
 		 * of a device using this driver.
 		 */
 		base = drv->baseclasses ? drv->baseclasses[0]->name : 0;
 		*data->dmd_devclass =
 		    devclass_find_internal(drv->name, base, TRUE);
 		return (rv);
 	}
 
 	if (what == MOD_UNLOAD) {
 		PDEBUG(("Unloading module: driver %s from bus %s",
 		    DRIVERNAME(data->dmd_driver),
 		    data->dmd_busname));
 		rv = devclass_delete_driver(busclass, data->dmd_driver);
 
 		/* The chained handler runs only after a successful delete. */
 		if (rv == 0 && data->dmd_chainevh)
 			rv = data->dmd_chainevh(mod, what, data->dmd_chainarg);
 		return (rv);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 #ifdef BUS_DEBUG
 
 /* the _short versions avoid iteration by not calling anything that prints
  * more than oneliners. I love oneliners.
  */
 
 /*
  * Print a one-line summary of dev: unit, description, whether it has a
  * parent, children, ivars and softc, its flag bits and its busy count.
  * A NULL dev prints nothing.
  */
 static void
 print_device_short(device_t dev, int indent)
 {
 	if (dev == NULL)
 		return;
 
 	indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
 	    dev->unit, dev->desc,
 	    (dev->parent? "":"no "),
 	    (TAILQ_EMPTY(&dev->children)? "no ":""),
 	    (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
 	    (dev->flags&DF_FIXEDCLASS? "fixed,":""),
 	    (dev->flags&DF_WILDCARD? "wildcard,":""),
 	    (dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
 	    (dev->ivars? "":"no "),
 	    (dev->softc? "":"no "),
 	    dev->busy));
 }
 
 /*
  * Print dev's summary line, then one-line summaries of its parent,
  * driver and devclass one indent level deeper.  A NULL dev prints
  * nothing.
  */
 static void
 print_device(device_t dev, int indent)
 {
 	const int deeper = indent + 1;
 
 	if (dev == NULL)
 		return;
 
 	print_device_short(dev, indent);
 
 	indentprintf(("Parent:\n"));
 	print_device_short(dev->parent, deeper);
 	indentprintf(("Driver:\n"));
 	print_driver_short(dev->driver, deeper);
 	indentprintf(("Devclass:\n"));
 	print_devclass_short(dev->devclass, deeper);
 }
 
 /* Print the device and all its children (indented), one line each. */
 void
 print_device_tree_short(device_t dev, int indent)
 {
 	device_t kid;
 
 	if (dev == NULL)
 		return;
 
 	print_device_short(dev, indent);
 
 	/* Recurse into each child one indent level deeper. */
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		print_device_tree_short(kid, indent + 1);
 }
 
 /* Print the device and all its children (indented), in full detail. */
 void
 print_device_tree(device_t dev, int indent)
 {
 	device_t kid;
 
 	if (dev == NULL)
 		return;
 
 	print_device(dev, indent);
 
 	/* Recurse into each child one indent level deeper. */
 	for (kid = TAILQ_FIRST(&dev->children); kid != NULL;
 	    kid = TAILQ_NEXT(kid, link))
 		print_device_tree(kid, indent + 1);
 }
 
 /* Print the driver's name and size on one line; NULL prints nothing. */
 static void
 print_driver_short(driver_t *driver, int indent)
 {
 	if (driver == NULL)
 		return;
 
 	indentprintf(("driver %s: softc size = %zd\n",
 	    driver->name, driver->size));
 }
 
 /* Full driver listing; currently identical to the short form. */
 static void
 print_driver(driver_t *driver, int indent)
 {
 	if (driver != NULL)
 		print_driver_short(driver, indent);
 }
 
 
 /* Print every driver linked into the given list, in list order. */
 static void
 print_driver_list(driver_list_t drivers, int indent)
 {
 	driverlink_t dl;
 
 	for (dl = TAILQ_FIRST(&drivers); dl != NULL;
 	    dl = TAILQ_NEXT(dl, link))
 		print_driver(dl->driver, indent);
 }
 
 /* Print the devclass name and maxunit on one line; NULL prints nothing. */
 static void
 print_devclass_short(devclass_t dc, int indent)
 {
 	if (dc == NULL)
 		return;
 
 	indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
 }
 
 /*
  * Print a devclass summary followed by its drivers and every non-NULL
  * entry of its device array.  A NULL devclass prints nothing.
  */
 static void
 print_devclass(devclass_t dc, int indent)
 {
 	int unit;
 
 	if (dc == NULL)
 		return;
 
 	print_devclass_short(dc, indent);
 	indentprintf(("Drivers:\n"));
 	print_driver_list(dc->drivers, indent + 1);
 
 	indentprintf(("Devices:\n"));
 	for (unit = 0; unit < dc->maxunit; unit++) {
 		/* Empty unit slots are skipped. */
 		if (dc->devices[unit] == NULL)
 			continue;
 		print_device(dc->devices[unit], indent + 1);
 	}
 }
 
 /* Print a title line and a one-line summary of every known devclass. */
 void
 print_devclass_list_short(void)
 {
 	devclass_t dc;
 
 	printf("Short listing of devclasses, drivers & devices:\n");
 	for (dc = TAILQ_FIRST(&devclasses); dc != NULL;
 	    dc = TAILQ_NEXT(dc, link))
 		print_devclass_short(dc, 0);
 }
 
 /* Print a title line and the full listing of every known devclass. */
 void
 print_devclass_list(void)
 {
 	devclass_t dc;
 
 	printf("Full listing of devclasses, drivers & devices:\n");
 	for (dc = TAILQ_FIRST(&devclasses); dc != NULL;
 	    dc = TAILQ_NEXT(dc, link))
 		print_devclass(dc, 0);
 }
 
 #endif
 
 /*
  * User-space access to the device tree.
  *
  * We implement a small set of nodes:
  *
  * hw.bus			Single integer read method to obtain the
  *				current generation count.
  * hw.bus.devices		Reads the entire device tree in flat space.
  * hw.bus.rman			Resource manager interface
  *
  * We might like to add the ability to scan devclasses and/or drivers to
  * determine what else is currently loaded/available.
  */
 
 static int
 sysctl_bus(SYSCTL_HANDLER_ARGS)
 {
 	struct u_businfo	ubus;
 
 	ubus.ub_version = BUS_USER_VERSION;
 	ubus.ub_generation = bus_data_generation;
 
 	return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
 }
 SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
     "bus-related data");
 
 /*
  * Handler for the hw.bus.devices node: copy out a struct u_device
  * describing one entry of bus_data_devices.
  *
  * The name must have exactly two components: name[0] is passed to
  * bus_data_generation_check() and name[1] is the zero-based position of
  * the wanted device in the list.  Returns EINVAL for a bad name length
  * or a failed generation check, ENOENT when the list has no entry at
  * that position, and otherwise whatever SYSCTL_OUT() returns.
  */
 static int
 sysctl_devices(SYSCTL_HANDLER_ARGS)
 {
 	int			*name = (int *)arg1;
 	u_int			namelen = arg2;
 	int			index;
 	struct device		*dev;
 	struct u_device		udev;	/* XXX this is a bit big */
 	int			error;
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	if (bus_data_generation_check(name[0]))
 		return (EINVAL);
 
 	index = name[1];
 
 	/*
 	 * Scan the list of devices, looking for the requested index.
 	 * If the loop runs off the end, dev is left NULL.
 	 */
 	TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
 		if (index-- == 0)
 			break;
 	}
 	if (dev == NULL)
 		return (ENOENT);
 
 	/*
 	 * Populate the return array.  The structure is zeroed first so
 	 * that any bytes not explicitly set below are zero when it is
 	 * copied out.
 	 */
+	bzero(&udev, sizeof(udev));
 	udev.dv_handle = (uintptr_t)dev;
 	udev.dv_parent = (uintptr_t)dev->parent;
 	/* Missing name, description or driver name becomes an empty string. */
 	if (dev->nameunit == NULL)
 		udev.dv_name[0] = '\0';
 	else
 		strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
 
 	if (dev->desc == NULL)
 		udev.dv_desc[0] = '\0';
 	else
 		strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
 	if (dev->driver == NULL || dev->driver->name == NULL)
 		udev.dv_drivername[0] = '\0';
 	else
 		strlcpy(udev.dv_drivername, dev->driver->name,
 		    sizeof(udev.dv_drivername));
 	/*
 	 * Start from empty strings and let the parent bus fill them in;
 	 * the return values of both calls are ignored.
 	 */
 	udev.dv_pnpinfo[0] = '\0';
 	udev.dv_location[0] = '\0';
 	bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
 	bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
 	udev.dv_devflags = dev->devflags;
 	udev.dv_flags = dev->flags;
 	udev.dv_state = dev->state;
 	error = SYSCTL_OUT(req, &udev, sizeof(udev));
 	return (error);
 }
 
 SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
     "system device tree");
 
 /*
  * Sysctl interface for scanning the resource lists.
  *
  * We take two input parameters; the index into the list of resource
  * managers, and the resource offset into the list.
  */
 static int
 sysctl_rman(SYSCTL_HANDLER_ARGS)
 {
 	int			*name = (int *)arg1;
 	u_int			namelen = arg2;
 	int			rman_idx, res_idx;
 	struct rman		*rm;
 	struct resource		*res;
 	struct u_rman		urm;
 	struct u_resource	ures;
 	int			error;
 
 	/* Exactly three components: generation, rman index, resource index. */
 	if (namelen != 3)
 		return (EINVAL);
 
 	if (bus_data_generation_check(name[0]))
 		return (EINVAL);
 	rman_idx = name[1];
 	res_idx = name[2];
 
 	/*
 	 * Find the indexed resource manager.  If the loop runs off the
 	 * end of rman_head, rm is left NULL.
 	 */
 	TAILQ_FOREACH(rm, &rman_head, rm_link) {
 		if (rman_idx-- == 0)
 			break;
 	}
 	if (rm == NULL)
 		return (ENOENT);
 
 	/*
 	 * If the resource index is -1, we want details on the
 	 * resource manager.  The structure is zeroed first so that any
 	 * bytes not explicitly set below are zero when it is copied out.
 	 */
 	if (res_idx == -1) {
+		bzero(&urm, sizeof(urm));
 		urm.rm_handle = (uintptr_t)rm;
 		strlcpy(urm.rm_descr, rm->rm_descr, RM_TEXTLEN);
 		urm.rm_start = rm->rm_start;
 		urm.rm_size = rm->rm_end - rm->rm_start + 1;
 		urm.rm_type = rm->rm_type;
 
 		error = SYSCTL_OUT(req, &urm, sizeof(urm));
 		return (error);
 	}
 
 	/*
 	 * Find the indexed resource and return it.  As above, the
 	 * structure is zeroed before it is filled in and copied out.
 	 */
 	TAILQ_FOREACH(res, &rm->rm_list, r_link) {
 		if (res_idx-- == 0) {
+			bzero(&ures, sizeof(ures));
 			ures.r_handle = (uintptr_t)res;
 			ures.r_parent = (uintptr_t)res->r_rm;
 			ures.r_device = (uintptr_t)res->r_dev;
 			/*
 			 * r_devname is "<name><unit>" of the owning device,
 			 * "nomatch" if that device has no name, or empty if
 			 * the resource has no device at all.
 			 */
 			if (res->r_dev != NULL) {
 				if (device_get_name(res->r_dev) != NULL) {
 					snprintf(ures.r_devname, RM_TEXTLEN,
 					    "%s%d",
 					    device_get_name(res->r_dev),
 					    device_get_unit(res->r_dev));
 				} else {
 					strlcpy(ures.r_devname, "nomatch",
 					    RM_TEXTLEN);
 				}
 			} else {
 				ures.r_devname[0] = '\0';
 			}
 			ures.r_start = res->r_start;
 			ures.r_size = res->r_end - res->r_start + 1;
 			ures.r_flags = res->r_flags;
 
 			error = SYSCTL_OUT(req, &ures, sizeof(ures));
 			return (error);
 		}
 	}
 	/* The resource index is past the end of this manager's list. */
 	return (ENOENT);
 }
 
 SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
     "kernel resource manager");
 
 /*
  * Compare a caller-supplied generation number with bus_data_generation.
  * Returns 1 when they differ and 0 when they match.
  */
 int
 bus_data_generation_check(int generation)
 {
 	/* XXX generate optimised lists here (on a match)? */
 	return ((generation != bus_data_generation) ? 1 : 0);
 }
 
 /* Advance bus_data_generation by one. */
 void
 bus_data_generation_update(void)
 {
 	bus_data_generation += 1;
 }
Index: stable/5/sys/kern/vfs_subr.c
===================================================================
--- stable/5/sys/kern/vfs_subr.c	(revision 145952)
+++ stable/5/sys/kern/vfs_subr.c	(revision 145953)
@@ -1,4060 +1,4063 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/fcntl.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/reboot.h>
 #include <sys/sleepqueue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
 
 static void	addalias(struct vnode *vp, struct cdev *nvp_rdev);
 static void	delmntque(struct vnode *vp);
 static void	insmntque(struct vnode *vp, struct mount *mp);
 static void	vclean(struct vnode *vp, int flags, struct thread *td);
 static void	vlruvp(struct vnode *vp);
 static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
 		    int slpflag, int slptimeo, int *errorp);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vx_lock(struct vnode *vp);
 static void	vx_unlock(struct vnode *vp);
 static void	vgonechrl(struct vnode *vp, struct thread *td);
 
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, never decreased.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[9] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT,
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * Minimum number of free vnodes.  If there are fewer than this free vnodes,
  * getnewvnode() will return a newly allocated vnode.
  */
 static u_long wantfreevnodes = 25;
 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 /* Number of vnodes in the free list. */
 static u_long freevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 static int nameileafonly;
 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  */
 int	nfs_mount_type = -1;
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /*
  * For any iteration/modification of dev->si_hlist (linked through
  * v_specnext)
  */
 static struct mtx spechash_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, block devices that are
  * mounted on are delayed only about half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, vnode);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	vp->v_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  It is normally not used in
  * getnewvnode(), as wantfreevnodes is normally nonzero.
  *
  * XXX desiredvnodes is historical cruft and should not exist.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
     &desiredvnodes, 0, "Maximum number of vnodes");
 static int minvnodes;
 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &minvnodes, 0, "Minimum number of vnodes");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 /* Hook for calling soft updates. */
 int (*softdep_process_worklist_hook)(struct mount *);
 
 /*
  * Initialize the vnode management data structures.
  */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	100000
 #endif
 static void
 vntblinit(void *dummy __unused)
 {
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and
 	 * the kernel's heap size.  Specifically, desiredvnodes scales
 	 * in proportion to the physical memory size until two fifths
 	 * of the kernel's heap size is consumed by vnodes and vm
 	 * objects.
 	 */
 	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	/* Clamp to the compile-time ceiling MAXVNODES_MAX. */
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %d -> %d\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	/* minvnodes starts out as a quarter of desiredvnodes. */
 	minvnodes = desiredvnodes / 4;
 	/* Set up the mutexes and the free list used by the vnode code. */
 	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	/* UMA zones backing struct vnode and struct vpollinfo. */
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * Initialize the filesystem syncer.
 	 *
 	 * hashinit() fills in syncer_mask; syncer_maxdelay is then
 	 * recomputed from it as syncer_mask + 1.
 	 * NOTE(review): presumably hashinit() rounds the requested size to
 	 * a power of two -- confirm against hashinit(9).
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 		&syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 }
 /* Run vntblinit() first within the SI_SUB_VFS subsystem. */
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Interlock is not released on failure.
  */
 int
 vfs_busy(mp, flags, interlkp, td)
 	struct mount *mp;
 	int flags;
 	struct mtx *interlkp;
 	struct thread *td;
 {
 	int lkflags;
 
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT)
 			return (ENOENT);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
 		return (ENOENT);
 	}
 	lkflags = LK_SHARED | LK_NOPAUSE;
 	if (interlkp)
 		lkflags |= LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
 		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
  * Free a busy filesystem: release the lock on mp->mnt_lock on behalf
  * of thread td.
  */
 void
 vfs_unbusy(mp, td)
 	struct mount *mp;
 	struct thread *td;
 {
 
 	/* The lockmgr() result is deliberately ignored. */
 	(void)lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 }
 
 /*
  * Lookup a mount point by filesystem identifier.
  *
  * Walks mountlist under mountlist_mtx and returns the first mount whose
  * f_fsid matches both words of *fsid, or NULL if none matches.  The
  * mutex is dropped before returning.
  */
 struct mount *
 vfs_getvfs(fsid)
 	fsid_t *fsid;
 {
 	struct mount *found;
 
 	mtx_lock(&mountlist_mtx);
 	/* TAILQ_FOREACH leaves 'found' NULL when the list is exhausted. */
 	TAILQ_FOREACH(found, &mountlist, mnt_list) {
 		if (found->mnt_stat.f_fsid.val[0] != fsid->val[0])
 			continue;
 		if (found->mnt_stat.f_fsid.val[1] == fsid->val[1])
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (found);
 }
 
 /*
  * Check if a user can access privileged mount options.
  *
  * Returns 0 without further checks when the mount has MNT_USER set and
  * was created with the same uid as the calling thread's credential.
  * Otherwise the result of suser(td) is returned.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	if ((mp->mnt_flag & MNT_USER) != 0 &&
 	    mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 	return (suser(td));
 }
 
 /*
  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  * will be used to create fake device numbers for stat().  Also try (but
  * not so hard) make its val[0] unique mod 2^16, since some emulators only
  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.  Starting
  * the search one past where the previous search terminated is both a
  * micro-optimization and a defense against returning the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(mp)
 	struct mount *mp;
 {
 	static u_int16_t mntid_base;
 	fsid_t candidate;
 	int typenum, typebits, devminor;
 
 	mtx_lock(&mntid_mtx);
 	typenum = mp->mnt_vfc->vfc_typenum;
 	candidate.val[1] = typenum;
 	/* Low byte of the type number goes into bits 24..31. */
 	typebits = (typenum & 0xFF) << 24;
 	do {
 		/*
 		 * High byte of mntid_base lands in bits 16..23, low byte
 		 * in bits 0..7.
 		 */
 		devminor = typebits | ((mntid_base & 0xFF00) << 8) |
 		    (mntid_base & 0xFF);
 		candidate.val[0] = makedev(255, devminor);
 		mntid_base++;
 	} while (vfs_getvfs(&candidate) != NULL);
 	mp->mnt_stat.f_fsid.val[0] = candidate.val[0];
 	mp->mnt_stat.f_fsid.val[1] = candidate.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Get a current timestamp into *tsp, with the precision selected by
  * the timestamp_precision knob.
  */
 void
 vfs_timestamp(tsp)
 	struct timespec *tsp;
 {
 	struct timeval tv;
 	int precision;
 
 	/* Sample the knob once so a single mode is used throughout. */
 	precision = timestamp_precision;
 
 	if (precision == TSP_SEC) {
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 	} else if (precision == TSP_HZ) {
 		getnanotime(tsp);
 	} else if (precision == TSP_USEC) {
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 	} else {
 		/* TSP_NSEC and any other value. */
 		nanotime(tsp);
 	}
 }
 
 /*
  * Reset a vattr structure: va_type becomes VNON, va_vaflags becomes 0
  * and every other field set here becomes VNOVAL.
  */
 void
 vattr_null(vap)
 	register struct vattr *vap;
 {
 
 	/* Fields with their own "unset" value. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Ownership, mode and link count. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 
 	/* Identity. */
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_gen = VNOVAL;
 	vap->va_flags = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free vnodes from the given mount point and will potentially free
  * vnodes that still have VM backing store (VM backing store is
  * typically the cause of a vnode blowout so we want to do this).
  * Therefore, this operation is not considered cheap.
  *
  * At most (mnt_nvnodelistsize / 10 + 1) vnodes are examined per call;
  * the number of vnodes passed to vgonel() is returned.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * The buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
 vlrureclaim(struct mount *mp)
 {
 	struct vnode *vp;
 	int count;
 	int done;
 	int trigger;
 	int usevnodes;
 
 	/*
 	 * Calculate the trigger point, don't allow user
 	 * screwups to blow us up.   This prevents us from
 	 * recycling vnodes with lots of resident pages.  We
 	 * aren't trying to free memory, we are trying to
 	 * free vnodes.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes < 1)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 
 	done = 0;
 	MNT_ILOCK(mp);
 	for (count = mp->mnt_nvnodelistsize / 10 + 1; count != 0; --count) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		if (vp == NULL)
 			break;
 
 		/* Rotate the head to the tail so the scan makes progress. */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 
 		if (vp->v_type == VNON || vp->v_type == VBAD)
 			continue;
 		if (!VI_TRYLOCK(vp))
 			continue;
 
 		if (VMIGHTFREE(vp) &&           /* critical path opt */
 		    (vp->v_object == NULL ||
 		    vp->v_object->resident_page_count < trigger)) {
 			/* Drop the mount interlock around vgonel(). */
 			MNT_IUNLOCK(mp);
 			vgonel(vp, curthread);
 			done++;
 			MNT_ILOCK(mp);
 		} else {
 			VI_UNLOCK(vp);
 		}
 	}
 	MNT_IUNLOCK(mp);
 	return (done);
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Body of the "vnlru" kernel process.  Loops forever: while vnode usage
  * is at or below 90% of desiredvnodes it idles, otherwise it walks the
  * mount list and calls vlrureclaim() on every mount it can busy.
  */
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	int done;
 	struct proc *p = vnlruproc;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);
 
 	mtx_lock(&Giant);
 
 	/* Arrange for kproc_shutdown() to be called on shutdown_pre_sync. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
 	for (;;) {
 		kthread_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
 		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
 			/*
 			 * Nothing to do: clear the request flag, wake
 			 * anyone sleeping on &vnlruproc_sig (getnewvnode()
 			 * sleeps there) and nap for up to hz ticks.
 			 */
 			mtx_unlock(&vnode_free_list_mtx);
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			tsleep(vnlruproc, PVFS, "vlruwt", hz);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			/*
 			 * On failure vfs_busy() leaves mountlist_mtx held,
 			 * so the next pointer can be read directly.
 			 * NOTE(review): on success the mutex appears to be
 			 * released by lockmgr() via LK_INTERLOCK, which is
 			 * why it is re-taken below -- confirm.
 			 */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			done += vlrureclaim(mp);
 			mtx_lock(&mountlist_mtx);
 			/* Fetch the successor before the mount is unbusied. */
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
 				printf("vnlru process getting nowhere..\n");
 			else if (vnlru_nowhere == 5)
 				printf("vnlru process messages stopped.\n");
 #endif
 			/* A full pass freed nothing: count it and back off. */
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		}
 	}
 }
 
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Check to see if a free vnode can be recycled. If it can,
  * recycle it and return 0 with the vnode interlock held.
  *
  * On failure a non-zero errno value is returned (EWOULDBLOCK, EBUSY or
  * EISDIR) and the vnode lock taken here has been released again.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct thread *td = curthread;
 	vm_object_t object;
 	struct mount *vnmp;
 	int error;
 
 	/* Don't recycle if we can't get the interlock */
 	if (!VI_TRYLOCK(vp))
 		return (EWOULDBLOCK);
 	/*
 	 * This vnode may have been found and locked via some other list;
 	 * if so we can't recycle it yet.
 	 */
 	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
 		return (EWOULDBLOCK);
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0, td);
 		return (EBUSY);
 	}
 
 	/*
 	 * Don't recycle if we still have cached pages.
 	 */
 	if (VOP_GETVOBJECT(vp, &object) == 0) {
 		VM_OBJECT_LOCK(object);
 		if (object->resident_page_count ||
 		    object->ref_count) {
 			VM_OBJECT_UNLOCK(object);
 			error = EBUSY;
 			goto done;
 		}
 		VM_OBJECT_UNLOCK(object);
 	}
 	if (LIST_FIRST(&vp->v_cache_src)) {
 		/*
 		 * note: nameileafonly sysctl is temporary,
 		 * for debugging only, and will eventually be
 		 * removed.
 		 */
 		if (nameileafonly > 0) {
 			/*
 			 * Do not reuse namei-cached directory
 			 * vnodes that have cached
 			 * subdirectories.
 			 */
 			if (cache_leaf_test(vp) < 0) {
 				error = EISDIR;
 				goto done;
 			}
 		} else if (nameileafonly < 0 ||
 			    vmiodirenable == 0) {
 			/*
 			 * Do not reuse namei-cached directory
 			 * vnodes if nameileafonly is -1 or
 			 * if VMIO backing for directories is
 			 * turned off (otherwise we reuse them
 			 * too quickly).
 			 */
 			error = EBUSY;
 			goto done;
 		}
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with XLOCK via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
 		VI_UNLOCK(vp);
 		error = EBUSY;
 		goto done;
 	}
 	/* Take the vnode off the free list and mark it doomed. */
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	vp->v_iflag &= ~VI_FREE;
 	mtx_unlock(&vnode_free_list_mtx);
 	vp->v_iflag |= VI_DOOMED;
 	if (vp->v_type != VBAD) {
 		VOP_UNLOCK(vp, 0, td);
 		/*
 		 * The interlock is re-taken after vgonel() so that it is
 		 * held on return.
 		 * NOTE(review): presumably vgonel() drops it -- confirm.
 		 */
 		vgonel(vp, td);
 		VI_LOCK(vp);
 	} else
 		VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(vnmp);
 	return (0);
 done:
 	/* Common failure exit: undo vn_lock() and vn_start_write(). */
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(vnmp);
 	return (error);
 }
 
 /*
  * Return the next vnode from the free list, or a freshly allocated one.
  *
  * tag is stored in v_tag and used as the lock name, vops becomes v_op,
  * and when mp is non-NULL the vnode is put on that mount's vnode list.
  * The new vnode is stored in *vpp with v_usecount set to 1.  Always
  * returns 0; the function may sleep while too many vnodes are in use.
  */
 int
 getnewvnode(tag, mp, vops, vpp)
 	const char *tag;
 	struct mount *mp;
 	vop_t **vops;
 	struct vnode **vpp;
 {
 	struct vnode *vp = NULL;
 	struct vpollinfo *pollinfo = NULL;
 
 	mtx_lock(&vnode_free_list_mtx);
 
 	/*
 	 * Try to reuse vnodes if we hit the max.  This situation only
 	 * occurs in certain large-memory (2G+) situations.  We cannot
 	 * attempt to directly reclaim vnodes due to nasty recursion
 	 * problems.
 	 */
 	while (numvnodes - freevnodes > desiredvnodes) {
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
 		mtx_lock(&vnode_free_list_mtx);
 	}
 
 	/*
 	 * Attempt to reuse a vnode already on the free list, allocating
 	 * a new vnode if we can't find one or if we have not reached a
 	 * good minimum for good LRU performance.
 	 */
 
 	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
 		int error;
 		int count;
 
 		for (count = 0; count < freevnodes; count++) {
 			vp = TAILQ_FIRST(&vnode_free_list);
 
 			KASSERT(vp->v_usecount == 0 &&
 			    (vp->v_iflag & VI_DOINGINACT) == 0,
 			    ("getnewvnode: free vnode isn't"));
 
 			/*
 			 * Rotate the candidate to the tail so that a failed
 			 * attempt moves on to a different vnode next time.
 			 */
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			/* vtryrecycle() takes the free list mutex itself. */
 			mtx_unlock(&vnode_free_list_mtx);
 			error = vtryrecycle(vp);
 			mtx_lock(&vnode_free_list_mtx);
 			if (error == 0)
 				break;
 			vp = NULL;
 		}
 	}
 	if (vp) {
 		/*
 		 * Recycled vnode; vtryrecycle() returned it with the
 		 * interlock held.
 		 */
 		freevnodes--;
 		mtx_unlock(&vnode_free_list_mtx);
 
 #ifdef INVARIANTS
 		{
 			if (vp->v_data)
 				panic("cleaned vnode isn't");
 			if (vp->v_numoutput)
 				panic("Clean vnode has pending I/O's");
 			if (vp->v_writecount != 0)
 				panic("Non-zero write count");
 		}
 #endif
 		if ((pollinfo = vp->v_pollinfo) != NULL) {
 			/*
 			 * To avoid lock order reversals, the call to
 			 * uma_zfree() must be delayed until the vnode
 			 * interlock is released.
 			 */
 			vp->v_pollinfo = NULL;
 		}
 #ifdef MAC
 		mac_destroy_vnode(vp);
 #endif
 		/* Scrub per-use state left over from the previous owner. */
 		vp->v_iflag = 0;
 		vp->v_vflag = 0;
 		vp->v_lastw = 0;
 		vp->v_lasta = 0;
 		vp->v_cstart = 0;
 		vp->v_clen = 0;
 		vp->v_socket = 0;
 		lockdestroy(vp->v_vnlock);
 		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
 		KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
 		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
 		KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
 		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
 	} else {
 		/* Nothing reusable: allocate and initialize a new vnode. */
 		numvnodes++;
 		mtx_unlock(&vnode_free_list_mtx);
 
 		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 		/*
 		 * Take the interlock so both paths reach the common code
 		 * below with it held.
 		 */
 		VI_LOCK(vp);
 		vp->v_dd = vp;
 		vp->v_vnlock = &vp->v_lock;
 		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
 		cache_purge(vp);		/* Sets up v_id. */
 		LIST_INIT(&vp->v_cache_src);
 		TAILQ_INIT(&vp->v_cache_dst);
 	}
 
 	/* Common initialization for recycled and new vnodes. */
 	TAILQ_INIT(&vp->v_cleanblkhd);
 	TAILQ_INIT(&vp->v_dirtyblkhd);
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	*vpp = vp;
 	vp->v_usecount = 1;
 	vp->v_data = 0;
 	vp->v_cachedid = -1;
 	VI_UNLOCK(vp);
 	/* Deferred release of the old poll info, now that the interlock is dropped. */
 	if (pollinfo != NULL) {
 		knlist_destroy(&pollinfo->vpi_selinfo.si_note);
 		mtx_destroy(&pollinfo->vpi_lock);
 		uma_zfree(vnodepoll_zone, pollinfo);
 	}
 #ifdef MAC
 	mac_init_vnode(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_associate_vnode_singlelabel(mp, vp);
 #endif
 	/* Detach from any previous mount, then attach to the new one. */
 	delmntque(vp);
 	if (mp != NULL) {
 		insmntque(vp, mp);
 		vp->v_bsize = mp->mnt_stat.f_iosize;
 	}
 
 	return (0);
 }
 
 /*
  * Delete from old mount point vnode list, if on one.  A vnode whose
  * v_mount is NULL is left untouched.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *oldmp;
 
 	oldmp = vp->v_mount;
 	if (oldmp == NULL)
 		return;
 
 	/* The list and its size counter are updated under the mount interlock. */
 	MNT_ILOCK(oldmp);
 	vp->v_mount = NULL;
 	KASSERT(oldmp->mnt_nvnodelistsize > 0,
 		("bad mount point vnode list size"));
 	TAILQ_REMOVE(&oldmp->mnt_nvnodelist, vp, v_nmntvnodes);
 	oldmp->mnt_nvnodelistsize -= 1;
 	MNT_IUNLOCK(oldmp);
 }
 
 /*
  * Insert into list of vnodes for the new mount point.  The vnode is
  * appended to the tail of mp's vnode list and the list size is bumped,
  * both under the mount interlock.  mp must not be NULL.
  */
 static void
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	vp->v_mount = mp;
 	KASSERT(mp != NULL, ("Don't call insmntque(foo, NULL)"));
 
 	/* vp->v_mount was just set to mp, so lock through mp directly. */
 	MNT_ILOCK(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	mp->mnt_nvnodelistsize += 1;
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Update outstanding I/O count and do wakeup if requested.
  */
 void
 vwakeup(bp)
 	register struct buf *bp;
 {
 	register struct vnode *vp;
 
 	bp->b_flags &= ~B_WRITEINPROG;
 	if ((vp = bp->b_vp)) {
 		VI_LOCK(vp);
 		vp->v_numoutput--;
 		if (vp->v_numoutput < 0)
 			panic("vwakeup: neg numoutput");
 		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
 			vp->v_iflag &= ~VI_BWAIT;
 			wakeup(&vp->v_numoutput);
 		}
 		VI_UNLOCK(vp);
 	}
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  *
  * With V_SAVE in flags, pending output is waited for and dirty buffers
  * are written with VOP_FSYNC() before invalidation.  slpflag and
  * slptimeo are passed to the sleeps taken while waiting.  Returns 0 on
  * success, or the error from msleep(), VOP_FSYNC() or flushbuflist().
  * Requires Giant and a locked vnode (asserted below).
  */
 int
 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
 	struct vnode *vp;
 	int flags;
 	struct ucred *cred;
 	struct thread *td;
 	int slpflag, slptimeo;
 {
 	struct buf *blist;
 	int error;
 	vm_object_t object;
 
 	GIANT_REQUIRED;
 
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 
 	VI_LOCK(vp);
 	if (flags & V_SAVE) {
 		/* Wait for writes already in flight; vwakeup() wakes us. */
 		while (vp->v_numoutput) {
 			vp->v_iflag |= VI_BWAIT;
 			error = msleep(&vp->v_numoutput, VI_MTX(vp),
 			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
 			if (error) {
 				VI_UNLOCK(vp);
 				return (error);
 			}
 		}
 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 			VI_UNLOCK(vp);
 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			VI_LOCK(vp);
 			if (vp->v_numoutput > 0 ||
 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 *
 	 * Each pass retries from the list heads for as long as
 	 * flushbuflist() reports that it handled at least one buffer.
 	 */
 	for (error = 0;;) {
 		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
 			if (error)
 				break;
 			continue;
 		}
 		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
 			if (error)
 				break;
 			continue;
 		}
 		break;
 	}
 	if (error) {
 		VI_UNLOCK(vp);
 		return (error);
 	}
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		while (vp->v_numoutput > 0) {
 			vp->v_iflag |= VI_BWAIT;
 			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
 		}
 		VI_UNLOCK(vp);
 		if (VOP_GETVOBJECT(vp, &object) == 0) {
 			VM_OBJECT_LOCK(object);
 			vm_object_pip_wait(object, "vnvlbx");
 			VM_OBJECT_UNLOCK(object);
 		}
 		VI_LOCK(vp);
 	} while (vp->v_numoutput > 0);
 	VI_UNLOCK(vp);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (VOP_GETVOBJECT(vp, &object) == 0) {
 		VM_OBJECT_LOCK(object);
 		vm_object_page_remove(object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 		VM_OBJECT_UNLOCK(object);
 	}
 
 #ifdef INVARIANTS
 	/* With neither V_ALT nor V_NORMAL, both lists must now be empty. */
 	VI_LOCK(vp);
 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
 	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
 		panic("vinvalbuf: flush failed");
 	VI_UNLOCK(vp);
 #endif
 	return (0);
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * Walks the list starting at blist, skipping buffers excluded by the
  * V_NORMAL / V_ALT filter in flags.  Entered and left with the vnode
  * interlock held (asserted on entry, re-taken on every exit path).
  * Returns the number of buffers that passed the filter; a lock error
  * other than ENOLCK is stored in *errorp.
  */
 static int
 flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
 	struct buf *blist;
 	int flags;
 	struct vnode *vp;
 	int slpflag, slptimeo;
 	int *errorp;
 {
 	struct buf *bp, *nbp;
 	int found, error;
 
 	ASSERT_VI_LOCKED(vp, "flushbuflist");
 
 	for (found = 0, bp = blist; bp; bp = nbp) {
 		nbp = TAILQ_NEXT(bp, b_vnbufs);
 		/*
 		 * V_NORMAL skips BX_ALTDATA buffers; V_ALT skips
 		 * everything else.
 		 */
 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 			continue;
 		}
 		found += 1;
 		/*
 		 * NOTE(review): LK_INTERLOCK presumably releases the vnode
 		 * interlock here, matching the VI_LOCK() calls below --
 		 * confirm.
 		 */
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			/* ENOLCK is not reported; the caller just retries. */
 			if (error != ENOLCK)
 				*errorp = error;
 			goto done;
 		}
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.  Note that vfs_bio_awrite expects
 		 * buffers to reside on a queue, while bwrite and
 		 * brelse do not.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 			(flags & V_SAVE)) {
 
 			if (bp->b_vp == vp) {
 				if (bp->b_flags & B_CLUSTEROK) {
 					vfs_bio_awrite(bp);
 				} else {
 					bremfree(bp);
 					bp->b_flags |= B_ASYNC;
 					bwrite(bp);
 				}
 			} else {
 				bremfree(bp);
 				(void) bwrite(bp);
 			}
 			goto done;
 		}
 		/* Invalidate the buffer and release it. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		VI_LOCK(vp);
 	}
 	return (found);
 done:
 	/* Early exit: re-take the interlock the caller expects to hold. */
 	VI_LOCK(vp);
 	return (found);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  */
 int
 vtruncbuf(vp, cred, td, length, blksize)
 	register struct vnode *vp;
 	struct ucred *cred;
 	struct thread *td;
 	off_t length;
 	int blksize;
 {
 	register struct buf *bp;
 	struct buf *nbp;
 	int anyfreed;
 	int trunclbn;
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 restart:
 	VI_LOCK(vp);
 	anyfreed = 1;
 	for (;anyfreed;) {
 		anyfreed = 0;
 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp,
 				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 				    VI_MTX(vp)) == ENOLCK)
 					goto restart;
 
 				bremfree(bp);
 				bp->b_flags |= (B_INVAL | B_RELBUF);
 				bp->b_flags &= ~B_ASYNC;
 				brelse(bp);
 				anyfreed = 1;
 
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI))) {
 					goto restart;
 				}
 				VI_LOCK(vp);
 			}
 		}
 
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp,
 				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 				    VI_MTX(vp)) == ENOLCK)
 					goto restart;
 				bremfree(bp);
 				bp->b_flags |= (B_INVAL | B_RELBUF);
 				bp->b_flags &= ~B_ASYNC;
 				brelse(bp);
 				anyfreed = 1;
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI) == 0)) {
 					goto restart;
 				}
 				VI_LOCK(vp);
 			}
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK) {
 				goto restart;
 			}
 			KASSERT((bp->b_flags & B_DELWRI),
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			VI_LOCK(vp);
 			goto restartsync;
 		}
 	}
 
 	while (vp->v_numoutput > 0) {
 		vp->v_iflag |= VI_BWAIT;
 		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
 	}
 	VI_UNLOCK(vp);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * buf_splay() - splay tree core for the clean/dirty list of buffers in
  * 		 a vnode.
  *
  *	NOTE: We have to deal with the special case of a background bitmap
  *	buffer, a situation where two buffers will have the same logical
  *	block offset.  We want (1) only the foreground buffer to be accessed
  *	in a lookup and (2) must differentiate between the foreground and
  *	background buffer in the splay tree algorithm because the splay
  *	tree cannot normally handle multiple entities with the same 'index'.
  *	We accomplish this by adding differentiating flags to the splay tree's
  *	numerical domain.
  *
  *	Keys are ordered by (lblkno, xflags & BX_BKGRDMARKER).  The tree
  *	rooted at 'root' is restructured in place through the b_left and
  *	b_right links and the new root is returned: the node with the
  *	requested key if it is present, otherwise the last node visited on
  *	the search path.  Returns NULL only when 'root' is NULL.
  */
 static
 struct buf *
 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 {
 	/*
 	 * 'dummy' is a stack placeholder whose b_right/b_left links collect
 	 * the heads of the left and right trees built during the descent.
 	 */
 	struct buf dummy;
 	struct buf *lefttreemax, *righttreemin, *y;
 
 	if (root == NULL)
 		return (NULL);
 	lefttreemax = righttreemin = &dummy;
 	for (;;) {
 		/* Key sorts before the current root: descend left. */
 		if (lblkno < root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_left) == NULL)
 				break;
 			if (lblkno < y->b_lblkno) {
 				/* Rotate right. */
 				root->b_left = y->b_right;
 				y->b_right = root;
 				root = y;
 				if ((y = root->b_left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->b_left = root;
 			righttreemin = root;
 		/* Key sorts after the current root: descend right. */
 		} else if (lblkno > root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_right) == NULL)
 				break;
 			if (lblkno > y->b_lblkno) {
 				/* Rotate left. */
 				root->b_right = y->b_left;
 				y->b_left = root;
 				root = y;
 				if ((y = root->b_right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->b_right = root;
 			lefttreemax = root;
 		} else {
 			/* Exact match on both block number and marker flag. */
 			break;
 		}
 		root = y;
 	}
 	/* Assemble the new root. */
 	lefttreemax->b_right = root->b_left;
 	righttreemin->b_left = root->b_right;
 	root->b_left = dummy.b_right;
 	root->b_right = dummy.b_left;
 	return (root);
 }
 
 /*
  * Unlink a buffer from whichever per-vnode buffer list it is on.
  *
  * The buffer is splayed to the root of the matching tree (dirty when
  * BX_VNDIRTY is set, clean otherwise), spliced out of the tree, removed
  * from the corresponding tailq, and the list's buffer count is dropped.
  * Both list flags are cleared on exit.  The vnode interlock must be held.
  */
 static
 void
 buf_vlist_remove(struct buf *bp)
 {
 	struct vnode *vp;
 	struct buf *newroot;
 
 	vp = bp->b_vp;
 	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
 	if ((bp->b_xflags & BX_VNDIRTY) != 0) {
 		/* Bring bp to the root unless it is already there. */
 		if (vp->v_dirtyblkroot != bp) {
 			newroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 			    vp->v_dirtyblkroot);
 			KASSERT(newroot == bp,
 			    ("splay lookup failed during dirty remove"));
 		}
 		/*
 		 * Join the two subtrees: splaying the left subtree for bp's
 		 * key yields a root with no right child, which then adopts
 		 * bp's right subtree.
 		 */
 		if (bp->b_left != NULL) {
 			newroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 			    bp->b_left);
 			newroot->b_right = bp->b_right;
 		} else {
 			newroot = bp->b_right;
 		}
 		vp->v_dirtyblkroot = newroot;
 		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
 		vp->v_dirtybufcnt--;
 	} else {
 		/* BX_VNCLEAN is assumed here; it is not asserted. */
 		if (vp->v_cleanblkroot != bp) {
 			newroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 			    vp->v_cleanblkroot);
 			KASSERT(newroot == bp,
 			    ("splay lookup failed during clean remove"));
 		}
 		if (bp->b_left != NULL) {
 			newroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 			    bp->b_left);
 			newroot->b_right = bp->b_right;
 		} else {
 			newroot = bp->b_right;
 		}
 		vp->v_cleanblkroot = newroot;
 		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
 		vp->v_cleanbufcnt--;
 	}
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Insert a buffer into the vnode's clean or dirty list, selected by
  * xflags (BX_VNDIRTY picks the dirty list, anything else the clean one).
  *
  * The target splay tree is splayed for the buffer's key; the buffer then
  * becomes the new root and is linked into the sorted tailq next to the
  * old root.  The list's buffer count is incremented.  The vnode interlock
  * must be held.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static
 void
 buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
 {
 	struct buf *oldroot;
 
 	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
 	bp->b_xflags |= xflags;
 	if ((xflags & BX_VNDIRTY) != 0) {
 		oldroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 		    vp->v_dirtyblkroot);
 		if (oldroot == NULL) {
 			/* Empty tree: bp is the only node. */
 			bp->b_left = NULL;
 			bp->b_right = NULL;
 			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
 		} else if (bp->b_lblkno > oldroot->b_lblkno ||
 		    (bp->b_lblkno == oldroot->b_lblkno &&
 		    (bp->b_xflags & BX_BKGRDMARKER) >=
 		    (oldroot->b_xflags & BX_BKGRDMARKER))) {
 			/* bp does not sort before the old root. */
 			bp->b_right = oldroot->b_right;
 			bp->b_left = oldroot;
 			oldroot->b_right = NULL;
 			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
 			    oldroot, bp, b_vnbufs);
 		} else {
 			/* bp sorts before the old root. */
 			bp->b_left = oldroot->b_left;
 			bp->b_right = oldroot;
 			oldroot->b_left = NULL;
 			TAILQ_INSERT_BEFORE(oldroot, bp, b_vnbufs);
 		}
 		vp->v_dirtybufcnt++;
 		vp->v_dirtyblkroot = bp;
 	} else {
 		/* BX_VNCLEAN is assumed here; it is not asserted. */
 		oldroot = buf_splay(bp->b_lblkno, bp->b_xflags,
 		    vp->v_cleanblkroot);
 		if (oldroot == NULL) {
 			bp->b_left = NULL;
 			bp->b_right = NULL;
 			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 		} else if (bp->b_lblkno > oldroot->b_lblkno ||
 		    (bp->b_lblkno == oldroot->b_lblkno &&
 		    (bp->b_xflags & BX_BKGRDMARKER) >=
 		    (oldroot->b_xflags & BX_BKGRDMARKER))) {
 			bp->b_right = oldroot->b_right;
 			bp->b_left = oldroot;
 			oldroot->b_right = NULL;
 			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
 			    oldroot, bp, b_vnbufs);
 		} else {
 			bp->b_left = oldroot->b_left;
 			bp->b_right = oldroot;
 			oldroot->b_left = NULL;
 			TAILQ_INSERT_BEFORE(oldroot, bp, b_vnbufs);
 		}
 		vp->v_cleanbufcnt++;
 		vp->v_cleanblkroot = bp;
 	}
 }
 
 /*
  * Find the buffer for logical block lblkno of vp via the splay trees.
  * Buffers carrying BX_BKGRDMARKER (the shadow buffers used in background
  * bitmap writes) are never returned.  Returns NULL when no matching
  * buffer is on either list.
  *
  * Two sorted lists are maintained and the caller does not say which one
  * holds the block, so both may have to be searched.
  *
  * During a "make buildworld" the desired buffer is found at one of
  * the roots more than 60% of the time.  Thus, checking both roots
  * before performing either splay eliminates unnecessary splays on the
  * first tree splayed.
  *
  * Requires Giant and the vnode interlock.
  */
 struct buf *
 gbincore(struct vnode *vp, daddr_t lblkno)
 {
 	struct buf *cand;
 
 	GIANT_REQUIRED;
 
 	ASSERT_VI_LOCKED(vp, "gbincore");
 
 	/* Cheap checks first: is the block already at either root? */
 	cand = vp->v_cleanblkroot;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 	cand = vp->v_dirtyblkroot;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 
 	/* Otherwise splay the clean tree, then the dirty tree. */
 	if (vp->v_cleanblkroot != NULL) {
 		cand = buf_splay(lblkno, 0, vp->v_cleanblkroot);
 		vp->v_cleanblkroot = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 	if (vp->v_dirtyblkroot != NULL) {
 		cand = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
 		vp->v_dirtyblkroot = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Associate a buffer with a vnode.
  *
  * The buffer must not already belong to a vnode or sit on a vnode buffer
  * list.  A hold reference is taken on the vnode and the buffer is placed
  * on the vnode's clean list.  The vnode interlock must be held.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("bgetvp: bp already attached! %p", bp));
 	ASSERT_VI_LOCKED(vp, "bgetvp");
 
 	vholdl(vp);
 	bp->b_vp = vp;
 	bp->b_dev = vn_todev(vp);
 
 	/* New buffers always start out on the clean list. */
 	buf_vlist_add(bp, vp, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  *
  * Takes the vnode interlock, removes the buffer from the vnode's clean
  * or dirty list if it is on one, takes the vnode off the syncer worklist
  * when its dirty list has become empty, drops the hold reference and
  * clears the buffer's vnode and object pointers.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct vnode *vp;
 
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	vp = bp->b_vp;
 	VI_LOCK(vp);
 
 	/* Delete from old vnode list, if on one. */
 	if ((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != 0)
 		buf_vlist_remove(bp);
 
 	/* No dirty buffers left: the syncer no longer needs this vnode. */
 	if ((vp->v_iflag & VI_ONWORKLST) != 0 &&
 	    TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 		vp->v_iflag &= ~VI_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(vp, v_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	vdropl(vp);
 	bp->b_vp = NULL;
 	bp->b_object = NULL;
 	VI_UNLOCK(vp);
 }
 
 /*
  * Add an item to the syncer work queue.
  *
  * Queues vp `delay' slots ahead of the slot the syncer is currently
  * working on (delay is clamped to syncer_maxdelay - 2).  A vnode that is
  * already queued is moved rather than counted twice.  The vnode interlock
  * must be held; sync_mtx is taken internally.
  */
 static void
 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 {
 	int slot;
 
 	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
 
 	mtx_lock(&sync_mtx);
 	if ((vp->v_iflag & VI_ONWORKLST) == 0) {
 		/* First time on the worklist: mark it and count it. */
 		vp->v_iflag |= VI_ONWORKLST;
 		syncer_worklist_len++;
 	} else {
 		/* Already queued: unlink so it can be re-inserted below. */
 		LIST_REMOVE(vp, v_synclist);
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 /*
  * Sysctl handler for vfs.worklist_len: reports the syncer worklist
  * length with the syncer vnodes (sync_vnode_count) subtracted out.
  * The two counters are sampled together under sync_mtx.
  */
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 	return (SYSCTL_OUT(req, &len, sizeof(len)));
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 /*
  * The syncer kernel process: `updateproc' receives its struct proc, and
  * the kproc descriptor below has kproc_start() launch sched_sync() under
  * the name "syncer" at SI_SUB_KTHREAD_UPDATE time.
  */
 struct  proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Body of the "syncer" kernel process; it never returns.  Each pass of
  * the outer loop advances syncer_delayno by (at least) one worklist slot,
  * fsyncs (MNT_LAZY) every vnode queued on the slot just left behind, runs
  * the soft updates hook if one is registered, and then either continues
  * immediately (rushjob pending), or sleeps.
  *
  * A shutdown_pre_sync event handler (syncer_shutdown) moves the daemon
  * out of SYNCER_RUNNING; from then on empty slots are skipped, progress
  * is printed on the console, and once the worklist holds only syncer
  * vnodes for a full wrap the daemon enters SYNCER_FINAL_DELAY, runs
  * SYNCER_SHUTDOWN_SPEEDUP more iterations and then parks itself in
  * kthread_suspend_check().
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next;
 	struct synclist *slp;
 	struct vnode *vp;
 	struct mount *mp;
 	long starttime;
 	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
 	static int dummychan;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int first_printf;
 
 	mtx_lock(&Giant);
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_second;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	for (;;) {
 		mtx_lock(&sync_mtx);
 		/* Final-delay iterations exhausted: allow suspension. */
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kthread_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		/* Worklist length not counting the syncer vnodes. */
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		/* While shutting down, report progress when the second changes. */
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_second) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_second;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since it was
 			 * emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		while ((vp = LIST_FIRST(slp)) != NULL) {
 			/*
 			 * A vnode that is locked, or whose filesystem will
 			 * not admit a write right now, is pushed to the
 			 * next slot instead of being waited for.
 			 */
 			if (VOP_ISLOCKED(vp, NULL) != 0 ||
 			    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 				LIST_REMOVE(vp, v_synclist);
 				LIST_INSERT_HEAD(next, vp, v_synclist);
 				continue;
 			}
 			/* Likewise if the interlock is contended. */
 			if (VI_TRYLOCK(vp) == 0) {
 				LIST_REMOVE(vp, v_synclist);
 				LIST_INSERT_HEAD(next, vp, v_synclist);
 				vn_finished_write(mp);
 				continue;
 			}
 			/*
 			 * We use vhold in case the vnode does not
 			 * successfully sync.  vhold prevents the vnode from
 			 * going away when we unlock the sync_mtx so that
 			 * we can acquire the vnode interlock.
 			 */
 			vholdl(vp);
 			mtx_unlock(&sync_mtx);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
 			(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
 			VOP_UNLOCK(vp, 0, td);
 			vn_finished_write(mp);
 			VI_LOCK(vp);
 			if ((vp->v_iflag & VI_ONWORKLST) != 0) {
 				/*
 				 * Put us back on the worklist.  The worklist
 				 * routine will remove us from our current
 				 * position and then add us back in at a later
 				 * position.
 				 */
 				vn_syncer_add_to_worklist(vp, syncdelay);
 			}
 			vdropl(vp);
 			VI_UNLOCK(vp);
 			mtx_lock(&sync_mtx);
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		mtx_unlock(&sync_mtx);
 
 		/*
 		 * Do soft update processing.
 		 */
 		if (softdep_process_worklist_hook != NULL)
 			(*softdep_process_worklist_hook)(NULL);
 
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		mtx_lock(&sync_mtx);
 		if (rushjob > 0) {
 			rushjob -= 1;
 			mtx_unlock(&sync_mtx);
 			continue;
 		}
 		mtx_unlock(&sync_mtx);
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING)
 			tsleep(&dummychan, PPAUSE, "syncfnl",
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_second == starttime)
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Request the syncer daemon to speed up its work.
  * We never push it to speed up more than half of its
  * normal turn time, otherwise it could take over the cpu.
  *
  * The syncer thread is first kicked out of its sleep on lbolt.  Returns
  * 1 if a rush job was queued, 0 if the rushjob limit (syncdelay / 2) had
  * already been reached.
  */
 int
 speedup_syncer(void)
 {
 	struct thread *syncer_td;
 	int queued;
 
 	syncer_td = FIRST_THREAD_IN_PROC(updateproc);
 	sleepq_remove(syncer_td, &lbolt);
 
 	queued = 0;
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob += 1;
 		stat_rush_requests += 1;
 		queued = 1;
 	}
 	mtx_unlock(&sync_mtx);
 	return (queued);
 }
 
 /*
  * Tell the syncer to speed up its work and run though its work
  * list several times, then tell it to shut down.
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 
 	if (howto & RB_NOSYNC)
 		return;
 	td = FIRST_THREAD_IN_PROC(updateproc);
 	sleepq_remove(td, &lbolt);
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
 	kproc_shutdown(arg, howto);
 }
 
 /*
  * Associate a p-buffer with a vnode.
  *
  * Also sets B_PAGING flag to indicate that vnode is not fully associated
  * with the buffer.  i.e. the bp has not been linked into the vnode or
  * ref-counted.  The buffer picks up the vnode's VM object and device.
  */
 void
 pbgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
 
 	bp->b_flags |= B_PAGING;
 	bp->b_vp = vp;
 	bp->b_object = vp->v_object;
 	bp->b_dev = vn_todev(vp);
 }
 
 /*
  * Disassociate a p-buffer from a vnode.
  *
  * Clears the vnode and object pointers and the B_PAGING flag set by
  * pbgetvp().  Panics if the buffer appears to be linked on a vnode
  * buffer list, which a paging buffer never should be.
  */
 void
 pbrelvp(struct buf *bp)
 {
 	struct vnode *vp;
 
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 	vp = bp->b_vp;
 
 	/* XXX REMOVE ME */
 	VI_LOCK(vp);
 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL)
 		panic("relpbuf(): b_vp was probably reassignbuf()d %p %x",
 		    bp, (int)bp->b_flags);
 	VI_UNLOCK(vp);
 
 	bp->b_vp = NULL;
 	bp->b_object = NULL;
 	bp->b_flags &= ~B_PAGING;
 }
 
 /*
  * Move a buffer onto the list of its vnode that matches its current
  * state: the dirty list when B_DELWRI is set, the clean list otherwise.
  *
  * When the buffer goes dirty and the vnode is not yet on the syncer
  * worklist, the vnode is queued with a delay chosen by vnode type.  When
  * the buffer goes clean and that leaves the dirty list empty, the vnode
  * is taken off the syncer worklist.  Panics for B_PAGING buffers.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	int delay;
 
 	vp = bp->b_vp;
 	reassignbufcalls++;
 
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if ((bp->b_flags & B_PAGING) != 0)
 		panic("cannot reassign paging buffer");
 
 	VI_LOCK(vp);
 
 	/* Delete from old vnode list, if on one. */
 	if ((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != 0)
 		buf_vlist_remove(bp);
 
 	if ((bp->b_flags & B_DELWRI) != 0) {
 		/* Dirty: make sure the syncer knows about the vnode. */
 		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
 			if (vp->v_type == VDIR)
 				delay = dirdelay;
 			else if (vp->v_type == VCHR)
 				delay = metadelay;
 			else
 				delay = filedelay;
 			vn_syncer_add_to_worklist(vp, delay);
 		}
 		buf_vlist_add(bp, vp, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, vp, BX_VNCLEAN);
 
 		/* Nothing dirty remains: drop off the syncer worklist. */
 		if ((vp->v_iflag & VI_ONWORKLST) != 0 &&
 		    TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(vp, v_synclist);
 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			vp->v_iflag &= ~VI_ONWORKLST;
 		}
 	}
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a vnode for a device.
  * Used for mounting the root filesystem.
  */
 int
 bdevvp(dev, vpp)
 	struct cdev *dev;
 	struct vnode **vpp;
 {
 	register struct vnode *vp;
 	struct vnode *nvp;
 	int error;
 
 	if (dev == NULL) {
 		*vpp = NULLVP;
 		return (ENXIO);
 	}
 	if (vfinddev(dev, vpp))
 		return (0);
 
 	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
 	if (error) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	vp = nvp;
 	vp->v_type = VCHR;
 	vp->v_bsize = DEV_BSIZE;
 	addalias(vp, dev);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Adjust a vnode's use count by delta (which may be negative).  For a
  * character-device vnode with a device attached, the device's aggregate
  * si_usecount is adjusted by the same amount under spechash_mtx.
  */
 static void
 v_incr_usecount(struct vnode *vp, int delta)
 {
 
 	vp->v_usecount += delta;
 	if (vp->v_type != VCHR || vp->v_rdev == NULL)
 		return;
 
 	mtx_lock(&spechash_mtx);
 	vp->v_rdev->si_usecount += delta;
 	mtx_unlock(&spechash_mtx);
 }
 
 /*
  * Add vnode to the alias list hung off the struct cdev *.
  *
  * The reason for this gunk is that multiple vnodes can reference
  * the same physical device, so checking vp->v_usecount to see
  * how many users there are is inadequate; the v_usecount for
  * the vnodes need to be accumulated.  vcount() does that.
  *
  * Returns the vnode the caller should use from now on: normally nvp
  * itself, but when an existing vnode for the device with no filesystem
  * data (v_data == NULL) is found, that vnode takes over nvp's private
  * data, tag, lock parameters, operations vector and mount, nvp is
  * released and destroyed, and the existing vnode is returned instead.
  * VBLK vnodes and devices unknown to findcdev() are returned unchanged;
  * any type other than VBLK/VCHR panics.
  */
 struct vnode *
 addaliasu(nvp, nvp_rdev)
 	struct vnode *nvp;
 	dev_t nvp_rdev;
 {
 	struct vnode *ovp;
 	vop_t **ops;
 	struct cdev *dev;
 
 	if (nvp->v_type == VBLK)
 		return (nvp);
 	if (nvp->v_type != VCHR)
 		panic("addaliasu on non-special vnode");
 	dev = findcdev(nvp_rdev);
 	if (dev == NULL)
 		return (nvp);
 	/*
 	 * Check to see if we have a bdevvp vnode with no associated
 	 * filesystem. If so, we want to associate the filesystem of
 	 * the newly instantiated vnode with the bdevvp vnode and
 	 * discard the newly created vnode rather than leaving the
 	 * bdevvp vnode lying around with no associated filesystem.
 	 */
 	if (vfinddev(dev, &ovp) == 0 || ovp->v_data != NULL) {
 		addalias(nvp, dev);
 		return (nvp);
 	}
 	/*
 	 * Discard unneeded vnode, but save its node specific data.
 	 * Note that if there is a lock, it is carried over in the
 	 * node specific data to the replacement vnode.
 	 */
 	vref(ovp);
 	ovp->v_data = nvp->v_data;
 	ovp->v_tag = nvp->v_tag;
 	nvp->v_data = NULL;
 	/* Re-create ovp's lock with the parameters of nvp's lock. */
 	lockdestroy(ovp->v_vnlock);
 	lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
 	    nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
 	/* Swap the operations vectors; nvp gets ovp's old one below. */
 	ops = ovp->v_op;
 	ovp->v_op = nvp->v_op;
 	/* If nvp was locked, move the lock over to ovp. */
 	if (VOP_ISLOCKED(nvp, curthread)) {
 		VOP_UNLOCK(nvp, 0, curthread);
 		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
 	}
 	nvp->v_op = ops;
 	/* Move ovp onto nvp's mount, then dispose of nvp. */
 	delmntque(ovp);
 	insmntque(ovp, nvp->v_mount);
 	vrele(nvp);
 	vgone(nvp);
 	return (ovp);
 }
 
 /*
  * Local helper that does the same job as addaliasu(), but for a
  * struct cdev * instead of a dev_t: takes a reference on the device,
  * points the vnode at it, links the vnode onto the device's alias list
  * and folds the vnode's use count into the device's si_usecount.
  */
 static void
 addalias(struct vnode *nvp, struct cdev *dev)
 {
 
 	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
 
 	dev_ref(dev);
 	nvp->v_rdev = dev;
 
 	VI_LOCK(nvp);
 	mtx_lock(&spechash_mtx);
 	dev->si_usecount += nvp->v_usecount;
 	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
 	mtx_unlock(&spechash_mtx);
 	VI_UNLOCK(nvp);
 }
 
 /*
  * Acquire a use reference on a vnode and, if requested, lock it.
  *
  * If LK_INTERLOCK is set in flags the caller already holds the vnode
  * interlock; otherwise it is taken here.  The interlock is not held on
  * return in either case.
  *
  * A vnode that another thread is cleaning out (VI_XLOCK set, and
  * v_vxthread is not the current thread) cannot be obtained: with
  * LK_NOWAIT the call fails at once with EBUSY, otherwise it sleeps until
  * woken and then fails with ENOENT to indicate that the vnode is no
  * longer usable.
  *
  * When flags contains a lock type (LK_TYPE_MASK) the vnode lock is
  * acquired via vn_lock(); if that fails the use reference is backed out
  * and the vn_lock() error is returned.  Returns 0 on success.
  */
 int
 vget(vp, flags, td)
 	register struct vnode *vp;
 	int flags;
 	struct thread *td;
 {
 	int error;
 
 	/*
 	 * If the vnode is in the process of being cleaned out for
 	 * another use, we wait for the cleaning to finish and then
 	 * return failure. Cleaning is determined by checking that
 	 * the VI_XLOCK flag is set.
 	 */
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
 		if ((flags & LK_NOWAIT) == 0) {
 			vp->v_iflag |= VI_XWANT;
 			/* PDROP: the interlock is not reacquired on wakeup. */
 			msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
 			return (ENOENT);
 		}
 		VI_UNLOCK(vp);
 		return (EBUSY);
 	}
 
 	v_incr_usecount(vp, 1);
 
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	if (flags & LK_TYPE_MASK) {
 		/* LK_INTERLOCK passes the held interlock on to vn_lock(). */
 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
 			/*
 			 * must expand vrele here because we do not want
 			 * to call VOP_INACTIVE if the reference count
 			 * drops back to zero since it was never really
 			 * active. We must remove it from the free list
 			 * before sleeping so that multiple processes do
 			 * not try to recycle it.
 			 */
 			VI_LOCK(vp);
 			v_incr_usecount(vp, -1);
 			if (VSHOULDFREE(vp))
 				vfree(vp);
 			else
 				vlruvp(vp);
 			VI_UNLOCK(vp);
 		}
 		return (error);
 	}
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Increase the reference count of a vnode.
  *
  * Takes the vnode interlock around the increment, so the caller must
  * not already hold it.
  */
 void
 vref(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	v_incr_usecount(vp, 1);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Return reference count of a vnode.
  *
  * The count is sampled under the vnode interlock, but the lock is
  * dropped before returning.  The result is therefore only guaranteed
  * when some mechanism other than the VI lock stops other processes from
  * gaining references to the vnode -- for instance when the caller holds
  * the only reference.  It is also useful when stale data is acceptable
  * because races are accounted for by some other means.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 	int count;
 
 	VI_LOCK(vp);
 	count = vp->v_usecount;
 	VI_UNLOCK(vp);
 	return (count);
 }
 
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  *
  * Drops one use reference on an unlocked vnode.  When this is the last
  * reference (and VOP_INACTIVE is not already in progress), the vnode is
  * locked, VOP_INACTIVE is called, and the vnode is then handed to
  * vfree() or vlruvp().  Panics if the use count is already below one.
  * Requires Giant.
  */
 void
 vrele(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	GIANT_REQUIRED;
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 
 	VI_LOCK(vp);
 
 	/* Skip this v_writecount check if we're going to panic below. */
 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
 	    ("vrele: missed vn_close"));
 
 	/*
 	 * Not the last reference, or the last one while VOP_INACTIVE is
 	 * already running: just drop the count.
 	 */
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		v_incr_usecount(vp, -1);
 		VI_UNLOCK(vp);
 
 		return;
 	}
 
 	if (vp->v_usecount == 1) {
 		v_incr_usecount(vp, -1);
 		/*
 		 * We must call VOP_INACTIVE with the node locked. Mark
 		 * as VI_DOINGINACT to avoid recursion.
 		 */
 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
 			VI_LOCK(vp);
 			vp->v_iflag |= VI_DOINGINACT;
 			VI_UNLOCK(vp);
 			VOP_INACTIVE(vp, td);
 			VI_LOCK(vp);
 			KASSERT(vp->v_iflag & VI_DOINGINACT,
 			    ("vrele: lost VI_DOINGINACT"));
 			vp->v_iflag &= ~VI_DOINGINACT;
 		} else
 			/* Lock failed: skip VOP_INACTIVE, retake the interlock. */
 			VI_LOCK(vp);
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		else
 			vlruvp(vp);
 		VI_UNLOCK(vp);
 
 	} else {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
 		VI_UNLOCK(vp);
 		panic("vrele: negative ref cnt");
 	}
 }
 
 /*
  * Release an already locked vnode.  This gives the same effect as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-acquiring the lock (as vrele() acquires the lock internally.)
  *
  * When the last use reference is dropped, VOP_INACTIVE is called with
  * the vnode still locked and the vnode is then handed to vfree() or
  * vlruvp().  Panics if the use count is already below one.  Requires
  * Giant.
  */
 void
 vput(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	GIANT_REQUIRED;
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 	VI_LOCK(vp);
 	/* Skip this v_writecount check if we're going to panic below. */
 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
 	    ("vput: missed vn_close"));
 
 	/*
 	 * Not the last reference, or the last one while VOP_INACTIVE is
 	 * already running: drop the count and unlock the vnode.
 	 */
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		v_incr_usecount(vp, -1);
 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
 		return;
 	}
 
 	if (vp->v_usecount == 1) {
 		v_incr_usecount(vp, -1);
 		/*
 		 * We must call VOP_INACTIVE with the node locked, so
 		 * we just need to release the vnode mutex. Mark as
 		 * VI_DOINGINACT to avoid recursion.
 		 */
 		vp->v_iflag |= VI_DOINGINACT;
 		VI_UNLOCK(vp);
 		VOP_INACTIVE(vp, td);
 		VI_LOCK(vp);
 		KASSERT(vp->v_iflag & VI_DOINGINACT,
 		    ("vput: lost VI_DOINGINACT"));
 		vp->v_iflag &= ~VI_DOINGINACT;
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		else
 			vlruvp(vp);
 		VI_UNLOCK(vp);
 
 	} else {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 }
 
 /*
  * Somebody doesn't want the vnode recycled.
  *
  * Locked wrapper around vholdl(): takes the vnode interlock, bumps the
  * hold count and releases the interlock again.
  */
 void
 vhold(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Take a hold reference on a vnode whose interlock the caller is
  * expected to hold already (vhold() is the variant that takes it).
  * The vnode is passed to vbusy() when VSHOULDBUSY() says so.
  */
 void
 vholdl(struct vnode *vp)
 {
 
 	vp->v_holdcnt++;
 	if (VSHOULDBUSY(vp)) {
 		vbusy(vp);
 	}
 }
 
 /*
  * Note that there is one less who cares about this vnode.  vdrop() is the
  * opposite of vhold().
  *
  * Locked wrapper around vdropl(): takes the vnode interlock, drops the
  * hold count and releases the interlock again.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vdropl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Drop a hold reference on a vnode whose interlock the caller is
  * expected to hold already (vdrop() is the variant that takes it).
  * Panics if there is no hold reference to drop.  Afterwards the vnode
  * is handed to vfree() when VSHOULDFREE() says so, else to vlruvp().
  */
 void
 vdropl(struct vnode *vp)
 {
 
 	if (vp->v_holdcnt <= 0) {
 		panic("vdrop: holdcnt");
 	}
 	vp->v_holdcnt--;
 	if (VSHOULDFREE(vp)) {
 		vfree(vp);
 	} else {
 		vlruvp(vp);
 	}
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  *
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error from
  * VFS_ROOT() if the root vnode could not be obtained.
  */
 #ifdef DIAGNOSTIC
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 int
 vflush(mp, rootrefs, flags, td)
 	struct mount *mp;
 	int rootrefs;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp, *nvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, &rootvp, td)) != 0)
 			return (error);
 		vput(rootvp);
 
 	}
 	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH(vp, mp, nvp) {
 
 		/*
 		 * Trade the mount interlock for the vnode lock.  If the
 		 * vnode cannot be locked, rescan the list from the start.
 		 */
 		VI_LOCK(vp);
 		MNT_IUNLOCK(mp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 		if (error) {
 			MNT_ILOCK(mp);
 			goto loop;
 		}
 		/*
 		 * Skip over vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 			VI_LOCK(vp);
 
 			/*
 			 * Leave alone anything that is still linked (or is
 			 * VNON) and is not a regular file open for writing.
 			 */
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, LK_INTERLOCK, td);
 				MNT_ILOCK(mp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 
 		/* From here on only the vnode interlock is held. */
 		VOP_UNLOCK(vp, 0, td);
 
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 */
 		if (vp->v_usecount == 0) {
 			vgonel(vp, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 
 		/*
 		 * If FORCECLOSE is set, forcibly close the vnode. For block
 		 * or character devices, revert to an anonymous device. For
 		 * all other files, just kill them.
 		 */
 		if (flags & FORCECLOSE) {
 			if (vp->v_type != VCHR)
 				vgonel(vp, td);
 			else
 				vgonechrl(vp, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 #ifdef DIAGNOSTIC
 		if (busyprt)
 			vprint("vflush: busy vnode", vp);
 #endif
 		/* Still in use and not forced: count it as busy. */
 		VI_UNLOCK(vp);
 		MNT_ILOCK(mp);
 		busy++;
 	}
 	MNT_IUNLOCK(mp);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		KASSERT(rootvp->v_usecount >= rootrefs,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			vgonel(rootvp, td);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Success: release the caller's rootrefs references on the root. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * This moves a now (likely recyclable) vnode to the end of the
  * mountlist.  XXX However, it is temporarily disabled until we
  * can clean up ffs_sync() and friends, which have loop restart
  * conditions which this code causes to operate O(N^2).
  *
  * With the body compiled out (#if 0) this function is currently a
  * no-op; callers still invoke it so that re-enabling it needs no
  * changes elsewhere.
  */
 static void
 vlruvp(struct vnode *vp)
 {
 #if 0
 	struct mount *mp;
 
 	if ((mp = vp->v_mount) != NULL) {
 		MNT_ILOCK(mp);
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		MNT_IUNLOCK(mp);
 	}
 #endif
 }
 
 /*
  * Mark a vnode as being cleaned out by the current thread: sets
  * VI_XLOCK and records curthread in v_vxthread.  Panics if VI_XLOCK is
  * already set.  The vnode interlock must be held.
  */
 static void
 vx_lock(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, "vx_lock");
 
 	if ((vp->v_iflag & VI_XLOCK) != 0) {
 		panic("vclean: deadlock");
 	}
 
 	/*
 	 * Prevent the vnode from being recycled or brought into use while we
 	 * clean it out.
 	 */
 	vp->v_iflag |= VI_XLOCK;
 	vp->v_vxthread = curthread;
 }
 
 /*
  * Release the claim taken by vx_lock(): clear VI_XLOCK and v_vxthread,
  * and wake up anybody who flagged VI_XWANT while waiting on the vnode.
  * The caller must hold the vnode interlock.
  */
 static void
 vx_unlock(struct vnode *vp)
 {
 	int waiters;
 
 	ASSERT_VI_LOCKED(vp, "vx_unlock");
 	waiters = (vp->v_iflag & VI_XWANT) != 0;
 	vp->v_iflag &= ~(VI_XLOCK | VI_XWANT);
 	vp->v_vxthread = NULL;
 	if (waiters)
 		wakeup(vp);
 }
 
 /*
  * Disassociate the underlying filesystem from a vnode.
  *
  * vp:    vnode to clean; its interlock must be held on entry.
  * flags: if DOCLOSE is set, the vnode's buffers are flushed (or tossed)
  *        and an active vnode is closed with VOP_CLOSE().
  * td:    thread on whose behalf the VOPs are issued.
  *
  * The interlock is handed to VOP_LOCK() via LK_INTERLOCK, and the
  * function returns with the interlock held again (re-taken just before
  * the final bookkeeping and not released here).  On return the vnode
  * uses the dead vnode operations and the tag "none".
  */
 static void
 vclean(vp, flags, td)
 	struct vnode *vp;
 	int flags;
 	struct thread *td;
 {
 	int active;
 
 	ASSERT_VI_LOCKED(vp, "vclean");
 	/*
 	 * Check to see if the vnode is in use. If so we have to reference it
 	 * before we clean it out so that its count cannot fall to zero and
 	 * generate a race against ourselves to recycle it.
 	 */
 	if ((active = vp->v_usecount))
 		v_incr_usecount(vp, 1);
 
 	/*
 	 * Even if the count is zero, the VOP_INACTIVE routine may still
 	 * have the object locked while it cleans it out. The VOP_LOCK
 	 * ensures that the VOP_INACTIVE routine is done with its work.
 	 * For active vnodes, it ensures that no other activity can
 	 * occur while the underlying object is being cleaned out.
 	 */
 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	if (flags & DOCLOSE) {
 		struct buf *bp;
 		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
 		/* Only wait for write suspension if there is dirty data. */
 		if (bp != NULL)
 			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
 		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
 			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
 	}
 
 	VOP_DESTROYVOBJECT(vp);
 
 	/*
 	 * Any other processes trying to obtain this lock must first
 	 * wait for VXLOCK to clear, then call the new lock operation.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed. Note that the
 	 * VOP_INACTIVE will unlock the vnode.
 	 */
 	if (active) {
 		if (flags & DOCLOSE)
 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 		VI_LOCK(vp);
 		/* Skip VOP_INACTIVE if it is already marked in progress. */
 		if ((vp->v_iflag & VI_DOINGINACT) == 0) {
 			vp->v_iflag |= VI_DOINGINACT;
 			VI_UNLOCK(vp);
 			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
 				panic("vclean: cannot relock.");
 			VOP_INACTIVE(vp, td);
 			VI_LOCK(vp);
 			KASSERT(vp->v_iflag & VI_DOINGINACT,
 			    ("vclean: lost VI_DOINGINACT"));
 			vp->v_iflag &= ~VI_DOINGINACT;
 		}
 		VI_UNLOCK(vp);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vclean: cannot reclaim");
 
 	if (active) {
 		/*
 		 * Inline copy of vrele() since VOP_INACTIVE
 		 * has already been called.
 		 */
 		VI_LOCK(vp);
 		v_incr_usecount(vp, -1);
 		if (vp->v_usecount <= 0) {
 #ifdef INVARIANTS
 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 				vprint("vclean: bad ref count", vp);
 				panic("vclean: ref cnt");
 			}
 #endif
 			if (VSHOULDFREE(vp))
 				vfree(vp);
 		}
 		VI_UNLOCK(vp);
 	}
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge(vp);
 	/* The interlock taken here is still held when we return. */
 	VI_LOCK(vp);
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 
 	/*
 	 * Done with purge, reset to the standard lock and
 	 * notify sleepers of the grim news.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = dead_vnodeop_p;
 	if (vp->v_pollinfo != NULL)
 		vn_pollgone(vp);
 	vp->v_tag = "none";
 }
 
 /*
  * Revoke a character device vnode: tear down the requested vnode and
  * every other vnode hanging off the same device's alias list.
  *
  * Only REVOKEALL requests on VCHR vnodes are supported (asserted).
  * Always returns 0.
  */
 int
 vop_revoke(ap)
 	struct vop_revoke_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *target, *alias;
 	struct cdev *cdp;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
 	target = ap->a_vp;
 	KASSERT((target->v_type == VCHR), ("vop_revoke: not VCHR"));
 
 	VI_LOCK(target);
 	if ((target->v_iflag & VI_XLOCK) != 0) {
 		/*
 		 * Somebody else is already running vgone/vclean on this
 		 * vnode; sleep until they finish (PDROP releases the
 		 * interlock) and report success.
 		 */
 		target->v_iflag |= VI_XWANT;
 		msleep(target, VI_MTX(target), PINOD | PDROP,
 		    "vop_revokeall", 0);
 		return (0);
 	}
 	VI_UNLOCK(target);
 
 	/*
 	 * Keep destroying the first vnode on the device's alias list until
 	 * the list is empty.  The hash mutex only covers the list peek.
 	 */
 	cdp = target->v_rdev;
 	do {
 		mtx_lock(&spechash_mtx);
 		alias = SLIST_FIRST(&cdp->si_hlist);
 		mtx_unlock(&spechash_mtx);
 		if (alias != NULL)
 			vgone(alias);
 	} while (alias != NULL);
 	return (0);
 }
 
 /*
  * Recycle a vnode if nobody is using it.
  *
  * Returns 1 when the vnode had a zero use count and was handed to
  * vgonel(), 0 when it was still in use and left untouched.  The
  * second argument is a placeholder and must be NULL.
  */
 int
 vrecycle(struct vnode *vp, void *dummyarg, struct thread *td)
 {
 
 	KASSERT(dummyarg == NULL,
 	    ("vrecycle with non-dummy arg %p", dummyarg));
 	VI_LOCK(vp);
 	if (vp->v_usecount != 0) {
 		/* Still referenced: leave it alone. */
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	/* vgonel() is entered with the interlock held. */
 	vgonel(vp, td);
 	return (1);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  */
 void
 vgone(vp)
 	register struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	VI_LOCK(vp);
 	vgonel(vp, td);
 }
 
 /*
  * Disassociate a character device from its underlying filesystem and
  * attach it to spec.  This is for use when the chr device is still active
  * and the filesystem is going away.
  *
  * Called with the vnode interlock held; the interlock is released before
  * returning.
  */
 static void
 vgonechrl(struct vnode *vp, struct thread *td)
 {
 	ASSERT_VI_LOCKED(vp, "vgonechrl");
 	vx_lock(vp);
 	/*
 	 * This is a custom version of vclean() which does not tear down
 	 * the bufs or vm objects held by this vnode.  This allows filesystems
 	 * to continue using devices which were discovered via another
 	 * filesystem that has been unmounted.
 	 */
 	if (vp->v_usecount != 0) {
 		/* Hold a reference of our own while the vnode is re-homed. */
 		v_incr_usecount(vp, 1);
 		/*
 		 * Ensure that no other activity can occur while the
 		 * underlying object is being cleaned out.
 		 */
 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 		/*
 		 * Any other processes trying to obtain this lock must first
 		 * wait for VXLOCK to clear, then call the new lock operation.
 		 */
 		VOP_UNLOCK(vp, 0, td);
 		vp->v_vnlock = &vp->v_lock;
 		vp->v_tag = "orphanchr";
 		vp->v_op = spec_vnodeop_p;
 		delmntque(vp);
 		cache_purge(vp);
 		vrele(vp);
 		VI_LOCK(vp);
 	} else
 		vclean(vp, 0, td);
 	/*
 	 * Both paths end up on the spec vnode operations; for the active
 	 * path this repeats the assignment already made above.
 	 */
 	vp->v_op = spec_vnodeop_p;
 	vx_unlock(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * vp: vnode to destroy; its interlock must be held on entry and is
  *     released before returning (by msleep() with PDROP on the early
  *     return path, by VI_UNLOCK() otherwise).
  * td: thread passed through to vclean().
  *
  * On the normal path the vnode ends up with type VBAD.
  */
 void
 vgonel(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	/*
 	 * If a vgone (or vclean) is already in progress,
 	 * wait until it is done and return.
 	 */
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	if (vp->v_iflag & VI_XLOCK) {
 		vp->v_iflag |= VI_XWANT;
 		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
 		return;
 	}
 	vx_lock(vp);
 
 	/*
 	 * Clean out the filesystem specific data.
 	 */
 	vclean(vp, DOCLOSE, td);
 	VI_UNLOCK(vp);
 
 	/*
 	 * If special device, remove it from special device alias list
 	 * if it is on one.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		mtx_lock(&spechash_mtx);
 		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
 		/* Give back this vnode's share of the device use count. */
 		vp->v_rdev->si_usecount -= vp->v_usecount;
 		mtx_unlock(&spechash_mtx);
 		dev_rel(vp->v_rdev);
 		vp->v_rdev = NULL;
 	}
 
 	/*
 	 * If it is on the freelist and not already at the head,
 	 * move it to the head of the list. The test of the
 	 * VDOOMED flag and the reference count of zero is because
 	 * it will be removed from the free list by getnewvnode,
 	 * but will not have its reference count incremented until
 	 * after calling vgone. If the reference count were
 	 * incremented first, vgone would (incorrectly) try to
 	 * close the previous instance of the underlying object.
 	 */
 	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
 		mtx_lock(&vnode_free_list_mtx);
 		if (vp->v_iflag & VI_FREE) {
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		} else {
 			/* Not on the list yet: account for the new entry. */
 			vp->v_iflag |= VI_FREE;
 			freevnodes++;
 		}
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 		mtx_unlock(&vnode_free_list_mtx);
 	}
 
 	vp->v_type = VBAD;
 	vx_unlock(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Lookup a vnode by device number.
  */
 int
 vfinddev(dev, vpp)
 	struct cdev *dev;
 	struct vnode **vpp;
 {
 	struct vnode *vp;
 
 	mtx_lock(&spechash_mtx);
 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
 		*vpp = vp;
 		mtx_unlock(&spechash_mtx);
 		return (1);
 	}
 	mtx_unlock(&spechash_mtx);
 	return (0);
 }
 
 /*
  * Return the total number of references to the special device behind
  * a vnode, i.e. the si_usecount of vp->v_rdev, sampled under
  * spechash_mtx.
  */
 int
 vcount(vp)
 	struct vnode *vp;
 {
 	int refs;
 
 	mtx_lock(&spechash_mtx);
 	refs = vp->v_rdev->si_usecount;
 	mtx_unlock(&spechash_mtx);
 
 	return (refs);
 }
 
 /*
  * Same as vcount(), but taking the struct cdev * directly as the
  * argument: return dev->si_usecount, sampled under spechash_mtx.
  */
 int
 count_dev(dev)
 	struct cdev *dev;
 {
 	int refs;
 
 	mtx_lock(&spechash_mtx);
 	refs = dev->si_usecount;
 	mtx_unlock(&spechash_mtx);
 
 	return (refs);
 }
 
 /*
  * Print out a description of a vnode.
  *
  * typename[] maps an enum vtype value to its printable name.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 
 /*
  * label: optional prefix for the line, may be NULL.
  * vp:    vnode to describe.
  *
  * Prints the tag, type, counters, the set flags (as a '|'-separated
  * list), the state of the vnode lock and, when the vnode carries
  * filesystem data, whatever VOP_PRINT() adds.
  */
 void
 vprint(label, vp)
 	char *label;
 	struct vnode *vp;
 {
 	char flagbuf[96];
 
 	if (label == NULL)
 		printf("%p: ", (void *)vp);
 	else
 		printf("%s: %p: ", label, (void *)vp);
 	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
 	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
 	    vp->v_writecount, vp->v_holdcnt);
 
 	/*
 	 * Append "|NAME" for every flag that is set; the leading '|' of
 	 * the first entry is skipped when the list is printed.
 	 */
 #define	VPRINT_FLAG(field, bit) do {					\
 	if ((vp->field & (bit)) != 0)					\
 		strcat(flagbuf, "|" #bit);				\
 } while (0)
 	flagbuf[0] = '\0';
 	VPRINT_FLAG(v_vflag, VV_ROOT);
 	VPRINT_FLAG(v_vflag, VV_TEXT);
 	VPRINT_FLAG(v_vflag, VV_SYSTEM);
 	VPRINT_FLAG(v_iflag, VI_XLOCK);
 	VPRINT_FLAG(v_iflag, VI_XWANT);
 	VPRINT_FLAG(v_iflag, VI_BWAIT);
 	VPRINT_FLAG(v_iflag, VI_DOOMED);
 	VPRINT_FLAG(v_iflag, VI_FREE);
 	VPRINT_FLAG(v_vflag, VV_OBJBUF);
 #undef VPRINT_FLAG
 	if (flagbuf[0] != '\0')
 		printf(" flags (%s),", flagbuf + 1);
 
 	lockmgr_printinfo(vp->v_vnlock);
 	printf("\n");
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 #include <ddb/ddb.h>
 /*
  * DDB "show lockedvnods": walk every mount point and print each vnode
  * whose vnode lock is currently held.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * We are inside the debugger, so none of the usual locks are
 	 * taken.  The lists may therefore be caught in an inconsistent
 	 * state and a bad pointer may be followed; there is nothing we
 	 * can do about that here.
 	 */
 	printf("Locked vnodes\n");
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (VOP_ISLOCKED(vp, NULL) == 0)
 				continue;
 			vprint(NULL, vp);
 		}
 	}
 }
 #endif
 
 /*
  * Export a kernel struct vfsconf as the userland-visible struct
  * xvfsconf.  Only the fields assigned below are written; the caller
  * owns the initial contents of *xvfsp.
  */
 static void
 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 {
 
 	/*
 	 * The pointer members are meaningless to userland; they are kept
 	 * in the structure, and cleared, purely for binary compatibility.
 	 */
 	xvfsp->vfc_next = NULL;
 	xvfsp->vfc_vfsops = NULL;
 
 	xvfsp->vfc_flags = vfsp->vfc_flags;
 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 }
 
 /*
  * Top level filesystem related information gathering.
  *
  * Sysctl handler for vfs.conflist: copies out one struct xvfsconf per
  * entry on the vfsconf list.  Returns 0, or the first error reported
  * by SYSCTL_OUT().
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 	int error;
 
 	error = 0;
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		/*
 		 * Clear the on-stack record first: vfsconf2x() only assigns
 		 * individual fields and strcpy()s the name, so anything it
 		 * does not write would otherwise be copied out as-is.
 		 */
+		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 /*
  * Handler for the deprecated vfs.generic sysctl node.
  *
  * Logs a warning on every call, then answers VFS_MAXTYPENUM and
  * VFS_CONF requests.  A request with no further name components is
  * forwarded to sysctl_ovfs_conf().  Returns ENOTDIR for a name of the
  * wrong length and EOPNOTSUPP for unknown requests or filesystem types.
  */
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	/*
 	 * Step back one slot so that name[1] is the first component
 	 * handed to us; namelen is adjusted to match.
 	 */
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 
 	printf("WARNING: userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		/* Find the filesystem whose type number is name[2]. */
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		/*
 		 * Clear the on-stack record so that nothing vfsconf2x()
 		 * leaves unwritten is copied out.
 		 */
+		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 	}
 	return (EOPNOTSUPP);
 }
 
 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
 	"Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 /*
  * Old-style filesystem list: copies out one struct ovfsconf per entry
  * on the vfsconf list.  Returns 0, or the first error reported by
  * SYSCTL_OUT().
  */
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		/*
 		 * Clear the on-stack record so that bytes not assigned
 		 * below (e.g. past the end of the name string) are not
 		 * copied out uninitialized.
 		 */
+		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  *
  * Without an output buffer the handler only reports a size estimate.
  * Otherwise it fills a temporary array with one struct xvnode per vnode
  * found on the mount lists and copies the filled part out.  Returns 0
  * or an error from sysctl_wire_old_buffer()/SYSCTL_OUT().
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct thread *td = req->td;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, n, nmax;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	/*
 	 * len is a byte count while n indexes whole entries, so the array
 	 * bound has to be expressed in entries.  Comparing n against len
 	 * would let the loops below write past the end of xvn[].
 	 */
 	nmax = len / sizeof *xvn;
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		/* Mount points that cannot be busied right now are skipped. */
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (n == nmax)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(id);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			/*
 			 * Type specific part.  An entry that is abandoned
 			 * with "continue" is not counted, so its slot is
 			 * reused for the next vnode.
 			 */
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				xvn[n].xv_dev = vp->v_cachedfs;
 				xvn[n].xv_ino = vp->v_cachedid;
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp, td);
 		if (n == nmax)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* Only the entries actually filled in are copied out. */
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,xvnode", "");
 #endif
 
 /*
  * Check to see if a filesystem is mounted on a block device.
  * Returns EBUSY when the device behind vp has a mount point recorded
  * in si_mountpoint, 0 otherwise.
  */
 int
 vfs_mountedon(vp)
 	struct vnode *vp;
 {
 
 	return (vp->v_rdev->si_mountpoint != NULL ? EBUSY : 0);
 }
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  */
 void
 vfs_unmountall()
 {
 	struct mount *mp;
 	struct thread *td;
 	int error;
 
 	if (curthread != NULL)
 		td = curthread;
 	else
 		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while(!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		error = dounmount(mp, MNT_FORCE, td);
 		if (error) {
 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
 			printf("unmount of %s failed (",
 			    mp->mnt_stat.f_mntonname);
 			if (error == EBUSY)
 				printf("BUSY)\n");
 			else
 				printf("%d)\n", error);
 		} else {
 			/* The unmount has removed mp from the mountlist */
 		}
 	}
 }
 
 /*
  * Perform msync on all vnodes under a mount point.
  * The mount point must be locked.
  *
  * mp:    mount point whose vnode list is walked.
  * flags: MNT_WAIT requests a synchronous page clean (OBJPC_SYNC) and
  *        also processes vnodes whose lock is held; any other value
  *        cleans with OBJPC_NOSYNC and skips locked vnodes.
  *
  * Giant must be held.  The walk is restarted from the head when the
  * list is seen to have changed underneath us; the "tries" counter
  * limits the number of restarts.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
 	int tries;
 
 	GIANT_REQUIRED;
 
 	tries = 5;
 	MNT_ILOCK(mp);
 loop:
 	TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
 		/* The vnode moved to another mount: restart or give up. */
 		if (vp->v_mount != mp) {
 			if (--tries > 0)
 				goto loop;
 			break;
 		}
 
 		VI_LOCK(vp);
 		/* Skip vnodes that are being cleaned out. */
 		if (vp->v_iflag & VI_XLOCK) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 
 		if ((vp->v_iflag & VI_OBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/*
 			 * The mount interlock is dropped across vget();
 			 * the vnode interlock is handed to vget() via
 			 * LK_INTERLOCK.
 			 */
 			MNT_IUNLOCK(mp);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					MNT_ILOCK(mp);
 					continue;
 				}
 
 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
 					VM_OBJECT_LOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 					VM_OBJECT_UNLOCK(obj);
 				}
 				vput(vp);
 			}
 			MNT_ILOCK(mp);
 			/*
 			 * The saved successor is stale if the list changed
 			 * while the mount interlock was dropped.
 			 */
 			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
 				if (--tries > 0)
 					goto loop;
 				break;
 			}
 		} else
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Create the VM object needed for VMIO and mmap support.  This
  * is done for all VREG files in the system.  Some filesystems might
  * afford the additional metadata buffering capability of the
  * VMIO code by making the device node be VMIO mode also.
  *
  * vp must be locked when vfs_object_create is called.
  */
 int
 vfs_object_create(vp, td, cred)
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 {
 
 	GIANT_REQUIRED;
 	return (VOP_CREATEVOBJECT(vp, cred, td));
 }
 
 /*
  * Put a vnode on the free list, making it a candidate for recycling.
  *
  * The vnode interlock must be held and the vnode must not already be
  * marked VI_FREE.  Vnodes flagged VI_AGE go to the head of the list,
  * all others to the tail; VI_AGE is cleared and VI_FREE set on return.
  */
 void
 vfree(vp)
 	struct vnode *vp;
 {
 
 	ASSERT_VI_LOCKED(vp, "vfree");
 	mtx_lock(&vnode_free_list_mtx);
 	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
 	if ((vp->v_iflag & VI_AGE) == 0)
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	else
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	freevnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 
 	/* Swap the "aged" hint for the "on free list" marker. */
 	vp->v_iflag = (vp->v_iflag & ~VI_AGE) | VI_FREE;
 }
 
 /*
  * Opposite of vfree(): take a vnode off the free list and mark it as
  * in use.  The vnode interlock must be held and the vnode must be
  * marked VI_FREE.
  */
 void
 vbusy(vp)
 	struct vnode *vp;
 {
 
 	ASSERT_VI_LOCKED(vp, "vbusy");
 	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
 
 	mtx_lock(&vnode_free_list_mtx);
 	freevnodes--;
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	mtx_unlock(&vnode_free_list_mtx);
 
 	vp->v_iflag &= ~VI_FREE;
 	vp->v_iflag &= ~VI_AGE;
 }
 
 /*
  * Initialize the per-vnode helper structure that holds poll-related
  * state.  The structure is allocated up front; if the vnode turns out
  * to have one already, the fresh allocation is simply returned to the
  * zone.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *fresh;
 
 	fresh = uma_zalloc(vnodepoll_zone, M_WAITOK);
 	if (vp->v_pollinfo == NULL) {
 		vp->v_pollinfo = fresh;
 		mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL,
 		    MTX_DEF);
 		knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note,
 		    &vp->v_pollinfo->vpi_lock);
 		return;
 	}
 	uma_zfree(vnodepoll_zone, fresh);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(vp, td, events)
 	struct vnode *vp;
 	struct thread *td;
 	short events;
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return events;
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return 0;
 }
 
 /*
  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
  * it is possible for us to miss an event due to race conditions, but
  * that condition is expected to be rare, so for the moment it is the
  * preferred interface.
  */
 void
 vn_pollevent(vp, events)
 	struct vnode *vp;
 	short events;
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_events & events) {
 		/*
 		 * We clear vpi_events so that we don't
 		 * call selwakeup() twice if two events are
 		 * posted before the polling process(es) is
 		 * awakened.  This also ensures that we take at
 		 * most one selwakeup() if the polling process
 		 * is no longer interested.  However, it does
 		 * mean that only one event can be noticed at
 		 * a time.  (Perhaps we should only clear those
 		 * event bits which we note?) XXX
 		 */
 		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
 		vp->v_pollinfo->vpi_revents |= events;
 		selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
 	}
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 
 /*
  * Wake up anyone polling on vp because it is being revoked.
  * This depends on dead_poll() returning POLLHUP for correct
  * behavior.
  */
 void
 vn_pollgone(vp)
 	struct vnode *vp;
 {
 
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	VN_KNOTE_LOCKED(vp, NOTE_REVOKE);
 	if (vp->v_pollinfo->vpi_events) {
 		vp->v_pollinfo->vpi_events = 0;
 		selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
 	}
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 
 
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  *
  * The table below is the vnode operations vector for syncer vnodes:
  * close is a no-op, fsync/inactive/reclaim are implemented in this
  * file, locking uses the standard helpers, and every other operation
  * falls through to vop_eopnotsupp.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /* Filled in from sync_vnodeop_entries via the descriptor registered below. */
 static vop_t **sync_vnodeop_p;
 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
 	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
 	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
 	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc sync_vnodeop_opv_desc =
 	{ &sync_vnodeop_p, sync_vnodeop_entries };
 
 VNODEOP_SET(sync_vnodeop_opv_desc);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  */
 int
 vfs_allocate_syncvnode(mp)
 	struct mount *mp;
 {
 	struct vnode *vp;
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	VI_LOCK(vp);
 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Anything other than an MNT_LAZY request is a no-op.  Returns 0 when
  * nothing was done (wrong request type, mount point busy, or writes
  * cannot be started right now), otherwise the result of VFS_SYNC().
  */
 static int
 sync_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *syncvp;
 	struct mount *mp;
 	struct thread *td;
 	int error, was_async;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	syncvp = ap->a_vp;
 	mp = syncvp->v_mount;
 	td = ap->a_td;
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	VI_LOCK(syncvp);
 	vn_syncer_add_to_worklist(syncvp, syncdelay);
 	VI_UNLOCK(syncvp);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp, td);
 		return (0);
 	}
 
 	/* Run the sync with MNT_ASYNC masked, then put it back if it was set. */
 	was_async = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
 	if (was_async != 0)
 		mp->mnt_flag |= MNT_ASYNC;
 
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced: unlock it and get rid of
  * it with vgone().  Always returns 0.
  */
 static int
 sync_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	VOP_UNLOCK(vp, 0, ap->a_td);
 	vgone(vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned:
  * detach it from its mount point and, if queued, from the syncer
  * worklist.  Always returns 0.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  */
 static int
 sync_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *syncvp = ap->a_vp;
 
 	VI_LOCK(syncvp);
 	syncvp->v_mount->mnt_syncer = NULL;
 	if ((syncvp->v_iflag & VI_ONWORKLST) != 0) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(syncvp, v_synclist);
 		sync_vnode_count--;
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 		syncvp->v_iflag &= ~VI_ONWORKLST;
 	}
 	VI_UNLOCK(syncvp);
 
 	return (0);
 }
 
 /*
  * Extract the struct cdev * from a VCHR vnode.
  * Returns NULL for any other vnode type.
  */
 struct cdev *
 vn_todev(vp)
 	struct vnode *vp;
 {
 
 	return (vp->v_type == VCHR ? vp->v_rdev : NULL);
 }
 
 /*
  * Check if a vnode represents a disk device.
  *
  * Returns non-zero if it does.  When errp is not NULL it receives 0 on
  * success, ENOTBLK if the vnode is not a character device or its
  * driver lacks D_DISK, and ENXIO if no device is attached.
  */
 int
 vn_isdisk(vp, errp)
 	struct vnode *vp;
 	int *errp;
 {
 	int status;
 
 	if (vp->v_type != VCHR)
 		status = ENOTBLK;
 	else if (vp->v_rdev == NULL)
 		status = ENXIO;
 	else if ((devsw(vp->v_rdev)->d_flags & D_DISK) == 0)
 		status = ENOTBLK;
 	else
 		status = 0;
 
 	if (errp != NULL)
 		*errp = status;
 	return (status == 0);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  *
  * Each resource is released unless the matching NDF_NO_* bit is set
  * in "flags".  Pointers whose reference is dropped are reset to NULL.
  */
 void
 NDFREE(ndp, flags)
      struct nameidata *ndp;
      const u_int flags;
 {
 
 	/* Pathname buffer. */
 	if ((flags & NDF_NO_FREE_PNBUF) == 0 &&
 	    (ndp->ni_cnd.cn_flags & HASBUF) != 0) {
 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 
 	/* Parent directory: unlock (unless it is also the leaf), release. */
 	if ((flags & NDF_NO_DVP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) != 0 &&
 	    ndp->ni_dvp != ndp->ni_vp)
 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
 	if ((flags & NDF_NO_DVP_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 		vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 
 	/* Leaf vnode: unlock, then release. */
 	if ((flags & NDF_NO_VP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) != 0 && ndp->ni_vp != NULL)
 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
 	if ((flags & NDF_NO_VP_RELE) == 0 && ndp->ni_vp != NULL) {
 		vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 
 	/* Saved start directory. */
 	if ((flags & NDF_NO_STARTDIR_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART) != 0) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, credentials,
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure:
  * EPERM when VADMIN was requested, EACCES otherwise.
  */
 int
 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
 	enum vtype type;
 	mode_t file_mode;
 	uid_t file_uid;
 	gid_t file_gid;
 	mode_t acc_mode;
 	struct ucred *cred;
 	int *privused;
 {
 	mode_t dac_granted;
 #ifdef CAPABILITIES
 	mode_t cap_granted;
 #endif
 
 	if (privused != NULL)
 		*privused = 0;
 
 	/*
 	 * First try the ordinary, unprivileged route.  Exactly one class
 	 * applies, in the order owner, group, other; the first match
 	 * decides which permission bits are considered.
 	 */
 	dac_granted = 0;
 	if (cred->cr_uid == file_uid) {
 		/* Owner; owners additionally get VADMIN. */
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 	} else if (groupmember(file_gid, cred)) {
 		/* Member of the file's group. */
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 	} else {
 		/* Everyone else. */
 		if (file_mode & S_IXOTH)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IROTH)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWOTH)
 			dac_granted |= (VWRITE | VAPPEND);
 	}
 	if ((acc_mode & dac_granted) == acc_mode)
 		return (0);
 
 	/*
 	 * The permission bits alone were not enough; see whether
 	 * privilege makes up the difference.
 	 */
 	if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 
 #ifdef CAPABILITIES
 	/*
 	 * Build a capability mask to determine if the set of capabilities
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.
 	 * For each capability, if the capability is required, bitwise
 	 * or the request type onto the cap_granted mask.
 	 */
 	cap_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
 		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
 		 */
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	} else {
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	}
 
 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 		cap_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
 		cap_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
 		cap_granted |= VADMIN;
 
 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 #endif
 
 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Credential check for extended attribute access, based on the
  * requesting credential and the attribute namespace.
  *
  * Returns 0 if access is allowed, otherwise an errno: the result of
  * suser_cred() for the system namespace, the result of VOP_ACCESS()
  * for the user namespace, and EPERM for any other namespace.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace,
     struct ucred *cred, struct thread *td, int access)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly
 	 * manipulate system attributes.
 	 *
 	 * XXX What capability should apply here?
 	 * Probably CAP_SYS_SETFFLAG.
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		/* Potentially should be: return (EPERM); */
 		return (suser_cred(cred, 0));
 	}
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, access, cred, td));
 	return (EPERM);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.
  * It is no longer ok to have an unlocked VFS.
  *
  * IGNORE_LOCK() is true for vnodes of type VCHR or VBAD.
  */
 #define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 
 /*
  * Integer knobs, registered read/write under the _debug sysctl parent,
  * that select what is checked and how a lock violation is reported.
  * All of them default to 1.
  */
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
 
 /* The backtrace knob only exists in kernels built with KDB. */
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
 #endif
 
 /*
  * Report a vnode locking violation.  'str' identifies the operation, 'msg'
  * describes the problem and 'vp' is the offending vnode.  Which of the
  * three actions below are taken is governed by the vfs_badlock_* knobs.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	/* Optional stack trace, only available with KDB. */
 	if (vfs_badlock_backtrace != 0) {
 		kdb_backtrace();
 	}
 #endif
 	/* Optional console message. */
 	if (vfs_badlock_print != 0) {
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	}
 	/* Optional stop in the debugger. */
 	if (vfs_badlock_ddb != 0) {
 		kdb_enter("lock violation");
 	}
 }
 
 /*
  * Complain through vfs_badlock() unless the current thread owns the
  * vnode interlock.  The check is skipped when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	/* Checking disabled, or the interlock is owned: nothing to report. */
 	if (!vfs_badlock_mutex || mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() if the current thread owns the vnode
  * interlock.  The check is skipped when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	/* Checking disabled, or the interlock is not owned: all is well. */
 	if (!vfs_badlock_mutex || !mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() when VOP_ISLOCKED() reports the vnode as
  * not locked.  NULL vnodes and vnodes matched by IGNORE_LOCK() are skipped.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 
 	if (vp == NULL || IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp, NULL) == 0)
 		vfs_badlock("is not locked but should be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() when VOP_ISLOCKED() reports LK_EXCLUSIVE
  * for the current thread.  NULL vnodes and vnodes matched by IGNORE_LOCK()
  * are skipped.
  */
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vp == NULL || IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 #if 0
 /*
  * The three assertions below are compiled out by the enclosing #if 0.
  * Each one skips NULL vnodes and vnodes matched by IGNORE_LOCK(), and
  * otherwise reports through vfs_badlock() when VOP_ISLOCKED() for the
  * current thread does not return the expected lock state.
  */
 
 /* Expect LK_EXCLUSIVE. */
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 
 /* Expect LK_EXCLOTHER. */
 void
 assert_vop_elocked_other(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 		vfs_badlock("is not exclusive locked by another thread",
 		    str, vp);
 }
 
 /* Expect LK_SHARED. */
 void
 assert_vop_slocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 		vfs_badlock("is not locked shared but should be", str, vp);
 }
 #endif /* 0 */
 
 /*
  * Lock-state assertions run before VOP_RENAME.  'ap' points to the
  * struct vop_rename_args of the call.
  *
  * Checked here:
  *  - no vnode interlock is held on any of the vnodes involved;
  *  - the source directory and source vnode are not locked, unless they
  *    are the same vnode as their target counterpart;
  *  - the target directory, and the target vnode when there is one, are
  *    locked.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 	/* a_tvp is the only vnode that may be absent. */
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	/* The vnode tested here is a_fvp, so the report must name fvp. */
 	if (a->a_tvp != a->a_fvp)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 }
 
 /*
  * Assertion run before VOP_STRATEGY: report a buffer that is handed to
  * the strategy routine without being locked.
  */
 void
 vop_strategy_pre(void *ap)
 {
 	struct vop_strategy_args *args = ap;
 	struct buf *bp = args->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container,
 	 * so cluster buffers are not checked.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	/* A buffer whose BUF_REFCNT() is at least one is fine. */
 	if (BUF_REFCNT(bp) >= 1)
 		return;
 
 	if (vfs_badlock_print)
 		printf("VOP_STRATEGY: bp is not locked but should be\n");
 	if (vfs_badlock_ddb)
 		kdb_enter("lock violation");
 }
 
 /*
  * Assertions run before VOP_LOOKUP: the directory vnode must be locked
  * and its interlock must not be held.
  */
 void
 vop_lookup_pre(void *ap)
 {
 	struct vop_lookup_args *args = ap;
 	struct vnode *dvp = args->a_dvp;
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 }
 
 /*
  * Assertions run after VOP_LOOKUP.  'rc' is the value the lookup returned.
  * The expected lock state of the directory vnode depends on the lookup
  * flags, on the result and on whether the lookup returned the directory
  * itself.
  */
 void
 vop_lookup_post(void *ap, int rc)
 {
 	struct vop_lookup_args *args = ap;
 	struct vnode *dvp = args->a_dvp;
 	struct vnode *vp = *(args->a_vpp);
 	int cnflags = args->a_cnp->cn_flags;
 	int keep_parent;
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 
 	/*
 	 * If this is the last path component for this lookup and LOCKPARENT
 	 * is set, OR if there is an error the directory has to be locked.
 	 * Otherwise it has to be unlocked, unless it is also the result.
 	 */
 	keep_parent = (cnflags & LOCKPARENT) != 0 &&
 	    (cnflags & ISLASTCN) != 0;
 	if (keep_parent) {
 		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
 	} else if (rc != 0) {
 		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
 	} else if (dvp != vp) {
 		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
 	}
 
 	/* PDIRUNLOCK is checked independently of the cases above. */
 	if ((cnflags & PDIRUNLOCK) != 0) {
 		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
 	}
 }
 
 /*
  * Assertion run before VOP_LOCK: the vnode interlock must be held exactly
  * when the caller passed LK_INTERLOCK.
  */
 void
 vop_lock_pre(void *ap)
 {
 	struct vop_lock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_LOCKED(args->a_vp, "VOP_LOCK");
 	} else {
 		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 	}
 }
 
 /*
  * Assertions run after VOP_LOCK: the interlock must have been released,
  * and the vnode must be locked when the operation returned 0.
  */
 void
 vop_lock_post(void *ap, int rc)
 {
 	struct vop_lock_args *args = ap;
 
 	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 	if (rc != 0)
 		return;
 	ASSERT_VOP_LOCKED(args->a_vp, "VOP_LOCK");
 }
 
 /*
  * Assertions run before VOP_UNLOCK: the vnode must be locked, and the
  * interlock must be held when the caller passed LK_INTERLOCK.
  */
 void
 vop_unlock_pre(void *ap)
 {
 	struct vop_unlock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_LOCKED(args->a_vp, "VOP_UNLOCK");
 	}
 	ASSERT_VOP_LOCKED(args->a_vp, "VOP_UNLOCK");
 }
 
 /*
  * Assertion run after VOP_UNLOCK: when the caller passed LK_INTERLOCK the
  * interlock must have been released.  The return code is not examined.
  */
 void
 vop_unlock_post(void *ap, int rc)
 {
 	struct vop_unlock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) == 0)
 		return;
 	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_UNLOCK");
 }
 #endif /* DEBUG_VFS_LOCKS */
 #endif /* DEBUG_VFS_LOCKS */
 
 /* Knote list that filesystem events are posted to. */
 static struct knlist fs_knlist;
 
 /*
  * Initialize fs_knlist; NULL is passed as the second knlist_init()
  * argument.  Registered through SYSINIT below; 'arg' is unused.
  */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init(&fs_knlist, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post a filesystem event to every knote on fs_knlist, with 'event' passed
  * as the hint.  Neither 'fsid' nor 'data' is used, so the notification
  * does not identify the filesystem concerned.
  */
 void
 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 /* Filter callbacks for filesystem events, defined below. */
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 /*
  * Filter operations table for filesystem events.  The initializer is
  * positional: a leading 0 followed by the attach, detach and event
  * callbacks.
  * NOTE(review): the leading 0 is presumably the "is file descriptor"
  * member of struct filterops -- confirm against its declaration.
  */
 struct filterops fs_filtops =
 	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
 
 /*
  * Attach callback: mark the knote EV_CLEAR and add it to fs_knlist.
  * Always returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	/* The flag is set before the knote becomes visible on the list. */
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /* Detach callback: remove the knote from fs_knlist. */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event callback: fold the hint bits into the knote's accumulated flags
  * and report whether any flag is now set (1) or not (0).
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	if (kn->kn_fflags == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Sysctl handler that routes a request to a mounted filesystem chosen by
  * fsid.  The caller supplies a struct vfsidctl as the new value.
  *
  * Returns the SYSCTL_IN() error if the request cannot be read, EINVAL for
  * an unsupported version or a filesystem type mismatch, ENOENT when no
  * mount matches the fsid, and otherwise the result of VFS_SYSCTL().
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	struct mount *mp;
 	int error;
 
 	/* Read and validate the control structure. */
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error != 0)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 
 	/* Find the mount the request is addressed to. */
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 
 	/*
 	 * Ensure that a specific sysctl goes to the right filesystem:
 	 * unless the type name is "*", it must match the mount's type.
 	 */
 	if (strcmp(vc.vc_fstypename, "*") != 0) {
 		if (strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0)
 			return (EINVAL);
 	}
 
 	VCTLTOREQ(&vc, req);
 	return (VFS_SYSCTL(mp, vc.vc_op, req));
 }
 
 /* Register sysctl_vfs_ctl as the handler of the "ctl" node under _vfs. */
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
         NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
Index: stable/5/sys/net/if_mib.c
===================================================================
--- stable/5/sys/net/if_mib.c	(revision 145952)
+++ stable/5/sys/net/if_mib.c	(revision 145953)
@@ -1,143 +1,144 @@
 /*-
  * Copyright 1996 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_mib.h>
 
 /*
  * A sysctl(3) MIB for generic interface information.  This information
  * is exported in the net.link.generic branch, which has the following
  * structure:
  *
  * net.link.generic	.system			- system-wide control variables
  *						  and statistics (node)
  *			.ifdata.<ifindex>.general
  *						- what's in `struct ifdata'
  *						  plus some other info
  *			.ifdata.<ifindex>.linkspecific
  *						- a link-type-specific data
  *						  structure (as might be used
  *						  by an SNMP agent)
  *
  * Perhaps someday we will make addresses accessible via this interface
  * as well (then there will be four such...).  The reason that the
  * index comes before the last element in the name is because it
  * seems more orthogonal that way, particularly with the possibility
  * of other per-interface data living down here as well (e.g., integrated
  * services stuff).
  */
 
 /*
  * The "system" node under net.link.generic, with one read-only integer
  * below it that exports if_index as "ifcount".
  */
 SYSCTL_DECL(_net_link_generic);
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0,
 	    "Variables global to all interfaces");
 SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD,
 	   &if_index, 0, "Number of configured interfaces");
 
 /*
  * Handler for the per-interface part of the MIB.  The remaining name
  * components are <ifindex> followed by IFDATA_GENERAL or
  * IFDATA_LINKSPECIFIC.
  *
  * Returns EINVAL when the name does not have exactly two components,
  * ENOENT for an unknown interface index or data selector, the error of a
  * failed copy in or out, and 0 otherwise.
  */
 static int
 sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
 {
 	int *name = (int *)arg1;
 	int error;
 	u_int namelen = arg2;
 	struct ifnet *ifp;
 	struct ifmibdata ifmd;
 
 	if (namelen != 2)
 		return EINVAL;
 
 	/* name[0] is the interface index; it must resolve to an entry. */
 	if (name[0] <= 0 || name[0] > if_index ||
 	    ifaddr_byindex(name[0]) == NULL)
 		return ENOENT;
 
 	ifp = ifaddr_byindex(name[0])->ifa_ifp;
 
 	switch(name[1]) {
 	default:
 		return ENOENT;
 
 	case IFDATA_GENERAL:
 		/*
 		 * The whole structure is copied out below, so it is zeroed
 		 * first; bytes that are not explicitly filled in then do not
 		 * carry leftover stack contents.
 		 */
+		bzero(&ifmd, sizeof(ifmd));
 		strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name));
 
 		/* COPY: interface field -> same-named ifmd field. */
 #define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld
 		COPY(pcount);
 		COPY(flags);
 		COPY(data);
 #undef COPY
 		ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
 		ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
 		ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops;
 
 		/* A request with no new value is a pure read and ends here. */
 		error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, &ifmd, sizeof ifmd);
 		if (error)
 			return error;
 
 		/*
 		 * DONTCOPY: overwrite the listed ifmd_data members with the
 		 * interface's current values, so that the write-back of
 		 * ifmd_data below leaves those members unchanged.
 		 */
 #define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld
 		DONTCOPY(type);
 		DONTCOPY(physical);
 		DONTCOPY(addrlen);
 		DONTCOPY(hdrlen);
 		DONTCOPY(mtu);
 		DONTCOPY(metric);
 		DONTCOPY(baudrate);
 #undef DONTCOPY
 		/* COPY, reversed: ifmd field -> same-named interface field. */
 #define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld
 		COPY(data);
 		ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen;
 		ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops;
 #undef COPY
 		break;
 
 	case IFDATA_LINKSPECIFIC:
 		/*
 		 * The link-specific blob is copied out, and for a write
 		 * copied in, directly from/to if_linkmib.
 		 */
 		error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error || !req->newptr)
 			return error;
 
 		error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error)
 			return error;
 		
 	}
 	return 0;
 }
 
 /* The "ifdata" node under net.link.generic, served by sysctl_ifdata(). */
 SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW,
 	    sysctl_ifdata, "Interface table");
 
Index: stable/5/sys/netinet/ip_divert.c
===================================================================
--- stable/5/sys/netinet/ip_divert.c	(revision 145952)
+++ stable/5/sys/netinet/ip_divert.c	(revision 145953)
@@ -1,628 +1,629 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 
 #ifndef INET
 #error "IPDIVERT requires INET."
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mac.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_divert.h>
 #include <netinet/ip_var.h>
 
 /*
  * Divert sockets
  */
 
 /*
  * Allocate enough space to hold a full IP packet
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Divert sockets work in conjunction with ipfw, see the divert(4)
  * manpage for features.
  * Internally, packets selected by ipfw in ip_input() or ip_output(),
  * and never diverted before, are passed to the input queue of the
  * divert socket with a given 'divert_port' number (as specified in
  * the matching ipfw rule), and they are tagged with a 16 bit cookie
  * (representing the rule number of the matching ipfw rule), which
  * is passed to process reading from the socket.
  *
  * Packets written to the divert socket are again tagged with a cookie
  * (usually the same as above) and a destination address.
  * If the destination address is INADDR_ANY then the packet is
  * treated as outgoing and sent to ip_output(), otherwise it is
  * treated as incoming and sent to ip_input().
  * In both cases, the packet is tagged with the cookie.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * ipfw processing will start at the rule number after the one
  * written in the cookie (so, tagging a packet with a cookie of 0
  * will cause it to be effectively considered as a standard packet).
  */
 
 /* Internal variables */
 static struct inpcbhead divcb;
 static struct inpcbinfo divcbinfo;
 
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 /*
  * Initialize divert connection block queue.
  *
  * Sets up the pcbinfo lock, the (empty) pcb list, two one-entry hash
  * tables and the "divcb" UMA zone, whose size is capped at maxsockets.
  */
 void
 div_init(void)
 {
 	INP_INFO_LOCK_INIT(&divcbinfo, "div");
 	LIST_INIT(&divcb);
 	divcbinfo.listhead = &divcb;
 	/*
 	 * XXX We don't use the hash list for divert IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask);
 	divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask);
 	divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(divcbinfo.ipi_zone, maxsockets);
 }
 
 /*
  * IPPROTO_DIVERT is not in the real IP protocol number space; this
  * function should never be called.  Just in case, drop any packets.
  *
  * The drop is counted in ipstat.ips_noproto; 'off' is unused.
  */
 void
 div_input(struct mbuf *m, int off)
 {
 	ipstat.ips_noproto++;
 	m_freem(m);
 }
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
  *
  * Setup generic address and protocol structures for div_input routine,
  * then pass them along with mbuf chain.
  *
  * The port and the rule cookie are read from the PACKET_TAG_DIVERT tag
  * attached to 'm'.  'incoming' is non-zero when the address of the
  * receive interface should be recorded in the source address handed to
  * the socket.  The mbuf chain is always consumed: it is either appended
  * to a socket's receive buffer or freed.
  */
 void
 divert_packet(struct mbuf *m, int incoming)
 {
 	struct ip *ip;
 	struct inpcb *inp;
 	struct socket *sa;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
 	struct m_tag *mtag;
 
 	/* Without a divert tag there is no port to deliver to. */
 	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
 	if (mtag == NULL) {
 		printf("%s: no divert tag\n", __func__);
 		m_freem(m);
 		return;
 	}
 	/*
 	 * Assure header
 	 * NOTE(review): the bare return relies on m_pullup() having
 	 * disposed of the chain on failure -- see mbuf(9).
 	 */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == 0)
 		return;
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Delayed checksums are currently not compatible with divert.
 	 * ip_len is byte-swapped around the in_delayed_cksum() call and
 	 * swapped back afterwards.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		ip->ip_len = ntohs(ip->ip_len);
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		ip->ip_len = htons(ip->ip_len);
 	}
 
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	bzero(&divsrc, sizeof(divsrc));
 	divsrc.sin_len = sizeof(divsrc);
 	divsrc.sin_family = AF_INET;
 	divsrc.sin_port = divert_cookie(mtag);	/* record matching rule */
 	if (incoming) {
 		struct ifaddr *ifa;
 
 		/* Sanity check */
 		M_ASSERTPKTHDR(m);
 
 		/*
 		 * Find IP address for receive interface
 		 * NOTE(review): rcvif is dereferenced here without a NULL
 		 * test, unlike the block further down; presumably callers
 		 * guarantee it for incoming packets -- confirm.
 		 */
 		TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr == NULL)
 				continue;
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in there in the
 		 * sin_zero array. XXX This needs to be moved to a
 		 * different sockaddr type for divert, e.g.
 		 * sockaddr_div with multiple fields like
 		 * sockaddr_dl. Presently we have only 7 bytes
 		 * but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */ 
 		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 		    sizeof(divsrc.sin_zero));
 	}
 
 	/*
 	 * Put packet on socket queue, if any
 	 * The pcb list is walked under the pcbinfo read lock and stops at
 	 * the first pcb whose local port equals the tag's port.
 	 */
 	sa = NULL;
 	nport = htons((u_int16_t)divert_info(mtag));
 	INP_INFO_RLOCK(&divcbinfo);
 	LIST_FOREACH(inp, &divcb, inp_list) {
 		INP_LOCK(inp);
 		/* XXX why does only one socket match? */
 		if (inp->inp_lport == nport) {
 			sa = inp->inp_socket;
 			SOCKBUF_LOCK(&sa->so_rcv);
 			if (sbappendaddr_locked(&sa->so_rcv,
 			    (struct sockaddr *)&divsrc, m,
 			    (struct mbuf *)0) == 0) {
 				SOCKBUF_UNLOCK(&sa->so_rcv);
 				sa = NULL;	/* force mbuf reclaim below */
 			} else
 				sorwakeup_locked(sa);
 			INP_UNLOCK(inp);
 			break;
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&divcbinfo);
 	/*
 	 * No matching socket, or the append failed: free the packet and
 	 * adjust the statistics.
 	 */
 	if (sa == NULL) {
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
         }
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  *
  * 'control' is freed unconditionally.  Returns 0 or an errno value:
  * ENOBUFS when no divert tag can be allocated, EINVAL for a packet that
  * fails the outgoing sanity checks, EADDRNOTAVAIL when no interface
  * matches an incoming packet, or the result of ip_output().
  */
 static int
 div_output(struct socket *so, struct mbuf *m,
 	struct sockaddr_in *sin, struct mbuf *control)
 {
 	int error = 0;
 
 	m->m_pkthdr.rcvif = NULL;
 
 	if (control)
 		m_freem(control);		/* XXX */
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		struct m_tag *mtag;
 		struct divert_tag *dt;
 		int i;
 
 		/*
 		 * Tag the packet with the cookie the caller placed in
 		 * sin_port; the tag's info field is cleared.
 		 */
 		mtag = m_tag_get(PACKET_TAG_DIVERT,
 				sizeof(struct divert_tag), M_NOWAIT);
 		if (mtag == NULL) {
 			error = ENOBUFS;
 			goto cantsend;
 		}
 		dt = (struct divert_tag *)(mtag+1);
 		dt->info = 0;
 		dt->cookie = sin->sin_port;
 		m_tag_prepend(m, mtag);
 
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 *
 		 * The loop measures the name; ifunit() is only consulted
 		 * for a non-empty name that is terminated inside sin_zero.
 		 */
 		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 			;
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		struct ip *const ip = mtod(m, struct ip *);
 		struct inpcb *inp;
 
 		INP_INFO_WLOCK(&divcbinfo);
 		inp = sotoinpcb(so);
 		INP_LOCK(inp);
 		/*
 		 * Don't allow both user specified and setsockopt options,
 		 * and don't allow packet length sizes that will crash
 		 */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
 		     ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			error = EINVAL;
 			m_freem(m);
 		} else {
 			/* Convert fields to host order for ip_output() */
 			ip->ip_len = ntohs(ip->ip_len);
 			ip->ip_off = ntohs(ip->ip_off);
 
 			/* Send packet to output processing */
 			ipstat.ips_rawout++;			/* XXX */
 
 #ifdef MAC
 			mac_create_mbuf_from_inpcb(inp, m);
 #endif
 			error = ip_output(m,
 				    inp->inp_options, NULL,
 				    (so->so_options & SO_DONTROUTE) |
 				    IP_ALLOWBROADCAST | IP_RAWOUTPUT,
 				    inp->inp_moptions, NULL);
 		}
 		INP_UNLOCK(inp);
 		INP_INFO_WUNLOCK(&divcbinfo);
 	} else {
 		if (m->m_pkthdr.rcvif == NULL) {
 			/*
 			 * No luck with the name, check by IP address.
 			 * Clear the port and the ifname to make sure
 			 * there are no distractions for ifa_ifwithaddr.
 			 */
 			struct	ifaddr *ifa;
 
 			bzero(sin->sin_zero, sizeof(sin->sin_zero));
 			sin->sin_port = 0;
 			ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 			if (ifa == NULL) {
 				error = EADDRNOTAVAIL;
 				goto cantsend;
 			}
 			m->m_pkthdr.rcvif = ifa->ifa_ifp;
 		}
 #ifdef MAC
 		SOCK_LOCK(so);
 		mac_create_mbuf_from_socket(so, m);
 		SOCK_UNLOCK(so);
 #endif
 		/* Send packet to input processing */
 		ip_input(m);
 	}
 
 	return error;
 
 	/* Error exit for paths on which the packet has not been consumed. */
 cantsend:
 	m_freem(m);
 	return error;
 }
 
 /*
  * Attach a protocol control block to a new divert socket.
  *
  * Fails with EINVAL when the socket already has a pcb, with the suser()
  * error when a thread is given and it lacks privilege, and with the error
  * of soreserve() or in_pcballoc() when either of those fails.  On success
  * the new pcb is marked IPv4 with INP_HDRINCL set and 0 is returned.
  */
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK(&divcbinfo);
 
 	/* The socket must not have a pcb yet. */
 	if (sotoinpcb(so) != 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	/* Privilege is only checked when a thread was supplied. */
 	if (td != NULL) {
 		error = suser(td);
 		if (error != 0)
 			goto fail;
 	}
 
 	error = soreserve(so, div_sendspace, div_recvspace);
 	if (error != 0)
 		goto fail;
 
 	error = in_pcballoc(so, &divcbinfo, "divinp");
 	if (error != 0)
 		goto fail;
 
 	/* Take the pcb lock before letting go of the pcbinfo lock. */
 	inp = (struct inpcb *)so->so_pcb;
 	INP_LOCK(inp);
 	INP_INFO_WUNLOCK(&divcbinfo);
 
 	inp->inp_ip_p = proto;
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_flags |= INP_HDRINCL;
 	INP_UNLOCK(inp);
 	return 0;
 
 fail:
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return error;
 }
 
 /*
  * Detach and release the pcb of a divert socket.  Returns EINVAL when the
  * socket has no pcb, 0 otherwise.
  */
 static int
 div_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&divcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&divcbinfo);
 		return EINVAL;
 	}
 	/*
 	 * The pcb is locked here and never unlocked in this function.
 	 * NOTE(review): presumably in_pcbdetach() disposes of the locked
 	 * pcb -- confirm against its implementation.
 	 */
 	INP_LOCK(inp);
 	in_pcbdetach(inp);
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return 0;
 }
 
 /*
  * Bind a divert socket to the port given in 'nam'.
  *
  * Returns EINVAL when the socket has no pcb or 'nam' is too short to be
  * a sockaddr_in, EAFNOSUPPORT when 'nam' is not an AF_INET address, and
  * otherwise the result of in_pcbbind().  The address part of 'nam' is
  * overwritten with INADDR_ANY before binding.
  */
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK(&divcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&divcbinfo);
 		return EINVAL;
 	}
 	/* in_pcbbind assumes that nam is a sockaddr_in
 	 * and in_pcbbind requires a valid address. Since divert
 	 * sockets don't we need to make sure the address is
 	 * filled in properly.
 	 * XXX -- divert should not be abusing in_pcbind
 	 * and should probably have its own family.
 	 */
 	if (nam->sa_family != AF_INET)
 		error = EAFNOSUPPORT;
 	else if (nam->sa_len < sizeof(struct sockaddr_in))
 		/*
 		 * The store to sin_addr below must stay inside the
 		 * caller-supplied address, so reject anything shorter
 		 * than a sockaddr_in before touching it.
 		 */
 		error = EINVAL;
 	else {
 		((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 		INP_LOCK(inp);
 		error = in_pcbbind(inp, nam, td->td_ucred);
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&divcbinfo);
 	return error;
 }
 
 /*
  * Shut down the sending side of a divert socket.  Returns EINVAL when the
  * socket has no pcb, 0 otherwise.
  */
 static int
 div_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&divcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_RUNLOCK(&divcbinfo);
 		return EINVAL;
 	}
 	/* The pcb lock is taken before the pcbinfo read lock is dropped. */
 	INP_LOCK(inp);
 	INP_INFO_RUNLOCK(&divcbinfo);
 	socantsendmore(so);
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 /*
  * Send entry point for divert sockets: make sure the packet starts with a
  * contiguous IP header, then hand it to div_output().
  *
  * Both 'm' and 'control' are consumed on every path.  Returns EINVAL when
  * the packet is too short to hold an IP header, otherwise the result of
  * div_output().  'flags' and 'td' are unused.
  */
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 	/* Packet must have a header (but that's about it) */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
 		ipstat.ips_toosmall++;
 		/*
 		 * m is NULL at this point (the failed m_pullup() has
 		 * already released the chain), so only the control mbufs
 		 * are left to free.  div_output() would have freed them,
 		 * but it is not reached on this path.
 		 */
 		if (control)
 			m_freem(control);
 		return EINVAL;
 	}
 
 	/* Send packet */
 	return div_output(so, m, (struct sockaddr_in *)nam, control);
 }
 
 /*
  * Control-input hook for divert sockets.  Every path through this
  * function returns without taking any action; 'vip' is unused.
  */
 void
 div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_addr faddr = ((struct sockaddr_in *)sa)->sin_addr;
 
 	/* Non-IPv4 addresses, the wildcard address and redirects. */
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY ||
 	    PRC_IS_REDIRECT(cmd))
 		return;
 }
 
 /*
  * Sysctl handler that exports the list of divert pcbs.
  *
  * The output is a struct xinpgen header, one struct xinpcb for each pcb
  * the caller is allowed to see, and a trailing struct xinpgen holding
  * the generation counts read after the list was produced.  A request
  * without an output buffer only receives a size estimate, and a request
  * carrying a new value is refused with EPERM.
  *
  * Returns 0 on success or the first error met while producing output.
  */
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 * The estimate leaves room for one eighth more pcbs than exist now.
 	 */
 	if (req->oldptr == 0) {
 		n = divcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&divcbinfo);
 	gencnt = divcbinfo.ipi_gencnt;
 	n = divcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&divcbinfo);
 
 	error = sysctl_wire_old_buffer(req,
 	    2 * sizeof(xig) + n*sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	/* Leading header: the counts as sampled above. */
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 
 	/*
 	 * Collect, under the pcbinfo read lock, up to n pcbs that are not
 	 * newer than the sampled generation and that the caller may see.
 	 */
 	INP_INFO_RLOCK(&divcbinfo);
 	for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
 			inp_list[i++] = inp;
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&divcbinfo);
 	n = i;
 
 	/*
 	 * Copy the collected pcbs out.  The loop stops at the first
 	 * failure so that a later successful copy cannot overwrite, and
 	 * thereby hide, an earlier error.
 	 */
 	error = 0;
 	for (i = 0; i < n && error == 0; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&divcbinfo);
 		xig.xig_gen = divcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = divcbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&divcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /*
  * This is the wrapper function for in_setsockaddr.  We just pass down
  * the pcbinfo for in_setsockaddr to lock.
  */
 static int
 div_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setsockaddr(so, nam, &divcbinfo));
 }
 
 /*
  * This is the wrapper function for in_setpeeraddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock, and return its result.
  */
 static int
 div_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setpeeraddr(so, nam, &divcbinfo));
 }
 
 
 /* Read-only "pcblist" node under _net_inet_divert, served by div_pcblist(). */
 SYSCTL_DECL(_net_inet_divert);
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
 	    div_pcblist, "S,xinpcb", "List of active divert sockets");
 
 /*
  * User-request switch for divert sockets.  The initializer is positional,
  * so each entry's meaning is fixed by the member order of struct
  * pr_usrreqs.  The div_* entries are the handlers defined in this file.
  * NOTE(review): the pru_*_notsupp / pru_sense_null entries are presumably
  * the generic "not supported" stubs -- confirm against their definitions.
  */
 struct pr_usrreqs div_usrreqs = {
 	NULL, pru_accept_notsupp, div_attach, div_bind,
 	pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach,
 	NULL, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp,
 	pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown,
 	div_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel
 };
Index: stable/5/sys/netinet/raw_ip.c
===================================================================
--- stable/5/sys/netinet/raw_ip.c	(revision 145952)
+++ stable/5/sys/netinet/raw_ip.c	(revision 145953)
@@ -1,907 +1,908 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
  * $FreeBSD$
  */
 
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_mroute.h>
 
 #include <netinet/ip_fw.h>
 #include <netinet/ip_dummynet.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 /*
  * List head and pcbinfo for raw IP sockets.  Both are set up by rip_init(),
  * which points ripcbinfo.listhead at ripcb.
  */
 struct	inpcbhead ripcb;
 struct	inpcbinfo ripcbinfo;
 
 /*
  * Control hooks for ipfw and dummynet.  rip_ctloutput() calls them when
  * they are non-NULL and returns ENOPROTOOPT otherwise.
  */
 ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
 ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
 
 /*
  * Hooks for multicast routing.  They all default to NULL, so they are
  * left without initializers and rely on BSS being zeroed.
  */
 
 /* The socket used to communicate with the multicast routing daemon.  */
 struct socket  *ip_mrouter;
 
 /*
  * The various mrouter and rsvp function hooks.  Code in this file tests
  * ip_mrouter_set, ip_mrouter_get, ip_mrouter_done, ip_rsvp_vif and
  * ip_rsvp_force_done for NULL before calling through them.
  */
 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip_mrouter_done)(void);
 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 		   struct ip_moptions *);
 int (*mrt_ioctl)(int, caddr_t);
 int (*legal_vif_num)(int);
 u_long (*ip_mcast_src)(int);
 
 void (*rsvp_input_p)(struct mbuf *m, int off);
 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
 void (*ip_rsvp_force_done)(struct socket *);
 
 /*
  * Nominal space allocated to a raw ip socket.  These are the initial
  * values of rip_sendspace and rip_recvspace.
  */
 #define	RIPSNDQ		8192
 #define	RIPRCVQ		8192
 
 /*
  * Raw interface to IP protocol.
  */
 
 /*
  * Initialize raw connection block q.
  */
 void
 rip_init()
 {
 	INP_INFO_LOCK_INIT(&ripcbinfo, "rip");
 	LIST_INIT(&ripcb);
 	ripcbinfo.listhead = &ripcb;
 	/*
 	 * XXX We don't use the hash list for raw IP, but it's easier
 	 * to allocate a one entry hash list than it is to check all
 	 * over the place for hashbase == NULL.
 	 */
 	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
 	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
 	ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
 }
 
 /*
  * Source-address template that raw_append() passes to
  * sbappendaddr_locked().  rip_input() overwrites sin_addr with the source
  * of each packet before delivering it.
  */
 static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
 
 /*
  * Queue packet 'n' on the receive buffer of the socket behind 'last',
  * subject to the IPsec and MAC policy checks that are compiled in.
  * The caller must hold the inpcb lock on 'last'.
  *
  * The mbuf chain is consumed in every case: it is either appended to the
  * socket buffer or freed.  Returns non-zero when a policy check rejected
  * the packet, 0 otherwise (including when the socket buffer was full).
  */
 static int
 raw_append(struct inpcb *last, struct ip *ip, struct mbuf *n)
 {
 	struct socket *so;
 	struct mbuf *opts;
 	int rejected;
 
 	rejected = 0;
 	INP_LOCK_ASSERT(last);
 
 #if defined(IPSEC) || defined(FAST_IPSEC)
 	/* Check AH/ESP integrity; a rejected packet is not delivered. */
 	if (ipsec4_in_reject(n, last)) {
 #ifdef IPSEC
 		ipsecstat.in_polvio++;
 #endif /*IPSEC*/
 		rejected = 1;
 	}
 #endif /*IPSEC || FAST_IPSEC*/
 #ifdef MAC
 	if (rejected == 0 && mac_check_inpcb_deliver(last, n) != 0)
 		rejected = 1;
 #endif
 	if (rejected != 0) {
 		m_freem(n);
 		return (rejected);
 	}
 
 	opts = NULL;
 	so = last->inp_socket;
 	if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
 	    (so->so_options & SO_TIMESTAMP) != 0)
 		ip_savecontrol(last, &opts, ip, n);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&ripsrc, n,
 	    opts) != 0) {
 		/*
 		 * Appended.  No explicit unlock follows; presumably
 		 * sorwakeup_locked() releases the sockbuf lock -- confirm.
 		 */
 		sorwakeup_locked(so);
 		return (rejected);
 	}
 
 	/* Append failed: should notify about lost packet. */
 	m_freem(n);
 	if (opts != NULL)
 		m_freem(opts);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	return (rejected);
 }
 
 /*
  * Deliver an inbound IP packet to every raw socket whose filters match it.
  *
  * Walks the raw pcb list under the pcbinfo read lock.  A socket matches
  * when its protocol is 0 or equals the packet's protocol, it is an IPv4
  * socket (INET6 kernels only), its bound local/foreign addresses are
  * unset or equal the packet's destination/source, and, for jailed
  * sockets, the packet's destination is the prison address.
  *
  * Each match except the final one receives a copy of the packet; the
  * final match receives the original mbuf chain.  With no match the packet
  * is freed and counted in ips_noproto.  'off' is not used here.
  *
  * NOTE(review): ripsrc is a file-scope static that is written below while
  * only the shared (read) pcbinfo lock is held -- confirm that concurrent
  * callers cannot race on it.
  */
 void
 rip_input(struct mbuf *m, int off)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	int proto = ip->ip_p;
 	struct inpcb *inp, *last;
 
 	INP_INFO_RLOCK(&ripcbinfo);
 	/* raw_append() reports this as the datagram's source address. */
 	ripsrc.sin_addr = ip->ip_src;
 	last = NULL;
 	LIST_FOREACH(inp, &ripcb, inp_list) {
 		INP_LOCK(inp);
 		if (inp->inp_ip_p && inp->inp_ip_p != proto) {
 			/*
 			 * Shared "no match" exit: the later filters jump
 			 * here to drop the inpcb lock and move on.
 			 */
 	docontinue:
 			INP_UNLOCK(inp);
 			continue;
 		}
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			goto docontinue;
 #endif
 		if (inp->inp_laddr.s_addr &&
 		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 			goto docontinue;
 		if (inp->inp_faddr.s_addr &&
 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 			goto docontinue;
 		if (jailed(inp->inp_socket->so_cred))
 			if (htonl(prison_getip(inp->inp_socket->so_cred)) !=
 			    ip->ip_dst.s_addr)
 				goto docontinue;
 		/*
 		 * 'inp' matches and stays locked.  Delivery to the previous
 		 * match happens now, using a copy, so that the last match
 		 * can be given the original chain after the loop.
 		 */
 		if (last) {
 			struct mbuf *n;
 
 			n = m_copy(m, 0, (int)M_COPYALL);
 			if (n != NULL)
 				(void) raw_append(last, ip, n);
 			/* XXX count dropped packet */
 			INP_UNLOCK(last);
 		}
 		last = inp;
 	}
 	if (last != NULL) {
 		/* A policy rejection undoes the "delivered" accounting. */
 		if (raw_append(last, ip, m) != 0)
 			ipstat.ips_delivered--;
 		INP_UNLOCK(last);
 	} else {
 		m_freem(m);
 		ipstat.ips_noproto++;
 		ipstat.ips_delivered--;
 	}
 	INP_INFO_RUNLOCK(&ripcbinfo);
 }
 
 /*
  * Generate IP header and pass packet to ip_output.
  * Tack on options user may have setup with control call.
  *
  * Without INP_HDRINCL an IP header is prepended and filled in from the
  * inpcb (TOS, protocol, TTL, source address) and 'dst'.  With INP_HDRINCL
  * the caller-supplied header is validated and sent with IP_RAWOUTPUT.
  * For jailed sockets the source address is forced to (or must already
  * equal) the prison address.
  *
  * Returns EMSGSIZE, ENOBUFS, EPERM or EINVAL for the local failures below,
  * otherwise the result of ip_output().  The explicit failure paths free
  * 'm' before returning.
  */
 int
 rip_output(struct mbuf *m, struct socket *so, u_long dst)
 {
 	struct ip *ip;
 	int error;
 	struct inpcb *inp = sotoinpcb(so);
 	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
 	    IP_ALLOWBROADCAST;
 
 	/*
 	 * If the user handed us a complete IP packet, use it.
 	 * Otherwise, allocate an mbuf for a header and fill it in.
 	 */
 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		/*
 		 * NOTE(review): on failure m comes back NULL and nothing is
 		 * freed here; presumably M_PREPEND released the chain --
 		 * confirm.
 		 */
 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
 		if (m == NULL)
 			return(ENOBUFS);
 
 		/* The inpcb lock is held from here through ip_output(). */
 		INP_LOCK(inp);
 		ip = mtod(m, struct ip *);
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_off = 0;
 		ip->ip_p = inp->inp_ip_p;
 		ip->ip_len = m->m_pkthdr.len;
 		if (jailed(inp->inp_socket->so_cred))
 			ip->ip_src.s_addr =
 			    htonl(prison_getip(inp->inp_socket->so_cred));
 		else
 			ip->ip_src = inp->inp_laddr;
 		ip->ip_dst.s_addr = dst;
 		ip->ip_ttl = inp->inp_ip_ttl;
 	} else {
 		if (m->m_pkthdr.len > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		INP_LOCK(inp);
 		ip = mtod(m, struct ip *);
 		/* A jailed sender may only use the prison address. */
 		if (jailed(inp->inp_socket->so_cred)) {
 			if (ip->ip_src.s_addr !=
 			    htonl(prison_getip(inp->inp_socket->so_cred))) {
 				INP_UNLOCK(inp);
 				m_freem(m);
 				return (EPERM);
 			}
 		}
 		/* don't allow both user specified and setsockopt options,
 		   and don't allow packet length sizes that will crash */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2))
 		     && inp->inp_options)
 		    || (ip->ip_len > m->m_pkthdr.len)
 		    || (ip->ip_len < (ip->ip_hl << 2))) {
 			INP_UNLOCK(inp);
 			m_freem(m);
 			return EINVAL;
 		}
 		/* An id of 0 in the supplied header means "pick one for me". */
 		if (ip->ip_id == 0)
 			ip->ip_id = ip_newid();
 		/* XXX prevent ip_output from overwriting header fields */
 		flags |= IP_RAWOUTPUT;
 		ipstat.ips_rawout++;
 	}
 
 	if (inp->inp_flags & INP_ONESBCAST)
 		flags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_create_mbuf_from_inpcb(inp, m);
 #endif
 
 	error = ip_output(m, inp->inp_options, NULL, flags,
 	    inp->inp_moptions, inp);
 	INP_UNLOCK(inp);
 	return error;
 }
 
 /*
  * Raw IP socket option processing.
  *
  * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
  * only be created by a privileged process, and as such, socket option
  * operations to manage system properties on any raw socket were allowed to
  * take place without explicit additional access control checks.  However,
  * raw sockets can now also be created in jail(), and therefore explicit
  * checks are now required.  Likewise, raw sockets can be used by a process
  * after it gives up privilege, so some caution is required.  For options
  * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
  * performed in ip_ctloutput() and therefore no check occurs here.
  * Unilaterally checking suser() here breaks normal IP socket option
  * operations on raw sockets.
  *
  * When adding new socket options here, make sure to add access control
  * checks here as necessary.
  */
 int
 rip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	/* Only IPPROTO_IP level options are handled on raw sockets. */
 	if (sopt->sopt_level != IPPROTO_IP)
 		return (EINVAL);
 
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			/* Copies out the raw flag bit, not a 0/1 value. */
 			optval = inp->inp_flags & INP_HDRINCL;
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		/* ipfw queries: superuser only, served by the ipfw hook. */
 		case IP_FW_ADD:	/* ADD actually returns the body... */
 		case IP_FW_GET:
 		case IP_FW_TABLE_GETSIZE:
 		case IP_FW_TABLE_LIST:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_fw_ctl_ptr != NULL)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		/* dummynet query: superuser only, served by the hook. */
 		case IP_DUMMYNET_GET:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break ;
 
 		/*
 		 * Multicast routing options: superuser only; EOPNOTSUPP
 		 * when no ip_mrouter_get hook is installed.
 		 */
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
 				EOPNOTSUPP;
 			break;
 
 		/* Everything else is an ordinary IP-level option. */
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			/* Any non-zero value enables header-included mode. */
 			if (optval)
 				inp->inp_flags |= INP_HDRINCL;
 			else
 				inp->inp_flags &= ~INP_HDRINCL;
 			break;
 
 		/* ipfw updates: superuser only, served by the ipfw hook. */
 		case IP_FW_ADD:
 		case IP_FW_DEL:
 		case IP_FW_FLUSH:
 		case IP_FW_ZERO:
 		case IP_FW_RESETLOG:
 		case IP_FW_TABLE_ADD:
 		case IP_FW_TABLE_DEL:
 		case IP_FW_TABLE_FLUSH:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_fw_ctl_ptr != NULL)
 				error = ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		/* dummynet updates: superuser only, served by the hook. */
 		case IP_DUMMYNET_CONFIGURE:
 		case IP_DUMMYNET_DEL:
 		case IP_DUMMYNET_FLUSH:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT ;
 			break ;
 
 		/* RSVP control: every variant requires superuser. */
 		case IP_RSVP_ON:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_init(so);
 			break;
 
 		case IP_RSVP_OFF:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_done();
 			break;
 
 		case IP_RSVP_VIF_ON:
 		case IP_RSVP_VIF_OFF:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			/* EINVAL when no ip_rsvp_vif hook is installed. */
 			error = ip_rsvp_vif ?
 				ip_rsvp_vif(so, sopt) : EINVAL;
 			break;
 
 		/*
 		 * Multicast routing options: superuser only; EOPNOTSUPP
 		 * when no ip_mrouter_set hook is installed.
 		 */
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = suser(curthread);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
 					EOPNOTSUPP;
 			break;
 
 		/* Everything else is an ordinary IP-level option. */
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * This function exists solely to receive the PRC_IFDOWN messages which
  * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
  * and calls in_ifadown() to remove all routes corresponding to that address.
  * It also receives the PRC_IFUP messages from if_up() and reinstalls the
  * interface routes.
  */
 void
 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_ifaddr *ia;
 	struct ifnet *ifp;
 	int err;
 	int flags;
 
 	switch (cmd) {
 	case PRC_IFDOWN:
 		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa
 			    && (ia->ia_flags & IFA_ROUTE)) {
 				/*
 				 * in_ifscrub kills the interface route.
 				 */
 				in_ifscrub(ia->ia_ifp, ia);
 				/*
 				 * in_ifadown gets rid of all the rest of
 				 * the routes.  This is not quite the right
 				 * thing to do, but at least if we are running
 				 * a routing process they will come back.
 				 */
 				in_ifadown(&ia->ia_ifa, 0);
 				break;
 			}
 		}
 		break;
 
 	case PRC_IFUP:
 		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa)
 				break;
 		}
 		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
 			return;
 		flags = RTF_UP;
 		ifp = ia->ia_ifa.ifa_ifp;
 
 		if ((ifp->if_flags & IFF_LOOPBACK)
 		    || (ifp->if_flags & IFF_POINTOPOINT))
 			flags |= RTF_HOST;
 
 		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
 		if (err == 0)
 			ia->ia_flags |= IFA_ROUTE;
 		break;
 	}
 }
 
 /*
  * Send and receive buffer reservations applied to new raw sockets;
  * rip_attach() passes both to soreserve().  They are exported read-write
  * as "maxdgram" and "recvspace" under _net_inet_raw.
  * NOTE(review): the variables are u_long but are exported with SYSCTL_INT;
  * confirm this is correct on platforms where long is wider than int.
  */
 u_long	rip_sendspace = RIPSNDQ;
 u_long	rip_recvspace = RIPRCVQ;
 
 SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
 SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
 
 static int
 rip_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	/* XXX why not lower? */
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp) {
 		/* XXX counter, printf */
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	if (td && jailed(td->td_ucred) && !jail_allow_raw_sockets) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return (EPERM);
 	}
 	if (td && (error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL)) != 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	if (proto >= IPPROTO_MAX || proto < 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EPROTONOSUPPORT;
 	}
 
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	error = in_pcballoc(so, &ripcbinfo, "rawinp");
 	if (error) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return error;
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	INP_LOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = ip_defttl;
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static void
 rip_pcbdetach(struct socket *so, struct inpcb *inp)
 {
 	INP_INFO_WLOCK_ASSERT(&ripcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	if (so == ip_mrouter && ip_mrouter_done)
 		ip_mrouter_done();
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
 	if (so == ip_rsvpd)
 		ip_rsvp_done();
 	in_pcbdetach(inp);
 }
 
 static int
 rip_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		/* XXX counter, printf */
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	rip_pcbdetach(so, inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_abort(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;	/* ??? possible? panic instead? */
 	}
 	INP_LOCK(inp);
 	soisdisconnected(so);
 	if (so->so_state & SS_NOFDREF)
 		rip_pcbdetach(so, inp);
 	else
 		INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 /*
  * Disconnect a connected raw socket by handing it to rip_abort().
  * Returns ENOTCONN when the socket is not connected.
  */
 static int
 rip_disconnect(struct socket *so)
 {
 	if (so->so_state & SS_ISCONNECTED)
 		return (rip_abort(so));
 	return (ENOTCONN);
 }
 
 static int
 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return EINVAL;
 
 	if (jailed(td->td_ucred)) {
 		if (addr->sin_addr.s_addr == INADDR_ANY)
 			addr->sin_addr.s_addr =
 			    htonl(prison_getip(td->td_ucred));
 		if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr)
 			return (EADDRNOTAVAIL);
 	}
 
 	if (TAILQ_EMPTY(&ifnet) ||
 	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
 	    (addr->sin_addr.s_addr &&
 	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
 		return EADDRNOTAVAIL;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	inp->inp_laddr = addr->sin_addr;
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return EINVAL;
 	if (TAILQ_EMPTY(&ifnet))
 		return EADDRNOTAVAIL;
 	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
 		return EAFNOSUPPORT;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	inp->inp_faddr = addr->sin_addr;
 	soisconnected(so);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return 0;
 }
 
 static int
 rip_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_RUNLOCK(&ripcbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	INP_INFO_RUNLOCK(&ripcbinfo);
 	socantsendmore(so);
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static int
 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	u_long dst;
 	int ret;
 
 	INP_INFO_WLOCK(&ripcbinfo);
 	inp = sotoinpcb(so);
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			INP_INFO_WUNLOCK(&ripcbinfo);
 			m_freem(m);
 			return EISCONN;
 		}
 		dst = inp->inp_faddr.s_addr;
 	} else {
 		if (nam == NULL) {
 			INP_INFO_WUNLOCK(&ripcbinfo);
 			m_freem(m);
 			return ENOTCONN;
 		}
 		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
 	}
 	ret = rip_output(m, so, dst);
 	INP_INFO_WUNLOCK(&ripcbinfo);
 	return ret;
 }
 
 /*
  * Sysctl handler that exports the list of raw IP sockets.
  *
  * Output layout: a struct xinpgen header, one struct xinpcb per visible
  * socket, then a trailing struct xinpgen with the current generation so
  * the reader can detect changes made while the list was being copied.
  * With no output buffer only a size estimate is stored in req->oldidx.
  * Writes are refused with EPERM.
  *
  * The copy-out loop stops at the first SYSCTL_OUT() failure, so that a
  * later successful copy cannot overwrite the error that gets returned.
  */
 static int
 rip_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		/* Size estimate: two headers plus 1/8 slack for growth. */
 		n = ripcbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&ripcbinfo);
 	gencnt = ripcbinfo.ipi_gencnt;
 	n = ripcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&ripcbinfo);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	/*
 	 * NOTE(review): the allocation uses M_WAITOK, so the NULL check is
 	 * presumably never taken -- confirm before removing it.
 	 */
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 	
 	/*
 	 * Snapshot at most n pcbs that are no newer than the generation
 	 * sampled above and that the requesting credential may see.
 	 */
 	INP_INFO_RLOCK(&ripcbinfo);
 	for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) {
 			/* XXX held references? */
 			inp_list[i++] = inp;
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&ripcbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; error == 0 && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&ripcbinfo);
 		xig.xig_gen = ripcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = ripcbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&ripcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /*
  * This is the wrapper function for in_setsockaddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 rip_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setsockaddr(so, nam, &ripcbinfo));
 }
 
 /*
  * This is the wrapper function for in_setpeeraddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 rip_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setpeeraddr(so, nam, &ripcbinfo));
 }
 
 
 /*
  * Read-only "pcblist" node under _net_inet_raw.  Requests are handled by
  * rip_pcblist() and the advertised format string is "S,xinpcb".
  */
 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
 	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
 
 /*
  * User-request handler table for raw IP sockets.  The initializer is
  * positional, so entries must stay in struct pr_usrreqs member order.
  * NOTE(review): judging by the handler names the order is abort, accept,
  * attach, bind, connect, connect2, control, detach, disconnect, listen,
  * peeraddr, rcvd, rcvoob, send, sense, shutdown, sockaddr, sosend,
  * soreceive, sopoll, sosetlabel -- confirm against the struct declaration.
  */
 struct pr_usrreqs rip_usrreqs = {
 	rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
 	pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
 	pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp,
 	pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
 	rip_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel
 };
Index: stable/5/sys/netinet/udp_usrreq.c
===================================================================
--- stable/5/sys/netinet/udp_usrreq.c	(revision 145952)
+++ stable/5/sys/netinet/udp_usrreq.c	(revision 145953)
@@ -1,1120 +1,1121 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  * $FreeBSD$
  */
 
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
 #endif /*FAST_IPSEC*/
 
 #ifdef IPSEC
 #include <netinet6/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 /*
  * UDP checksum switch, exported read-write as "checksum" under
  * _net_inet_udp.  It defaults to 1 unless COMPAT_42 is defined.
  */
 #ifndef	COMPAT_42
 static int	udpcksum = 1;
 #else
 static int	udpcksum = 0;		/* XXX */
 #endif
 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
 		&udpcksum, 0, "");
 
 /* Not static, so visible to other files; off by default. */
 int	log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &log_in_vain, 0, "Log all incoming UDP packets");
 
 /*
  * When set, udp_input() also skips saving the IP header that it keeps
  * for building an ICMP error reply.
  */
 static int	blackhole = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
 	&blackhole, 0, "Do not send port unreachables for refused connects");
 
 /*
  * When set, udp_input() delivers a multicast/broadcast datagram to a
  * socket that has multicast options only if one of its memberships
  * matches the destination address and the receiving interface.
  */
 static int	strict_mcast_mship = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, strict_mcast_mship, CTLFLAG_RW,
 	&strict_mcast_mship, 0, "Only send multicast to member sockets");
 
 /* List head and pcbinfo for UDP sockets; set up by udp_init(). */
 struct	inpcbhead udb;		/* from udp_var.h */
 #define	udb6	udb  /* for KAME src sync over BSD*'s */
 struct	inpcbinfo udbinfo;
 
 /* Bucket count for both UDP hash tables; overridable at build time. */
 #ifndef UDBHASHSIZE
 #define UDBHASHSIZE 16
 #endif
 
 struct	udpstat udpstat;	/* from udp_var.h */
 SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
     &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 /* Forward declarations for helpers defined later in this file. */
 static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
 		int off, struct sockaddr_in *udp_in);
 
 static int udp_detach(struct socket *so);
 static	int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		struct mbuf *, struct thread *);
 
 void
 udp_init()
 {
 	INP_INFO_LOCK_INIT(&udbinfo, "udp");
 	LIST_INIT(&udb);
 	udbinfo.listhead = &udb;
 	udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask);
 	udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB,
 					&udbinfo.porthashmask);
 	udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(udbinfo.ipi_zone, maxsockets);
 }
 
 void
 udp_input(m, off)
 	register struct mbuf *m;
 	int off;
 {
 	int iphlen = off;
 	register struct ip *ip;
 	register struct udphdr *uh;
 	register struct inpcb *inp;
 	struct mbuf *opts = 0;
 	int len;
 	struct ip save_ip;
 	struct sockaddr_in udp_in;
 
 	udpstat.udps_ipackets++;
 
 	/*
 	 * Strip IP options, if any; should skip this,
 	 * make available to user, and use on returned packets,
 	 * but we don't yet have a way to check the checksum
 	 * with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m, (struct mbuf *)0);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
 			udpstat.udps_hdrops++;
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 
 	/* destination port of 0 is illegal, based on RFC768. */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.
 	 * Stuff source address and datagram in user buffer.
 	 */
 	bzero(&udp_in, sizeof(udp_in));
 	udp_in.sin_len = sizeof(udp_in);
 	udp_in.sin_family = AF_INET;
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 
 	/*
 	 * Make mbuf data length reflect UDP length.
 	 * If not enough data to reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	if (ip->ip_len != len) {
 		if (len > ip->ip_len || len < sizeof(struct udphdr)) {
 			udpstat.udps_badlen++;
 			goto badunlocked;
 		}
 		m_adj(m, len - ip->ip_len);
 		/* ip->ip_len = len; */
 	}
 	/*
 	 * Save a copy of the IP header in case we want restore it
 	 * for sending an ICMP error message in response.
 	 */
 	if (!blackhole)
 		save_ip = *ip;
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh->uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh->uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
 			uh->uh_sum ^= 0xffff;
 		} else {
 			char b[9];
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
 			uh->uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		if (uh->uh_sum) {
 			udpstat.udps_badsum++;
 			m_freem(m);
 			return;
 		}
 	} else
 		udpstat.udps_nosum++;
 
 	INP_INFO_RLOCK(&udbinfo);
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 		struct inpcb *last;
 		/*
 		 * Deliver a multicast or broadcast datagram to *all* sockets
 		 * for which the local and remote addresses and ports match
 		 * those of the incoming datagram.  This allows more than
 		 * one process to receive multi/broadcasts on the same port.
 		 * (This really ought to be done for unicast datagrams as
 		 * well, but that would cause problems with existing
 		 * applications that open both address-specific sockets and
 		 * a wildcard socket listening to the same port -- they would
 		 * end up receiving duplicates of every unicast datagram.
 		 * Those applications open the multiple sockets to overcome an
 		 * inadequacy of the UDP socket interface, but for backwards
 		 * compatibility we avoid the problem here rather than
 		 * fixing the interface.  Maybe 4.5BSD will remedy this?)
 		 */
 
 		/*
 		 * Locate pcb(s) for datagram.
 		 * (Algorithm copied from raw_intr().)
 		 */
 		last = NULL;
 		LIST_FOREACH(inp, &udb, inp_list) {
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
 				if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 					continue;
 			}
 			if (inp->inp_faddr.s_addr != INADDR_ANY) {
 				if (inp->inp_faddr.s_addr !=
 				    ip->ip_src.s_addr ||
 				    inp->inp_fport != uh->uh_sport)
 					continue;
 			}
 			INP_LOCK(inp);
 
 			/*
 			 * Check multicast packets to make sure they are only
 			 * sent to sockets with multicast memberships for the
 			 * packet's destination address and arrival interface
 			 */
 #define MSHIP(_inp, n) ((_inp)->inp_moptions->imo_membership[(n)])
 #define NMSHIPS(_inp) ((_inp)->inp_moptions->imo_num_memberships)
 			if (strict_mcast_mship && inp->inp_moptions != NULL) {
 				int mship, foundmship = 0;
 
 				for (mship = 0; mship < NMSHIPS(inp); mship++) {
 					if (MSHIP(inp, mship)->inm_addr.s_addr
 					    == ip->ip_dst.s_addr &&
 					    MSHIP(inp, mship)->inm_ifp
 					    == m->m_pkthdr.rcvif) {
 						foundmship = 1;
 						break;
 					}
 				}
 				if (foundmship == 0) {
 					INP_UNLOCK(inp);
 					continue;
 				}
 			}
 #undef NMSHIPS
 #undef MSHIP
 			if (last != NULL) {
 				struct mbuf *n;
 
 				n = m_copy(m, 0, M_COPYALL);
 				if (n != NULL)
 					udp_append(last, ip, n,
 						   iphlen +
 						   sizeof(struct udphdr),
 						   &udp_in);
 				INP_UNLOCK(last);
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids searching
 			 * through all pcbs in the common case of a non-shared
 			 * port.  It assumes that an application will never
 			 * clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.
 			 * (No need to send an ICMP Port Unreachable
 			 * for a broadcast or multicast datagram.)
 			 */
 			udpstat.udps_noportbcast++;
 			goto badheadlocked;
 		}
 		udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
 		    &udp_in);
 		INP_UNLOCK(last);
 		INP_INFO_RUNLOCK(&udbinfo);
 		return;
 	}
 	/*
 	 * Locate pcb for datagram.
 	 */
 	inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
 	    ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);
 	if (inp == NULL) {
 		if (log_in_vain) {
 			char buf[4*sizeof "123"];
 
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		udpstat.udps_noport++;
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			udpstat.udps_noportbcast++;
 			goto badheadlocked;
 		}
 		if (blackhole)
 			goto badheadlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badheadlocked;
 		*ip = save_ip;
 		ip->ip_len += iphlen;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		INP_INFO_RUNLOCK(&udbinfo);
 		return;
 	}
 	INP_LOCK(inp);
 	udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
 	INP_UNLOCK(inp);
 	INP_INFO_RUNLOCK(&udbinfo);
 	return;
 
 badheadlocked:
 	if (inp)
 		INP_UNLOCK(inp);
 	INP_INFO_RUNLOCK(&udbinfo);
 badunlocked:
 	m_freem(m);
 	if (opts)
 		m_freem(opts);
 	return;
 }
 
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  *
  * Arguments:
  *	last	pcb whose socket receives the datagram; asserted to be locked.
  *	ip	IP header of the datagram, handed to ip_savecontrol().
  *	n	mbuf chain to deliver.  It is always consumed: freed here when
  *		a policy check rejects it or the append fails, otherwise
  *		passed on to sbappendaddr_locked().
  *	off	number of leading bytes trimmed from n with m_adj() before
  *		the append; the callers in this file pass the IP header
  *		length plus sizeof(struct udphdr).
  *	udp_in	source address reported to the receiver.
  */
 static void
 udp_append(last, ip, n, off, udp_in)
 	struct inpcb *last;
 	struct ip *ip;
 	struct mbuf *n;
 	int off;
 	struct sockaddr_in *udp_in;
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *opts = 0;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 
 	INP_LOCK_ASSERT(last);
 
 #if defined(IPSEC) || defined(FAST_IPSEC)
 	/* check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, last)) {
 #ifdef IPSEC
 		ipsecstat.in_polvio++;
 #endif /*IPSEC*/
 		m_freem(n);
 		return;
 	}
 #endif /*IPSEC || FAST_IPSEC*/
 #ifdef MAC
 	/* Drop the datagram if the MAC check refuses delivery to this pcb. */
 	if (mac_check_inpcb_deliver(last, n) != 0) {
 		m_freem(n);
 		return;
 	}
 #endif
 	/*
 	 * Build control data (opts) only when the pcb has INP_CONTROLOPTS
 	 * set or the socket has SO_TIMESTAMP or SO_BINTIME set.
 	 */
 	if (last->inp_flags & INP_CONTROLOPTS ||
 	    last->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (last->inp_vflag & INP_IPV6) {
 			int savedflags;
 
 			/*
 			 * INP_UNMAPPABLEOPTS is masked off only for the
 			 * duration of the ip6_savecontrol() call; the
 			 * original flags are restored right after.
 			 */
 			savedflags = last->inp_flags;
 			last->inp_flags &= ~INP_UNMAPPABLEOPTS;
 			ip6_savecontrol(last, n, &opts);
 			last->inp_flags = savedflags;
 		} else
 #endif
 		ip_savecontrol(last, &opts, ip, n);
 	}
 #ifdef INET6
 	if (last->inp_vflag & INP_IPV6) {
 		/* Hand an IPv6 pcb a v4-mapped sockaddr_in6 built from udp_in. */
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif
 	append_sa = (struct sockaddr *)udp_in;
 	/* Trim the first off bytes so that only the remainder is queued. */
 	m_adj(n, off);
 
 	so = last->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		/* Append failed: free the chain and options, count the drop. */
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		udpstat.udps_fullsock++;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 	} else
 		/*
 		 * NOTE(review): no explicit SOCKBUF_UNLOCK on this path;
 		 * presumably sorwakeup_locked() releases the lock -- confirm.
 		 */
 		sorwakeup_locked(so);
 }
 
 /*
  * Report an asynchronous error to the owner of a UDP socket: record the
  * error code on the socket and wake both the read and the write side so
  * that the error can be picked up.  The pcb is handed back unchanged.
  */
 struct inpcb *
 udp_notify(struct inpcb *inp, int errno)
 {
 
 	inp->inp_socket->so_error = errno;
 	/* Wake readers first, then writers. */
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return (inp);
 }
 
 /*
  * Control-input entry point for UDP.  cmd is a PRC_* code, sa names the
  * address concerned and vip, when used, points at the IP header of the
  * datagram that triggered the event.  The matching pcb (or, for
  * PRC_HOSTDEAD, every pcb talking to that address) is notified with the
  * error that inetctlerrmap[] maps cmd to.
  */
 void
 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
 	struct in_addr faddr;
 	struct udphdr *uh;
 	struct inpcb *inp;
 	struct ip *ip;
 	int s;
 
 	/* Only non-wildcard AF_INET addresses are of interest. */
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	/* Redirects are not handled at this layer. */
 	if (PRC_IS_REDIRECT(cmd))
 		return;
 
 	/*
 	 * PRC_HOSTDEAD discards the header so that every pcb is walked
 	 * linearly below.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	else
 		ip = vip;
 
 	if (ip == NULL) {
 		/* No header to match against: notify all pcbs for faddr. */
 		in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify);
 		return;
 	}
 
 	/* Locate the single pcb described by the embedded IP/UDP headers. */
 	s = splnet();
 	uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	INP_INFO_RLOCK(&udbinfo);
 	inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport,
 	    ip->ip_src, uh->uh_sport, 0, NULL);
 	if (inp != NULL) {
 		INP_LOCK(inp);
 		if (inp->inp_socket != NULL)
 			(*notify)(inp, inetctlerrmap[cmd]);
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 }
 
 /*
  * Sysctl handler that exports the list of UDP pcbs (read-only).
  *
  * Output layout: a struct xinpgen header, one struct xinpcb per exported
  * pcb, and a trailing struct xinpgen carrying the generation counts seen
  * after the copy.  A caller that finds the trailing generation different
  * from the leading one knows the list changed underneath it.
  *
  * Returns 0 on success, EPERM if new data was supplied, ENOMEM if the
  * temporary pointer array could not be allocated, or the first error
  * reported by sysctl_wire_old_buffer() or SYSCTL_OUT().
  */
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n, s;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * Size probe (no output buffer supplied): building the list is too
 	 * expensive to do twice per request, so only report an estimate
 	 * with n/8 extra records of slack.
 	 */
 	if (req->oldptr == 0) {
 		n = udbinfo.ipi_count;
 		req->oldidx = 2 * (sizeof xig)
 			+ (n + n/8) * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	/*
 	 * OK, now we're committed to doing something.
 	 * Snapshot the generation count and pcb count under the info lock.
 	 */
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	gencnt = udbinfo.ipi_gencnt;
 	n = udbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	/* Leading header. */
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return ENOMEM;
 
 	/*
 	 * Collect up to n pcbs that are no newer than the snapshot and
 	 * that the requesting credential is allowed to see.
 	 */
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
 			inp_list[i++] = inp;
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 	n = i;
 
 	/*
 	 * Copy the collected pcbs out.  Stop at the first failed
 	 * SYSCTL_OUT(): otherwise a later successful copy would overwrite
 	 * the error and the trailer below would be emitted as if the
 	 * whole list had been delivered.
 	 */
 	error = 0;
 	for (i = 0; error == 0 && i < n; i++) {
 		inp = inp_list[i];
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
+			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		}
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		s = splnet();
 		INP_INFO_RLOCK(&udbinfo);
 		xig.xig_gen = udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = udbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&udbinfo);
 		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return error;
 }
 
 /* Read-only "pcblist" node under _net_inet_udp, served by udp_pcblist(). */
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
 	    udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 /*
  * Sysctl handler that returns the credentials (struct xucred) of the
  * socket owning a given UDP connection.
  *
  * The caller must pass suser_cred() with SUSER_ALLOWJAIL and supply two
  * struct sockaddr_in as new data.  For the lookup, addrs[1] goes in the
  * argument position where udp_input() passes a datagram's source, and
  * addrs[0] where it passes the destination.
  *
  * Returns ENOENT when no pcb (or no socket) matches, the error from
  * cr_canseesocket() when the caller may not see the socket, otherwise
  * the result of copying the xucred out.
  */
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct sockaddr_in addrs[2];
 	struct xucred xuc;
 	struct inpcb *inp;
 	int error, s;
 
 	error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	s = splnet();
 	INP_INFO_RLOCK(&udbinfo);
 	inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 				addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
 	if (inp == NULL || inp->inp_socket == NULL) {
 		error = ENOENT;
 	} else {
 		error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
 		/* Convert the credential only once visibility is confirmed. */
 		if (error == 0)
 			cru2x(inp->inp_socket->so_cred, &xuc);
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	splx(s);
 
 	/* xuc is only valid, and only copied out, on the success path. */
 	if (error != 0)
 		return (error);
 	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
 }
 
 /*
  * "getcred" node under _net_inet_udp, served by udp_getcred().  It is
  * flagged read-write because the handler reads the address pair from the
  * new-data side of the request.
  */
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
 
 /*
  * Build and transmit one UDP datagram.
  *
  * Arguments:
  *	inp	pcb of the sending socket.
  *	m	payload mbuf chain; consumed on every path (freed on error,
  *		handed to ip_output() on success).
  *	addr	optional destination; when non-NULL the pcb must not already
  *		have a foreign address (EISCONN otherwise).
  *	control	optional control mbuf; always freed here.  Only a single
  *		mbuf is accepted, and at level IPPROTO_IP only
  *		IP_SENDSRCADDR is understood.
  *	td	calling thread, used for its credentials.
  *
  * Returns 0 or an errno value: EMSGSIZE, EINVAL, ENOPROTOOPT, EISCONN,
  * ENOTCONN, EAGAIN, ENOBUFS, or whatever in_pcbbind_setup(),
  * in_pcbconnect_setup() or ip_output() report.
  */
 static int
 udp_output(inp, m, addr, control, td)
 	register struct inpcb *inp;
 	struct mbuf *m;
 	struct sockaddr *addr;
 	struct mbuf *control;
 	struct thread *td;
 {
 	register struct udpiphdr *ui;
 	register int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct sockaddr_in *sin, src;
 	int error = 0;
 	int ipflags;
 	u_short fport, lport;
 	int unlock_udbinfo;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front what inpcb locks we will
 	 * need.  Do any work to decide what is needed up front before
 	 * acquiring locks.
 	 */
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return EMSGSIZE;
 	}
 
 	/* INADDR_ANY in src means "no source address override requested". */
 	src.sin_addr.s_addr = INADDR_ANY;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information
 		 * is stored in a single mbuf.
 		 */
 		if (control->m_next) {
 			m_freem(control);
 			m_freem(m);
 			return EINVAL;
 		}
 		/*
 		 * Walk the cmsghdr records in place, advancing m_data and
 		 * shrinking m_len by the aligned length of each record.
 		 */
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 ||
 			    cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 			/* Records for other protocol levels are skipped. */
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr = *(struct in_addr *)CMSG_DATA(cm);
 				break;
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			/* The breaks above only leave the switch. */
 			if (error)
 				break;
 		}
 		m_freem(control);
 	}
 	if (error) {
 		m_freem(m);
 		return error;
 	}
 
 	/*
 	 * The pcbinfo write lock is taken only when a source override or
 	 * an explicit destination is in play, i.e. when the bind/connect
 	 * setup below will run.
 	 */
 	if (src.sin_addr.s_addr != INADDR_ANY ||
 	    addr != NULL) {
 		INP_INFO_WLOCK(&udbinfo);
 		unlock_udbinfo = 1;
 	} else
 		unlock_udbinfo = 0;
 	INP_LOCK(inp);
 
 #ifdef MAC
 	mac_create_mbuf_from_inpcb(inp, m);
 #endif
 
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_addr.s_addr != INADDR_ANY) {
 		/* A source override requires an already-assigned local port. */
 		if (lport == 0) {
 			error = EINVAL;
 			goto release;
 		}
 		/*
 		 * NOTE(review): td is dereferenced unconditionally here and
 		 * below, while the jail check further down tests "td &&"
 		 * first -- confirm whether td can be NULL.
 		 */
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		if (error)
 			goto release;
 	}
 
 	if (addr) {
 		sin = (struct sockaddr_in *)addr;
 		if (td && jailed(td->td_ucred))
 			prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 		/*
 		 * Results land in the local laddr/lport/faddr/fport
 		 * variables; the pcb itself is only touched by the
 		 * port-commit step below.
 		 */
 		error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport,
 		    &faddr.s_addr, &fport, NULL, td->td_ucred);
 		if (error)
 			goto release;
 
 		/* Commit the local port if newly assigned. */
 		if (inp->inp_laddr.s_addr == INADDR_ANY &&
 		    inp->inp_lport == 0) {
 			/*
 			 * Remember addr if jailed, to prevent rebinding.
 			 */
 			if (jailed(td->td_ucred))
 				inp->inp_laddr = laddr;
 			inp->inp_lport = lport;
 			if (in_pcbinshash(inp) != 0) {
 				/* Undo the port assignment on failure. */
 				inp->inp_lport = 0;
 				error = EAGAIN;
 				goto release;
 			}
 			inp->inp_flags |= INP_ANONPORT;
 		}
 	} else {
 		/* No explicit destination: use the pcb's foreign address. */
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediate slide the data pointer back forward
 	 * since we won't use that space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header
 	 * and addresses and length put into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_pr = IPPROTO_UDP;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 
 	/* Only these two socket options are forwarded as ip_output() flags. */
 	ipflags = inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST);
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	if (udpcksum) {
 		/*
 		 * With INP_ONESBCAST the pseudo-header sum is computed
 		 * over INADDR_BROADCAST rather than ui_dst.
 		 */
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	} else {
 		ui->ui_sum = 0;
 	}
 	((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = inp->inp_ip_tos;	/* XXX */
 	udpstat.udps_opackets++;
 
 	/*
 	 * The pcbinfo lock is dropped before ip_output(); the inpcb lock
 	 * stays held across the call.
 	 */
 	if (unlock_udbinfo)
 		INP_INFO_WUNLOCK(&udbinfo);
 	error = ip_output(m, inp->inp_options, NULL, ipflags,
 	    inp->inp_moptions, inp);
 	INP_UNLOCK(inp);
 	return (error);
 
 release:
 	/* Error exit: drop whichever locks were taken and free the datagram. */
 	INP_UNLOCK(inp);
 	if (unlock_udbinfo)
 		INP_INFO_WUNLOCK(&udbinfo);
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Send and receive space passed to soreserve() by udp_attach(); both are
  * tunable at run time through the sysctl nodes declared here.
  *
  * NOTE(review): both variables are u_long but are exported with
  * SYSCTL_INT -- confirm this is intended where long is wider than int.
  */
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 /* Room for 40 1K datagrams, each with its source sockaddr. */
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );
 SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 static int
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;	/* ??? possible? panic instead? */
 	}
 	INP_LOCK(inp);
 	soisdisconnected(so);
 	s = splnet();
 	in_pcbdetach(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	splx(s);
 	return 0;
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp != 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return error;
 	}
 	s = splnet();
 	error = in_pcballoc(so, &udbinfo, "udpinp");
 	splx(s);
 	if (error) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return error;
 	}
 
 	inp = (struct inpcb *)so->so_pcb;
 	INP_LOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_ttl = ip_defttl;
 	INP_UNLOCK(inp);
 	return 0;
 }
 
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	s = splnet();
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	splx(s);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	return error;
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int s, error;
 	struct sockaddr_in *sin;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_UNLOCK(inp);
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EISCONN;
 	}
 	s = splnet();
 	sin = (struct sockaddr_in *)nam;
 	if (td && jailed(td->td_ucred))
 		prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
 	error = in_pcbconnect(inp, nam, td->td_ucred);
 	splx(s);
 	if (error == 0)
 		soisconnected(so);
 	INP_UNLOCK(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	return error;
 }
 
 static int
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == 0) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return EINVAL;
 	}
 	INP_LOCK(inp);
 	s = splnet();
 	in_pcbdetach(inp);
 	INP_INFO_WUNLOCK(&udbinfo);
 	splx(s);
 	return 0;
 }
 
 /*
  * pru_disconnect handler: drop the pcb's foreign address, reset its local
  * address to INADDR_ANY and clear SS_ISCONNECTED on the socket.  Returns
  * EINVAL when the socket has no pcb and ENOTCONN when no foreign address
  * is set.
  */
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	int s;
 
 	INP_INFO_WLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp == NULL) {
 		INP_INFO_WUNLOCK(&udbinfo);
 		return (EINVAL);
 	}
 
 	INP_LOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		s = splnet();
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_UNLOCK(inp);
 		INP_INFO_WUNLOCK(&udbinfo);
 		splx(s);
 		/* XXX: socket state is updated after both locks are gone. */
 		so->so_state &= ~SS_ISCONNECTED;
 		return (0);
 	}
 
 	/* Not connected.  Note: the pcbinfo lock is released first here. */
 	INP_INFO_WUNLOCK(&udbinfo);
 	INP_UNLOCK(inp);
 	return (ENOTCONN);
 }
 
 /*
  * pru_send handler: look up the socket's pcb and pass everything on to
  * udp_output().  The flags argument is not consulted.
  */
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 	    struct mbuf *control, struct thread *td)
 {
 
 	return (udp_output(sotoinpcb(so), m, addr, control, td));
 }
 
 /*
  * pru_shutdown handler: mark the socket as unable to send any more data.
  * Returns EINVAL when the socket has no pcb, 0 otherwise.
  */
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&udbinfo);
 	inp = sotoinpcb(so);
 	if (inp != NULL) {
 		/* Swap the pcbinfo read lock for the pcb lock. */
 		INP_LOCK(inp);
 		INP_INFO_RUNLOCK(&udbinfo);
 		socantsendmore(so);
 		INP_UNLOCK(inp);
 		return (0);
 	}
 	INP_INFO_RUNLOCK(&udbinfo);
 	return (EINVAL);
 }
 
 /*
  * This is the wrapper function for in_setsockaddr.  We just pass down
  * the pcbinfo for in_setsockaddr to lock.  We don't want to do the locking
  * here because in_setsockaddr will call malloc and might block.
  */
 static int
 udp_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setsockaddr(so, nam, &udbinfo));
 }
 
 /*
  * This is the wrapper function for in_setpeeraddr.  We just pass down
  * the pcbinfo for in_setpeeraddr to lock.
  */
 static int
 udp_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	return (in_setpeeraddr(so, nam, &udbinfo));
 }
 
 /*
  * User-request switch for UDP sockets.  The initializer is positional, so
  * the entries must stay in the member order of struct pr_usrreqs (declared
  * elsewhere).  Judging by the pru_*_notsupp names, requests UDP does not
  * implement (accept, connect2, listen, rcvd, rcvoob) use generic stubs.
  */
 struct pr_usrreqs udp_usrreqs = {
 	udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect,
 	pru_connect2_notsupp, in_control, udp_detach, udp_disconnect,
 	pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp,
 	pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown,
 	udp_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel
 };