Index: stable/12/sys/cam/cam_xpt.c
===================================================================
--- stable/12/sys/cam/cam_xpt.c	(revision 355609)
+++ stable/12/sys/cam/cam_xpt.c	(revision 355610)
@@ -1,5633 +1,5633 @@
 /*-
  * Implementation of the Common Access Method Transport (XPT) layer.
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_printf.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/time.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/kthread.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_iosched.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_compat.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_pass.h>
 
 #include <machine/md_var.h>	/* geometry translation */
 #include <machine/stdarg.h>	/* for xpt_print below */
 
 #include "opt_cam.h"
 
 /* Wild guess based on not wanting to grow the stack too much */
 #define XPT_PRINT_MAXLEN	512
 #ifdef PRINTF_BUFR_SIZE
 #define XPT_PRINT_LEN	PRINTF_BUFR_SIZE
 #else
 #define XPT_PRINT_LEN	128
 #endif
 _Static_assert(XPT_PRINT_LEN <= XPT_PRINT_MAXLEN, "XPT_PRINT_LEN is too large");
 
 /*
  * This is the maximum number of high powered commands (e.g. start unit)
  * that can be outstanding at a particular time.
  */
 #ifndef CAM_MAX_HIGHPOWER
 #define CAM_MAX_HIGHPOWER  4
 #endif
 
 /* Datastructures internal to the xpt layer */
 MALLOC_DEFINE(M_CAMXPT, "CAM XPT", "CAM XPT buffers");
 MALLOC_DEFINE(M_CAMDEV, "CAM DEV", "CAM devices");
 MALLOC_DEFINE(M_CAMCCB, "CAM CCB", "CAM CCBs");
 MALLOC_DEFINE(M_CAMPATH, "CAM path", "CAM paths");
 
 struct xpt_softc {
 	uint32_t		xpt_generation;
 
 	/* number of high powered commands that can go through right now */
 	struct mtx		xpt_highpower_lock;
 	STAILQ_HEAD(highpowerlist, cam_ed)	highpowerq;
 	int			num_highpower;
 
 	/* queue for handling async rescan requests. */
 	TAILQ_HEAD(, ccb_hdr) ccb_scanq;
 	int buses_to_config;
 	int buses_config_done;
 	int announce_nosbuf;
 
 	/*
 	 * Registered buses
 	 *
 	 * N.B., "busses" is an archaic spelling of "buses".  In new code
 	 * "buses" is preferred.
 	 */
 	TAILQ_HEAD(,cam_eb)	xpt_busses;
 	u_int			bus_generation;
 
 	int			boot_delay;
 	struct callout 		boot_callout;
 	struct task		boot_task;
 	struct root_hold_token	xpt_rootmount;
 
 	struct mtx		xpt_topo_lock;
 	struct taskqueue	*xpt_taskq;
 };
 
 typedef enum {
 	DM_RET_COPY		= 0x01,
 	DM_RET_FLAG_MASK	= 0x0f,
 	DM_RET_NONE		= 0x00,
 	DM_RET_STOP		= 0x10,
 	DM_RET_DESCEND		= 0x20,
 	DM_RET_ERROR		= 0x30,
 	DM_RET_ACTION_MASK	= 0xf0
 } dev_match_ret;
 
 typedef enum {
 	XPT_DEPTH_BUS,
 	XPT_DEPTH_TARGET,
 	XPT_DEPTH_DEVICE,
 	XPT_DEPTH_PERIPH
 } xpt_traverse_depth;
 
 struct xpt_traverse_config {
 	xpt_traverse_depth	depth;
 	void			*tr_func;
 	void			*tr_arg;
 };
 
 typedef	int	xpt_busfunc_t (struct cam_eb *bus, void *arg);
 typedef	int	xpt_targetfunc_t (struct cam_et *target, void *arg);
 typedef	int	xpt_devicefunc_t (struct cam_ed *device, void *arg);
 typedef	int	xpt_periphfunc_t (struct cam_periph *periph, void *arg);
 typedef int	xpt_pdrvfunc_t (struct periph_driver **pdrv, void *arg);
 
 /* Transport layer configuration information */
 static struct xpt_softc xsoftc;
 
 MTX_SYSINIT(xpt_topo_init, &xsoftc.xpt_topo_lock, "XPT topology lock", MTX_DEF);
 
 SYSCTL_INT(_kern_cam, OID_AUTO, boot_delay, CTLFLAG_RDTUN,
            &xsoftc.boot_delay, 0, "Bus registration wait time");
 SYSCTL_UINT(_kern_cam, OID_AUTO, xpt_generation, CTLFLAG_RD,
 	    &xsoftc.xpt_generation, 0, "CAM peripheral generation count");
 SYSCTL_INT(_kern_cam, OID_AUTO, announce_nosbuf, CTLFLAG_RWTUN,
 	    &xsoftc.announce_nosbuf, 0, "Don't use sbuf for announcements");
 
 struct cam_doneq {
 	struct mtx_padalign	cam_doneq_mtx;
 	STAILQ_HEAD(, ccb_hdr)	cam_doneq;
 	int			cam_doneq_sleep;
 };
 
 static struct cam_doneq cam_doneqs[MAXCPU];
 static int cam_num_doneqs;
 static struct proc *cam_proc;
 
 SYSCTL_INT(_kern_cam, OID_AUTO, num_doneqs, CTLFLAG_RDTUN,
            &cam_num_doneqs, 0, "Number of completion queues/threads");
 
 struct cam_periph *xpt_periph;
 
 static periph_init_t xpt_periph_init;
 
 static struct periph_driver xpt_driver =
 {
 	xpt_periph_init, "xpt",
 	TAILQ_HEAD_INITIALIZER(xpt_driver.units), /* generation */ 0,
 	CAM_PERIPH_DRV_EARLY
 };
 
 PERIPHDRIVER_DECLARE(xpt, xpt_driver);
 
 static d_open_t xptopen;
 static d_close_t xptclose;
 static d_ioctl_t xptioctl;
 static d_ioctl_t xptdoioctl;
 
 static struct cdevsw xpt_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_open =	xptopen,
 	.d_close =	xptclose,
 	.d_ioctl =	xptioctl,
 	.d_name =	"xpt",
 };
 
 /* Storage for debugging datastructures */
 struct cam_path *cam_dpath;
-u_int32_t cam_dflags = CAM_DEBUG_FLAGS;
+u_int32_t __read_mostly cam_dflags = CAM_DEBUG_FLAGS;
 SYSCTL_UINT(_kern_cam, OID_AUTO, dflags, CTLFLAG_RWTUN,
 	&cam_dflags, 0, "Enabled debug flags");
 u_int32_t cam_debug_delay = CAM_DEBUG_DELAY;
 SYSCTL_UINT(_kern_cam, OID_AUTO, debug_delay, CTLFLAG_RWTUN,
 	&cam_debug_delay, 0, "Delay in us after each debug message");
 
 /* Our boot-time initialization hook */
 static int cam_module_event_handler(module_t, int /*modeventtype_t*/, void *);
 
 static moduledata_t cam_moduledata = {
 	"cam",
 	cam_module_event_handler,
 	NULL
 };
 
 static int	xpt_init(void *);
 
 DECLARE_MODULE(cam, cam_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
 MODULE_VERSION(cam, 1);
 
 
 static void		xpt_async_bcast(struct async_list *async_head,
 					u_int32_t async_code,
 					struct cam_path *path,
 					void *async_arg);
 static path_id_t xptnextfreepathid(void);
 static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus);
 static union ccb *xpt_get_ccb(struct cam_periph *periph);
 static union ccb *xpt_get_ccb_nowait(struct cam_periph *periph);
 static void	 xpt_run_allocq(struct cam_periph *periph, int sleep);
 static void	 xpt_run_allocq_task(void *context, int pending);
 static void	 xpt_run_devq(struct cam_devq *devq);
 static timeout_t xpt_release_devq_timeout;
 static void	 xpt_release_simq_timeout(void *arg) __unused;
 static void	 xpt_acquire_bus(struct cam_eb *bus);
 static void	 xpt_release_bus(struct cam_eb *bus);
 static uint32_t	 xpt_freeze_devq_device(struct cam_ed *dev, u_int count);
 static int	 xpt_release_devq_device(struct cam_ed *dev, u_int count,
 		    int run_queue);
 static struct cam_et*
 		 xpt_alloc_target(struct cam_eb *bus, target_id_t target_id);
 static void	 xpt_acquire_target(struct cam_et *target);
 static void	 xpt_release_target(struct cam_et *target);
 static struct cam_eb*
 		 xpt_find_bus(path_id_t path_id);
 static struct cam_et*
 		 xpt_find_target(struct cam_eb *bus, target_id_t target_id);
 static struct cam_ed*
 		 xpt_find_device(struct cam_et *target, lun_id_t lun_id);
 static void	 xpt_config(void *arg);
 static void	 xpt_hold_boot_locked(void);
 static int	 xpt_schedule_dev(struct camq *queue, cam_pinfo *dev_pinfo,
 				 u_int32_t new_priority);
 static xpt_devicefunc_t xptpassannouncefunc;
 static void	 xptaction(struct cam_sim *sim, union ccb *work_ccb);
 static void	 xptpoll(struct cam_sim *sim);
 static void	 camisr_runqueue(void);
 static void	 xpt_done_process(struct ccb_hdr *ccb_h);
 static void	 xpt_done_td(void *);
 static dev_match_ret	xptbusmatch(struct dev_match_pattern *patterns,
 				    u_int num_patterns, struct cam_eb *bus);
 static dev_match_ret	xptdevicematch(struct dev_match_pattern *patterns,
 				       u_int num_patterns,
 				       struct cam_ed *device);
 static dev_match_ret	xptperiphmatch(struct dev_match_pattern *patterns,
 				       u_int num_patterns,
 				       struct cam_periph *periph);
 static xpt_busfunc_t	xptedtbusfunc;
 static xpt_targetfunc_t	xptedttargetfunc;
 static xpt_devicefunc_t	xptedtdevicefunc;
 static xpt_periphfunc_t	xptedtperiphfunc;
 static xpt_pdrvfunc_t	xptplistpdrvfunc;
 static xpt_periphfunc_t	xptplistperiphfunc;
 static int		xptedtmatch(struct ccb_dev_match *cdm);
 static int		xptperiphlistmatch(struct ccb_dev_match *cdm);
 static int		xptbustraverse(struct cam_eb *start_bus,
 				       xpt_busfunc_t *tr_func, void *arg);
 static int		xpttargettraverse(struct cam_eb *bus,
 					  struct cam_et *start_target,
 					  xpt_targetfunc_t *tr_func, void *arg);
 static int		xptdevicetraverse(struct cam_et *target,
 					  struct cam_ed *start_device,
 					  xpt_devicefunc_t *tr_func, void *arg);
 static int		xptperiphtraverse(struct cam_ed *device,
 					  struct cam_periph *start_periph,
 					  xpt_periphfunc_t *tr_func, void *arg);
 static int		xptpdrvtraverse(struct periph_driver **start_pdrv,
 					xpt_pdrvfunc_t *tr_func, void *arg);
 static int		xptpdperiphtraverse(struct periph_driver **pdrv,
 					    struct cam_periph *start_periph,
 					    xpt_periphfunc_t *tr_func,
 					    void *arg);
 static xpt_busfunc_t	xptdefbusfunc;
 static xpt_targetfunc_t	xptdeftargetfunc;
 static xpt_devicefunc_t	xptdefdevicefunc;
 static xpt_periphfunc_t	xptdefperiphfunc;
 static void		xpt_finishconfig_task(void *context, int pending);
 static void		xpt_dev_async_default(u_int32_t async_code,
 					      struct cam_eb *bus,
 					      struct cam_et *target,
 					      struct cam_ed *device,
 					      void *async_arg);
 static struct cam_ed *	xpt_alloc_device_default(struct cam_eb *bus,
 						 struct cam_et *target,
 						 lun_id_t lun_id);
 static xpt_devicefunc_t	xptsetasyncfunc;
 static xpt_busfunc_t	xptsetasyncbusfunc;
 static cam_status	xptregister(struct cam_periph *periph,
 				    void *arg);
 static __inline int device_is_queued(struct cam_ed *device);
 
 static __inline int
 xpt_schedule_devq(struct cam_devq *devq, struct cam_ed *dev)
 {
 	int	retval;
 
 	mtx_assert(&devq->send_mtx, MA_OWNED);
 	if ((dev->ccbq.queue.entries > 0) &&
 	    (dev->ccbq.dev_openings > 0) &&
 	    (dev->ccbq.queue.qfrozen_cnt == 0)) {
 		/*
 		 * The priority of a device waiting for controller
 		 * resources is that of the highest priority CCB
 		 * enqueued.
 		 */
 		retval =
 		    xpt_schedule_dev(&devq->send_queue,
 				     &dev->devq_entry,
 				     CAMQ_GET_PRIO(&dev->ccbq.queue));
 	} else {
 		retval = 0;
 	}
 	return (retval);
 }
 
 static __inline int
 device_is_queued(struct cam_ed *device)
 {
 	return (device->devq_entry.index != CAM_UNQUEUED_INDEX);
 }
 
 static void
 xpt_periph_init()
 {
 	make_dev(&xpt_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "xpt0");
 }
 
 static int
 xptopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 
 	/*
 	 * Only allow read-write access.
 	 */
 	if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0))
 		return(EPERM);
 
 	/*
 	 * We don't allow nonblocking access.
 	 */
 	if ((flags & O_NONBLOCK) != 0) {
 		printf("%s: can't do nonblocking access\n", devtoname(dev));
 		return(ENODEV);
 	}
 
 	return(0);
 }
 
 static int
 xptclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
 
 	return(0);
 }
 
 /*
  * Don't automatically grab the xpt softc lock here even though this is going
  * through the xpt device.  The xpt device is really just a back door for
  * accessing other devices and SIMs, so the right thing to do is to grab
  * the appropriate SIM lock once the bus/SIM is located.
  */
 static int
 xptioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
 	int error;
 
 	if ((error = xptdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) {
 		error = cam_compat_ioctl(dev, cmd, addr, flag, td, xptdoioctl);
 	}
 	return (error);
 }
 
 static int
 xptdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
 	int error;
 
 	error = 0;
 
 	switch(cmd) {
 	/*
 	 * For the transport layer CAMIOCOMMAND ioctl, we really only want
 	 * to accept CCB types that don't quite make sense to send through a
 	 * passthrough driver. XPT_PATH_INQ is an exception to this, as stated
 	 * in the CAM spec.
 	 */
 	case CAMIOCOMMAND: {
 		union ccb *ccb;
 		union ccb *inccb;
 		struct cam_eb *bus;
 
 		inccb = (union ccb *)addr;
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 		if (inccb->ccb_h.func_code == XPT_SCSI_IO)
 			inccb->csio.bio = NULL;
 #endif
 
 		if (inccb->ccb_h.flags & CAM_UNLOCKED)
 			return (EINVAL);
 
 		bus = xpt_find_bus(inccb->ccb_h.path_id);
 		if (bus == NULL)
 			return (EINVAL);
 
 		switch (inccb->ccb_h.func_code) {
 		case XPT_SCAN_BUS:
 		case XPT_RESET_BUS:
 			if (inccb->ccb_h.target_id != CAM_TARGET_WILDCARD ||
 			    inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
 				xpt_release_bus(bus);
 				return (EINVAL);
 			}
 			break;
 		case XPT_SCAN_TGT:
 			if (inccb->ccb_h.target_id == CAM_TARGET_WILDCARD ||
 			    inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
 				xpt_release_bus(bus);
 				return (EINVAL);
 			}
 			break;
 		default:
 			break;
 		}
 
 		switch(inccb->ccb_h.func_code) {
 		case XPT_SCAN_BUS:
 		case XPT_RESET_BUS:
 		case XPT_PATH_INQ:
 		case XPT_ENG_INQ:
 		case XPT_SCAN_LUN:
 		case XPT_SCAN_TGT:
 
 			ccb = xpt_alloc_ccb();
 
 			/*
 			 * Create a path using the bus, target, and lun the
 			 * user passed in.
 			 */
 			if (xpt_create_path(&ccb->ccb_h.path, NULL,
 					    inccb->ccb_h.path_id,
 					    inccb->ccb_h.target_id,
 					    inccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP){
 				error = EINVAL;
 				xpt_free_ccb(ccb);
 				break;
 			}
 			/* Ensure all of our fields are correct */
 			xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path,
 				      inccb->ccb_h.pinfo.priority);
 			xpt_merge_ccb(ccb, inccb);
 			xpt_path_lock(ccb->ccb_h.path);
 			cam_periph_runccb(ccb, NULL, 0, 0, NULL);
 			xpt_path_unlock(ccb->ccb_h.path);
 			bcopy(ccb, inccb, sizeof(union ccb));
 			xpt_free_path(ccb->ccb_h.path);
 			xpt_free_ccb(ccb);
 			break;
 
 		case XPT_DEBUG: {
 			union ccb ccb;
 
 			/*
 			 * This is an immediate CCB, so it's okay to
 			 * allocate it on the stack.
 			 */
 
 			/*
 			 * Create a path using the bus, target, and lun the
 			 * user passed in.
 			 */
 			if (xpt_create_path(&ccb.ccb_h.path, NULL,
 					    inccb->ccb_h.path_id,
 					    inccb->ccb_h.target_id,
 					    inccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP){
 				error = EINVAL;
 				break;
 			}
 			/* Ensure all of our fields are correct */
 			xpt_setup_ccb(&ccb.ccb_h, ccb.ccb_h.path,
 				      inccb->ccb_h.pinfo.priority);
 			xpt_merge_ccb(&ccb, inccb);
 			xpt_action(&ccb);
 			bcopy(&ccb, inccb, sizeof(union ccb));
 			xpt_free_path(ccb.ccb_h.path);
 			break;
 
 		}
 		case XPT_DEV_MATCH: {
 			struct cam_periph_map_info mapinfo;
 			struct cam_path *old_path;
 
 			/*
 			 * We can't deal with physical addresses for this
 			 * type of transaction.
 			 */
 			if ((inccb->ccb_h.flags & CAM_DATA_MASK) !=
 			    CAM_DATA_VADDR) {
 				error = EINVAL;
 				break;
 			}
 
 			/*
 			 * Save this in case the caller had it set to
 			 * something in particular.
 			 */
 			old_path = inccb->ccb_h.path;
 
 			/*
 			 * We really don't need a path for the matching
 			 * code.  The path is needed because of the
 			 * debugging statements in xpt_action().  They
 			 * assume that the CCB has a valid path.
 			 */
 			inccb->ccb_h.path = xpt_periph->path;
 
 			bzero(&mapinfo, sizeof(mapinfo));
 
 			/*
 			 * Map the pattern and match buffers into kernel
 			 * virtual address space.
 			 */
 			error = cam_periph_mapmem(inccb, &mapinfo, MAXPHYS);
 
 			if (error) {
 				inccb->ccb_h.path = old_path;
 				break;
 			}
 
 			/*
 			 * This is an immediate CCB, we can send it on directly.
 			 */
 			xpt_action(inccb);
 
 			/*
 			 * Map the buffers back into user space.
 			 */
 			cam_periph_unmapmem(inccb, &mapinfo);
 
 			inccb->ccb_h.path = old_path;
 
 			error = 0;
 			break;
 		}
 		default:
 			error = ENOTSUP;
 			break;
 		}
 		xpt_release_bus(bus);
 		break;
 	}
 	/*
 	 * This is the getpassthru ioctl. It takes a XPT_GDEVLIST ccb as input,
 	 * with the periphal driver name and unit name filled in.  The other
 	 * fields don't really matter as input.  The passthrough driver name
 	 * ("pass"), and unit number are passed back in the ccb.  The current
 	 * device generation number, and the index into the device peripheral
 	 * driver list, and the status are also passed back.  Note that
 	 * since we do everything in one pass, unlike the XPT_GDEVLIST ccb,
 	 * we never return a status of CAM_GDEVLIST_LIST_CHANGED.  It is
 	 * (or rather should be) impossible for the device peripheral driver
 	 * list to change since we look at the whole thing in one pass, and
 	 * we do it with lock protection.
 	 *
 	 */
 	case CAMGETPASSTHRU: {
 		union ccb *ccb;
 		struct cam_periph *periph;
 		struct periph_driver **p_drv;
 		char   *name;
 		u_int unit;
 		int base_periph_found;
 
 		ccb = (union ccb *)addr;
 		unit = ccb->cgdl.unit_number;
 		name = ccb->cgdl.periph_name;
 		base_periph_found = 0;
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 		if (ccb->ccb_h.func_code == XPT_SCSI_IO)
 			ccb->csio.bio = NULL;
 #endif
 
 		/*
 		 * Sanity check -- make sure we don't get a null peripheral
 		 * driver name.
 		 */
 		if (*ccb->cgdl.periph_name == '\0') {
 			error = EINVAL;
 			break;
 		}
 
 		/* Keep the list from changing while we traverse it */
 		xpt_lock_buses();
 
 		/* first find our driver in the list of drivers */
 		for (p_drv = periph_drivers; *p_drv != NULL; p_drv++)
 			if (strcmp((*p_drv)->driver_name, name) == 0)
 				break;
 
 		if (*p_drv == NULL) {
 			xpt_unlock_buses();
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			ccb->cgdl.status = CAM_GDEVLIST_ERROR;
 			*ccb->cgdl.periph_name = '\0';
 			ccb->cgdl.unit_number = 0;
 			error = ENOENT;
 			break;
 		}
 
 		/*
 		 * Run through every peripheral instance of this driver
 		 * and check to see whether it matches the unit passed
 		 * in by the user.  If it does, get out of the loops and
 		 * find the passthrough driver associated with that
 		 * peripheral driver.
 		 */
 		for (periph = TAILQ_FIRST(&(*p_drv)->units); periph != NULL;
 		     periph = TAILQ_NEXT(periph, unit_links)) {
 
 			if (periph->unit_number == unit)
 				break;
 		}
 		/*
 		 * If we found the peripheral driver that the user passed
 		 * in, go through all of the peripheral drivers for that
 		 * particular device and look for a passthrough driver.
 		 */
 		if (periph != NULL) {
 			struct cam_ed *device;
 			int i;
 
 			base_periph_found = 1;
 			device = periph->path->device;
 			for (i = 0, periph = SLIST_FIRST(&device->periphs);
 			     periph != NULL;
 			     periph = SLIST_NEXT(periph, periph_links), i++) {
 				/*
 				 * Check to see whether we have a
 				 * passthrough device or not.
 				 */
 				if (strcmp(periph->periph_name, "pass") == 0) {
 					/*
 					 * Fill in the getdevlist fields.
 					 */
 					strlcpy(ccb->cgdl.periph_name,
 					       periph->periph_name,
 					       sizeof(ccb->cgdl.periph_name));
 					ccb->cgdl.unit_number =
 						periph->unit_number;
 					if (SLIST_NEXT(periph, periph_links))
 						ccb->cgdl.status =
 							CAM_GDEVLIST_MORE_DEVS;
 					else
 						ccb->cgdl.status =
 						       CAM_GDEVLIST_LAST_DEVICE;
 					ccb->cgdl.generation =
 						device->generation;
 					ccb->cgdl.index = i;
 					/*
 					 * Fill in some CCB header fields
 					 * that the user may want.
 					 */
 					ccb->ccb_h.path_id =
 						periph->path->bus->path_id;
 					ccb->ccb_h.target_id =
 						periph->path->target->target_id;
 					ccb->ccb_h.target_lun =
 						periph->path->device->lun_id;
 					ccb->ccb_h.status = CAM_REQ_CMP;
 					break;
 				}
 			}
 		}
 
 		/*
 		 * If the periph is null here, one of two things has
 		 * happened.  The first possibility is that we couldn't
 		 * find the unit number of the particular peripheral driver
 		 * that the user is asking about.  e.g. the user asks for
 		 * the passthrough driver for "da11".  We find the list of
 		 * "da" peripherals all right, but there is no unit 11.
 		 * The other possibility is that we went through the list
 		 * of peripheral drivers attached to the device structure,
 		 * but didn't find one with the name "pass".  Either way,
 		 * we return ENOENT, since we couldn't find something.
 		 */
 		if (periph == NULL) {
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			ccb->cgdl.status = CAM_GDEVLIST_ERROR;
 			*ccb->cgdl.periph_name = '\0';
 			ccb->cgdl.unit_number = 0;
 			error = ENOENT;
 			/*
 			 * It is unfortunate that this is even necessary,
 			 * but there are many, many clueless users out there.
 			 * If this is true, the user is looking for the
 			 * passthrough driver, but doesn't have one in his
 			 * kernel.
 			 */
 			if (base_periph_found == 1) {
 				printf("xptioctl: pass driver is not in the "
 				       "kernel\n");
 				printf("xptioctl: put \"device pass\" in "
 				       "your kernel config file\n");
 			}
 		}
 		xpt_unlock_buses();
 		break;
 		}
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	return(error);
 }
 
 static int
 cam_module_event_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	switch (what) {
 	case MOD_LOAD:
 		if ((error = xpt_init(NULL)) != 0)
 			return (error);
 		break;
 	case MOD_UNLOAD:
 		return EBUSY;
 	default:
 		return EOPNOTSUPP;
 	}
 
 	return 0;
 }
 
 static struct xpt_proto *
 xpt_proto_find(cam_proto proto)
 {
 	struct xpt_proto **pp;
 
 	SET_FOREACH(pp, cam_xpt_proto_set) {
 		if ((*pp)->proto == proto)
 			return *pp;
 	}
 
 	return NULL;
 }
 
 static void
 xpt_rescan_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	if (done_ccb->ccb_h.ppriv_ptr1 == NULL) {
 		xpt_free_path(done_ccb->ccb_h.path);
 		xpt_free_ccb(done_ccb);
 	} else {
 		done_ccb->ccb_h.cbfcnp = done_ccb->ccb_h.ppriv_ptr1;
 		(*done_ccb->ccb_h.cbfcnp)(periph, done_ccb);
 	}
 	xpt_release_boot();
 }
 
 /* thread to handle bus rescans */
 static void
 xpt_scanner_thread(void *dummy)
 {
 	union ccb	*ccb;
 	struct cam_path	 path;
 
 	xpt_lock_buses();
 	for (;;) {
 		if (TAILQ_EMPTY(&xsoftc.ccb_scanq))
 			msleep(&xsoftc.ccb_scanq, &xsoftc.xpt_topo_lock, PRIBIO,
 			       "-", 0);
 		if ((ccb = (union ccb *)TAILQ_FIRST(&xsoftc.ccb_scanq)) != NULL) {
 			TAILQ_REMOVE(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
 			xpt_unlock_buses();
 
 			/*
 			 * Since lock can be dropped inside and path freed
 			 * by completion callback even before return here,
 			 * take our own path copy for reference.
 			 */
 			xpt_copy_path(&path, ccb->ccb_h.path);
 			xpt_path_lock(&path);
 			xpt_action(ccb);
 			xpt_path_unlock(&path);
 			xpt_release_path(&path);
 
 			xpt_lock_buses();
 		}
 	}
 }
 
 void
 xpt_rescan(union ccb *ccb)
 {
 	struct ccb_hdr *hdr;
 
 	/* Prepare request */
 	if (ccb->ccb_h.path->target->target_id == CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_BUS;
 	else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_TGT;
 	else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id != CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_LUN;
 	else {
 		xpt_print(ccb->ccb_h.path, "illegal scan path\n");
 		xpt_free_path(ccb->ccb_h.path);
 		xpt_free_ccb(ccb);
 		return;
 	}
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_rescan: func %#x %s\n", ccb->ccb_h.func_code,
  		xpt_action_name(ccb->ccb_h.func_code)));
 
 	ccb->ccb_h.ppriv_ptr1 = ccb->ccb_h.cbfcnp;
 	ccb->ccb_h.cbfcnp = xpt_rescan_done;
 	xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_XPT);
 	/* Don't make duplicate entries for the same paths. */
 	xpt_lock_buses();
 	if (ccb->ccb_h.ppriv_ptr1 == NULL) {
 		TAILQ_FOREACH(hdr, &xsoftc.ccb_scanq, sim_links.tqe) {
 			if (xpt_path_comp(hdr->path, ccb->ccb_h.path) == 0) {
 				wakeup(&xsoftc.ccb_scanq);
 				xpt_unlock_buses();
 				xpt_print(ccb->ccb_h.path, "rescan already queued\n");
 				xpt_free_path(ccb->ccb_h.path);
 				xpt_free_ccb(ccb);
 				return;
 			}
 		}
 	}
 	TAILQ_INSERT_TAIL(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
 	xpt_hold_boot_locked();
 	wakeup(&xsoftc.ccb_scanq);
 	xpt_unlock_buses();
 }
 
 /* Functions accessed by the peripheral drivers */
 static int
 xpt_init(void *dummy)
 {
 	struct cam_sim *xpt_sim;
 	struct cam_path *path;
 	struct cam_devq *devq;
 	cam_status status;
 	int error, i;
 
 	TAILQ_INIT(&xsoftc.xpt_busses);
 	TAILQ_INIT(&xsoftc.ccb_scanq);
 	STAILQ_INIT(&xsoftc.highpowerq);
 	xsoftc.num_highpower = CAM_MAX_HIGHPOWER;
 
 	mtx_init(&xsoftc.xpt_highpower_lock, "XPT highpower lock", NULL, MTX_DEF);
 	xsoftc.xpt_taskq = taskqueue_create("CAM XPT task", M_WAITOK,
 	    taskqueue_thread_enqueue, /*context*/&xsoftc.xpt_taskq);
 
 #ifdef CAM_BOOT_DELAY
 	/*
 	 * Override this value at compile time to assist our users
 	 * who don't use loader to boot a kernel.
 	 */
 	xsoftc.boot_delay = CAM_BOOT_DELAY;
 #endif
 
 	/*
 	 * The xpt layer is, itself, the equivalent of a SIM.
 	 * Allow 16 ccbs in the ccb pool for it.  This should
 	 * give decent parallelism when we probe buses and
 	 * perform other XPT functions.
 	 */
 	devq = cam_simq_alloc(16);
 	xpt_sim = cam_sim_alloc(xptaction,
 				xptpoll,
 				"xpt",
 				/*softc*/NULL,
 				/*unit*/0,
 				/*mtx*/NULL,
 				/*max_dev_transactions*/0,
 				/*max_tagged_dev_transactions*/0,
 				devq);
 	if (xpt_sim == NULL)
 		return (ENOMEM);
 
 	if ((status = xpt_bus_register(xpt_sim, NULL, 0)) != CAM_SUCCESS) {
 		printf("xpt_init: xpt_bus_register failed with status %#x,"
 		       " failing attach\n", status);
 		return (EINVAL);
 	}
 
 	/*
 	 * Looking at the XPT from the SIM layer, the XPT is
 	 * the equivalent of a peripheral driver.  Allocate
 	 * a peripheral driver entry for us.
 	 */
 	if ((status = xpt_create_path(&path, NULL, CAM_XPT_PATH_ID,
 				      CAM_TARGET_WILDCARD,
 				      CAM_LUN_WILDCARD)) != CAM_REQ_CMP) {
 		printf("xpt_init: xpt_create_path failed with status %#x,"
 		       " failing attach\n", status);
 		return (EINVAL);
 	}
 	xpt_path_lock(path);
 	cam_periph_alloc(xptregister, NULL, NULL, NULL, "xpt", CAM_PERIPH_BIO,
 			 path, NULL, 0, xpt_sim);
 	xpt_path_unlock(path);
 	xpt_free_path(path);
 
 	if (cam_num_doneqs < 1)
 		cam_num_doneqs = 1 + mp_ncpus / 6;
 	else if (cam_num_doneqs > MAXCPU)
 		cam_num_doneqs = MAXCPU;
 	for (i = 0; i < cam_num_doneqs; i++) {
 		mtx_init(&cam_doneqs[i].cam_doneq_mtx, "CAM doneq", NULL,
 		    MTX_DEF);
 		STAILQ_INIT(&cam_doneqs[i].cam_doneq);
 		error = kproc_kthread_add(xpt_done_td, &cam_doneqs[i],
 		    &cam_proc, NULL, 0, 0, "cam", "doneq%d", i);
 		if (error != 0) {
 			cam_num_doneqs = i;
 			break;
 		}
 	}
 	if (cam_num_doneqs < 1) {
 		printf("xpt_init: Cannot init completion queues "
 		       "- failing attach\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Register a callback for when interrupts are enabled.
 	 */
 	config_intrhook_oneshot(xpt_config, NULL);
 
 	return (0);
 }
 
 static cam_status
 xptregister(struct cam_periph *periph, void *arg)
 {
 	struct cam_sim *xpt_sim;
 
 	if (periph == NULL) {
 		printf("xptregister: periph was NULL!!\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	xpt_sim = (struct cam_sim *)arg;
 	xpt_sim->softc = periph;
 	xpt_periph = periph;
 	periph->softc = NULL;
 
 	return(CAM_REQ_CMP);
 }
 
 int32_t
 xpt_add_periph(struct cam_periph *periph)
 {
 	struct cam_ed *device;
 	int32_t	 status;
 
 	TASK_INIT(&periph->periph_run_task, 0, xpt_run_allocq_task, periph);
 	device = periph->path->device;
 	status = CAM_REQ_CMP;
 	if (device != NULL) {
 		mtx_lock(&device->target->bus->eb_mtx);
 		device->generation++;
 		SLIST_INSERT_HEAD(&device->periphs, periph, periph_links);
 		mtx_unlock(&device->target->bus->eb_mtx);
 		atomic_add_32(&xsoftc.xpt_generation, 1);
 	}
 
 	return (status);
 }
 
 void
 xpt_remove_periph(struct cam_periph *periph)
 {
 	struct cam_ed *device;
 
 	device = periph->path->device;
 	if (device != NULL) {
 		mtx_lock(&device->target->bus->eb_mtx);
 		device->generation++;
 		SLIST_REMOVE(&device->periphs, periph, cam_periph, periph_links);
 		mtx_unlock(&device->target->bus->eb_mtx);
 		atomic_add_32(&xsoftc.xpt_generation, 1);
 	}
 }
 
 
 void
 xpt_announce_periph(struct cam_periph *periph, char *announce_string)
 {
 	struct	cam_path *path = periph->path;
 	struct  xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 	periph->flags |= CAM_PERIPH_ANNOUNCED;
 
 	printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	       periph->periph_name, periph->unit_number,
 	       path->bus->sim->sim_name,
 	       path->bus->sim->unit_number,
 	       path->bus->sim->bus_id,
 	       path->bus->path_id,
 	       path->target->target_id,
 	       (uintmax_t)path->device->lun_id);
 	printf("%s%d: ", periph->periph_name, periph->unit_number);
 	proto = xpt_proto_find(path->device->protocol);
 	if (proto)
 		proto->ops->announce(path->device);
 	else
 		printf("%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0) {
 		/* Don't wrap the screen  - print only the first 60 chars */
 		printf("%s%d: Serial Number %.60s\n", periph->periph_name,
 		       periph->unit_number, path->device->serial_num);
 	}
 	/* Announce transport details. */
 	path->bus->xport->ops->announce(periph);
 	/* Announce command queueing. */
 	if (path->device->inq_flags & SID_CmdQue
 	 || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) {
 		printf("%s%d: Command Queueing enabled\n",
 		       periph->periph_name, periph->unit_number);
 	}
 	/* Announce caller's details if they've passed in. */
 	if (announce_string != NULL)
 		printf("%s%d: %s\n", periph->periph_name,
 		       periph->unit_number, announce_string);
 }
 
 void
 xpt_announce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb,
     char *announce_string)
 {
 	struct	cam_path *path = periph->path;
 	struct  xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 	periph->flags |= CAM_PERIPH_ANNOUNCED;
 
 	/* Fall back to the non-sbuf method if necessary */
 	if (xsoftc.announce_nosbuf != 0) {
 		xpt_announce_periph(periph, announce_string);
 		return;
 	}
 	proto = xpt_proto_find(path->device->protocol);
 	if (((proto != NULL) && (proto->ops->announce_sbuf == NULL)) ||
 	    (path->bus->xport->ops->announce_sbuf == NULL)) {
 		xpt_announce_periph(periph, announce_string);
 		return;
 	}
 
 	sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	    periph->periph_name, periph->unit_number,
 	    path->bus->sim->sim_name,
 	    path->bus->sim->unit_number,
 	    path->bus->sim->bus_id,
 	    path->bus->path_id,
 	    path->target->target_id,
 	    (uintmax_t)path->device->lun_id);
 	sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number);
 
 	if (proto)
 		proto->ops->announce_sbuf(path->device, sb);
 	else
 		sbuf_printf(sb, "%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0) {
 		/* Don't wrap the screen  - print only the first 60 chars */
 		sbuf_printf(sb, "%s%d: Serial Number %.60s\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->serial_num);
 	}
 	/* Announce transport details. */
 	path->bus->xport->ops->announce_sbuf(periph, sb);
 	/* Announce command queueing. */
 	if (path->device->inq_flags & SID_CmdQue
 	 || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) {
 		sbuf_printf(sb, "%s%d: Command Queueing enabled\n",
 		    periph->periph_name, periph->unit_number);
 	}
 	/* Announce caller's details if they've passed in. */
 	if (announce_string != NULL)
 		sbuf_printf(sb, "%s%d: %s\n", periph->periph_name,
 		    periph->unit_number, announce_string);
 }
 
 void
 xpt_announce_quirks(struct cam_periph *periph, int quirks, char *bit_string)
 {
 	if (quirks != 0) {
 		printf("%s%d: quirks=0x%b\n", periph->periph_name,
 		    periph->unit_number, quirks, bit_string);
 	}
 }
 
 void
 xpt_announce_quirks_sbuf(struct cam_periph *periph, struct sbuf *sb,
 			 int quirks, char *bit_string)
 {
 	if (xsoftc.announce_nosbuf != 0) {
 		xpt_announce_quirks(periph, quirks, bit_string);
 		return;
 	}
 
 	if (quirks != 0) {
 		sbuf_printf(sb, "%s%d: quirks=0x%b\n", periph->periph_name,
 		    periph->unit_number, quirks, bit_string);
 	}
 }
 
 void
 xpt_denounce_periph(struct cam_periph *periph)
 {
 	struct	cam_path *path = periph->path;
 	struct  xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 	printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	       periph->periph_name, periph->unit_number,
 	       path->bus->sim->sim_name,
 	       path->bus->sim->unit_number,
 	       path->bus->sim->bus_id,
 	       path->bus->path_id,
 	       path->target->target_id,
 	       (uintmax_t)path->device->lun_id);
 	printf("%s%d: ", periph->periph_name, periph->unit_number);
 	proto = xpt_proto_find(path->device->protocol);
 	if (proto)
 		proto->ops->denounce(path->device);
 	else
 		printf("%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0)
 		printf(" s/n %.60s", path->device->serial_num);
 	printf(" detached\n");
 }
 
 void
 xpt_denounce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb)
 {
 	struct cam_path *path = periph->path;
 	struct xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	/* Fall back to the non-sbuf method if necessary */
 	if (xsoftc.announce_nosbuf != 0) {
 		xpt_denounce_periph(periph);
 		return;
 	}
 	proto = xpt_proto_find(path->device->protocol);
 	if ((proto != NULL) && (proto->ops->denounce_sbuf == NULL)) {
 		xpt_denounce_periph(periph);
 		return;
 	}
 
 	sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	    periph->periph_name, periph->unit_number,
 	    path->bus->sim->sim_name,
 	    path->bus->sim->unit_number,
 	    path->bus->sim->bus_id,
 	    path->bus->path_id,
 	    path->target->target_id,
 	    (uintmax_t)path->device->lun_id);
 	sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number);
 
 	if (proto)
 		proto->ops->denounce_sbuf(path->device, sb);
 	else
 		sbuf_printf(sb, "%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0)
 		sbuf_printf(sb, " s/n %.60s", path->device->serial_num);
 	sbuf_printf(sb, " detached\n");
 }
 
 int
 xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path)
 {
 	int ret = -1, l, o;
 	struct ccb_dev_advinfo cdai;
 	struct scsi_vpd_device_id *did;
 	struct scsi_vpd_id_descriptor *idd;
 
 	xpt_path_assert(path, MA_OWNED);
 
 	memset(&cdai, 0, sizeof(cdai));
 	xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL);
 	cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
 	cdai.flags = CDAI_FLAG_NONE;
 	cdai.bufsiz = len;
 	cdai.buf = buf;
 
 	if (!strcmp(attr, "GEOM::ident"))
 		cdai.buftype = CDAI_TYPE_SERIAL_NUM;
 	else if (!strcmp(attr, "GEOM::physpath"))
 		cdai.buftype = CDAI_TYPE_PHYS_PATH;
 	else if (strcmp(attr, "GEOM::lunid") == 0 ||
 		 strcmp(attr, "GEOM::lunname") == 0) {
 		cdai.buftype = CDAI_TYPE_SCSI_DEVID;
 		cdai.bufsiz = CAM_SCSI_DEVID_MAXLEN;
 		cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT);
 		if (cdai.buf == NULL) {
 			ret = ENOMEM;
 			goto out;
 		}
 	} else
 		goto out;
 
 	xpt_action((union ccb *)&cdai); /* can only be synchronous */
 	if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
 		cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
 	if (cdai.provsiz == 0)
 		goto out;
 	switch(cdai.buftype) {
 	case CDAI_TYPE_SCSI_DEVID:
 		did = (struct scsi_vpd_device_id *)cdai.buf;
 		if (strcmp(attr, "GEOM::lunid") == 0) {
 			idd = scsi_get_devid(did, cdai.provsiz,
 			    scsi_devid_is_lun_naa);
 			if (idd == NULL)
 				idd = scsi_get_devid(did, cdai.provsiz,
 				    scsi_devid_is_lun_eui64);
 			if (idd == NULL)
 				idd = scsi_get_devid(did, cdai.provsiz,
 				    scsi_devid_is_lun_uuid);
 			if (idd == NULL)
 				idd = scsi_get_devid(did, cdai.provsiz,
 				    scsi_devid_is_lun_md5);
 		} else
 			idd = NULL;
 
 		if (idd == NULL)
 			idd = scsi_get_devid(did, cdai.provsiz,
 			    scsi_devid_is_lun_t10);
 		if (idd == NULL)
 			idd = scsi_get_devid(did, cdai.provsiz,
 			    scsi_devid_is_lun_name);
 		if (idd == NULL)
 			break;
 
 		ret = 0;
 		if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) ==
 		    SVPD_ID_CODESET_ASCII) {
 			if (idd->length < len) {
 				for (l = 0; l < idd->length; l++)
 					buf[l] = idd->identifier[l] ?
 					    idd->identifier[l] : ' ';
 				buf[l] = 0;
 			} else
 				ret = EFAULT;
 			break;
 		}
 		if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) ==
 		    SVPD_ID_CODESET_UTF8) {
 			l = strnlen(idd->identifier, idd->length);
 			if (l < len) {
 				bcopy(idd->identifier, buf, l);
 				buf[l] = 0;
 			} else
 				ret = EFAULT;
 			break;
 		}
 		if ((idd->id_type & SVPD_ID_TYPE_MASK) ==
 		    SVPD_ID_TYPE_UUID && idd->identifier[0] == 0x10) {
 			if ((idd->length - 2) * 2 + 4 >= len) {
 				ret = EFAULT;
 				break;
 			}
 			for (l = 2, o = 0; l < idd->length; l++) {
 				if (l == 6 || l == 8 || l == 10 || l == 12)
 				    o += sprintf(buf + o, "-");
 				o += sprintf(buf + o, "%02x",
 				    idd->identifier[l]);
 			}
 			break;
 		}
 		if (idd->length * 2 < len) {
 			for (l = 0; l < idd->length; l++)
 				sprintf(buf + l * 2, "%02x",
 				    idd->identifier[l]);
 		} else
 				ret = EFAULT;
 		break;
 	default:
 		if (cdai.provsiz < len) {
 			cdai.buf[cdai.provsiz] = 0;
 			ret = 0;
 		} else
 			ret = EFAULT;
 		break;
 	}
 
 out:
 	if ((char *)cdai.buf != buf)
 		free(cdai.buf, M_CAMXPT);
 	return ret;
 }
 
 static dev_match_ret
 xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns,
 	    struct cam_eb *bus)
 {
 	dev_match_ret retval;
 	u_int i;
 
 	retval = DM_RET_NONE;
 
 	/*
 	 * If we aren't given something to match against, that's an error.
 	 */
 	if (bus == NULL)
 		return(DM_RET_ERROR);
 
 	/*
 	 * If there are no match entries, then this bus matches no
 	 * matter what.
 	 */
 	if ((patterns == NULL) || (num_patterns == 0))
 		return(DM_RET_DESCEND | DM_RET_COPY);
 
 	for (i = 0; i < num_patterns; i++) {
 		struct bus_match_pattern *cur_pattern;
 
 		/*
 		 * If the pattern in question isn't for a bus node, we
 		 * aren't interested.  However, we do indicate to the
 		 * calling routine that we should continue descending the
 		 * tree, since the user wants to match against lower-level
 		 * EDT elements.
 		 */
 		if (patterns[i].type != DEV_MATCH_BUS) {
 			if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 				retval |= DM_RET_DESCEND;
 			continue;
 		}
 
 		cur_pattern = &patterns[i].pattern.bus_pattern;
 
 		/*
 		 * If they want to match any bus node, we give them any
 		 * device node.
 		 */
 		if (cur_pattern->flags == BUS_MATCH_ANY) {
 			/* set the copy flag */
 			retval |= DM_RET_COPY;
 
 			/*
 			 * If we've already decided on an action, go ahead
 			 * and return.
 			 */
 			if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE)
 				return(retval);
 		}
 
 		/*
 		 * Not sure why someone would do this...
 		 */
 		if (cur_pattern->flags == BUS_MATCH_NONE)
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_PATH) != 0)
 		 && (cur_pattern->path_id != bus->path_id))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_BUS_ID) != 0)
 		 && (cur_pattern->bus_id != bus->sim->bus_id))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_UNIT) != 0)
 		 && (cur_pattern->unit_number != bus->sim->unit_number))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_NAME) != 0)
 		 && (strncmp(cur_pattern->dev_name, bus->sim->sim_name,
 			     DEV_IDLEN) != 0))
 			continue;
 
 		/*
 		 * If we get to this point, the user definitely wants
 		 * information on this bus.  So tell the caller to copy the
 		 * data out.
 		 */
 		retval |= DM_RET_COPY;
 
 		/*
 		 * If the return action has been set to descend, then we
 		 * know that we've already seen a non-bus matching
 		 * expression, therefore we need to further descend the tree.
 		 * This won't change by continuing around the loop, so we
 		 * go ahead and return.  If we haven't seen a non-bus
 		 * matching expression, we keep going around the loop until
 		 * we exhaust the matching expressions.  We'll set the stop
 		 * flag once we fall out of the loop.
 		 */
 		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
 			return(retval);
 	}
 
 	/*
 	 * If the return action hasn't been set to descend yet, that means
 	 * we haven't seen anything other than bus matching patterns.  So
 	 * tell the caller to stop descending the tree -- the user doesn't
 	 * want to match against lower level tree elements.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 		retval |= DM_RET_STOP;
 
 	return(retval);
 }
 
 static dev_match_ret
 xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns,
 	       struct cam_ed *device)
 {
 	dev_match_ret retval;
 	u_int i;
 
 	retval = DM_RET_NONE;
 
 	/*
 	 * If we aren't given something to match against, that's an error.
 	 */
 	if (device == NULL)
 		return(DM_RET_ERROR);
 
 	/*
 	 * If there are no match entries, then this device matches no
 	 * matter what.
 	 */
 	if ((patterns == NULL) || (num_patterns == 0))
 		return(DM_RET_DESCEND | DM_RET_COPY);
 
 	for (i = 0; i < num_patterns; i++) {
 		struct device_match_pattern *cur_pattern;
 		struct scsi_vpd_device_id *device_id_page;
 
 		/*
 		 * If the pattern in question isn't for a device node, we
 		 * aren't interested.
 		 */
 		if (patterns[i].type != DEV_MATCH_DEVICE) {
 			if ((patterns[i].type == DEV_MATCH_PERIPH)
 			 && ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE))
 				retval |= DM_RET_DESCEND;
 			continue;
 		}
 
 		cur_pattern = &patterns[i].pattern.device_pattern;
 
 		/* Error out if mutually exclusive options are specified. */
 		if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
 		 == (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
 			return(DM_RET_ERROR);
 
 		/*
 		 * If they want to match any device node, we give them any
 		 * device node.
 		 */
 		if (cur_pattern->flags == DEV_MATCH_ANY)
 			goto copy_dev_node;
 
 		/*
 		 * Not sure why someone would do this...
 		 */
 		if (cur_pattern->flags == DEV_MATCH_NONE)
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_PATH) != 0)
 		 && (cur_pattern->path_id != device->target->bus->path_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_TARGET) != 0)
 		 && (cur_pattern->target_id != device->target->target_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_LUN) != 0)
 		 && (cur_pattern->target_lun != device->lun_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0)
 		 && (cam_quirkmatch((caddr_t)&device->inq_data,
 				    (caddr_t)&cur_pattern->data.inq_pat,
 				    1, sizeof(cur_pattern->data.inq_pat),
 				    scsi_static_inquiry_match) == NULL))
 			continue;
 
 		device_id_page = (struct scsi_vpd_device_id *)device->device_id;
 		if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0)
 		 && (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN
 		  || scsi_devid_match((uint8_t *)device_id_page->desc_list,
 				      device->device_id_len
 				    - SVPD_DEVICE_ID_HDR_LEN,
 				      cur_pattern->data.devid_pat.id,
 				      cur_pattern->data.devid_pat.id_len) != 0))
 			continue;
 
 copy_dev_node:
 		/*
 		 * If we get to this point, the user definitely wants
 		 * information on this device.  So tell the caller to copy
 		 * the data out.
 		 */
 		retval |= DM_RET_COPY;
 
 		/*
 		 * If the return action has been set to descend, then we
 		 * know that we've already seen a peripheral matching
 		 * expression, therefore we need to further descend the tree.
 		 * This won't change by continuing around the loop, so we
 		 * go ahead and return.  If we haven't seen a peripheral
 		 * matching expression, we keep going around the loop until
 		 * we exhaust the matching expressions.  We'll set the stop
 		 * flag once we fall out of the loop.
 		 */
 		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
 			return(retval);
 	}
 
 	/*
 	 * If the return action hasn't been set to descend yet, that means
 	 * we haven't seen any peripheral matching patterns.  So tell the
 	 * caller to stop descending the tree -- the user doesn't want to
 	 * match against lower level tree elements.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 		retval |= DM_RET_STOP;
 
 	return(retval);
 }
 
 /*
  * Match a single peripheral against any number of match patterns.
  */
 static dev_match_ret
 xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns,
 	       struct cam_periph *periph)
 {
 	dev_match_ret retval;
 	u_int i;
 
 	/*
 	 * If we aren't given something to match against, that's an error.
 	 */
 	if (periph == NULL)
 		return(DM_RET_ERROR);
 
 	/*
 	 * If there are no match entries, then this peripheral matches no
 	 * matter what.
 	 */
 	if ((patterns == NULL) || (num_patterns == 0))
 		return(DM_RET_STOP | DM_RET_COPY);
 
 	/*
 	 * There aren't any nodes below a peripheral node, so there's no
 	 * reason to descend the tree any further.
 	 */
 	retval = DM_RET_STOP;
 
 	for (i = 0; i < num_patterns; i++) {
 		struct periph_match_pattern *cur_pattern;
 
 		/*
 		 * If the pattern in question isn't for a peripheral, we
 		 * aren't interested.
 		 */
 		if (patterns[i].type != DEV_MATCH_PERIPH)
 			continue;
 
 		cur_pattern = &patterns[i].pattern.periph_pattern;
 
 		/*
 		 * If they want to match on anything, then we will do so.
 		 */
 		if (cur_pattern->flags == PERIPH_MATCH_ANY) {
 			/* set the copy flag */
 			retval |= DM_RET_COPY;
 
 			/*
 			 * We've already set the return action to stop,
 			 * since there are no nodes below peripherals in
 			 * the tree.
 			 */
 			return(retval);
 		}
 
 		/*
 		 * Not sure why someone would do this...
 		 */
 		if (cur_pattern->flags == PERIPH_MATCH_NONE)
 			continue;
 
 		if (((cur_pattern->flags & PERIPH_MATCH_PATH) != 0)
 		 && (cur_pattern->path_id != periph->path->bus->path_id))
 			continue;
 
 		/*
 		 * For the target and lun id's, we have to make sure the
 		 * target and lun pointers aren't NULL.  The xpt peripheral
 		 * has a wildcard target and device.
 		 */
 		if (((cur_pattern->flags & PERIPH_MATCH_TARGET) != 0)
 		 && ((periph->path->target == NULL)
 		 ||(cur_pattern->target_id != periph->path->target->target_id)))
 			continue;
 
 		if (((cur_pattern->flags & PERIPH_MATCH_LUN) != 0)
 		 && ((periph->path->device == NULL)
 		 || (cur_pattern->target_lun != periph->path->device->lun_id)))
 			continue;
 
 		if (((cur_pattern->flags & PERIPH_MATCH_UNIT) != 0)
 		 && (cur_pattern->unit_number != periph->unit_number))
 			continue;
 
 		if (((cur_pattern->flags & PERIPH_MATCH_NAME) != 0)
 		 && (strncmp(cur_pattern->periph_name, periph->periph_name,
 			     DEV_IDLEN) != 0))
 			continue;
 
 		/*
 		 * If we get to this point, the user definitely wants
 		 * information on this peripheral.  So tell the caller to
 		 * copy the data out.
 		 */
 		retval |= DM_RET_COPY;
 
 		/*
 		 * The return action has already been set to stop, since
 		 * peripherals don't have any nodes below them in the EDT.
 		 */
 		return(retval);
 	}
 
 	/*
 	 * If we get to this point, the peripheral that was passed in
 	 * doesn't match any of the patterns.
 	 */
 	return(retval);
 }
 
 static int
 xptedtbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	struct cam_et *target;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	/*
 	 * If our position is for something deeper in the tree, that means
 	 * that we've already seen this node.  So, we keep going down.
 	 */
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target != NULL))
 		retval = DM_RET_DESCEND;
 	else
 		retval = xptbusmatch(cdm->patterns, cdm->num_patterns, bus);
 
 	/*
 	 * If we got an error, bail out of the search.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this bus out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS;
 
 			cdm->pos.cookie.bus = bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_BUS;
 		cdm->matches[j].result.bus_result.path_id = bus->path_id;
 		cdm->matches[j].result.bus_result.bus_id = bus->sim->bus_id;
 		cdm->matches[j].result.bus_result.unit_number =
 			bus->sim->unit_number;
 		strlcpy(cdm->matches[j].result.bus_result.dev_name,
 			bus->sim->sim_name,
 			sizeof(cdm->matches[j].result.bus_result.dev_name));
 	}
 
 	/*
 	 * If the user is only interested in buses, there's no
 	 * reason to descend to the next level in the tree.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
 		return(1);
 
 	/*
 	 * If there is a target generation recorded, check it to
 	 * make sure the target list hasn't changed.
 	 */
 	mtx_lock(&bus->eb_mtx);
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target != NULL)) {
 		if ((cdm->pos.generations[CAM_TARGET_GENERATION] !=
 		    bus->generation)) {
 			mtx_unlock(&bus->eb_mtx);
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return (0);
 		}
 		target = (struct cam_et *)cdm->pos.cookie.target;
 		target->refcount++;
 	} else
 		target = NULL;
 	mtx_unlock(&bus->eb_mtx);
 
 	return (xpttargettraverse(bus, target, xptedttargetfunc, arg));
 }
 
 static int
 xptedttargetfunc(struct cam_et *target, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	struct cam_eb *bus;
 	struct cam_ed *device;
 
 	cdm = (struct ccb_dev_match *)arg;
 	bus = target->bus;
 
 	/*
 	 * If there is a device list generation recorded, check it to
 	 * make sure the device list hasn't changed.
 	 */
 	mtx_lock(&bus->eb_mtx);
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target == target)
 	 && (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
 	 && (cdm->pos.cookie.device != NULL)) {
 		if (cdm->pos.generations[CAM_DEV_GENERATION] !=
 		    target->generation) {
 			mtx_unlock(&bus->eb_mtx);
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return(0);
 		}
 		device = (struct cam_ed *)cdm->pos.cookie.device;
 		device->refcount++;
 	} else
 		device = NULL;
 	mtx_unlock(&bus->eb_mtx);
 
 	return (xptdevicetraverse(target, device, xptedtdevicefunc, arg));
 }
 
 static int
 xptedtdevicefunc(struct cam_ed *device, void *arg)
 {
 	struct cam_eb *bus;
 	struct cam_periph *periph;
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 	bus = device->target->bus;
 
 	/*
 	 * If our position is for something deeper in the tree, that means
 	 * that we've already seen this node.  So, we keep going down.
 	 */
 	if ((cdm->pos.position_type & CAM_DEV_POS_DEVICE)
 	 && (cdm->pos.cookie.device == device)
 	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
 	 && (cdm->pos.cookie.periph != NULL))
 		retval = DM_RET_DESCEND;
 	else
 		retval = xptdevicematch(cdm->patterns, cdm->num_patterns,
 					device);
 
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this device out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
 				CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE;
 
 			cdm->pos.cookie.bus = device->target->bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->pos.cookie.target = device->target;
 			cdm->pos.generations[CAM_TARGET_GENERATION] =
 				device->target->bus->generation;
 			cdm->pos.cookie.device = device;
 			cdm->pos.generations[CAM_DEV_GENERATION] =
 				device->target->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_DEVICE;
 		cdm->matches[j].result.device_result.path_id =
 			device->target->bus->path_id;
 		cdm->matches[j].result.device_result.target_id =
 			device->target->target_id;
 		cdm->matches[j].result.device_result.target_lun =
 			device->lun_id;
 		cdm->matches[j].result.device_result.protocol =
 			device->protocol;
 		bcopy(&device->inq_data,
 		      &cdm->matches[j].result.device_result.inq_data,
 		      sizeof(struct scsi_inquiry_data));
 		bcopy(&device->ident_data,
 		      &cdm->matches[j].result.device_result.ident_data,
 		      sizeof(struct ata_params));
 
 		/* Let the user know whether this device is unconfigured */
 		if (device->flags & CAM_DEV_UNCONFIGURED)
 			cdm->matches[j].result.device_result.flags =
 				DEV_RESULT_UNCONFIGURED;
 		else
 			cdm->matches[j].result.device_result.flags =
 				DEV_RESULT_NOFLAG;
 	}
 
 	/*
 	 * If the user isn't interested in peripherals, don't descend
 	 * the tree any further.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
 		return(1);
 
 	/*
 	 * If there is a peripheral list generation recorded, make sure
 	 * it hasn't changed.
 	 */
 	xpt_lock_buses();
 	mtx_lock(&bus->eb_mtx);
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target == device->target)
 	 && (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
 	 && (cdm->pos.cookie.device == device)
 	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
 	 && (cdm->pos.cookie.periph != NULL)) {
 		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
 		    device->generation) {
 			mtx_unlock(&bus->eb_mtx);
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return(0);
 		}
 		periph = (struct cam_periph *)cdm->pos.cookie.periph;
 		periph->refcount++;
 	} else
 		periph = NULL;
 	mtx_unlock(&bus->eb_mtx);
 	xpt_unlock_buses();
 
 	return (xptperiphtraverse(device, periph, xptedtperiphfunc, arg));
 }
 
 static int
 xptedtperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
 
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this peripheral out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 		size_t l;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
 				CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE |
 				CAM_DEV_POS_PERIPH;
 
 			cdm->pos.cookie.bus = periph->path->bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->pos.cookie.target = periph->path->target;
 			cdm->pos.generations[CAM_TARGET_GENERATION] =
 				periph->path->bus->generation;
 			cdm->pos.cookie.device = periph->path->device;
 			cdm->pos.generations[CAM_DEV_GENERATION] =
 				periph->path->target->generation;
 			cdm->pos.cookie.periph = periph;
 			cdm->pos.generations[CAM_PERIPH_GENERATION] =
 				periph->path->device->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_PERIPH;
 		cdm->matches[j].result.periph_result.path_id =
 			periph->path->bus->path_id;
 		cdm->matches[j].result.periph_result.target_id =
 			periph->path->target->target_id;
 		cdm->matches[j].result.periph_result.target_lun =
 			periph->path->device->lun_id;
 		cdm->matches[j].result.periph_result.unit_number =
 			periph->unit_number;
 		l = sizeof(cdm->matches[j].result.periph_result.periph_name);
 		strlcpy(cdm->matches[j].result.periph_result.periph_name,
 			periph->periph_name, l);
 	}
 
 	return(1);
 }
 
 static int
 xptedtmatch(struct ccb_dev_match *cdm)
 {
 	struct cam_eb *bus;
 	int ret;
 
 	cdm->num_matches = 0;
 
 	/*
 	 * Check the bus list generation.  If it has changed, the user
 	 * needs to reset everything and start over.
 	 */
 	xpt_lock_buses();
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus != NULL)) {
 		if (cdm->pos.generations[CAM_BUS_GENERATION] !=
 		    xsoftc.bus_generation) {
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return(0);
 		}
 		bus = (struct cam_eb *)cdm->pos.cookie.bus;
 		bus->refcount++;
 	} else
 		bus = NULL;
 	xpt_unlock_buses();
 
 	ret = xptbustraverse(bus, xptedtbusfunc, cdm);
 
 	/*
 	 * If we get back 0, that means that we had to stop before fully
 	 * traversing the EDT.  It also means that one of the subroutines
 	 * has set the status field to the proper value.  If we get back 1,
 	 * we've fully traversed the EDT and copied out any matching entries.
 	 */
 	if (ret == 1)
 		cdm->status = CAM_DEV_MATCH_LAST;
 
 	return(ret);
 }
 
 static int
 xptplistpdrvfunc(struct periph_driver **pdrv, void *arg)
 {
 	struct cam_periph *periph;
 	struct ccb_dev_match *cdm;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	xpt_lock_buses();
 	if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
 	 && (cdm->pos.cookie.pdrv == pdrv)
 	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
 	 && (cdm->pos.cookie.periph != NULL)) {
 		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
 		    (*pdrv)->generation) {
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return(0);
 		}
 		periph = (struct cam_periph *)cdm->pos.cookie.periph;
 		periph->refcount++;
 	} else
 		periph = NULL;
 	xpt_unlock_buses();
 
 	return (xptpdperiphtraverse(pdrv, periph, xptplistperiphfunc, arg));
 }
 
 static int
 xptplistperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
 
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this peripheral out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 		size_t l;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			struct periph_driver **pdrv;
 
 			pdrv = NULL;
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_PDRV | CAM_DEV_POS_PDPTR |
 				CAM_DEV_POS_PERIPH;
 
 			/*
 			 * This may look a bit non-sensical, but it is
 			 * actually quite logical.  There are very few
 			 * peripheral drivers, and bloating every peripheral
 			 * structure with a pointer back to its parent
 			 * peripheral driver linker set entry would cost
 			 * more in the long run than doing this quick lookup.
 			 */
 			for (pdrv = periph_drivers; *pdrv != NULL; pdrv++) {
 				if (strcmp((*pdrv)->driver_name,
 				    periph->periph_name) == 0)
 					break;
 			}
 
 			if (*pdrv == NULL) {
 				cdm->status = CAM_DEV_MATCH_ERROR;
 				return(0);
 			}
 
 			cdm->pos.cookie.pdrv = pdrv;
 			/*
 			 * The periph generation slot does double duty, as
 			 * does the periph pointer slot.  They are used for
 			 * both edt and pdrv lookups and positioning.
 			 */
 			cdm->pos.cookie.periph = periph;
 			cdm->pos.generations[CAM_PERIPH_GENERATION] =
 				(*pdrv)->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_PERIPH;
 		cdm->matches[j].result.periph_result.path_id =
 			periph->path->bus->path_id;
 
 		/*
 		 * The transport layer peripheral doesn't have a target or
 		 * lun.
 		 */
 		if (periph->path->target)
 			cdm->matches[j].result.periph_result.target_id =
 				periph->path->target->target_id;
 		else
 			cdm->matches[j].result.periph_result.target_id =
 				CAM_TARGET_WILDCARD;
 
 		if (periph->path->device)
 			cdm->matches[j].result.periph_result.target_lun =
 				periph->path->device->lun_id;
 		else
 			cdm->matches[j].result.periph_result.target_lun =
 				CAM_LUN_WILDCARD;
 
 		cdm->matches[j].result.periph_result.unit_number =
 			periph->unit_number;
 		l = sizeof(cdm->matches[j].result.periph_result.periph_name);
 		strlcpy(cdm->matches[j].result.periph_result.periph_name,
 			periph->periph_name, l);
 	}
 
 	return(1);
 }
 
 static int
 xptperiphlistmatch(struct ccb_dev_match *cdm)
 {
 	int ret;
 
 	cdm->num_matches = 0;
 
 	/*
 	 * At this point in the edt traversal function, we check the bus
 	 * list generation to make sure that no buses have been added or
 	 * removed since the user last sent a XPT_DEV_MATCH ccb through.
 	 * For the peripheral driver list traversal function, however, we
 	 * don't have to worry about new peripheral driver types coming or
 	 * going; they're in a linker set, and therefore can't change
 	 * without a recompile.
 	 */
 
 	if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
 	 && (cdm->pos.cookie.pdrv != NULL))
 		ret = xptpdrvtraverse(
 				(struct periph_driver **)cdm->pos.cookie.pdrv,
 				xptplistpdrvfunc, cdm);
 	else
 		ret = xptpdrvtraverse(NULL, xptplistpdrvfunc, cdm);
 
 	/*
 	 * If we get back 0, that means that we had to stop before fully
 	 * traversing the peripheral driver tree.  It also means that one of
 	 * the subroutines has set the status field to the proper value.  If
 	 * we get back 1, we've fully traversed the EDT and copied out any
 	 * matching entries.
 	 */
 	if (ret == 1)
 		cdm->status = CAM_DEV_MATCH_LAST;
 
 	return(ret);
 }
 
 static int
 xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg)
 {
 	struct cam_eb *bus, *next_bus;
 	int retval;
 
 	retval = 1;
 	if (start_bus)
 		bus = start_bus;
 	else {
 		xpt_lock_buses();
 		bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 		if (bus == NULL) {
 			xpt_unlock_buses();
 			return (retval);
 		}
 		bus->refcount++;
 		xpt_unlock_buses();
 	}
 	for (; bus != NULL; bus = next_bus) {
 		retval = tr_func(bus, arg);
 		if (retval == 0) {
 			xpt_release_bus(bus);
 			break;
 		}
 		xpt_lock_buses();
 		next_bus = TAILQ_NEXT(bus, links);
 		if (next_bus)
 			next_bus->refcount++;
 		xpt_unlock_buses();
 		xpt_release_bus(bus);
 	}
 	return(retval);
 }
 
 static int
 xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target,
 		  xpt_targetfunc_t *tr_func, void *arg)
 {
 	struct cam_et *target, *next_target;
 	int retval;
 
 	retval = 1;
 	if (start_target)
 		target = start_target;
 	else {
 		mtx_lock(&bus->eb_mtx);
 		target = TAILQ_FIRST(&bus->et_entries);
 		if (target == NULL) {
 			mtx_unlock(&bus->eb_mtx);
 			return (retval);
 		}
 		target->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 	}
 	for (; target != NULL; target = next_target) {
 		retval = tr_func(target, arg);
 		if (retval == 0) {
 			xpt_release_target(target);
 			break;
 		}
 		mtx_lock(&bus->eb_mtx);
 		next_target = TAILQ_NEXT(target, links);
 		if (next_target)
 			next_target->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_release_target(target);
 	}
 	return(retval);
 }
 
 static int
 xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device,
 		  xpt_devicefunc_t *tr_func, void *arg)
 {
 	struct cam_eb *bus;
 	struct cam_ed *device, *next_device;
 	int retval;
 
 	retval = 1;
 	bus = target->bus;
 	if (start_device)
 		device = start_device;
 	else {
 		mtx_lock(&bus->eb_mtx);
 		device = TAILQ_FIRST(&target->ed_entries);
 		if (device == NULL) {
 			mtx_unlock(&bus->eb_mtx);
 			return (retval);
 		}
 		device->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 	}
 	for (; device != NULL; device = next_device) {
 		mtx_lock(&device->device_mtx);
 		retval = tr_func(device, arg);
 		mtx_unlock(&device->device_mtx);
 		if (retval == 0) {
 			xpt_release_device(device);
 			break;
 		}
 		mtx_lock(&bus->eb_mtx);
 		next_device = TAILQ_NEXT(device, links);
 		if (next_device)
 			next_device->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_release_device(device);
 	}
 	return(retval);
 }
 
 static int
 xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph,
 		  xpt_periphfunc_t *tr_func, void *arg)
 {
 	struct cam_eb *bus;
 	struct cam_periph *periph, *next_periph;
 	int retval;
 
 	retval = 1;
 
 	bus = device->target->bus;
 	if (start_periph)
 		periph = start_periph;
 	else {
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		periph = SLIST_FIRST(&device->periphs);
 		while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
 			periph = SLIST_NEXT(periph, periph_links);
 		if (periph == NULL) {
 			mtx_unlock(&bus->eb_mtx);
 			xpt_unlock_buses();
 			return (retval);
 		}
 		periph->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_unlock_buses();
 	}
 	for (; periph != NULL; periph = next_periph) {
 		retval = tr_func(periph, arg);
 		if (retval == 0) {
 			cam_periph_release_locked(periph);
 			break;
 		}
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		next_periph = SLIST_NEXT(periph, periph_links);
 		while (next_periph != NULL &&
 		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
 			next_periph = SLIST_NEXT(next_periph, periph_links);
 		if (next_periph)
 			next_periph->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_unlock_buses();
 		cam_periph_release_locked(periph);
 	}
 	return(retval);
 }
 
 static int
 xptpdrvtraverse(struct periph_driver **start_pdrv,
 		xpt_pdrvfunc_t *tr_func, void *arg)
 {
 	struct periph_driver **pdrv;
 	int retval;
 
 	retval = 1;
 
 	/*
 	 * We don't traverse the peripheral driver list like we do the
 	 * other lists, because it is a linker set, and therefore cannot be
 	 * changed during runtime.  If the peripheral driver list is ever
 	 * re-done to be something other than a linker set (i.e. it can
 	 * change while the system is running), the list traversal should
 	 * be modified to work like the other traversal functions.
 	 */
 	for (pdrv = (start_pdrv ? start_pdrv : periph_drivers);
 	     *pdrv != NULL; pdrv++) {
 		retval = tr_func(pdrv, arg);
 
 		if (retval == 0)
 			return(retval);
 	}
 
 	return(retval);
 }
 
 static int
 xptpdperiphtraverse(struct periph_driver **pdrv,
 		    struct cam_periph *start_periph,
 		    xpt_periphfunc_t *tr_func, void *arg)
 {
 	struct cam_periph *periph, *next_periph;
 	int retval;
 
 	retval = 1;
 
 	if (start_periph)
 		periph = start_periph;
 	else {
 		xpt_lock_buses();
 		periph = TAILQ_FIRST(&(*pdrv)->units);
 		while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
 			periph = TAILQ_NEXT(periph, unit_links);
 		if (periph == NULL) {
 			xpt_unlock_buses();
 			return (retval);
 		}
 		periph->refcount++;
 		xpt_unlock_buses();
 	}
 	for (; periph != NULL; periph = next_periph) {
 		cam_periph_lock(periph);
 		retval = tr_func(periph, arg);
 		cam_periph_unlock(periph);
 		if (retval == 0) {
 			cam_periph_release(periph);
 			break;
 		}
 		xpt_lock_buses();
 		next_periph = TAILQ_NEXT(periph, unit_links);
 		while (next_periph != NULL &&
 		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
 			next_periph = TAILQ_NEXT(next_periph, unit_links);
 		if (next_periph)
 			next_periph->refcount++;
 		xpt_unlock_buses();
 		cam_periph_release(periph);
 	}
 	return(retval);
 }
 
 static int
 xptdefbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_BUS) {
 		xpt_busfunc_t *tr_func;
 
 		tr_func = (xpt_busfunc_t *)tr_config->tr_func;
 
 		return(tr_func(bus, tr_config->tr_arg));
 	} else
 		return(xpttargettraverse(bus, NULL, xptdeftargetfunc, arg));
 }
 
 static int
 xptdeftargetfunc(struct cam_et *target, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_TARGET) {
 		xpt_targetfunc_t *tr_func;
 
 		tr_func = (xpt_targetfunc_t *)tr_config->tr_func;
 
 		return(tr_func(target, tr_config->tr_arg));
 	} else
 		return(xptdevicetraverse(target, NULL, xptdefdevicefunc, arg));
 }
 
 static int
 xptdefdevicefunc(struct cam_ed *device, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_DEVICE) {
 		xpt_devicefunc_t *tr_func;
 
 		tr_func = (xpt_devicefunc_t *)tr_config->tr_func;
 
 		return(tr_func(device, tr_config->tr_arg));
 	} else
 		return(xptperiphtraverse(device, NULL, xptdefperiphfunc, arg));
 }
 
 static int
 xptdefperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 	xpt_periphfunc_t *tr_func;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	tr_func = (xpt_periphfunc_t *)tr_config->tr_func;
 
 	/*
 	 * Unlike the other default functions, we don't check for depth
 	 * here.  The peripheral driver level is the last level in the EDT,
 	 * so if we're here, we should execute the function in question.
 	 */
 	return(tr_func(periph, tr_config->tr_arg));
 }
 
 /*
  * Execute the given function for every bus in the EDT.
  */
 static int
 xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg)
 {
 	struct xpt_traverse_config tr_config;
 
 	tr_config.depth = XPT_DEPTH_BUS;
 	tr_config.tr_func = tr_func;
 	tr_config.tr_arg = arg;
 
 	return(xptbustraverse(NULL, xptdefbusfunc, &tr_config));
 }
 
 /*
  * Execute the given function for every device in the EDT.
  */
 static int
 xpt_for_all_devices(xpt_devicefunc_t *tr_func, void *arg)
 {
 	struct xpt_traverse_config tr_config;
 
 	tr_config.depth = XPT_DEPTH_DEVICE;
 	tr_config.tr_func = tr_func;
 	tr_config.tr_arg = arg;
 
 	return(xptbustraverse(NULL, xptdefbusfunc, &tr_config));
 }
 
 static int
 xptsetasyncfunc(struct cam_ed *device, void *arg)
 {
 	struct cam_path path;
 	struct ccb_getdev cgd;
 	struct ccb_setasync *csa = (struct ccb_setasync *)arg;
 
 	/*
 	 * Don't report unconfigured devices (Wildcard devs,
 	 * devices only for target mode, device instances
 	 * that have been invalidated but are waiting for
 	 * their last reference count to be released).
 	 */
 	if ((device->flags & CAM_DEV_UNCONFIGURED) != 0)
 		return (1);
 
 	xpt_compile_path(&path,
 			 NULL,
 			 device->target->bus->path_id,
 			 device->target->target_id,
 			 device->lun_id);
 	xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL);
 	cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 	xpt_action((union ccb *)&cgd);
 	csa->callback(csa->callback_arg,
 			    AC_FOUND_DEVICE,
 			    &path, &cgd);
 	xpt_release_path(&path);
 
 	return(1);
 }
 
 static int
 xptsetasyncbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct cam_path path;
 	struct ccb_pathinq cpi;
 	struct ccb_setasync *csa = (struct ccb_setasync *)arg;
 
 	xpt_compile_path(&path, /*periph*/NULL,
 			 bus->path_id,
 			 CAM_TARGET_WILDCARD,
 			 CAM_LUN_WILDCARD);
 	xpt_path_lock(&path);
 	xpt_path_inq(&cpi, &path);
 	csa->callback(csa->callback_arg,
 			    AC_PATH_REGISTERED,
 			    &path, &cpi);
 	xpt_path_unlock(&path);
 	xpt_release_path(&path);
 
 	return(1);
 }
 
 void
 xpt_action(union ccb *start_ccb)
 {
 
 	CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));
 
 	start_ccb->ccb_h.status = CAM_REQ_INPROG;
 	(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
 }
 
 void
 xpt_action_default(union ccb *start_ccb)
 {
 	struct cam_path *path;
 	struct cam_sim *sim;
 	struct mtx *mtx;
 
 	path = start_ccb->ccb_h.path;
 	CAM_DEBUG(path, CAM_DEBUG_TRACE,
 	    ("xpt_action_default: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));
 
 	switch (start_ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO:
 	{
 		struct cam_ed *device;
 
 		/*
 		 * For the sake of compatibility with SCSI-1
 		 * devices that may not understand the identify
 		 * message, we include lun information in the
 		 * second byte of all commands.  SCSI-1 specifies
 		 * that luns are a 3 bit value and reserves only 3
 		 * bits for lun information in the CDB.  Later
 		 * revisions of the SCSI spec allow for more than 8
 		 * luns, but have deprecated lun information in the
 		 * CDB.  So, if the lun won't fit, we must omit.
 		 *
 		 * Also be aware that during initial probing for devices,
 		 * the inquiry information is unknown but initialized to 0.
 		 * This means that this code will be exercised while probing
 		 * devices with an ANSI revision greater than 2.
 		 */
 		device = path->device;
 		if (device->protocol_version <= SCSI_REV_2
 		 && start_ccb->ccb_h.target_lun < 8
 		 && (start_ccb->ccb_h.flags & CAM_CDB_POINTER) == 0) {
 
 			start_ccb->csio.cdb_io.cdb_bytes[1] |=
 			    start_ccb->ccb_h.target_lun << 5;
 		}
 		start_ccb->csio.scsi_status = SCSI_STATUS_OK;
 	}
 	/* FALLTHROUGH */
 	case XPT_TARGET_IO:
 	case XPT_CONT_TARGET_IO:
 		start_ccb->csio.sense_resid = 0;
 		start_ccb->csio.resid = 0;
 		/* FALLTHROUGH */
 	case XPT_ATA_IO:
 		if (start_ccb->ccb_h.func_code == XPT_ATA_IO)
 			start_ccb->ataio.resid = 0;
 		/* FALLTHROUGH */
 	case XPT_NVME_IO:
 		/* FALLTHROUGH */
 	case XPT_NVME_ADMIN:
 		/* FALLTHROUGH */
 	case XPT_MMC_IO:
 		/* XXX just like nmve_io? */
 	case XPT_RESET_DEV:
 	case XPT_ENG_EXEC:
 	case XPT_SMP_IO:
 	{
 		struct cam_devq *devq;
 
 		devq = path->bus->sim->devq;
 		mtx_lock(&devq->send_mtx);
 		cam_ccbq_insert_ccb(&path->device->ccbq, start_ccb);
 		if (xpt_schedule_devq(devq, path->device) != 0)
 			xpt_run_devq(devq);
 		mtx_unlock(&devq->send_mtx);
 		break;
 	}
 	case XPT_CALC_GEOMETRY:
 		/* Filter out garbage */
 		if (start_ccb->ccg.block_size == 0
 		 || start_ccb->ccg.volume_size == 0) {
 			start_ccb->ccg.cylinders = 0;
 			start_ccb->ccg.heads = 0;
 			start_ccb->ccg.secs_per_track = 0;
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 			break;
 		}
 #if defined(__sparc64__)
 		/*
 		 * For sparc64, we may need adjust the geometry of large
 		 * disks in order to fit the limitations of the 16-bit
 		 * fields of the VTOC8 disk label.
 		 */
 		if (scsi_da_bios_params(&start_ccb->ccg) != 0) {
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 			break;
 		}
 #endif
 		goto call_sim;
 	case XPT_ABORT:
 	{
 		union ccb* abort_ccb;
 
 		abort_ccb = start_ccb->cab.abort_ccb;
 		if (XPT_FC_IS_DEV_QUEUED(abort_ccb)) {
 			struct cam_ed *device;
 			struct cam_devq *devq;
 
 			device = abort_ccb->ccb_h.path->device;
 			devq = device->sim->devq;
 
 			mtx_lock(&devq->send_mtx);
 			if (abort_ccb->ccb_h.pinfo.index > 0) {
 				cam_ccbq_remove_ccb(&device->ccbq, abort_ccb);
 				abort_ccb->ccb_h.status =
 				    CAM_REQ_ABORTED|CAM_DEV_QFRZN;
 				xpt_freeze_devq_device(device, 1);
 				mtx_unlock(&devq->send_mtx);
 				xpt_done(abort_ccb);
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				break;
 			}
 			mtx_unlock(&devq->send_mtx);
 
 			if (abort_ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX
 			 && (abort_ccb->ccb_h.status & CAM_SIM_QUEUED) == 0) {
 				/*
 				 * We've caught this ccb en route to
 				 * the SIM.  Flag it for abort and the
 				 * SIM will do so just before starting
 				 * real work on the CCB.
 				 */
 				abort_ccb->ccb_h.status =
 				    CAM_REQ_ABORTED|CAM_DEV_QFRZN;
 				xpt_freeze_devq(abort_ccb->ccb_h.path, 1);
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				break;
 			}
 		}
 		if (XPT_FC_IS_QUEUED(abort_ccb)
 		 && (abort_ccb->ccb_h.pinfo.index == CAM_DONEQ_INDEX)) {
 			/*
 			 * It's already completed but waiting
 			 * for our SWI to get to it.
 			 */
 			start_ccb->ccb_h.status = CAM_UA_ABORT;
 			break;
 		}
 		/*
 		 * If we weren't able to take care of the abort request
 		 * in the XPT, pass the request down to the SIM for processing.
 		 */
 	}
 	/* FALLTHROUGH */
 	case XPT_ACCEPT_TARGET_IO:
 	case XPT_EN_LUN:
 	case XPT_IMMED_NOTIFY:
 	case XPT_NOTIFY_ACK:
 	case XPT_RESET_BUS:
 	case XPT_IMMEDIATE_NOTIFY:
 	case XPT_NOTIFY_ACKNOWLEDGE:
 	case XPT_GET_SIM_KNOB_OLD:
 	case XPT_GET_SIM_KNOB:
 	case XPT_SET_SIM_KNOB:
 	case XPT_GET_TRAN_SETTINGS:
 	case XPT_SET_TRAN_SETTINGS:
 	case XPT_PATH_INQ:
 call_sim:
 		sim = path->bus->sim;
 		mtx = sim->mtx;
 		if (mtx && !mtx_owned(mtx))
 			mtx_lock(mtx);
 		else
 			mtx = NULL;
 
 		CAM_DEBUG(path, CAM_DEBUG_TRACE,
 		    ("Calling sim->sim_action(): func=%#x\n", start_ccb->ccb_h.func_code));
 		(*(sim->sim_action))(sim, start_ccb);
 		CAM_DEBUG(path, CAM_DEBUG_TRACE,
 		    ("sim->sim_action returned: status=%#x\n", start_ccb->ccb_h.status));
 		if (mtx)
 			mtx_unlock(mtx);
 		break;
 	case XPT_PATH_STATS:
 		start_ccb->cpis.last_reset = path->bus->last_reset;
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_GDEV_TYPE:
 	{
 		struct cam_ed *dev;
 
 		dev = path->device;
 		if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) {
 			start_ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 		} else {
 			struct ccb_getdev *cgd;
 
 			cgd = &start_ccb->cgd;
 			cgd->protocol = dev->protocol;
 			cgd->inq_data = dev->inq_data;
 			cgd->ident_data = dev->ident_data;
 			cgd->inq_flags = dev->inq_flags;
 			cgd->ccb_h.status = CAM_REQ_CMP;
 			cgd->serial_num_len = dev->serial_num_len;
 			if ((dev->serial_num_len > 0)
 			 && (dev->serial_num != NULL))
 				bcopy(dev->serial_num, cgd->serial_num,
 				      dev->serial_num_len);
 		}
 		break;
 	}
 	case XPT_GDEV_STATS:
 	{
 		struct ccb_getdevstats *cgds = &start_ccb->cgds;
 		struct cam_ed *dev = path->device;
 		struct cam_eb *bus = path->bus;
 		struct cam_et *tar = path->target;
 		struct cam_devq *devq = bus->sim->devq;
 
 		mtx_lock(&devq->send_mtx);
 		cgds->dev_openings = dev->ccbq.dev_openings;
 		cgds->dev_active = dev->ccbq.dev_active;
 		cgds->allocated = dev->ccbq.allocated;
 		cgds->queued = cam_ccbq_pending_ccb_count(&dev->ccbq);
 		cgds->held = cgds->allocated - cgds->dev_active - cgds->queued;
 		cgds->last_reset = tar->last_reset;
 		cgds->maxtags = dev->maxtags;
 		cgds->mintags = dev->mintags;
 		if (timevalcmp(&tar->last_reset, &bus->last_reset, <))
 			cgds->last_reset = bus->last_reset;
 		mtx_unlock(&devq->send_mtx);
 		cgds->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_GDEVLIST:
 	{
 		struct cam_periph	*nperiph;
 		struct periph_list	*periph_head;
 		struct ccb_getdevlist	*cgdl;
 		u_int			i;
 		struct cam_ed		*device;
 		int			found;
 
 
 		found = 0;
 
 		/*
 		 * Don't want anyone mucking with our data.
 		 */
 		device = path->device;
 		periph_head = &device->periphs;
 		cgdl = &start_ccb->cgdl;
 
 		/*
 		 * Check and see if the list has changed since the user
 		 * last requested a list member.  If so, tell them that the
 		 * list has changed, and therefore they need to start over
 		 * from the beginning.
 		 */
 		if ((cgdl->index != 0) &&
 		    (cgdl->generation != device->generation)) {
 			cgdl->status = CAM_GDEVLIST_LIST_CHANGED;
 			break;
 		}
 
 		/*
 		 * Traverse the list of peripherals and attempt to find
 		 * the requested peripheral.
 		 */
 		for (nperiph = SLIST_FIRST(periph_head), i = 0;
 		     (nperiph != NULL) && (i <= cgdl->index);
 		     nperiph = SLIST_NEXT(nperiph, periph_links), i++) {
 			if (i == cgdl->index) {
 				strlcpy(cgdl->periph_name,
 					nperiph->periph_name,
 					sizeof(cgdl->periph_name));
 				cgdl->unit_number = nperiph->unit_number;
 				found = 1;
 			}
 		}
 		if (found == 0) {
 			cgdl->status = CAM_GDEVLIST_ERROR;
 			break;
 		}
 
 		if (nperiph == NULL)
 			cgdl->status = CAM_GDEVLIST_LAST_DEVICE;
 		else
 			cgdl->status = CAM_GDEVLIST_MORE_DEVS;
 
 		cgdl->index++;
 		cgdl->generation = device->generation;
 
 		cgdl->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_DEV_MATCH:
 	{
 		dev_pos_type position_type;
 		struct ccb_dev_match *cdm;
 
 		cdm = &start_ccb->cdm;
 
 		/*
 		 * There are two ways of getting at information in the EDT.
 		 * The first way is via the primary EDT tree.  It starts
 		 * with a list of buses, then a list of targets on a bus,
 		 * then devices/luns on a target, and then peripherals on a
 		 * device/lun.  The "other" way is by the peripheral driver
 		 * lists.  The peripheral driver lists are organized by
 		 * peripheral driver.  (obviously)  So it makes sense to
 		 * use the peripheral driver list if the user is looking
 		 * for something like "da1", or all "da" devices.  If the
 		 * user is looking for something on a particular bus/target
 		 * or lun, it's generally better to go through the EDT tree.
 		 */
 
 		if (cdm->pos.position_type != CAM_DEV_POS_NONE)
 			position_type = cdm->pos.position_type;
 		else {
 			u_int i;
 
 			position_type = CAM_DEV_POS_NONE;
 
 			for (i = 0; i < cdm->num_patterns; i++) {
 				if ((cdm->patterns[i].type == DEV_MATCH_BUS)
 				 ||(cdm->patterns[i].type == DEV_MATCH_DEVICE)){
 					position_type = CAM_DEV_POS_EDT;
 					break;
 				}
 			}
 
 			if (cdm->num_patterns == 0)
 				position_type = CAM_DEV_POS_EDT;
 			else if (position_type == CAM_DEV_POS_NONE)
 				position_type = CAM_DEV_POS_PDRV;
 		}
 
 		switch(position_type & CAM_DEV_POS_TYPEMASK) {
 		case CAM_DEV_POS_EDT:
 			xptedtmatch(cdm);
 			break;
 		case CAM_DEV_POS_PDRV:
 			xptperiphlistmatch(cdm);
 			break;
 		default:
 			cdm->status = CAM_DEV_MATCH_ERROR;
 			break;
 		}
 
 		if (cdm->status == CAM_DEV_MATCH_ERROR)
 			start_ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 		else
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 
 		break;
 	}
 	case XPT_SASYNC_CB:
 	{
 		struct ccb_setasync *csa;
 		struct async_node *cur_entry;
 		struct async_list *async_head;
 		u_int32_t added;
 
 		csa = &start_ccb->csa;
 		added = csa->event_enable;
 		async_head = &path->device->asyncs;
 
 		/*
 		 * If there is already an entry for us, simply
 		 * update it.
 		 */
 		cur_entry = SLIST_FIRST(async_head);
 		while (cur_entry != NULL) {
 			if ((cur_entry->callback_arg == csa->callback_arg)
 			 && (cur_entry->callback == csa->callback))
 				break;
 			cur_entry = SLIST_NEXT(cur_entry, links);
 		}
 
 		if (cur_entry != NULL) {
 		 	/*
 			 * If the request has no flags set,
 			 * remove the entry.
 			 */
 			added &= ~cur_entry->event_enable;
 			if (csa->event_enable == 0) {
 				SLIST_REMOVE(async_head, cur_entry,
 					     async_node, links);
 				xpt_release_device(path->device);
 				free(cur_entry, M_CAMXPT);
 			} else {
 				cur_entry->event_enable = csa->event_enable;
 			}
 			csa->event_enable = added;
 		} else {
 			cur_entry = malloc(sizeof(*cur_entry), M_CAMXPT,
 					   M_NOWAIT);
 			if (cur_entry == NULL) {
 				csa->ccb_h.status = CAM_RESRC_UNAVAIL;
 				break;
 			}
 			cur_entry->event_enable = csa->event_enable;
 			cur_entry->event_lock = (path->bus->sim->mtx &&
 			    mtx_owned(path->bus->sim->mtx)) ? 1 : 0;
 			cur_entry->callback_arg = csa->callback_arg;
 			cur_entry->callback = csa->callback;
 			SLIST_INSERT_HEAD(async_head, cur_entry, links);
 			xpt_acquire_device(path->device);
 		}
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_REL_SIMQ:
 	{
 		struct ccb_relsim *crs;
 		struct cam_ed *dev;
 
 		crs = &start_ccb->crs;
 		dev = path->device;
 		if (dev == NULL) {
 
 			crs->ccb_h.status = CAM_DEV_NOT_THERE;
 			break;
 		}
 
 		if ((crs->release_flags & RELSIM_ADJUST_OPENINGS) != 0) {
 
 			/* Don't ever go below one opening */
 			if (crs->openings > 0) {
 				xpt_dev_ccbq_resize(path, crs->openings);
 				if (bootverbose) {
 					xpt_print(path,
 					    "number of openings is now %d\n",
 					    crs->openings);
 				}
 			}
 		}
 
 		mtx_lock(&dev->sim->devq->send_mtx);
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_TIMEOUT) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
 
 				/*
 				 * Just extend the old timeout and decrement
 				 * the freeze count so that a single timeout
 				 * is sufficient for releasing the queue.
 				 */
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 				callout_stop(&dev->callout);
 			} else {
 
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 
 			callout_reset_sbt(&dev->callout,
 			    SBT_1MS * crs->release_timeout, 0,
 			    xpt_release_devq_timeout, dev, 0);
 
 			dev->flags |= CAM_DEV_REL_TIMEOUT_PENDING;
 
 		}
 
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_CMDCMPLT) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0) {
 				/*
 				 * Decrement the freeze count so that a single
 				 * completion is still sufficient to unfreeze
 				 * the queue.
 				 */
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 			} else {
 
 				dev->flags |= CAM_DEV_REL_ON_COMPLETE;
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 		}
 
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_QEMPTY) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
 			 || (dev->ccbq.dev_active == 0)) {
 
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 			} else {
 
 				dev->flags |= CAM_DEV_REL_ON_QUEUE_EMPTY;
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 		}
 		mtx_unlock(&dev->sim->devq->send_mtx);
 
 		if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) == 0)
 			xpt_release_devq(path, /*count*/1, /*run_queue*/TRUE);
 		start_ccb->crs.qfrozen_cnt = dev->ccbq.queue.qfrozen_cnt;
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_DEBUG: {
 		struct cam_path *oldpath;
 
 		/* Check that all request bits are supported. */
 		if (start_ccb->cdbg.flags & ~(CAM_DEBUG_COMPILE)) {
 			start_ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
 			break;
 		}
 
 		cam_dflags = CAM_DEBUG_NONE;
 		if (cam_dpath != NULL) {
 			oldpath = cam_dpath;
 			cam_dpath = NULL;
 			xpt_free_path(oldpath);
 		}
 		if (start_ccb->cdbg.flags != CAM_DEBUG_NONE) {
 			if (xpt_create_path(&cam_dpath, NULL,
 					    start_ccb->ccb_h.path_id,
 					    start_ccb->ccb_h.target_id,
 					    start_ccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP) {
 				start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 			} else {
 				cam_dflags = start_ccb->cdbg.flags;
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				xpt_print(cam_dpath, "debugging flags now %x\n",
 				    cam_dflags);
 			}
 		} else
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_NOOP:
 		if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0)
 			xpt_freeze_devq(path, 1);
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_REPROBE_LUN:
 		xpt_async(AC_INQ_CHANGED, path, NULL);
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(start_ccb);
 		break;
 	default:
 	case XPT_SDEV_TYPE:
 	case XPT_TERM_IO:
 	case XPT_ENG_INQ:
 		/* XXX Implement */
 		xpt_print(start_ccb->ccb_h.path,
 		    "%s: CCB type %#x %s not supported\n", __func__,
 		    start_ccb->ccb_h.func_code,
 		    xpt_action_name(start_ccb->ccb_h.func_code));
 		start_ccb->ccb_h.status = CAM_PROVIDE_FAIL;
 		if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) {
 			xpt_done(start_ccb);
 		}
 		break;
 	}
 	CAM_DEBUG(path, CAM_DEBUG_TRACE,
 	    ("xpt_action_default: func= %#x %s status %#x\n",
 		start_ccb->ccb_h.func_code,
  		xpt_action_name(start_ccb->ccb_h.func_code),
 		start_ccb->ccb_h.status));
 }
 
 /*
  * Call the sim poll routine to allow the sim to complete
  * any inflight requests, then call camisr_runqueue to
  * complete any CCB that the polling completed.
  */
 void
 xpt_sim_poll(struct cam_sim *sim)
 {
 	struct mtx *mtx;
 
 	mtx = sim->mtx;
 	if (mtx)
 		mtx_lock(mtx);
 	(*(sim->sim_poll))(sim);
 	if (mtx)
 		mtx_unlock(mtx);
 	camisr_runqueue();
 }
 
 uint32_t
 xpt_poll_setup(union ccb *start_ccb)
 {
 	u_int32_t timeout;
 	struct	  cam_sim *sim;
 	struct	  cam_devq *devq;
 	struct	  cam_ed *dev;
 
 	timeout = start_ccb->ccb_h.timeout * 10;
 	sim = start_ccb->ccb_h.path->bus->sim;
 	devq = sim->devq;
 	dev = start_ccb->ccb_h.path->device;
 
 	/*
 	 * Steal an opening so that no other queued requests
 	 * can get it before us while we simulate interrupts.
 	 */
 	mtx_lock(&devq->send_mtx);
 	dev->ccbq.dev_openings--;
 	while((devq->send_openings <= 0 || dev->ccbq.dev_openings < 0) &&
 	    (--timeout > 0)) {
 		mtx_unlock(&devq->send_mtx);
 		DELAY(100);
 		xpt_sim_poll(sim);
 		mtx_lock(&devq->send_mtx);
 	}
 	dev->ccbq.dev_openings++;
 	mtx_unlock(&devq->send_mtx);
 
 	return (timeout);
 }
 
 void
 xpt_pollwait(union ccb *start_ccb, uint32_t timeout)
 {
 
 	while (--timeout > 0) {
 		xpt_sim_poll(start_ccb->ccb_h.path->bus->sim);
 		if ((start_ccb->ccb_h.status & CAM_STATUS_MASK)
 		    != CAM_REQ_INPROG)
 			break;
 		DELAY(100);
 	}
 
 	if (timeout == 0) {
 		/*
 		 * XXX Is it worth adding a sim_timeout entry
 		 * point so we can attempt recovery?  If
 		 * this is only used for dumps, I don't think
 		 * it is.
 		 */
 		start_ccb->ccb_h.status = CAM_CMD_TIMEOUT;
 	}
 }
 
 void
 xpt_polled_action(union ccb *start_ccb)
 {
 	uint32_t	timeout;
 	struct cam_ed	*dev;
 
 	timeout = start_ccb->ccb_h.timeout * 10;
 	dev = start_ccb->ccb_h.path->device;
 
 	mtx_unlock(&dev->device_mtx);
 
 	timeout = xpt_poll_setup(start_ccb);
 	if (timeout > 0) {
 		xpt_action(start_ccb);
 		xpt_pollwait(start_ccb, timeout);
 	} else {
 		start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 	}
 
 	mtx_lock(&dev->device_mtx);
 }
 
 /*
  * Schedule a peripheral driver to receive a ccb when its
  * target device has space for more transactions.
  */
 void
 xpt_schedule(struct cam_periph *periph, u_int32_t new_priority)
 {
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("xpt_schedule\n"));
 	cam_periph_assert(periph, MA_OWNED);
 	if (new_priority < periph->scheduled_priority) {
 		periph->scheduled_priority = new_priority;
 		xpt_run_allocq(periph, 0);
 	}
 }
 
 
 /*
  * Schedule a device to run on a given queue.
  * If the device was inserted as a new entry on the queue,
  * return 1 meaning the device queue should be run. If we
  * were already queued, implying someone else has already
  * started the queue, return 0 so the caller doesn't attempt
  * to run the queue.
  */
 static int
 xpt_schedule_dev(struct camq *queue, cam_pinfo *pinfo,
 		 u_int32_t new_priority)
 {
 	int retval;
 	u_int32_t old_priority;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_schedule_dev\n"));
 
 
 	old_priority = pinfo->priority;
 
 	/*
 	 * Are we already queued?
 	 */
 	if (pinfo->index != CAM_UNQUEUED_INDEX) {
 		/* Simply reorder based on new priority */
 		if (new_priority < old_priority) {
 			camq_change_priority(queue, pinfo->index,
 					     new_priority);
 			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 					("changed priority to %d\n",
 					 new_priority));
 			retval = 1;
 		} else
 			retval = 0;
 	} else {
 		/* New entry on the queue */
 		if (new_priority < old_priority)
 			pinfo->priority = new_priority;
 
 		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 				("Inserting onto queue\n"));
 		pinfo->generation = ++queue->generation;
 		camq_insert(queue, pinfo);
 		retval = 1;
 	}
 	return (retval);
 }
 
 static void
 xpt_run_allocq_task(void *context, int pending)
 {
 	struct cam_periph *periph = context;
 
 	cam_periph_lock(periph);
 	periph->flags &= ~CAM_PERIPH_RUN_TASK;
 	xpt_run_allocq(periph, 1);
 	cam_periph_unlock(periph);
 	cam_periph_release(periph);
 }
 
 static void
 xpt_run_allocq(struct cam_periph *periph, int sleep)
 {
 	struct cam_ed	*device;
 	union ccb	*ccb;
 	uint32_t	 prio;
 
 	cam_periph_assert(periph, MA_OWNED);
 	if (periph->periph_allocating)
 		return;
 	cam_periph_doacquire(periph);
 	periph->periph_allocating = 1;
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_allocq(%p)\n", periph));
 	device = periph->path->device;
 	ccb = NULL;
 restart:
 	while ((prio = min(periph->scheduled_priority,
 	    periph->immediate_priority)) != CAM_PRIORITY_NONE &&
 	    (periph->periph_allocated - (ccb != NULL ? 1 : 0) <
 	     device->ccbq.total_openings || prio <= CAM_PRIORITY_OOB)) {
 
 		if (ccb == NULL &&
 		    (ccb = xpt_get_ccb_nowait(periph)) == NULL) {
 			if (sleep) {
 				ccb = xpt_get_ccb(periph);
 				goto restart;
 			}
 			if (periph->flags & CAM_PERIPH_RUN_TASK)
 				break;
 			cam_periph_doacquire(periph);
 			periph->flags |= CAM_PERIPH_RUN_TASK;
 			taskqueue_enqueue(xsoftc.xpt_taskq,
 			    &periph->periph_run_task);
 			break;
 		}
 		xpt_setup_ccb(&ccb->ccb_h, periph->path, prio);
 		if (prio == periph->immediate_priority) {
 			periph->immediate_priority = CAM_PRIORITY_NONE;
 			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 					("waking cam_periph_getccb()\n"));
 			SLIST_INSERT_HEAD(&periph->ccb_list, &ccb->ccb_h,
 					  periph_links.sle);
 			wakeup(&periph->ccb_list);
 		} else {
 			periph->scheduled_priority = CAM_PRIORITY_NONE;
 			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 					("calling periph_start()\n"));
 			periph->periph_start(periph, ccb);
 		}
 		ccb = NULL;
 	}
 	if (ccb != NULL)
 		xpt_release_ccb(ccb);
 	periph->periph_allocating = 0;
 	cam_periph_release_locked(periph);
 }
 
 static void
 xpt_run_devq(struct cam_devq *devq)
 {
 	struct mtx *mtx;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_devq\n"));
 
 	devq->send_queue.qfrozen_cnt++;
 	while ((devq->send_queue.entries > 0)
 	    && (devq->send_openings > 0)
 	    && (devq->send_queue.qfrozen_cnt <= 1)) {
 		struct	cam_ed *device;
 		union ccb *work_ccb;
 		struct	cam_sim *sim;
 		struct xpt_proto *proto;
 
 		device = (struct cam_ed *)camq_remove(&devq->send_queue,
 							   CAMQ_HEAD);
 		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 				("running device %p\n", device));
 
 		work_ccb = cam_ccbq_peek_ccb(&device->ccbq, CAMQ_HEAD);
 		if (work_ccb == NULL) {
 			printf("device on run queue with no ccbs???\n");
 			continue;
 		}
 
 		if ((work_ccb->ccb_h.flags & CAM_HIGH_POWER) != 0) {
 
 			mtx_lock(&xsoftc.xpt_highpower_lock);
 		 	if (xsoftc.num_highpower <= 0) {
 				/*
 				 * We got a high power command, but we
 				 * don't have any available slots.  Freeze
 				 * the device queue until we have a slot
 				 * available.
 				 */
 				xpt_freeze_devq_device(device, 1);
 				STAILQ_INSERT_TAIL(&xsoftc.highpowerq, device,
 						   highpowerq_entry);
 
 				mtx_unlock(&xsoftc.xpt_highpower_lock);
 				continue;
 			} else {
 				/*
 				 * Consume a high power slot while
 				 * this ccb runs.
 				 */
 				xsoftc.num_highpower--;
 			}
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 		}
 		cam_ccbq_remove_ccb(&device->ccbq, work_ccb);
 		cam_ccbq_send_ccb(&device->ccbq, work_ccb);
 		devq->send_openings--;
 		devq->send_active++;
 		xpt_schedule_devq(devq, device);
 		mtx_unlock(&devq->send_mtx);
 
 		if ((work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) {
 			/*
 			 * The client wants to freeze the queue
 			 * after this CCB is sent.
 			 */
 			xpt_freeze_devq(work_ccb->ccb_h.path, 1);
 		}
 
 		/* In Target mode, the peripheral driver knows best... */
 		if (work_ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			if ((device->inq_flags & SID_CmdQue) != 0
 			 && work_ccb->csio.tag_action != CAM_TAG_ACTION_NONE)
 				work_ccb->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 			else
 				/*
 				 * Clear this in case of a retried CCB that
 				 * failed due to a rejected tag.
 				 */
 				work_ccb->ccb_h.flags &= ~CAM_TAG_ACTION_VALID;
 		}
 
 		KASSERT(device == work_ccb->ccb_h.path->device,
 		    ("device (%p) / path->device (%p) mismatch",
 			device, work_ccb->ccb_h.path->device));
 		proto = xpt_proto_find(device->protocol);
 		if (proto && proto->ops->debug_out)
 			proto->ops->debug_out(work_ccb);
 
 		/*
 		 * Device queues can be shared among multiple SIM instances
 		 * that reside on different buses.  Use the SIM from the
 		 * queued device, rather than the one from the calling bus.
 		 */
 		sim = device->sim;
 		mtx = sim->mtx;
 		if (mtx && !mtx_owned(mtx))
 			mtx_lock(mtx);
 		else
 			mtx = NULL;
 		work_ccb->ccb_h.qos.periph_data = cam_iosched_now();
 		(*(sim->sim_action))(sim, work_ccb);
 		if (mtx)
 			mtx_unlock(mtx);
 		mtx_lock(&devq->send_mtx);
 	}
 	devq->send_queue.qfrozen_cnt--;
 }
 
 /*
  * This function merges stuff from the slave ccb into the master ccb, while
  * keeping important fields in the master ccb constant.
  */
 void
 xpt_merge_ccb(union ccb *master_ccb, union ccb *slave_ccb)
 {
 
 	/*
 	 * Pull fields that are valid for peripheral drivers to set
 	 * into the master CCB along with the CCB "payload".
 	 */
 	master_ccb->ccb_h.retry_count = slave_ccb->ccb_h.retry_count;
 	master_ccb->ccb_h.func_code = slave_ccb->ccb_h.func_code;
 	master_ccb->ccb_h.timeout = slave_ccb->ccb_h.timeout;
 	master_ccb->ccb_h.flags = slave_ccb->ccb_h.flags;
 	bcopy(&(&slave_ccb->ccb_h)[1], &(&master_ccb->ccb_h)[1],
 	      sizeof(union ccb) - sizeof(struct ccb_hdr));
 }
 
 void
 xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path,
 		    u_int32_t priority, u_int32_t flags)
 {
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n"));
 	ccb_h->pinfo.priority = priority;
 	ccb_h->path = path;
 	ccb_h->path_id = path->bus->path_id;
 	if (path->target)
 		ccb_h->target_id = path->target->target_id;
 	else
 		ccb_h->target_id = CAM_TARGET_WILDCARD;
 	if (path->device) {
 		ccb_h->target_lun = path->device->lun_id;
 		ccb_h->pinfo.generation = ++path->device->ccbq.queue.generation;
 	} else {
 		ccb_h->target_lun = CAM_TARGET_WILDCARD;
 	}
 	ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
 	ccb_h->flags = flags;
 	ccb_h->xflags = 0;
 }
 
 void
 xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority)
 {
 	xpt_setup_ccb_flags(ccb_h, path, priority, /*flags*/ 0);
 }
 
 /* Path manipulation functions */
 cam_status
 xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph,
 		path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
 {
 	struct	   cam_path *path;
 	cam_status status;
 
 	path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT);
 
 	if (path == NULL) {
 		status = CAM_RESRC_UNAVAIL;
 		return(status);
 	}
 	status = xpt_compile_path(path, perph, path_id, target_id, lun_id);
 	if (status != CAM_REQ_CMP) {
 		free(path, M_CAMPATH);
 		path = NULL;
 	}
 	*new_path_ptr = path;
 	return (status);
 }
 
 cam_status
 xpt_create_path_unlocked(struct cam_path **new_path_ptr,
 			 struct cam_periph *periph, path_id_t path_id,
 			 target_id_t target_id, lun_id_t lun_id)
 {
 
 	return (xpt_create_path(new_path_ptr, periph, path_id, target_id,
 	    lun_id));
 }
 
 cam_status
 xpt_compile_path(struct cam_path *new_path, struct cam_periph *perph,
 		 path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
 {
 	struct	     cam_eb *bus;
 	struct	     cam_et *target;
 	struct	     cam_ed *device;
 	cam_status   status;
 
 	status = CAM_REQ_CMP;	/* Completed without error */
 	target = NULL;		/* Wildcarded */
 	device = NULL;		/* Wildcarded */
 
 	/*
 	 * We will potentially modify the EDT, so block interrupts
 	 * that may attempt to create cam paths.
 	 */
 	bus = xpt_find_bus(path_id);
 	if (bus == NULL) {
 		status = CAM_PATH_INVALID;
 	} else {
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		target = xpt_find_target(bus, target_id);
 		if (target == NULL) {
 			/* Create one */
 			struct cam_et *new_target;
 
 			new_target = xpt_alloc_target(bus, target_id);
 			if (new_target == NULL) {
 				status = CAM_RESRC_UNAVAIL;
 			} else {
 				target = new_target;
 			}
 		}
 		xpt_unlock_buses();
 		if (target != NULL) {
 			device = xpt_find_device(target, lun_id);
 			if (device == NULL) {
 				/* Create one */
 				struct cam_ed *new_device;
 
 				new_device =
 				    (*(bus->xport->ops->alloc_device))(bus,
 								       target,
 								       lun_id);
 				if (new_device == NULL) {
 					status = CAM_RESRC_UNAVAIL;
 				} else {
 					device = new_device;
 				}
 			}
 		}
 		mtx_unlock(&bus->eb_mtx);
 	}
 
 	/*
 	 * Only touch the user's data if we are successful.
 	 */
 	if (status == CAM_REQ_CMP) {
 		new_path->periph = perph;
 		new_path->bus = bus;
 		new_path->target = target;
 		new_path->device = device;
 		CAM_DEBUG(new_path, CAM_DEBUG_TRACE, ("xpt_compile_path\n"));
 	} else {
 		if (device != NULL)
 			xpt_release_device(device);
 		if (target != NULL)
 			xpt_release_target(target);
 		if (bus != NULL)
 			xpt_release_bus(bus);
 	}
 	return (status);
 }
 
 cam_status
 xpt_clone_path(struct cam_path **new_path_ptr, struct cam_path *path)
 {
 	struct	   cam_path *new_path;
 
 	new_path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT);
 	if (new_path == NULL)
 		return(CAM_RESRC_UNAVAIL);
 	xpt_copy_path(new_path, path);
 	*new_path_ptr = new_path;
 	return (CAM_REQ_CMP);
 }
 
 void
 xpt_copy_path(struct cam_path *new_path, struct cam_path *path)
 {
 
 	*new_path = *path;
 	if (path->bus != NULL)
 		xpt_acquire_bus(path->bus);
 	if (path->target != NULL)
 		xpt_acquire_target(path->target);
 	if (path->device != NULL)
 		xpt_acquire_device(path->device);
 }
 
 void
 xpt_release_path(struct cam_path *path)
 {
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_path\n"));
 	if (path->device != NULL) {
 		xpt_release_device(path->device);
 		path->device = NULL;
 	}
 	if (path->target != NULL) {
 		xpt_release_target(path->target);
 		path->target = NULL;
 	}
 	if (path->bus != NULL) {
 		xpt_release_bus(path->bus);
 		path->bus = NULL;
 	}
 }
 
 void
 xpt_free_path(struct cam_path *path)
 {
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_free_path\n"));
 	xpt_release_path(path);
 	free(path, M_CAMPATH);
 }
 
 void
 xpt_path_counts(struct cam_path *path, uint32_t *bus_ref,
     uint32_t *periph_ref, uint32_t *target_ref, uint32_t *device_ref)
 {
 
 	xpt_lock_buses();
 	if (bus_ref) {
 		if (path->bus)
 			*bus_ref = path->bus->refcount;
 		else
 			*bus_ref = 0;
 	}
 	if (periph_ref) {
 		if (path->periph)
 			*periph_ref = path->periph->refcount;
 		else
 			*periph_ref = 0;
 	}
 	xpt_unlock_buses();
 	if (target_ref) {
 		if (path->target)
 			*target_ref = path->target->refcount;
 		else
 			*target_ref = 0;
 	}
 	if (device_ref) {
 		if (path->device)
 			*device_ref = path->device->refcount;
 		else
 			*device_ref = 0;
 	}
 }
 
 /*
  * Return -1 for failure, 0 for exact match, 1 for match with wildcards
  * in path1, 2 for match with wildcards in path2.
  */
 int
 xpt_path_comp(struct cam_path *path1, struct cam_path *path2)
 {
 	int retval = 0;
 
 	if (path1->bus != path2->bus) {
 		if (path1->bus->path_id == CAM_BUS_WILDCARD)
 			retval = 1;
 		else if (path2->bus->path_id == CAM_BUS_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	if (path1->target != path2->target) {
 		if (path1->target->target_id == CAM_TARGET_WILDCARD) {
 			if (retval == 0)
 				retval = 1;
 		} else if (path2->target->target_id == CAM_TARGET_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	if (path1->device != path2->device) {
 		if (path1->device->lun_id == CAM_LUN_WILDCARD) {
 			if (retval == 0)
 				retval = 1;
 		} else if (path2->device->lun_id == CAM_LUN_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	return (retval);
 }
 
 int
 xpt_path_comp_dev(struct cam_path *path, struct cam_ed *dev)
 {
 	int retval = 0;
 
 	if (path->bus != dev->target->bus) {
 		if (path->bus->path_id == CAM_BUS_WILDCARD)
 			retval = 1;
 		else if (dev->target->bus->path_id == CAM_BUS_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	if (path->target != dev->target) {
 		if (path->target->target_id == CAM_TARGET_WILDCARD) {
 			if (retval == 0)
 				retval = 1;
 		} else if (dev->target->target_id == CAM_TARGET_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	if (path->device != dev) {
 		if (path->device->lun_id == CAM_LUN_WILDCARD) {
 			if (retval == 0)
 				retval = 1;
 		} else if (dev->lun_id == CAM_LUN_WILDCARD)
 			retval = 2;
 		else
 			return (-1);
 	}
 	return (retval);
 }
 
 void
 xpt_print_path(struct cam_path *path)
 {
 	struct sbuf sb;
 	char buffer[XPT_PRINT_LEN];
 
 	sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN);
 	xpt_path_sbuf(path, &sb);
 	sbuf_finish(&sb);
 	printf("%s", sbuf_data(&sb));
 	sbuf_delete(&sb);
 }
 
 void
 xpt_print_device(struct cam_ed *device)
 {
 
 	if (device == NULL)
 		printf("(nopath): ");
 	else {
 		printf("(noperiph:%s%d:%d:%d:%jx): ", device->sim->sim_name,
 		       device->sim->unit_number,
 		       device->sim->bus_id,
 		       device->target->target_id,
 		       (uintmax_t)device->lun_id);
 	}
 }
 
 void
 xpt_print(struct cam_path *path, const char *fmt, ...)
 {
 	va_list ap;
 	struct sbuf sb;
 	char buffer[XPT_PRINT_LEN];
 
 	sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN);
 
 	xpt_path_sbuf(path, &sb);
 	va_start(ap, fmt);
 	sbuf_vprintf(&sb, fmt, ap);
 	va_end(ap);
 
 	sbuf_finish(&sb);
 	printf("%s", sbuf_data(&sb));
 	sbuf_delete(&sb);
 }
 
 int
 xpt_path_string(struct cam_path *path, char *str, size_t str_len)
 {
 	struct sbuf sb;
 	int len;
 
 	sbuf_new(&sb, str, str_len, 0);
 	len = xpt_path_sbuf(path, &sb);
 	sbuf_finish(&sb);
 	return (len);
 }
 
 int
 xpt_path_sbuf(struct cam_path *path, struct sbuf *sb)
 {
 
 	if (path == NULL)
 		sbuf_printf(sb, "(nopath): ");
 	else {
 		if (path->periph != NULL)
 			sbuf_printf(sb, "(%s%d:", path->periph->periph_name,
 				    path->periph->unit_number);
 		else
 			sbuf_printf(sb, "(noperiph:");
 
 		if (path->bus != NULL)
 			sbuf_printf(sb, "%s%d:%d:", path->bus->sim->sim_name,
 				    path->bus->sim->unit_number,
 				    path->bus->sim->bus_id);
 		else
 			sbuf_printf(sb, "nobus:");
 
 		if (path->target != NULL)
 			sbuf_printf(sb, "%d:", path->target->target_id);
 		else
 			sbuf_printf(sb, "X:");
 
 		if (path->device != NULL)
 			sbuf_printf(sb, "%jx): ",
 			    (uintmax_t)path->device->lun_id);
 		else
 			sbuf_printf(sb, "X): ");
 	}
 
 	return(sbuf_len(sb));
 }
 
 path_id_t
 xpt_path_path_id(struct cam_path *path)
 {
 	return(path->bus->path_id);
 }
 
 target_id_t
 xpt_path_target_id(struct cam_path *path)
 {
 	if (path->target != NULL)
 		return (path->target->target_id);
 	else
 		return (CAM_TARGET_WILDCARD);
 }
 
 lun_id_t
 xpt_path_lun_id(struct cam_path *path)
 {
 	if (path->device != NULL)
 		return (path->device->lun_id);
 	else
 		return (CAM_LUN_WILDCARD);
 }
 
 struct cam_sim *
 xpt_path_sim(struct cam_path *path)
 {
 
 	return (path->bus->sim);
 }
 
 struct cam_periph*
 xpt_path_periph(struct cam_path *path)
 {
 
 	return (path->periph);
 }
 
 /*
  * Release a CAM control block for the caller.  Remit the cost of the structure
  * to the device referenced by the path.  If the this device had no 'credits'
  * and peripheral drivers have registered async callbacks for this notification
  * call them now.
  */
 void
 xpt_release_ccb(union ccb *free_ccb)
 {
 	struct	 cam_ed *device;
 	struct	 cam_periph *periph;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_release_ccb\n"));
 	xpt_path_assert(free_ccb->ccb_h.path, MA_OWNED);
 	device = free_ccb->ccb_h.path->device;
 	periph = free_ccb->ccb_h.path->periph;
 
 	xpt_free_ccb(free_ccb);
 	periph->periph_allocated--;
 	cam_ccbq_release_opening(&device->ccbq);
 	xpt_run_allocq(periph, 0);
 }
 
 /* Functions accessed by SIM drivers */
 
 static struct xpt_xport_ops xport_default_ops = {
 	.alloc_device = xpt_alloc_device_default,
 	.action = xpt_action_default,
 	.async = xpt_dev_async_default,
 };
 static struct xpt_xport xport_default = {
 	.xport = XPORT_UNKNOWN,
 	.name = "unknown",
 	.ops = &xport_default_ops,
 };
 
 CAM_XPT_XPORT(xport_default);
 
 /*
  * A sim structure, listing the SIM entry points and instance
  * identification info is passed to xpt_bus_register to hook the SIM
  * into the CAM framework.  xpt_bus_register creates a cam_eb entry
  * for this new bus and places it in the array of buses and assigns
  * it a path_id.  The path_id may be influenced by "hard wiring"
  * information specified by the user.  Once interrupt services are
  * available, the bus will be probed.
  */
 int32_t
 xpt_bus_register(struct cam_sim *sim, device_t parent, u_int32_t bus)
 {
 	struct cam_eb *new_bus;
 	struct cam_eb *old_bus;
 	struct ccb_pathinq cpi;
 	struct cam_path *path;
 	cam_status status;
 
 	sim->bus_id = bus;
 	new_bus = (struct cam_eb *)malloc(sizeof(*new_bus),
 					  M_CAMXPT, M_NOWAIT|M_ZERO);
 	if (new_bus == NULL) {
 		/* Couldn't satisfy request */
 		return (CAM_RESRC_UNAVAIL);
 	}
 
 	mtx_init(&new_bus->eb_mtx, "CAM bus lock", NULL, MTX_DEF);
 	TAILQ_INIT(&new_bus->et_entries);
 	cam_sim_hold(sim);
 	new_bus->sim = sim;
 	timevalclear(&new_bus->last_reset);
 	new_bus->flags = 0;
 	new_bus->refcount = 1;	/* Held until a bus_deregister event */
 	new_bus->generation = 0;
 
 	xpt_lock_buses();
 	sim->path_id = new_bus->path_id =
 	    xptpathid(sim->sim_name, sim->unit_number, sim->bus_id);
 	old_bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 	while (old_bus != NULL
 	    && old_bus->path_id < new_bus->path_id)
 		old_bus = TAILQ_NEXT(old_bus, links);
 	if (old_bus != NULL)
 		TAILQ_INSERT_BEFORE(old_bus, new_bus, links);
 	else
 		TAILQ_INSERT_TAIL(&xsoftc.xpt_busses, new_bus, links);
 	xsoftc.bus_generation++;
 	xpt_unlock_buses();
 
 	/*
 	 * Set a default transport so that a PATH_INQ can be issued to
 	 * the SIM.  This will then allow for probing and attaching of
 	 * a more appropriate transport.
 	 */
 	new_bus->xport = &xport_default;
 
 	status = xpt_create_path(&path, /*periph*/NULL, sim->path_id,
 				  CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 	if (status != CAM_REQ_CMP) {
 		xpt_release_bus(new_bus);
 		return (CAM_RESRC_UNAVAIL);
 	}
 
 	xpt_path_inq(&cpi, path);
 
 	if (cpi.ccb_h.status == CAM_REQ_CMP) {
 		struct xpt_xport **xpt;
 
 		SET_FOREACH(xpt, cam_xpt_xport_set) {
 			if ((*xpt)->xport == cpi.transport) {
 				new_bus->xport = *xpt;
 				break;
 			}
 		}
 		if (new_bus->xport == NULL) {
 			xpt_print(path,
 			    "No transport found for %d\n", cpi.transport);
 			xpt_release_bus(new_bus);
 			free(path, M_CAMXPT);
 			return (CAM_RESRC_UNAVAIL);
 		}
 	}
 
 	/* Notify interested parties */
 	if (sim->path_id != CAM_XPT_PATH_ID) {
 
 		xpt_async(AC_PATH_REGISTERED, path, &cpi);
 		if ((cpi.hba_misc & PIM_NOSCAN) == 0) {
 			union	ccb *scan_ccb;
 
 			/* Initiate bus rescan. */
 			scan_ccb = xpt_alloc_ccb_nowait();
 			if (scan_ccb != NULL) {
 				scan_ccb->ccb_h.path = path;
 				scan_ccb->ccb_h.func_code = XPT_SCAN_BUS;
 				scan_ccb->crcn.flags = 0;
 				xpt_rescan(scan_ccb);
 			} else {
 				xpt_print(path,
 					  "Can't allocate CCB to scan bus\n");
 				xpt_free_path(path);
 			}
 		} else
 			xpt_free_path(path);
 	} else
 		xpt_free_path(path);
 	return (CAM_SUCCESS);
 }
 
 int32_t
 xpt_bus_deregister(path_id_t pathid)
 {
 	struct cam_path bus_path;
 	cam_status status;
 
 	status = xpt_compile_path(&bus_path, NULL, pathid,
 				  CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 	if (status != CAM_REQ_CMP)
 		return (status);
 
 	xpt_async(AC_LOST_DEVICE, &bus_path, NULL);
 	xpt_async(AC_PATH_DEREGISTERED, &bus_path, NULL);
 
 	/* Release the reference count held while registered. */
 	xpt_release_bus(bus_path.bus);
 	xpt_release_path(&bus_path);
 
 	return (CAM_REQ_CMP);
 }
 
 static path_id_t
 xptnextfreepathid(void)
 {
 	struct cam_eb *bus;
 	path_id_t pathid;
 	const char *strval;
 
 	mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
 	pathid = 0;
 	bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 retry:
 	/* Find an unoccupied pathid */
 	while (bus != NULL && bus->path_id <= pathid) {
 		if (bus->path_id == pathid)
 			pathid++;
 		bus = TAILQ_NEXT(bus, links);
 	}
 
 	/*
 	 * Ensure that this pathid is not reserved for
 	 * a bus that may be registered in the future.
 	 */
 	if (resource_string_value("scbus", pathid, "at", &strval) == 0) {
 		++pathid;
 		/* Start the search over */
 		goto retry;
 	}
 	return (pathid);
 }
 
 static path_id_t
 xptpathid(const char *sim_name, int sim_unit, int sim_bus)
 {
 	path_id_t pathid;
 	int i, dunit, val;
 	char buf[32];
 	const char *dname;
 
 	pathid = CAM_XPT_PATH_ID;
 	snprintf(buf, sizeof(buf), "%s%d", sim_name, sim_unit);
 	if (strcmp(buf, "xpt0") == 0 && sim_bus == 0)
 		return (pathid);
 	i = 0;
 	while ((resource_find_match(&i, &dname, &dunit, "at", buf)) == 0) {
 		if (strcmp(dname, "scbus")) {
 			/* Avoid a bit of foot shooting. */
 			continue;
 		}
 		if (dunit < 0)		/* unwired?! */
 			continue;
 		if (resource_int_value("scbus", dunit, "bus", &val) == 0) {
 			if (sim_bus == val) {
 				pathid = dunit;
 				break;
 			}
 		} else if (sim_bus == 0) {
 			/* Unspecified matches bus 0 */
 			pathid = dunit;
 			break;
 		} else {
 			printf("Ambiguous scbus configuration for %s%d "
 			       "bus %d, cannot wire down.  The kernel "
 			       "config entry for scbus%d should "
 			       "specify a controller bus.\n"
 			       "Scbus will be assigned dynamically.\n",
 			       sim_name, sim_unit, sim_bus, dunit);
 			break;
 		}
 	}
 
 	if (pathid == CAM_XPT_PATH_ID)
 		pathid = xptnextfreepathid();
 	return (pathid);
 }
 
 static const char *
 xpt_async_string(u_int32_t async_code)
 {
 
 	switch (async_code) {
 	case AC_BUS_RESET: return ("AC_BUS_RESET");
 	case AC_UNSOL_RESEL: return ("AC_UNSOL_RESEL");
 	case AC_SCSI_AEN: return ("AC_SCSI_AEN");
 	case AC_SENT_BDR: return ("AC_SENT_BDR");
 	case AC_PATH_REGISTERED: return ("AC_PATH_REGISTERED");
 	case AC_PATH_DEREGISTERED: return ("AC_PATH_DEREGISTERED");
 	case AC_FOUND_DEVICE: return ("AC_FOUND_DEVICE");
 	case AC_LOST_DEVICE: return ("AC_LOST_DEVICE");
 	case AC_TRANSFER_NEG: return ("AC_TRANSFER_NEG");
 	case AC_INQ_CHANGED: return ("AC_INQ_CHANGED");
 	case AC_GETDEV_CHANGED: return ("AC_GETDEV_CHANGED");
 	case AC_CONTRACT: return ("AC_CONTRACT");
 	case AC_ADVINFO_CHANGED: return ("AC_ADVINFO_CHANGED");
 	case AC_UNIT_ATTENTION: return ("AC_UNIT_ATTENTION");
 	}
 	return ("AC_UNKNOWN");
 }
 
 static int
 xpt_async_size(u_int32_t async_code)
 {
 
 	switch (async_code) {
 	case AC_BUS_RESET: return (0);
 	case AC_UNSOL_RESEL: return (0);
 	case AC_SCSI_AEN: return (0);
 	case AC_SENT_BDR: return (0);
 	case AC_PATH_REGISTERED: return (sizeof(struct ccb_pathinq));
 	case AC_PATH_DEREGISTERED: return (0);
 	case AC_FOUND_DEVICE: return (sizeof(struct ccb_getdev));
 	case AC_LOST_DEVICE: return (0);
 	case AC_TRANSFER_NEG: return (sizeof(struct ccb_trans_settings));
 	case AC_INQ_CHANGED: return (0);
 	case AC_GETDEV_CHANGED: return (0);
 	case AC_CONTRACT: return (sizeof(struct ac_contract));
 	case AC_ADVINFO_CHANGED: return (-1);
 	case AC_UNIT_ATTENTION: return (sizeof(struct ccb_scsiio));
 	}
 	return (0);
 }
 
 static int
 xpt_async_process_dev(struct cam_ed *device, void *arg)
 {
 	union ccb *ccb = arg;
 	struct cam_path *path = ccb->ccb_h.path;
 	void *async_arg = ccb->casync.async_arg_ptr;
 	u_int32_t async_code = ccb->casync.async_code;
 	int relock;
 
 	if (path->device != device
 	 && path->device->lun_id != CAM_LUN_WILDCARD
 	 && device->lun_id != CAM_LUN_WILDCARD)
 		return (1);
 
 	/*
 	 * The async callback could free the device.
 	 * If it is a broadcast async, it doesn't hold
 	 * device reference, so take our own reference.
 	 */
 	xpt_acquire_device(device);
 
 	/*
 	 * If async for specific device is to be delivered to
 	 * the wildcard client, take the specific device lock.
 	 * XXX: We may need a way for client to specify it.
 	 */
 	if ((device->lun_id == CAM_LUN_WILDCARD &&
 	     path->device->lun_id != CAM_LUN_WILDCARD) ||
 	    (device->target->target_id == CAM_TARGET_WILDCARD &&
 	     path->target->target_id != CAM_TARGET_WILDCARD) ||
 	    (device->target->bus->path_id == CAM_BUS_WILDCARD &&
 	     path->target->bus->path_id != CAM_BUS_WILDCARD)) {
 		mtx_unlock(&device->device_mtx);
 		xpt_path_lock(path);
 		relock = 1;
 	} else
 		relock = 0;
 
 	(*(device->target->bus->xport->ops->async))(async_code,
 	    device->target->bus, device->target, device, async_arg);
 	xpt_async_bcast(&device->asyncs, async_code, path, async_arg);
 
 	if (relock) {
 		xpt_path_unlock(path);
 		mtx_lock(&device->device_mtx);
 	}
 	xpt_release_device(device);
 	return (1);
 }
 
 static int
 xpt_async_process_tgt(struct cam_et *target, void *arg)
 {
 	union ccb *ccb = arg;
 	struct cam_path *path = ccb->ccb_h.path;
 
 	if (path->target != target
 	 && path->target->target_id != CAM_TARGET_WILDCARD
 	 && target->target_id != CAM_TARGET_WILDCARD)
 		return (1);
 
 	if (ccb->casync.async_code == AC_SENT_BDR) {
 		/* Update our notion of when the last reset occurred */
 		microtime(&target->last_reset);
 	}
 
 	return (xptdevicetraverse(target, NULL, xpt_async_process_dev, ccb));
 }
 
 static void
 xpt_async_process(struct cam_periph *periph, union ccb *ccb)
 {
 	struct cam_eb *bus;
 	struct cam_path *path;
 	void *async_arg;
 	u_int32_t async_code;
 
 	path = ccb->ccb_h.path;
 	async_code = ccb->casync.async_code;
 	async_arg = ccb->casync.async_arg_ptr;
 	CAM_DEBUG(path, CAM_DEBUG_TRACE | CAM_DEBUG_INFO,
 	    ("xpt_async(%s)\n", xpt_async_string(async_code)));
 	bus = path->bus;
 
 	if (async_code == AC_BUS_RESET) {
 		/* Update our notion of when the last reset occurred */
 		microtime(&bus->last_reset);
 	}
 
 	xpttargettraverse(bus, NULL, xpt_async_process_tgt, ccb);
 
 	/*
 	 * If this wasn't a fully wildcarded async, tell all
 	 * clients that want all async events.
 	 */
 	if (bus != xpt_periph->path->bus) {
 		xpt_path_lock(xpt_periph->path);
 		xpt_async_process_dev(xpt_periph->path->device, ccb);
 		xpt_path_unlock(xpt_periph->path);
 	}
 
 	if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
 		xpt_release_devq(path, 1, TRUE);
 	else
 		xpt_release_simq(path->bus->sim, TRUE);
 	if (ccb->casync.async_arg_size > 0)
 		free(async_arg, M_CAMXPT);
 	xpt_free_path(path);
 	xpt_free_ccb(ccb);
 }
 
 static void
 xpt_async_bcast(struct async_list *async_head,
 		u_int32_t async_code,
 		struct cam_path *path, void *async_arg)
 {
 	struct async_node *cur_entry;
 	struct mtx *mtx;
 
 	cur_entry = SLIST_FIRST(async_head);
 	while (cur_entry != NULL) {
 		struct async_node *next_entry;
 		/*
 		 * Grab the next list entry before we call the current
 		 * entry's callback.  This is because the callback function
 		 * can delete its async callback entry.
 		 */
 		next_entry = SLIST_NEXT(cur_entry, links);
 		if ((cur_entry->event_enable & async_code) != 0) {
 			mtx = cur_entry->event_lock ?
 			    path->device->sim->mtx : NULL;
 			if (mtx)
 				mtx_lock(mtx);
 			cur_entry->callback(cur_entry->callback_arg,
 					    async_code, path,
 					    async_arg);
 			if (mtx)
 				mtx_unlock(mtx);
 		}
 		cur_entry = next_entry;
 	}
 }
 
 void
 xpt_async(u_int32_t async_code, struct cam_path *path, void *async_arg)
 {
 	union ccb *ccb;
 	int size;
 
 	ccb = xpt_alloc_ccb_nowait();
 	if (ccb == NULL) {
 		xpt_print(path, "Can't allocate CCB to send %s\n",
 		    xpt_async_string(async_code));
 		return;
 	}
 
 	if (xpt_clone_path(&ccb->ccb_h.path, path) != CAM_REQ_CMP) {
 		xpt_print(path, "Can't allocate path to send %s\n",
 		    xpt_async_string(async_code));
 		xpt_free_ccb(ccb);
 		return;
 	}
 	ccb->ccb_h.path->periph = NULL;
 	ccb->ccb_h.func_code = XPT_ASYNC;
 	ccb->ccb_h.cbfcnp = xpt_async_process;
 	ccb->ccb_h.flags |= CAM_UNLOCKED;
 	ccb->casync.async_code = async_code;
 	ccb->casync.async_arg_size = 0;
 	size = xpt_async_size(async_code);
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_async: func %#x %s aync_code %d %s\n",
 		ccb->ccb_h.func_code,
 		xpt_action_name(ccb->ccb_h.func_code),
 		async_code,
 		xpt_async_string(async_code)));
 	if (size > 0 && async_arg != NULL) {
 		ccb->casync.async_arg_ptr = malloc(size, M_CAMXPT, M_NOWAIT);
 		if (ccb->casync.async_arg_ptr == NULL) {
 			xpt_print(path, "Can't allocate argument to send %s\n",
 			    xpt_async_string(async_code));
 			xpt_free_path(ccb->ccb_h.path);
 			xpt_free_ccb(ccb);
 			return;
 		}
 		memcpy(ccb->casync.async_arg_ptr, async_arg, size);
 		ccb->casync.async_arg_size = size;
 	} else if (size < 0) {
 		ccb->casync.async_arg_ptr = async_arg;
 		ccb->casync.async_arg_size = size;
 	}
 	if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
 		xpt_freeze_devq(path, 1);
 	else
 		xpt_freeze_simq(path->bus->sim, 1);
 	xpt_done(ccb);
 }
 
 static void
 xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus,
 		      struct cam_et *target, struct cam_ed *device,
 		      void *async_arg)
 {
 
 	/*
 	 * We only need to handle events for real devices.
 	 */
 	if (target->target_id == CAM_TARGET_WILDCARD
 	 || device->lun_id == CAM_LUN_WILDCARD)
 		return;
 
 	printf("%s called\n", __func__);
 }
 
 static uint32_t
 xpt_freeze_devq_device(struct cam_ed *dev, u_int count)
 {
 	struct cam_devq	*devq;
 	uint32_t freeze;
 
 	devq = dev->sim->devq;
 	mtx_assert(&devq->send_mtx, MA_OWNED);
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
 	    ("xpt_freeze_devq_device(%d) %u->%u\n", count,
 	    dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt + count));
 	freeze = (dev->ccbq.queue.qfrozen_cnt += count);
 	/* Remove frozen device from sendq. */
 	if (device_is_queued(dev))
 		camq_remove(&devq->send_queue, dev->devq_entry.index);
 	return (freeze);
 }
 
 u_int32_t
 xpt_freeze_devq(struct cam_path *path, u_int count)
 {
 	struct cam_ed	*dev = path->device;
 	struct cam_devq	*devq;
 	uint32_t	 freeze;
 
 	devq = dev->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_freeze_devq(%d)\n", count));
 	freeze = xpt_freeze_devq_device(dev, count);
 	mtx_unlock(&devq->send_mtx);
 	return (freeze);
 }
 
 u_int32_t
 xpt_freeze_simq(struct cam_sim *sim, u_int count)
 {
 	struct cam_devq	*devq;
 	uint32_t	 freeze;
 
 	devq = sim->devq;
 	mtx_lock(&devq->send_mtx);
 	freeze = (devq->send_queue.qfrozen_cnt += count);
 	mtx_unlock(&devq->send_mtx);
 	return (freeze);
 }
 
 static void
 xpt_release_devq_timeout(void *arg)
 {
 	struct cam_ed *dev;
 	struct cam_devq *devq;
 
 	dev = (struct cam_ed *)arg;
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_timeout\n"));
 	devq = dev->sim->devq;
 	mtx_assert(&devq->send_mtx, MA_OWNED);
 	if (xpt_release_devq_device(dev, /*count*/1, /*run_queue*/TRUE))
 		xpt_run_devq(devq);
 }
 
 void
 xpt_release_devq(struct cam_path *path, u_int count, int run_queue)
 {
 	struct cam_ed *dev;
 	struct cam_devq *devq;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_devq(%d, %d)\n",
 	    count, run_queue));
 	dev = path->device;
 	devq = dev->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	if (xpt_release_devq_device(dev, count, run_queue))
 		xpt_run_devq(dev->sim->devq);
 	mtx_unlock(&devq->send_mtx);
 }
 
 static int
 xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue)
 {
 
 	mtx_assert(&dev->sim->devq->send_mtx, MA_OWNED);
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
 	    ("xpt_release_devq_device(%d, %d) %u->%u\n", count, run_queue,
 	    dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt - count));
 	if (count > dev->ccbq.queue.qfrozen_cnt) {
 #ifdef INVARIANTS
 		printf("xpt_release_devq(): requested %u > present %u\n",
 		    count, dev->ccbq.queue.qfrozen_cnt);
 #endif
 		count = dev->ccbq.queue.qfrozen_cnt;
 	}
 	dev->ccbq.queue.qfrozen_cnt -= count;
 	if (dev->ccbq.queue.qfrozen_cnt == 0) {
 		/*
 		 * No longer need to wait for a successful
 		 * command completion.
 		 */
 		dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
 		/*
 		 * Remove any timeouts that might be scheduled
 		 * to release this queue.
 		 */
 		if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
 			callout_stop(&dev->callout);
 			dev->flags &= ~CAM_DEV_REL_TIMEOUT_PENDING;
 		}
 		/*
 		 * Now that we are unfrozen schedule the
 		 * device so any pending transactions are
 		 * run.
 		 */
 		xpt_schedule_devq(dev->sim->devq, dev);
 	} else
 		run_queue = 0;
 	return (run_queue);
 }
 
 void
 xpt_release_simq(struct cam_sim *sim, int run_queue)
 {
 	struct cam_devq	*devq;
 
 	devq = sim->devq;
 	mtx_lock(&devq->send_mtx);
 	if (devq->send_queue.qfrozen_cnt <= 0) {
 #ifdef INVARIANTS
 		printf("xpt_release_simq: requested 1 > present %u\n",
 		    devq->send_queue.qfrozen_cnt);
 #endif
 	} else
 		devq->send_queue.qfrozen_cnt--;
 	if (devq->send_queue.qfrozen_cnt == 0) {
 		/*
 		 * If there is a timeout scheduled to release this
 		 * sim queue, remove it.  The queue frozen count is
 		 * already at 0.
 		 */
 		if ((sim->flags & CAM_SIM_REL_TIMEOUT_PENDING) != 0){
 			callout_stop(&sim->callout);
 			sim->flags &= ~CAM_SIM_REL_TIMEOUT_PENDING;
 		}
 		if (run_queue) {
 			/*
 			 * Now that we are unfrozen run the send queue.
 			 */
 			xpt_run_devq(sim->devq);
 		}
 	}
 	mtx_unlock(&devq->send_mtx);
 }
 
 /*
  * XXX Appears to be unused.
  */
 static void
 xpt_release_simq_timeout(void *arg)
 {
 	struct cam_sim *sim;
 
 	sim = (struct cam_sim *)arg;
 	xpt_release_simq(sim, /* run_queue */ TRUE);
 }
 
 void
 xpt_done(union ccb *done_ccb)
 {
 	struct cam_doneq *queue;
 	int	run, hash;
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 	if (done_ccb->ccb_h.func_code == XPT_SCSI_IO &&
 	    done_ccb->csio.bio != NULL)
 		biotrack(done_ccb->csio.bio, __func__);
 #endif
 
 	CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_done: func= %#x %s status %#x\n",
 		done_ccb->ccb_h.func_code,
 		xpt_action_name(done_ccb->ccb_h.func_code),
 		done_ccb->ccb_h.status));
 	if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
 		return;
 
 	/* Store the time the ccb was in the sim */
 	done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data);
 	hash = (done_ccb->ccb_h.path_id + done_ccb->ccb_h.target_id +
 	    done_ccb->ccb_h.target_lun) % cam_num_doneqs;
 	queue = &cam_doneqs[hash];
 	mtx_lock(&queue->cam_doneq_mtx);
 	run = (queue->cam_doneq_sleep && STAILQ_EMPTY(&queue->cam_doneq));
 	STAILQ_INSERT_TAIL(&queue->cam_doneq, &done_ccb->ccb_h, sim_links.stqe);
 	done_ccb->ccb_h.pinfo.index = CAM_DONEQ_INDEX;
 	mtx_unlock(&queue->cam_doneq_mtx);
 	if (run)
 		wakeup(&queue->cam_doneq);
 }
 
 void
 xpt_done_direct(union ccb *done_ccb)
 {
 
 	CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_done_direct: status %#x\n", done_ccb->ccb_h.status));
 	if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
 		return;
 
 	/* Store the time the ccb was in the sim */
 	done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data);
 	xpt_done_process(&done_ccb->ccb_h);
 }
 
 union ccb *
 xpt_alloc_ccb()
 {
 	union ccb *new_ccb;
 
 	new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK);
 	return (new_ccb);
 }
 
 union ccb *
 xpt_alloc_ccb_nowait()
 {
 	union ccb *new_ccb;
 
 	new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT);
 	return (new_ccb);
 }
 
 void
 xpt_free_ccb(union ccb *free_ccb)
 {
 	free(free_ccb, M_CAMCCB);
 }
 
 
 
 /* Private XPT functions */
 
 /*
  * Get a CAM control block for the caller. Charge the structure to the device
  * referenced by the path.  If we don't have sufficient resources to allocate
  * more ccbs, we return NULL.
  */
 static union ccb *
 xpt_get_ccb_nowait(struct cam_periph *periph)
 {
 	union ccb *new_ccb;
 
 	new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT);
 	if (new_ccb == NULL)
 		return (NULL);
 	periph->periph_allocated++;
 	cam_ccbq_take_opening(&periph->path->device->ccbq);
 	return (new_ccb);
 }
 
 static union ccb *
 xpt_get_ccb(struct cam_periph *periph)
 {
 	union ccb *new_ccb;
 
 	cam_periph_unlock(periph);
 	new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK);
 	cam_periph_lock(periph);
 	periph->periph_allocated++;
 	cam_ccbq_take_opening(&periph->path->device->ccbq);
 	return (new_ccb);
 }
 
 union ccb *
 cam_periph_getccb(struct cam_periph *periph, u_int32_t priority)
 {
 	struct ccb_hdr *ccb_h;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cam_periph_getccb\n"));
 	cam_periph_assert(periph, MA_OWNED);
 	while ((ccb_h = SLIST_FIRST(&periph->ccb_list)) == NULL ||
 	    ccb_h->pinfo.priority != priority) {
 		if (priority < periph->immediate_priority) {
 			periph->immediate_priority = priority;
 			xpt_run_allocq(periph, 0);
 		} else
 			cam_periph_sleep(periph, &periph->ccb_list, PRIBIO,
 			    "cgticb", 0);
 	}
 	SLIST_REMOVE_HEAD(&periph->ccb_list, periph_links.sle);
 	return ((union ccb *)ccb_h);
 }
 
 static void
 xpt_acquire_bus(struct cam_eb *bus)
 {
 
 	xpt_lock_buses();
 	bus->refcount++;
 	xpt_unlock_buses();
 }
 
 static void
 xpt_release_bus(struct cam_eb *bus)
 {
 
 	xpt_lock_buses();
 	KASSERT(bus->refcount >= 1, ("bus->refcount >= 1"));
 	if (--bus->refcount > 0) {
 		xpt_unlock_buses();
 		return;
 	}
 	TAILQ_REMOVE(&xsoftc.xpt_busses, bus, links);
 	xsoftc.bus_generation++;
 	xpt_unlock_buses();
 	KASSERT(TAILQ_EMPTY(&bus->et_entries),
 	    ("destroying bus, but target list is not empty"));
 	cam_sim_release(bus->sim);
 	mtx_destroy(&bus->eb_mtx);
 	free(bus, M_CAMXPT);
 }
 
 static struct cam_et *
 xpt_alloc_target(struct cam_eb *bus, target_id_t target_id)
 {
 	struct cam_et *cur_target, *target;
 
 	mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT,
 					 M_NOWAIT|M_ZERO);
 	if (target == NULL)
 		return (NULL);
 
 	TAILQ_INIT(&target->ed_entries);
 	target->bus = bus;
 	target->target_id = target_id;
 	target->refcount = 1;
 	target->generation = 0;
 	target->luns = NULL;
 	mtx_init(&target->luns_mtx, "CAM LUNs lock", NULL, MTX_DEF);
 	timevalclear(&target->last_reset);
 	/*
 	 * Hold a reference to our parent bus so it
 	 * will not go away before we do.
 	 */
 	bus->refcount++;
 
 	/* Insertion sort into our bus's target list */
 	cur_target = TAILQ_FIRST(&bus->et_entries);
 	while (cur_target != NULL && cur_target->target_id < target_id)
 		cur_target = TAILQ_NEXT(cur_target, links);
 	if (cur_target != NULL) {
 		TAILQ_INSERT_BEFORE(cur_target, target, links);
 	} else {
 		TAILQ_INSERT_TAIL(&bus->et_entries, target, links);
 	}
 	bus->generation++;
 	return (target);
 }
 
 static void
 xpt_acquire_target(struct cam_et *target)
 {
 	struct cam_eb *bus = target->bus;
 
 	mtx_lock(&bus->eb_mtx);
 	target->refcount++;
 	mtx_unlock(&bus->eb_mtx);
 }
 
 static void
 xpt_release_target(struct cam_et *target)
 {
 	struct cam_eb *bus = target->bus;
 
 	mtx_lock(&bus->eb_mtx);
 	if (--target->refcount > 0) {
 		mtx_unlock(&bus->eb_mtx);
 		return;
 	}
 	TAILQ_REMOVE(&bus->et_entries, target, links);
 	bus->generation++;
 	mtx_unlock(&bus->eb_mtx);
 	KASSERT(TAILQ_EMPTY(&target->ed_entries),
 	    ("destroying target, but device list is not empty"));
 	xpt_release_bus(bus);
 	mtx_destroy(&target->luns_mtx);
 	if (target->luns)
 		free(target->luns, M_CAMXPT);
 	free(target, M_CAMXPT);
 }
 
 static struct cam_ed *
 xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target,
 			 lun_id_t lun_id)
 {
 	struct cam_ed *device;
 
 	device = xpt_alloc_device(bus, target, lun_id);
 	if (device == NULL)
 		return (NULL);
 
 	device->mintags = 1;
 	device->maxtags = 1;
 	return (device);
 }
 
 static void
 xpt_destroy_device(void *context, int pending)
 {
 	struct cam_ed	*device = context;
 
 	mtx_lock(&device->device_mtx);
 	mtx_destroy(&device->device_mtx);
 	free(device, M_CAMDEV);
 }
 
 struct cam_ed *
 xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id)
 {
 	struct cam_ed	*cur_device, *device;
 	struct cam_devq	*devq;
 	cam_status status;
 
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	/* Make space for us in the device queue on our bus */
 	devq = bus->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	status = cam_devq_resize(devq, devq->send_queue.array_size + 1);
 	mtx_unlock(&devq->send_mtx);
 	if (status != CAM_REQ_CMP)
 		return (NULL);
 
 	device = (struct cam_ed *)malloc(sizeof(*device),
 					 M_CAMDEV, M_NOWAIT|M_ZERO);
 	if (device == NULL)
 		return (NULL);
 
 	cam_init_pinfo(&device->devq_entry);
 	device->target = target;
 	device->lun_id = lun_id;
 	device->sim = bus->sim;
 	if (cam_ccbq_init(&device->ccbq,
 			  bus->sim->max_dev_openings) != 0) {
 		free(device, M_CAMDEV);
 		return (NULL);
 	}
 	SLIST_INIT(&device->asyncs);
 	SLIST_INIT(&device->periphs);
 	device->generation = 0;
 	device->flags = CAM_DEV_UNCONFIGURED;
 	device->tag_delay_count = 0;
 	device->tag_saved_openings = 0;
 	device->refcount = 1;
 	mtx_init(&device->device_mtx, "CAM device lock", NULL, MTX_DEF);
 	callout_init_mtx(&device->callout, &devq->send_mtx, 0);
 	TASK_INIT(&device->device_destroy_task, 0, xpt_destroy_device, device);
 	/*
 	 * Hold a reference to our parent bus so it
 	 * will not go away before we do.
 	 */
 	target->refcount++;
 
 	cur_device = TAILQ_FIRST(&target->ed_entries);
 	while (cur_device != NULL && cur_device->lun_id < lun_id)
 		cur_device = TAILQ_NEXT(cur_device, links);
 	if (cur_device != NULL)
 		TAILQ_INSERT_BEFORE(cur_device, device, links);
 	else
 		TAILQ_INSERT_TAIL(&target->ed_entries, device, links);
 	target->generation++;
 	return (device);
 }
 
 void
 xpt_acquire_device(struct cam_ed *device)
 {
 	struct cam_eb *bus = device->target->bus;
 
 	mtx_lock(&bus->eb_mtx);
 	device->refcount++;
 	mtx_unlock(&bus->eb_mtx);
 }
 
 void
 xpt_release_device(struct cam_ed *device)
 {
 	struct cam_eb *bus = device->target->bus;
 	struct cam_devq *devq;
 
 	mtx_lock(&bus->eb_mtx);
 	if (--device->refcount > 0) {
 		mtx_unlock(&bus->eb_mtx);
 		return;
 	}
 
 	TAILQ_REMOVE(&device->target->ed_entries, device,links);
 	device->target->generation++;
 	mtx_unlock(&bus->eb_mtx);
 
 	/* Release our slot in the devq */
 	devq = bus->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	cam_devq_resize(devq, devq->send_queue.array_size - 1);
 	mtx_unlock(&devq->send_mtx);
 
 	KASSERT(SLIST_EMPTY(&device->periphs),
 	    ("destroying device, but periphs list is not empty"));
 	KASSERT(device->devq_entry.index == CAM_UNQUEUED_INDEX,
 	    ("destroying device while still queued for ccbs"));
 
 	if ((device->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0)
 		callout_stop(&device->callout);
 
 	xpt_release_target(device->target);
 
 	cam_ccbq_fini(&device->ccbq);
 	/*
 	 * Free allocated memory.  free(9) does nothing if the
 	 * supplied pointer is NULL, so it is safe to call without
 	 * checking.
 	 */
 	free(device->supported_vpds, M_CAMXPT);
 	free(device->device_id, M_CAMXPT);
 	free(device->ext_inq, M_CAMXPT);
 	free(device->physpath, M_CAMXPT);
 	free(device->rcap_buf, M_CAMXPT);
 	free(device->serial_num, M_CAMXPT);
 	free(device->nvme_data, M_CAMXPT);
 	free(device->nvme_cdata, M_CAMXPT);
 	taskqueue_enqueue(xsoftc.xpt_taskq, &device->device_destroy_task);
 }
 
 u_int32_t
 xpt_dev_ccbq_resize(struct cam_path *path, int newopenings)
 {
 	int	result;
 	struct	cam_ed *dev;
 
 	dev = path->device;
 	mtx_lock(&dev->sim->devq->send_mtx);
 	result = cam_ccbq_resize(&dev->ccbq, newopenings);
 	mtx_unlock(&dev->sim->devq->send_mtx);
 	if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0
 	 || (dev->inq_flags & SID_CmdQue) != 0)
 		dev->tag_saved_openings = newopenings;
 	return (result);
 }
 
 static struct cam_eb *
 xpt_find_bus(path_id_t path_id)
 {
 	struct cam_eb *bus;
 
 	xpt_lock_buses();
 	for (bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 	     bus != NULL;
 	     bus = TAILQ_NEXT(bus, links)) {
 		if (bus->path_id == path_id) {
 			bus->refcount++;
 			break;
 		}
 	}
 	xpt_unlock_buses();
 	return (bus);
 }
 
 static struct cam_et *
 xpt_find_target(struct cam_eb *bus, target_id_t	target_id)
 {
 	struct cam_et *target;
 
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	for (target = TAILQ_FIRST(&bus->et_entries);
 	     target != NULL;
 	     target = TAILQ_NEXT(target, links)) {
 		if (target->target_id == target_id) {
 			target->refcount++;
 			break;
 		}
 	}
 	return (target);
 }
 
 static struct cam_ed *
 xpt_find_device(struct cam_et *target, lun_id_t lun_id)
 {
 	struct cam_ed *device;
 
 	mtx_assert(&target->bus->eb_mtx, MA_OWNED);
 	for (device = TAILQ_FIRST(&target->ed_entries);
 	     device != NULL;
 	     device = TAILQ_NEXT(device, links)) {
 		if (device->lun_id == lun_id) {
 			device->refcount++;
 			break;
 		}
 	}
 	return (device);
 }
 
 void
 xpt_start_tags(struct cam_path *path)
 {
 	struct ccb_relsim crs;
 	struct cam_ed *device;
 	struct cam_sim *sim;
 	int    newopenings;
 
 	device = path->device;
 	sim = path->bus->sim;
 	device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
 	xpt_freeze_devq(path, /*count*/1);
 	device->inq_flags |= SID_CmdQue;
 	if (device->tag_saved_openings != 0)
 		newopenings = device->tag_saved_openings;
 	else
 		newopenings = min(device->maxtags,
 				  sim->max_tagged_dev_openings);
 	xpt_dev_ccbq_resize(path, newopenings);
 	xpt_async(AC_GETDEV_CHANGED, path, NULL);
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
 	crs.openings
 	    = crs.release_timeout
 	    = crs.qfrozen_cnt
 	    = 0;
 	xpt_action((union ccb *)&crs);
 }
 
 void
 xpt_stop_tags(struct cam_path *path)
 {
 	struct ccb_relsim crs;
 	struct cam_ed *device;
 	struct cam_sim *sim;
 
 	device = path->device;
 	sim = path->bus->sim;
 	device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
 	device->tag_delay_count = 0;
 	xpt_freeze_devq(path, /*count*/1);
 	device->inq_flags &= ~SID_CmdQue;
 	xpt_dev_ccbq_resize(path, sim->max_dev_openings);
 	xpt_async(AC_GETDEV_CHANGED, path, NULL);
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
 	crs.openings
 	    = crs.release_timeout
 	    = crs.qfrozen_cnt
 	    = 0;
 	xpt_action((union ccb *)&crs);
 }
 
 /*
  * Assume all possible buses are detected by this time, so allow boot
  * as soon as they all are scanned.
  */
 static void
 xpt_boot_delay(void *arg)
 {
 
 	xpt_release_boot();
 }
 
 /*
  * Now that all config hooks have completed, start boot_delay timer,
  * waiting for possibly still undetected buses (USB) to appear.
  */
 static void
 xpt_ch_done(void *arg)
 {
 
 	callout_init(&xsoftc.boot_callout, 1);
 	callout_reset_sbt(&xsoftc.boot_callout, SBT_1MS * xsoftc.boot_delay, 0,
 	    xpt_boot_delay, NULL, 0);
 }
 SYSINIT(xpt_hw_delay, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_ANY, xpt_ch_done, NULL);
 
 /*
  * Now that interrupts are enabled, go find our devices
  */
 static void
 xpt_config(void *arg)
 {
 	if (taskqueue_start_threads(&xsoftc.xpt_taskq, 1, PRIBIO, "CAM taskq"))
 		printf("xpt_config: failed to create taskqueue thread.\n");
 
 	/* Setup debugging path */
 	if (cam_dflags != CAM_DEBUG_NONE) {
 		if (xpt_create_path(&cam_dpath, NULL,
 				    CAM_DEBUG_BUS, CAM_DEBUG_TARGET,
 				    CAM_DEBUG_LUN) != CAM_REQ_CMP) {
 			printf("xpt_config: xpt_create_path() failed for debug"
 			       " target %d:%d:%d, debugging disabled\n",
 			       CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN);
 			cam_dflags = CAM_DEBUG_NONE;
 		}
 	} else
 		cam_dpath = NULL;
 
 	periphdriver_init(1);
 	xpt_hold_boot();
 
 	/* Fire up rescan thread. */
 	if (kproc_kthread_add(xpt_scanner_thread, NULL, &cam_proc, NULL, 0, 0,
 	    "cam", "scanner")) {
 		printf("xpt_config: failed to create rescan thread.\n");
 	}
 }
 
 void
 xpt_hold_boot_locked(void)
 {
 
 	if (xsoftc.buses_to_config++ == 0)
 		root_mount_hold_token("CAM", &xsoftc.xpt_rootmount);
 }
 
 void
 xpt_hold_boot(void)
 {
 
 	xpt_lock_buses();
 	xpt_hold_boot_locked();
 	xpt_unlock_buses();
 }
 
 void
 xpt_release_boot(void)
 {
 
 	xpt_lock_buses();
 	if (--xsoftc.buses_to_config == 0) {
 		if (xsoftc.buses_config_done == 0) {
 			xsoftc.buses_config_done = 1;
 			xsoftc.buses_to_config++;
 			TASK_INIT(&xsoftc.boot_task, 0, xpt_finishconfig_task,
 			    NULL);
 			taskqueue_enqueue(taskqueue_thread, &xsoftc.boot_task);
 		} else
 			root_mount_rel(&xsoftc.xpt_rootmount);
 	}
 	xpt_unlock_buses();
 }
 
 /*
  * If the given device only has one peripheral attached to it, and if that
  * peripheral is the passthrough driver, announce it.  This insures that the
  * user sees some sort of announcement for every peripheral in their system.
  */
 static int
 xptpassannouncefunc(struct cam_ed *device, void *arg)
 {
 	struct cam_periph *periph;
 	int i;
 
 	for (periph = SLIST_FIRST(&device->periphs), i = 0; periph != NULL;
 	     periph = SLIST_NEXT(periph, periph_links), i++);
 
 	periph = SLIST_FIRST(&device->periphs);
 	if ((i == 1)
 	 && (strncmp(periph->periph_name, "pass", 4) == 0))
 		xpt_announce_periph(periph, NULL);
 
 	return(1);
 }
 
 static void
 xpt_finishconfig_task(void *context, int pending)
 {
 
 	periphdriver_init(2);
 	/*
 	 * Check for devices with no "standard" peripheral driver
 	 * attached.  For any devices like that, announce the
 	 * passthrough driver so the user will see something.
 	 */
 	if (!bootverbose)
 		xpt_for_all_devices(xptpassannouncefunc, NULL);
 
 	xpt_release_boot();
 }
 
 cam_status
 xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg,
 		   struct cam_path *path)
 {
 	struct ccb_setasync csa;
 	cam_status status;
 	int xptpath = 0;
 
 	if (path == NULL) {
 		status = xpt_create_path(&path, /*periph*/NULL, CAM_XPT_PATH_ID,
 					 CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 		if (status != CAM_REQ_CMP)
 			return (status);
 		xpt_path_lock(path);
 		xptpath = 1;
 	}
 
 	xpt_setup_ccb(&csa.ccb_h, path, CAM_PRIORITY_NORMAL);
 	csa.ccb_h.func_code = XPT_SASYNC_CB;
 	csa.event_enable = event;
 	csa.callback = cbfunc;
 	csa.callback_arg = cbarg;
 	xpt_action((union ccb *)&csa);
 	status = csa.ccb_h.status;
 
 	CAM_DEBUG(csa.ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_register_async: func %p\n", cbfunc));
 
 	if (xptpath) {
 		xpt_path_unlock(path);
 		xpt_free_path(path);
 	}
 
 	if ((status == CAM_REQ_CMP) &&
 	    (csa.event_enable & AC_FOUND_DEVICE)) {
 		/*
 		 * Get this peripheral up to date with all
 		 * the currently existing devices.
 		 */
 		xpt_for_all_devices(xptsetasyncfunc, &csa);
 	}
 	if ((status == CAM_REQ_CMP) &&
 	    (csa.event_enable & AC_PATH_REGISTERED)) {
 		/*
 		 * Get this peripheral up to date with all
 		 * the currently existing buses.
 		 */
 		xpt_for_all_busses(xptsetasyncbusfunc, &csa);
 	}
 
 	return (status);
 }
 
 static void
 xptaction(struct cam_sim *sim, union ccb *work_ccb)
 {
 	CAM_DEBUG(work_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xptaction\n"));
 
 	switch (work_ccb->ccb_h.func_code) {
 	/* Common cases first */
 	case XPT_PATH_INQ:		/* Path routing inquiry */
 	{
 		struct ccb_pathinq *cpi;
 
 		cpi = &work_ccb->cpi;
 		cpi->version_num = 1; /* XXX??? */
 		cpi->hba_inquiry = 0;
 		cpi->target_sprt = 0;
 		cpi->hba_misc = 0;
 		cpi->hba_eng_cnt = 0;
 		cpi->max_target = 0;
 		cpi->max_lun = 0;
 		cpi->initiator_id = 0;
 		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strlcpy(cpi->hba_vid, "", HBA_IDLEN);
 		strlcpy(cpi->dev_name, sim->sim_name, DEV_IDLEN);
 		cpi->unit_number = sim->unit_number;
 		cpi->bus_id = sim->bus_id;
 		cpi->base_transfer_speed = 0;
 		cpi->protocol = PROTO_UNSPECIFIED;
 		cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
 		cpi->transport = XPORT_UNSPECIFIED;
 		cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
 		cpi->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(work_ccb);
 		break;
 	}
 	default:
 		work_ccb->ccb_h.status = CAM_REQ_INVALID;
 		xpt_done(work_ccb);
 		break;
 	}
 }
 
 /*
  * The xpt as a "controller" has no interrupt sources, so polling
  * is a no-op.
  */
 static void
 xptpoll(struct cam_sim *sim)
 {
 }
 
 void
 xpt_lock_buses(void)
 {
 	mtx_lock(&xsoftc.xpt_topo_lock);
 }
 
 void
 xpt_unlock_buses(void)
 {
 	mtx_unlock(&xsoftc.xpt_topo_lock);
 }
 
 struct mtx *
 xpt_path_mtx(struct cam_path *path)
 {
 
 	return (&path->device->device_mtx);
 }
 
 static void
 xpt_done_process(struct ccb_hdr *ccb_h)
 {
 	struct cam_sim *sim = NULL;
 	struct cam_devq *devq = NULL;
 	struct mtx *mtx = NULL;
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 	struct ccb_scsiio *csio;
 
 	if (ccb_h->func_code == XPT_SCSI_IO) {
 		csio = &((union ccb *)ccb_h)->csio;
 		if (csio->bio != NULL)
 			biotrack(csio->bio, __func__);
 	}
 #endif
 
 	if (ccb_h->flags & CAM_HIGH_POWER) {
 		struct highpowerlist	*hphead;
 		struct cam_ed		*device;
 
 		mtx_lock(&xsoftc.xpt_highpower_lock);
 		hphead = &xsoftc.highpowerq;
 
 		device = STAILQ_FIRST(hphead);
 
 		/*
 		 * Increment the count since this command is done.
 		 */
 		xsoftc.num_highpower++;
 
 		/*
 		 * Any high powered commands queued up?
 		 */
 		if (device != NULL) {
 
 			STAILQ_REMOVE_HEAD(hphead, highpowerq_entry);
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 
 			mtx_lock(&device->sim->devq->send_mtx);
 			xpt_release_devq_device(device,
 					 /*count*/1, /*runqueue*/TRUE);
 			mtx_unlock(&device->sim->devq->send_mtx);
 		} else
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 	}
 
 	/*
 	 * Insulate against a race where the periph is destroyed but CCBs are
 	 * still not all processed. This shouldn't happen, but allows us better
 	 * bug diagnostic when it does.
 	 */
 	if (ccb_h->path->bus)
 		sim = ccb_h->path->bus->sim;
 
 	if (ccb_h->status & CAM_RELEASE_SIMQ) {
 		KASSERT(sim, ("sim missing for CAM_RELEASE_SIMQ request"));
 		xpt_release_simq(sim, /*run_queue*/FALSE);
 		ccb_h->status &= ~CAM_RELEASE_SIMQ;
 	}
 
 	if ((ccb_h->flags & CAM_DEV_QFRZDIS)
 	 && (ccb_h->status & CAM_DEV_QFRZN)) {
 		xpt_release_devq(ccb_h->path, /*count*/1, /*run_queue*/TRUE);
 		ccb_h->status &= ~CAM_DEV_QFRZN;
 	}
 
 	if ((ccb_h->func_code & XPT_FC_USER_CCB) == 0) {
 		struct cam_ed *dev = ccb_h->path->device;
 
 		if (sim)
 			devq = sim->devq;
 		KASSERT(devq, ("Periph disappeared with request pending."));
 
 		mtx_lock(&devq->send_mtx);
 		devq->send_active--;
 		devq->send_openings++;
 		cam_ccbq_ccb_done(&dev->ccbq, (union ccb *)ccb_h);
 
 		if (((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
 		  && (dev->ccbq.dev_active == 0))) {
 			dev->flags &= ~CAM_DEV_REL_ON_QUEUE_EMPTY;
 			xpt_release_devq_device(dev, /*count*/1,
 					 /*run_queue*/FALSE);
 		}
 
 		if (((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0
 		  && (ccb_h->status&CAM_STATUS_MASK) != CAM_REQUEUE_REQ)) {
 			dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
 			xpt_release_devq_device(dev, /*count*/1,
 					 /*run_queue*/FALSE);
 		}
 
 		if (!device_is_queued(dev))
 			(void)xpt_schedule_devq(devq, dev);
 		xpt_run_devq(devq);
 		mtx_unlock(&devq->send_mtx);
 
 		if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0) {
 			mtx = xpt_path_mtx(ccb_h->path);
 			mtx_lock(mtx);
 
 			if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0
 			 && (--dev->tag_delay_count == 0))
 				xpt_start_tags(ccb_h->path);
 		}
 	}
 
 	if ((ccb_h->flags & CAM_UNLOCKED) == 0) {
 		if (mtx == NULL) {
 			mtx = xpt_path_mtx(ccb_h->path);
 			mtx_lock(mtx);
 		}
 	} else {
 		if (mtx != NULL) {
 			mtx_unlock(mtx);
 			mtx = NULL;
 		}
 	}
 
 	/* Call the peripheral driver's callback */
 	ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
 	(*ccb_h->cbfcnp)(ccb_h->path->periph, (union ccb *)ccb_h);
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 }
 
 void
 xpt_done_td(void *arg)
 {
 	struct cam_doneq *queue = arg;
 	struct ccb_hdr *ccb_h;
 	STAILQ_HEAD(, ccb_hdr)	doneq;
 
 	STAILQ_INIT(&doneq);
 	mtx_lock(&queue->cam_doneq_mtx);
 	while (1) {
 		while (STAILQ_EMPTY(&queue->cam_doneq)) {
 			queue->cam_doneq_sleep = 1;
 			msleep(&queue->cam_doneq, &queue->cam_doneq_mtx,
 			    PRIBIO, "-", 0);
 			queue->cam_doneq_sleep = 0;
 		}
 		STAILQ_CONCAT(&doneq, &queue->cam_doneq);
 		mtx_unlock(&queue->cam_doneq_mtx);
 
 		THREAD_NO_SLEEPING();
 		while ((ccb_h = STAILQ_FIRST(&doneq)) != NULL) {
 			STAILQ_REMOVE_HEAD(&doneq, sim_links.stqe);
 			xpt_done_process(ccb_h);
 		}
 		THREAD_SLEEPING_OK();
 
 		mtx_lock(&queue->cam_doneq_mtx);
 	}
 }
 
 static void
 camisr_runqueue(void)
 {
 	struct	ccb_hdr *ccb_h;
 	struct cam_doneq *queue;
 	int i;
 
 	/* Process global queues. */
 	for (i = 0; i < cam_num_doneqs; i++) {
 		queue = &cam_doneqs[i];
 		mtx_lock(&queue->cam_doneq_mtx);
 		while ((ccb_h = STAILQ_FIRST(&queue->cam_doneq)) != NULL) {
 			STAILQ_REMOVE_HEAD(&queue->cam_doneq, sim_links.stqe);
 			mtx_unlock(&queue->cam_doneq_mtx);
 			xpt_done_process(ccb_h);
 			mtx_lock(&queue->cam_doneq_mtx);
 		}
 		mtx_unlock(&queue->cam_doneq_mtx);
 	}
 }
 
 struct kv 
 {
 	uint32_t v;
 	const char *name;
 };
 
 static struct kv map[] = {
 	{ XPT_NOOP, "XPT_NOOP" },
 	{ XPT_SCSI_IO, "XPT_SCSI_IO" },
 	{ XPT_GDEV_TYPE, "XPT_GDEV_TYPE" },
 	{ XPT_GDEVLIST, "XPT_GDEVLIST" },
 	{ XPT_PATH_INQ, "XPT_PATH_INQ" },
 	{ XPT_REL_SIMQ, "XPT_REL_SIMQ" },
 	{ XPT_SASYNC_CB, "XPT_SASYNC_CB" },
 	{ XPT_SDEV_TYPE, "XPT_SDEV_TYPE" },
 	{ XPT_SCAN_BUS, "XPT_SCAN_BUS" },
 	{ XPT_DEV_MATCH, "XPT_DEV_MATCH" },
 	{ XPT_DEBUG, "XPT_DEBUG" },
 	{ XPT_PATH_STATS, "XPT_PATH_STATS" },
 	{ XPT_GDEV_STATS, "XPT_GDEV_STATS" },
 	{ XPT_DEV_ADVINFO, "XPT_DEV_ADVINFO" },
 	{ XPT_ASYNC, "XPT_ASYNC" },
 	{ XPT_ABORT, "XPT_ABORT" },
 	{ XPT_RESET_BUS, "XPT_RESET_BUS" },
 	{ XPT_RESET_DEV, "XPT_RESET_DEV" },
 	{ XPT_TERM_IO, "XPT_TERM_IO" },
 	{ XPT_SCAN_LUN, "XPT_SCAN_LUN" },
 	{ XPT_GET_TRAN_SETTINGS, "XPT_GET_TRAN_SETTINGS" },
 	{ XPT_SET_TRAN_SETTINGS, "XPT_SET_TRAN_SETTINGS" },
 	{ XPT_CALC_GEOMETRY, "XPT_CALC_GEOMETRY" },
 	{ XPT_ATA_IO, "XPT_ATA_IO" },
 	{ XPT_GET_SIM_KNOB, "XPT_GET_SIM_KNOB" },
 	{ XPT_SET_SIM_KNOB, "XPT_SET_SIM_KNOB" },
 	{ XPT_NVME_IO, "XPT_NVME_IO" },
 	{ XPT_MMC_IO, "XPT_MMC_IO" },
 	{ XPT_SMP_IO, "XPT_SMP_IO" },
 	{ XPT_SCAN_TGT, "XPT_SCAN_TGT" },
 	{ XPT_NVME_ADMIN, "XPT_NVME_ADMIN" },
 	{ XPT_ENG_INQ, "XPT_ENG_INQ" },
 	{ XPT_ENG_EXEC, "XPT_ENG_EXEC" },
 	{ XPT_EN_LUN, "XPT_EN_LUN" },
 	{ XPT_TARGET_IO, "XPT_TARGET_IO" },
 	{ XPT_ACCEPT_TARGET_IO, "XPT_ACCEPT_TARGET_IO" },
 	{ XPT_CONT_TARGET_IO, "XPT_CONT_TARGET_IO" },
 	{ XPT_IMMED_NOTIFY, "XPT_IMMED_NOTIFY" },
 	{ XPT_NOTIFY_ACK, "XPT_NOTIFY_ACK" },
 	{ XPT_IMMEDIATE_NOTIFY, "XPT_IMMEDIATE_NOTIFY" },
 	{ XPT_NOTIFY_ACKNOWLEDGE, "XPT_NOTIFY_ACKNOWLEDGE" },
 	{ 0, 0 }
 };
 
 const char *
 xpt_action_name(uint32_t action) 
 {
 	static char buffer[32];	/* Only for unknown messages -- racy */
 	struct kv *walker = map;
 
 	while (walker->name != NULL) {
 		if (walker->v == action)
 			return (walker->name);
 		walker++;
 	}
 
 	snprintf(buffer, sizeof(buffer), "%#x", action);
 	return (buffer);
 }
Index: stable/12/sys/geom/geom_io.c
===================================================================
--- stable/12/sys/geom/geom_io.c	(revision 355609)
+++ stable/12/sys/geom/geom_io.c	(revision 355610)
@@ -1,1095 +1,1095 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 
 #include <sys/errno.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <sys/devicestat.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 static int	g_io_transient_map_bio(struct bio *bp);
 
 static struct g_bioq g_bio_run_down;
 static struct g_bioq g_bio_run_up;
 
 /*
  * Pace is a hint that we've had some trouble recently allocating
  * bios, so we should back off trying to send I/O down the stack
  * a bit to let the problem resolve. When pacing, we also turn
  * off direct dispatch to also reduce memory pressure from I/Os
  * there, at the expxense of some added latency while the memory
  * pressures exist. See g_io_schedule_down() for more details
  * and limitations.
  */
-static volatile u_int pace;
+static volatile u_int __read_mostly pace;
 
-static uma_zone_t	biozone;
+static uma_zone_t __read_mostly biozone;
 
 /*
  * The head of the list of classifiers used in g_io_request.
  * Use g_register_classifier() and g_unregister_classifier()
  * to add/remove entries to the list.
  * Classifiers are invoked in registration order.
  */
-static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook)
-    g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
+static TAILQ_HEAD(, g_classifier_hook) g_classifier_tailq __read_mostly =
+    TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
 
 #include <machine/atomic.h>
 
 static void
 g_bioq_lock(struct g_bioq *bq)
 {
 
 	mtx_lock(&bq->bio_queue_lock);
 }
 
 static void
 g_bioq_unlock(struct g_bioq *bq)
 {
 
 	mtx_unlock(&bq->bio_queue_lock);
 }
 
 #if 0
 static void
 g_bioq_destroy(struct g_bioq *bq)
 {
 
 	mtx_destroy(&bq->bio_queue_lock);
 }
 #endif
 
 static void
 g_bioq_init(struct g_bioq *bq)
 {
 
 	TAILQ_INIT(&bq->bio_queue);
 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
 }
 
 static struct bio *
 g_bioq_first(struct g_bioq *bq)
 {
 	struct bio *bp;
 
 	bp = TAILQ_FIRST(&bq->bio_queue);
 	if (bp != NULL) {
 		KASSERT((bp->bio_flags & BIO_ONQUEUE),
 		    ("Bio not on queue bp=%p target %p", bp, bq));
 		bp->bio_flags &= ~BIO_ONQUEUE;
 		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
 		bq->bio_queue_length--;
 	}
 	return (bp);
 }
 
 struct bio *
 g_new_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return (bp);
 }
 
 struct bio *
 g_alloc_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return (bp);
 }
 
 void
 g_destroy_bio(struct bio *bp)
 {
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	uma_zfree(biozone, bp);
 }
 
 struct bio *
 g_clone_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 	if (bp2 != NULL) {
 		bp2->bio_parent = bp;
 		bp2->bio_cmd = bp->bio_cmd;
 		/*
 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
 		 *  ordering restrictions, so this flag needs to be cloned.
 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
 		 *  indicate which way the buffer is passed.
 		 *  Other bio flags are not suitable for cloning.
 		 */
 		bp2->bio_flags = bp->bio_flags &
 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
 		bp2->bio_length = bp->bio_length;
 		bp2->bio_offset = bp->bio_offset;
 		bp2->bio_data = bp->bio_data;
 		bp2->bio_ma = bp->bio_ma;
 		bp2->bio_ma_n = bp->bio_ma_n;
 		bp2->bio_ma_offset = bp->bio_ma_offset;
 		bp2->bio_attribute = bp->bio_attribute;
 		if (bp->bio_cmd == BIO_ZONE)
 			bcopy(&bp->bio_zone, &bp2->bio_zone,
 			    sizeof(bp->bio_zone));
 		/* Inherit classification info from the parent */
 		bp2->bio_classifier1 = bp->bio_classifier1;
 		bp2->bio_classifier2 = bp->bio_classifier2;
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 		bp2->bio_track_bp = bp->bio_track_bp;
 #endif
 		bp->bio_children++;
 	}
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return(bp2);
 }
 
 struct bio *
 g_duplicate_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
 	bp2->bio_parent = bp;
 	bp2->bio_cmd = bp->bio_cmd;
 	bp2->bio_length = bp->bio_length;
 	bp2->bio_offset = bp->bio_offset;
 	bp2->bio_data = bp->bio_data;
 	bp2->bio_ma = bp->bio_ma;
 	bp2->bio_ma_n = bp->bio_ma_n;
 	bp2->bio_ma_offset = bp->bio_ma_offset;
 	bp2->bio_attribute = bp->bio_attribute;
 	bp->bio_children++;
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return(bp2);
 }
 
 void
 g_reset_bio(struct bio *bp)
 {
 
 	bzero(bp, sizeof(*bp));
 }
 
 void
 g_io_init()
 {
 
 	g_bioq_init(&g_bio_run_down);
 	g_bioq_init(&g_bio_run_up);
 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
 	    NULL, NULL,
 	    NULL, NULL,
 	    0, 0);
 }
 
 int
 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
 {
 	struct bio *bp;
 	int error;
 
 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_GETATTR;
 	bp->bio_done = NULL;
 	bp->bio_attribute = attr;
 	bp->bio_length = *len;
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	error = biowait(bp, "ggetattr");
 	*len = bp->bio_completed;
 	g_destroy_bio(bp);
 	return (error);
 }
 
 int
 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 	
 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_ZONE;
 	bp->bio_done = NULL;
 	/*
 	 * XXX KDM need to handle report zone data.
 	 */
 	bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
 	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
 		bp->bio_length =
 		    zone_args->zone_params.report.entries_allocated *
 		    sizeof(struct disk_zone_rep_entry);
 	else
 		bp->bio_length = 0;
 
 	g_io_request(bp, cp);
 	error = biowait(bp, "gzone");
 	bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
 	g_destroy_bio(bp);
 	return (error);
 }
 
 int
 g_io_flush(struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 
 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_FLUSH;
 	bp->bio_flags |= BIO_ORDERED;
 	bp->bio_done = NULL;
 	bp->bio_attribute = NULL;
 	bp->bio_offset = cp->provider->mediasize;
 	bp->bio_length = 0;
 	bp->bio_data = NULL;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gflush");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 static int
 g_io_check(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t excess;
 	int error;
 
 	biotrack(bp, __func__);
 
 	cp = bp->bio_from;
 	pp = bp->bio_to;
 
 	/* Fail if access counters dont allow the operation */
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_GETATTR:
 		if (cp->acr == 0)
 			return (EPERM);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		if (cp->acw == 0)
 			return (EPERM);
 		break;
 	case BIO_ZONE:
 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
 			if (cp->acr == 0)
 				return (EPERM);
 		} else if (cp->acw == 0)
 			return (EPERM);
 		break;
 	default:
 		return (EPERM);
 	}
 	/* if provider is marked for error, don't disturb. */
 	if (pp->error)
 		return (pp->error);
 	if (cp->flags & G_CF_ORPHAN)
 		return (ENXIO);
 
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/* Zero sectorsize or mediasize is probably a lack of media. */
 		if (pp->sectorsize == 0 || pp->mediasize == 0)
 			return (ENXIO);
 		/* Reject I/O not on sector boundary */
 		if (bp->bio_offset % pp->sectorsize)
 			return (EINVAL);
 		/* Reject I/O not integral sector long */
 		if (bp->bio_length % pp->sectorsize)
 			return (EINVAL);
 		/* Reject requests before or past the end of media. */
 		if (bp->bio_offset < 0)
 			return (EIO);
 		if (bp->bio_offset > pp->mediasize)
 			return (EIO);
 
 		/* Truncate requests to the end of providers media. */
 		excess = bp->bio_offset + bp->bio_length;
 		if (excess > bp->bio_to->mediasize) {
 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
 			    round_page(bp->bio_ma_offset +
 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
 			    ("excess bio %p too short", bp));
 			excess -= bp->bio_to->mediasize;
 			bp->bio_length -= excess;
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
 				    bp->bio_length) / PAGE_SIZE;
 			}
 			if (excess > 0)
 				CTR3(KTR_GEOM, "g_down truncated bio "
 				    "%p provider %s by %d", bp,
 				    bp->bio_to->name, excess);
 		}
 
 		/* Deliver zero length transfers right here. */
 		if (bp->bio_length == 0) {
 			CTR2(KTR_GEOM, "g_down terminated 0-length "
 			    "bp %p provider %s", bp, bp->bio_to->name);
 			return (0);
 		}
 
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
 			if ((error = g_io_transient_map_bio(bp)) >= 0)
 				return (error);
 		}
 		break;
 	default:
 		break;
 	}
 	return (EJUSTRETURN);
 }
 
 /*
  * bio classification support.
  *
  * g_register_classifier() and g_unregister_classifier()
  * are used to add/remove a classifier from the list.
  * The list is protected using the g_bio_run_down lock,
  * because the classifiers are called in this path.
  *
  * g_io_request() passes bio's that are not already classified
  * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
  * Classifiers can store their result in the two fields
  * bio_classifier1 and bio_classifier2.
  * A classifier that updates one of the fields should
  * return a non-zero value.
  * If no classifier updates the field, g_run_classifiers() sets
  * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
  */
 
 int
 g_register_classifier(struct g_classifier_hook *hook)
 {
 
 	g_bioq_lock(&g_bio_run_down);
 	TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
 	g_bioq_unlock(&g_bio_run_down);
 
 	return (0);
 }
 
 void
 g_unregister_classifier(struct g_classifier_hook *hook)
 {
 	struct g_classifier_hook *entry;
 
 	g_bioq_lock(&g_bio_run_down);
 	TAILQ_FOREACH(entry, &g_classifier_tailq, link) {
 		if (entry == hook) {
 			TAILQ_REMOVE(&g_classifier_tailq, hook, link);
 			break;
 		}
 	}
 	g_bioq_unlock(&g_bio_run_down);
 }
 
 static void
 g_run_classifiers(struct bio *bp)
 {
 	struct g_classifier_hook *hook;
 	int classified = 0;
 
 	biotrack(bp, __func__);
 
 	TAILQ_FOREACH(hook, &g_classifier_tailq, link)
 		classified |= hook->func(hook->arg, bp);
 
 	if (!classified)
 		bp->bio_classifier1 = BIO_NOTCLASSIFIED;
 }
 
 void
 g_io_request(struct bio *bp, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, error, first;
 	uint8_t cmd;
 
 	biotrack(bp, __func__);
 
 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
 #ifdef DIAGNOSTIC
 	KASSERT(bp->bio_driver1 == NULL,
 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_driver2 == NULL,
 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_pflags == 0,
 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
 	/*
 	 * Remember consumer's private fields, so we can detect if they were
 	 * modified by the provider.
 	 */
 	bp->_bio_caller1 = bp->bio_caller1;
 	bp->_bio_caller2 = bp->bio_caller2;
 	bp->_bio_cflags = bp->bio_cflags;
 #endif
 
 	cmd = bp->bio_cmd;
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
 		KASSERT(bp->bio_data != NULL,
 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
 	}
 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
 		KASSERT(bp->bio_data == NULL,
 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
 		    bp->bio_cmd));
 	}
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
 		    ("wrong offset %jd for sectorsize %u",
 		    bp->bio_offset, cp->provider->sectorsize));
 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
 		    ("wrong length %jd for sectorsize %u",
 		    bp->bio_length, cp->provider->sectorsize));
 	}
 
 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
 
 	bp->bio_from = cp;
 	bp->bio_to = pp;
 	bp->bio_error = 0;
 	bp->bio_completed = 0;
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&bp->bio_t0);
 	else
 		getbinuptime(&bp->bio_t0);
 
 #ifdef GET_STACK_USAGE
 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
 	    !g_is_geom_thread(curthread) &&
 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
 	    pace == 0;
 	if (direct) {
 		/* Block direct execution if less then half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
 		g_bioq_lock(&g_bio_run_down);
 		g_run_classifiers(bp);
 		g_bioq_unlock(&g_bio_run_down);
 	}
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	mtxp = mtx_pool_find(mtxpool_sleep, pp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_start_transaction(pp->stat, &bp->bio_t0);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_start_transaction(cp->stat, &bp->bio_t0);
 	pp->nstart++;
 	cp->nstart++;
 	mtx_unlock(mtxp);
 
 	if (direct) {
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
 			    "provider %s returned %d", bp, bp->bio_to->name,
 			    error);
 			g_io_deliver(bp, error);
 			return;
 		}
 		bp->bio_to->geom->start(bp);
 	} else {
 		g_bioq_lock(&g_bio_run_down);
 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
 		bp->bio_flags |= BIO_ONQUEUE;
 		g_bio_run_down.bio_queue_length++;
 		g_bioq_unlock(&g_bio_run_down);
 		/* Pass it on down. */
 		if (first)
 			wakeup(&g_wait_down);
 	}
 }
 
 void
 g_io_deliver(struct bio *bp, int error)
 {
 	struct bintime now;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, first;
 
 	biotrack(bp, __func__);
 
 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
 	pp = bp->bio_to;
 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
 	cp = bp->bio_from;
 	if (cp == NULL) {
 		bp->bio_error = error;
 		bp->bio_done(bp);
 		return;
 	}
 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
 #ifdef DIAGNOSTIC
 	/*
 	 * Some classes - GJournal in particular - can modify bio's
 	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
 	 * flag means it's an expected behaviour for that particular geom.
 	 */
 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
 		    ("bio_caller1 used by the provider %s", pp->name));
 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
 		    ("bio_caller2 used by the provider %s", pp->name));
 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
 		    ("bio_cflags used by the provider %s", pp->name));
 	}
 #endif
 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
 	KASSERT(bp->bio_completed <= bp->bio_length,
 	    ("bio_completed can't be greater than bio_length"));
 
 	g_trace(G_T_BIO,
 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 
 	/*
 	 * XXX: next two doesn't belong here
 	 */
 	bp->bio_bcount = bp->bio_length;
 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
 
 #ifdef GET_STACK_USAGE
 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
 		 !g_is_geom_thread(curthread);
 	if (direct) {
 		/* Block direct execution if less then half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&now);
 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
 	cp->nend++;
 	pp->nend++;
 	mtx_unlock(mtxp);
 
 	if (error != ENOMEM) {
 		bp->bio_error = error;
 		if (direct) {
 			biodone(bp);
 		} else {
 			g_bioq_lock(&g_bio_run_up);
 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
 			bp->bio_flags |= BIO_ONQUEUE;
 			g_bio_run_up.bio_queue_length++;
 			g_bioq_unlock(&g_bio_run_up);
 			if (first)
 				wakeup(&g_wait_up);
 		}
 		return;
 	}
 
 	if (bootverbose)
 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
 	bp->bio_children = 0;
 	bp->bio_inbed = 0;
 	bp->bio_driver1 = NULL;
 	bp->bio_driver2 = NULL;
 	bp->bio_pflags = 0;
 	g_io_request(bp, cp);
 	pace = 1;
 	return;
 }
 
 SYSCTL_DECL(_kern_geom);
 
 static long transient_maps;
 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
     &transient_maps, 0,
     "Total count of the transient mapping requests");
 u_int transient_map_retries = 10;
 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
     &transient_map_retries, 0,
     "Max count of retries used before giving up on creating transient map");
 int transient_map_hard_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
     &transient_map_hard_failures, 0,
     "Failures to establish the transient mapping due to retry attempts "
     "exhausted");
 int transient_map_soft_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
     &transient_map_soft_failures, 0,
     "Count of retried failures to establish the transient mapping");
 int inflight_transient_maps;
 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
     &inflight_transient_maps, 0,
     "Current count of the active transient maps");
 
 static int
 g_io_transient_map_bio(struct bio *bp)
 {
 	vm_offset_t addr;
 	long size;
 	u_int retried;
 
 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
 
 	size = round_page(bp->bio_ma_offset + bp->bio_length);
 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
 	addr = 0;
 	retried = 0;
 	atomic_add_long(&transient_maps, 1);
 retry:
 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
 		if (transient_map_retries != 0 &&
 		    retried >= transient_map_retries) {
 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
 			    bp, bp->bio_to->name);
 			atomic_add_int(&transient_map_hard_failures, 1);
 			return (EDEADLK/* XXXKIB */);
 		} else {
 			/*
 			 * Naive attempt to quisce the I/O to get more
 			 * in-flight requests completed and defragment
 			 * the transient_arena.
 			 */
 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
 			    bp, bp->bio_to->name, retried);
 			pause("g_d_tra", hz / 10);
 			retried++;
 			atomic_add_int(&transient_map_soft_failures, 1);
 			goto retry;
 		}
 	}
 	atomic_add_int(&inflight_transient_maps, 1);
 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
 	bp->bio_flags &= ~BIO_UNMAPPED;
 	return (EJUSTRETURN);
 }
 
 void
 g_io_schedule_down(struct thread *tp __unused)
 {
 	struct bio *bp;
 	int error;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_down);
 		bp = g_bioq_first(&g_bio_run_down);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_down going to sleep");
 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		CTR0(KTR_GEOM, "g_down has work to do");
 		g_bioq_unlock(&g_bio_run_down);
 		biotrack(bp, __func__);
 		if (pace != 0) {
 			/*
 			 * There has been at least one memory allocation
 			 * failure since the last I/O completed. Pause 1ms to
 			 * give the system a chance to free up memory. We only
 			 * do this once because a large number of allocations
 			 * can fail in the direct dispatch case and there's no
 			 * relationship between the number of these failures and
 			 * the length of the outage. If there's still an outage,
 			 * we'll pause again and again until it's
 			 * resolved. Older versions paused longer and once per
 			 * allocation failure. This was OK for a single threaded
 			 * g_down, but with direct dispatch would lead to max of
 			 * 10 IOPs for minutes at a time when transient memory
 			 * issues prevented allocation for a batch of requests
 			 * from the upper layers.
 			 *
 			 * XXX This pacing is really lame. It needs to be solved
 			 * by other methods. This is OK only because the worst
 			 * case scenario is so rare. In the worst case scenario
 			 * all memory is tied up waiting for I/O to complete
 			 * which can never happen since we can't allocate bios
 			 * for that I/O.
 			 */
 			CTR0(KTR_GEOM, "g_down pacing self");
 			pause("g_down", min(hz/1000, 1));
 			pace = 0;
 		}
 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
 		    bp->bio_to->name);
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
 			    "%s returned %d", bp, bp->bio_to->name, error);
 			g_io_deliver(bp, error);
 			continue;
 		}
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
 		    bp->bio_length);
 		bp->bio_to->geom->start(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 void
 g_io_schedule_up(struct thread *tp __unused)
 {
 	struct bio *bp;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_up);
 		bp = g_bioq_first(&g_bio_run_up);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_up going to sleep");
 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		g_bioq_unlock(&g_bio_run_up);
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
 		    "%jd len %ld", bp, bp->bio_to->name,
 		    bp->bio_offset, bp->bio_length);
 		biodone(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 void *
 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
 {
 	struct bio *bp;
 	void *ptr;
 	int errorc;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_READ;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	ptr = g_malloc(length, M_WAITOK);
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	errorc = biowait(bp, "gread");
 	if (error != NULL)
 		*error = errorc;
 	g_destroy_bio(bp);
 	if (errorc) {
 		g_free(ptr);
 		ptr = NULL;
 	}
 	return (ptr);
 }
 
 /*
  * A read function for use by ffs_sbget when used by GEOM-layer routines.
  */
 int
 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
 {
 	struct g_consumer *cp;
 
 	KASSERT(*bufp == NULL,
 	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
 
 	cp = (struct g_consumer *)devfd;
 	/*
 	 * Take care not to issue an invalid I/O request. The offset of
 	 * the superblock candidate must be multiples of the provider's
 	 * sector size, otherwise an FFS can't exist on the provider
 	 * anyway.
 	 */
 	if (loc % cp->provider->sectorsize != 0)
 		return (ENOENT);
 	*bufp = g_read_data(cp, loc, size, NULL);
 	if (*bufp == NULL)
 		return (ENOENT);
 	return (0);
 }
 
 int
 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_WRITE;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gwrite");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 /*
  * A write function for use by ffs_sbput when used by GEOM-layer routines.
  */
 int
 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
 {
 
 	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
 }
 
 int
 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_DELETE;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	bp->bio_data = NULL;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gdelete");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 void
 g_print_bio(struct bio *bp)
 {
 	const char *pname, *cmd = NULL;
 
 	if (bp->bio_to != NULL)
 		pname = bp->bio_to->name;
 	else
 		pname = "[unknown]";
 
 	switch (bp->bio_cmd) {
 	case BIO_GETATTR:
 		cmd = "GETATTR";
 		printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
 		return;
 	case BIO_FLUSH:
 		cmd = "FLUSH";
 		printf("%s[%s]", pname, cmd);
 		return;
 	case BIO_ZONE: {
 		char *subcmd = NULL;
 		cmd = "ZONE";
 		switch (bp->bio_zone.zone_cmd) {
 		case DISK_ZONE_OPEN:
 			subcmd = "OPEN";
 			break;
 		case DISK_ZONE_CLOSE:
 			subcmd = "CLOSE";
 			break;
 		case DISK_ZONE_FINISH:
 			subcmd = "FINISH";
 			break;
 		case DISK_ZONE_RWP:
 			subcmd = "RWP";
 			break;
 		case DISK_ZONE_REPORT_ZONES:
 			subcmd = "REPORT ZONES";
 			break;
 		case DISK_ZONE_GET_PARAMS:
 			subcmd = "GET PARAMS";
 			break;
 		default:
 			subcmd = "UNKNOWN";
 			break;
 		}
 		printf("%s[%s,%s]", pname, cmd, subcmd);
 		return;
 	}
 	case BIO_READ:
 		cmd = "READ";
 		break;
 	case BIO_WRITE:
 		cmd = "WRITE";
 		break;
 	case BIO_DELETE:
 		cmd = "DELETE";
 		break;
 	default:
 		cmd = "UNKNOWN";
 		printf("%s[%s()]", pname, cmd);
 		return;
 	}
 	printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 }
Index: stable/12/sys/geom/geom_kern.c
===================================================================
--- stable/12/sys/geom/geom_kern.c	(revision 355609)
+++ stable/12/sys/geom/geom_kern.c	(revision 355610)
@@ -1,240 +1,240 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 
 MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures");
 
 struct sx topology_lock;
 
 static struct proc *g_proc;
-static struct thread *g_up_td;
-static struct thread *g_down_td;
-static struct thread *g_event_td;
+static struct thread __read_mostly *g_up_td;
+static struct thread __read_mostly *g_down_td;
+static struct thread __read_mostly *g_event_td;
 
-int g_debugflags;
-int g_collectstats = 1;
+int __read_mostly g_debugflags;
+int __read_mostly g_collectstats = G_STATS_PROVIDERS;
 int g_shutdown;
 int g_notaste;
 
 /*
  * G_UP and G_DOWN are the two threads which push I/O through the
  * stack.
  *
  * Things are procesed in a FIFO order, but these threads could be
  * part of I/O prioritization by deciding which bios/bioqs to service
  * in what order.
  *
  * We have only one thread in each direction, it is believed that until
  * a very non-trivial workload in the UP/DOWN path this will be enough,
  * but more than one can actually be run without problems.
  *
  * Holding the "mymutex" is a debugging feature:  It prevents people
  * from sleeping in the UP/DOWN I/O path by mistake or design (doing
  * so almost invariably result in deadlocks since it stalls all I/O
  * processing in the given direction.
  */
 
 static void
 g_up_procbody(void *arg)
 {
 
 	thread_lock(g_up_td);
 	sched_prio(g_up_td, PRIBIO);
 	thread_unlock(g_up_td);
 	for(;;) {
 		g_io_schedule_up(g_up_td);
 	}
 }
 
 static void
 g_down_procbody(void *arg)
 {
 
 	thread_lock(g_down_td);
 	sched_prio(g_down_td, PRIBIO);
 	thread_unlock(g_down_td);
 	for(;;) {
 		g_io_schedule_down(g_down_td);
 	}
 }
 
 static void
 g_event_procbody(void *arg)
 {
 
 	thread_lock(g_event_td);
 	sched_prio(g_event_td, PRIBIO);
 	thread_unlock(g_event_td);
 	g_run_events();
 	/* NOTREACHED */
 }
 
 int
 g_is_geom_thread(struct thread *td)
 {
 
 	return (td == g_up_td || td == g_down_td || td == g_event_td);
 }
 
 static void
 geom_shutdown(void *foo __unused)
 {
 
 	g_shutdown = 1;
 }
 
 void
 g_init(void)
 {
 
 	g_trace(G_T_TOPOLOGY, "g_ignition");
 	sx_init(&topology_lock, "GEOM topology");
 	g_io_init();
 	g_event_init();
 	g_ctl_init();
 	kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td,
 	    RFHIGHPID, 0, "geom", "g_event");
 	kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td,
 	    RFHIGHPID, 0, "geom", "g_up");
 	kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td,
 	    RFHIGHPID, 0, "geom", "g_down");
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL,
 		SHUTDOWN_PRI_FIRST);
 }
 
 static int
 sysctl_kern_geom_confany(struct sysctl_req *req, g_event_t *func, size_t *hint)
 {
 	size_t len = 0;
 	int error = 0;
 	struct sbuf *sb;
 
 	if (req->oldptr == NULL) {
 		sb = sbuf_new(NULL, NULL, PAGE_SIZE, SBUF_FIXEDLEN |
 		    SBUF_INCLUDENUL);
 		sbuf_set_drain(sb, sbuf_count_drain, &len);
 		g_waitfor_event(func, sb, M_WAITOK, NULL);
 		req->oldidx = *hint = len;
 	} else {
 		sb = sbuf_new(NULL, NULL, *hint, SBUF_AUTOEXTEND |
 		    SBUF_INCLUDENUL);
 		g_waitfor_event(func, sb, M_WAITOK, NULL);
 		*hint = sbuf_len(sb);
 		error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
 	}
 	sbuf_delete(sb);
 	return error;
 }
 
 static int
 sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_conftxt, &hint));
 }
  
 static int
 sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_confdot, &hint));
 }
 
 static int
 sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_confxml, &hint));
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_confxml, "",
 	"Dump the GEOM config in XML");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_confdot, "",
 	"Dump the GEOM config in dot");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_conftxt, "",
 	"Dump the GEOM config in txt");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN,
 	&g_debugflags, 0, "Set various trace levels for GEOM debugging");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW,
 	&g_notaste, 0, "Prevent GEOM tasting");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW,
 	&g_collectstats, 0,
 	"Control statistics collection on GEOM providers and consumers");
 
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)");
Index: stable/12/sys/kern/kern_mtxpool.c
===================================================================
--- stable/12/sys/kern/kern_mtxpool.c	(revision 355609)
+++ stable/12/sys/kern/kern_mtxpool.c	(revision 355610)
@@ -1,190 +1,190 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Matthew Dillon.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /* Mutex pool routines.  These routines are designed to be used as short
  * term leaf mutexes (e.g. the last mutex you might acquire other then
  * calling msleep()).  They operate using a shared pool.  A mutex is chosen
  * from the pool based on the supplied pointer (which may or may not be
  * valid).
  *
  * Advantages:
  *	- no structural overhead.  Mutexes can be associated with structures
  *	  without adding bloat to the structures.
  *	- mutexes can be obtained for invalid pointers, useful when uses
  *	  mutexes to interlock destructor ops.
  *	- no initialization/destructor overhead.
  *	- can be used with msleep.
  *
  * Disadvantages:
  *	- should generally only be used as leaf mutexes.
  *	- pool/pool dependency ordering cannot be depended on.
  *	- possible L1 cache mastersip contention between cpus.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 
 
 static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool");
 
 /* Pool sizes must be a power of two */
 #ifndef MTX_POOL_SLEEP_SIZE
 #define MTX_POOL_SLEEP_SIZE		1024
 #endif
 
 struct mtxpool_header {
 	int		mtxpool_size;
 	int		mtxpool_mask;
 	int		mtxpool_shift;
 	int		mtxpool_next __aligned(CACHE_LINE_SIZE);
 };
 
 struct mtx_pool {
 	struct mtxpool_header mtx_pool_header;
 	struct mtx	mtx_pool_ary[1];
 };
 
 #define mtx_pool_size	mtx_pool_header.mtxpool_size
 #define mtx_pool_mask	mtx_pool_header.mtxpool_mask
 #define mtx_pool_shift	mtx_pool_header.mtxpool_shift
 #define mtx_pool_next	mtx_pool_header.mtxpool_next
 
-struct mtx_pool *mtxpool_sleep;
+struct mtx_pool __read_frequently *mtxpool_sleep;
 
 #if UINTPTR_MAX == UINT64_MAX	/* 64 bits */
 # define POINTER_BITS		64
 # define HASH_MULTIPLIER	11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */
 #else				/* assume 32 bits */
 # define POINTER_BITS		32
 # define HASH_MULTIPLIER	2654435769u	      /* (2^32)*(sqrt(5)-1)/2 */
 #endif
 
 /*
  * Return the (shared) pool mutex associated with the specified address.
  * The returned mutex is a leaf level mutex, meaning that if you obtain it
  * you cannot obtain any other mutexes until you release it.  You can
  * legally msleep() on the mutex.
  */
 struct mtx *
 mtx_pool_find(struct mtx_pool *pool, void *ptr)
 {
 	int p;
 
 	KASSERT(pool != NULL, ("_mtx_pool_find(): null pool"));
 	/*
 	 * Fibonacci hash, see Knuth's
 	 * _Art of Computer Programming, Volume 3 / Sorting and Searching_
 	 */
 	p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) &
 	    pool->mtx_pool_mask;
 	return (&pool->mtx_pool_ary[p]);
 }
 
 static void
 mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size,
     int opts)
 {
 	int i, maskbits;
 
 	pool->mtx_pool_size = pool_size;
 	pool->mtx_pool_mask = pool_size - 1;
 	for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1)
 		maskbits++;
 	pool->mtx_pool_shift = POINTER_BITS - maskbits;
 	pool->mtx_pool_next = 0;
 	for (i = 0; i < pool_size; ++i)
 		mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts);
 }
 
 struct mtx_pool *
 mtx_pool_create(const char *mtx_name, int pool_size, int opts)
 {
 	struct mtx_pool *pool;
 
 	if (pool_size <= 0 || !powerof2(pool_size)) {
 		printf("WARNING: %s pool size is not a power of 2.\n",
 		    mtx_name);
 		pool_size = 128;
 	}
 	pool = malloc(sizeof (struct mtx_pool) +
 	    ((pool_size - 1) * sizeof (struct mtx)),
 	    M_MTXPOOL, M_WAITOK | M_ZERO);
 	mtx_pool_initialize(pool, mtx_name, pool_size, opts);
 	return pool;
 }
 
 void
 mtx_pool_destroy(struct mtx_pool **poolp)
 {
 	int i;
 	struct mtx_pool *pool = *poolp;
 
 	for (i = pool->mtx_pool_size - 1; i >= 0; --i)
 		mtx_destroy(&pool->mtx_pool_ary[i]);
 	free(pool, M_MTXPOOL);
 	*poolp = NULL;
 }
 
 static void
 mtx_pool_setup_dynamic(void *dummy __unused)
 {
 	mtxpool_sleep = mtx_pool_create("sleep mtxpool",
 	    MTX_POOL_SLEEP_SIZE, MTX_DEF);
 }
 
 /*
  * Obtain a (shared) mutex from the pool.  The returned mutex is a leaf
  * level mutex, meaning that if you obtain it you cannot obtain any other
  * mutexes until you release it.  You can legally msleep() on the mutex.
  */
 struct mtx *
 mtx_pool_alloc(struct mtx_pool *pool)
 {
 	int i;
 
 	KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool"));
 	/*
 	 * mtx_pool_next is unprotected against multiple accesses,
 	 * but simultaneous access by two CPUs should not be very
 	 * harmful.
 	 */
 	i = pool->mtx_pool_next;
 	pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask;
 	return (&pool->mtx_pool_ary[i]);
 }
 
 SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST,
     mtx_pool_setup_dynamic, NULL);
Index: stable/12/sys/kern/kern_shutdown.c
===================================================================
--- stable/12/sys/kern/kern_shutdown.c	(revision 355609)
+++ stable/12/sys/kern/kern_shutdown.c	(revision 355610)
@@ -1,1588 +1,1588 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_shutdown.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ekcd.h"
 #include "opt_kdb.h"
 #include "opt_panic.h"
 #include "opt_sched.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/compressor.h>
 #include <sys/cons.h>
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 
 #include <crypto/rijndael/rijndael-api-fst.h>
 #include <crypto/sha2/sha256.h>
 
 #include <ddb/ddb.h>
 
 #include <machine/cpu.h>
 #include <machine/dump.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <sys/signalvar.h>
 
 static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer");
 
 #ifndef PANIC_REBOOT_WAIT_TIME
 #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
 #endif
 static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME;
 SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN,
     &panic_reboot_wait_time, 0,
     "Seconds to wait before rebooting after a panic");
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #include <machine/stdarg.h>
 
 #ifdef KDB
 #ifdef KDB_UNATTENDED
 static int debugger_on_panic = 0;
 #else
 static int debugger_on_panic = 1;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic,
     CTLFLAG_RWTUN | CTLFLAG_SECURE,
     &debugger_on_panic, 0, "Run debugger on kernel panic");
 
 int debugger_on_trap = 0;
 SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap,
     CTLFLAG_RWTUN | CTLFLAG_SECURE,
     &debugger_on_trap, 0, "Run debugger on kernel trap before panic");
 
 #ifdef KDB_TRACE
 static int trace_on_panic = 1;
 static bool trace_all_panics = true;
 #else
 static int trace_on_panic = 0;
 static bool trace_all_panics = false;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, trace_on_panic,
     CTLFLAG_RWTUN | CTLFLAG_SECURE,
     &trace_on_panic, 0, "Print stack trace on kernel panic");
 SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN,
     &trace_all_panics, 0, "Print stack traces on secondary kernel panics");
 #endif /* KDB */
 
 static int sync_on_panic = 0;
 SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN,
 	&sync_on_panic, 0, "Do a sync before rebooting from a panic");
 
 static bool poweroff_on_panic = 0;
 SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN,
 	&poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic");
 
 static bool powercycle_on_panic = 0;
 SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN,
 	&powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic");
 
 static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0,
     "Shutdown environment");
 
 #ifndef DIAGNOSTIC
 static int show_busybufs;
 #else
 static int show_busybufs = 1;
 #endif
 SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
 	&show_busybufs, 0, "");
 
 int suspend_blocked = 0;
 SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW,
 	&suspend_blocked, 0, "Block suspend due to a pending shutdown");
 
 #ifdef EKCD
 FEATURE(ekcd, "Encrypted kernel crash dumps support");
 
 MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data");
 
 struct kerneldumpcrypto {
 	uint8_t			kdc_encryption;
 	uint8_t			kdc_iv[KERNELDUMP_IV_MAX_SIZE];
 	keyInstance		kdc_ki;
 	cipherInstance		kdc_ci;
 	uint32_t		kdc_dumpkeysize;
 	struct kerneldumpkey	kdc_dumpkey[];
 };
 #endif
 
 struct kerneldumpcomp {
 	uint8_t			kdc_format;
 	struct compressor	*kdc_stream;
 	uint8_t			*kdc_buf;
 	size_t			kdc_resid;
 };
 
 static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di,
 		    uint8_t compression);
 static void	kerneldumpcomp_destroy(struct dumperinfo *di);
 static int	kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg);
 
 static int kerneldump_gzlevel = 6;
 SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN,
     &kerneldump_gzlevel, 0,
     "Kernel crash dump compression level");
 
 /*
  * Variable panicstr contains argument to first call to panic; used as flag
  * to indicate that the kernel has already called panic.
  */
-const char *panicstr;
+const char __read_mostly *panicstr;
 
-int dumping;				/* system is dumping */
+int __read_mostly dumping;		/* system is dumping */
 int rebooting;				/* system is rebooting */
 static struct dumperinfo dumper;	/* our selected dumper */
 
 /* Context information for dump-debuggers. */
 static struct pcb dumppcb;		/* Registers. */
 lwpid_t dumptid;			/* Thread ID. */
 
 static struct cdevsw reroot_cdevsw = {
      .d_version = D_VERSION,
      .d_name    = "reroot",
 };
 
 static void poweroff_wait(void *, int);
 static void shutdown_halt(void *junk, int howto);
 static void shutdown_panic(void *junk, int howto);
 static void shutdown_reset(void *junk, int howto);
 static int kern_reroot(void);
 
 /* register various local shutdown events */
 static void
 shutdown_conf(void *unused)
 {
 
 	EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
 	    SHUTDOWN_PRI_FIRST);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
 	    SHUTDOWN_PRI_LAST + 100);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
 	    SHUTDOWN_PRI_LAST + 100);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
 	    SHUTDOWN_PRI_LAST + 200);
 }
 
 SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
 
 /*
  * The only reason this exists is to create the /dev/reroot/ directory,
  * used by reroot code in init(8) as a mountpoint for tmpfs.
  */
 static void
 reroot_conf(void *unused)
 {
 	int error;
 	struct cdev *cdev;
 
 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev,
 	    &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot");
 	if (error != 0) {
 		printf("%s: failed to create device node, error %d",
 		    __func__, error);
 	}
 }
 
 SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL);
 
 /*
  * The system call that results in a reboot.
  */
 /* ARGSUSED */
 int
 sys_reboot(struct thread *td, struct reboot_args *uap)
 {
 	int error;
 
 	error = 0;
 #ifdef MAC
 	error = mac_system_check_reboot(td->td_ucred, uap->opt);
 #endif
 	if (error == 0)
 		error = priv_check(td, PRIV_REBOOT);
 	if (error == 0) {
 		if (uap->opt & RB_REROOT)
 			error = kern_reroot();
 		else
 			kern_reboot(uap->opt);
 	}
 	return (error);
 }
 
 static void
 shutdown_nice_task_fn(void *arg, int pending __unused)
 {
 	int howto;
 
 	howto = (uintptr_t)arg;
 	/* Send a signal to init(8) and have it shutdown the world. */
 	PROC_LOCK(initproc);
 	if (howto & RB_POWEROFF)
 		kern_psignal(initproc, SIGUSR2);
 	else if (howto & RB_POWERCYCLE)
 		kern_psignal(initproc, SIGWINCH);
 	else if (howto & RB_HALT)
 		kern_psignal(initproc, SIGUSR1);
 	else
 		kern_psignal(initproc, SIGINT);
 	PROC_UNLOCK(initproc);
 }
 
 static struct task shutdown_nice_task = TASK_INITIALIZER(0,
     &shutdown_nice_task_fn, NULL);
 
 /*
  * Called by events that want to shut down.. e.g  <CTL><ALT><DEL> on a PC
  */
 void
 shutdown_nice(int howto)
 {
 
 	if (initproc != NULL && !SCHEDULER_STOPPED()) {
 		shutdown_nice_task.ta_context = (void *)(uintptr_t)howto;
 		taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task);
 	} else {
 		/*
 		 * No init(8) running, or scheduler would not allow it
 		 * to run, so simply reboot.
 		 */
 		kern_reboot(howto | RB_NOSYNC);
 	}
 }
 
 static void
 print_uptime(void)
 {
 	int f;
 	struct timespec ts;
 
 	getnanouptime(&ts);
 	printf("Uptime: ");
 	f = 0;
 	if (ts.tv_sec >= 86400) {
 		printf("%ldd", (long)ts.tv_sec / 86400);
 		ts.tv_sec %= 86400;
 		f = 1;
 	}
 	if (f || ts.tv_sec >= 3600) {
 		printf("%ldh", (long)ts.tv_sec / 3600);
 		ts.tv_sec %= 3600;
 		f = 1;
 	}
 	if (f || ts.tv_sec >= 60) {
 		printf("%ldm", (long)ts.tv_sec / 60);
 		ts.tv_sec %= 60;
 		f = 1;
 	}
 	printf("%lds\n", (long)ts.tv_sec);
 }
 
 int
 doadump(boolean_t textdump)
 {
 	boolean_t coredump;
 	int error;
 
 	error = 0;
 	if (dumping)
 		return (EBUSY);
 	if (dumper.dumper == NULL)
 		return (ENXIO);
 
 	savectx(&dumppcb);
 	dumptid = curthread->td_tid;
 	dumping++;
 
 	coredump = TRUE;
 #ifdef DDB
 	if (textdump && textdump_pending) {
 		coredump = FALSE;
 		textdump_dumpsys(&dumper);
 	}
 #endif
 	if (coredump)
 		error = dumpsys(&dumper);
 
 	dumping--;
 	return (error);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  */
 void
 kern_reboot(int howto)
 {
 	static int once = 0;
 
 	/*
 	 * Normal paths here don't hold Giant, but we can wind up here
 	 * unexpectedly with it held.  Drop it now so we don't have to
 	 * drop and pick it up elsewhere. The paths it is locking will
 	 * never be returned to, and it is preferable to preclude
 	 * deadlock than to lock against code that won't ever
 	 * continue.
 	 */
 	while (mtx_owned(&Giant))
 		mtx_unlock(&Giant);
 
 #if defined(SMP)
 	/*
 	 * Bind us to the first CPU so that all shutdown code runs there.  Some
 	 * systems don't shutdown properly (i.e., ACPI power off) if we
 	 * run on another processor.
 	 */
 	if (!SCHEDULER_STOPPED()) {
 		thread_lock(curthread);
 		sched_bind(curthread, CPU_FIRST());
 		thread_unlock(curthread);
 		KASSERT(PCPU_GET(cpuid) == CPU_FIRST(),
 		    ("boot: not running on cpu 0"));
 	}
 #endif
 	/* We're in the process of rebooting. */
 	rebooting = 1;
 
 	/* We are out of the debugger now. */
 	kdb_active = 0;
 
 	/*
 	 * Do any callouts that should be done BEFORE syncing the filesystems.
 	 */
 	EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
 
 	/* 
 	 * Now sync filesystems
 	 */
 	if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) {
 		once = 1;
 		bufshutdown(show_busybufs);
 	}
 
 	print_uptime();
 
 	cngrab();
 
 	/*
 	 * Ok, now do things that assume all filesystem activity has
 	 * been completed.
 	 */
 	EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
 
 	if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) 
 		doadump(TRUE);
 
 	/* Now that we're going to really halt the system... */
 	EVENTHANDLER_INVOKE(shutdown_final, howto);
 
 	for(;;) ;	/* safety against shutdown_reset not working */
 	/* NOTREACHED */
 }
 
 /*
  * The system call that results in changing the rootfs.
  */
 static int
 kern_reroot(void)
 {
 	struct vnode *oldrootvnode, *vp;
 	struct mount *mp, *devmp;
 	int error;
 
 	if (curproc != initproc)
 		return (EPERM);
 
 	/*
 	 * Mark the filesystem containing currently-running executable
 	 * (the temporary copy of init(8)) busy.
 	 */
 	vp = curproc->p_textvp;
 	error = vn_lock(vp, LK_SHARED);
 	if (error != 0)
 		return (error);
 	mp = vp->v_mount;
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		vfs_ref(mp);
 		VOP_UNLOCK(vp, 0);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0) {
 			VOP_UNLOCK(vp, 0);
 			return (ENOENT);
 		}
 		if (vp->v_iflag & VI_DOOMED) {
 			VOP_UNLOCK(vp, 0);
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * Remove the filesystem containing currently-running executable
 	 * from the mount list, to prevent it from being unmounted
 	 * by vfs_unmountall(), and to avoid confusing vfs_mountroot().
 	 *
 	 * Also preserve /dev - forcibly unmounting it could cause driver
 	 * reinitialization.
 	 */
 
 	vfs_ref(rootdevmp);
 	devmp = rootdevmp;
 	rootdevmp = NULL;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	TAILQ_REMOVE(&mountlist, devmp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 
 	oldrootvnode = rootvnode;
 
 	/*
 	 * Unmount everything except for the two filesystems preserved above.
 	 */
 	vfs_unmountall();
 
 	/*
 	 * Add /dev back; vfs_mountroot() will move it into its new place.
 	 */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	rootdevmp = devmp;
 	vfs_rel(rootdevmp);
 
 	/*
 	 * Mount the new rootfs.
 	 */
 	vfs_mountroot();
 
 	/*
 	 * Update all references to the old rootvnode.
 	 */
 	mountcheckdirs(oldrootvnode, rootvnode);
 
 	/*
 	 * Add the temporary filesystem back and unbusy it.
 	 */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	vfs_unbusy(mp);
 
 	return (0);
 }
 
 /*
  * If the shutdown was a clean halt, behave accordingly.
  */
 static void
 shutdown_halt(void *junk, int howto)
 {
 
 	if (howto & RB_HALT) {
 		printf("\n");
 		printf("The operating system has halted.\n");
 		printf("Please press any key to reboot.\n\n");
 
 		wdog_kern_pat(WD_TO_NEVER);
 
 		switch (cngetc()) {
 		case -1:		/* No console, just die */
 			cpu_halt();
 			/* NOTREACHED */
 		default:
 			break;
 		}
 	}
 }
 
 /*
  * Check to see if the system paniced, pause and then reboot
  * according to the specified delay.
  */
 static void
 shutdown_panic(void *junk, int howto)
 {
 	int loop;
 
 	if (howto & RB_DUMP) {
 		if (panic_reboot_wait_time != 0) {
 			if (panic_reboot_wait_time != -1) {
 				printf("Automatic reboot in %d seconds - "
 				       "press a key on the console to abort\n",
 					panic_reboot_wait_time);
 				for (loop = panic_reboot_wait_time * 10;
 				     loop > 0; --loop) {
 					DELAY(1000 * 100); /* 1/10th second */
 					/* Did user type a key? */
 					if (cncheckc() != -1)
 						break;
 				}
 				if (!loop)
 					return;
 			}
 		} else { /* zero time specified - reboot NOW */
 			return;
 		}
 		printf("--> Press a key on the console to reboot,\n");
 		printf("--> or switch off the system now.\n");
 		cngetc();
 	}
 }
 
 /*
  * Everything done, now reset
  */
 static void
 shutdown_reset(void *junk, int howto)
 {
 
 	printf("Rebooting...\n");
 	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */
 
 	/*
 	 * Acquiring smp_ipi_mtx here has a double effect:
 	 * - it disables interrupts avoiding CPU0 preemption
 	 *   by fast handlers (thus deadlocking  against other CPUs)
 	 * - it avoids deadlocks against smp_rendezvous() or, more 
 	 *   generally, threads busy-waiting, with this spinlock held,
 	 *   and waiting for responses by threads on other CPUs
 	 *   (ie. smp_tlb_shootdown()).
 	 *
 	 * For the !SMP case it just needs to handle the former problem.
 	 */
 #ifdef SMP
 	mtx_lock_spin(&smp_ipi_mtx);
 #else
 	spinlock_enter();
 #endif
 
 	/* cpu_boot(howto); */ /* doesn't do anything at the moment */
 	cpu_reset();
 	/* NOTREACHED */ /* assuming reset worked */
 }
 
 #if defined(WITNESS) || defined(INVARIANT_SUPPORT)
 static int kassert_warn_only = 0;
 #ifdef KDB
 static int kassert_do_kdb = 0;
 #endif
 #ifdef KTR
 static int kassert_do_ktr = 0;
 #endif
 static int kassert_do_log = 1;
 static int kassert_log_pps_limit = 4;
 static int kassert_log_mute_at = 0;
 static int kassert_log_panic_at = 0;
 static int kassert_suppress_in_panic = 0;
 static int kassert_warnings = 0;
 
 SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options");
 
 #ifdef KASSERT_PANIC_OPTIONAL
 #define KASSERT_RWTUN	CTLFLAG_RWTUN
 #else
 #define KASSERT_RWTUN	CTLFLAG_RDTUN
 #endif
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN,
     &kassert_warn_only, 0,
     "KASSERT triggers a panic (0) or just a warning (1)");
 
 #ifdef KDB
 SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN,
     &kassert_do_kdb, 0, "KASSERT will enter the debugger");
 #endif
 
 #ifdef KTR
 SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN,
     &kassert_do_ktr, 0,
     "KASSERT does a KTR, set this to the KTRMASK you want");
 #endif
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN,
     &kassert_do_log, 0,
     "If warn_only is enabled, log (1) or do not log (0) assertion violations");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS,
     &kassert_warnings, 0, "number of KASSERTs that have been triggered");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN,
     &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN,
     &kassert_log_pps_limit, 0, "limit number of log messages per second");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN,
     &kassert_log_mute_at, 0, "max number of KASSERTS to log");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN,
     &kassert_suppress_in_panic, 0,
     "KASSERTs will be suppressed while handling a panic");
 #undef KASSERT_RWTUN
 
 static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kassert_sysctl_kassert, "I", "set to trigger a test kassert");
 
 static int
 kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error == 0) {
 		i = 0;
 		error = sysctl_handle_int(oidp, &i, 0, req);
 	}
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i));
 	return (0);
 }
 
 #ifdef KASSERT_PANIC_OPTIONAL
 /*
  * Called by KASSERT, this decides if we will panic
  * or if we will log via printf and/or ktr.
  */
 void
 kassert_panic(const char *fmt, ...)
 {
 	static char buf[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
 
 	/*
 	 * If we are suppressing secondary panics, log the warning but do not
 	 * re-enter panic/kdb.
 	 */
 	if (panicstr != NULL && kassert_suppress_in_panic) {
 		if (kassert_do_log) {
 			printf("KASSERT failed: %s\n", buf);
 #ifdef KDB
 			if (trace_all_panics && trace_on_panic)
 				kdb_backtrace();
 #endif
 		}
 		return;
 	}
 
 	/*
 	 * panic if we're not just warning, or if we've exceeded
 	 * kassert_log_panic_at warnings.
 	 */
 	if (!kassert_warn_only ||
 	    (kassert_log_panic_at > 0 &&
 	     kassert_warnings >= kassert_log_panic_at)) {
 		va_start(ap, fmt);
 		vpanic(fmt, ap);
 		/* NORETURN */
 	}
 #ifdef KTR
 	if (kassert_do_ktr)
 		CTR0(ktr_mask, buf);
 #endif /* KTR */
 	/*
 	 * log if we've not yet met the mute limit.
 	 */
 	if (kassert_do_log &&
 	    (kassert_log_mute_at == 0 ||
 	     kassert_warnings < kassert_log_mute_at)) {
 		static  struct timeval lasterr;
 		static  int curerr;
 
 		if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) {
 			printf("KASSERT failed: %s\n", buf);
 			kdb_backtrace();
 		}
 	}
 #ifdef KDB
 	if (kassert_do_kdb) {
 		kdb_enter(KDB_WHY_KASSERT, buf);
 	}
 #endif
 	atomic_add_int(&kassert_warnings, 1);
 }
 #endif /* KASSERT_PANIC_OPTIONAL */
 #endif
 
 /*
  * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
  * and then reboots.  If we are called twice, then we avoid trying to sync
  * the disks as this often leads to recursive panics.
  */
 void
 panic(const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vpanic(fmt, ap);
 }
 
 void
 vpanic(const char *fmt, va_list ap)
 {
 #ifdef SMP
 	cpuset_t other_cpus;
 #endif
 	struct thread *td = curthread;
 	int bootopt, newpanic;
 	static char buf[256];
 
 	spinlock_enter();
 
 #ifdef SMP
 	/*
 	 * stop_cpus_hard(other_cpus) should prevent multiple CPUs from
 	 * concurrently entering panic.  Only the winner will proceed
 	 * further.
 	 */
 	if (panicstr == NULL && !kdb_active) {
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 		stop_cpus_hard(other_cpus);
 	}
 #endif
 
 	/*
 	 * Ensure that the scheduler is stopped while panicking, even if panic
 	 * has been entered from kdb.
 	 */
 	td->td_stopsched = 1;
 
 	bootopt = RB_AUTOBOOT;
 	newpanic = 0;
 	if (panicstr)
 		bootopt |= RB_NOSYNC;
 	else {
 		bootopt |= RB_DUMP;
 		panicstr = fmt;
 		newpanic = 1;
 	}
 
 	if (newpanic) {
 		(void)vsnprintf(buf, sizeof(buf), fmt, ap);
 		panicstr = buf;
 		cngrab();
 		printf("panic: %s\n", buf);
 	} else {
 		printf("panic: ");
 		vprintf(fmt, ap);
 		printf("\n");
 	}
 #ifdef SMP
 	printf("cpuid = %d\n", PCPU_GET(cpuid));
 #endif
 	printf("time = %jd\n", (intmax_t )time_second);
 #ifdef KDB
 	if ((newpanic || trace_all_panics) && trace_on_panic)
 		kdb_backtrace();
 	if (debugger_on_panic)
 		kdb_enter(KDB_WHY_PANIC, "panic");
 #endif
 	/*thread_lock(td); */
 	td->td_flags |= TDF_INPANIC;
 	/* thread_unlock(td); */
 	if (!sync_on_panic)
 		bootopt |= RB_NOSYNC;
 	if (poweroff_on_panic)
 		bootopt |= RB_POWEROFF;
 	if (powercycle_on_panic)
 		bootopt |= RB_POWERCYCLE;
 	kern_reboot(bootopt);
 }
 
 /*
  * Support for poweroff delay.
  *
  * Please note that setting this delay too short might power off your machine
  * before the write cache on your hard disk has been flushed, leading to
  * soft-updates inconsistencies.
  */
 #ifndef POWEROFF_DELAY
 # define POWEROFF_DELAY 5000
 #endif
 static int poweroff_delay = POWEROFF_DELAY;
 
 SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
     &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)");
 
 static void
 poweroff_wait(void *junk, int howto)
 {
 
 	if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0)
 		return;
 	DELAY(poweroff_delay * 1000);
 }
 
 /*
  * Some system processes (e.g. syncer) need to be stopped at appropriate
  * points in their main loops prior to a system shutdown, so that they
  * won't interfere with the shutdown process (e.g. by holding a disk buf
  * to cause sync to fail).  For each of these system processes, register
  * shutdown_kproc() as a handler for one of shutdown events.
  */
 static int kproc_shutdown_wait = 60;
 SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
     &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process");
 
 void
 kproc_shutdown(void *arg, int howto)
 {
 	struct proc *p;
 	int error;
 
 	if (panicstr)
 		return;
 
 	p = (struct proc *)arg;
 	printf("Waiting (max %d seconds) for system process `%s' to stop... ",
 	    kproc_shutdown_wait, p->p_comm);
 	error = kproc_suspend(p, kproc_shutdown_wait * hz);
 
 	if (error == EWOULDBLOCK)
 		printf("timed out\n");
 	else
 		printf("done\n");
 }
 
 void
 kthread_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 	int error;
 
 	if (panicstr)
 		return;
 
 	td = (struct thread *)arg;
 	printf("Waiting (max %d seconds) for system thread `%s' to stop... ",
 	    kproc_shutdown_wait, td->td_name);
 	error = kthread_suspend(td, kproc_shutdown_wait * hz);
 
 	if (error == EWOULDBLOCK)
 		printf("timed out\n");
 	else
 		printf("done\n");
 }
 
 static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)];
 SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD,
     dumpdevname, 0, "Device for kernel dumps");
 
 static int	_dump_append(struct dumperinfo *di, void *virtual,
 		    vm_offset_t physical, size_t length);
 
 #ifdef EKCD
 static struct kerneldumpcrypto *
 kerneldumpcrypto_create(size_t blocksize, uint8_t encryption,
     const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey)
 {
 	struct kerneldumpcrypto *kdc;
 	struct kerneldumpkey *kdk;
 	uint32_t dumpkeysize;
 
 	dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize);
 	kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO);
 
 	arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0);
 
 	kdc->kdc_encryption = encryption;
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0)
 			goto failed;
 		break;
 	default:
 		goto failed;
 	}
 
 	kdc->kdc_dumpkeysize = dumpkeysize;
 	kdk = kdc->kdc_dumpkey;
 	kdk->kdk_encryption = kdc->kdc_encryption;
 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
 	kdk->kdk_encryptedkeysize = htod32(encryptedkeysize);
 	memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize);
 
 	return (kdc);
 failed:
 	explicit_bzero(kdc, sizeof(*kdc) + dumpkeysize);
 	free(kdc, M_EKCD);
 	return (NULL);
 }
 
 static int
 kerneldumpcrypto_init(struct kerneldumpcrypto *kdc)
 {
 	uint8_t hash[SHA256_DIGEST_LENGTH];
 	SHA256_CTX ctx;
 	struct kerneldumpkey *kdk;
 	int error;
 
 	error = 0;
 
 	if (kdc == NULL)
 		return (0);
 
 	/*
 	 * When a user enters ddb it can write a crash dump multiple times.
 	 * Each time it should be encrypted using a different IV.
 	 */
 	SHA256_Init(&ctx);
 	SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv));
 	SHA256_Final(hash, &ctx);
 	bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv));
 
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
 		    kdc->kdc_iv) <= 0) {
 			error = EINVAL;
 			goto out;
 		}
 		break;
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 	kdk = kdc->kdc_dumpkey;
 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
 out:
 	explicit_bzero(hash, sizeof(hash));
 	return (error);
 }
 
 static uint32_t
 kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc)
 {
 
 	if (kdc == NULL)
 		return (0);
 	return (kdc->kdc_dumpkeysize);
 }
 #endif /* EKCD */
 
 static struct kerneldumpcomp *
 kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression)
 {
 	struct kerneldumpcomp *kdcomp;
 	int format;
 
 	switch (compression) {
 	case KERNELDUMP_COMP_GZIP:
 		format = COMPRESS_GZIP;
 		break;
 	case KERNELDUMP_COMP_ZSTD:
 		format = COMPRESS_ZSTD;
 		break;
 	default:
 		return (NULL);
 	}
 
 	kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO);
 	kdcomp->kdc_format = compression;
 	kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb,
 	    format, di->maxiosize, kerneldump_gzlevel, di);
 	if (kdcomp->kdc_stream == NULL) {
 		free(kdcomp, M_DUMPER);
 		return (NULL);
 	}
 	kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP);
 	return (kdcomp);
 }
 
 static void
 kerneldumpcomp_destroy(struct dumperinfo *di)
 {
 	struct kerneldumpcomp *kdcomp;
 
 	kdcomp = di->kdcomp;
 	if (kdcomp == NULL)
 		return;
 	compressor_fini(kdcomp->kdc_stream);
 	explicit_bzero(kdcomp->kdc_buf, di->maxiosize);
 	free(kdcomp->kdc_buf, M_DUMPER);
 	free(kdcomp, M_DUMPER);
 }
 
 /* Registration of dumpers */
 int
 set_dumper(struct dumperinfo *di, const char *devname, struct thread *td,
     uint8_t compression, uint8_t encryption, const uint8_t *key,
     uint32_t encryptedkeysize, const uint8_t *encryptedkey)
 {
 	size_t wantcopy;
 	int error;
 
 	error = priv_check(td, PRIV_SETDUMPER);
 	if (error != 0)
 		return (error);
 
 	if (dumper.dumper != NULL)
 		return (EBUSY);
 	dumper = *di;
 	dumper.blockbuf = NULL;
 	dumper.kdcrypto = NULL;
 	dumper.kdcomp = NULL;
 
 	if (encryption != KERNELDUMP_ENC_NONE) {
 #ifdef EKCD
 		dumper.kdcrypto = kerneldumpcrypto_create(di->blocksize,
 		    encryption, key, encryptedkeysize, encryptedkey);
 		if (dumper.kdcrypto == NULL) {
 			error = EINVAL;
 			goto cleanup;
 		}
 #else
 		error = EOPNOTSUPP;
 		goto cleanup;
 #endif
 	}
 
 	wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname));
 	if (wantcopy >= sizeof(dumpdevname)) {
 		printf("set_dumper: device name truncated from '%s' -> '%s'\n",
 		    devname, dumpdevname);
 	}
 
 	if (compression != KERNELDUMP_COMP_NONE) {
 		/*
 		 * We currently can't support simultaneous encryption and
 		 * compression.
 		 */
 		if (encryption != KERNELDUMP_ENC_NONE) {
 			error = EOPNOTSUPP;
 			goto cleanup;
 		}
 		dumper.kdcomp = kerneldumpcomp_create(&dumper, compression);
 		if (dumper.kdcomp == NULL) {
 			error = EINVAL;
 			goto cleanup;
 		}
 	}
 
 	dumper.blockbuf = malloc(di->blocksize, M_DUMPER, M_WAITOK | M_ZERO);
 	return (0);
 
 cleanup:
 	(void)clear_dumper(td);
 	return (error);
 }
 
 int
 clear_dumper(struct thread *td)
 {
 	int error;
 
 	error = priv_check(td, PRIV_SETDUMPER);
 	if (error != 0)
 		return (error);
 
 #ifdef NETDUMP
 	netdump_mbuf_drain();
 #endif
 
 #ifdef EKCD
 	if (dumper.kdcrypto != NULL) {
 		explicit_bzero(dumper.kdcrypto, sizeof(*dumper.kdcrypto) +
 		    dumper.kdcrypto->kdc_dumpkeysize);
 		free(dumper.kdcrypto, M_EKCD);
 	}
 #endif
 
 	kerneldumpcomp_destroy(&dumper);
 
 	if (dumper.blockbuf != NULL) {
 		explicit_bzero(dumper.blockbuf, dumper.blocksize);
 		free(dumper.blockbuf, M_DUMPER);
 	}
 	explicit_bzero(&dumper, sizeof(dumper));
 	dumpdevname[0] = '\0';
 	return (0);
 }
 
 static int
 dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length)
 {
 
 	if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset ||
 	    offset - di->mediaoffset + length > di->mediasize)) {
 		if (di->kdcomp != NULL && offset >= di->mediaoffset) {
 			printf(
 		    "Compressed dump failed to fit in device boundaries.\n");
 			return (E2BIG);
 		}
 
 		printf("Attempt to write outside dump device boundaries.\n"
 	    "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
 		    (intmax_t)offset, (intmax_t)di->mediaoffset,
 		    (uintmax_t)length, (intmax_t)di->mediasize);
 		return (ENOSPC);
 	}
 	if (length % di->blocksize != 0) {
 		printf("Attempt to write partial block of length %ju.\n",
 		    (uintmax_t)length);
 		return (EINVAL);
 	}
 	if (offset % di->blocksize != 0) {
 		printf("Attempt to write at unaligned offset %jd.\n",
 		    (intmax_t)offset);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 #ifdef EKCD
 static int
 dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size)
 {
 
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf,
 		    8 * size, buf) <= 0) {
 			return (EIO);
 		}
 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
 		    buf + size - 16 /* IV size for AES-256-CBC */) <= 0) {
 			return (EIO);
 		}
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /* Encrypt data and call dumper. */
 static int
 dump_encrypted_write(struct dumperinfo *di, void *virtual,
     vm_offset_t physical, off_t offset, size_t length)
 {
 	static uint8_t buf[KERNELDUMP_BUFFER_SIZE];
 	struct kerneldumpcrypto *kdc;
 	int error;
 	size_t nbytes;
 
 	kdc = di->kdcrypto;
 
 	while (length > 0) {
 		nbytes = MIN(length, sizeof(buf));
 		bcopy(virtual, buf, nbytes);
 
 		if (dump_encrypt(kdc, buf, nbytes) != 0)
 			return (EIO);
 
 		error = dump_write(di, buf, physical, offset, nbytes);
 		if (error != 0)
 			return (error);
 
 		offset += nbytes;
 		virtual = (void *)((uint8_t *)virtual + nbytes);
 		length -= nbytes;
 	}
 
 	return (0);
 }
 #endif /* EKCD */
 
 static int
 kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg)
 {
 	struct dumperinfo *di;
 	size_t resid, rlength;
 	int error;
 
 	di = arg;
 
 	if (length % di->blocksize != 0) {
 		/*
 		 * This must be the final write after flushing the compression
 		 * stream. Write as many full blocks as possible and stash the
 		 * residual data in the dumper's block buffer. It will be
 		 * padded and written in dump_finish().
 		 */
 		rlength = rounddown(length, di->blocksize);
 		if (rlength != 0) {
 			error = _dump_append(di, base, 0, rlength);
 			if (error != 0)
 				return (error);
 		}
 		resid = length - rlength;
 		memmove(di->blockbuf, (uint8_t *)base + rlength, resid);
 		di->kdcomp->kdc_resid = resid;
 		return (EAGAIN);
 	}
 	return (_dump_append(di, base, 0, length));
 }
 
 /*
  * Write kernel dump headers at the beginning and end of the dump extent.
  * Write the kernel dump encryption key after the leading header if we were
  * configured to do so.
  */
 static int
 dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh)
 {
 #ifdef EKCD
 	struct kerneldumpcrypto *kdc;
 #endif
 	void *buf, *key;
 	size_t hdrsz;
 	uint64_t extent;
 	uint32_t keysize;
 	int error;
 
 	hdrsz = sizeof(*kdh);
 	if (hdrsz > di->blocksize)
 		return (ENOMEM);
 
 #ifdef EKCD
 	kdc = di->kdcrypto;
 	key = kdc->kdc_dumpkey;
 	keysize = kerneldumpcrypto_dumpkeysize(kdc);
 #else
 	key = NULL;
 	keysize = 0;
 #endif
 
 	/*
 	 * If the dump device has special handling for headers, let it take care
 	 * of writing them out.
 	 */
 	if (di->dumper_hdr != NULL)
 		return (di->dumper_hdr(di, kdh, key, keysize));
 
 	if (hdrsz == di->blocksize)
 		buf = kdh;
 	else {
 		buf = di->blockbuf;
 		memset(buf, 0, di->blocksize);
 		memcpy(buf, kdh, hdrsz);
 	}
 
 	extent = dtoh64(kdh->dumpextent);
 #ifdef EKCD
 	if (kdc != NULL) {
 		error = dump_write(di, kdc->kdc_dumpkey, 0,
 		    di->mediaoffset + di->mediasize - di->blocksize - extent -
 		    keysize, keysize);
 		if (error != 0)
 			return (error);
 	}
 #endif
 
 	error = dump_write(di, buf, 0,
 	    di->mediaoffset + di->mediasize - 2 * di->blocksize - extent -
 	    keysize, di->blocksize);
 	if (error == 0)
 		error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize -
 		    di->blocksize, di->blocksize);
 	return (error);
 }
 
 /*
  * Don't touch the first SIZEOF_METADATA bytes on the dump device.  This is to
  * protect us from metadata and metadata from us.
  */
 #define	SIZEOF_METADATA		(64 * 1024)
 
 /*
  * Do some preliminary setup for a kernel dump: initialize state for encryption,
  * if requested, and make sure that we have enough space on the dump device.
  *
  * We set things up so that the dump ends before the last sector of the dump
  * device, at which the trailing header is written.
  *
  *     +-----------+------+-----+----------------------------+------+
  *     |           | lhdr | key |    ... kernel dump ...     | thdr |
  *     +-----------+------+-----+----------------------------+------+
  *                   1 blk  opt <------- dump extent --------> 1 blk
  *
  * Dumps written using dump_append() start at the beginning of the extent.
  * Uncompressed dumps will use the entire extent, but compressed dumps typically
  * will not. The true length of the dump is recorded in the leading and trailing
  * headers once the dump has been completed.
  *
  * The dump device may provide a callback, in which case it will initialize
  * dumpoff and take care of laying out the headers.
  */
 int
 dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh)
 {
 	uint64_t dumpextent, span;
 	uint32_t keysize;
 	int error;
 
 #ifdef EKCD
 	error = kerneldumpcrypto_init(di->kdcrypto);
 	if (error != 0)
 		return (error);
 	keysize = kerneldumpcrypto_dumpkeysize(di->kdcrypto);
 #else
 	error = 0;
 	keysize = 0;
 #endif
 
 	if (di->dumper_start != NULL) {
 		error = di->dumper_start(di);
 	} else {
 		dumpextent = dtoh64(kdh->dumpextent);
 		span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize +
 		    keysize;
 		if (di->mediasize < span) {
 			if (di->kdcomp == NULL)
 				return (E2BIG);
 
 			/*
 			 * We don't yet know how much space the compressed dump
 			 * will occupy, so try to use the whole swap partition
 			 * (minus the first 64KB) in the hope that the
 			 * compressed dump will fit. If that doesn't turn out to
 			 * be enough, the bounds checking in dump_write()
 			 * will catch us and cause the dump to fail.
 			 */
 			dumpextent = di->mediasize - span + dumpextent;
 			kdh->dumpextent = htod64(dumpextent);
 		}
 
 		/*
 		 * The offset at which to begin writing the dump.
 		 */
 		di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize -
 		    dumpextent;
 	}
 	di->origdumpoff = di->dumpoff;
 	return (error);
 }
 
 static int
 _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     size_t length)
 {
 	int error;
 
 #ifdef EKCD
 	if (di->kdcrypto != NULL)
 		error = dump_encrypted_write(di, virtual, physical, di->dumpoff,
 		    length);
 	else
 #endif
 		error = dump_write(di, virtual, physical, di->dumpoff, length);
 	if (error == 0)
 		di->dumpoff += length;
 	return (error);
 }
 
 /*
  * Write to the dump device starting at dumpoff. When compression is enabled,
  * writes to the device will be performed using a callback that gets invoked
  * when the compression stream's output buffer is full.
  */
 int
 dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     size_t length)
 {
 	void *buf;
 
 	if (di->kdcomp != NULL) {
 		/* Bounce through a buffer to avoid CRC errors. */
 		if (length > di->maxiosize)
 			return (EINVAL);
 		buf = di->kdcomp->kdc_buf;
 		memmove(buf, virtual, length);
 		return (compressor_write(di->kdcomp->kdc_stream, buf, length));
 	}
 	return (_dump_append(di, virtual, physical, length));
 }
 
 /*
  * Write to the dump device at the specified offset.
  */
 int
 dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length)
 {
 	int error;
 
 	error = dump_check_bounds(di, offset, length);
 	if (error != 0)
 		return (error);
 	return (di->dumper(di->priv, virtual, physical, offset, length));
 }
 
 /*
  * Perform kernel dump finalization: flush the compression stream, if necessary,
  * write the leading and trailing kernel dump headers now that we know the true
  * length of the dump, and optionally write the encryption key following the
  * leading header.
  */
 int
 dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh)
 {
 	int error;
 
 	if (di->kdcomp != NULL) {
 		error = compressor_flush(di->kdcomp->kdc_stream);
 		if (error == EAGAIN) {
 			/* We have residual data in di->blockbuf. */
 			error = dump_write(di, di->blockbuf, 0, di->dumpoff,
 			    di->blocksize);
 			di->dumpoff += di->kdcomp->kdc_resid;
 			di->kdcomp->kdc_resid = 0;
 		}
 		if (error != 0)
 			return (error);
 
 		/*
 		 * We now know the size of the compressed dump, so update the
 		 * header accordingly and recompute parity.
 		 */
 		kdh->dumplength = htod64(di->dumpoff - di->origdumpoff);
 		kdh->parity = 0;
 		kdh->parity = kerneldump_parity(kdh);
 
 		compressor_reset(di->kdcomp->kdc_stream);
 	}
 
 	error = dump_write_headers(di, kdh);
 	if (error != 0)
 		return (error);
 
 	(void)dump_write(di, NULL, 0, 0, 0);
 	return (0);
 }
 
 void
 dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh,
     char *magic, uint32_t archver, uint64_t dumplen)
 {
 	size_t dstsize;
 
 	bzero(kdh, sizeof(*kdh));
 	strlcpy(kdh->magic, magic, sizeof(kdh->magic));
 	strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
 	kdh->version = htod32(KERNELDUMPVERSION);
 	kdh->architectureversion = htod32(archver);
 	kdh->dumplength = htod64(dumplen);
 	kdh->dumpextent = kdh->dumplength;
 	kdh->dumptime = htod64(time_second);
 #ifdef EKCD
 	kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto));
 #else
 	kdh->dumpkeysize = 0;
 #endif
 	kdh->blocksize = htod32(di->blocksize);
 	strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
 	dstsize = sizeof(kdh->versionstring);
 	if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize)
 		kdh->versionstring[dstsize - 2] = '\n';
 	if (panicstr != NULL)
 		strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
 	if (di->kdcomp != NULL)
 		kdh->compression = di->kdcomp->kdc_format;
 	kdh->parity = kerneldump_parity(kdh);
 }
 
 #ifdef DDB
 DB_SHOW_COMMAND(panic, db_show_panic)
 {
 
 	if (panicstr == NULL)
 		db_printf("panicstr not set\n");
 	else
 		db_printf("panic: %s\n", panicstr);
 }
 #endif
Index: stable/12/sys/kern/kern_timeout.c
===================================================================
--- stable/12/sys/kern/kern_timeout.c	(revision 355609)
+++ stable/12/sys/kern/kern_timeout.c	(revision 355610)
@@ -1,1675 +1,1676 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_callout_profiling.h"
 #include "opt_ddb.h"
 #if defined(__arm__)
 #include "opt_timer.h"
 #endif
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/domainset.h>
 #include <sys/file.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <machine/_inttypes.h>
 #endif
 
 #ifdef SMP
 #include <machine/cpu.h>
 #endif
 
 #ifndef NO_EVENTTIMERS
 DPCPU_DECLARE(sbintime_t, hardclocktime);
 #endif
 
 SDT_PROVIDER_DEFINE(callout_execute);
 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
 
 #ifdef CALLOUT_PROFILING
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
     "Average number of items examined per softclock call. Units = 1/1000");
 static int avg_gcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
     "Average number of Giant callouts made per softclock call. Units = 1/1000");
 static int avg_lockcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
     "Average number of lock callouts made per softclock call. Units = 1/1000");
 static int avg_mpcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
     "Average number of MP callouts made per softclock call. Units = 1/1000");
 static int avg_depth_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
     "Average number of direct callouts examined per callout_process call. "
     "Units = 1/1000");
 static int avg_lockcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
     "callout_process call. Units = 1/1000");
 static int avg_mpcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
     0, "Average number of MP direct callouts made per callout_process call. "
     "Units = 1/1000");
 #endif
 
 static int ncallout;
 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
     "Number of entries in callwheel and size of timeout() preallocation");
 
 #ifdef	RSS
 static int pin_default_swi = 1;
 static int pin_pcpu_swi = 1;
 #else
 static int pin_default_swi = 0;
 static int pin_pcpu_swi = 0;
 #endif
 
 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
     0, "Pin the per-CPU swis (except PCPU 0, which is also default");
 
 /*
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
-u_int callwheelsize, callwheelmask;
+static u_int __read_mostly callwheelsize;
+static u_int __read_mostly callwheelmask;
 
 /*
  * The callout cpu exec entities represent informations necessary for
  * describing the state of callouts currently running on the CPU and the ones
  * necessary for migrating callouts to the new callout cpu. In particular,
  * the first entry of the array cc_exec_entity holds informations for callout
  * running in SWI thread context, while the second one holds informations
  * for callout running directly from hardware interrupt context.
  * The cached informations are very important for deferring migration when
  * the migrating callout is already running.
  */
 struct cc_exec {
 	struct callout		*cc_curr;
 	void			(*cc_drain)(void *);
 #ifdef SMP
 	void			(*ce_migration_func)(void *);
 	void			*ce_migration_arg;
 	int			ce_migration_cpu;
 	sbintime_t		ce_migration_time;
 	sbintime_t		ce_migration_prec;
 #endif
 	bool			cc_cancel;
 	bool			cc_waiting;
 };
 
 /*
  * There is one struct callout_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
  */
 struct callout_cpu {
 	struct mtx_padalign	cc_lock;
 	struct cc_exec 		cc_exec_entity[2];
 	struct callout		*cc_next;
 	struct callout		*cc_callout;
 	struct callout_list	*cc_callwheel;
 	struct callout_tailq	cc_expireq;
 	struct callout_slist	cc_callfree;
 	sbintime_t		cc_firstevent;
 	sbintime_t		cc_lastscan;
 	void			*cc_cookie;
 	u_int			cc_bucket;
 	u_int			cc_inited;
 	char			cc_ktr_event_name[20];
 };
 
 #define	callout_migrating(c)	((c)->c_iflags & CALLOUT_DFRMIGRATION)
 
 #define	cc_exec_curr(cc, dir)		cc->cc_exec_entity[dir].cc_curr
 #define	cc_exec_drain(cc, dir)		cc->cc_exec_entity[dir].cc_drain
 #define	cc_exec_next(cc)		cc->cc_next
 #define	cc_exec_cancel(cc, dir)		cc->cc_exec_entity[dir].cc_cancel
 #define	cc_exec_waiting(cc, dir)	cc->cc_exec_entity[dir].cc_waiting
 #ifdef SMP
 #define	cc_migration_func(cc, dir)	cc->cc_exec_entity[dir].ce_migration_func
 #define	cc_migration_arg(cc, dir)	cc->cc_exec_entity[dir].ce_migration_arg
 #define	cc_migration_cpu(cc, dir)	cc->cc_exec_entity[dir].ce_migration_cpu
 #define	cc_migration_time(cc, dir)	cc->cc_exec_entity[dir].ce_migration_time
 #define	cc_migration_prec(cc, dir)	cc->cc_exec_entity[dir].ce_migration_prec
 
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
 #define	CC_SELF()	CC_CPU(PCPU_GET(cpuid))
 #else
 struct callout_cpu cc_cpu;
 #define	CC_CPU(cpu)	&cc_cpu
 #define	CC_SELF()	&cc_cpu
 #endif
 #define	CC_LOCK(cc)	mtx_lock_spin(&(cc)->cc_lock)
 #define	CC_UNLOCK(cc)	mtx_unlock_spin(&(cc)->cc_lock)
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 
-static int timeout_cpu;
+static int __read_mostly timeout_cpu;
 
 static void	callout_cpu_init(struct callout_cpu *cc, int cpu);
 static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
 		    int *mpcalls, int *lockcalls, int *gcalls,
 #endif
 		    int direct);
 
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
 /**
  * Locked by cc_lock:
  *   cc_curr         - If a callout is in progress, it is cc_curr.
  *                     If cc_curr is non-NULL, threads waiting in
  *                     callout_drain() will be woken up as soon as the
  *                     relevant callout completes.
  *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
  *                     guarantees that the current callout will not run.
  *                     The softclock() function sets this to 0 before it
  *                     drops callout_lock to acquire c_lock, and it calls
  *                     the handler only if curr_cancelled is still 0 after
  *                     cc_lock is successfully acquired.
  *   cc_waiting      - If a thread is waiting in callout_drain(), then
  *                     callout_wait is nonzero.  Set only when
  *                     cc_curr is non-NULL.
  */
 
 /*
  * Resets the execution entity tied to a specific callout cpu.
  */
 static void
 cc_cce_cleanup(struct callout_cpu *cc, int direct)
 {
 
 	cc_exec_curr(cc, direct) = NULL;
 	cc_exec_cancel(cc, direct) = false;
 	cc_exec_waiting(cc, direct) = false;
 #ifdef SMP
 	cc_migration_cpu(cc, direct) = CPUBLOCK;
 	cc_migration_time(cc, direct) = 0;
 	cc_migration_prec(cc, direct) = 0;
 	cc_migration_func(cc, direct) = NULL;
 	cc_migration_arg(cc, direct) = NULL;
 #endif
 }
 
 /*
  * Checks if migration is requested by a specific callout cpu.
  */
 static int
 cc_cce_migrating(struct callout_cpu *cc, int direct)
 {
 
 #ifdef SMP
 	return (cc_migration_cpu(cc, direct) != CPUBLOCK);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Kernel low level callwheel initialization
  * called on the BSP during kernel startup.
  */
 static void
 callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Calculate the size of the callout wheel and the preallocated
 	 * timeout() structures.
 	 * XXX: Clip callout to result of previous function of maxusers
 	 * maximum 384.  This is still huge, but acceptable.
 	 */
 	memset(CC_CPU(curcpu), 0, sizeof(cc_cpu));
 	ncallout = imin(16 + maxproc + maxfiles, 18508);
 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
 
 	/*
 	 * Calculate callout wheel size, should be next power of two higher
 	 * than 'ncallout'.
 	 */
 	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
 	/*
 	 * Fetch whether we're pinning the swi's or not.
 	 */
 	TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
 	TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
 
 	/*
 	 * Only BSP handles timeout(9) and receives a preallocation.
 	 *
 	 * XXX: Once all timeout(9) consumers are converted this can
 	 * be removed.
 	 */
 	timeout_cpu = PCPU_GET(cpuid);
 	cc = CC_CPU(timeout_cpu);
 	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
 	    M_CALLOUT, M_WAITOK);
 	callout_cpu_init(cc, timeout_cpu);
 }
 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
 
 /*
  * Initialize the per-cpu callout structures.
  */
 static void
 callout_cpu_init(struct callout_cpu *cc, int cpu)
 {
 	struct callout *c;
 	int i;
 
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
 	SLIST_INIT(&cc->cc_callfree);
 	cc->cc_inited = 1;
 	cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) *
 	    callwheelsize, M_CALLOUT,
 	    DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK);
 	for (i = 0; i < callwheelsize; i++)
 		LIST_INIT(&cc->cc_callwheel[i]);
 	TAILQ_INIT(&cc->cc_expireq);
 	cc->cc_firstevent = SBT_MAX;
 	for (i = 0; i < 2; i++)
 		cc_cce_cleanup(cc, i);
 	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
 	    "callwheel cpu %d", cpu);
 	if (cc->cc_callout == NULL)	/* Only BSP handles timeout(9) */
 		return;
 	for (i = 0; i < ncallout; i++) {
 		c = &cc->cc_callout[i];
 		callout_init(c, 0);
 		c->c_iflags = CALLOUT_LOCAL_ALLOC;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
 #ifdef SMP
 /*
  * Switches the cpu tied to a specific callout.
  * The function expects a locked incoming callout cpu and returns with
  * locked outcoming callout cpu.
  */
 static struct callout_cpu *
 callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
 {
 	struct callout_cpu *new_cc;
 
 	MPASS(c != NULL && cc != NULL);
 	CC_LOCK_ASSERT(cc);
 
 	/*
 	 * Avoid interrupts and preemption firing after the callout cpu
 	 * is blocked in order to avoid deadlocks as the new thread
 	 * may be willing to acquire the callout cpu lock.
 	 */
 	c->c_cpu = CPUBLOCK;
 	spinlock_enter();
 	CC_UNLOCK(cc);
 	new_cc = CC_CPU(new_cpu);
 	CC_LOCK(new_cc);
 	spinlock_exit();
 	c->c_cpu = new_cpu;
 	return (new_cc);
 }
 #endif
 
 /*
  * Start standard softclock thread.
  */
 static void
 start_softclock(void *dummy)
 {
 	struct callout_cpu *cc;
 	char name[MAXCOMLEN];
 #ifdef SMP
 	int cpu;
 	struct intr_event *ie;
 #endif
 
 	cc = CC_CPU(timeout_cpu);
 	snprintf(name, sizeof(name), "clock (%d)", timeout_cpu);
 	if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK,
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
 	if (pin_default_swi &&
 	    (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) {
 		printf("%s: timeout clock couldn't be pinned to cpu %d\n",
 		    __func__,
 		    timeout_cpu);
 	}
 
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		if (cpu == timeout_cpu)
 			continue;
 		cc = CC_CPU(cpu);
 		cc->cc_callout = NULL;	/* Only BSP handles timeout(9). */
 		callout_cpu_init(cc, cpu);
 		snprintf(name, sizeof(name), "clock (%d)", cpu);
 		ie = NULL;
 		if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
 		if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) {
 			printf("%s: per-cpu clock couldn't be pinned to "
 			    "cpu %d\n",
 			    __func__,
 			    cpu);
 		}
 	}
 #endif
 }
 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
 
 #define	CC_HASH_SHIFT	8
 
 static inline u_int
 callout_hash(sbintime_t sbt)
 {
 
 	return (sbt >> (32 - CC_HASH_SHIFT));
 }
 
 static inline u_int
 callout_get_bucket(sbintime_t sbt)
 {
 
 	return (callout_hash(sbt) & callwheelmask);
 }
 
 void
 callout_process(sbintime_t now)
 {
 	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t first, last, max, tmp_max;
 	uint32_t lookahead;
 	u_int firstb, lastb, nowb;
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
 
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
 	cc->cc_lastscan = now;
 	nowb = callout_hash(now);
 
 	/* Compute the last bucket and minimum time of the bucket after it. */
 	if (nowb == firstb)
 		lookahead = (SBT_1S / 16);
 	else if (nowb - firstb == 1)
 		lookahead = (SBT_1S / 8);
 	else
 		lookahead = (SBT_1S / 2);
 	first = last = now;
 	first += (lookahead / 2);
 	last += lookahead;
 	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
 	lastb = callout_hash(last) - 1;
 	max = last;
 
 	/*
 	 * Check if we wrapped around the entire wheel from the last scan.
 	 * In case, we need to scan entirely the wheel for pending callouts.
 	 */
 	if (lastb - firstb >= callwheelsize) {
 		lastb = firstb + callwheelsize - 1;
 		if (nowb - firstb >= callwheelsize)
 			nowb = lastb;
 	}
 
 	/* Iterate callwheel from firstb to nowb and then up to lastb. */
 	do {
 		sc = &cc->cc_callwheel[firstb & callwheelmask];
 		tmp = LIST_FIRST(sc);
 		while (tmp != NULL) {
 			/* Run the callout if present time within allowed. */
 			if (tmp->c_time <= now) {
 				/*
 				 * Consumer told us the callout may be run
 				 * directly from hardware interrupt context.
 				 */
 				if (tmp->c_iflags & CALLOUT_DIRECT) {
 #ifdef CALLOUT_PROFILING
 					++depth_dir;
 #endif
 					cc_exec_next(cc) =
 					    LIST_NEXT(tmp, c_links.le);
 					cc->cc_bucket = firstb & callwheelmask;
 					LIST_REMOVE(tmp, c_links.le);
 					softclock_call_cc(tmp, cc,
 #ifdef CALLOUT_PROFILING
 					    &mpcalls_dir, &lockcalls_dir, NULL,
 #endif
 					    1);
 					tmp = cc_exec_next(cc);
 					cc_exec_next(cc) = NULL;
 				} else {
 					tmpn = LIST_NEXT(tmp, c_links.le);
 					LIST_REMOVE(tmp, c_links.le);
 					TAILQ_INSERT_TAIL(&cc->cc_expireq,
 					    tmp, c_links.tqe);
 					tmp->c_iflags |= CALLOUT_PROCESSED;
 					tmp = tmpn;
 				}
 				continue;
 			}
 			/* Skip events from distant future. */
 			if (tmp->c_time >= max)
 				goto next;
 			/*
 			 * Event minimal time is bigger than present maximal
 			 * time, so it cannot be aggregated.
 			 */
 			if (tmp->c_time > last) {
 				lastb = nowb;
 				goto next;
 			}
 			/* Update first and last time, respecting this event. */
 			if (tmp->c_time < first)
 				first = tmp->c_time;
 			tmp_max = tmp->c_time + tmp->c_precision;
 			if (tmp_max < last)
 				last = tmp_max;
 next:
 			tmp = LIST_NEXT(tmp, c_links.le);
 		}
 		/* Proceed with the next bucket. */
 		firstb++;
 		/*
 		 * Stop if we looked after present time and found
 		 * some event we can't execute at now.
 		 * Stop if we looked far enough into the future.
 		 */
 	} while (((int)(firstb - lastb)) <= 0);
 	cc->cc_firstevent = last;
 #ifndef NO_EVENTTIMERS
 	cpu_new_callout(curcpu, last, first);
 #endif
 #ifdef CALLOUT_PROFILING
 	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
 	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
 	 */
 	if (!TAILQ_EMPTY(&cc->cc_expireq))
 		swi_sched(cc->cc_cookie, 0);
 }
 
 static struct callout_cpu *
 callout_lock(struct callout *c)
 {
 	struct callout_cpu *cc;
 	int cpu;
 
 	for (;;) {
 		cpu = c->c_cpu;
 #ifdef SMP
 		if (cpu == CPUBLOCK) {
 			while (c->c_cpu == CPUBLOCK)
 				cpu_spinwait();
 			continue;
 		}
 #endif
 		cc = CC_CPU(cpu);
 		CC_LOCK(cc);
 		if (cpu == c->c_cpu)
 			break;
 		CC_UNLOCK(cc);
 	}
 	return (cc);
 }
 
 static void
 callout_cc_add(struct callout *c, struct callout_cpu *cc,
     sbintime_t sbt, sbintime_t precision, void (*func)(void *),
     void *arg, int cpu, int flags)
 {
 	int bucket;
 
 	CC_LOCK_ASSERT(cc);
 	if (sbt < cc->cc_lastscan)
 		sbt = cc->cc_lastscan;
 	c->c_arg = arg;
 	c->c_iflags |= CALLOUT_PENDING;
 	c->c_iflags &= ~CALLOUT_PROCESSED;
 	c->c_flags |= CALLOUT_ACTIVE;
 	if (flags & C_DIRECT_EXEC)
 		c->c_iflags |= CALLOUT_DIRECT;
 	c->c_func = func;
 	c->c_time = sbt;
 	c->c_precision = precision;
 	bucket = callout_get_bucket(c->c_time);
 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
 	    c, (int)(c->c_precision >> 32),
 	    (u_int)(c->c_precision & 0xffffffff));
 	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
 	if (cc->cc_bucket == bucket)
 		cc_exec_next(cc) = c;
 #ifndef NO_EVENTTIMERS
 	/*
 	 * Inform the eventtimers(4) subsystem there's a new callout
 	 * that has been inserted, but only if really required.
 	 */
 	if (SBT_MAX - c->c_time < c->c_precision)
 		c->c_precision = SBT_MAX - c->c_time;
 	sbt = c->c_time + c->c_precision;
 	if (sbt < cc->cc_firstevent) {
 		cc->cc_firstevent = sbt;
 		cpu_new_callout(cpu, sbt, c->c_time);
 	}
 #endif
 }
 
 static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
 	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0)
 		return;
 	c->c_func = NULL;
 	SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 }
 
 static void
 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
     int *mpcalls, int *lockcalls, int *gcalls,
 #endif
     int direct)
 {
 	struct rm_priotracker tracker;
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
 	uintptr_t lock_status;
 	int c_iflags;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
 	void *new_arg;
 	int flags, new_cpu;
 	sbintime_t new_prec, new_time;
 #endif
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
 	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
 	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
 
 	KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
 	    ("softclock_call_cc: pend %p %x", c, c->c_iflags));
 	KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
 	    ("softclock_call_cc: act %p %x", c, c->c_flags));
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	lock_status = 0;
 	if (c->c_flags & CALLOUT_SHAREDLOCK) {
 		if (class == &lock_class_rm)
 			lock_status = (uintptr_t)&tracker;
 		else
 			lock_status = 1;
 	}
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
 	c_iflags = c->c_iflags;
 	if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
 		c->c_iflags = CALLOUT_LOCAL_ALLOC;
 	else
 		c->c_iflags &= ~CALLOUT_PENDING;
 	
 	cc_exec_curr(cc, direct) = c;
 	cc_exec_cancel(cc, direct) = false;
 	cc_exec_drain(cc, direct) = NULL;
 	CC_UNLOCK(cc);
 	if (c_lock != NULL) {
 		class->lc_lock(c_lock, lock_status);
 		/*
 		 * The callout may have been cancelled
 		 * while we switched locks.
 		 */
 		if (cc_exec_cancel(cc, direct)) {
 			class->lc_unlock(c_lock);
 			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc_exec_cancel(cc, direct) = true;
 		if (c_lock == &Giant.lock_object) {
 #ifdef CALLOUT_PROFILING
 			(*gcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
 			    c, c_func, c_arg);
 		} else {
 #ifdef CALLOUT_PROFILING
 			(*lockcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
 	} else {
 #ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 #endif
 		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
 		    c, c_func, c_arg);
 	}
 	KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
 	    "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	sbt1 = sbinuptime();
 #endif
 	THREAD_NO_SLEEPING();
 	SDT_PROBE1(callout_execute, , , callout__start, c);
 	c_func(c_arg);
 	SDT_PROBE1(callout_execute, , , callout__end, c);
 	THREAD_SLEEPING_OK();
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	sbt2 = sbinuptime();
 	sbt2 -= sbt1;
 	if (sbt2 > maxdt) {
 		if (lastfunc != c_func || sbt2 > maxdt * 2) {
 			ts2 = sbttots(sbt2);
 			printf(
 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
 		}
 		maxdt = sbt2;
 		lastfunc = c_func;
 	}
 #endif
 	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
 	if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
 		class->lc_unlock(c_lock);
 skip:
 	CC_LOCK(cc);
 	KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
 	cc_exec_curr(cc, direct) = NULL;
 	if (cc_exec_drain(cc, direct)) {
 		void (*drain)(void *);
 		
 		drain = cc_exec_drain(cc, direct);
 		cc_exec_drain(cc, direct) = NULL;
 		CC_UNLOCK(cc);
 		drain(c_arg);
 		CC_LOCK(cc);
 	}
 	if (cc_exec_waiting(cc, direct)) {
 		/*
 		 * There is someone waiting for the
 		 * callout to complete.
 		 * If the callout was scheduled for
 		 * migration just cancel it.
 		 */
 		if (cc_cce_migrating(cc, direct)) {
 			cc_cce_cleanup(cc, direct);
 
 			/*
 			 * It should be assert here that the callout is not
 			 * destroyed but that is not easy.
 			 */
 			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 		}
 		cc_exec_waiting(cc, direct) = false;
 		CC_UNLOCK(cc);
 		wakeup(&cc_exec_waiting(cc, direct));
 		CC_LOCK(cc);
 	} else if (cc_cce_migrating(cc, direct)) {
 		KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
 		    ("Migrating legacy callout %p", c));
 #ifdef SMP
 		/*
 		 * If the callout was scheduled for
 		 * migration just perform it now.
 		 */
 		new_cpu = cc_migration_cpu(cc, direct);
 		new_time = cc_migration_time(cc, direct);
 		new_prec = cc_migration_prec(cc, direct);
 		new_func = cc_migration_func(cc, direct);
 		new_arg = cc_migration_arg(cc, direct);
 		cc_cce_cleanup(cc, direct);
 
 		/*
 		 * It should be assert here that the callout is not destroyed
 		 * but that is not easy.
 		 *
 		 * As first thing, handle deferred callout stops.
 		 */
 		if (!callout_migrating(c)) {
 			CTR3(KTR_CALLOUT,
 			     "deferred cancelled %p func %p arg %p",
 			     c, new_func, new_arg);
 			callout_cc_del(c, cc);
 			return;
 		}
 		c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 
 		new_cc = callout_cpu_switch(c, cc, new_cpu);
 		flags = (direct) ? C_DIRECT_EXEC : 0;
 		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
 		    new_arg, new_cpu, flags);
 		CC_UNLOCK(new_cc);
 		CC_LOCK(cc);
 #else
 		panic("migration should not happen");
 #endif
 	}
 	/*
 	 * If the current callout is locally allocated (from
 	 * timeout(9)) then put it on the freelist.
 	 *
 	 * Note: we need to check the cached copy of c_iflags because
 	 * if it was not local, then it's not safe to deref the
 	 * callout pointer.
 	 */
 	KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
 	    c->c_iflags == CALLOUT_LOCAL_ALLOC,
 	    ("corrupted callout"));
 	if (c_iflags & CALLOUT_LOCAL_ALLOC)
 		callout_cc_del(c, cc);
 }
 
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
  * used in this implementation was published by G. Varghese and T. Lauck in
  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
  * the Efficient Implementation of a Timer Facility" in the Proceedings of
  * the 11th ACM Annual Symposium on Operating Systems Principles,
  * Austin, Texas Nov 1987.
  */
 
 /*
  * Software (low priority) clock interrupt.
  * Run periodic events from timeout queue.
  */
 void
 softclock(void *arg)
 {
 	struct callout_cpu *cc;
 	struct callout *c;
 #ifdef CALLOUT_PROFILING
 	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
 #endif
 
 	cc = (struct callout_cpu *)arg;
 	CC_LOCK(cc);
 	while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
 		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		softclock_call_cc(c, cc,
 #ifdef CALLOUT_PROFILING
 		    &mpcalls, &lockcalls, &gcalls,
 #endif
 		    0);
 #ifdef CALLOUT_PROFILING
 		++depth;
 #endif
 	}
 #ifdef CALLOUT_PROFILING
 	avg_depth += (depth * 1000 - avg_depth) >> 8;
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
 	CC_UNLOCK(cc);
 }
 
 /*
  * timeout --
  *	Execute a function after a specified length of time.
  *
  * untimeout --
  *	Cancel previous timeout function call.
  *
  * callout_handle_init --
  *	Initialize a handle so that using it with untimeout is benign.
  *
  *	See AT&T BCI Driver Reference Manual for specification.  This
  *	implementation differs from that one in that although an
  *	identification value is returned from timeout, the original
  *	arguments to timeout as well as the identifier are used to
  *	identify entries for untimeout.
  */
 struct callout_handle
 timeout(timeout_t *ftn, void *arg, int to_ticks)
 {
 	struct callout_cpu *cc;
 	struct callout *new;
 	struct callout_handle handle;
 
 	cc = CC_CPU(timeout_cpu);
 	CC_LOCK(cc);
 	/* Fill in the next free callout structure. */
 	new = SLIST_FIRST(&cc->cc_callfree);
 	if (new == NULL)
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
 	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
 	callout_reset(new, to_ticks, ftn, arg);
 	handle.callout = new;
 	CC_UNLOCK(cc);
 
 	return (handle);
 }
 
 void
 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Check for a handle that was initialized
 	 * by callout_handle_init, but never used
 	 * for a real timeout.
 	 */
 	if (handle.callout == NULL)
 		return;
 
 	cc = callout_lock(handle.callout);
 	if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
 		callout_stop(handle.callout);
 	CC_UNLOCK(cc);
 }
 
 void
 callout_handle_init(struct callout_handle *handle)
 {
 	handle->callout = NULL;
 }
 
 void
 callout_when(sbintime_t sbt, sbintime_t precision, int flags,
     sbintime_t *res, sbintime_t *prec_res)
 {
 	sbintime_t to_sbt, to_pr;
 
 	if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) {
 		*res = sbt;
 		*prec_res = precision;
 		return;
 	}
 	if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt)
 		sbt = tick_sbt;
 	if ((flags & C_HARDCLOCK) != 0 ||
 #ifdef NO_EVENTTIMERS
 	    sbt >= sbt_timethreshold) {
 		to_sbt = getsbinuptime();
 
 		/* Add safety belt for the case of hz > 1000. */
 		to_sbt += tc_tick_sbt - tick_sbt;
 #else
 	    sbt >= sbt_tickthreshold) {
 		/*
 		 * Obtain the time of the last hardclock() call on
 		 * this CPU directly from the kern_clocksource.c.
 		 * This value is per-CPU, but it is equal for all
 		 * active ones.
 		 */
 #ifdef __LP64__
 		to_sbt = DPCPU_GET(hardclocktime);
 #else
 		spinlock_enter();
 		to_sbt = DPCPU_GET(hardclocktime);
 		spinlock_exit();
 #endif
 #endif
 		if (cold && to_sbt == 0)
 			to_sbt = sbinuptime();
 		if ((flags & C_HARDCLOCK) == 0)
 			to_sbt += tick_sbt;
 	} else
 		to_sbt = sbinuptime();
 	if (SBT_MAX - to_sbt < sbt)
 		to_sbt = SBT_MAX;
 	else
 		to_sbt += sbt;
 	*res = to_sbt;
 	to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
 	    sbt >> C_PRELGET(flags));
 	*prec_res = to_pr > precision ? to_pr : precision;
 }
 
 /*
  * New interface; clients allocate their own callout structures.
  *
  * callout_reset() - establish or change a timeout
  * callout_stop() - disestablish a timeout
  * callout_init() - initialize a callout structure so that it can
  *	safely be passed to callout_reset() and callout_stop()
  *
  * <sys/callout.h> defines three convenience macros:
  *
  * callout_active() - returns truth if callout has not been stopped,
  *	drained, or deactivated since the last time the callout was
  *	reset.
  * callout_pending() - returns truth if callout is still waiting for timeout
  * callout_deactivate() - marks the callout as having been serviced
  */
 int
 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec,
     void (*ftn)(void *), void *arg, int cpu, int flags)
 {
 	sbintime_t to_sbt, precision;
 	struct callout_cpu *cc;
 	int cancelled, direct;
 	int ignore_cpu=0;
 
 	cancelled = 0;
 	if (cpu == -1) {
 		ignore_cpu = 1;
 	} else if ((cpu >= MAXCPU) ||
 		   ((CC_CPU(cpu))->cc_inited == 0)) {
 		/* Invalid CPU spec */
 		panic("Invalid CPU in callout %d", cpu);
 	}
 	callout_when(sbt, prec, flags, &to_sbt, &precision);
 
 	/* 
 	 * This flag used to be added by callout_cc_add, but the
 	 * first time you call this we could end up with the
 	 * wrong direct flag if we don't do it before we add.
 	 */
 	if (flags & C_DIRECT_EXEC) {
 		direct = 1;
 	} else {
 		direct = 0;
 	}
 	KASSERT(!direct || c->c_lock == NULL,
 	    ("%s: direct callout %p has lock", __func__, c));
 	cc = callout_lock(c);
 	/*
 	 * Don't allow migration of pre-allocated callouts lest they
 	 * become unbalanced or handle the case where the user does
 	 * not care. 
 	 */
 	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
 	    ignore_cpu) {
 		cpu = c->c_cpu;
 	}
 
 	if (cc_exec_curr(cc, direct) == c) {
 		/*
 		 * We're being asked to reschedule a callout which is
 		 * currently in progress.  If there is a lock then we
 		 * can cancel the callout if it has not really started.
 		 */
 		if (c->c_lock != NULL && !cc_exec_cancel(cc, direct))
 			cancelled = cc_exec_cancel(cc, direct) = true;
 		if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
 			 */
 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
 			    cancelled ? "cancelled" : "failed to cancel",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 #ifdef SMP
 		if (callout_migrating(c)) {
 			/* 
 			 * This only occurs when a second callout_reset_sbt_on
 			 * is made after a previous one moved it into
 			 * deferred migration (below). Note we do *not* change
 			 * the prev_cpu even though the previous target may
 			 * be different.
 			 */
 			cc_migration_cpu(cc, direct) = cpu;
 			cc_migration_time(cc, direct) = to_sbt;
 			cc_migration_prec(cc, direct) = precision;
 			cc_migration_func(cc, direct) = ftn;
 			cc_migration_arg(cc, direct) = arg;
 			cancelled = 1;
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 #endif
 	}
 	if (c->c_iflags & CALLOUT_PENDING) {
 		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
 			if (cc_exec_next(cc) == c)
 				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		}
 		cancelled = 1;
 		c->c_iflags &= ~ CALLOUT_PENDING;
 		c->c_flags &= ~ CALLOUT_ACTIVE;
 	}
 
 #ifdef SMP
 	/*
 	 * If the callout must migrate try to perform it immediately.
 	 * If the callout is currently running, just defer the migration
 	 * to a more appropriate moment.
 	 */
 	if (c->c_cpu != cpu) {
 		if (cc_exec_curr(cc, direct) == c) {
 			/* 
 			 * Pending will have been removed since we are
 			 * actually executing the callout on another
 			 * CPU. That callout should be waiting on the
 			 * lock the caller holds. If we set both
 			 * active/and/pending after we return and the
 			 * lock on the executing callout proceeds, it
 			 * will then see pending is true and return.
 			 * At the return from the actual callout execution
 			 * the migration will occur in softclock_call_cc
 			 * and this new callout will be placed on the 
 			 * new CPU via a call to callout_cpu_switch() which
 			 * will get the lock on the right CPU followed
 			 * by a call callout_cc_add() which will add it there.
 			 * (see above in softclock_call_cc()).
 			 */
 			cc_migration_cpu(cc, direct) = cpu;
 			cc_migration_time(cc, direct) = to_sbt;
 			cc_migration_prec(cc, direct) = precision;
 			cc_migration_func(cc, direct) = ftn;
 			cc_migration_arg(cc, direct) = arg;
 			c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING);
 			c->c_flags |= CALLOUT_ACTIVE;
 			CTR6(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
 			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 			    (u_int)(to_sbt & 0xffffffff), cpu);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 		cc = callout_cpu_switch(c, cc, cpu);
 	}
 #endif
 
 	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
 	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
 	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 	    (u_int)(to_sbt & 0xffffffff));
 	CC_UNLOCK(cc);
 
 	return (cancelled);
 }
 
 /*
  * Common idioms that can be optimized in the future.
  */
 int
 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
 {
 	return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
 }
 
 int
 callout_schedule(struct callout *c, int to_ticks)
 {
 	return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
 }
 
 int
 _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *))
 {
 	struct callout_cpu *cc, *old_cc;
 	struct lock_class *class;
 	int direct, sq_locked, use_lock;
 	int cancelled, not_on_a_list;
 
 	if ((flags & CS_DRAIN) != 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock,
 		    "calling %s", __func__);
 
 	/*
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
 	if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
 			use_lock = 1;
 			class = LOCK_CLASS(c->c_lock);
 			class->lc_assert(c->c_lock, LA_XLOCKED);
 		}
 	} else
 		use_lock = 0;
 	if (c->c_iflags & CALLOUT_DIRECT) {
 		direct = 1;
 	} else {
 		direct = 0;
 	}
 	sq_locked = 0;
 	old_cc = NULL;
 again:
 	cc = callout_lock(c);
 
 	if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
 	    (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
 	    ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
 		/*
 		 * Special case where this slipped in while we
 		 * were migrating *as* the callout is about to
 		 * execute. The caller probably holds the lock
 		 * the callout wants.
 		 *
 		 * Get rid of the migration first. Then set
 		 * the flag that tells this code *not* to
 		 * try to remove it from any lists (its not
 		 * on one yet). When the callout wheel runs,
 		 * it will ignore this callout.
 		 */
 		c->c_iflags &= ~CALLOUT_PENDING;
 		c->c_flags &= ~CALLOUT_ACTIVE;
 		not_on_a_list = 1;
 	} else {
 		not_on_a_list = 0;
 	}
 
 	/*
 	 * If the callout was migrating while the callout cpu lock was
 	 * dropped,  just drop the sleepqueue lock and check the states
 	 * again.
 	 */
 	if (sq_locked != 0 && cc != old_cc) {
 #ifdef SMP
 		CC_UNLOCK(cc);
 		sleepq_release(&cc_exec_waiting(old_cc, direct));
 		sq_locked = 0;
 		old_cc = NULL;
 		goto again;
 #else
 		panic("migration should not happen");
 #endif
 	}
 
 	/*
 	 * If the callout is running, try to stop it or drain it.
 	 */
 	if (cc_exec_curr(cc, direct) == c) {
 		/*
 		 * Succeed we to stop it or not, we must clear the
 		 * active flag - this is what API users expect.  If we're
 		 * draining and the callout is currently executing, first wait
 		 * until it finishes.
 		 */
 		if ((flags & CS_DRAIN) == 0)
 			c->c_flags &= ~CALLOUT_ACTIVE;
 
 		if ((flags & CS_DRAIN) != 0) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
 			 * just wait for the current invocation to
 			 * finish.
 			 */
 			while (cc_exec_curr(cc, direct) == c) {
 				/*
 				 * Use direct calls to sleepqueue interface
 				 * instead of cv/msleep in order to avoid
 				 * a LOR between cc_lock and sleepqueue
 				 * chain spinlocks.  This piece of code
 				 * emulates a msleep_spin() call actually.
 				 *
 				 * If we already have the sleepqueue chain
 				 * locked, then we can safely block.  If we
 				 * don't already have it locked, however,
 				 * we have to drop the cc_lock to lock
 				 * it.  This opens several races, so we
 				 * restart at the beginning once we have
 				 * both locks.  If nothing has changed, then
 				 * we will end up back here with sq_locked
 				 * set.
 				 */
 				if (!sq_locked) {
 					CC_UNLOCK(cc);
 					sleepq_lock(
 					    &cc_exec_waiting(cc, direct));
 					sq_locked = 1;
 					old_cc = cc;
 					goto again;
 				}
 
 				/*
 				 * Migration could be cancelled here, but
 				 * as long as it is still not sure when it
 				 * will be packed up, just let softclock()
 				 * take care of it.
 				 */
 				cc_exec_waiting(cc, direct) = true;
 				DROP_GIANT();
 				CC_UNLOCK(cc);
 				sleepq_add(
 				    &cc_exec_waiting(cc, direct),
 				    &cc->cc_lock.lock_object, "codrain",
 				    SLEEPQ_SLEEP, 0);
 				sleepq_wait(
 				    &cc_exec_waiting(cc, direct),
 					     0);
 				sq_locked = 0;
 				old_cc = NULL;
 
 				/* Reacquire locks previously released. */
 				PICKUP_GIANT();
 				CC_LOCK(cc);
 			}
 			c->c_flags &= ~CALLOUT_ACTIVE;
 		} else if (use_lock &&
 			   !cc_exec_cancel(cc, direct) && (drain == NULL)) {
 			
 			/*
 			 * The current callout is waiting for its
 			 * lock which we hold.  Cancel the callout
 			 * and return.  After our caller drops the
 			 * lock, the callout will be skipped in
 			 * softclock(). This *only* works with a
 			 * callout_stop() *not* callout_drain() or
 			 * callout_async_drain().
 			 */
 			cc_exec_cancel(cc, direct) = true;
 			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			KASSERT(!cc_cce_migrating(cc, direct),
 			    ("callout wrongly scheduled for migration"));
 			if (callout_migrating(c)) {
 				c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 #ifdef SMP
 				cc_migration_cpu(cc, direct) = CPUBLOCK;
 				cc_migration_time(cc, direct) = 0;
 				cc_migration_prec(cc, direct) = 0;
 				cc_migration_func(cc, direct) = NULL;
 				cc_migration_arg(cc, direct) = NULL;
 #endif
 			}
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
 		} else if (callout_migrating(c)) {
 			/*
 			 * The callout is currently being serviced
 			 * and the "next" callout is scheduled at
 			 * its completion with a migration. We remove
 			 * the migration flag so it *won't* get rescheduled,
 			 * but we can't stop the one thats running so
 			 * we return 0.
 			 */
 			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 #ifdef SMP
 			/* 
 			 * We can't call cc_cce_cleanup here since
 			 * if we do it will remove .ce_curr and
 			 * its still running. This will prevent a
 			 * reschedule of the callout when the 
 			 * execution completes.
 			 */
 			cc_migration_cpu(cc, direct) = CPUBLOCK;
 			cc_migration_time(cc, direct) = 0;
 			cc_migration_prec(cc, direct) = 0;
 			cc_migration_func(cc, direct) = NULL;
 			cc_migration_arg(cc, direct) = NULL;
 #endif
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
  			if (drain) {
 				cc_exec_drain(cc, direct) = drain;
 			}
 			CC_UNLOCK(cc);
 			return ((flags & CS_EXECUTING) != 0);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
 		if (drain) {
 			cc_exec_drain(cc, direct) = drain;
 		}
 		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
 		cancelled = ((flags & CS_EXECUTING) != 0);
 	} else
 		cancelled = 1;
 
 	if (sq_locked)
 		sleepq_release(&cc_exec_waiting(cc, direct));
 
 	if ((c->c_iflags & CALLOUT_PENDING) == 0) {
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
 		/*
 		 * For not scheduled and not executing callout return
 		 * negative value.
 		 */
 		if (cc_exec_curr(cc, direct) != c)
 			cancelled = -1;
 		CC_UNLOCK(cc);
 		return (cancelled);
 	}
 
 	c->c_iflags &= ~CALLOUT_PENDING;
 	c->c_flags &= ~CALLOUT_ACTIVE;
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
 	if (not_on_a_list == 0) {
 		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
 			if (cc_exec_next(cc) == c)
 				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		}
 	}
 	callout_cc_del(c, cc);
 	CC_UNLOCK(cc);
 	return (cancelled);
 }
 
 void
 callout_init(struct callout *c, int mpsafe)
 {
 	bzero(c, sizeof *c);
 	if (mpsafe) {
 		c->c_lock = NULL;
 		c->c_iflags = CALLOUT_RETURNUNLOCKED;
 	} else {
 		c->c_lock = &Giant.lock_object;
 		c->c_iflags = 0;
 	}
 	c->c_cpu = timeout_cpu;
 }
 
 void
 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
 {
 	bzero(c, sizeof *c);
 	c->c_lock = lock;
 	KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
 	    ("callout_init_lock: bad flags %d", flags));
 	KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
 	    ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
 	KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
 	    (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
 	    __func__));
 	c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
 	c->c_cpu = timeout_cpu;
 }
 
 #ifdef APM_FIXUP_CALLTODO
 /* 
  * Adjust the kernel calltodo timeout list.  This routine is used after 
  * an APM resume to recalculate the calltodo timer list values with the 
  * number of hz's we have been sleeping.  The next hardclock() will detect 
  * that there are fired timers and run softclock() to execute them.
  *
  * Please note, I have not done an exhaustive analysis of what code this
  * might break.  I am motivated to have my select()'s and alarm()'s that
  * have expired during suspend firing upon resume so that the applications
  * which set the timer can do the maintanence the timer was for as close
  * as possible to the originally intended time.  Testing this code for a 
  * week showed that resuming from a suspend resulted in 22 to 25 timers 
  * firing, which seemed independent on whether the suspend was 2 hours or
  * 2 days.  Your milage may vary.   - Ken Key <key@cs.utk.edu>
  */
 void
 adjust_timeout_calltodo(struct timeval *time_change)
 {
 	struct callout *p;
 	unsigned long delta_ticks;
 
 	/* 
 	 * How many ticks were we asleep?
 	 * (stolen from tvtohz()).
 	 */
 
 	/* Don't do anything */
 	if (time_change->tv_sec < 0)
 		return;
 	else if (time_change->tv_sec <= LONG_MAX / 1000000)
 		delta_ticks = howmany(time_change->tv_sec * 1000000 +
 		    time_change->tv_usec, tick) + 1;
 	else if (time_change->tv_sec <= LONG_MAX / hz)
 		delta_ticks = time_change->tv_sec * hz +
 		    howmany(time_change->tv_usec, tick) + 1;
 	else
 		delta_ticks = LONG_MAX;
 
 	if (delta_ticks > INT_MAX)
 		delta_ticks = INT_MAX;
 
 	/* 
 	 * Now rip through the timer calltodo list looking for timers
 	 * to expire.
 	 */
 
 	/* don't collide with softclock() */
 	CC_LOCK(cc);
 	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
 		p->c_time -= delta_ticks;
 
 		/* Break if the timer had more time on it than delta_ticks */
 		if (p->c_time > 0)
 			break;
 
 		/* take back the ticks the timer didn't use (p->c_time <= 0) */
 		delta_ticks = -p->c_time;
 	}
 	CC_UNLOCK(cc);
 
 	return;
 }
 #endif /* APM_FIXUP_CALLTODO */
 
 static int
 flssbt(sbintime_t sbt)
 {
 
 	sbt += (uint64_t)sbt >> 1;
 	if (sizeof(long) >= sizeof(sbintime_t))
 		return (flsl(sbt));
 	if (sbt >= SBT_1S)
 		return (flsl(((uint64_t)sbt) >> 32) + 32);
 	return (flsl(sbt));
 }
 
 /*
  * Dump immediate statistic snapshot of the scheduled callouts.
  */
 static int
 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
 {
 	struct callout *tmp;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
 	int ct[64], cpr[64], ccpbk[32];
 	int error, val, i, count, tcum, pcum, maxc, c, medc;
 #ifdef SMP
 	int cpu;
 #endif
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	count = maxc = 0;
 	st = spr = maxt = maxpr = 0;
 	bzero(ccpbk, sizeof(ccpbk));
 	bzero(ct, sizeof(ct));
 	bzero(cpr, sizeof(cpr));
 	now = sbinuptime();
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		cc = CC_CPU(cpu);
 #else
 		cc = CC_CPU(timeout_cpu);
 #endif
 		CC_LOCK(cc);
 		for (i = 0; i < callwheelsize; i++) {
 			sc = &cc->cc_callwheel[i];
 			c = 0;
 			LIST_FOREACH(tmp, sc, c_links.le) {
 				c++;
 				t = tmp->c_time - now;
 				if (t < 0)
 					t = 0;
 				st += t / SBT_1US;
 				spr += tmp->c_precision / SBT_1US;
 				if (t > maxt)
 					maxt = t;
 				if (tmp->c_precision > maxpr)
 					maxpr = tmp->c_precision;
 				ct[flssbt(t)]++;
 				cpr[flssbt(tmp->c_precision)]++;
 			}
 			if (c > maxc)
 				maxc = c;
 			ccpbk[fls(c + c / 2)]++;
 			count += c;
 		}
 		CC_UNLOCK(cc);
 #ifdef SMP
 	}
 #endif
 
 	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
 		tcum += ct[i];
 	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
 		pcum += cpr[i];
 	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
 		c += ccpbk[i];
 	medc = (i >= 2) ? (1 << (i - 2)) : 0;
 
 	printf("Scheduled callouts statistic snapshot:\n");
 	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
 	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
 	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
 	    medc,
 	    count / callwheelsize / mp_ncpus,
 	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
 	    maxc);
 	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
 	    (st / count) / 1000000, (st / count) % 1000000,
 	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
 	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
 	    (spr / count) / 1000000, (spr / count) % 1000000,
 	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
 	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
 	    "   prec\t   pcum\n");
 	for (i = 0, tcum = pcum = 0; i < 64; i++) {
 		if (ct[i] == 0 && cpr[i] == 0)
 			continue;
 		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
 		tcum += ct[i];
 		pcum += cpr[i];
 		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
 		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
 		    i - 1 - (32 - CC_HASH_SHIFT),
 		    ct[i], tcum, cpr[i], pcum);
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_callout_stat, "I",
     "Dump immediate statistic snapshot of the scheduled callouts");
 
 #ifdef DDB
 static void
 _show_callout(struct callout *c)
 {
 
 	db_printf("callout %p\n", c);
 #define	C_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, c->e);
 	db_printf("   &c_links = %p\n", &(c->c_links));
 	C_DB_PRINTF("%" PRId64,	c_time);
 	C_DB_PRINTF("%" PRId64,	c_precision);
 	C_DB_PRINTF("%p",	c_arg);
 	C_DB_PRINTF("%p",	c_func);
 	C_DB_PRINTF("%p",	c_lock);
 	C_DB_PRINTF("%#x",	c_flags);
 	C_DB_PRINTF("%#x",	c_iflags);
 	C_DB_PRINTF("%d",	c_cpu);
 #undef	C_DB_PRINTF
 }
 
 DB_SHOW_COMMAND(callout, db_show_callout)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show callout <struct callout *>\n");
 		return;
 	}
 
 	_show_callout((struct callout *)addr);
 }
 #endif /* DDB */
Index: stable/12/sys/kern/sched_4bsd.c
===================================================================
--- stable/12/sys/kern/sched_4bsd.c	(revision 355609)
+++ stable/12/sys/kern/sched_4bsd.c	(revision 355610)
@@ -1,1768 +1,1768 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/umtx.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
-int				dtrace_vtime_active;
+int __read_mostly		dtrace_vtime_active;
 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
 #endif
 
 /*
  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  * the range 100-256 Hz (approximately).
  */
 #define	ESTCPULIM(e) \
     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
 #ifdef SMP
 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
 #else
 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
 #endif
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
 
 /*
  * The schedulable entity that runs a context.
  * This is  an extension to the thread structure and is tailored to
  * the requirements of this scheduler.
  * All fields are protected by the scheduler lock.
  */
 struct td_sched {
 	fixpt_t		ts_pctcpu;	/* %cpu during p_swtime. */
 	u_int		ts_estcpu;	/* Estimated cpu utilization. */
 	int		ts_cpticks;	/* Ticks of cpu time. */
 	int		ts_slptime;	/* Seconds !RUNNING. */
 	int		ts_slice;	/* Remaining part of time slice. */
 	int		ts_flags;
 	struct runq	*ts_runq;	/* runq the thread is currently on */
 #ifdef KTR
 	char		ts_name[TS_NAME_LEN];
 #endif
 };
 
 /* flags kept in td_flags */
 #define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
 #define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /* flags kept in ts_flags */
 #define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */
 
 #define SKE_RUNQ_PCPU(ts)						\
     ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
 
 #define	THREAD_CAN_SCHED(td, cpu)	\
     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
 
 _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
     sizeof(struct thread0_storage),
     "increase struct thread0_storage.t0st_sched size");
 
 static struct mtx sched_lock;
 
 static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
 static int	sched_tdcnt;	/* Total runnable threads in the system. */
 static int	sched_slice = 12; /* Thread run time before rescheduling. */
 
 static void	setup_runqs(void);
 static void	schedcpu(void);
 static void	schedcpu_thread(void);
 static void	sched_priority(struct thread *td, u_char prio);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
 static void	updatepri(struct thread *td);
 static void	resetpriority(struct thread *td);
 static void	resetpriority_thread(struct thread *td);
 #ifdef SMP
 static int	sched_pickcpu(struct thread *td);
 static int	forward_wakeup(int cpunum);
 static void	kick_other_cpu(int pri, int cpuid);
 #endif
 
 static struct kproc_desc sched_kp = {
         "schedcpu",
         schedcpu_thread,
         NULL
 };
 SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
     &sched_kp);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
 static void sched_initticks(void *dummy);
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
 /*
  * Global run queue.
  */
 static struct runq runq;
 
 #ifdef SMP
 /*
  * Per-CPU run queues
  */
 static struct runq runq_pcpu[MAXCPU];
 long runq_length[MAXCPU];
 
 static cpuset_t idle_cpus_mask;
 #endif
 
 struct pcpuidlestat {
 	u_int idlecalls;
 	u_int oldidlecalls;
 };
 DPCPU_DEFINE_STATIC(struct pcpuidlestat, idlestat);
 
 static void
 setup_runqs(void)
 {
 #ifdef SMP
 	int i;
 
 	for (i = 0; i < MAXCPU; ++i)
 		runq_init(&runq_pcpu[i]);
 #endif
 
 	runq_init(&runq);
 }
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val, period;
 
 	period = 1000000 / realstathz;
 	new_val = period * sched_slice;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val <= 0)
 		return (EINVAL);
 	sched_slice = imax(1, (new_val + period / 2) / period);
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 	return (0);
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
 
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
     "Scheduler name");
 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, sysctl_kern_quantum, "I",
     "Quantum for timeshare threads in microseconds");
 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
     "Quantum for timeshare threads in stathz ticks");
 #ifdef SMP
 /* Enable forwarding of wakeups to all other cpus */
 static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
     "Kernel SMP");
 
 static int runq_fuzz = 1;
 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
 
 static int forward_wakeup_enabled = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
 	   &forward_wakeup_enabled, 0,
 	   "Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeups_requested = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
 	   &forward_wakeups_requested, 0,
 	   "Requests for Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeups_delivered = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
 	   &forward_wakeups_delivered, 0,
 	   "Completed Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeup_use_mask = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
 	   &forward_wakeup_use_mask, 0,
 	   "Use the mask of idle cpus");
 
 static int forward_wakeup_use_loop = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
 	   &forward_wakeup_use_loop, 0,
 	   "Use a loop to find idle cpus");
 
 #endif
 #if 0
 static int sched_followon = 0;
 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
 	   &sched_followon, 0,
 	   "allow threads to share a quantum");
 #endif
 
 SDT_PROVIDER_DEFINE(sched);
 
 SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
     "struct proc *", "uint8_t");
 SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
     "struct proc *", "void *");
 SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
 SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
 SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
 SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
     "struct proc *");
 SDT_PROBE_DEFINE(sched, , , on__cpu);
 SDT_PROBE_DEFINE(sched, , , remain__cpu);
 SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
     "struct proc *");
 
 static __inline void
 sched_load_add(void)
 {
 
 	sched_tdcnt++;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 
 static __inline void
 sched_load_rem(void)
 {
 
 	sched_tdcnt--;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
  */
 static void
 maybe_resched(struct thread *td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority < curthread->td_priority)
 		curthread->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * This function is called when a thread is about to be put on run queue
  * because it has been made runnable or its priority has been adjusted.  It
  * determines if the new thread should preempt the current thread.  If so,
  * it sets td_owepreempt to request a preemption.
  */
 int
 maybe_preempt(struct thread *td)
 {
 #ifdef PREEMPTION
 	struct thread *ctd;
 	int cpri, pri;
 
 	/*
 	 * The new thread should not preempt the current thread if any of the
 	 * following conditions are true:
 	 *
 	 *  - The kernel is in the throes of crashing (panicstr).
 	 *  - The current thread has a higher (numerically lower) or
 	 *    equivalent priority.  Note that this prevents curthread from
 	 *    trying to preempt to itself.
 	 *  - The current thread has an inhibitor set or is in the process of
 	 *    exiting.  In this case, the current thread is about to switch
 	 *    out anyways, so there's no point in preempting.  If we did,
 	 *    the current thread would not be properly resumed as well, so
 	 *    just avoid that whole landmine.
 	 *  - If the new thread's priority is not a realtime priority and
 	 *    the current thread's priority is not an idle priority and
 	 *    FULL_PREEMPTION is disabled.
 	 *
 	 * If all of these conditions are false, but the current thread is in
 	 * a nested critical section, then we have to defer the preemption
 	 * until we exit the critical section.  Otherwise, switch immediately
 	 * to the new thread.
 	 */
 	ctd = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 			("maybe_preempt: trying to run inhibited thread"));
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 	if (panicstr != NULL || pri >= cpri /* || dumping */ ||
 	    TD_IS_INHIBITED(ctd))
 		return (0);
 #ifndef FULL_PREEMPTION
 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
 		return (0);
 #endif
 
 	CTR0(KTR_PROC, "maybe_preempt: scheduling preemption");
 	ctd->td_owepreempt = 1;
 	return (1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Constants for digital decay and forget:
  *	90% of (ts_estcpu) usage in 5 * loadav time
  *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
  * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously.
  *
  * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
  * 		ts_estcpu *= decay;
  * will compute
  * 	ts_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * The system computes decay as:
  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
  *
  * We wish to prove that the system's computation of decay
  * will always fulfill the equation:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * If we compute b as:
  * 	b = 2 * loadavg
  * then
  * 	decay = b / (b + 1)
  *
  * We now need to prove two things:
  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
  *
  * Facts:
  *         For x close to zero, exp(x) =~ 1 + x, since
  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
  *         For x close to zero, ln(1+x) =~ x, since
  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
  *         ln(.1) =~ -2.30
  *
  * Proof of (1):
  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
  *	solving for factor,
  *      ln(factor) =~ (-2.30/5*loadav), or
  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
  *
  * Proof of (2):
  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
  *	solving for power,
  *      power*ln(b/(b+1)) =~ -2.30, or
  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
  *
  * Actual power values for the implemented algorithm are as follows:
  *      loadav: 1       2       3       4
  *      power:  5.68    10.32   14.94   19.55
  */
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 #define	loadfactor(loadav)	(2 * (loadav))
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 /*
  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
  *
  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
  *
  * If you don't want to bother with the faster/more-accurate formula, you
  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
  * (more general) method of calculating the %age of CPU used by a process.
  */
 #define	CCPU_SHIFT	11
 
 /*
  * Recompute process priorities, every hz ticks.
  * MP-safe, called without the Giant mutex.
  */
 /* ARGSUSED */
 static void
 schedcpu(void)
 {
 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
 	struct td_sched *ts;
 	int awake;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			awake = 0;
 			ts = td_get_sched(td);
 			thread_lock(td);
 			/*
 			 * Increment sleep time (if sleeping).  We
 			 * ignore overflow, as above.
 			 */
 			/*
 			 * The td_sched slptimes are not touched in wakeup
 			 * because the thread may not HAVE everything in
 			 * memory? XXX I think this is out of date.
 			 */
 			if (TD_ON_RUNQ(td)) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			} else if (TD_IS_RUNNING(td)) {
 				awake = 1;
 				/* Do not clear TDF_DIDRUN */
 			} else if (td->td_flags & TDF_DIDRUN) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			}
 
 			/*
 			 * ts_pctcpu is only for ps and ttyinfo().
 			 */
 			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
 			/*
 			 * If the td_sched has been idle the entire second,
 			 * stop recalculating its priority until
 			 * it wakes up.
 			 */
 			if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
 				ts->ts_pctcpu += (realstathz == 100)
 				    ? ((fixpt_t) ts->ts_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
 				    100 * (((fixpt_t) ts->ts_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 				ts->ts_pctcpu += ((FSCALE - ccpu) *
 				    (ts->ts_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
 				ts->ts_cpticks = 0;
 			}
 			/*
 			 * If there are ANY running threads in this process,
 			 * then don't count it as sleeping.
 			 * XXX: this is broken.
 			 */
 			if (awake) {
 				if (ts->ts_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
 					 * up from the long sleep should have
 					 * unwound the slptime and reset our
 					 * priority before we run at the stale
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
 					updatepri(td);
 				}
 				ts->ts_slptime = 0;
 			} else
 				ts->ts_slptime++;
 			if (ts->ts_slptime > 1) {
 				thread_unlock(td);
 				continue;
 			}
 			ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
 		      	resetpriority(td);
 			resetpriority_thread(td);
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Main loop for a kthread that executes schedcpu once a second.
  */
 static void
 schedcpu_thread(void)
 {
 
 	for (;;) {
 		schedcpu();
 		pause("-", hz);
 	}
 }
 
 /*
  * Recalculate the priority of a process after it has slept for a while.
  * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
  * least six times the loadfactor will decay ts_estcpu to zero.
  */
 static void
 updatepri(struct thread *td)
 {
 	struct td_sched *ts;
 	fixpt_t loadfac;
 	unsigned int newcpu;
 
 	ts = td_get_sched(td);
 	loadfac = loadfactor(averunnable.ldavg[0]);
 	if (ts->ts_slptime > 5 * loadfac)
 		ts->ts_estcpu = 0;
 	else {
 		newcpu = ts->ts_estcpu;
 		ts->ts_slptime--;	/* was incremented in schedcpu() */
 		while (newcpu && --ts->ts_slptime)
 			newcpu = decay_cpu(loadfac, newcpu);
 		ts->ts_estcpu = newcpu;
 	}
 }
 
 /*
  * Compute the priority of a process when running in user mode.
  * Arrange to reschedule if the resulting priority is better
  * than that of the current process.
  */
 static void
 resetpriority(struct thread *td)
 {
 	u_int newpriority;
 
 	if (td->td_pri_class != PRI_TIMESHARE)
 		return;
 	newpriority = PUSER +
 	    td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
 	    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
 	newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 	    PRI_MAX_TIMESHARE);
 	sched_user_prio(td, newpriority);
 }
 
 /*
  * Update the thread's priority when the associated process's user
  * priority changes.
  */
 static void
 resetpriority_thread(struct thread *td)
 {
 
 	/* Only change threads with a time sharing user priority. */
 	if (td->td_priority < PRI_MIN_TIMESHARE ||
 	    td->td_priority > PRI_MAX_TIMESHARE)
 		return;
 
 	/* XXX the whole needresched thing is broken, but not silly. */
 	maybe_resched(td);
 
 	sched_prio(td, td->td_user_pri);
 }
 
 /* ARGSUSED */
 static void
 sched_setup(void *dummy)
 {
 
 	setup_runqs();
 
 	/* Account for thread0. */
 	sched_load_add();
 }
 
 /*
  * This routine determines time constants after stathz and hz are setup.
  */
 static void
 sched_initticks(void *dummy)
 {
 
 	realstathz = stathz ? stathz : hz;
 	sched_slice = realstathz / 10;	/* ~100ms */
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 }
 
 /* External interfaces start here */
 
 /*
  * Very early in the boot some setup of scheduler-specific
  * parts of proc0 and of some scheduler resources needs to be done.
  * Called from:
  *  proc0_init()
  */
 void
 schedinit(void)
 {
 
 	/*
 	 * Set up the scheduler specific parts of thread0.
 	 */
 	thread0.td_lock = &sched_lock;
 	td_get_sched(&thread0)->ts_slice = sched_slice;
 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
 }
 
 int
 sched_runnable(void)
 {
 #ifdef SMP
 	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
 #else
 	return runq_check(&runq);
 #endif
 }
 
 int
 sched_rr_interval(void)
 {
 
 	/* Convert sched_slice from stathz to hz. */
 	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
 }
 
 /*
  * We adjust the priority of the current process.  The priority of a
  * process gets worse as it accumulates CPU time.  The cpu usage
  * estimator (ts_estcpu) is increased here.  resetpriority() will
  * compute a different priority each time ts_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached).  The
  * cpu usage estimator ramps up quite quickly when the process is
  * running (linearly), and decays away exponentially, at a rate which
  * is proportionally slower when the system is busy.  The basic
  * principle is that the system will 90% forget that the process used
  * a lot of CPU time in 5 * loadav seconds.  This causes the system to
  * favor processes which haven't run much recently, and to round-robin
  * among other processes.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct pcpuidlestat *stat;
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 
 	ts->ts_cpticks++;
 	ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
 	if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
 		resetpriority(td);
 		resetpriority_thread(td);
 	}
 
 	/*
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).
 	 */
 	if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
 		ts->ts_slice = sched_slice;
 		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 	}
 
 	stat = DPCPU_PTR(idlestat);
 	stat->oldidlecalls = stat->idlecalls;
 	stat->idlecalls = 0;
 }
 
 /*
  * Charge child's scheduling CPU usage to parent.
  */
 void
 sched_exit(struct proc *p, struct thread *td)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
 	    "prio:%d", td->td_priority);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
 	    "prio:%d", child->td_priority);
 	thread_lock(td);
 	td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
 	    td_get_sched(child)->ts_estcpu);
 	thread_unlock(td);
 	thread_lock(child);
 	if ((child->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 	thread_unlock(child);
 }
 
 void
 sched_fork(struct thread *td, struct thread *childtd)
 {
 	sched_fork_thread(td, childtd);
 }
 
 void
 sched_fork_thread(struct thread *td, struct thread *childtd)
 {
 	struct td_sched *ts, *tsc;
 
 	childtd->td_oncpu = NOCPU;
 	childtd->td_lastcpu = NOCPU;
 	childtd->td_lock = &sched_lock;
 	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
 	childtd->td_domain.dr_policy = td->td_cpuset->cs_domain;
 	childtd->td_priority = childtd->td_base_pri;
 	ts = td_get_sched(childtd);
 	bzero(ts, sizeof(*ts));
 	tsc = td_get_sched(td);
 	ts->ts_estcpu = tsc->ts_estcpu;
 	ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY);
 	ts->ts_slice = 1;
 }
 
 void
 sched_nice(struct proc *p, int nice)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_nice = nice;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		resetpriority(td);
 		resetpriority_thread(td);
 		thread_unlock(td);
 	}
 }
 
 void
 sched_class(struct thread *td, int class)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_pri_class = class;
 }
 
 /*
  * Adjust the priority of a thread.
  */
 static void
 sched_priority(struct thread *td, u_char prio)
 {
 
 
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
 		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, 
 		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
 		return;
 	td->td_priority = prio;
 	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 	}
 }
 
 /*
  * Update a thread's priority when it is lent another thread's
  * priority.
  */
 void
 sched_lend_prio(struct thread *td, u_char prio)
 {
 
 	td->td_flags |= TDF_BORROWING;
 	sched_priority(td, prio);
 }
 
 /*
  * Restore a thread's priority when priority propagation is
  * over.  The prio argument is the minimum priority the thread
  * needs to have to satisfy other possible priority lending
  * requests.  If the thread's regulary priority is less
  * important than prio the thread will keep a priority boost
  * of prio.
  */
 void
 sched_unlend_prio(struct thread *td, u_char prio)
 {
 	u_char base_pri;
 
 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
 		base_pri = td->td_user_pri;
 	else
 		base_pri = td->td_base_pri;
 	if (prio >= base_pri) {
 		td->td_flags &= ~TDF_BORROWING;
 		sched_prio(td, base_pri);
 	} else
 		sched_lend_prio(td, prio);
 }
 
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	u_char oldprio;
 
 	/* First, update the base priority. */
 	td->td_base_pri = prio;
 
 	/*
 	 * If the thread is borrowing another thread's priority, don't ever
 	 * lower the priority.
 	 */
 	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
 		return;
 
 	/* Change the real priority. */
 	oldprio = td->td_priority;
 	sched_priority(td, prio);
 
 	/*
 	 * If the thread is on a turnstile, then let the turnstile update
 	 * its state.
 	 */
 	if (TD_ON_LOCK(td) && oldprio != prio)
 		turnstile_adjust(td, oldprio);
 }
 
 void
 sched_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_base_user_pri = prio;
 	if (td->td_lend_user_pri <= prio)
 		return;
 	td->td_user_pri = prio;
 }
 
 void
 sched_lend_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_lend_user_pri = prio;
 	td->td_user_pri = min(prio, td->td_base_user_pri);
 	if (td->td_priority > td->td_user_pri)
 		sched_prio(td, td->td_user_pri);
 	else if (td->td_priority != td->td_user_pri)
 		td->td_flags |= TDF_NEEDRESCHED;
 }
 
 void
 sched_sleep(struct thread *td, int pri)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_slptick = ticks;
 	td_get_sched(td)->ts_slptime = 0;
 	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_prio(td, pri);
 	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
 		td->td_flags |= TDF_CANSWAP;
 }
 
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct mtx *tmtx;
 	struct td_sched *ts;
 	struct proc *p;
 	int preempted;
 
 	tmtx = NULL;
 	ts = td_get_sched(td);
 	p = td->td_proc;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* 
 	 * Switch to the sched lock to fix things up and pick
 	 * a new thread.
 	 * Block the td_lock in order to avoid breaking the critical path.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		tmtx = thread_lock_block(td);
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 
 	td->td_lastcpu = td->td_oncpu;
 	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
 	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	td->td_oncpu = NOCPU;
 
 	/*
 	 * At the last moment, if this thread is still marked RUNNING,
 	 * then put it back on the run queue as it has not been suspended
 	 * or stopped or any thing else similar.  We never put the idle
 	 * threads on the run queue, however.
 	 */
 	if (td->td_flags & TDF_IDLETD) {
 		TD_SET_CAN_RUN(td);
 #ifdef SMP
 		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	} else {
 		if (TD_IS_RUNNING(td)) {
 			/* Put us back on the run queue. */
 			sched_add(td, preempted ?
 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 			    SRQ_OURSELF|SRQ_YIELDING);
 		}
 	}
 	if (newtd) {
 		/*
 		 * The thread we are about to run needs to be counted
 		 * as if it had been added to the run queue and selected.
 		 * It came from:
 		 * * A preemption
 		 * * An upcall
 		 * * A followon
 		 */
 		KASSERT((newtd->td_inhibitors == 0),
 			("trying to run inhibited thread"));
 		newtd->td_flags |= TDF_DIDRUN;
         	TD_SET_RUNNING(newtd);
 		if ((newtd->td_flags & TDF_NOLOAD) == 0)
 			sched_load_add();
 	} else {
 		newtd = choosethread();
 		MPASS(newtd->td_lock == &sched_lock);
 	}
 
 #if (KTR_COMPILE & KTR_SCHED) != 0
 	if (TD_IS_IDLETHREAD(td))
 		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
 		    "prio:%d", td->td_priority);
 	else
 		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
 		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
 		    "lockname:\"%s\"", td->td_lockname);
 #endif
 
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 
 		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
 
                 /* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If DTrace has set the active vtime enum to anything
 		 * other than INACTIVE (0), then it should have set the
 		 * function to call.
 		 */
 		if (dtrace_vtime_active)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
 		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
 		lock_profile_obtain_lock_success(&sched_lock.lock_object,
 		    0, 0, __FILE__, __LINE__);
 		/*
 		 * Where am I?  What year is it?
 		 * We are in the same thread that went to sleep above,
 		 * but any amount of time may have passed. All our context
 		 * will still be available as will local variables.
 		 * PCPU values however may have changed as we may have
 		 * changed CPU so don't trust cached values of them.
 		 * New threads will go to fork_exit() instead of here
 		 * so if you change things here you may need to change
 		 * things there too.
 		 *
 		 * If the thread above was exiting it will never wake
 		 * up again here, so either it has saved everything it
 		 * needed to, or the thread_wait() or wait() will
 		 * need to reap it.
 		 */
 
 		SDT_PROBE0(sched, , , on__cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else
 		SDT_PROBE0(sched, , , remain__cpu);
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
 	    "prio:%d", td->td_priority);
 
 #ifdef SMP
 	if (td->td_flags & TDF_IDLETD)
 		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 	MPASS(td->td_lock == &sched_lock);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	td->td_flags &= ~TDF_CANSWAP;
 	if (ts->ts_slptime > 1) {
 		updatepri(td);
 		resetpriority(td);
 	}
 	td->td_slptick = 0;
 	ts->ts_slptime = 0;
 	ts->ts_slice = sched_slice;
 	sched_add(td, SRQ_BORING);
 }
 
 #ifdef SMP
 static int
 forward_wakeup(int cpunum)
 {
 	struct pcpu *pc;
 	cpuset_t dontuse, map, map2;
 	u_int id, me;
 	int iscpuset;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	CTR0(KTR_RUNQ, "forward_wakeup()");
 
 	if ((!forward_wakeup_enabled) ||
 	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
 		return (0);
 	if (!smp_started || panicstr)
 		return (0);
 
 	forward_wakeups_requested++;
 
 	/*
 	 * Check the idle mask we received against what we calculated
 	 * before in the old version.
 	 */
 	me = PCPU_GET(cpuid);
 
 	/* Don't bother if we should be doing it ourself. */
 	if (CPU_ISSET(me, &idle_cpus_mask) &&
 	    (cpunum == NOCPU || me == cpunum))
 		return (0);
 
 	CPU_SETOF(me, &dontuse);
 	CPU_OR(&dontuse, &stopped_cpus);
 	CPU_OR(&dontuse, &hlt_cpus_mask);
 	CPU_ZERO(&map2);
 	if (forward_wakeup_use_loop) {
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &dontuse) &&
 			    pc->pc_curthread == pc->pc_idlethread) {
 				CPU_SET(id, &map2);
 			}
 		}
 	}
 
 	if (forward_wakeup_use_mask) {
 		map = idle_cpus_mask;
 		CPU_NAND(&map, &dontuse);
 
 		/* If they are both on, compare and use loop if different. */
 		if (forward_wakeup_use_loop) {
 			if (CPU_CMP(&map, &map2)) {
 				printf("map != map2, loop method preferred\n");
 				map = map2;
 			}
 		}
 	} else {
 		map = map2;
 	}
 
 	/* If we only allow a specific CPU, then mask off all the others. */
 	if (cpunum != NOCPU) {
 		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
 		iscpuset = CPU_ISSET(cpunum, &map);
 		if (iscpuset == 0)
 			CPU_ZERO(&map);
 		else
 			CPU_SETOF(cpunum, &map);
 	}
 	if (!CPU_EMPTY(&map)) {
 		forward_wakeups_delivered++;
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &map))
 				continue;
 			if (cpu_idle_wakeup(pc->pc_cpuid))
 				CPU_CLR(id, &map);
 		}
 		if (!CPU_EMPTY(&map))
 			ipi_selected(map, IPI_AST);
 		return (1);
 	}
 	if (cpunum == NOCPU)
 		printf("forward_wakeup: Idle processor not found\n");
 	return (0);
 }
 
 static void
 kick_other_cpu(int pri, int cpuid)
 {
 	struct pcpu *pcpu;
 	int cpri;
 
 	pcpu = pcpu_find(cpuid);
 	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
 		forward_wakeups_delivered++;
 		if (!cpu_idle_wakeup(cpuid))
 			ipi_cpu(cpuid, IPI_AST);
 		return;
 	}
 
 	cpri = pcpu->pc_curthread->td_priority;
 	if (pri >= cpri)
 		return;
 
 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
 #if !defined(FULL_PREEMPTION)
 	if (pri <= PRI_MAX_ITHD)
 #endif /* ! FULL_PREEMPTION */
 	{
 		ipi_cpu(cpuid, IPI_PREEMPT);
 		return;
 	}
 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
 
 	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
 	ipi_cpu(cpuid, IPI_AST);
 	return;
 }
 #endif /* SMP */
 
 #ifdef SMP
 static int
 sched_pickcpu(struct thread *td)
 {
 	int best, cpu;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
 		best = td->td_lastcpu;
 	else
 		best = NOCPU;
 	CPU_FOREACH(cpu) {
 		if (!THREAD_CAN_SCHED(td, cpu))
 			continue;
 	
 		if (best == NOCPU)
 			best = cpu;
 		else if (runq_length[cpu] < runq_length[best])
 			best = cpu;
 	}
 	KASSERT(best != NOCPU, ("no valid CPUs"));
 
 	return (best);
 }
 #endif
 
 void
 sched_add(struct thread *td, int flags)
 #ifdef SMP
 {
 	cpuset_t tidlemsk;
 	struct td_sched *ts;
 	u_int cpu, cpuid;
 	int forwarded = 0;
 	int single_cpu = 0;
 
 	ts = td_get_sched(td);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 
 	/*
 	 * If SMP is started and the thread is pinned or otherwise limited to
 	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
 	 * Otherwise, queue the thread to the global run queue.
 	 *
 	 * If SMP has not yet been started we must use the global run queue
 	 * as per-CPU state may not be initialized yet and we may crash if we
 	 * try to access the per-CPU run queues.
 	 */
 	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
 	    ts->ts_flags & TSF_AFFINITY)) {
 		if (td->td_pinned != 0)
 			cpu = td->td_lastcpu;
 		else if (td->td_flags & TDF_BOUND) {
 			/* Find CPU from bound runq. */
 			KASSERT(SKE_RUNQ_PCPU(ts),
 			    ("sched_add: bound td_sched not on cpu runq"));
 			cpu = ts->ts_runq - &runq_pcpu[0];
 		} else
 			/* Find a valid CPU for our cpuset */
 			cpu = sched_pickcpu(td);
 		ts->ts_runq = &runq_pcpu[cpu];
 		single_cpu = 1;
 		CTR3(KTR_RUNQ,
 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
 		    cpu);
 	} else {
 		CTR2(KTR_RUNQ,
 		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
 		    td);
 		cpu = NOCPU;
 		ts->ts_runq = &runq;
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	if (cpu != NOCPU)
 		runq_length[cpu]++;
 
 	cpuid = PCPU_GET(cpuid);
 	if (single_cpu && cpu != cpuid) {
 	        kick_other_cpu(td->td_priority, cpu);
 	} else {
 		if (!single_cpu) {
 			tidlemsk = idle_cpus_mask;
 			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
 			CPU_CLR(cpuid, &tidlemsk);
 
 			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
 			    ((flags & SRQ_INTR) == 0) &&
 			    !CPU_EMPTY(&tidlemsk))
 				forwarded = forward_wakeup(cpu);
 		}
 
 		if (!forwarded) {
 			if (!maybe_preempt(td))
 				maybe_resched(td);
 		}
 	}
 }
 #else /* SMP */
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
 	ts->ts_runq = &runq;
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	if (!maybe_preempt(td))
 		maybe_resched(td);
 }
 #endif /* SMP */
 
 void
 sched_rem(struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_rem: thread swapped out"));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 #ifdef SMP
 	if (ts->ts_runq != &runq)
 		runq_length[ts->ts_runq - runq_pcpu]--;
 #endif
 	runq_remove(ts->ts_runq, td);
 	TD_SET_CAN_RUN(td);
 }
 
 /*
  * Select threads to run.  Note that running threads still consume a
  * slot.
  */
 struct thread *
 sched_choose(void)
 {
 	struct thread *td;
 	struct runq *rq;
 
 	mtx_assert(&sched_lock,  MA_OWNED);
 #ifdef SMP
 	struct thread *tdcpu;
 
 	rq = &runq;
 	td = runq_choose_fuzz(&runq, runq_fuzz);
 	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
 
 	if (td == NULL ||
 	    (tdcpu != NULL &&
 	     tdcpu->td_priority < td->td_priority)) {
 		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
 		     PCPU_GET(cpuid));
 		td = tdcpu;
 		rq = &runq_pcpu[PCPU_GET(cpuid)];
 	} else {
 		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
 	}
 
 #else
 	rq = &runq;
 	td = runq_choose(&runq);
 #endif
 
 	if (td) {
 #ifdef SMP
 		if (td == tdcpu)
 			runq_length[PCPU_GET(cpuid)]--;
 #endif
 		runq_remove(rq, td);
 		td->td_flags |= TDF_DIDRUN;
 
 		KASSERT(td->td_flags & TDF_INMEM,
 		    ("sched_choose: thread swapped out"));
 		return (td);
 	}
 	return (PCPU_GET(idlethread));
 }
 
 void
 sched_preempt(struct thread *td)
 {
 
 	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 	thread_lock(td);
 	if (td->td_critnest > 1)
 		td->td_owepreempt = 1;
 	else
 		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
 	thread_unlock(td);
 }
 
 void
 sched_userret_slowpath(struct thread *td)
 {
 
 	thread_lock(td);
 	td->td_priority = td->td_user_pri;
 	td->td_base_pri = td->td_user_pri;
 	thread_unlock(td);
 }
 
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
 
 	ts = td_get_sched(td);
 
 	td->td_flags |= TDF_BOUND;
 #ifdef SMP
 	ts->ts_runq = &runq_pcpu[cpu];
 	if (PCPU_GET(cpuid) == cpu)
 		return;
 
 	mi_switch(SW_VOL, NULL);
 #endif
 }
 
 void
 sched_unbind(struct thread* td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
 	td->td_flags &= ~TDF_BOUND;
 }
 
 int
 sched_is_bound(struct thread *td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	return (td->td_flags & TDF_BOUND);
 }
 
 void
 sched_relinquish(struct thread *td)
 {
 	thread_lock(td);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 }
 
 int
 sched_load(void)
 {
 	return (sched_tdcnt);
 }
 
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	return (ts->ts_pctcpu);
 }
 
 #ifdef RACCT
 /*
  * Calculates the contribution to the thread cpu usage for the latest
  * (unfinished) second.
  */
 fixpt_t
 sched_pctcpu_delta(struct thread *td)
 {
 	struct td_sched *ts;
 	fixpt_t delta;
 	int realstathz;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	delta = 0;
 	realstathz = stathz ? stathz : hz;
 	if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
 		delta = (realstathz == 100)
 		    ? ((fixpt_t) ts->ts_cpticks) <<
 		    (FSHIFT - CCPU_SHIFT) :
 		    100 * (((fixpt_t) ts->ts_cpticks)
 		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 		delta = ((FSCALE - ccpu) *
 		    (ts->ts_cpticks *
 		    FSCALE / realstathz)) >> FSHIFT;
 #endif
 	}
 
 	return (delta);
 }
 #endif
 
 u_int
 sched_estcpu(struct thread *td)
 {
 	
 	return (td_get_sched(td)->ts_estcpu);
 }
 
 /*
  * The actual idle process.
  */
 void
 sched_idletd(void *dummy)
 {
 	struct pcpuidlestat *stat;
 
 	THREAD_NO_SLEEPING();
 	stat = DPCPU_PTR(idlestat);
 	for (;;) {
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		while (sched_runnable() == 0) {
 			cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
 			stat->idlecalls++;
 		}
 
 		mtx_lock_spin(&sched_lock);
 		mi_switch(SW_VOL | SWT_IDLE, NULL);
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 /*
  * A CPU is entering for the first time or a thread is exiting.
  */
 void
 sched_throw(struct thread *td)
 {
 	/*
 	 * Correct spinlock nesting.  The idle thread context that we are
 	 * borrowing was created so that it would start out with a single
 	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
 	 * explicitly acquired locks in this function, the nesting count
 	 * is now 2 rather than 1.  Since we are nested, calling
 	 * spinlock_exit() will simply adjust the counts without allowing
 	 * spin lock using code to interrupt us.
 	 */
 	if (td == NULL) {
 		mtx_lock_spin(&sched_lock);
 		spinlock_exit();
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
 	} else {
 		lock_profile_release_lock(&sched_lock.lock_object);
 		MPASS(td->td_lock == &sched_lock);
 		td->td_lastcpu = td->td_oncpu;
 		td->td_oncpu = NOCPU;
 	}
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	cpu_throw(td, choosethread());	/* doesn't return */
 }
 
 void
 sched_fork_exit(struct thread *td)
 {
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with sched_lock held but not recursed.
 	 */
 	td->td_oncpu = PCPU_GET(cpuid);
 	sched_lock.mtx_lock = (uintptr_t)td;
 	lock_profile_obtain_lock_success(&sched_lock.lock_object,
 	    0, 0, __FILE__, __LINE__);
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
 	    "prio:%d", td->td_priority);
 	SDT_PROBE0(sched, , , on__cpu);
 }
 
 char *
 sched_tdname(struct thread *td)
 {
 #ifdef KTR
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	if (ts->ts_name[0] == '\0')
 		snprintf(ts->ts_name, sizeof(ts->ts_name),
 		    "%s tid %d", td->td_name, td->td_tid);
 	return (ts->ts_name);
 #else   
 	return (td->td_name);
 #endif
 }
 
 #ifdef KTR
 void
 sched_clear_tdname(struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	ts->ts_name[0] = '\0';
 }
 #endif
 
 void
 sched_affinity(struct thread *td)
 {
 #ifdef SMP
 	struct td_sched *ts;
 	int cpu;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);	
 
 	/*
 	 * Set the TSF_AFFINITY flag if there is at least one CPU this
 	 * thread can't run on.
 	 */
 	ts = td_get_sched(td);
 	ts->ts_flags &= ~TSF_AFFINITY;
 	CPU_FOREACH(cpu) {
 		if (!THREAD_CAN_SCHED(td, cpu)) {
 			ts->ts_flags |= TSF_AFFINITY;
 			break;
 		}
 	}
 
 	/*
 	 * If this thread can run on all CPUs, nothing else to do.
 	 */
 	if (!(ts->ts_flags & TSF_AFFINITY))
 		return;
 
 	/* Pinned threads and bound threads should be left alone. */
 	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
 		return;
 
 	switch (td->td_state) {
 	case TDS_RUNQ:
 		/*
 		 * If we are on a per-CPU runqueue that is in the set,
 		 * then nothing needs to be done.
 		 */
 		if (ts->ts_runq != &runq &&
 		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
 			return;
 
 		/* Put this thread on a valid per-CPU runqueue. */
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 		break;
 	case TDS_RUNNING:
 		/*
 		 * See if our current CPU is in the set.  If not, force a
 		 * context switch.
 		 */
 		if (THREAD_CAN_SCHED(td, td->td_oncpu))
 			return;
 
 		td->td_flags |= TDF_NEEDRESCHED;
 		if (td != curthread)
 			ipi_cpu(cpu, IPI_AST);
 		break;
 	default:
 		break;
 	}
 #endif
 }
Index: stable/12/sys/kern/sched_ule.c
===================================================================
--- stable/12/sys/kern/sched_ule.c	(revision 355609)
+++ stable/12/sys/kern/sched_ule.c	(revision 355610)
@@ -1,3106 +1,3106 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * This file implements the ULE scheduler.  ULE supports independent CPU
  * run queues and fine grain locking.  It has superior interactive
  * performance under load even on uni-processor systems.
  *
  * etymology:
  *   ULE is the last three letters in schedule.  It owes its name to a
  * generic user created for a scheduling system by Paul Mikesell at
  * Isilon Systems and a general lack of creativity on the part of the author.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/turnstile.h>
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
 #include <sys/sbuf.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
-int				dtrace_vtime_active;
+int __read_mostly		dtrace_vtime_active;
 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
 #endif
 
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
 #define	KTR_ULE	0
 
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
 #define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
 #define	TDQ_LOADNAME_LEN	(sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
 
 /*
  * Thread scheduler specific section.  All fields are protected
  * by the thread lock.
  */
 struct td_sched {	
 	struct runq	*ts_runq;	/* Run-queue we're queued on. */
 	short		ts_flags;	/* TSF_* flags. */
 	int		ts_cpu;		/* CPU that we have affinity for. */
 	int		ts_rltick;	/* Real last tick, for affinity. */
 	int		ts_slice;	/* Ticks of slice remaining. */
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	int		ts_ltick;	/* Last tick that we were running on */
 	int		ts_ftick;	/* First tick that we were running on */
 	int		ts_ticks;	/* Tick count */
 #ifdef KTR
 	char		ts_name[TS_NAME_LEN];
 #endif
 };
 /* flags kept in ts_flags */
 #define	TSF_BOUND	0x0001		/* Thread can not migrate. */
 #define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
 
 #define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
 #define	THREAD_CAN_SCHED(td, cpu)	\
     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
 
 _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
     sizeof(struct thread0_storage),
     "increase struct thread0_storage.t0st_sched size");
 
 /*
  * Priority ranges used for interactive and non-interactive timeshare
  * threads.  The timeshare priorities are split up into four ranges.
  * The first range handles interactive threads.  The last three ranges
  * (NHALF, x, and NHALF) handle non-interactive threads with the outer
  * ranges supporting nice values.
  */
 #define	PRI_TIMESHARE_RANGE	(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
 #define	PRI_INTERACT_RANGE	((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
 #define	PRI_BATCH_RANGE		(PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
 
 #define	PRI_MIN_INTERACT	PRI_MIN_TIMESHARE
 #define	PRI_MAX_INTERACT	(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
 #define	PRI_MIN_BATCH		(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
 #define	PRI_MAX_BATCH		PRI_MAX_TIMESHARE
 
 /*
  * Cpu percentage computation macros and defines.
  *
  * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
  * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
  * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
  * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
  * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
  * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
  */
 #define	SCHED_TICK_SECS		10
 #define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
 #define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
 #define	SCHED_TICK_SHIFT	10
 #define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
 #define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
 
 /*
  * These macros determine priorities for non-interactive threads.  They are
  * assigned a priority based on their recent cpu utilization as expressed
  * by the ratio of ticks to the tick total.  NHALF priorities at the start
  * and end of the MIN to MAX timeshare range are only reachable with negative
  * or positive nice respectively.
  *
  * PRI_RANGE:	Priority range for utilization dependent priorities.
  * PRI_NRESV:	Number of nice values.
  * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
  * PRI_NICE:	Determines the part of the priority inherited from nice.
  */
 #define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
 #define	SCHED_PRI_MIN		(PRI_MIN_BATCH + SCHED_PRI_NHALF)
 #define	SCHED_PRI_MAX		(PRI_MAX_BATCH - SCHED_PRI_NHALF)
 #define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
 #define	SCHED_PRI_TICKS(ts)						\
     (SCHED_TICK_HZ((ts)) /						\
     (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
 #define	SCHED_PRI_NICE(nice)	(nice)
 
 /*
  * These determine the interactivity of a process.  Interactivity differs from
  * cpu utilization in that it expresses the voluntary time slept vs time ran
  * while cpu utilization includes all time not running.  This more accurately
  * models the intent of the thread.
  *
  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
  *		before throttling back.
  * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
  * INTERACT_THRESH:	Threshold for placement on the current runq.
  */
 #define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
 #define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
 #define	SCHED_INTERACT_MAX	(100)
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
 
 /*
  * These parameters determine the slice behavior for batch work.
  */
 #define	SCHED_SLICE_DEFAULT_DIVISOR	10	/* ~94 ms, 12 stathz ticks. */
 #define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
 
 /* Flags kept in td_flags. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /*
  * tickincr:		Converts a stathz tick into a hz domain scaled by
  *			the shift factor.  Without the shift the error rate
  *			due to rounding would be unacceptably high.
  * realstathz:		stathz is sometimes 0 and run off of hz.
  * sched_slice:		Runtime of each thread before rescheduling.
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
  */
-static int sched_interact = SCHED_INTERACT_THRESH;
-static int tickincr = 8 << SCHED_TICK_SHIFT;
-static int realstathz = 127;	/* reset during boot. */
-static int sched_slice = 10;	/* reset during boot. */
-static int sched_slice_min = 1;	/* reset during boot. */
+static int __read_mostly sched_interact = SCHED_INTERACT_THRESH;
+static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
+static int __read_mostly realstathz = 127;	/* reset during boot. */
+static int __read_mostly sched_slice = 10;	/* reset during boot. */
+static int __read_mostly sched_slice_min = 1;	/* reset during boot. */
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
-static int preempt_thresh = PRI_MAX_IDLE;
+static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
 #else
-static int preempt_thresh = PRI_MIN_KERN;
+static int __read_mostly preempt_thresh = PRI_MIN_KERN;
 #endif
 #else 
-static int preempt_thresh = 0;
+static int __read_mostly preempt_thresh = 0;
 #endif
-static int static_boost = PRI_MIN_BATCH;
-static int sched_idlespins = 10000;
-static int sched_idlespinthresh = -1;
+static int __read_mostly static_boost = PRI_MIN_BATCH;
+static int __read_mostly sched_idlespins = 10000;
+static int __read_mostly sched_idlespinthresh = -1;
 
 /*
  * tdq - per processor runqs and statistics.  All fields are protected by the
  * tdq_lock.  The load and lowpri may be accessed without to avoid excess
  * locking in sched_pickcpu();
  */
 struct tdq {
 	/* 
 	 * Ordered to improve efficiency of cpu_search() and switch().
 	 * tdq_lock is padded to avoid false sharing with tdq_load and
 	 * tdq_cpu_idle.
 	 */
 	struct mtx_padalign tdq_lock;		/* run queue lock. */
 	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
 	volatile int	tdq_load;		/* Aggregate load. */
 	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 	volatile int	tdq_transferable;	/* Transferable thread count. */
 	volatile short	tdq_switchcnt;		/* Switches this tick. */
 	volatile short	tdq_oldswitchcnt;	/* Switches last tick. */
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	u_char		tdq_ipipending;		/* IPI pending. */
 	u_char		tdq_idx;		/* Current insert index. */
 	u_char		tdq_ridx;		/* Current removal index. */
 	int		tdq_id;			/* cpuid. */
 	struct runq	tdq_realtime;		/* real-time run queue. */
 	struct runq	tdq_timeshare;		/* timeshare run queue. */
 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
 	char		tdq_name[TDQ_NAME_LEN];
 #ifdef KTR
 	char		tdq_loadname[TDQ_LOADNAME_LEN];
 #endif
 } __aligned(64);
 
 /* Idle thread states and config. */
 #define	TDQ_RUNNING	1
 #define	TDQ_IDLE	2
 
 #ifdef SMP
-struct cpu_group *cpu_top;		/* CPU topology */
+struct cpu_group __read_mostly *cpu_top;		/* CPU topology */
 
 #define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
 #define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
 
 /*
  * Run-time tunables.
  */
 static int rebalance = 1;
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
-static int affinity;
-static int steal_idle = 1;
-static int steal_thresh = 2;
-static int always_steal = 0;
-static int trysteal_limit = 2;
+static int __read_mostly affinity;
+static int __read_mostly steal_idle = 1;
+static int __read_mostly steal_thresh = 2;
+static int __read_mostly always_steal = 0;
+static int __read_mostly trysteal_limit = 2;
 
 /*
  * One thread queue per processor.
  */
-static struct tdq	*balance_tdq;
+static struct tdq __read_mostly *balance_tdq;
 static int balance_ticks;
 DPCPU_DEFINE_STATIC(struct tdq, tdq);
 DPCPU_DEFINE_STATIC(uint32_t, randomval);
 
 #define	TDQ_SELF()	(DPCPU_PTR(tdq))
 #define	TDQ_CPU(x)	(DPCPU_ID_PTR((x), tdq))
 #define	TDQ_ID(x)	((x)->tdq_id)
 #else	/* !SMP */
 static struct tdq	tdq_cpu;
 
 #define	TDQ_ID(x)	(0)
 #define	TDQ_SELF()	(&tdq_cpu)
 #define	TDQ_CPU(x)	(&tdq_cpu)
 #endif
 
 #define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCKPTR(t)		((struct mtx *)(&(t)->tdq_lock))
 
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
 static int sched_interact_score(struct thread *);
 static void sched_interact_update(struct thread *);
 static void sched_interact_fork(struct thread *);
 static void sched_pctcpu_update(struct td_sched *, int);
 
 /* Operations on per processor queues */
 static struct thread *tdq_choose(struct tdq *);
 static void tdq_setup(struct tdq *, int i);
 static void tdq_load_add(struct tdq *, struct thread *);
 static void tdq_load_rem(struct tdq *, struct thread *);
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
 void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 static void tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 static struct thread *tdq_move(struct tdq *, struct tdq *);
 static int tdq_idled(struct tdq *);
 static void tdq_notify(struct tdq *, struct thread *);
 static struct thread *tdq_steal(struct tdq *, int);
 static struct thread *runq_steal(struct runq *, int);
 static int sched_pickcpu(struct thread *, int);
 static void sched_balance(void);
 static int sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct thread *, int, int);
 static inline void thread_unblock_switch(struct thread *, struct mtx *);
 static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
 static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, 
     struct cpu_group *cg, int indent);
 #endif
 
 static void sched_setup(void *dummy);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
 static void sched_initticks(void *dummy);
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
 SDT_PROVIDER_DEFINE(sched);
 
 SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
     "struct proc *", "uint8_t");
 SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
     "struct proc *", "void *");
 SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
 SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
 SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
 SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", 
     "struct proc *");
 SDT_PROBE_DEFINE(sched, , , on__cpu);
 SDT_PROBE_DEFINE(sched, , , remain__cpu);
 SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", 
     "struct proc *");
 
 /*
  * Print the threads waiting on a run-queue.
  */
 static void
 runq_print(struct runq *rq)
 {
 	struct rqhead *rqh;
 	struct thread *td;
 	int pri;
 	int j;
 	int i;
 
 	for (i = 0; i < RQB_LEN; i++) {
 		printf("\t\trunq bits %d 0x%zx\n",
 		    i, rq->rq_status.rqb_bits[i]);
 		for (j = 0; j < RQB_BPW; j++)
 			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
 				pri = j + (i << RQB_L2BPW);
 				rqh = &rq->rq_queues[pri];
 				TAILQ_FOREACH(td, rqh, td_runq) {
 					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
 					    td, td->td_name, td->td_priority,
 					    td->td_rqindex, pri);
 				}
 			}
 	}
 }
 
 /*
  * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
  */
 void
 tdq_print(int cpu)
 {
 	struct tdq *tdq;
 
 	tdq = TDQ_CPU(cpu);
 
 	printf("tdq %d:\n", TDQ_ID(tdq));
 	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
 	printf("\tLock name:      %s\n", tdq->tdq_name);
 	printf("\tload:           %d\n", tdq->tdq_load);
 	printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
 	printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
 	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
 	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
 	printf("\trealtime runq:\n");
 	runq_print(&tdq->tdq_realtime);
 	printf("\ttimeshare runq:\n");
 	runq_print(&tdq->tdq_timeshare);
 	printf("\tidle runq:\n");
 	runq_print(&tdq->tdq_idle);
 }
 
 static inline int
 sched_shouldpreempt(int pri, int cpri, int remote)
 {
 	/*
 	 * If the new priority is not better than the current priority there is
 	 * nothing to do.
 	 */
 	if (pri >= cpri)
 		return (0);
 	/*
 	 * Always preempt idle.
 	 */
 	if (cpri >= PRI_MIN_IDLE)
 		return (1);
 	/*
 	 * If preemption is disabled don't preempt others.
 	 */
 	if (preempt_thresh == 0)
 		return (0);
 	/*
 	 * Preempt if we exceed the threshold.
 	 */
 	if (pri <= preempt_thresh)
 		return (1);
 	/*
 	 * If we're interactive or better and there is non-interactive
 	 * or worse running preempt only remote processors.
 	 */
 	if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
 		return (1);
 	return (0);
 }
 
 /*
  * Add a thread to the actual run-queue.  Keeps transferable counts up to
  * date with what is actually on the run-queue.  Selects the correct
  * queue position for timeshare threads.
  */
 static __inline void
 tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct td_sched *ts;
 	u_char pri;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	pri = td->td_priority;
 	ts = td_get_sched(td);
 	TD_SET_RUNQ(td);
 	if (THREAD_CAN_MIGRATE(td)) {
 		tdq->tdq_transferable++;
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 	if (pri < PRI_MIN_BATCH) {
 		ts->ts_runq = &tdq->tdq_realtime;
 	} else if (pri <= PRI_MAX_BATCH) {
 		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
 			("Invalid priority %d on timeshare runq", pri));
 		/*
 		 * This queue contains only priorities between MIN and MAX
 		 * realtime.  Use the whole queue to represent these values.
 		 */
 		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
 			pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
 			pri = (pri + tdq->tdq_idx) % RQ_NQS;
 			/*
 			 * This effectively shortens the queue by one so we
 			 * can have a one slot difference between idx and
 			 * ridx while we wait for threads to drain.
 			 */
 			if (tdq->tdq_ridx != tdq->tdq_idx &&
 			    pri == tdq->tdq_ridx)
 				pri = (unsigned char)(pri - 1) % RQ_NQS;
 		} else
 			pri = tdq->tdq_ridx;
 		runq_add_pri(ts->ts_runq, td, pri, flags);
 		return;
 	} else
 		ts->ts_runq = &tdq->tdq_idle;
 	runq_add(ts->ts_runq, td, flags);
 }
 
 /* 
  * Remove a thread from a run-queue.  This typically happens when a thread
  * is selected to run.  Running threads are not on the queue and the
  * transferable count does not reflect them.
  */
 static __inline void
 tdq_runq_rem(struct tdq *tdq, struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT(ts->ts_runq != NULL,
 	    ("tdq_runq_remove: thread %p null ts_runq", td));
 	if (ts->ts_flags & TSF_XFERABLE) {
 		tdq->tdq_transferable--;
 		ts->ts_flags &= ~TSF_XFERABLE;
 	}
 	if (ts->ts_runq == &tdq->tdq_timeshare) {
 		if (tdq->tdq_idx != tdq->tdq_ridx)
 			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
 		else
 			runq_remove_idx(ts->ts_runq, td, NULL);
 	} else
 		runq_remove(ts->ts_runq, td);
 }
 
 /*
  * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
  * for this thread to the referenced thread queue.
  */
 static void
 tdq_load_add(struct tdq *tdq, struct thread *td)
 {
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	tdq->tdq_load++;
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload++;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
 	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
  * Remove the load from a thread that is transitioning to a sleep state or
  * exiting.
  */
 static void
 tdq_load_rem(struct tdq *tdq, struct thread *td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
 
 	tdq->tdq_load--;
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload--;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
 	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
  * Bound timeshare latency by decreasing slice size as load increases.  We
  * consider the maximum latency as the sum of the threads waiting to run
  * aside from curthread and target no more than sched_slice latency but
  * no less than sched_slice_min runtime.
  */
 static inline int
 tdq_slice(struct tdq *tdq)
 {
 	int load;
 
 	/*
 	 * It is safe to use sys_load here because this is called from
 	 * contexts where timeshare threads are running and so there
 	 * cannot be higher priority load in the system.
 	 */
 	load = tdq->tdq_sysload - 1;
 	if (load >= SCHED_SLICE_MIN_DIVISOR)
 		return (sched_slice_min);
 	if (load <= 1)
 		return (sched_slice);
 	return (sched_slice / load);
 }
 
 /*
  * Set lowpri to its exact value by searching the run-queue and
  * evaluating curthread.  curthread may be passed as an optimization.
  */
 static void
 tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
 {
 	struct thread *td;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	if (ctd == NULL)
 		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
 	td = tdq_choose(tdq);
 	if (td == NULL || td->td_priority > ctd->td_priority)
 		tdq->tdq_lowpri = ctd->td_priority;
 	else
 		tdq->tdq_lowpri = td->td_priority;
 }
 
 #ifdef SMP
 /*
  * We need some randomness. Implement a classic Linear Congruential
  * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for
  * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits
  * of the random state (in the low bits of our answer) to keep
  * the maximum randomness.
  */
 static uint32_t
 sched_random(void)
 {
 	uint32_t *rndptr;
 
 	rndptr = DPCPU_PTR(randomval);
 	*rndptr = *rndptr * 69069 + 5;
 
 	return (*rndptr >> 16);
 }
 
 struct cpu_search {
 	cpuset_t cs_mask;
 	u_int	cs_prefer;
 	int	cs_pri;		/* Min priority for low. */
 	int	cs_limit;	/* Max load for low, min load for high. */
 	int	cs_cpu;
 	int	cs_load;
 };
 
 #define	CPU_SEARCH_LOWEST	0x1
 #define	CPU_SEARCH_HIGHEST	0x2
 #define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
 
 static __always_inline int cpu_search(const struct cpu_group *cg,
     struct cpu_search *low, struct cpu_search *high, const int match);
 int __noinline cpu_search_lowest(const struct cpu_group *cg,
     struct cpu_search *low);
 int __noinline cpu_search_highest(const struct cpu_group *cg,
     struct cpu_search *high);
 int __noinline cpu_search_both(const struct cpu_group *cg,
     struct cpu_search *low, struct cpu_search *high);
 
 /*
  * Search the tree of cpu_groups for the lowest or highest loaded cpu
  * according to the match argument.  This routine actually compares the
  * load on all paths through the tree and finds the least loaded cpu on
  * the least loaded path, which may differ from the least loaded cpu in
  * the system.  This balances work among caches and buses.
  *
  * This inline is instantiated in three forms below using constants for the
  * match argument.  It is reduced to the minimum set for each case.  It is
  * also recursive to the depth of the tree.
  */
 static __always_inline int
 cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match)
 {
 	struct cpu_search lgroup;
 	struct cpu_search hgroup;
 	cpuset_t cpumask;
 	struct cpu_group *child;
 	struct tdq *tdq;
 	int cpu, i, hload, lload, load, total, rnd;
 
 	total = 0;
 	cpumask = cg->cg_mask;
 	if (match & CPU_SEARCH_LOWEST) {
 		lload = INT_MAX;
 		lgroup = *low;
 	}
 	if (match & CPU_SEARCH_HIGHEST) {
 		hload = INT_MIN;
 		hgroup = *high;
 	}
 
 	/* Iterate through the child CPU groups and then remaining CPUs. */
 	for (i = cg->cg_children, cpu = mp_maxid; ; ) {
 		if (i == 0) {
 #ifdef HAVE_INLINE_FFSL
 			cpu = CPU_FFS(&cpumask) - 1;
 #else
 			while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
 				cpu--;
 #endif
 			if (cpu < 0)
 				break;
 			child = NULL;
 		} else
 			child = &cg->cg_child[i - 1];
 
 		if (match & CPU_SEARCH_LOWEST)
 			lgroup.cs_cpu = -1;
 		if (match & CPU_SEARCH_HIGHEST)
 			hgroup.cs_cpu = -1;
 		if (child) {			/* Handle child CPU group. */
 			CPU_NAND(&cpumask, &child->cg_mask);
 			switch (match) {
 			case CPU_SEARCH_LOWEST:
 				load = cpu_search_lowest(child, &lgroup);
 				break;
 			case CPU_SEARCH_HIGHEST:
 				load = cpu_search_highest(child, &hgroup);
 				break;
 			case CPU_SEARCH_BOTH:
 				load = cpu_search_both(child, &lgroup, &hgroup);
 				break;
 			}
 		} else {			/* Handle child CPU. */
 			CPU_CLR(cpu, &cpumask);
 			tdq = TDQ_CPU(cpu);
 			load = tdq->tdq_load * 256;
 			rnd = sched_random() % 32;
 			if (match & CPU_SEARCH_LOWEST) {
 				if (cpu == low->cs_prefer)
 					load -= 64;
 				/* If that CPU is allowed and get data. */
 				if (tdq->tdq_lowpri > lgroup.cs_pri &&
 				    tdq->tdq_load <= lgroup.cs_limit &&
 				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
 					lgroup.cs_cpu = cpu;
 					lgroup.cs_load = load - rnd;
 				}
 			}
 			if (match & CPU_SEARCH_HIGHEST)
 				if (tdq->tdq_load >= hgroup.cs_limit &&
 				    tdq->tdq_transferable &&
 				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
 					hgroup.cs_cpu = cpu;
 					hgroup.cs_load = load - rnd;
 				}
 		}
 		total += load;
 
 		/* We have info about child item. Compare it. */
 		if (match & CPU_SEARCH_LOWEST) {
 			if (lgroup.cs_cpu >= 0 &&
 			    (load < lload ||
 			     (load == lload && lgroup.cs_load < low->cs_load))) {
 				lload = load;
 				low->cs_cpu = lgroup.cs_cpu;
 				low->cs_load = lgroup.cs_load;
 			}
 		}
 		if (match & CPU_SEARCH_HIGHEST)
 			if (hgroup.cs_cpu >= 0 &&
 			    (load > hload ||
 			     (load == hload && hgroup.cs_load > high->cs_load))) {
 				hload = load;
 				high->cs_cpu = hgroup.cs_cpu;
 				high->cs_load = hgroup.cs_load;
 			}
 		if (child) {
 			i--;
 			if (i == 0 && CPU_EMPTY(&cpumask))
 				break;
 		}
 #ifndef HAVE_INLINE_FFSL
 		else
 			cpu--;
 #endif
 	}
 	return (total);
 }
 
 /*
  * cpu_search instantiations must pass constants to maintain the inline
  * optimization.
  */
 int
 cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
 {
 	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
 }
 
 int
 cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
 {
 	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
 }
 
 int
 cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high)
 {
 	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
 }
 
 /*
  * Find the cpu with the least load via the least loaded path that has a
  * lowpri greater than pri  pri.  A pri of -1 indicates any priority is
  * acceptable.
  */
 static inline int
 sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
     int prefer)
 {
 	struct cpu_search low;
 
 	low.cs_cpu = -1;
 	low.cs_prefer = prefer;
 	low.cs_mask = mask;
 	low.cs_pri = pri;
 	low.cs_limit = maxload;
 	cpu_search_lowest(cg, &low);
 	return low.cs_cpu;
 }
 
 /*
  * Find the cpu with the highest load via the highest loaded path.
  */
 static inline int
 sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
 {
 	struct cpu_search high;
 
 	high.cs_cpu = -1;
 	high.cs_mask = mask;
 	high.cs_limit = minload;
 	cpu_search_highest(cg, &high);
 	return high.cs_cpu;
 }
 
 static void
 sched_balance_group(struct cpu_group *cg)
 {
 	struct tdq *tdq;
 	cpuset_t hmask, lmask;
 	int high, low, anylow;
 
 	CPU_FILL(&hmask);
 	for (;;) {
 		high = sched_highest(cg, hmask, 2);
 		/* Stop if there is no more CPU with transferrable threads. */
 		if (high == -1)
 			break;
 		CPU_CLR(high, &hmask);
 		CPU_COPY(&hmask, &lmask);
 		/* Stop if there is no more CPU left for low. */
 		if (CPU_EMPTY(&lmask))
 			break;
 		anylow = 1;
 		tdq = TDQ_CPU(high);
 nextlow:
 		low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
 		/* Go to next high if we found no less loaded CPU. */
 		if (low == -1)
 			continue;
 		/* Transfer thread from high to low. */
 		if (sched_balance_pair(tdq, TDQ_CPU(low))) {
 			/* CPU that got thread can no longer be a donor. */
 			CPU_CLR(low, &hmask);
 		} else {
 			/*
 			 * If failed, then there is no threads on high
 			 * that can run on this low. Drop low from low
 			 * mask and look for different one.
 			 */
 			CPU_CLR(low, &lmask);
 			anylow = 0;
 			goto nextlow;
 		}
 	}
 }
 
 static void
 sched_balance(void)
 {
 	struct tdq *tdq;
 
 	balance_ticks = max(balance_interval / 2, 1) +
 	    (sched_random() % balance_interval);
 	tdq = TDQ_SELF();
 	TDQ_UNLOCK(tdq);
 	sched_balance_group(cpu_top);
 	TDQ_LOCK(tdq);
 }
 
 /*
  * Lock two thread queues using their address to maintain lock order.
  */
 static void
 tdq_lock_pair(struct tdq *one, struct tdq *two)
 {
 	if (one < two) {
 		TDQ_LOCK(one);
 		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
 	} else {
 		TDQ_LOCK(two);
 		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
 	}
 }
 
 /*
  * Unlock two thread queues.  Order is not important here.
  */
 static void
 tdq_unlock_pair(struct tdq *one, struct tdq *two)
 {
 	TDQ_UNLOCK(one);
 	TDQ_UNLOCK(two);
 }
 
 /*
  * Transfer load between two imbalanced thread queues.
  */
 static int
 sched_balance_pair(struct tdq *high, struct tdq *low)
 {
 	struct thread *td;
 	int cpu;
 
 	tdq_lock_pair(high, low);
 	td = NULL;
 	/*
 	 * Transfer a thread from high to low.
 	 */
 	if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
 	    (td = tdq_move(high, low)) != NULL) {
 		/*
 		 * In case the target isn't the current cpu notify it of the
 		 * new load, possibly sending an IPI to force it to reschedule.
 		 */
 		cpu = TDQ_ID(low);
 		if (cpu != PCPU_GET(cpuid))
 			tdq_notify(low, td);
 	}
 	tdq_unlock_pair(high, low);
 	return (td != NULL);
 }
 
 /*
  * Move a thread from one thread queue to another.
  */
 static struct thread *
 tdq_move(struct tdq *from, struct tdq *to)
 {
 	struct td_sched *ts;
 	struct thread *td;
 	struct tdq *tdq;
 	int cpu;
 
 	TDQ_LOCK_ASSERT(from, MA_OWNED);
 	TDQ_LOCK_ASSERT(to, MA_OWNED);
 
 	tdq = from;
 	cpu = TDQ_ID(to);
 	td = tdq_steal(tdq, cpu);
 	if (td == NULL)
 		return (NULL);
 	ts = td_get_sched(td);
 	/*
 	 * Although the run queue is locked the thread may be blocked.  Lock
 	 * it to clear this and acquire the run-queue lock.
 	 */
 	thread_lock(td);
 	/* Drop recursive lock on from acquired via thread_lock(). */
 	TDQ_UNLOCK(from);
 	sched_rem(td);
 	ts->ts_cpu = cpu;
 	td->td_lock = TDQ_LOCKPTR(to);
 	tdq_add(to, td, SRQ_YIELDING);
 	return (td);
 }
 
 /*
  * This tdq has idled.  Try to steal a thread from another cpu and switch
  * to it.
  */
 static int
 tdq_idled(struct tdq *tdq)
 {
 	struct cpu_group *cg;
 	struct tdq *steal;
 	cpuset_t mask;
 	int cpu, switchcnt;
 
 	if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
 		return (1);
 	CPU_FILL(&mask);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
     restart:
 	switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 	for (cg = tdq->tdq_cg; ; ) {
 		cpu = sched_highest(cg, mask, steal_thresh);
 		/*
 		 * We were assigned a thread but not preempted.  Returning
 		 * 0 here will cause our caller to switch to it.
 		 */
 		if (tdq->tdq_load)
 			return (0);
 		if (cpu == -1) {
 			cg = cg->cg_parent;
 			if (cg == NULL)
 				return (1);
 			continue;
 		}
 		steal = TDQ_CPU(cpu);
 		/*
 		 * The data returned by sched_highest() is stale and
 		 * the chosen CPU no longer has an eligible thread.
 		 *
 		 * Testing this ahead of tdq_lock_pair() only catches
 		 * this situation about 20% of the time on an 8 core
 		 * 16 thread Ryzen 7, but it still helps performance.
 		 */
 		if (steal->tdq_load < steal_thresh ||
 		    steal->tdq_transferable == 0)
 			goto restart;
 		tdq_lock_pair(tdq, steal);
 		/*
 		 * We were assigned a thread while waiting for the locks.
 		 * Switch to it now instead of stealing a thread.
 		 */
 		if (tdq->tdq_load)
 			break;
 		/*
 		 * The data returned by sched_highest() is stale and
 		 * the chosen CPU no longer has an eligible thread, or
 		 * we were preempted and the CPU loading info may be out
 		 * of date.  The latter is rare.  In either case restart
 		 * the search.
 		 */
 		if (steal->tdq_load < steal_thresh ||
 		    steal->tdq_transferable == 0 ||
 		    switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
 			tdq_unlock_pair(tdq, steal);
 			goto restart;
 		}
 		/*
 		 * Steal the thread and switch to it.
 		 */
 		if (tdq_move(steal, tdq) != NULL)
 			break;
 		/*
 		 * We failed to acquire a thread even though it looked
 		 * like one was available.  This could be due to affinity
 		 * restrictions or for other reasons.  Loop again after
 		 * removing this CPU from the set.  The restart logic
 		 * above does not restore this CPU to the set due to the
 		 * likelyhood of failing here again.
 		 */
 		CPU_CLR(cpu, &mask);
 		tdq_unlock_pair(tdq, steal);
 	}
 	TDQ_UNLOCK(steal);
 	mi_switch(SW_VOL | SWT_IDLE, NULL);
 	thread_unlock(curthread);
 	return (0);
 }
 
 /*
  * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
  */
 static void
 tdq_notify(struct tdq *tdq, struct thread *td)
 {
 	struct thread *ctd;
 	int pri;
 	int cpu;
 
 	if (tdq->tdq_ipipending)
 		return;
 	cpu = td_get_sched(td)->ts_cpu;
 	pri = td->td_priority;
 	ctd = pcpu_find(cpu)->pc_curthread;
 	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
 		return;
 
 	/*
 	 * Make sure that our caller's earlier update to tdq_load is
 	 * globally visible before we read tdq_cpu_idle.  Idle thread
 	 * accesses both of them without locks, and the order is important.
 	 */
 	atomic_thread_fence_seq_cst();
 
 	if (TD_IS_IDLETHREAD(ctd)) {
 		/*
 		 * If the MD code has an idle wakeup routine try that before
 		 * falling back to IPI.
 		 */
 		if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
 			return;
 	}
 	tdq->tdq_ipipending = 1;
 	ipi_cpu(cpu, IPI_PREEMPT);
 }
 
 /*
  * Steals load from a timeshare queue.  Honors the rotating queue head
  * index.
  */
 static struct thread *
 runq_steal_from(struct runq *rq, int cpu, u_char start)
 {
 	struct rqbits *rqb;
 	struct rqhead *rqh;
 	struct thread *td, *first;
 	int bit;
 	int i;
 
 	rqb = &rq->rq_status;
 	bit = start & (RQB_BPW -1);
 	first = NULL;
 again:
 	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
 		if (rqb->rqb_bits[i] == 0)
 			continue;
 		if (bit == 0)
 			bit = RQB_FFS(rqb->rqb_bits[i]);
 		for (; bit < RQB_BPW; bit++) {
 			if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
 				continue;
 			rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
 			TAILQ_FOREACH(td, rqh, td_runq) {
 				if (first && THREAD_CAN_MIGRATE(td) &&
 				    THREAD_CAN_SCHED(td, cpu))
 					return (td);
 				first = td;
 			}
 		}
 	}
 	if (start != 0) {
 		start = 0;
 		goto again;
 	}
 
 	if (first && THREAD_CAN_MIGRATE(first) &&
 	    THREAD_CAN_SCHED(first, cpu))
 		return (first);
 	return (NULL);
 }
 
 /*
  * Steals load from a standard linear queue.
  */
 static struct thread *
 runq_steal(struct runq *rq, int cpu)
 {
 	struct rqhead *rqh;
 	struct rqbits *rqb;
 	struct thread *td;
 	int word;
 	int bit;
 
 	rqb = &rq->rq_status;
 	for (word = 0; word < RQB_LEN; word++) {
 		if (rqb->rqb_bits[word] == 0)
 			continue;
 		for (bit = 0; bit < RQB_BPW; bit++) {
 			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
 				continue;
 			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
 			TAILQ_FOREACH(td, rqh, td_runq)
 				if (THREAD_CAN_MIGRATE(td) &&
 				    THREAD_CAN_SCHED(td, cpu))
 					return (td);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Attempt to steal a thread in priority order from a thread queue.
  */
 static struct thread *
 tdq_steal(struct tdq *tdq, int cpu)
 {
 	struct thread *td;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
 		return (td);
 	if ((td = runq_steal_from(&tdq->tdq_timeshare,
 	    cpu, tdq->tdq_ridx)) != NULL)
 		return (td);
 	return (runq_steal(&tdq->tdq_idle, cpu));
 }
 
 /*
  * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
  * current lock and returns with the assigned queue locked.
  */
 static inline struct tdq *
 sched_setcpu(struct thread *td, int cpu, int flags)
 {
 
 	struct tdq *tdq;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_CPU(cpu);
 	td_get_sched(td)->ts_cpu = cpu;
 	/*
 	 * If the lock matches just return the queue.
 	 */
 	if (td->td_lock == TDQ_LOCKPTR(tdq))
 		return (tdq);
 #ifdef notyet
 	/*
 	 * If the thread isn't running its lockptr is a
 	 * turnstile or a sleepqueue.  We can just lock_set without
 	 * blocking.
 	 */
 	if (TD_CAN_RUN(td)) {
 		TDQ_LOCK(tdq);
 		thread_lock_set(td, TDQ_LOCKPTR(tdq));
 		return (tdq);
 	}
 #endif
 	/*
 	 * The hard case, migration, we need to block the thread first to
 	 * prevent order reversals with other cpus locks.
 	 */
 	spinlock_enter();
 	thread_lock_block(td);
 	TDQ_LOCK(tdq);
 	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
 	spinlock_exit();
 	return (tdq);
 }
 
 SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
 SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
 SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
 SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
 SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
 SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
 
 static int
 sched_pickcpu(struct thread *td, int flags)
 {
 	struct cpu_group *cg, *ccg;
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t mask;
 	int cpu, pri, self, intr;
 
 	self = PCPU_GET(cpuid);
 	ts = td_get_sched(td);
 	KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on "
 	    "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name));
 	if (smp_started == 0)
 		return (self);
 	/*
 	 * Don't migrate a running thread from sched_switch().
 	 */
 	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
 		return (ts->ts_cpu);
 	/*
 	 * Prefer to run interrupt threads on the processors that generate
 	 * the interrupt.
 	 */
 	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
 	    curthread->td_intr_nesting_level) {
 		tdq = TDQ_SELF();
 		if (tdq->tdq_lowpri >= PRI_MIN_IDLE) {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
 			return (self);
 		}
 		ts->ts_cpu = self;
 		intr = 1;
 		cg = tdq->tdq_cg;
 		goto llc;
 	} else {
 		intr = 0;
 		tdq = TDQ_CPU(ts->ts_cpu);
 		cg = tdq->tdq_cg;
 	}
 	/*
 	 * If the thread can run on the last cpu and the affinity has not
 	 * expired and it is idle, run it there.
 	 */
 	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
 	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
 	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
 		if (cg->cg_flags & CG_FLAG_THREAD) {
 			/* Check all SMT threads for being idle. */
 			for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
 				if (CPU_ISSET(cpu, &cg->cg_mask) &&
 				    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
 					break;
 				if (cpu >= mp_maxid) {
 					SCHED_STAT_INC(pickcpu_idle_affinity);
 					return (ts->ts_cpu);
 				}
 			}
 		} else {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
 			return (ts->ts_cpu);
 		}
 	}
 llc:
 	/*
 	 * Search for the last level cache CPU group in the tree.
 	 * Skip SMT, identical groups and caches with expired affinity.
 	 * Interrupt threads affinity is explicit and never expires.
 	 */
 	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
 		if (cg->cg_flags & CG_FLAG_THREAD)
 			continue;
 		if (cg->cg_children == 1 || cg->cg_count == 1)
 			continue;
 		if (cg->cg_level == CG_SHARE_NONE ||
 		    (!intr && !SCHED_AFFINITY(ts, cg->cg_level)))
 			continue;
 		ccg = cg;
 	}
 	/* Found LLC shared by all CPUs, so do a global search. */
 	if (ccg == cpu_top)
 		ccg = NULL;
 	cpu = -1;
 	mask = td->td_cpuset->cs_mask;
 	pri = td->td_priority;
 	/*
 	 * Try hard to keep interrupts within found LLC.  Search the LLC for
 	 * the least loaded CPU we can run now.  For NUMA systems it should
 	 * be within target domain, and it also reduces scheduling overhead.
 	 */
 	if (ccg != NULL && intr) {
 		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_intrbind);
 	} else
 	/* Search the LLC for the least loaded idle CPU we can run now. */
 	if (ccg != NULL) {
 		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
 		    INT_MAX, ts->ts_cpu);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_affinity);
 	}
 	/* Search globally for the least loaded CPU we can run now. */
 	if (cpu < 0) {
 		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
 	/* Search globally for the least loaded CPU. */
 	if (cpu < 0) {
 		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
 	KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu."));
 	KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu));
 	/*
 	 * Compare the lowest loaded cpu to current cpu.
 	 */
 	tdq = TDQ_CPU(cpu);
 	if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
 	    tdq->tdq_lowpri < PRI_MIN_IDLE &&
 	    TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	}
 	if (cpu != ts->ts_cpu)
 		SCHED_STAT_INC(pickcpu_migration);
 	return (cpu);
 }
 #endif
 
 /*
  * Pick the highest priority task we have and return it.
  */
 static struct thread *
 tdq_choose(struct tdq *tdq)
 {
 	struct thread *td;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	td = runq_choose(&tdq->tdq_realtime);
 	if (td != NULL)
 		return (td);
 	td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
 	if (td != NULL) {
 		KASSERT(td->td_priority >= PRI_MIN_BATCH,
 		    ("tdq_choose: Invalid priority on timeshare queue %d",
 		    td->td_priority));
 		return (td);
 	}
 	td = runq_choose(&tdq->tdq_idle);
 	if (td != NULL) {
 		KASSERT(td->td_priority >= PRI_MIN_IDLE,
 		    ("tdq_choose: Invalid priority on idle queue %d",
 		    td->td_priority));
 		return (td);
 	}
 
 	return (NULL);
 }
 
 /*
  * Initialize a thread queue.
  */
 static void
 tdq_setup(struct tdq *tdq, int id)
 {
 
 	if (bootverbose)
 		printf("ULE: setup cpu %d\n", id);
 	runq_init(&tdq->tdq_realtime);
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
 	tdq->tdq_id = id;
 	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
 	    "sched lock %d", (int)TDQ_ID(tdq));
 	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
 	    MTX_SPIN | MTX_RECURSE);
 #ifdef KTR
 	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
 	    "CPU %d load", (int)TDQ_ID(tdq));
 #endif
 }
 
 #ifdef SMP
 static void
 sched_setup_smp(void)
 {
 	struct tdq *tdq;
 	int i;
 
 	cpu_top = smp_topo();
 	CPU_FOREACH(i) {
 		tdq = DPCPU_ID_PTR(i, tdq);
 		tdq_setup(tdq, i);
 		tdq->tdq_cg = smp_topo_find(cpu_top, i);
 		if (tdq->tdq_cg == NULL)
 			panic("Can't find cpu group for %d\n", i);
 	}
 	balance_tdq = TDQ_SELF();
 }
 #endif
 
 /*
  * Setup the thread queues and initialize the topology based on MD
  * information.
  */
 static void
 sched_setup(void *dummy)
 {
 	struct tdq *tdq;
 
 #ifdef SMP
 	sched_setup_smp();
 #else
 	tdq_setup(TDQ_SELF(), 0);
 #endif
 	tdq = TDQ_SELF();
 
 	/* Add thread0's load since it's running. */
 	TDQ_LOCK(tdq);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	tdq_load_add(tdq, &thread0);
 	tdq->tdq_lowpri = thread0.td_priority;
 	TDQ_UNLOCK(tdq);
 }
 
 /*
  * This routine determines time constants after stathz and hz are setup.
  */
 /* ARGSUSED */
 static void
 sched_initticks(void *dummy)
 {
 	int incr;
 
 	realstathz = stathz ? stathz : hz;
 	sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
 	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 
 	/*
 	 * tickincr is shifted out by 10 to avoid rounding errors due to
 	 * hz not being evenly divisible by stathz on all platforms.
 	 */
 	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
 	/*
 	 * This does not work for values of stathz that are more than
 	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
 	 */
 	if (incr == 0)
 		incr = 1;
 	tickincr = incr;
 #ifdef SMP
 	/*
 	 * Set the default balance interval now that we know
 	 * what realstathz is.
 	 */
 	balance_interval = realstathz;
 	balance_ticks = balance_interval;
 	affinity = SCHED_AFFINITY_DEFAULT;
 #endif
 	if (sched_idlespinthresh < 0)
 		sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
 }
 
 
 /*
  * This is the core of the interactivity algorithm.  Determines a score based
  * on past behavior.  It is the ratio of sleep time to run time scaled to
  * a [0, 100] integer.  This is the voluntary sleep time of a process, which
  * differs from the cpu usage because it does not account for time spent
  * waiting on a run-queue.  Would be prettier if we had floating point.
  *
  * When a thread's sleep time is greater than its run time the
  * calculation is:
  *
  *                           scaling factor 
  * interactivity score =  ---------------------
  *                        sleep time / run time
  *
  *
  * When a thread's run time is greater than its sleep time the
  * calculation is:
  *
  *                           scaling factor 
  * interactivity score =  ---------------------    + scaling factor
  *                        run time / sleep time
  */
 static int
 sched_interact_score(struct thread *td)
 {
 	struct td_sched *ts;
 	int div;
 
 	ts = td_get_sched(td);
 	/*
 	 * The score is only needed if this is likely to be an interactive
 	 * task.  Don't go through the expense of computing it if there's
 	 * no chance.
 	 */
 	if (sched_interact <= SCHED_INTERACT_HALF &&
 		ts->ts_runtime >= ts->ts_slptime)
 			return (SCHED_INTERACT_HALF);
 
 	if (ts->ts_runtime > ts->ts_slptime) {
 		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
 		return (SCHED_INTERACT_HALF +
 		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
 	}
 	if (ts->ts_slptime > ts->ts_runtime) {
 		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
 		return (ts->ts_runtime / div);
 	}
 	/* runtime == slptime */
 	if (ts->ts_runtime)
 		return (SCHED_INTERACT_HALF);
 
 	/*
 	 * This can happen if slptime and runtime are 0.
 	 */
 	return (0);
 
 }
 
 /*
  * Scale the scheduling priority according to the "interactivity" of this
  * process.
  */
 static void
 sched_priority(struct thread *td)
 {
 	int score;
 	int pri;
 
 	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
 		return;
 	/*
 	 * If the score is interactive we place the thread in the realtime
 	 * queue with a priority that is less than kernel and interrupt
 	 * priorities.  These threads are not subject to nice restrictions.
 	 *
 	 * Scores greater than this are placed on the normal timeshare queue
 	 * where the priority is partially decided by the most recent cpu
 	 * utilization and the rest is decided by nice value.
 	 *
 	 * The nice value of the process has a linear effect on the calculated
 	 * score.  Negative nice values make it easier for a thread to be
 	 * considered interactive.
 	 */
 	score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
 	if (score < sched_interact) {
 		pri = PRI_MIN_INTERACT;
 		pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
 		    sched_interact) * score;
 		KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
 		    ("sched_priority: invalid interactive priority %d score %d",
 		    pri, score));
 	} else {
 		pri = SCHED_PRI_MIN;
 		if (td_get_sched(td)->ts_ticks)
 			pri += min(SCHED_PRI_TICKS(td_get_sched(td)),
 			    SCHED_PRI_RANGE - 1);
 		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
 		KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
 		    ("sched_priority: invalid priority %d: nice %d, " 
 		    "ticks %d ftick %d ltick %d tick pri %d",
 		    pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks,
 		    td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick,
 		    SCHED_PRI_TICKS(td_get_sched(td))));
 	}
 	sched_user_prio(td, pri);
 
 	return;
 }
 
 /*
  * This routine enforces a maximum limit on the amount of scheduling history
  * kept.  It is called after either the slptime or runtime is adjusted.  This
  * function is ugly due to integer math.
  */
 static void
 sched_interact_update(struct thread *td)
 {
 	struct td_sched *ts;
 	u_int sum;
 
 	ts = td_get_sched(td);
 	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
 		return;
 	/*
 	 * This only happens from two places:
 	 * 1) We have added an unusual amount of run time from fork_exit.
 	 * 2) We have added an unusual amount of sleep time from sched_sleep().
 	 */
 	if (sum > SCHED_SLP_RUN_MAX * 2) {
 		if (ts->ts_runtime > ts->ts_slptime) {
 			ts->ts_runtime = SCHED_SLP_RUN_MAX;
 			ts->ts_slptime = 1;
 		} else {
 			ts->ts_slptime = SCHED_SLP_RUN_MAX;
 			ts->ts_runtime = 1;
 		}
 		return;
 	}
 	/*
 	 * If we have exceeded by more than 1/5th then the algorithm below
 	 * will not bring us back into range.  Dividing by two here forces
 	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
 	 */
 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
 		ts->ts_runtime /= 2;
 		ts->ts_slptime /= 2;
 		return;
 	}
 	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
 	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
 }
 
 /*
  * Scale back the interactivity history when a child thread is created.  The
  * history is inherited from the parent but the thread may behave totally
  * differently.  For example, a shell spawning a compiler process.  We want
  * to learn that the compiler is behaving badly very quickly.
  */
 static void
 sched_interact_fork(struct thread *td)
 {
 	struct td_sched *ts;
 	int ratio;
 	int sum;
 
 	ts = td_get_sched(td);
 	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum > SCHED_SLP_RUN_FORK) {
 		ratio = sum / SCHED_SLP_RUN_FORK;
 		ts->ts_runtime /= ratio;
 		ts->ts_slptime /= ratio;
 	}
 }
 
 /*
  * Called from proc0_init() to setup the scheduler fields.
  */
 void
 schedinit(void)
 {
 	struct td_sched *ts0;
 
 	/*
 	 * Set up the scheduler specific parts of thread0.
 	 */
 	ts0 = td_get_sched(&thread0);
 	ts0->ts_ltick = ticks;
 	ts0->ts_ftick = ticks;
 	ts0->ts_slice = 0;
 	ts0->ts_cpu = curcpu;	/* set valid CPU number */
 }
 
 /*
  * This is only somewhat accurate since given many processes of the same
  * priority they will switch when their slices run out, which will be
  * at most sched_slice stathz ticks.
  */
 int
 sched_rr_interval(void)
 {
 
 	/* Convert sched_slice from stathz to hz. */
 	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
 }
 
 /*
  * Update the percent cpu tracking information when it is requested or
  * the total history exceeds the maximum.  We keep a sliding history of
  * tick counts that slowly decays.  This is less precise than the 4BSD
  * mechanism since it happens with less regular and frequent events.
  */
 static void
 sched_pctcpu_update(struct td_sched *ts, int run)
 {
 	int t = ticks;
 
 	/*
 	 * The signed difference may be negative if the thread hasn't run for
 	 * over half of the ticks rollover period.
 	 */
 	if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) {
 		ts->ts_ticks = 0;
 		ts->ts_ftick = t - SCHED_TICK_TARG;
 	} else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
 		ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
 		    (ts->ts_ltick - (t - SCHED_TICK_TARG));
 		ts->ts_ftick = t - SCHED_TICK_TARG;
 	}
 	if (run)
 		ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
 	ts->ts_ltick = t;
 }
 
 /*
  * Adjust the priority of a thread.  Move it to the appropriate run-queue
  * if necessary.  This is the back-end for several priority related
  * functions.
  */
 static void
 sched_thread_priority(struct thread *td, u_char prio)
 {
 	struct td_sched *ts;
 	struct tdq *tdq;
 	int oldpri;
 
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
 	    "prio:%d", td->td_priority, "new prio:%d", prio,
 	    KTR_ATTR_LINKED, sched_tdname(curthread));
 	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
 	if (td != curthread && prio < td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
 		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, 
 		    curthread);
 	} 
 	ts = td_get_sched(td);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
 		return;
 	/*
 	 * If the priority has been elevated due to priority
 	 * propagation, we may have to move ourselves to a new
 	 * queue.  This could be optimized to not re-add in some
 	 * cases.
 	 */
 	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
 		sched_rem(td);
 		td->td_priority = prio;
 		sched_add(td, SRQ_BORROWING);
 		return;
 	}
 	/*
 	 * If the thread is currently running we may have to adjust the lowpri
 	 * information so other cpus are aware of our current priority.
 	 */
 	if (TD_IS_RUNNING(td)) {
 		tdq = TDQ_CPU(ts->ts_cpu);
 		oldpri = td->td_priority;
 		td->td_priority = prio;
 		if (prio < tdq->tdq_lowpri)
 			tdq->tdq_lowpri = prio;
 		else if (tdq->tdq_lowpri == oldpri)
 			tdq_setlowpri(tdq, td);
 		return;
 	}
 	td->td_priority = prio;
 }
 
 /*
  * Update a thread's priority when it is lent another thread's
  * priority.
  */
 void
 sched_lend_prio(struct thread *td, u_char prio)
 {
 
 	td->td_flags |= TDF_BORROWING;
 	sched_thread_priority(td, prio);
 }
 
 /*
  * Restore a thread's priority when priority propagation is
  * over.  The prio argument is the minimum priority the thread
  * needs to have to satisfy other possible priority lending
  * requests.  If the thread's regular priority is less
  * important than prio, the thread will keep a priority boost
  * of prio.
  */
 void
 sched_unlend_prio(struct thread *td, u_char prio)
 {
 	u_char base_pri;
 
 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
 		base_pri = td->td_user_pri;
 	else
 		base_pri = td->td_base_pri;
 	if (prio >= base_pri) {
 		td->td_flags &= ~TDF_BORROWING;
 		sched_thread_priority(td, base_pri);
 	} else
 		sched_lend_prio(td, prio);
 }
 
 /*
  * Standard entry for setting the priority to an absolute value.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	u_char oldprio;
 
 	/* First, update the base priority. */
 	td->td_base_pri = prio;
 
 	/*
 	 * If the thread is borrowing another thread's priority, don't
 	 * ever lower the priority.
 	 */
 	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
 		return;
 
 	/* Change the real priority. */
 	oldprio = td->td_priority;
 	sched_thread_priority(td, prio);
 
 	/*
 	 * If the thread is on a turnstile, then let the turnstile update
 	 * its state.
 	 */
 	if (TD_ON_LOCK(td) && oldprio != prio)
 		turnstile_adjust(td, oldprio);
 }
 
 /*
  * Set the base user priority, does not effect current running priority.
  */
 void
 sched_user_prio(struct thread *td, u_char prio)
 {
 
 	td->td_base_user_pri = prio;
 	if (td->td_lend_user_pri <= prio)
 		return;
 	td->td_user_pri = prio;
 }
 
 void
 sched_lend_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_lend_user_pri = prio;
 	td->td_user_pri = min(prio, td->td_base_user_pri);
 	if (td->td_priority > td->td_user_pri)
 		sched_prio(td, td->td_user_pri);
 	else if (td->td_priority != td->td_user_pri)
 		td->td_flags |= TDF_NEEDRESCHED;
 }
 
 #ifdef SMP
 /*
  * This tdq is about to idle.  Try to steal a thread from another CPU before
  * choosing the idle thread.
  */
 static void
 tdq_trysteal(struct tdq *tdq)
 {
 	struct cpu_group *cg;
 	struct tdq *steal;
 	cpuset_t mask;
 	int cpu, i;
 
 	if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL)
 		return;
 	CPU_FILL(&mask);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
 	/* We don't want to be preempted while we're iterating. */
 	spinlock_enter();
 	TDQ_UNLOCK(tdq);
 	for (i = 1, cg = tdq->tdq_cg; ; ) {
 		cpu = sched_highest(cg, mask, steal_thresh);
 		/*
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.
 		 */
 		if (tdq->tdq_load > 0) {
 			TDQ_LOCK(tdq);
 			break;
 		}
 		if (cpu == -1) {
 			i++;
 			cg = cg->cg_parent;
 			if (cg == NULL || i > trysteal_limit) {
 				TDQ_LOCK(tdq);
 				break;
 			}
 			continue;
 		}
 		steal = TDQ_CPU(cpu);
 		/*
 		 * The data returned by sched_highest() is stale and
                  * the chosen CPU no longer has an eligible thread.
 		 */
 		if (steal->tdq_load < steal_thresh ||
 		    steal->tdq_transferable == 0)
 			continue;
 		tdq_lock_pair(tdq, steal);
 		/*
 		 * If we get to this point, unconditonally exit the loop
 		 * to bound the time spent in the critcal section.
 		 *
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.
 		 */
 		if (tdq->tdq_load > 0) {
 			TDQ_UNLOCK(steal);
 			break;
 		}
 		/*
 		 * The data returned by sched_highest() is stale and
                  * the chosen CPU no longer has an eligible thread.
 		 */
 		if (steal->tdq_load < steal_thresh ||
 		    steal->tdq_transferable == 0) {
 			TDQ_UNLOCK(steal);
 			break;
 		}
 		/*
 		 * If we fail to acquire one due to affinity restrictions,
 		 * bail out and let the idle thread to a more complete search
 		 * outside of a critical section.
 		 */
 		if (tdq_move(steal, tdq) == NULL) {
 			TDQ_UNLOCK(steal);
 			break;
 		}
 		TDQ_UNLOCK(steal);
 		break;
 	}
 	spinlock_exit();
 }
 #endif
 
 /*
  * Handle migration from sched_switch().  This happens only for
  * cpu binding.
  */
 static struct mtx *
 sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct tdq *tdn;
 
 	KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: "
 	    "thread %s queued on absent CPU %d.", td->td_name,
 	    td_get_sched(td)->ts_cpu));
 	tdn = TDQ_CPU(td_get_sched(td)->ts_cpu);
 #ifdef SMP
 	tdq_load_rem(tdq, td);
 	/*
 	 * Do the lock dance required to avoid LOR.  We grab an extra
 	 * spinlock nesting to prevent preemption while we're
 	 * not holding either run-queue lock.
 	 */
 	spinlock_enter();
 	thread_lock_block(td);	/* This releases the lock on tdq. */
 
 	/*
 	 * Acquire both run-queue locks before placing the thread on the new
 	 * run-queue to avoid deadlocks created by placing a thread with a
 	 * blocked lock on the run-queue of a remote processor.  The deadlock
 	 * occurs when a third processor attempts to lock the two queues in
 	 * question while the target processor is spinning with its own
 	 * run-queue lock held while waiting for the blocked lock to clear.
 	 */
 	tdq_lock_pair(tdn, tdq);
 	tdq_add(tdn, td, flags);
 	tdq_notify(tdn, td);
 	TDQ_UNLOCK(tdn);
 	spinlock_exit();
 #endif
 	return (TDQ_LOCKPTR(tdn));
 }
 
 /*
  * Variadic version of thread_lock_unblock() that does not assume td_lock
  * is blocked.
  */
 static inline void
 thread_unblock_switch(struct thread *td, struct mtx *mtx)
 {
 	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
 	    (uintptr_t)mtx);
 }
 
 /*
  * Switch threads.  This function has to handle threads coming in while
  * blocked for some reason, running, or idle.  It also must deal with
  * migrating a thread from one queue to another as running threads may
  * be assigned elsewhere via binding.
  */
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
 	struct mtx *mtx;
 	int srqflag;
 	int cpuid, preempted;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
 
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_SELF();
 	ts = td_get_sched(td);
 	mtx = td->td_lock;
 	sched_pctcpu_update(ts, 1);
 	ts->ts_rltick = ticks;
 	td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
 	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
 	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	if (!TD_IS_IDLETHREAD(td))
 		tdq->tdq_switchcnt++;
 	/*
 	 * The lock pointer in an idle thread should never change.  Reset it
 	 * to CAN_RUN as well.
 	 */
 	if (TD_IS_IDLETHREAD(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		TD_SET_CAN_RUN(td);
 	} else if (TD_IS_RUNNING(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		srqflag = preempted ?
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 #ifdef SMP
 		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
 			ts->ts_cpu = sched_pickcpu(td, 0);
 #endif
 		if (ts->ts_cpu == cpuid)
 			tdq_runq_add(tdq, td, srqflag);
 		else {
 			KASSERT(THREAD_CAN_MIGRATE(td) ||
 			    (ts->ts_flags & TSF_BOUND) != 0,
 			    ("Thread %p shouldn't migrate", td));
 			mtx = sched_switch_migrate(tdq, td, srqflag);
 		}
 	} else {
 		/* This thread must be going to sleep. */
 		TDQ_LOCK(tdq);
 		mtx = thread_lock_block(td);
 		tdq_load_rem(tdq, td);
 #ifdef SMP
 		if (tdq->tdq_load == 0)
 			tdq_trysteal(tdq);
 #endif
 	}
 
 #if (KTR_COMPILE & KTR_SCHED) != 0
 	if (TD_IS_IDLETHREAD(td))
 		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
 		    "prio:%d", td->td_priority);
 	else
 		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
 		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
 		    "lockname:\"%s\"", td->td_lockname);
 #endif
 
 	/*
 	 * We enter here with the thread blocked and assigned to the
 	 * appropriate cpu run-queue or sleep-queue and with the current
 	 * thread-queue locked.
 	 */
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	newtd = choosethread();
 	/*
 	 * Call the MD code to switch contexts if necessary.
 	 */
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 		sched_pctcpu_update(td_get_sched(newtd), 0);
 
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If DTrace has set the active vtime enum to anything
 		 * other than INACTIVE (0), then it should have set the
 		 * function to call.
 		 */
 		if (dtrace_vtime_active)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
 		cpu_switch(td, newtd, mtx);
 		/*
 		 * We may return from cpu_switch on a different cpu.  However,
 		 * we always return with td_lock pointing to the current cpu's
 		 * run queue lock.
 		 */
 		cpuid = PCPU_GET(cpuid);
 		tdq = TDQ_SELF();
 		lock_profile_obtain_lock_success(
 		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
 
 		SDT_PROBE0(sched, , , on__cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else {
 		thread_unblock_switch(td, mtx);
 		SDT_PROBE0(sched, , , remain__cpu);
 	}
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
 	    "prio:%d", td->td_priority);
 
 	/*
 	 * Assert that all went well and return.
 	 */
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	td->td_oncpu = cpuid;
 }
 
 /*
  * Adjust thread priorities as a result of a nice request.
  */
 void
 sched_nice(struct proc *p, int nice)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	p->p_nice = nice;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		sched_priority(td);
 		sched_prio(td, td->td_base_user_pri);
 		thread_unlock(td);
 	}
 }
 
 /*
  * Record the sleep time for the interactivity scorer.
  */
 void
 sched_sleep(struct thread *td, int prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	td->td_slptick = ticks;
 	if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
 		td->td_flags |= TDF_CANSWAP;
 	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
 		return;
 	if (static_boost == 1 && prio)
 		sched_prio(td, prio);
 	else if (static_boost && td->td_priority > static_boost)
 		sched_prio(td, static_boost);
 }
 
 /*
  * Schedule a thread to resume execution and record how long it voluntarily
  * slept.  We also update the pctcpu, interactivity, and priority.
  */
 void
 sched_wakeup(struct thread *td)
 {
 	struct td_sched *ts;
 	int slptick;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	td->td_flags &= ~TDF_CANSWAP;
 	/*
 	 * If we slept for more than a tick update our interactivity and
 	 * priority.
 	 */
 	slptick = td->td_slptick;
 	td->td_slptick = 0;
 	if (slptick && slptick != ticks) {
 		ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
 		sched_interact_update(td);
 		sched_pctcpu_update(ts, 0);
 	}
 	/*
 	 * Reset the slice value since we slept and advanced the round-robin.
 	 */
 	ts->ts_slice = 0;
 	sched_add(td, SRQ_BORING);
 }
 
 /*
  * Penalize the parent for creating a new child and initialize the child's
  * priority.
  */
 void
 sched_fork(struct thread *td, struct thread *child)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched_pctcpu_update(td_get_sched(td), 1);
 	sched_fork_thread(td, child);
 	/*
 	 * Penalize the parent and child for forking.
 	 */
 	sched_interact_fork(child);
 	sched_priority(child);
 	td_get_sched(td)->ts_runtime += tickincr;
 	sched_interact_update(td);
 	sched_priority(td);
 }
 
 /*
  * Fork a new thread, may be within the same process.
  */
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 	struct td_sched *ts;
 	struct td_sched *ts2;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Initialize child.
 	 */
 	ts = td_get_sched(td);
 	ts2 = td_get_sched(child);
 	child->td_oncpu = NOCPU;
 	child->td_lastcpu = NOCPU;
 	child->td_lock = TDQ_LOCKPTR(tdq);
 	child->td_cpuset = cpuset_ref(td->td_cpuset);
 	child->td_domain.dr_policy = td->td_cpuset->cs_domain;
 	ts2->ts_cpu = ts->ts_cpu;
 	ts2->ts_flags = 0;
 	/*
 	 * Grab our parents cpu estimation information.
 	 */
 	ts2->ts_ticks = ts->ts_ticks;
 	ts2->ts_ltick = ts->ts_ltick;
 	ts2->ts_ftick = ts->ts_ftick;
 	/*
 	 * Do not inherit any borrowed priority from the parent.
 	 */
 	child->td_priority = child->td_base_pri;
 	/*
 	 * And update interactivity score.
 	 */
 	ts2->ts_slptime = ts->ts_slptime;
 	ts2->ts_runtime = ts->ts_runtime;
 	/* Attempt to quickly learn interactivity. */
 	ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
 #ifdef KTR
 	bzero(ts2->ts_name, sizeof(ts2->ts_name));
 #endif
 }
 
 /*
  * Adjust the priority class of a thread.
  */
 void
 sched_class(struct thread *td, int class)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_pri_class == class)
 		return;
 	td->td_pri_class = class;
 }
 
 /*
  * Return some of the child's priority and interactivity to the parent.
  */
 void
 sched_exit(struct proc *p, struct thread *child)
 {
 	struct thread *td;
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
 	    "prio:%d", child->td_priority);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	td = FIRST_THREAD_IN_PROC(p);
 	sched_exit_thread(td, child);
 }
 
 /*
  * Penalize another thread for the time spent on this one.  This helps to
  * worsen the priority and interactivity of processes which schedule batch
  * jobs such as make.  This has little effect on the make process itself but
  * causes new processes spawned by it to receive worse scores immediately.
  */
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
 	    "prio:%d", child->td_priority);
 	/*
 	 * Give the child's runtime to the parent without returning the
 	 * sleep time as a penalty to the parent.  This causes shells that
 	 * launch expensive things to mark their children as expensive.
 	 */
 	thread_lock(td);
 	td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime;
 	sched_interact_update(td);
 	sched_priority(td);
 	thread_unlock(td);
 }
 
 void
 sched_preempt(struct thread *td)
 {
 	struct tdq *tdq;
 
 	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 
 	thread_lock(td);
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	tdq->tdq_ipipending = 0;
 	if (td->td_priority > tdq->tdq_lowpri) {
 		int flags;
 
 		flags = SW_INVOL | SW_PREEMPT;
 		if (td->td_critnest > 1)
 			td->td_owepreempt = 1;
 		else if (TD_IS_IDLETHREAD(td))
 			mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
 		else
 			mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
 	}
 	thread_unlock(td);
 }
 
 /*
  * Fix priorities on return to user-space.  Priorities may be elevated due
  * to static priorities in msleep() or similar.
  */
 void
 sched_userret_slowpath(struct thread *td)
 {
 
 	thread_lock(td);
 	td->td_priority = td->td_user_pri;
 	td->td_base_pri = td->td_user_pri;
 	tdq_setlowpri(TDQ_SELF(), td);
 	thread_unlock(td);
 }
 
 /*
  * Handle a stathz tick.  This is really only relevant for timeshare
  * threads.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
 #ifdef SMP
 	/*
 	 * We run the long term load balancer infrequently on the first cpu.
 	 */
 	if (balance_tdq == tdq && smp_started != 0 && rebalance != 0) {
 		if (balance_ticks && --balance_ticks == 0)
 			sched_balance();
 	}
 #endif
 	/*
 	 * Save the old switch count so we have a record of the last ticks
 	 * activity.   Initialize the new switch count based on our load.
 	 * If there is some activity seed it to reflect that.
 	 */
 	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
 	tdq->tdq_switchcnt = tdq->tdq_load;
 	/*
 	 * Advance the insert index once for each tick to ensure that all
 	 * threads get a chance to run.
 	 */
 	if (tdq->tdq_idx == tdq->tdq_ridx) {
 		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
 		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
 			tdq->tdq_ridx = tdq->tdq_idx;
 	}
 	ts = td_get_sched(td);
 	sched_pctcpu_update(ts, 1);
 	if (td->td_pri_class & PRI_FIFO_BIT)
 		return;
 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
 		/*
 		 * We used a tick; charge it to the thread so
 		 * that we can compute our interactivity.
 		 */
 		td_get_sched(td)->ts_runtime += tickincr;
 		sched_interact_update(td);
 		sched_priority(td);
 	}
 
 	/*
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).
 	 */
 	if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) {
 		ts->ts_slice = 0;
 		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 	}
 }
 
 u_int
 sched_estcpu(struct thread *td __unused)
 {
 
 	return (0);
 }
 
 /*
  * Return whether the current CPU has runnable tasks.  Used for in-kernel
  * cooperative idle threads.
  */
 int
 sched_runnable(void)
 {
 	struct tdq *tdq;
 	int load;
 
 	load = 1;
 
 	tdq = TDQ_SELF();
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
 		if (tdq->tdq_load > 0)
 			goto out;
 	} else
 		if (tdq->tdq_load - 1 > 0)
 			goto out;
 	load = 0;
 out:
 	return (load);
 }
 
 /*
  * Choose the highest priority thread to run.  The thread is removed from
  * the run-queue while running however the load remains.  For SMP we set
  * the tdq in the global idle bitmask if it idles here.
  */
 struct thread *
 sched_choose(void)
 {
 	struct thread *td;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	td = tdq_choose(tdq);
 	if (td) {
 		tdq_runq_rem(tdq, td);
 		tdq->tdq_lowpri = td->td_priority;
 		return (td);
 	}
 	tdq->tdq_lowpri = PRI_MAX_IDLE;
 	return (PCPU_GET(idlethread));
 }
 
 /*
  * Set owepreempt if necessary.  Preemption never happens directly in ULE,
  * we always request it once we exit a critical section.
  */
 static inline void
 sched_setpreempt(struct thread *td)
 {
 	struct thread *ctd;
 	int cpri;
 	int pri;
 
 	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
 
 	ctd = curthread;
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 	if (pri < cpri)
 		ctd->td_flags |= TDF_NEEDRESCHED;
 	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
 		return;
 	if (!sched_shouldpreempt(pri, cpri, 0))
 		return;
 	ctd->td_owepreempt = 1;
 }
 
 /*
  * Add a thread to a thread queue.  Select the appropriate runq and add the
  * thread to it.  This is the internal function called when the tdq is
  * predetermined.
  */
 void
 tdq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
 	if (td->td_priority < tdq->tdq_lowpri)
 		tdq->tdq_lowpri = td->td_priority;
 	tdq_runq_add(tdq, td, flags);
 	tdq_load_add(tdq, td);
 }
 
 /*
  * Select the target thread queue and add a thread to it.  Request
  * preemption or IPI a remote processor if required.
  */
 void
 sched_add(struct thread *td, int flags)
 {
 	struct tdq *tdq;
 #ifdef SMP
 	int cpu;
 #endif
 
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Recalculate the priority before we select the target cpu or
 	 * run-queue.
 	 */
 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_priority(td);
 #ifdef SMP
 	/*
 	 * Pick the destination cpu and if it isn't ours transfer to the
 	 * target cpu.
 	 */
 	cpu = sched_pickcpu(td, flags);
 	tdq = sched_setcpu(td, cpu, flags);
 	tdq_add(tdq, td, flags);
 	if (cpu != PCPU_GET(cpuid)) {
 		tdq_notify(tdq, td);
 		return;
 	}
 #else
 	tdq = TDQ_SELF();
 	TDQ_LOCK(tdq);
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	thread_lock_set(td, TDQ_LOCKPTR(tdq));
 	tdq_add(tdq, td, flags);
 #endif
 	if (!(flags & SRQ_YIELDING))
 		sched_setpreempt(td);
 }
 
 /*
  * Remove a thread from a run-queue without running it.  This is used
  * when we're stealing a thread from a remote queue.  Otherwise all threads
  * exit by calling sched_exit_thread() and sched_throw() themselves.
  */
 void
 sched_rem(struct thread *td)
 {
 	struct tdq *tdq;
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority);
 	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 	tdq = TDQ_CPU(td_get_sched(td)->ts_cpu);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
 	tdq_runq_rem(tdq, td);
 	tdq_load_rem(tdq, td);
 	TD_SET_CAN_RUN(td);
 	if (td->td_priority == tdq->tdq_lowpri)
 		tdq_setlowpri(tdq, NULL);
 }
 
 /*
  * Fetch cpu utilization information.  Updates on demand.
  */
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	fixpt_t pctcpu;
 	struct td_sched *ts;
 
 	pctcpu = 0;
 	ts = td_get_sched(td);
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched_pctcpu_update(ts, TD_IS_RUNNING(td));
 	if (ts->ts_ticks) {
 		int rtick;
 
 		/* How many rtick per second ? */
 		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
 		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
 	}
 
 	return (pctcpu);
 }
 
 /*
  * Enforce affinity settings for a thread.  Called after adjustments to
  * cpumask.
  */
 void
 sched_affinity(struct thread *td)
 {
 #ifdef SMP
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
 		return;
 	if (TD_ON_RUNQ(td)) {
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 		return;
 	}
 	if (!TD_IS_RUNNING(td))
 		return;
 	/*
 	 * Force a switch before returning to userspace.  If the
 	 * target thread is not running locally send an ipi to force
 	 * the issue.
 	 */
 	td->td_flags |= TDF_NEEDRESCHED;
 	if (td != curthread)
 		ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
 #endif
 }
 
 /*
  * Bind a thread to a target cpu.
  */
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
 	ts = td_get_sched(td);
 	if (ts->ts_flags & TSF_BOUND)
 		sched_unbind(td);
 	KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
 	ts->ts_flags |= TSF_BOUND;
 	sched_pin();
 	if (PCPU_GET(cpuid) == cpu)
 		return;
 	ts->ts_cpu = cpu;
 	/* When we return from mi_switch we'll be on the correct cpu. */
 	mi_switch(SW_VOL, NULL);
 }
 
 /*
  * Release a bound thread.
  */
 void
 sched_unbind(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
 	ts = td_get_sched(td);
 	if ((ts->ts_flags & TSF_BOUND) == 0)
 		return;
 	ts->ts_flags &= ~TSF_BOUND;
 	sched_unpin();
 }
 
 int
 sched_is_bound(struct thread *td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	return (td_get_sched(td)->ts_flags & TSF_BOUND);
 }
 
 /*
  * Basic yield call.
  */
 void
 sched_relinquish(struct thread *td)
 {
 	thread_lock(td);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 }
 
 /*
  * Return the total system load.
  */
 int
 sched_load(void)
 {
 #ifdef SMP
 	int total;
 	int i;
 
 	total = 0;
 	CPU_FOREACH(i)
 		total += TDQ_CPU(i)->tdq_sysload;
 	return (total);
 #else
 	return (TDQ_SELF()->tdq_sysload);
 #endif
 }
 
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
 #ifdef SMP
 #define	TDQ_IDLESPIN(tdq)						\
     ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
 #else
 #define	TDQ_IDLESPIN(tdq)	1
 #endif
 
 /*
  * The actual idle process.
  */
 void
 sched_idletd(void *dummy)
 {
 	struct thread *td;
 	struct tdq *tdq;
 	int oldswitchcnt, switchcnt;
 	int i;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	td = curthread;
 	tdq = TDQ_SELF();
 	THREAD_NO_SLEEPING();
 	oldswitchcnt = -1;
 	for (;;) {
 		if (tdq->tdq_load) {
 			thread_lock(td);
 			mi_switch(SW_VOL | SWT_IDLE, NULL);
 			thread_unlock(td);
 		}
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #ifdef SMP
 		if (always_steal || switchcnt != oldswitchcnt) {
 			oldswitchcnt = switchcnt;
 			if (tdq_idled(tdq) == 0)
 				continue;
 		}
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #else
 		oldswitchcnt = switchcnt;
 #endif
 		/*
 		 * If we're switching very frequently, spin while checking
 		 * for load rather than entering a low power state that 
 		 * may require an IPI.  However, don't do any busy
 		 * loops while on SMT machines as this simply steals
 		 * cycles from cores doing useful work.
 		 */
 		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
 			for (i = 0; i < sched_idlespins; i++) {
 				if (tdq->tdq_load)
 					break;
 				cpu_spinwait();
 			}
 		}
 
 		/* If there was context switch during spin, restart it. */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
 			continue;
 
 		/* Run main MD idle handler. */
 		tdq->tdq_cpu_idle = 1;
 		/*
 		 * Make sure that tdq_cpu_idle update is globally visible
 		 * before cpu_idle() read tdq_load.  The order is important
 		 * to avoid race with tdq_notify.
 		 */
 		atomic_thread_fence_seq_cst();
 		/*
 		 * Checking for again after the fence picks up assigned
 		 * threads often enough to make it worthwhile to do so in
 		 * order to avoid calling cpu_idle().
 		 */
 		if (tdq->tdq_load != 0) {
 			tdq->tdq_cpu_idle = 0;
 			continue;
 		}
 		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
 		tdq->tdq_cpu_idle = 0;
 
 		/*
 		 * Account thread-less hardware interrupts and
 		 * other wakeup reasons equal to context switches.
 		 */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 		if (switchcnt != oldswitchcnt)
 			continue;
 		tdq->tdq_switchcnt++;
 		oldswitchcnt++;
 	}
 }
 
 /*
  * A CPU is entering for the first time or a thread is exiting.
  */
 void
 sched_throw(struct thread *td)
 {
 	struct thread *newtd;
 	struct tdq *tdq;
 
 	if (td == NULL) {
 		/* Correct spinlock nesting and acquire the correct lock. */
 		tdq = TDQ_SELF();
 		TDQ_LOCK(tdq);
 		spinlock_exit();
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
 	} else {
 		tdq = TDQ_SELF();
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		tdq_load_rem(tdq, td);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		td->td_lastcpu = td->td_oncpu;
 		td->td_oncpu = NOCPU;
 	}
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	newtd = choosethread();
 	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 	cpu_throw(td, newtd);		/* doesn't return */
 }
 
 /*
  * This is called from fork_exit().  Just acquire the correct locks and
  * let fork do the rest of the work.
  */
 void
 sched_fork_exit(struct thread *td)
 {
 	struct tdq *tdq;
 	int cpuid;
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with the scheduler lock held.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_SELF();
 	if (TD_IS_IDLETHREAD(td))
 		td->td_lock = TDQ_LOCKPTR(tdq);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	td->td_oncpu = cpuid;
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	lock_profile_obtain_lock_success(
 	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
 	    "prio:%d", td->td_priority);
 	SDT_PROBE0(sched, , , on__cpu);
 }
 
 /*
  * Create on first use to catch odd startup conditons.
  */
 char *
 sched_tdname(struct thread *td)
 {
 #ifdef KTR
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	if (ts->ts_name[0] == '\0')
 		snprintf(ts->ts_name, sizeof(ts->ts_name),
 		    "%s tid %d", td->td_name, td->td_tid);
 	return (ts->ts_name);
 #else
 	return (td->td_name);
 #endif
 }
 
 #ifdef KTR
 void
 sched_clear_tdname(struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	ts->ts_name[0] = '\0';
 }
 #endif
 
 #ifdef SMP
 
 /*
  * Build the CPU topology dump string. Is recursively called to collect
  * the topology tree.
  */
 static int
 sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
     int indent)
 {
 	char cpusetbuf[CPUSETBUFSIZ];
 	int i, first;
 
 	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
 	    "", 1 + indent / 2, cg->cg_level);
 	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
 	    cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
 	first = TRUE;
 	for (i = 0; i < MAXCPU; i++) {
 		if (CPU_ISSET(i, &cg->cg_mask)) {
 			if (!first)
 				sbuf_printf(sb, ", ");
 			else
 				first = FALSE;
 			sbuf_printf(sb, "%d", i);
 		}
 	}
 	sbuf_printf(sb, "</cpu>\n");
 
 	if (cg->cg_flags != 0) {
 		sbuf_printf(sb, "%*s <flags>", indent, "");
 		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
 			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
 		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
 			sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
 		if ((cg->cg_flags & CG_FLAG_SMT) != 0)
 			sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
 		sbuf_printf(sb, "</flags>\n");
 	}
 
 	if (cg->cg_children > 0) {
 		sbuf_printf(sb, "%*s <children>\n", indent, "");
 		for (i = 0; i < cg->cg_children; i++)
 			sysctl_kern_sched_topology_spec_internal(sb, 
 			    &cg->cg_child[i], indent+2);
 		sbuf_printf(sb, "%*s </children>\n", indent, "");
 	}
 	sbuf_printf(sb, "%*s</group>\n", indent, "");
 	return (0);
 }
 
 /*
  * Sysctl handler for retrieving topology dump. It's a wrapper for
  * the recursive sysctl_kern_smp_topology_spec_internal().
  */
 static int
 sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *topo;
 	int err;
 
 	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
 
 	topo = sbuf_new_for_sysctl(NULL, NULL, 512, req);
 	if (topo == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(topo, "<groups>\n");
 	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
 	sbuf_printf(topo, "</groups>\n");
 
 	if (err == 0) {
 		err = sbuf_finish(topo);
 	}
 	sbuf_delete(topo);
 	return (err);
 }
 
 #endif
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val, period;
 
 	period = 1000000 / realstathz;
 	new_val = period * sched_slice;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val <= 0)
 		return (EINVAL);
 	sched_slice = imax(1, (new_val + period / 2) / period);
 	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 	return (0);
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
     "Scheduler name");
 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, sysctl_kern_quantum, "I",
     "Quantum for timeshare threads in microseconds");
 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
     "Quantum for timeshare threads in stathz ticks");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
     "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
     &preempt_thresh, 0,
     "Maximal (lowest) priority for preemption");
 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
     "Assign static kernel priorities to sleeping threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
     "Number of times idle thread will spin waiting for new work");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
     &sched_idlespinthresh, 0,
     "Threshold before we will permit idle thread spinning");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
     &balance_interval, 0,
     "Average period in stathz ticks to run the long-term balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
     "Attempts to steal work from other cores before idling");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
     "Minimum load on remote CPU before we'll steal");
 SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit,
     0, "Topological distance limit for stealing threads in sched_switch()");
 SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0,
     "Always run the stealer from the idle thread");
 SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
     CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
     "XML dump of detected CPU topology");
 #endif
 
 /* ps compat.  All cpu percentages from ULE are weighted. */
 static int ccpu = 0;
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
Index: stable/12/sys/kern/vfs_bio.c
===================================================================
--- stable/12/sys/kern/vfs_bio.c	(revision 355609)
+++ stable/12/sys/kern/vfs_bio.c	(revision 355610)
@@ -1,5474 +1,5474 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/bitset.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
 #include "opt_swap.h"
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 struct bufqueue {
 	struct mtx_padalign	bq_lock;
 	TAILQ_HEAD(, buf)	bq_queue;
 	uint8_t			bq_index;
 	uint16_t		bq_subqueue;
 	int			bq_len;
 } __aligned(CACHE_LINE_SIZE);
 
 #define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
 #define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
 #define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
 #define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
 
 struct bufdomain {
 	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
 	struct bufqueue bd_dirtyq;
 	struct bufqueue	*bd_cleanq;
 	struct mtx_padalign bd_run_lock;
 	/* Constants */
 	long		bd_maxbufspace;
 	long		bd_hibufspace;
 	long 		bd_lobufspace;
 	long 		bd_bufspacethresh;
 	int		bd_hifreebuffers;
 	int		bd_lofreebuffers;
 	int		bd_hidirtybuffers;
 	int		bd_lodirtybuffers;
 	int		bd_dirtybufthresh;
 	int		bd_lim;
 	/* atomics */
 	int		bd_wanted;
 	int __aligned(CACHE_LINE_SIZE)	bd_numdirtybuffers;
 	int __aligned(CACHE_LINE_SIZE)	bd_running;
 	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
 	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
 } __aligned(CACHE_LINE_SIZE);
 
 #define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
 #define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
 #define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
 #define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
 #define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
 #define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
 #define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
 #define	BD_DOMAIN(bd)		(bd - bdomain)
 
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
-caddr_t unmapped_buf;
+caddr_t __read_mostly unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_invalidate(struct buf *bp);
 static void vfs_vmio_truncate(struct buf *bp, int npages);
 static void vfs_vmio_extend(struct buf *bp, int npages, int size);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
 		void (*)(struct buf *));
 static int buf_flush(struct vnode *vp, struct bufdomain *, int);
 static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
 static void buf_daemon(void);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 static inline struct bufdomain *bufdomain(struct buf *);
 static void bq_remove(struct bufqueue *bq, struct buf *bp);
 static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
 static int buf_recycle(struct bufdomain *, bool kva);
 static void bq_init(struct bufqueue *bq, int qindex, int cpu,
 	    const char *lockname);
 static void bd_init(struct bufdomain *bd);
 static int bd_flushall(struct bufdomain *bd);
 static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
 
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
 static counter_u64_t bufkvaspace;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
     __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
     __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
     __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
 SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
     __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
     "Bufspace consumed before waking the daemon to free some");
 static counter_u64_t buffreekvacnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
 static counter_u64_t bufdefragcnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
     __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
     __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
     __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
     "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
     __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
    "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
     __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
    "Threshold for clean buffer recycling");
 static counter_u64_t getnewbufcalls;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
    &getnewbufcalls, "Number of calls to getnewbuf");
 static counter_u64_t getnewbufrestarts;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
     &getnewbufrestarts,
     "Number of times getnewbuf has had to restart a buffer acquisition");
 static counter_u64_t mappingrestarts;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
     &mappingrestarts,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static counter_u64_t numbufallocfails;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
     &numbufallocfails, "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static counter_u64_t notbufdflushes;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
 int maxbcachebuf = MAXBCACHEBUF;
 SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
     "Maximum size of a buffer cache block");
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign __exclusive_cache_line bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign __exclusive_cache_line rbreqlock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign __exclusive_cache_line bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /*
  * Synchronization for bwillwrite() waiters.
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
  */
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	4	/* not an queue index, but mark for sentinel */
 
 /* Maximum number of buffer domains. */
 #define	BUF_DOMAINS	8
 
 struct bufdomainset bdlodirty;		/* Domains > lodirty */
 struct bufdomainset bdhidirty;		/* Domains > hidirty */
 
 /* Configured number of clean queues. */
 static int __read_mostly buf_domains;
 
 BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
 struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
 struct bufqueue __exclusive_cache_line bqempty;
 
 /*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	mtx_lock(&rbreqlock);
 	if (arg1 == &hirunningspace) {
 		if (value < lorunningspace)
 			error = EINVAL;
 		else
 			hirunningspace = value;
 	} else {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		if (value > hirunningspace)
 			error = EINVAL;
 		else
 			lorunningspace = value;
 	}
 	mtx_unlock(&rbreqlock);
 	return (error);
 }
 
 static int
 sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int value;
 	int i;
 
 	value = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	*(int *)arg1 = value;
 	for (i = 0; i < buf_domains; i++)
 		*(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
 		    value / buf_domains;
 
 	return (error);
 }
 
 static int
 sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 	int i;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	*(long *)arg1 = value;
 	for (i = 0; i < buf_domains; i++)
 		*(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
 		    value / buf_domains;
 
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
 	int i;
 
 	lvalue = 0;
 	for (i = 0; i < buf_domains; i++)
 		lvalue += bdomain[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	ivalue = lvalue;
 	return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
 #else
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int i;
 
 	lvalue = 0;
 	for (i = 0; i < buf_domains; i++)
 		lvalue += bdomain[i].bd_bufspace;
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
 static int
 sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
 {
 	int value;
 	int i;
 
 	value = 0;
 	for (i = 0; i < buf_domains; i++)
 		value += bdomain[i].bd_numdirtybuffers;
 	return (sysctl_handle_int(oidp, &value, 0, req));
 }
 
 /*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait) {
 		bdirtywait = 0;
 		wakeup(&bdirtywait);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bd_clear:
  *
  *	Clear a domain from the appropriate bitsets when dirtybuffers
  *	is decremented.
  */
 static void
 bd_clear(struct bufdomain *bd)
 {
 
 	mtx_lock(&bdirtylock);
 	if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
 		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
 	if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
 		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bd_set:
  *
  *	Set a domain in the appropriate bitsets when dirtybuffers
  *	is incremented.
  */
 static void
 bd_set(struct bufdomain *bd)
 {
 
 	mtx_lock(&bdirtylock);
 	if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
 		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
 	if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
 		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
 bdirtysub(struct buf *bp)
 {
 	struct bufdomain *bd;
 	int num;
 
 	bd = bufdomain(bp);
 	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
 	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bdirtywakeup();
 	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
 		bd_clear(bd);
 }
 
 /*
  *	bdirtyadd:
  *
  *	Increment the numdirtybuffers count by one and wakeup the buf 
  *	daemon if needed.
  */
 static void
 bdirtyadd(struct buf *bp)
 {
 	struct bufdomain *bd;
 	int num;
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
 	bd = bufdomain(bp);
 	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
 	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bd_wakeup();
 	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
 		bd_set(bd);
 }
 
 /*
  *	bufspace_daemon_wakeup:
  *
  *	Wakeup the daemons responsible for freeing clean bufs.
  */
 static void
 bufspace_daemon_wakeup(struct bufdomain *bd)
 {
 
 	/*
 	 * avoid the lock if the daemon is running.
 	 */
 	if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
 		BD_RUN_LOCK(bd);
 		atomic_store_int(&bd->bd_running, 1);
 		wakeup(&bd->bd_running);
 		BD_RUN_UNLOCK(bd);
 	}
 }
 
 /*
  *	bufspace_daemon_wait:
  *
  *	Sleep until the domain falls below a limit or one second passes.
  */
 static void
 bufspace_daemon_wait(struct bufdomain *bd)
 {
 	/*
 	 * Re-check our limits and sleep.  bd_running must be
 	 * cleared prior to checking the limits to avoid missed
 	 * wakeups.  The waker will adjust one of bufspace or
 	 * freebuffers prior to checking bd_running.
 	 */
 	BD_RUN_LOCK(bd);
 	atomic_store_int(&bd->bd_running, 0);
 	if (bd->bd_bufspace < bd->bd_bufspacethresh &&
 	    bd->bd_freebuffers > bd->bd_lofreebuffers) {
 		msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
 		    "-", hz);
 	} else {
 		/* Avoid spurious wakeups while running. */
 		atomic_store_int(&bd->bd_running, 1);
 		BD_RUN_UNLOCK(bd);
 	}
 }
 
 /*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
 	struct bufdomain *bd;
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 	bd = bufdomain(bp);
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bd->bd_bufspace, -diff);
 	} else if (diff > 0) {
 		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bd->bd_bufspacethresh &&
 		    space + diff >= bd->bd_bufspacethresh)
 			bufspace_daemon_wakeup(bd);
 	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	bufspace_reserve:
  *
  *	Reserve bufspace before calling allocbuf().  metadata has a
  *	different space limit than data.
  */
 static int
 bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
 {
 	long limit, new;
 	long space;
 
 	if (metadata)
 		limit = bd->bd_maxbufspace;
 	else
 		limit = bd->bd_hibufspace;
 	space = atomic_fetchadd_long(&bd->bd_bufspace, size);
 	new = space + size;
 	if (new > limit) {
 		atomic_subtract_long(&bd->bd_bufspace, size);
 		return (ENOSPC);
 	}
 
 	/* Wake up the daemon on the transition. */
 	if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
 		bufspace_daemon_wakeup(bd);
 
 	return (0);
 }
 
 /*
  *	bufspace_release:
  *
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
 bufspace_release(struct bufdomain *bd, int size)
 {
 
 	atomic_subtract_long(&bd->bd_bufspace, size);
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
  *	supplied.  bd_wanted must be set prior to polling for space.  The
  *	operation must be re-tried on return.
  */
 static void
 bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
     int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
 
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	BD_LOCK(bd);
 	while (bd->bd_wanted) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			BD_UNLOCK(bd);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 			    (td->td_pflags & TDP_NORUNNINGBUF);
 
 			/*
 			 * Play bufdaemon.  The getnewbuf() function
 			 * may be called while the thread owns lock
 			 * for another dirty buffer for the same
 			 * vnode, which makes it impossible to use
 			 * VOP_FSYNC() there, due to the buffer lock
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, bd, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			BD_LOCK(bd);
 			if (fl != 0)
 				continue;
 			if (bd->bd_wanted == 0)
 				break;
 		}
 		error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
 	BD_UNLOCK(bd);
 }
 
 
 /*
  *	bufspace_daemon:
  *
  *	buffer space management daemon.  Tries to maintain some marginal
  *	amount of free buffer space so that requesting processes neither
  *	block nor work to reclaim buffers.
  */
 static void
 bufspace_daemon(void *arg)
 {
 	struct bufdomain *bd;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
 	    SHUTDOWN_PRI_LAST + 100);
 
 	bd = arg;
 	for (;;) {
 		kthread_suspend_check();
 
 		/*
 		 * Free buffers from the clean queue until we meet our
 		 * targets.
 		 *
 		 * Theory of operation:  The buffer cache is most efficient
 		 * when some free buffer headers and space are always
 		 * available to getnewbuf().  This daemon attempts to prevent
 		 * the excessive blocking and synchronization associated
 		 * with shortfall.  It goes through three phases according
 		 * demand:
 		 *
 		 * 1)	The daemon wakes up voluntarily once per-second
 		 *	during idle periods when the counters are below
 		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
 		 *
 		 * 2)	The daemon wakes up as we cross the thresholds
 		 *	ahead of any potential blocking.  This may bounce
 		 *	slightly according to the rate of consumption and
 		 *	release.
 		 *
 		 * 3)	The daemon and consumers are starved for working
 		 *	clean buffers.  This is the 'bufspace' sleep below
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
 		while (bd->bd_bufspace > bd->bd_lobufspace ||
 		    bd->bd_freebuffers < bd->bd_hifreebuffers) {
 			if (buf_recycle(bd, false) != 0) {
 				if (bd_flushall(bd))
 					continue;
 				/*
 				 * Speedup dirty if we've run out of clean
 				 * buffers.  This is possible in particular
 				 * because softdep may held many bufs locked
 				 * pending writes to other bufs which are
 				 * marked for delayed write, exhausting
 				 * clean space until they are written.
 				 */
 				bd_speedup();
 				BD_LOCK(bd);
 				if (bd->bd_wanted) {
 					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 					    PRIBIO|PDROP, "bufspace", hz/10);
 				} else
 					BD_UNLOCK(bd);
 			}
 			maybe_yield();
 		}
 		bufspace_daemon_wait(bd);
 	}
 }
 
 /*
  *	bufmallocadjust:
  *
  *	Adjust the reported bufspace for a malloc managed buffer, possibly
  *	waking any waiters.
  */
 static void
 bufmallocadjust(struct buf *bp, int bufsize)
 {
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0)
 		atomic_subtract_long(&bufmallocspace, -diff);
 	else
 		atomic_add_long(&bufmallocspace, diff);
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up processes that are waiting on asynchronous writes to fall
  *	below lorunningspace.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq) {
 		runningbufreq = 0;
 		wakeup(&runningbufreq);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Decrement the outstanding write count according.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long space, bspace;
 
 	bspace = bp->b_runningbufspace;
 	if (bspace == 0)
 		return;
 	space = atomic_fetchadd_long(&runningbufspace, -bspace);
 	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
 	    space, bspace));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Only acquire the lock and wakeup on the transition from exceeding
 	 * the threshold to falling below it.
 	 */
 	if (space < lorunningspace)
 		return;
 	if (space - bspace > lorunningspace)
 		return;
 	runningwakeup();
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
  *	running.  This routine is used in async-write situations to
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
  *	This does NOT turn an async write into a sync write.  It waits  
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  This function clears the B_CACHE
  *	bit if the newly extended portion of the buffer does not contain
  *	valid data.
  */
 static __inline void
 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
     vm_offset_t size, vm_page_t m)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (bp->b_flags & B_CACHE) {
 		int base = (foff + off) & PAGE_MASK;
 		if (vm_page_is_valid(m, base, size) == 0)
 			bp->b_flags &= ~B_CACHE;
 	}
 }
 
 /* Wake up the buffer daemon if necessary */
 static void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request == 0) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
 	mtx_unlock(&bdlock);
 }
 
 /*
  * Adjust the maxbcachbuf tunable.
  */
 static void
 maxbcachebuf_adjust(void)
 {
 	int i;
 
 	/*
 	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
 	 */
 	i = 2;
 	while (i * 2 <= maxbcachebuf)
 		i *= 2;
 	maxbcachebuf = i;
 	if (maxbcachebuf < MAXBSIZE)
 		maxbcachebuf = MAXBSIZE;
 	if (maxbcachebuf > MAXPHYS)
 		maxbcachebuf = MAXPHYS;
 	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
 		printf("maxbcachebuf=%d\n", maxbcachebuf);
 }
 
 /*
  * bd_speedup - speedup the buffer cache flushing code
  */
 void
 bd_speedup(void)
 {
 	int needwake;
 
 	mtx_lock(&bdlock);
 	needwake = 0;
 	if (bd_speedupreq == 0 || bd_request == 0)
 		needwake = 1;
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (needwake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 #ifndef NSWBUF_MIN
 #define	NSWBUF_MIN	16
 #endif
 
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more then once.  We CANNOT write to the memory area
  * being reserved at this time.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	maxbcachebuf_adjust();
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less then 16 and no more then 256.
 	 */
 	nswbuf = min(nbuf / 4, 256);
 	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 
 	/*
 	 * Reserve space for the buffer cache buffers
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /* Initialize the buffer subsystem.  Called before use of any buffers. */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	KASSERT(maxbcachebuf >= MAXBSIZE,
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
 	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_NONE;
 		bp->b_domain = -1;
 		bp->b_subqueue = mp_maxid + 1;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		bq_insert(&bqempty, bp, false);
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by metadata.  hibufspace is the nominal maximum
 	 * used by most other requests.  The differential is required to 
 	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system. XXX This is less true with vmem.  We could use
 	 * PAGE_SIZE.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
 	lobufspace = (hibufspace / 20) * 19; /* 95% */
 	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
 	hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently into
 	 * the kernel space.  Even though this is accounted for in the buffer
 	 * allocation, we don't want the malloced region to grow uncontrolled.
 	 * The malloc scheme improves memory utilization significantly on
 	 * average (small) directories.
 	 */
 	maxbufmallocspace = hibufspace / 20;
 
 	/*
 	 * Reduce the chance of a deadlock occurring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
 	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
 	 * buffer space assuming BKVASIZE'd buffers.
 	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 	/*
 	 * lofreebuffers should be sufficient to avoid stalling waiting on
 	 * buf headers under heavy utilization.  The bufs in per-cpu caches
 	 * are counted as free but will be unavailable to threads executing
 	 * on other cpus.
 	 *
 	 * hifreebuffers is the free target for the bufspace daemon.  This
 	 * should be set appropriately to limit work per-iteration.
 	 */
 	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
 	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	/* Setup the kva and free list allocators. */
 	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
 	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
 	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
 	/*
 	 * Size the clean queue according to the amount of buffer space.
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
 	buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
 	for (i = 0 ; i < buf_domains; i++) {
 		struct bufdomain *bd;
 
 		bd = &bdomain[i];
 		bd_init(bd);
 		bd->bd_freebuffers = nbuf / buf_domains;
 		bd->bd_hifreebuffers = hifreebuffers / buf_domains;
 		bd->bd_lofreebuffers = lofreebuffers / buf_domains;
 		bd->bd_bufspace = 0;
 		bd->bd_maxbufspace = maxbufspace / buf_domains;
 		bd->bd_hibufspace = hibufspace / buf_domains;
 		bd->bd_lobufspace = lobufspace / buf_domains;
 		bd->bd_bufspacethresh = bufspacethresh / buf_domains;
 		bd->bd_numdirtybuffers = 0;
 		bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
 		bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
 		bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
 		/* Don't allow more than 2% of bufs in the per-cpu caches. */
 		bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
 	}
 	getnewbufcalls = counter_u64_alloc(M_WAITOK);
 	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
 	mappingrestarts = counter_u64_alloc(M_WAITOK);
 	numbufallocfails = counter_u64_alloc(M_WAITOK);
 	notbufdflushes = counter_u64_alloc(M_WAITOK);
 	buffreekvacnt = counter_u64_alloc(M_WAITOK);
 	bufdefragcnt = counter_u64_alloc(M_WAITOK);
 	bufkvaspace = counter_u64_alloc(M_WAITOK);
 }
 
 #ifdef INVARIANTS
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_kvabase != unmapped_buf,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(bp->b_data != unmapped_buf,
 	    ("mapped buf: b_data was not updated %p", bp));
 	KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
 	    MAXPHYS, ("b_data + b_offset unmapped %p", bp));
 }
 
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_data == unmapped_buf,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 static int
 isbufbusy(struct buf *bp)
 {
 	if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
 	    ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
 		return (1);
 	return (0);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  */
 void
 bufshutdown(int show_busybufs)
 {
 	static int first_buf_printf = 1;
 	struct buf *bp;
 	int iter, nbusy, pbusy;
 #ifndef PREEMPTION
 	int subiter;
 #endif
 
 	/* 
 	 * Sync filesystems for shutdown
 	 */
 	wdog_kern_pat(WD_LASTVAL);
 	sys_sync(curthread, NULL);
 
 	/*
 	 * With soft updates, some buffers that are
 	 * written will be remarked as dirty until other
 	 * buffers are written.
 	 */
 	for (iter = pbusy = 0; iter < 20; iter++) {
 		nbusy = 0;
 		for (bp = &buf[nbuf]; --bp >= buf; )
 			if (isbufbusy(bp))
 				nbusy++;
 		if (nbusy == 0) {
 			if (first_buf_printf)
 				printf("All buffers synced.");
 			break;
 		}
 		if (first_buf_printf) {
 			printf("Syncing disks, buffers remaining... ");
 			first_buf_printf = 0;
 		}
 		printf("%d ", nbusy);
 		if (nbusy < pbusy)
 			iter = 0;
 		pbusy = nbusy;
 
 		wdog_kern_pat(WD_LASTVAL);
 		sys_sync(curthread, NULL);
 
 #ifdef PREEMPTION
 		/*
 		 * Spin for a while to allow interrupt threads to run.
 		 */
 		DELAY(50000 * iter);
 #else
 		/*
 		 * Context switch several times to allow interrupt
 		 * threads to run.
 		 */
 		for (subiter = 0; subiter < 50 * iter; subiter++) {
 			thread_lock(curthread);
 			mi_switch(SW_VOL, NULL);
 			thread_unlock(curthread);
 			DELAY(1000);
 		}
 #endif
 	}
 	printf("\n");
 	/*
 	 * Count only busy local buffers to prevent forcing 
 	 * a fsck if we're just a client of a wedged NFS server
 	 */
 	nbusy = 0;
 	for (bp = &buf[nbuf]; --bp >= buf; ) {
 		if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
 			if (bp->b_dev == NULL) {
 				TAILQ_REMOVE(&mountlist,
 				    bp->b_vp->v_mount, mnt_list);
 				continue;
 			}
 #endif
 			nbusy++;
 			if (show_busybufs > 0) {
 				printf(
 	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
 				    nbusy, bp, bp->b_vp, bp->b_flags,
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 				BUF_LOCKPRINTINFO(bp);
 				if (show_busybufs > 1)
 					vn_printf(bp->b_vp,
 					    "vnode content: ");
 			}
 		}
 	}
 	if (nbusy) {
 		/*
 		 * Failed to sync all blocks. Indicate this and don't
 		 * unmount filesystems (thus forcing an fsck on reboot).
 		 */
 		printf("Giving up on %d buffers\n", nbusy);
 		DELAY(5000000);	/* 5 seconds */
 	} else {
 		if (!first_buf_printf)
 			printf("Final sync complete\n");
 		/*
 		 * Unmount filesystems
 		 */
 		if (panicstr == NULL)
 			vfs_unmountall();
 	}
 	swapoff_all();
 	DELAY(100000);		/* wait for console output to finish */
 }
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 static inline struct bufdomain *
 bufdomain(struct buf *bp)
 {
 
 	return (&bdomain[bp->b_domain]);
 }
 
 static struct bufqueue *
 bufqueue(struct buf *bp)
 {
 
 	switch (bp->b_qindex) {
 	case QUEUE_NONE:
 		/* FALLTHROUGH */
 	case QUEUE_SENTINEL:
 		return (NULL);
 	case QUEUE_EMPTY:
 		return (&bqempty);
 	case QUEUE_DIRTY:
 		return (&bufdomain(bp)->bd_dirtyq);
 	case QUEUE_CLEAN:
 		return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
 	default:
 		break;
 	}
 	panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
 }
 
 /*
  * Return the locked bufqueue that bp is a member of.
  */
 static struct bufqueue *
 bufqueue_acquire(struct buf *bp)
 {
 	struct bufqueue *bq, *nbq;
 
 	/*
 	 * bp can be pushed from a per-cpu queue to the
 	 * cleanq while we're waiting on the lock.  Retry
 	 * if the queues don't match.
 	 */
 	bq = bufqueue(bp);
 	BQ_LOCK(bq);
 	for (;;) {
 		nbq = bufqueue(bp);
 		if (bq == nbq)
 			break;
 		BQ_UNLOCK(bq);
 		BQ_LOCK(nbq);
 		bq = nbq;
 	}
 	return (bq);
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the appropriate free list.  Requires a
  *	locked buffer on entry and buffer is unlocked before return.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct bufdomain *bd;
 	struct bufqueue *bq;
 
 	KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
 	    ("binsfree: Invalid qindex %d", qindex));
 	BUF_ASSERT_XLOCKED(bp);
 
 	/*
 	 * Handle delayed bremfree() processing.
 	 */
 	if (bp->b_flags & B_REMFREE) {
 		if (bp->b_qindex == qindex) {
 			bp->b_flags |= B_REUSE;
 			bp->b_flags &= ~B_REMFREE;
 			BUF_UNLOCK(bp);
 			return;
 		}
 		bq = bufqueue_acquire(bp);
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 	}
 	bd = bufdomain(bp);
 	if (qindex == QUEUE_CLEAN) {
 		if (bd->bd_lim != 0)
 			bq = &bd->bd_subq[PCPU_GET(cpuid)];
 		else
 			bq = bd->bd_cleanq;
 	} else
 		bq = &bd->bd_dirtyq;
 	bq_insert(bq, bp, true);
 }
 
 /*
  * buf_free:
  *
  *	Free a buffer to the buf zone once it no longer has valid contents.
  */
 static void
 buf_free(struct buf *bp)
 {
 
 	if (bp->b_flags & B_REMFREE)
 		bremfreef(bp);
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("losing buffer 1");
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
 	atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 }
 
 /*
  * buf_import:
  *
  *	Import bufs into the uma cache from the buf list.  The system still
  *	expects a static array of bufs and much of the synchronization
  *	around bufs assumes type stable storage.  As a result, UMA is used
  *	only as a per-cpu cache of bufs still maintained on a global list.
  */
 static int
 buf_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct buf *bp;
 	int i;
 
 	BQ_LOCK(&bqempty);
 	for (i = 0; i < cnt; i++) {
 		bp = TAILQ_FIRST(&bqempty.bq_queue);
 		if (bp == NULL)
 			break;
 		bq_remove(&bqempty, bp);
 		store[i] = bp;
 	}
 	BQ_UNLOCK(&bqempty);
 
 	return (i);
 }
 
 /*
  * buf_release:
  *
  *	Release bufs from the uma cache back to the buffer queues.
  */
 static void
 buf_release(void *arg, void **store, int cnt)
 {
 	struct bufqueue *bq;
 	struct buf *bp;
         int i;
 
 	bq = &bqempty;
 	BQ_LOCK(bq);
         for (i = 0; i < cnt; i++) {
 		bp = store[i];
 		/* Inline bq_insert() to batch locking. */
 		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 		bp->b_flags &= ~(B_AGE | B_REUSE);
 		bq->bq_len++;
 		bp->b_qindex = bq->bq_index;
 	}
 	BQ_UNLOCK(bq);
 }
 
 /*
  * buf_alloc:
  *
  *	Allocate an empty buffer header.
  */
 static struct buf *
 buf_alloc(struct bufdomain *bd)
 {
 	struct buf *bp;
 	int freebufs;
 
 	/*
 	 * We can only run out of bufs in the buf zone if the average buf
 	 * is less than BKVASIZE.  In this case the actual wait/block will
 	 * come from buf_reycle() failing to flush one of these small bufs.
 	 */
 	bp = NULL;
 	freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
 	if (freebufs > 0)
 		bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
 		atomic_add_int(&bd->bd_freebuffers, 1);
 		bufspace_daemon_wakeup(bd);
 		counter_u64_add(numbufallocfails, 1);
 		return (NULL);
 	}
 	/*
 	 * Wake-up the bufspace daemon on transition below threshold.
 	 */
 	if (freebufs == bd->bd_lofreebuffers)
 		bufspace_daemon_wakeup(bd);
 
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
 	
 	KASSERT(bp->b_vp == NULL,
 	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
 	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
 	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 	KASSERT(bp->b_npages == 0,
 	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
 	bp->b_domain = BD_DOMAIN(bd);
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	bp->b_vflags = 0;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_bufobj = NULL;
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 	LIST_INIT(&bp->b_dep);
 
 	return (bp);
 }
 
 /*
  *	buf_recycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  */
 static int
 buf_recycle(struct bufdomain *bd, bool kva)
 {
 	struct bufqueue *bq;
 	struct buf *bp, *nbp;
 
 	if (kva)
 		counter_u64_add(bufdefragcnt, 1);
 	nbp = NULL;
 	bq = bd->bd_cleanq;
 	BQ_LOCK(bq);
 	KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
 	    ("buf_recycle: Locks don't match"));
 	nbp = TAILQ_FIRST(&bq->bq_queue);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * release the bqlock).
 		 */
 		nbp = TAILQ_NEXT(bp, b_freelist);
 
 		/*
 		 * If we are defragging then we need a buffer with 
 		 * some kva to reclaim.
 		 */
 		if (kva && bp->b_kvasize == 0)
 			continue;
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		/*
 		 * Implement a second chance algorithm for frequently
 		 * accessed buffers.
 		 */
 		if ((bp->b_flags & B_REUSE) != 0) {
 			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 			TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 			bp->b_flags &= ~B_REUSE;
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == QUEUE_CLEAN,
 		    ("buf_recycle: inconsistent queue %d bp %p",
 		    bp->b_qindex, bp));
 		KASSERT(bp->b_domain == BD_DOMAIN(bd),
 		    ("getnewbuf: queue domain %d doesn't match request %d",
 		    bp->b_domain, (int)BD_DOMAIN(bd)));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 
 		/*
 		 * Requeue the background write buffer with error and
 		 * restart the scan.
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
 			BQ_LOCK(bq);
 			nbp = TAILQ_FIRST(&bq->bq_queue);
 			continue;
 		}
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
 	bd->bd_wanted = 1;
 	BQ_UNLOCK(bq);
 
 	return (ENOBUFS);
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list.
  *	
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
 }
 
 /*
  *	bremfreef:
  *
  *	Force an immediate removal from a free list.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	struct bufqueue *bq;
 
 	bq = bufqueue_acquire(bp);
 	bq_remove(bq, bp);
 	BQ_UNLOCK(bq);
 }
 
 static void
 bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname)
 {
 
 	mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF);
 	TAILQ_INIT(&bq->bq_queue);
 	bq->bq_len = 0;
 	bq->bq_index = qindex;
 	bq->bq_subqueue = subqueue;
 }
 
 static void
 bd_init(struct bufdomain *bd)
 {
 	int i;
 
 	bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
 	bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
 	bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
 	for (i = 0; i <= mp_maxid; i++)
 		bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
 		    "bufq clean subqueue lock");
 	mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF);
 }
 
 /*
  *	bq_remove:
  *
  *	Removes a buffer from the free list, must be called with the
  *	correct qlock held.
  */
 static void
 bq_remove(struct bufqueue *bq, struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bq_remove: buffer %p not on a queue.", bp));
 	KASSERT(bufqueue(bp) == bq,
 	    ("bq_remove: Remove buffer %p from wrong queue.", bp));
 
 	BQ_ASSERT_LOCKED(bq);
 	if (bp->b_qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 	KASSERT(bq->bq_len >= 1,
 	    ("queue %d underflow", bp->b_qindex));
 	TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 	bq->bq_len--;
 	bp->b_qindex = QUEUE_NONE;
 	bp->b_flags &= ~(B_REMFREE | B_REUSE);
 }
 
 static void
 bd_flush(struct bufdomain *bd, struct bufqueue *bq)
 {
 	struct buf *bp;
 
 	BQ_ASSERT_LOCKED(bq);
 	if (bq != bd->bd_cleanq) {
 		BD_LOCK(bd);
 		while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) {
 			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 			TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp,
 			    b_freelist);
 			bp->b_subqueue = bd->bd_cleanq->bq_subqueue;
 		}
 		bd->bd_cleanq->bq_len += bq->bq_len;
 		bq->bq_len = 0;
 	}
 	if (bd->bd_wanted) {
 		bd->bd_wanted = 0;
 		wakeup(&bd->bd_wanted);
 	}
 	if (bq != bd->bd_cleanq)
 		BD_UNLOCK(bd);
 }
 
 static int
 bd_flushall(struct bufdomain *bd)
 {
 	struct bufqueue *bq;
 	int flushed;
 	int i;
 
 	if (bd->bd_lim == 0)
 		return (0);
 	flushed = 0;
 	for (i = 0; i <= mp_maxid; i++) {
 		bq = &bd->bd_subq[i];
 		if (bq->bq_len == 0)
 			continue;
 		BQ_LOCK(bq);
 		bd_flush(bd, bq);
 		BQ_UNLOCK(bq);
 		flushed++;
 	}
 
 	return (flushed);
 }
 
 static void
 bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock)
 {
 	struct bufdomain *bd;
 
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("bq_insert: free buffer %p onto another queue?", bp);
 
 	bd = bufdomain(bp);
 	if (bp->b_flags & B_AGE) {
 		/* Place this buf directly on the real queue. */
 		if (bq->bq_index == QUEUE_CLEAN)
 			bq = bd->bd_cleanq;
 		BQ_LOCK(bq);
 		TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist);
 	} else {
 		BQ_LOCK(bq);
 		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 	}
 	bp->b_flags &= ~(B_AGE | B_REUSE);
 	bq->bq_len++;
 	bp->b_qindex = bq->bq_index;
 	bp->b_subqueue = bq->bq_subqueue;
 
 	/*
 	 * Unlock before we notify so that we don't wakeup a waiter that
 	 * fails a trylock on the buf and sleeps again.
 	 */
 	if (unlock)
 		BUF_UNLOCK(bp);
 
 	if (bp->b_qindex == QUEUE_CLEAN) {
 		/*
 		 * Flush the per-cpu queue and notify any waiters.
 		 */
 		if (bd->bd_wanted || (bq != bd->bd_cleanq &&
 		    bq->bq_len >= bd->bd_lim))
 			bd_flush(bd, bq);
 	}
 	BQ_UNLOCK(bq);
 }
 
 /*
  *	bufkva_free:
  *
  *	Free the kva allocation for a buffer.
  *
  */
 static void
 bufkva_free(struct buf *bp)
 {
 
 #ifdef INVARIANTS
 	if (bp->b_kvasize == 0) {
 		KASSERT(bp->b_kvabase == unmapped_buf &&
 		    bp->b_data == unmapped_buf,
 		    ("Leaked KVA space on %p", bp));
 	} else if (buf_mapped(bp))
 		BUF_CHECK_MAPPED(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 #endif
 	if (bp->b_kvasize == 0)
 		return;
 
 	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
 	counter_u64_add(bufkvaspace, -bp->b_kvasize);
 	counter_u64_add(buffreekvacnt, 1);
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_kvasize = 0;
 }
 
 /*
  *	bufkva_alloc:
  *
  *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
  */
 static int
 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 	int error;
 
 	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
 	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
 
 	bufkva_free(bp);
 
 	addr = 0;
 	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
 	if (error != 0) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		return (error);
 	}
 	bp->b_kvabase = (caddr_t)addr;
 	bp->b_kvasize = maxsize;
 	counter_u64_add(bufkvaspace, bp->b_kvasize);
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 	}
 	return (0);
 }
 
 /*
  *	bufkva_reclaim:
  *
  *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
  *	callback that fires to avoid returning failure.
  */
 static void
 bufkva_reclaim(vmem_t *vmem, int flags)
 {
 	bool done;
 	int q;
 	int i;
 
 	done = false;
 	for (i = 0; i < 5; i++) {
 		for (q = 0; q < buf_domains; q++)
 			if (buf_recycle(&bdomain[q], true) != 0)
 				done = true;
 		if (done)
 			break;
 	}
 	return;
 }
 
 /*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
  */
 static void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt,
     struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *))
 {
 	struct buf *rabp;
 	struct thread *td;
 	int i;
 
 	td = curthread;
 
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 		if ((rabp->b_flags & B_CACHE) != 0) {
 			brelse(rabp);
 			continue;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, rabp, 0);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		td->td_ru.ru_inblock++;
 		rabp->b_flags |= B_ASYNC;
 		rabp->b_flags &= ~B_INVAL;
 		if ((flags & GB_CKHASH) != 0) {
 			rabp->b_flags |= B_CKHASH;
 			rabp->b_ckhashcalc = ckhashfunc;
 		}
 		rabp->b_ioflags &= ~BIO_ERROR;
 		rabp->b_iocmd = BIO_READ;
 		if (rabp->b_rcred == NOCRED && cred != NOCRED)
 			rabp->b_rcred = crhold(cred);
 		vfs_busy_pages(rabp, 0);
 		BUF_KERNPROC(rabp);
 		rabp->b_iooffset = dbtob(rabp->b_blkno);
 		bstrategy(rabp);
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  * is set, the buffer is valid and we do not have to do anything, see
  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
  *
  * Always return a NULL buffer pointer (in bpp) when returning an error.
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
     int *rabsize, int cnt, struct ucred *cred, int flags,
     void (*ckhashfunc)(struct buf *), struct buf **bpp)
 {
 	struct buf *bp;
 	struct thread *td;
 	int error, readwait, rv;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	td = curthread;
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags
 	 * are specified.
 	 */
 	error = getblkx(vp, blkno, size, 0, 0, flags, &bp);
 	if (error != 0) {
 		*bpp = NULL;
 		return (error);
 	}
 	flags &= ~GB_NOSPARSE;
 	*bpp = bp;
 
 	/*
 	 * If not found in cache, do some I/O
 	 */
 	readwait = 0;
 	if ((bp->b_flags & B_CACHE) == 0) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			racct_add_buf(td->td_proc, bp, 0);
 			PROC_UNLOCK(td->td_proc);
 		}
 #endif /* RACCT */
 		td->td_ru.ru_inblock++;
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		if ((flags & GB_CKHASH) != 0) {
 			bp->b_flags |= B_CKHASH;
 			bp->b_ckhashcalc = ckhashfunc;
 		}
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		++readwait;
 	}
 
 	/*
 	 * Attempt to initiate asynchronous I/O on read-ahead blocks.
 	 */
 	breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc);
 
 	rv = 0;
 	if (readwait) {
 		rv = bufwait(bp);
 		if (rv != 0) {
 			brelse(bp);
 			*bpp = NULL;
 		}
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here 
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
 		bp->b_flags |= B_INVAL | B_RELBUF;
 		bp->b_flags &= ~B_CACHE;
 		brelse(bp);
 		return (ENXIO);
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		atomic_add_long(&barrierwrites, 1);
 
 	oldflags = bp->b_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(curproc);
 		racct_add_buf(curproc, bp, 1);
 		PROC_UNLOCK(curproc);
 	}
 #endif /* RACCT */
 	curthread->td_ru.ru_oublock++;
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	buf_track(bp, __func__);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			break;
 		}
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
 	buf_track(bp, __func__);
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty
 	 * pages.
 	 *
 	 * Mark the buffer pages as clean.  We need to do this here to
 	 * satisfy the vnode_pager and the pageout daemon, so that it
 	 * thinks that the pages have been "cleaned".  Note that since
 	 * the pages are in a delayed write buffer -- the VFS layer
 	 * "will" see that the pages get written out on the next sync,
 	 * or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn buffer into delayed write request.  We must clear BIO_READ and
  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  *	itself to properly update it in the dirty/clean lists.  We mark it
  *	B_DONE to ensure that any asynchronization of the buffer properly
  *	clears B_DONE ( else a panic will occur later ).  
  *
  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be on QUEUE_NONE.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 	bp->b_flags &= ~(B_RELBUF);
 	bp->b_iocmd = BIO_WRITE;
 
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
 		bdirtyadd(bp);
 	}
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI for buffer.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *	
  *	The buffer must be on QUEUE_NONE.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub(bp);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
 	 */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write.  Start output on a buffer, but do not wait for
  *	it to complete.  The buffer is released when the output completes.
  *
  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  *	B_INVAL buffers.  Not us.
  */
 void
 bawrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_ASYNC;
 	(void) bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Asynchronous barrier write.  Start output on a buffer, but do not
  *	wait for it to complete.  Place a write barrier after this write so
  *	that this buffer and all buffers written before it are committed to
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_ASYNC | B_BARRIER;
 	(void) bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	Called prior to the locking of any vnodes when we are expecting to
  *	write.  We do not want to starve the buffer cache with too many
  *	dirty buffers so we block here.  By blocking prior to the locking
  *	of any vnodes we attempt to avoid the situation where a locked vnode
  *	prevents the various system daemons from flushing related buffers.
  */
 void
 bwillwrite(void)
 {
 
 	if (buf_dirty_count_severe()) {
 		mtx_lock(&bdirtylock);
 		while (buf_dirty_count_severe()) {
 			bdirtywait = 1;
 			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 			    "flswai", 0);
 		}
 		mtx_unlock(&bdirtylock);
 	}
 }
 
 /*
  * Return true if we have too many dirty buffers.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  */
 void
 brelse(struct buf *bp)
 {
 	struct mount *v_mnt;
 	int qindex;
 
 	/*
 	 * Many functions erroneously call brelse with a NULL bp under rare
 	 * error conditions. Simply return when called with a NULL bp.
 	 */
 	if (bp == NULL)
 		return;
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
 	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		bdirty(bp);
 	}
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
 	    !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  All errors except ENXIO (which
 		 * means the device is gone) are treated as being
 		 * transient.
 		 *
 		 * XXX Treating EIO as transient is not correct; the
 		 * contract with the local storage device drivers is that
 		 * they will only return EIO once the I/O is no longer
 		 * retriable.  Network I/O also respects this through the
 		 * guarantees of TCP and/or the internal retries of NFS.
 		 * ENOMEM might be transient, but we also have no way of
 		 * knowing when its ok to retry/reschedule.  In general,
 		 * this entire case should be made obsolete through better
 		 * error handling/recovery and resource scheduling.
 		 *
 		 * Do this also for buffers that failed with ENXIO, but have
 		 * non-empty dependencies - the soft updates code might need
 		 * to access the buffer to untangle them.
 		 *
 		 * Must clear BIO_ERROR to prevent pages from being scrapped.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed read I/O, or we were asked to free or not
 		 * cache the buffer, or we failed to write to a device that's
 		 * no longer present.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub(bp);
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate() 
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 * 
 	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
 	 * if B_DELWRI is set.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 
 	v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL;
 
 	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
 	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
 	    (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 ||
 	    vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) {
 		vfs_vmio_invalidate(bp);
 		allocbuf(bp, 0);
 	}
 
 	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
 		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 			
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	buf_track(bp, __func__);
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		buf_free(bp);
 		return;
 	}
 	/* buffers with junk contents */
 	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT);
 	/* binsfree unlocks bp. */
 	binsfree(bp, qindex);
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	qindex = QUEUE_NONE;
 	if (BUF_LOCKRECURSED(bp)) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
 	if (bp->b_flags & B_MANAGED) {
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		goto out;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
 	    BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		qindex = QUEUE_DIRTY;
 	} else {
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
 		if ((bp->b_flags & B_NOREUSE) != 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	buf_track(bp, __func__);
 	/* binsfree unlocks bp. */
 	binsfree(bp, qindex);
 	return;
 
 out:
 	buf_track(bp, __func__);
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
  * restore bogus pages.
  */
 static void
 vfs_vmio_iodone(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	vm_object_t obj;
 	struct vnode *vp __unused;
 	int i, iosize, resid;
 	bool bogus;
 
 	obj = bp->b_bufobj->bo_object;
 	KASSERT(obj->paging_in_progress >= bp->b_npages,
 	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
 	    obj->paging_in_progress, bp->b_npages));
 
 	vp = bp->b_vp;
 	KASSERT(vp->v_holdcnt > 0,
 	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
 	KASSERT(vp->v_object != NULL,
 	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
 
 	bogus = false;
 	iosize = bp->b_bcount - bp->b_resid;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 		if (resid > iosize)
 			resid = iosize;
 
 		/*
 		 * cleanup bogus pages, restoring the originals
 		 */
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			bogus = true;
 			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 			if (m == NULL)
 				panic("biodone: page disappeared!");
 			bp->b_pages[i] = m;
 		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we 
 			 * only need to do this here in the read case.
 			 */
 			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
 			    resid)) == 0, ("vfs_vmio_iodone: page %p "
 			    "has unexpected dirty bits", m));
 			vfs_page_set_valid(bp, foff, m);
 		}
 		KASSERT(OFF_TO_IDX(foff) == m->pindex,
 		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
 		    (intmax_t)foff, (uintmax_t)m->pindex));
 
 		vm_page_sunbusy(m);
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		iosize -= resid;
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  * Perform page invalidation when a buffer is released.  The fully invalid
  * pages will be reclaimed later in vfs_vmio_truncate().
  */
 static void
 vfs_vmio_invalidate(struct buf *bp)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int flags, i, resid, poffset, presid;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/*
 	 * Get the base offset and length of the buffer.  Note that 
 	 * in the VMIO case if the buffer block size is not
 	 * page-aligned then b_data pointer may not be page-aligned.
 	 * But our b_pages[] array *IS* page aligned.
 	 *
 	 * block sizes less then DEV_BSIZE (usually 512) are not 
 	 * supported due to the page granularity bits (m->valid,
 	 * m->dirty, etc...). 
 	 *
 	 * See man buf(9) for more information
 	 */
 	flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
 	obj = bp->b_bufobj->bo_object;
 	resid = bp->b_bufsize;
 	poffset = bp->b_offset & PAGE_MASK;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page)
 			panic("vfs_vmio_invalidate: Unexpected bogus page.");
 		bp->b_pages[i] = NULL;
 
 		presid = resid > (PAGE_SIZE - poffset) ?
 		    (PAGE_SIZE - poffset) : resid;
 		KASSERT(presid >= 0, ("brelse: extra page"));
 		while (vm_page_xbusied(m)) {
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(obj);
 			vm_page_busy_sleep(m, "mbncsh", true);
 			VM_OBJECT_WLOCK(obj);
 		}
 		if (pmap_page_wired_mappings(m) == 0)
 			vm_page_set_invalid(m, poffset, presid);
 		vm_page_release_locked(m, flags);
 		resid -= presid;
 		poffset = 0;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = 0;
 }
 
 /*
  * Page-granular truncation of an existing VMIO buffer.
  */
 static void
 vfs_vmio_truncate(struct buf *bp, int desiredpages)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int flags, i;
 
 	if (bp->b_npages == desiredpages)
 		return;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
 		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 
 	/*
 	 * The object lock is needed only if we will attempt to free pages.
 	 */
 	flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
 	if ((bp->b_flags & B_DIRECT) != 0) {
 		flags |= VPR_TRYFREE;
 		obj = bp->b_bufobj->bo_object;
 		VM_OBJECT_WLOCK(obj);
 	} else {
 		obj = NULL;
 	}
 	for (i = desiredpages; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
 		bp->b_pages[i] = NULL;
 		if (obj != NULL)
 			vm_page_release_locked(m, flags);
 		else
 			vm_page_release(m, flags);
 	}
 	if (obj != NULL)
 		VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = desiredpages;
 }
 
 /*
  * Byte granular extension of VMIO buffers.
  */
 static void
 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
 {
 	/*
 	 * We are growing the buffer, possibly in a 
 	 * byte-granular fashion.
 	 */
 	vm_object_t obj;
 	vm_offset_t toff;
 	vm_offset_t tinc;
 	vm_page_t m;
 
 	/*
 	 * Step 1, bring in the VM pages from the object, allocating
 	 * them if necessary.  We must clear B_CACHE if these pages
 	 * are not valid for the range covered by the buffer.
 	 */
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	if (bp->b_npages < desiredpages) {
 		/*
 		 * We must allocate system pages since blocking
 		 * here could interfere with paging I/O, no
 		 * matter which process we are.
 		 *
 		 * Only exclusive busy can be tested here.
 		 * Blocking on shared busy might lead to
 		 * deadlocks once allocbuf() is called after
 		 * pages are vfs_busy_pages().
 		 */
 		(void)vm_page_grab_pages(obj,
 		    OFF_TO_IDX(bp->b_offset) + bp->b_npages,
 		    VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_NOBUSY | VM_ALLOC_WIRED,
 		    &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages);
 		bp->b_npages = desiredpages;
 	}
 
 	/*
 	 * Step 2.  We've loaded the pages into the buffer,
 	 * we have to figure out if we can still have B_CACHE
 	 * set.  Note that B_CACHE is set according to the
 	 * byte-granular range ( bcount and size ), not the
 	 * aligned range ( newbsize ).
 	 *
 	 * The VM test is against m->valid, which is DEV_BSIZE
 	 * aligned.  Needless to say, the validity of the data
 	 * needs to also be DEV_BSIZE aligned.  Note that this
 	 * fails with NFS if the server or some other client
 	 * extends the file's EOF.  If our buffer is resized, 
 	 * B_CACHE may remain set! XXX
 	 */
 	toff = bp->b_bcount;
 	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 	while ((bp->b_flags & B_CACHE) && toff < size) {
 		vm_pindex_t pi;
 
 		if (tinc > (size - toff))
 			tinc = size - toff;
 		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
 		m = bp->b_pages[pi];
 		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
 		toff += tinc;
 		tinc = PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 	/*
 	 * Step 3, fixup the KVA pmap.
 	 */
 	if (buf_mapped(bp))
 		bpmap_qenter(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 }
 
 /*
  * Check to see if a block at a particular lbn is available for a clustered
  * write.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *bpa;
 	int match;
 
 	match = 0;
 
 	/* If the buf isn't in core skip it */
 	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
 		return (0);
 
 	/* If the buf is busy we don't want to wait for it */
 	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/* Only cluster with valid clusterable delayed write buffers */
 	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
 	    (B_DELWRI | B_CLUSTEROK))
 		goto done;
 
 	if (bpa->b_bufsize != size)
 		goto done;
 
 	/*
 	 * Check to see if it is in the expected place on disk and that the
 	 * block has been mapped.
 	 */
 	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
 		match = 1;
 done:
 	BUF_UNLOCK(bpa);
 	return (match);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better then the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather then at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  *	getnewbuf_kva:
  *
  *	Allocate KVA for an empty buf header according to gbflags.
  */
 static int
 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
 {
 
 	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
 		/*
 		 * In order to keep fragmentation sane we only allocate kva
 		 * in BKVASIZE chunks.  XXX with vmem we can do page size.
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 		if (maxsize != bp->b_kvasize &&
 		    bufkva_alloc(bp, maxsize, gbflags))
 			return (ENOSPC);
 	}
 	return (0);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	The caller is responsible for releasing the reserved bufspace after
  *	allocbuf() is called.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
 {
 	struct bufdomain *bd;
 	struct buf *bp;
 	bool metadata, reserved;
 
 	bp = NULL;
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = true;
 	else
 		metadata = false;
 	if (vp == NULL)
 		bd = &bdomain[0];
 	else
 		bd = &bdomain[vp->v_bufobj.bo_domain];
 
 	counter_u64_add(getnewbufcalls, 1);
 	reserved = false;
 	do {
 		if (reserved == false &&
 		    bufspace_reserve(bd, maxsize, metadata) != 0) {
 			counter_u64_add(getnewbufrestarts, 1);
 			continue;
 		}
 		reserved = true;
 		if ((bp = buf_alloc(bd)) == NULL) {
 			counter_u64_add(getnewbufrestarts, 1);
 			continue;
 		}
 		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
 			return (bp);
 		break;
 	} while (buf_recycle(bd, false) == 0);
 
 	if (reserved)
 		bufspace_release(bd, maxsize);
 	if (bp != NULL) {
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 	}
 	bufspace_wait(bd, vp, gbflags, slpflag, slptimeo);
 
 	return (NULL);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
 buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
 {
 	int flushed;
 
 	flushed = flushbufqueues(vp, bd, target, 0);
 	if (flushed == 0) {
 		/*
 		 * Could not find any buffers without rollback
 		 * dependencies, so just write the first one
 		 * in the hopes of eventually making progress.
 		 */
 		if (vp != NULL && target > 2)
 			target /= 2;
 		flushbufqueues(vp, bd, target, 1);
 	}
 	return (flushed);
 }
 
 static void
 buf_daemon()
 {
 	struct bufdomain *bd;
 	int speedupreq;
 	int lodirty;
 	int i;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
 	    SHUTDOWN_PRI_LAST + 100);
 
 	/*
 	 * Start the buf clean daemons as children threads.
 	 */
 	for (i = 0 ; i < buf_domains; i++) {
 		int error;
 
 		error = kthread_add((void (*)(void *))bufspace_daemon,
 		    &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
 		if (error)
 			panic("error %d spawning bufspace daemon", error);
 	}
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kthread_suspend_check();
 
 		/*
 		 * Save speedupreq for this pass and reset to capture new
 		 * requests.
 		 */
 		speedupreq = bd_speedupreq;
 		bd_speedupreq = 0;
 
 		/*
 		 * Flush each domain sequentially according to its level and
 		 * the speedup request.
 		 */
 		for (i = 0; i < buf_domains; i++) {
 			bd = &bdomain[i];
 			if (speedupreq)
 				lodirty = bd->bd_numdirtybuffers / 2;
 			else
 				lodirty = bd->bd_lodirtybuffers;
 			while (bd->bd_numdirtybuffers > lodirty) {
 				if (buf_flush(NULL, bd,
 				    bd->bd_numdirtybuffers - lodirty) == 0)
 					break;
 				kern_yield(PRI_USER);
 			}
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second and
 		 * then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush a buffer in the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of write them, which NFS is 
  *	particularly sensitive to.
  */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
 flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
     int flushdeps)
 {
 	struct bufqueue *bq;
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int error;
 	bool unlock;
 
 	flushed = 0;
 	bq = &bd->bd_dirtyq;
 	bp = NULL;
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	BQ_LOCK(bq);
 	TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
 	BQ_UNLOCK(bq);
 	while (flushed != target) {
 		maybe_yield();
 		BQ_LOCK(bq);
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel,
 			    b_freelist);
 		} else {
 			BQ_UNLOCK(bq);
 			break;
 		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
 		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
 		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
 		    bp->b_vp != lvp)) {
 			BQ_UNLOCK(bq);
 			continue;
 		}
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		BQ_UNLOCK(bq);
 		if (error != 0)
 			continue;
 
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (lvp == NULL) {
 			unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		} else {
 			ASSERT_VOP_LOCKED(vp, "getbuf");
 			unlock = false;
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 		}
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
 			} else {
 				bremfree(bp);
 				bwrite(bp);
 				counter_u64_add(notbufdflushes, 1);
 			}
 			vn_finished_write(mp);
 			if (unlock)
 				VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 
 			/*
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
 			if (curproc == bufdaemonproc &&
 			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	BQ_LOCK(bq);
 	TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
 	BQ_UNLOCK(bq);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *bp;
 
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	BO_RUNLOCK(bo);
 	return (bp);
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 
 static int
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t toff, tinc, size;
 	vm_page_t m;
 	vm_ooffset_t off;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	if (incore(&vp->v_bufobj, blkno))
 		return 1;
 	if (vp->v_mount == NULL)
 		return 0;
 	obj = vp->v_object;
 	if (obj == NULL)
 		return (0);
 
 	size = PAGE_SIZE;
 	if (size > vp->v_mount->mnt_stat.f_iosize)
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	VM_OBJECT_RLOCK(obj);
 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 		if (!m)
 			goto notinmem;
 		tinc = size;
 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 		if (vm_page_is_valid(m,
 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 			goto notinmem;
 	}
 	VM_OBJECT_RUNLOCK(obj);
 	return 1;
 
 notinmem:
 	VM_OBJECT_RUNLOCK(obj);
 	return (0);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intevention.
  *
  * Note that while we only really need to clean through to b_bcount, we
  * just go ahead and clean through to b_bufsize.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff, noff, eoff;
 	vm_page_t m;
 	int i;
 
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	vfs_setdirty_locked_object(bp);
 	for (i = 0; i < bp->b_npages; i++) {
 		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		eoff = noff;
 		if (eoff > bp->b_offset + bp->b_bufsize)
 			eoff = bp->b_offset + bp->b_bufsize;
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 		foff = noff;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_object_t object;
 	int i;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
 
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * Calculate the encompassing dirty range, boffset and eoffset,
 		 * (eoffset - boffset) bytes.
 		 */
 
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty)
 				break;
 		}
 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/*
 		 * Fit it to the buffer.
 		 */
 
 		if (eoffset > bp->b_bcount)
 			eoffset = bp->b_bcount;
 
 		/*
 		 * If we have a good dirty range, merge with the existing
 		 * dirty range.
 		 */
 
 		if (boffset < eoffset) {
 			if (bp->b_dirtyoff > boffset)
 				bp->b_dirtyoff = boffset;
 			if (bp->b_dirtyend < eoffset)
 				bp->b_dirtyend = eoffset;
 		}
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.
  * If an unmapped buffer is provided but a mapped buffer is requested, take
  * also care to properly setup mappings between pages and KVA.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	need_mapping = bp->b_data == unmapped_buf &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = bp->b_kvabase == unmapped_buf &&
 	    bp->b_data == unmapped_buf &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && bp->b_kvabase != unmapped_buf) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 	offset = blkno * bsize;
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
 		if ((gbflags & GB_NOWAIT_BD) != 0) {
 			/*
 			 * XXXKIB: defragmentation cannot
 			 * succeed, not sure what else to do.
 			 */
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		counter_u64_add(mappingrestarts, 1);
 		bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
 		/* b_offset is handled by bpmap_qenter. */
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 		bpmap_qenter(bp);
 	}
 }
 
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	int error;
 
 	error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp);
 	if (error != 0)
 		return (NULL);
 	return (bp);
 }
 
 /*
  *	getblkx:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whos
  *	B_CACHE bit is clear.
  *	
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
 int
 getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	daddr_t d_blkno;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > maxbcachebuf)
 		panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
 		    maxbcachebuf);
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 	d_blkno = blkno;
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if ((flags & GB_LOCK_NOWAIT) != 0)
 			lockflags |= LK_NOWAIT;
 
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error != 0)
 			return (error);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due 
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return (EEXIST);
 
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
 		if ((flags & GB_NOSPARSE) != 0 && vmio &&
 		    !vn_isdisk(vp, NULL)) {
 			error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0);
 			KASSERT(error != EOPNOTSUPP,
 			    ("GB_NOSPARSE from fs not supporting bmap, vp %p",
 			    vp));
 			if (error != 0)
 				return (error);
 			if (d_blkno == -1)
 				return (EJUSTRETURN);
 		}
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return (ETIMEDOUT);
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
 			 *
 			 * There's an issue on low memory, 4BSD+non-preempt
 			 * systems (eg MIPS routers with 32MB RAM) where buffer
 			 * exhaustion occurs without sleeping for buffer
 			 * reclaimation.  This just sticks in a loop and
 			 * constantly attempts to allocate a buffer, which
 			 * hits exhaustion and tries to wakeup bufdaemon.
 			 * This never happens because we never yield.
 			 *
 			 * The real solution is to identify and fix these cases
 			 * so we aren't effectively busy-waiting in a loop
 			 * until the reclaimation path has cycles to run.
 			 */
 			kern_yield(PRI_USER);
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			bufspace_release(bufdomain(bp), maxsize);
 			brelse(bp);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_lblkno = blkno;
 		bp->b_blkno = d_blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bufspace_release(bufdomain(bp), maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
 end:
 	buf_track(bp, __func__);
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	*bpp = bp;
 	return (0);
 }
 
 /*
  * Get an empty, disassociated buffer of given size.  The buffer is initially
  * set to B_INVAL.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *bp;
 	int maxsize;
 
 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 	while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
 		if ((flags & GB_NOWAIT_BD) &&
 		    (curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(bp, size);
 	bufspace_release(bufdomain(bp), maxsize);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(bp);
 	return (bp);
 }
 
 /*
  * Truncate the backing store for a non-vmio buffer.
  */
 static void
 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
 {
 
 	if (bp->b_flags & B_MALLOC) {
 		/*
 		 * malloced buffers are not shrunk
 		 */
 		if (newbsize == 0) {
 			bufmallocadjust(bp, 0);
 			free(bp->b_data, M_BIOBUF);
 			bp->b_data = bp->b_kvabase;
 			bp->b_flags &= ~B_MALLOC;
 		}
 		return;
 	}
 	vm_hold_free_pages(bp, newbsize);
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * Extend the backing for a non-VMIO buffer.
  */
 static void
 vfs_nonvmio_extend(struct buf *bp, int newbsize)
 {
 	caddr_t origbuf;
 	int origbufsize;
 
 	/*
 	 * We only use malloced memory on the first allocation.
 	 * and revert to page-allocated memory when the buffer
 	 * grows.
 	 *
 	 * There is a potential smp race here that could lead
 	 * to bufmallocspace slightly passing the max.  It
 	 * is probably extremely rare and not worth worrying
 	 * over.
 	 */
 	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
 	    bufmallocspace < maxbufmallocspace) {
 		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
 		bp->b_flags |= B_MALLOC;
 		bufmallocadjust(bp, newbsize);
 		return;
 	}
 
 	/*
 	 * If the buffer is growing on its other-than-first
 	 * allocation then we revert to the page-allocation
 	 * scheme.
 	 */
 	origbuf = NULL;
 	origbufsize = 0;
 	if (bp->b_flags & B_MALLOC) {
 		origbuf = bp->b_data;
 		origbufsize = bp->b_bufsize;
 		bp->b_data = bp->b_kvabase;
 		bufmallocadjust(bp, 0);
 		bp->b_flags &= ~B_MALLOC;
 		newbsize = round_page(newbsize);
 	}
 	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
 	    (vm_offset_t) bp->b_data + newbsize);
 	if (origbuf != NULL) {
 		bcopy(origbuf, bp->b_data, origbufsize);
 		free(origbuf, M_BIOBUF);
 	}
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!! 
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize;
 
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_bcount == size)
 		return (1);
 
 	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	newbsize = roundup2(size, DEV_BSIZE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		if ((bp->b_flags & B_MALLOC) == 0)
 			newbsize = round_page(newbsize);
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		if (newbsize < bp->b_bufsize)
 			vfs_nonvmio_truncate(bp, newbsize);
 		else if (newbsize > bp->b_bufsize)
 			vfs_nonvmio_extend(bp, newbsize);
 	} else {
 		int desiredpages;
 
 		desiredpages = (size == 0) ? 0 :
 		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize)
 			vfs_vmio_truncate(bp, desiredpages);
 		/* XXX This looks as if it should be newbsize > b_bufsize */
 		else if (size > bp->b_bcount)
 			vfs_vmio_extend(bp, desiredpages, size);
 		bufspace_adjust(bp, newbsize);
 	}
 	bp->b_bcount = size;		/* requested buffer size. */
 	return (1);
 }
 
 extern int inflight_transient_maps;
 
 static struct bio_queue nondump_bios;
 
 void
 biodone(struct bio *bp)
 {
 	struct mtx *mtxp;
 	void (*done)(struct bio *);
 	vm_offset_t start, end;
 
 	biotrack(bp, __func__);
 
 	/*
 	 * Avoid completing I/O when dumping after a panic since that may
 	 * result in a deadlock in the filesystem or pager code.  Note that
 	 * this doesn't affect dumps that were started manually since we aim
 	 * to keep the system usable after it has been resumed.
 	 */
 	if (__predict_false(dumping && SCHEDULER_STOPPED())) {
 		TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue);
 		return;
 	}
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 		start = trunc_page((vm_offset_t)bp->bio_data);
 		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
 		bp->bio_data = unmapped_buf;
 		pmap_qremove(start, atop(end - start));
 		vmem_free(transient_arena, start, end - start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 	done = bp->bio_done;
 	if (done == NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, bp);
 		mtx_lock(mtxp);
 		bp->bio_flags |= BIO_DONE;
 		wakeup(bp);
 		mtx_unlock(mtxp);
 	} else
 		done(bp);
 }
 
 /*
  * Wait for a BIO to finish.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	while ((bp->bio_flags & BIO_DONE) == 0)
 		msleep(bp, mtxp, PRIBIO, wchan, 0);
 	mtx_unlock(mtxp);
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	if (!(bp->bio_flags & BIO_ERROR))
 		return (0);
 	return (EIO);
 }
 
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 	
 	if (error) {
 		bp->bio_error = error;
 		bp->bio_flags |= BIO_ERROR;
 	}
 	if (stat != NULL)
 		devstat_end_transaction_bio(stat, bp);
 	biodone(bp);
 }
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 void
 biotrack_buf(struct bio *bp, const char *location)
 {
 
 	buf_track(bp->bio_track_bp, location);
 }
 #endif
 
 /*
  *	bufwait:
  *
  *	Wait for buffer I/O completion, returning error status.  The buffer
  *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
  *	error and cleared.
  */
 int
 bufwait(struct buf *bp)
 {
 	if (bp->b_iocmd == BIO_READ)
 		bwait(bp, PRIBIO, "biord");
 	else
 		bwait(bp, PRIBIO, "biowr");
 	if (bp->b_flags & B_EINTR) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if (bp->b_ioflags & BIO_ERROR) {
 		return (bp->b_error ? bp->b_error : EIO);
 	} else {
 		return (0);
 	}
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
  *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
  *	assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
  *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the biodone routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	struct bufobj *dropobj;
 	void    (*biodone)(struct buf *);
 
 	buf_track(bp, __func__);
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	dropobj = NULL;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 	BUF_ASSERT_HELD(bp);
 
 	runningbufwakeup(bp);
 	if (bp->b_iocmd == BIO_WRITE)
 		dropobj = bp->b_bufobj;
 	/* call optional completion function if requested */
 	if (bp->b_iodone != NULL) {
 		biodone = bp->b_iodone;
 		bp->b_iodone = NULL;
 		(*biodone) (bp);
 		if (dropobj)
 			bufobj_wdrop(dropobj);
 		return;
 	}
 	if (bp->b_flags & B_VMIO) {
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
 		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		if (bp->b_iocmd == BIO_READ &&
 		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
 		    !(bp->b_ioflags & BIO_ERROR))
 			bp->b_flags |= B_CACHE;
 		vfs_vmio_iodone(bp);
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 	if ((bp->b_flags & B_CKHASH) != 0) {
 		KASSERT(bp->b_iocmd == BIO_READ,
 		    ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd));
 		KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp));
 		(*bp->b_ckhashcalc)(bp);
 	}
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
 	 * here in the async case. The sync case always needs to do a wakeup.
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
 		    (bp->b_ioflags & BIO_ERROR))
 			brelse(bp);
 		else
 			bqrelse(bp);
 	} else
 		bdone(bp);
 	if (dropobj)
 		bufobj_wdrop(dropobj);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistent.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			if (buf_mapped(bp)) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Set the valid bits in a page based on the supplied offset.   The
  *	range is restricted to the buffer's size.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t eoff;
 
 	/*
 	 * Compute the end offset, eoff, such that [off, eoff) does not span a
 	 * page boundary and eoff is not greater than the end of the buffer.
 	 * The end of the buffer, in this case, is our file EOF, not the
 	 * allocation size of the buffer.
 	 */
 	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 	if (eoff > bp->b_offset + bp->b_bcount)
 		eoff = bp->b_offset + bp->b_bcount;
 
 	/*
 	 * Set valid range.  This is typically the entire buffer and thus the
 	 * entire page.
 	 */
 	if (eoff > off)
 		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Set the valid bits and clear the dirty bits in a page based on the
  *	supplied offset.   The range is restricted to the buffer's size.
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t soff, eoff;
 
 	/*
 	 * Start and end offsets in buffer.  eoff - soff may not cross a
 	 * page boundary or cross the end of the buffer.  The end of the
 	 * buffer, in this case, is our file EOF, not the allocation size
 	 * of the buffer.
 	 */
 	soff = off;
 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	if (eoff > bp->b_offset + bp->b_bcount)
 		eoff = bp->b_offset + bp->b_bcount;
 
 	/*
 	 * Set valid range.  This is typically the entire buffer and thus the
 	 * entire page.
 	 */
 	if (eoff > soff) {
 		vm_page_set_validclean(
 		    m,
 		   (vm_offset_t) (soff & PAGE_MASK),
 		   (vm_offset_t) (eoff - soff)
 		);
 	}
 }
 
 /*
  * Ensure that all buffer pages are not exclusive busied.  If any page is
  * exclusive busy, drain it.
  */
 void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
 	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (vm_page_xbusied(m)) {
 			for (; last_busied < i; last_busied++)
 				vm_page_sbusy(bp->b_pages[last_busied]);
 			while (vm_page_xbusied(m)) {
 				vm_page_lock(m);
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				vm_page_busy_sleep(m, "vbpage", true);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 			}
 		}
 	}
 	for (i = 0; i < last_busied; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 	bool bogus;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_WLOCK(obj);
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	bogus = false;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in 
 		 * partially instantiated buffers.  Partially 
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus = true;
 		}
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Set the range within the buffer to valid.  The range is
  *	relative to the beginning of the buffer, b_offset.  Note that
  *	b_offset itself may be offset from the beginning of the first
  *	page.
  */
 void   
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	int i, n;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	/*
 	 * Fixup base to be relative to beginning of first page.
 	 * Set initial n to be the maximum number of bytes in the
 	 * first page that can be validated.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	n = PAGE_SIZE - (base & PAGE_MASK);
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 		m = bp->b_pages[i];
 		if (n > size)
 			n = size;
 		vm_page_set_valid_range(m, base & PAGE_MASK, n);
 		base += n;
 		size -= n;
 		n = PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  */
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask, sa, ea, slide;
 
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		if ((bp->b_pages[0]->valid & mask) == 0) {
 			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 	}
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	vm_page_t m;
 	int i, n;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 	} else {
 		BUF_CHECK_UNMAPPED(bp);
 		n = PAGE_SIZE - (base & PAGE_MASK);
 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 			m = bp->b_pages[i];
 			if (n > size)
 				n = size;
 			pmap_zero_page_area(m, base & PAGE_MASK, n);
 			base += n;
 			size -= n;
 			n = PAGE_SIZE;
 		}
 	}
 }
 
 /*
  * Update buffer flags based on I/O request parameters, optionally releasing the
  * buffer.  If it's VMIO or direct I/O, the buffer pages are released to the VM,
  * where they may be placed on a page queue (VMIO) or freed immediately (direct
  * I/O).  Otherwise the buffer is released to the cache.
  */
 static void
 b_io_dismiss(struct buf *bp, int ioflag, bool release)
 {
 
 	KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0,
 	    ("buf %p non-VMIO noreuse", bp));
 
 	if ((ioflag & IO_DIRECT) != 0)
 		bp->b_flags |= B_DIRECT;
 	if ((ioflag & IO_EXT) != 0)
 		bp->b_xflags |= BX_ALTDATA;
 	if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) {
 		bp->b_flags |= B_RELBUF;
 		if ((ioflag & IO_NOREUSE) != 0)
 			bp->b_flags |= B_NOREUSE;
 		if (release)
 			brelse(bp);
 	} else if (release)
 		bqrelse(bp);
 }
 
 void
 vfs_bio_brelse(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, true);
 }
 
 void
 vfs_bio_set_flags(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, false);
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	int index;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.
 		 */
 		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) |
 		    VM_ALLOC_WAITOK);
 		pmap_qenter(pg, &p, 1);
 		bp->b_pages[index] = p;
 	}
 	bp->b_npages = index;
 }
 
 /* Return pages associated with this buf to the vm system */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t from;
 	vm_page_t p;
 	int index, newnpages;
 
 	BUF_CHECK_MAPPED(bp);
 
 	from = round_page((vm_offset_t)bp->b_data + newbsize);
 	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (bp->b_npages > newnpages)
 		pmap_qremove(from, bp->b_npages - newnpages);
 	for (index = newnpages; index < bp->b_npages; index++) {
 		p = bp->b_pages[index];
 		bp->b_pages[index] = NULL;
 		p->wire_count--;
 		vm_page_free(p);
 	}
 	vm_wire_sub(bp->b_npages - newnpages);
 	bp->b_npages = newnpages;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * All requests are (re)mapped into kernel VA space.
  * Notice that we use b_bufsize for the size of the buffer
  * to be mapped.  b_bcount might be modified by the driver.
  *
  * Note that even if the caller determines that the address space should
  * be valid, a race or a smaller-file mapped into a larger space may
  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
  * check the return value.
  *
  * This function only works with pager buffers.
  */
 int
 vmapbuf(struct buf *bp, int mapbuf)
 {
 	vm_prot_t prot;
 	int pidx;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 	prot = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
 	    btoc(MAXPHYS))) < 0)
 		return (-1);
 	bp->b_npages = pidx;
 	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
 	if (mapbuf || !unmapped_buf_allowed) {
 		pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
 		bp->b_data = bp->b_kvabase + bp->b_offset;
 	} else
 		bp->b_data = unmapped_buf;
 	return(0);
 }
 
 /*
  * Free the io map PTEs associated with this IO operation.
  * We also invalidate the TLB entries and restore the original b_addr.
  *
  * This function only works with pager buffers.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int npages;
 
 	npages = bp->b_npages;
 	if (buf_mapped(bp))
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
 	vm_page_unhold_pages(bp->b_pages, npages);
 
 	bp->b_data = unmapped_buf;
 }
 
 void
 bdone(struct buf *bp)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(mtxp);
 }
 
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	while ((bp->b_flags & B_DONE) == 0)
 		msleep(bp, mtxp, pri, wchan, 0);
 	mtx_unlock(mtxp);
 }
 
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 
 	return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i __unused;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  * Initialize a struct bufobj before use.  Memory is assumed zero filled.
  */
 void
 bufobj_init(struct bufobj *bo, void *private)
 {
 	static volatile int bufobj_cleanq;
 
         bo->bo_domain =
             atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
         bo->bo_private = private;
         TAILQ_INIT(&bo->bo_clean.bv_hd);
         TAILQ_INIT(&bo->bo_dirty.bv_hd);
 }
 
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 	BO_LOCK(bo);
 	bo->bo_numoutput++;
 	BO_UNLOCK(bo);
 }
 
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Set bio_data or bio_ma for struct bio from the struct buf.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	if (!buf_mapped(bp)) {
 		KASSERT(unmapped_buf_allowed, ("unmapped"));
 		bip->bio_ma = bp->b_pages;
 		bip->bio_ma_n = bp->b_npages;
 		bip->bio_data = unmapped_buf;
 		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 		bip->bio_flags |= BIO_UNMAPPED;
 		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 		    PAGE_SIZE == bp->b_npages,
 		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 		    (long long)bip->bio_length, bip->bio_ma_n));
 	} else {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 	}
 }
 
 /*
  * The MIPS pmap code currently doesn't handle aliased pages.
  * The VIPT caches may not handle page aliasing themselves, leading
  * to data corruption.
  *
  * As such, this code makes a system extremely unhappy if said
  * system doesn't support unaliasing the above situation in hardware.
  * Some "recent" systems (eg some mips24k/mips74k cores) don't enable
  * this feature at build time, so it has to be handled in software.
  *
  * Once the MIPS pmap/cache code grows to support this function on
  * earlier chips, it should be flipped back off.
  */
 #ifdef	__mips__
 static int buf_pager_relbuf = 1;
 #else
 static int buf_pager_relbuf = 0;
 #endif
 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
     &buf_pager_relbuf, 0,
     "Make buffer pager release buffers after reading");
 
 /*
  * The buffer pager.  It uses buffer reads to validate pages.
  *
  * In contrast to the generic local pager from vm/vnode_pager.c, this
  * pager correctly and easily handles volumes where the underlying
  * device block size is greater than the machine page size.  The
  * buffer cache transparently extends the requested page run to be
  * aligned at the block boundary, and does the necessary bogus page
  * replacements in the addends to avoid obliterating already valid
  * pages.
  *
  * The only non-trivial issue is that the exclusive busy state for
  * pages, which is assumed by the vm_pager_getpages() interface, is
  * incompatible with the VMIO buffer cache's desire to share-busy the
  * pages.  This function performs a trivial downgrade of the pages'
  * state before reading buffers, and a less trivial upgrade from the
  * shared-busy to excl-busy state after the read.
  */
 int
 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
     int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
     vbg_get_blksize_t get_blksize)
 {
 	vm_page_t m;
 	vm_object_t object;
 	struct buf *bp;
 	struct mount *mp;
 	daddr_t lbn, lbnp;
 	vm_ooffset_t la, lb, poff, poffe;
 	long bsize;
 	int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
 	bool redo, lpart;
 
 	object = vp->v_object;
 	mp = vp->v_mount;
 	error = 0;
 	la = IDX_TO_OFF(ma[count - 1]->pindex);
 	if (la >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 
 	/*
 	 * Change the meaning of la from where the last requested page starts
 	 * to where it ends, because that's the end of the requested region
 	 * and the start of the potential read-ahead region.
 	 */
 	la += PAGE_SIZE;
 	lpart = la > object->un_pager.vnp.vnp_size;
 	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
 
 	/*
 	 * Calculate read-ahead, behind and total pages.
 	 */
 	pgsin = count;
 	lb = IDX_TO_OFF(ma[0]->pindex);
 	pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
 	pgsin += pgsin_b;
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
 	if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
 		pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
 		    PAGE_SIZE) - la);
 	pgsin += pgsin_a;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, pgsin);
 
 	br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
 	    != 0) ? GB_UNMAPPED : 0;
 	VM_OBJECT_WLOCK(object);
 again:
 	for (i = 0; i < count; i++)
 		vm_page_busy_downgrade(ma[i]);
 	VM_OBJECT_WUNLOCK(object);
 
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 
 		/*
 		 * Pages are shared busy and the object lock is not
 		 * owned, which together allow for the pages'
 		 * invalidation.  The racy test for validity avoids
 		 * useless creation of the buffer for the most typical
 		 * case when invalidation is not used in redo or for
 		 * parallel read.  The shared->excl upgrade loop at
 		 * the end of the function catches the race in a
 		 * reliable way (protected by the object lock).
 		 */
 		if (m->valid == VM_PAGE_BITS_ALL)
 			continue;
 
 		poff = IDX_TO_OFF(m->pindex);
 		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
 		for (; poff < poffe; poff += bsize) {
 			lbn = get_lblkno(vp, poff);
 			if (lbn == lbnp)
 				goto next_page;
 			lbnp = lbn;
 
 			bsize = get_blksize(vp, lbn);
 			error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
 			    br_flags, &bp);
 			if (error != 0)
 				goto end_pages;
 			if (LIST_EMPTY(&bp->b_dep)) {
 				/*
 				 * Invalidation clears m->valid, but
 				 * may leave B_CACHE flag if the
 				 * buffer existed at the invalidation
 				 * time.  In this case, recycle the
 				 * buffer to do real read on next
 				 * bread() after redo.
 				 *
 				 * Otherwise B_RELBUF is not strictly
 				 * necessary, enable to reduce buf
 				 * cache pressure.
 				 */
 				if (buf_pager_relbuf ||
 				    m->valid != VM_PAGE_BITS_ALL)
 					bp->b_flags |= B_RELBUF;
 
 				bp->b_flags &= ~B_NOCACHE;
 				brelse(bp);
 			} else {
 				bqrelse(bp);
 			}
 		}
 		KASSERT(1 /* racy, enable for debugging */ ||
 		    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
 		    ("buf %d %p invalid", i, m));
 		if (i == count - 1 && lpart) {
 			VM_OBJECT_WLOCK(object);
 			if (m->valid != 0 &&
 			    m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, TRUE);
 			VM_OBJECT_WUNLOCK(object);
 		}
 next_page:;
 	}
 end_pages:
 
 	VM_OBJECT_WLOCK(object);
 	redo = false;
 	for (i = 0; i < count; i++) {
 		vm_page_sunbusy(ma[i]);
 		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
 
 		/*
 		 * Since the pages were only sbusy while neither the
 		 * buffer nor the object lock was held by us, or
 		 * reallocated while vm_page_grab() slept for busy
 		 * relinguish, they could have been invalidated.
 		 * Recheck the valid bits and re-read as needed.
 		 *
 		 * Note that the last page is made fully valid in the
 		 * read loop, and partial validity for the page at
 		 * index count - 1 could mean that the page was
 		 * invalidated or removed, so we must restart for
 		 * safety as well.
 		 */
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			redo = true;
 	}
 	if (redo && error == 0)
 		goto again;
 	VM_OBJECT_WUNLOCK(object);
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /* DDB command to show buffer data */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
 #ifdef FULL_BUF_TRACKING
 	uint32_t i, j;
 #endif
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
 	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
 	db_printf("b_kvabase = %p, b_kvasize = %d\n",
 	    bp->b_kvabase, bp->b_kvasize);
 	if (bp->b_npages) {
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			if (m != NULL)
 				db_printf("(%p, 0x%lx, 0x%lx)", m->object,
 				    (u_long)m->pindex,
 				    (u_long)VM_PAGE_TO_PHYS(m));
 			else
 				db_printf("( ??? )");
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 	BUF_LOCKPRINTINFO(bp);
 #if defined(FULL_BUF_TRACKING)
 	db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
 
 	i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
 	for (j = 1; j <= BUF_TRACKING_SIZE; j++) {
 		if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL)
 			continue;
 		db_printf(" %2u: %s\n", j,
 		    bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
 	}
 #elif defined(BUF_TRACKING)
 	db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
 #endif
 	db_printf(" ");
 }
 
 DB_SHOW_COMMAND(bufqueues, bufqueues)
 {
 	struct bufdomain *bd;
 	struct buf *bp;
 	long total;
 	int i, j, cnt;
 
 	db_printf("bqempty: %d\n", bqempty.bq_len);
 
 	for (i = 0; i < buf_domains; i++) {
 		bd = &bdomain[i];
 		db_printf("Buf domain %d\n", i);
 		db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
 		db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
 		db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers);
 		db_printf("\n");
 		db_printf("\tbufspace\t%ld\n", bd->bd_bufspace);
 		db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace);
 		db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace);
 		db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
 		db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
 		db_printf("\n");
 		db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
 		db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
 		db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
 		db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
 		db_printf("\n");
 		total = 0;
 		TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist)
 			total += bp->b_bufsize;
 		db_printf("\tcleanq count\t%d (%ld)\n",
 		    bd->bd_cleanq->bq_len, total);
 		total = 0;
 		TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist)
 			total += bp->b_bufsize;
 		db_printf("\tdirtyq count\t%d (%ld)\n",
 		    bd->bd_dirtyq.bq_len, total);
 		db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
 		db_printf("\tlim\t\t%d\n", bd->bd_lim);
 		db_printf("\tCPU ");
 		for (j = 0; j <= mp_maxid; j++)
 			db_printf("%d, ", bd->bd_subq[j].bq_len);
 		db_printf("\n");
 		cnt = 0;
 		total = 0;
 		for (j = 0; j < nbuf; j++)
 			if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
 				cnt++;
 				total += buf[j].b_bufsize;
 			}
 		db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
 		cnt = 0;
 		total = 0;
 		for (j = 0; j < nbuf; j++)
 			if (buf[j].b_domain == i) {
 				cnt++;
 				total += buf[j].b_bufsize;
 			}
 		db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
 	}
 }
 
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int i;
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (BUF_ISLOCKED(bp)) {
 			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 			db_printf("\n");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *vp;
 	struct buf *bp;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	vp = (struct vnode *)addr;
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (bp->b_qindex == QUEUE_EMPTY)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: stable/12
===================================================================
--- stable/12	(revision 355609)
+++ stable/12	(revision 355610)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r355404