Index: head/sys/cam/cam_periph.c
===================================================================
--- head/sys/cam/cam_periph.c	(revision 366910)
+++ head/sys/cam/cam_periph.c	(revision 366911)
@@ -1,2173 +1,2167 @@
 /*-
  * Common functions for CAM "type" (peripheral) drivers.
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/devctl.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/devicestat.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_pass.h>
 
 static	u_int		camperiphnextunit(struct periph_driver *p_drv,
 					  u_int newunit, int wired,
 					  path_id_t pathid, target_id_t target,
 					  lun_id_t lun);
 static	u_int		camperiphunit(struct periph_driver *p_drv,
 				      path_id_t pathid, target_id_t target,
 				      lun_id_t lun); 
 static	void		camperiphdone(struct cam_periph *periph, 
 					union ccb *done_ccb);
 static  void		camperiphfree(struct cam_periph *periph);
 static int		camperiphscsistatuserror(union ccb *ccb,
 					        union ccb **orig_ccb,
 						 cam_flags camflags,
 						 u_int32_t sense_flags,
 						 int *openings,
 						 u_int32_t *relsim_flags,
 						 u_int32_t *timeout,
 						 u_int32_t  *action,
 						 const char **action_string);
 static	int		camperiphscsisenseerror(union ccb *ccb,
 					        union ccb **orig_ccb,
 					        cam_flags camflags,
 					        u_int32_t sense_flags,
 					        int *openings,
 					        u_int32_t *relsim_flags,
 					        u_int32_t *timeout,
 					        u_int32_t *action,
 					        const char **action_string);
 static void		cam_periph_devctl_notify(union ccb *ccb);
 
 static int nperiph_drivers;
 static int initialized = 0;
 struct periph_driver **periph_drivers;
 
 static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers");
 
 static int periph_selto_delay = 1000;
 TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay);
 static int periph_noresrc_delay = 500;
 TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay);
 static int periph_busy_delay = 500;
 TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay);
 
 static u_int periph_mapmem_thresh = 65536;
 SYSCTL_UINT(_kern_cam, OID_AUTO, mapmem_thresh, CTLFLAG_RWTUN,
     &periph_mapmem_thresh, 0, "Threshold for user-space buffer mapping");
 
 void
 periphdriver_register(void *data)
 {
 	struct periph_driver *drv = (struct periph_driver *)data;
 	struct periph_driver **newdrivers, **old;
 	int ndrivers;
 
 again:
 	ndrivers = nperiph_drivers + 2;
 	newdrivers = malloc(sizeof(*newdrivers) * ndrivers, M_CAMPERIPH,
 			    M_WAITOK);
 	xpt_lock_buses();
 	if (ndrivers != nperiph_drivers + 2) {
 		/*
 		 * Lost race against itself; go around.
 		 */
 		xpt_unlock_buses();
 		free(newdrivers, M_CAMPERIPH);
 		goto again;
 	}
 	if (periph_drivers)
 		bcopy(periph_drivers, newdrivers,
 		      sizeof(*newdrivers) * nperiph_drivers);
 	newdrivers[nperiph_drivers] = drv;
 	newdrivers[nperiph_drivers + 1] = NULL;
 	old = periph_drivers;
 	periph_drivers = newdrivers;
 	nperiph_drivers++;
 	xpt_unlock_buses();
 	if (old)
 		free(old, M_CAMPERIPH);
 	/* If driver marked as early or it is late now, initialize it. */
 	if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
 	    initialized > 1)
 		(*drv->init)();
 }
 
 int
 periphdriver_unregister(void *data)
 {
 	struct periph_driver *drv = (struct periph_driver *)data;
 	int error, n;
 
 	/* If driver marked as early or it is late now, deinitialize it. */
 	if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
 	    initialized > 1) {
 		if (drv->deinit == NULL) {
 			printf("CAM periph driver '%s' doesn't have deinit.\n",
 			    drv->driver_name);
 			return (EOPNOTSUPP);
 		}
 		error = drv->deinit();
 		if (error != 0)
 			return (error);
 	}
 
 	xpt_lock_buses();
 	for (n = 0; n < nperiph_drivers && periph_drivers[n] != drv; n++)
 		;
 	KASSERT(n < nperiph_drivers,
 	    ("Periph driver '%s' was not registered", drv->driver_name));
 	for (; n + 1 < nperiph_drivers; n++)
 		periph_drivers[n] = periph_drivers[n + 1];
 	periph_drivers[n + 1] = NULL;
 	nperiph_drivers--;
 	xpt_unlock_buses();
 	return (0);
 }
 
 void
 periphdriver_init(int level)
 {
 	int	i, early;
 
 	initialized = max(initialized, level);
 	for (i = 0; periph_drivers[i] != NULL; i++) {
 		early = (periph_drivers[i]->flags & CAM_PERIPH_DRV_EARLY) ? 1 : 2;
 		if (early == initialized)
 			(*periph_drivers[i]->init)();
 	}
 }
 
 cam_status
 cam_periph_alloc(periph_ctor_t *periph_ctor,
 		 periph_oninv_t *periph_oninvalidate,
 		 periph_dtor_t *periph_dtor, periph_start_t *periph_start,
 		 char *name, cam_periph_type type, struct cam_path *path,
 		 ac_callback_t *ac_callback, ac_code code, void *arg)
 {
 	struct		periph_driver **p_drv;
 	struct		cam_sim *sim;
 	struct		cam_periph *periph;
 	struct		cam_periph *cur_periph;
 	path_id_t	path_id;
 	target_id_t	target_id;
 	lun_id_t	lun_id;
 	cam_status	status;
 	u_int		init_level;
 
 	init_level = 0;
 	/*
 	 * Handle Hot-Plug scenarios.  If there is already a peripheral
 	 * of our type assigned to this path, we are likely waiting for
 	 * final close on an old, invalidated, peripheral.  If this is
 	 * the case, queue up a deferred call to the peripheral's async
 	 * handler.  If it looks like a mistaken re-allocation, complain.
 	 */
 	if ((periph = cam_periph_find(path, name)) != NULL) {
 		if ((periph->flags & CAM_PERIPH_INVALID) != 0
 		 && (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) {
 			periph->flags |= CAM_PERIPH_NEW_DEV_FOUND;
 			periph->deferred_callback = ac_callback;
 			periph->deferred_ac = code;
 			return (CAM_REQ_INPROG);
 		} else {
 			printf("cam_periph_alloc: attempt to re-allocate "
 			       "valid device %s%d rejected flags %#x "
 			       "refcount %d\n", periph->periph_name,
 			       periph->unit_number, periph->flags,
 			       periph->refcount);
 		}
 		return (CAM_REQ_INVALID);
 	}
 
 	periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH,
 					     M_NOWAIT|M_ZERO);
 
 	if (periph == NULL)
 		return (CAM_RESRC_UNAVAIL);
 
 	init_level++;
 
 	sim = xpt_path_sim(path);
 	path_id = xpt_path_path_id(path);
 	target_id = xpt_path_target_id(path);
 	lun_id = xpt_path_lun_id(path);
 	periph->periph_start = periph_start;
 	periph->periph_dtor = periph_dtor;
 	periph->periph_oninval = periph_oninvalidate;
 	periph->type = type;
 	periph->periph_name = name;
 	periph->scheduled_priority = CAM_PRIORITY_NONE;
 	periph->immediate_priority = CAM_PRIORITY_NONE;
 	periph->refcount = 1;		/* Dropped by invalidation. */
 	periph->sim = sim;
 	SLIST_INIT(&periph->ccb_list);
 	status = xpt_create_path(&path, periph, path_id, target_id, lun_id);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 	periph->path = path;
 
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("cam_periph_alloc: invalid periph name '%s'\n", name);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		free(periph, M_CAMPERIPH);
 		return (CAM_REQ_INVALID);
 	}
 	periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id);
 	cur_periph = TAILQ_FIRST(&(*p_drv)->units);
 	while (cur_periph != NULL
 	    && cur_periph->unit_number < periph->unit_number)
 		cur_periph = TAILQ_NEXT(cur_periph, unit_links);
 	if (cur_periph != NULL) {
 		KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list"));
 		TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links);
 	} else {
 		TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links);
 		(*p_drv)->generation++;
 	}
 	xpt_unlock_buses();
 
 	init_level++;
 
 	status = xpt_add_periph(periph);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 
 	init_level++;
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n"));
 
 	status = periph_ctor(periph, arg);
 
 	if (status == CAM_REQ_CMP)
 		init_level++;
 
 failure:
 	switch (init_level) {
 	case 4:
 		/* Initialized successfully */
 		break;
 	case 3:
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 		xpt_remove_periph(periph);
 		/* FALLTHROUGH */
 	case 2:
 		xpt_lock_buses();
 		TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		/* FALLTHROUGH */
 	case 1:
 		free(periph, M_CAMPERIPH);
 		/* FALLTHROUGH */
 	case 0:
 		/* No cleanup to perform. */
 		break;
 	default:
 		panic("%s: Unknown init level", __func__);
 	}
 	return(status);
 }
 
 /*
  * Find a peripheral structure with the specified path, target, lun, 
  * and (optionally) type.  If the name is NULL, this function will return
  * the first peripheral driver that matches the specified path.
  */
 struct cam_periph *
 cam_periph_find(struct cam_path *path, char *name)
 {
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0))
 			continue;
 
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) == 0) {
 				xpt_unlock_buses();
 				cam_periph_assert(periph, MA_OWNED);
 				return(periph);
 			}
 		}
 		if (name != NULL) {
 			xpt_unlock_buses();
 			return(NULL);
 		}
 	}
 	xpt_unlock_buses();
 	return(NULL);
 }
 
 /*
  * Find peripheral driver instances attached to the specified path.
  */
 int
 cam_periph_list(struct cam_path *path, struct sbuf *sb)
 {
 	struct sbuf local_sb;
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 	int count;
 	int sbuf_alloc_len;
 
 	sbuf_alloc_len = 16;
 retry:
 	sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN);
 	count = 0;
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) != 0)
 				continue;
 
 			if (sbuf_len(&local_sb) != 0)
 				sbuf_cat(&local_sb, ",");
 
 			sbuf_printf(&local_sb, "%s%d", periph->periph_name,
 				    periph->unit_number);
 
 			if (sbuf_error(&local_sb) == ENOMEM) {
 				sbuf_alloc_len *= 2;
 				xpt_unlock_buses();
 				sbuf_delete(&local_sb);
 				goto retry;
 			}
 			count++;
 		}
 	}
 	xpt_unlock_buses();
 	sbuf_finish(&local_sb);
 	if (sbuf_len(sb) != 0)
 		sbuf_cat(sb, ",");
 	sbuf_cat(sb, sbuf_data(&local_sb));
 	sbuf_delete(&local_sb);
 	return (count);
 }
 
 int
 cam_periph_acquire(struct cam_periph *periph)
 {
 	int status;
 
 	if (periph == NULL)
 		return (EINVAL);
 
 	status = ENOENT;
 	xpt_lock_buses();
 	if ((periph->flags & CAM_PERIPH_INVALID) == 0) {
 		periph->refcount++;
 		status = 0;
 	}
 	xpt_unlock_buses();
 
 	return (status);
 }
 
 void
 cam_periph_doacquire(struct cam_periph *periph)
 {
 
 	xpt_lock_buses();
 	KASSERT(periph->refcount >= 1,
 	    ("cam_periph_doacquire() with refcount == %d", periph->refcount));
 	periph->refcount++;
 	xpt_unlock_buses();
 }
 
 void
 cam_periph_release_locked_buses(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->refcount >= 1, ("periph->refcount >= 1"));
 	if (--periph->refcount == 0)
 		camperiphfree(periph);
 }
 
 void
 cam_periph_release_locked(struct cam_periph *periph)
 {
 
 	if (periph == NULL)
 		return;
 
 	xpt_lock_buses();
 	cam_periph_release_locked_buses(periph);
 	xpt_unlock_buses();
 }
 
 void
 cam_periph_release(struct cam_periph *periph)
 {
 	struct mtx *mtx;
 
 	if (periph == NULL)
 		return;
 
 	cam_periph_assert(periph, MA_NOTOWNED);
 	mtx = cam_periph_mtx(periph);
 	mtx_lock(mtx);
 	cam_periph_release_locked(periph);
 	mtx_unlock(mtx);
 }
 
 /*
  * hold/unhold act as mutual exclusion for sections of the code that
  * need to sleep and want to make sure that other sections that
  * will interfere are held off. This only protects exclusive sections
  * from each other.
  */
 int
 cam_periph_hold(struct cam_periph *periph, int priority)
 {
 	int error;
 
 	/*
 	 * Increment the reference count on the peripheral
 	 * while we wait for our lock attempt to succeed
 	 * to ensure the peripheral doesn't disappear out
 	 * from user us while we sleep.
 	 */
 
 	if (cam_periph_acquire(periph) != 0)
 		return (ENXIO);
 
 	cam_periph_assert(periph, MA_OWNED);
 	while ((periph->flags & CAM_PERIPH_LOCKED) != 0) {
 		periph->flags |= CAM_PERIPH_LOCK_WANTED;
 		if ((error = cam_periph_sleep(periph, periph, priority,
 		    "caplck", 0)) != 0) {
 			cam_periph_release_locked(periph);
 			return (error);
 		}
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			cam_periph_release_locked(periph);
 			return (ENXIO);
 		}
 	}
 
 	periph->flags |= CAM_PERIPH_LOCKED;
 	return (0);
 }
 
 void
 cam_periph_unhold(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	periph->flags &= ~CAM_PERIPH_LOCKED;
 	if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) {
 		periph->flags &= ~CAM_PERIPH_LOCK_WANTED;
 		wakeup(periph);
 	}
 
 	cam_periph_release_locked(periph);
 }
 
 /*
  * Look for the next unit number that is not currently in use for this
  * peripheral type starting at "newunit".  Also exclude unit numbers that
  * are reserved by for future "hardwiring" unless we already know that this
  * is a potential wired device.  Only assume that the device is "wired" the
  * first time through the loop since after that we'll be looking at unit
  * numbers that did not match a wiring entry.
  */
 static u_int
 camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired,
 		  path_id_t pathid, target_id_t target, lun_id_t lun)
 {
 	struct	cam_periph *periph;
 	char	*periph_name;
 	int	i, val, dunit, r;
 	const char *dname, *strval;
 
 	periph_name = p_drv->driver_name;
 	for (;;newunit++) {
 		for (periph = TAILQ_FIRST(&p_drv->units);
 		     periph != NULL && periph->unit_number != newunit;
 		     periph = TAILQ_NEXT(periph, unit_links))
 			;
 
 		if (periph != NULL && periph->unit_number == newunit) {
 			if (wired != 0) {
 				xpt_print(periph->path, "Duplicate Wired "
 				    "Device entry!\n");
 				xpt_print(periph->path, "Second device (%s "
 				    "device at scbus%d target %d lun %d) will "
 				    "not be wired\n", periph_name, pathid,
 				    target, lun);
 				wired = 0;
 			}
 			continue;
 		}
 		if (wired)
 			break;
 
 		/*
 		 * Don't match entries like "da 4" as a wired down
 		 * device, but do match entries like "da 4 target 5"
 		 * or even "da 4 scbus 1". 
 		 */
 		i = 0;
 		dname = periph_name;
 		for (;;) {
 			r = resource_find_dev(&i, dname, &dunit, NULL, NULL);
 			if (r != 0)
 				break;
 			/* if no "target" and no specific scbus, skip */
 			if (resource_int_value(dname, dunit, "target", &val) &&
 			    (resource_string_value(dname, dunit, "at",&strval)||
 			     strcmp(strval, "scbus") == 0))
 				continue;
 			if (newunit == dunit)
 				break;
 		}
 		if (r != 0)
 			break;
 	}
 	return (newunit);
 }
 
 static u_int
 camperiphunit(struct periph_driver *p_drv, path_id_t pathid,
 	      target_id_t target, lun_id_t lun)
 {
 	u_int	unit;
 	int	wired, i, val, dunit;
 	const char *dname, *strval;
 	char	pathbuf[32], *periph_name;
 
 	periph_name = p_drv->driver_name;
 	snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid);
 	unit = 0;
 	i = 0;
 	dname = periph_name;
 	for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0;
 	     wired = 0) {
 		if (resource_string_value(dname, dunit, "at", &strval) == 0) {
 			if (strcmp(strval, pathbuf) != 0)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "target", &val) == 0) {
 			if (val != target)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "lun", &val) == 0) {
 			if (val != lun)
 				continue;
 			wired++;
 		}
 		if (wired != 0) {
 			unit = dunit;
 			break;
 		}
 	}
 
 	/*
 	 * Either start from 0 looking for the next unit or from
 	 * the unit number given in the resource config.  This way,
 	 * if we have wildcard matches, we don't return the same
 	 * unit number twice.
 	 */
 	unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun);
 
 	return (unit);
 }
 
 void
 cam_periph_invalidate(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	/*
 	 * We only tear down the device the first time a peripheral is
 	 * invalidated.
 	 */
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0)
 		return;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n"));
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting) {
 		struct sbuf sb;
 		char buffer[160];
 
 		sbuf_new(&sb, buffer, 160, SBUF_FIXEDLEN);
 		xpt_denounce_periph_sbuf(periph, &sb);
 		sbuf_finish(&sb);
 		sbuf_putbuf(&sb);
 	}
 	periph->flags |= CAM_PERIPH_INVALID;
 	periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND;
 	if (periph->periph_oninval != NULL)
 		periph->periph_oninval(periph);
 	cam_periph_release_locked(periph);
 }
 
 static void
 camperiphfree(struct cam_periph *periph)
 {
 	struct periph_driver **p_drv;
 	struct periph_driver *drv;
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating",
 	    periph->periph_name, periph->unit_number));
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("camperiphfree: attempt to free non-existant periph\n");
 		return;
 	}
 	/*
 	 * Cache a pointer to the periph_driver structure.  If a
 	 * periph_driver is added or removed from the array (see
 	 * periphdriver_register()) while we drop the toplogy lock
 	 * below, p_drv may change.  This doesn't protect against this
 	 * particular periph_driver going away.  That will require full
 	 * reference counting in the periph_driver infrastructure.
 	 */
 	drv = *p_drv;
 
 	/*
 	 * We need to set this flag before dropping the topology lock, to
 	 * let anyone who is traversing the list that this peripheral is
 	 * about to be freed, and there will be no more reference count
 	 * checks.
 	 */
 	periph->flags |= CAM_PERIPH_FREE;
 
 	/*
 	 * The peripheral destructor semantics dictate calling with only the
 	 * SIM mutex held.  Since it might sleep, it should not be called
 	 * with the topology lock held.
 	 */
 	xpt_unlock_buses();
 
 	/*
 	 * We need to call the peripheral destructor prior to removing the
 	 * peripheral from the list.  Otherwise, we risk running into a
 	 * scenario where the peripheral unit number may get reused
 	 * (because it has been removed from the list), but some resources
 	 * used by the peripheral are still hanging around.  In particular,
 	 * the devfs nodes used by some peripherals like the pass(4) driver
 	 * aren't fully cleaned up until the destructor is run.  If the
 	 * unit number is reused before the devfs instance is fully gone,
 	 * devfs will panic.
 	 */
 	if (periph->periph_dtor != NULL)
 		periph->periph_dtor(periph);
 
 	/*
 	 * The peripheral list is protected by the topology lock. We have to
 	 * remove the periph from the drv list before we call deferred_ac. The
 	 * AC_FOUND_DEVICE callback won't create a new periph if it's still there.
 	 */
 	xpt_lock_buses();
 
 	TAILQ_REMOVE(&drv->units, periph, unit_links);
 	drv->generation++;
 
 	xpt_remove_periph(periph);
 
 	xpt_unlock_buses();
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
 		xpt_print(periph->path, "Periph destroyed\n");
 	else
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 
 	if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
 		union ccb ccb;
 		void *arg;
 
 		switch (periph->deferred_ac) {
 		case AC_FOUND_DEVICE:
 			ccb.ccb_h.func_code = XPT_GDEV_TYPE;
 			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 			xpt_action(&ccb);
 			arg = &ccb;
 			break;
 		case AC_PATH_REGISTERED:
 			xpt_path_inq(&ccb.cpi, periph->path);
 			arg = &ccb;
 			break;
 		default:
 			arg = NULL;
 			break;
 		}
 		periph->deferred_callback(NULL, periph->deferred_ac,
 					  periph->path, arg);
 	}
 	xpt_free_path(periph->path);
 	free(periph, M_CAMPERIPH);
 	xpt_lock_buses();
 }
 
 /*
  * Map user virtual pointers into kernel virtual address space, so we can
  * access the memory.  This is now a generic function that centralizes most
  * of the sanity checks on the data flags, if any.
  * This also only works for up to MAXPHYS memory.  Since we use
  * buffers to map stuff in and out, we're limited to the buffer size.
  */
 int
 cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo,
     u_int maxmap)
 {
 	int numbufs, i;
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
 	u_int32_t lengths[CAM_PERIPH_MAXMAPS];
 	u_int32_t dirs[CAM_PERIPH_MAXMAPS];
 	bool misaligned[CAM_PERIPH_MAXMAPS];
 
 	bzero(mapinfo, sizeof(*mapinfo));
 	if (maxmap == 0)
 		maxmap = DFLTPHYS;	/* traditional default */
 	else if (maxmap > MAXPHYS)
 		maxmap = MAXPHYS;	/* for safety */
 	switch(ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
 		if (ccb->cdm.match_buf_len == 0) {
 			printf("cam_periph_mapmem: invalid match buffer "
 			       "length 0\n");
 			return(EINVAL);
 		}
 		if (ccb->cdm.pattern_buf_len > 0) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
 			lengths[0] = ccb->cdm.pattern_buf_len;
 			dirs[0] = CAM_DIR_OUT;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[1] = ccb->cdm.match_buf_len;
 			dirs[1] = CAM_DIR_IN;
 			numbufs = 2;
 		} else {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[0] = ccb->cdm.match_buf_len;
 			dirs[0] = CAM_DIR_IN;
 			numbufs = 1;
 		}
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->csio.data_ptr;
 		lengths[0] = ccb->csio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_ATA_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->ataio.data_ptr;
 		lengths[0] = ccb->ataio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_MMC_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		/* Two mappings: one for cmd->data and one for cmd->data->data */
 		data_ptrs[0] = (unsigned char **)&ccb->mmcio.cmd.data;
 		lengths[0] = sizeof(struct mmc_data *);
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		data_ptrs[1] = (unsigned char **)&ccb->mmcio.cmd.data->data;
 		lengths[1] = ccb->mmcio.cmd.data->len;
 		dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 2;
 		break;
 	case XPT_SMP_IO:
 		data_ptrs[0] = &ccb->smpio.smp_request;
 		lengths[0] = ccb->smpio.smp_request_len;
 		dirs[0] = CAM_DIR_OUT;
 		data_ptrs[1] = &ccb->smpio.smp_response;
 		lengths[1] = ccb->smpio.smp_response_len;
 		dirs[1] = CAM_DIR_IN;
 		numbufs = 2;
 		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return (0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->nvmeio.data_ptr;
 		lengths[0] = ccb->nvmeio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_DEV_ADVINFO:
 		if (ccb->cdai.bufsiz == 0)
 			return (0);
 
 		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
 		lengths[0] = ccb->cdai.bufsiz;
 		dirs[0] = CAM_DIR_IN;
 		numbufs = 1;
 
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	default:
 		return(EINVAL);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * Check the transfer length and permissions first, so we don't
 	 * have to unmap any previously mapped buffers.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		if (lengths[i] > maxmap) {
 			printf("cam_periph_mapmem: attempt to map %lu bytes, "
 			       "which is greater than %lu\n",
 			       (long)(lengths[i]), (u_long)maxmap);
 			return (E2BIG);
 		}
 
 		/*
 		 * The userland data pointer passed in may not be page
 		 * aligned.  vmapbuf() truncates the address to a page
 		 * boundary, so if the address isn't page aligned, we'll
 		 * need enough space for the given transfer length, plus
 		 * whatever extra space is necessary to make it to the page
 		 * boundary.
 		 */
 		misaligned[i] = (lengths[i] +
 		    (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK) > MAXPHYS);
 	}
 
 	/*
 	 * This keeps the kernel stack of current thread from getting
 	 * swapped.  In low-memory situations where the kernel stack might
 	 * otherwise get swapped out, this holds it and allows the thread
 	 * to make progress and release the kernel mapped pages sooner.
 	 *
 	 * XXX KDM should I use P_NOSWAP instead?
 	 */
 	PHOLD(curproc);
 
 	for (i = 0; i < numbufs; i++) {
 		/* Save the user's data address. */
 		mapinfo->orig[i] = *data_ptrs[i];
 
 		/*
 		 * For small buffers use malloc+copyin/copyout instead of
 		 * mapping to KVA to avoid expensive TLB shootdowns.  For
 		 * small allocations malloc is backed by UMA, and so much
 		 * cheaper on SMP systems.
 		 */
 		if ((lengths[i] <= periph_mapmem_thresh || misaligned[i]) &&
 		    ccb->ccb_h.func_code != XPT_MMC_IO) {
 			*data_ptrs[i] = malloc(lengths[i], M_CAMPERIPH,
 			    M_WAITOK);
 			if (dirs[i] != CAM_DIR_IN) {
 				if (copyin(mapinfo->orig[i], *data_ptrs[i],
 				    lengths[i]) != 0) {
 					free(*data_ptrs[i], M_CAMPERIPH);
 					*data_ptrs[i] = mapinfo->orig[i];
 					goto fail;
 				}
 			} else
 				bzero(*data_ptrs[i], lengths[i]);
 			continue;
 		}
 
 		/*
 		 * Get the buffer.
 		 */
 		mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK);
 
-		/* put our pointer in the data slot */
-		mapinfo->bp[i]->b_data = *data_ptrs[i];
-
-		/* set the transfer length, we know it's < MAXPHYS */
-		mapinfo->bp[i]->b_bufsize = lengths[i];
-
 		/* set the direction */
 		mapinfo->bp[i]->b_iocmd = (dirs[i] == CAM_DIR_OUT) ?
 		    BIO_WRITE : BIO_READ;
 
 		/* Map the buffer into kernel memory. */
-		if (vmapbuf(mapinfo->bp[i], 1) < 0) {
+		if (vmapbuf(mapinfo->bp[i], *data_ptrs[i], lengths[i], 1) < 0) {
 			uma_zfree(pbuf_zone, mapinfo->bp[i]);
 			goto fail;
 		}
 
 		/* set our pointer to the new mapped area */
 		*data_ptrs[i] = mapinfo->bp[i]->b_data;
 	}
 
 	/*
 	 * Now that we've gotten this far, change ownership to the kernel
 	 * of the buffers so that we don't run afoul of returning to user
 	 * space with locks (on the buffer) held.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		if (mapinfo->bp[i])
 			BUF_KERNPROC(mapinfo->bp[i]);
 	}
 
 	mapinfo->num_bufs_used = numbufs;
 	return(0);
 
 fail:
 	for (i--; i >= 0; i--) {
 		if (mapinfo->bp[i]) {
 			vunmapbuf(mapinfo->bp[i]);
 			uma_zfree(pbuf_zone, mapinfo->bp[i]);
 		} else
 			free(*data_ptrs[i], M_CAMPERIPH);
 		*data_ptrs[i] = mapinfo->orig[i];
 	}
 	PRELE(curproc);
 	return(EACCES);
 }
 
 /*
  * Unmap memory segments mapped into kernel virtual address space by
  * cam_periph_mapmem().
  */
 void
 cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
 {
 	int numbufs, i;
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
 	u_int32_t lengths[CAM_PERIPH_MAXMAPS];
 	u_int32_t dirs[CAM_PERIPH_MAXMAPS];
 
 	if (mapinfo->num_bufs_used <= 0) {
 		/* nothing to free and the process wasn't held. */
 		return;
 	}
 
 	switch (ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
 		if (ccb->cdm.pattern_buf_len > 0) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
 			lengths[0] = ccb->cdm.pattern_buf_len;
 			dirs[0] = CAM_DIR_OUT;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[1] = ccb->cdm.match_buf_len;
 			dirs[1] = CAM_DIR_IN;
 			numbufs = 2;
 		} else {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[0] = ccb->cdm.match_buf_len;
 			dirs[0] = CAM_DIR_IN;
 			numbufs = 1;
 		}
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		data_ptrs[0] = &ccb->csio.data_ptr;
 		lengths[0] = ccb->csio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_ATA_IO:
 		data_ptrs[0] = &ccb->ataio.data_ptr;
 		lengths[0] = ccb->ataio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_MMC_IO:
 		data_ptrs[0] = (u_int8_t **)&ccb->mmcio.cmd.data;
 		lengths[0] = sizeof(struct mmc_data *);
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		data_ptrs[1] = (u_int8_t **)&ccb->mmcio.cmd.data->data;
 		lengths[1] = ccb->mmcio.cmd.data->len;
 		dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 2;
 		break;
 	case XPT_SMP_IO:
 		data_ptrs[0] = &ccb->smpio.smp_request;
 		lengths[0] = ccb->smpio.smp_request_len;
 		dirs[0] = CAM_DIR_OUT;
 		data_ptrs[1] = &ccb->smpio.smp_response;
 		lengths[1] = ccb->smpio.smp_response_len;
 		dirs[1] = CAM_DIR_IN;
 		numbufs = 2;
 		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		data_ptrs[0] = &ccb->nvmeio.data_ptr;
 		lengths[0] = ccb->nvmeio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_DEV_ADVINFO:
 		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
 		lengths[0] = ccb->cdai.bufsiz;
 		dirs[0] = CAM_DIR_IN;
 		numbufs = 1;
 		break;
 	default:
 		/* allow ourselves to be swapped once again */
 		PRELE(curproc);
 		return;
 		break; /* NOTREACHED */ 
 	}
 
 	for (i = 0; i < numbufs; i++) {
 		if (mapinfo->bp[i]) {
 			/* unmap the buffer */
 			vunmapbuf(mapinfo->bp[i]);
 
 			/* release the buffer */
 			uma_zfree(pbuf_zone, mapinfo->bp[i]);
 		} else {
 			if (dirs[i] != CAM_DIR_OUT) {
 				copyout(*data_ptrs[i], mapinfo->orig[i],
 				    lengths[i]);
 			}
 			free(*data_ptrs[i], M_CAMPERIPH);
 		}
 
 		/* Set the user's pointer back to the original value */
 		*data_ptrs[i] = mapinfo->orig[i];
 	}
 
 	/* allow ourselves to be swapped once again */
 	PRELE(curproc);
 }
 
 int
 cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr,
 		 int (*error_routine)(union ccb *ccb, 
 				      cam_flags camflags,
 				      u_int32_t sense_flags))
 {
 	union ccb 	     *ccb;
 	int 		     error;
 	int		     found;
 
 	error = found = 0;
 
 	switch(cmd){
 	case CAMGETPASSTHRU:
 		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 		xpt_setup_ccb(&ccb->ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		ccb->ccb_h.func_code = XPT_GDEVLIST;
 
 		/*
 		 * Basically, the point of this is that we go through
 		 * getting the list of devices, until we find a passthrough
 		 * device.  In the current version of the CAM code, the
 		 * only way to determine what type of device we're dealing
 		 * with is by its name.
 		 */
 		while (found == 0) {
 			ccb->cgdl.index = 0;
 			ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS;
 			while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) {
 				/* we want the next device in the list */
 				xpt_action(ccb);
 				if (strncmp(ccb->cgdl.periph_name, 
 				    "pass", 4) == 0){
 					found = 1;
 					break;
 				}
 			}
 			if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) &&
 			    (found == 0)) {
 				ccb->cgdl.periph_name[0] = '\0';
 				ccb->cgdl.unit_number = 0;
 				break;
 			}
 		}
 
 		/* copy the result back out */	
 		bcopy(ccb, addr, sizeof(union ccb));
 
 		/* and release the ccb */
 		xpt_release_ccb(ccb);
 
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return(error);
 }
 
 static void
 cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	panic("%s: already done with ccb %p", __func__, done_ccb);
 }
 
 static void
 cam_periph_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	/* Caller will release the CCB */
 	xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED);
 	done_ccb->ccb_h.cbfcnp = cam_periph_done_panic;
 	wakeup(&done_ccb->ccb_h.cbfcnp);
 }
 
 static void
 cam_periph_ccbwait(union ccb *ccb)
 {
 
 	if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) {
 		while (ccb->ccb_h.cbfcnp != cam_periph_done_panic)
 			xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp,
 			    PRIBIO, "cbwait", 0);
 	}
 	KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG,
 	    ("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, "
 	     "status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code,
 	     ccb->ccb_h.status, ccb->ccb_h.pinfo.index));
 }
 
 /*
  * Dispatch a CCB and wait for it to complete.  If the CCB has set a
  * callback function (ccb->ccb_h.cbfcnp), it will be overwritten and lost.
  */
 int
 cam_periph_runccb(union ccb *ccb,
 		  int (*error_routine)(union ccb *ccb,
 				       cam_flags camflags,
 				       u_int32_t sense_flags),
 		  cam_flags camflags, u_int32_t sense_flags,
 		  struct devstat *ds)
 {
 	struct bintime *starttime;
 	struct bintime ltime;
 	int error;
 	bool must_poll;
 	uint32_t timeout = 1;
 
 	starttime = NULL;
 	xpt_path_assert(ccb->ccb_h.path, MA_OWNED);
 	KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0,
 	    ("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb,
 	     ccb->ccb_h.func_code, ccb->ccb_h.flags));
 
 	/*
 	 * If the user has supplied a stats structure, and if we understand
 	 * this particular type of ccb, record the transaction start.
 	 */
 	if (ds != NULL &&
 	    (ccb->ccb_h.func_code == XPT_SCSI_IO ||
 	    ccb->ccb_h.func_code == XPT_ATA_IO ||
 	    ccb->ccb_h.func_code == XPT_NVME_IO)) {
 		starttime = &ltime;
 		binuptime(starttime);
 		devstat_start_transaction(ds, starttime);
 	}
 
 	/*
 	 * We must poll the I/O while we're dumping. The scheduler is normally
 	 * stopped for dumping, except when we call doadump from ddb. While the
 	 * scheduler is running in this case, we still need to poll the I/O to
 	 * avoid sleeping waiting for the ccb to complete.
 	 *
 	 * A panic triggered dump stops the scheduler, any callback from the
 	 * shutdown_post_sync event will run with the scheduler stopped, but
 	 * before we're officially dumping. To avoid hanging in adashutdown
 	 * initiated commands (or other similar situations), we have to test for
 	 * either SCHEDULER_STOPPED() here as well.
 	 *
 	 * To avoid locking problems, dumping/polling callers must call
 	 * without a periph lock held.
 	 */
 	must_poll = dumping || SCHEDULER_STOPPED();
 	ccb->ccb_h.cbfcnp = cam_periph_done;
 
 	/*
 	 * If we're polling, then we need to ensure that we have ample resources
 	 * in the periph.  cam_periph_error can reschedule the ccb by calling
 	 * xpt_action and returning ERESTART, so we have to effect the polling
 	 * in the do loop below.
 	 */
 	if (must_poll) {
 		timeout = xpt_poll_setup(ccb);
 	}
 
 	if (timeout == 0) {
 		ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 		error = EBUSY;
 	} else {
 		xpt_action(ccb);
 		do {
 			if (must_poll) {
 				xpt_pollwait(ccb, timeout);
 				timeout = ccb->ccb_h.timeout * 10;
 			} else {
 				cam_periph_ccbwait(ccb);
 			}
 			if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 				error = 0;
 			else if (error_routine != NULL) {
 				ccb->ccb_h.cbfcnp = cam_periph_done;
 				error = (*error_routine)(ccb, camflags, sense_flags);
 			} else
 				error = 0;
 		} while (error == ERESTART);
 	}
 
 	if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 		cam_release_devq(ccb->ccb_h.path,
 				 /* relsim_flags */0,
 				 /* openings */0,
 				 /* timeout */0,
 				 /* getcount_only */ FALSE);
 		ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 	}
 
 	if (ds != NULL) {
 		uint32_t bytes;
 		devstat_tag_type tag;
 		bool valid = true;
 
 		if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			bytes = ccb->csio.dxfer_len - ccb->csio.resid;
 			tag = (devstat_tag_type)(ccb->csio.tag_action & 0x3);
 		} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 			bytes = ccb->ataio.dxfer_len - ccb->ataio.resid;
 			tag = (devstat_tag_type)0;
 		} else if (ccb->ccb_h.func_code == XPT_NVME_IO) {
 			bytes = ccb->nvmeio.dxfer_len; /* NB: resid no possible */
 			tag = (devstat_tag_type)0;
 		} else {
 			valid = false;
 		}
 		if (valid)
 			devstat_end_transaction(ds, bytes, tag,
 			    ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ?
 			    DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ?
 			    DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime);
 	}
 
 	return(error);
 }
 
 void
 cam_freeze_devq(struct cam_path *path)
 {
 	struct ccb_hdr ccb_h;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n"));
 	xpt_setup_ccb(&ccb_h, path, /*priority*/1);
 	ccb_h.func_code = XPT_NOOP;
 	ccb_h.flags = CAM_DEV_QFREEZE;
 	xpt_action((union ccb *)&ccb_h);
 }
 
 u_int32_t
 cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
 		 u_int32_t openings, u_int32_t arg,
 		 int getcount_only)
 {
 	struct ccb_relsim crs;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n",
 	    relsim_flags, openings, arg, getcount_only));
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.ccb_h.flags = getcount_only ? CAM_DEV_QFREEZE : 0;
 	crs.release_flags = relsim_flags;
 	crs.openings = openings;
 	crs.release_timeout = arg;
 	xpt_action((union ccb *)&crs);
 	return (crs.qfrozen_cnt);
 }
 
 #define saved_ccb_ptr ppriv_ptr0
 static void
 camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	union ccb      *saved_ccb;
 	cam_status	status;
 	struct scsi_start_stop_unit *scsi_cmd;
 	int		error = 0, error_code, sense_key, asc, ascq;
 
 	scsi_cmd = (struct scsi_start_stop_unit *)
 	    &done_ccb->csio.cdb_io.cdb_bytes;
 	status = done_ccb->ccb_h.status;
 
 	if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (scsi_extract_sense_ccb(done_ccb,
 		    &error_code, &sense_key, &asc, &ascq)) {
 			/*
 			 * If the error is "invalid field in CDB",
 			 * and the load/eject flag is set, turn the
 			 * flag off and try again.  This is just in
 			 * case the drive in question barfs on the
 			 * load eject flag.  The CAM code should set
 			 * the load/eject flag by default for
 			 * removable media.
 			 */
 			if ((scsi_cmd->opcode == START_STOP_UNIT) &&
 			    ((scsi_cmd->how & SSS_LOEJ) != 0) &&
 			     (asc == 0x24) && (ascq == 0x00)) {
 				scsi_cmd->how &= ~SSS_LOEJ;
 				if (status & CAM_DEV_QFRZN) {
 					cam_release_devq(done_ccb->ccb_h.path,
 					    0, 0, 0, 0);
 					done_ccb->ccb_h.status &=
 					    ~CAM_DEV_QFRZN;
 				}
 				xpt_action(done_ccb);
 				goto out;
 			}
 		}
 		error = cam_periph_error(done_ccb, 0,
 		    SF_RETRY_UA | SF_NO_PRINT);
 		if (error == ERESTART)
 			goto out;
 		if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) {
 			cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 			done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		}
 	} else {
 		/*
 		 * If we have successfully taken a device from the not
 		 * ready to ready state, re-scan the device and re-get
 		 * the inquiry information.  Many devices (mostly disks)
 		 * don't properly report their inquiry information unless
 		 * they are spun up.
 		 */
 		if (scsi_cmd->opcode == START_STOP_UNIT)
 			xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL);
 	}
 
 	/* If we tried long wait and still failed, remember that. */
 	if ((periph->flags & CAM_PERIPH_RECOVERY_WAIT) &&
 	    (done_ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY)) {
 		periph->flags &= ~CAM_PERIPH_RECOVERY_WAIT;
 		if (error != 0 && done_ccb->ccb_h.retry_count == 0)
 			periph->flags |= CAM_PERIPH_RECOVERY_WAIT_FAILED;
 	}
 
 	/*
 	 * After recovery action(s) completed, return to the original CCB.
 	 * If the recovery CCB has failed, considering its own possible
 	 * retries and recovery, assume we are back in state where we have
 	 * been originally, but without recovery hopes left.  In such case,
 	 * after the final attempt below, we cancel any further retries,
 	 * blocking by that also any new recovery attempts for this CCB,
 	 * and the result will be the final one returned to the CCB owher.
 	 */
 	saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr;
 	bcopy(saved_ccb, done_ccb, sizeof(*done_ccb));
 	xpt_free_ccb(saved_ccb);
 	if (done_ccb->ccb_h.cbfcnp != camperiphdone)
 		periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG;
 	if (error != 0)
 		done_ccb->ccb_h.retry_count = 0;
 	xpt_action(done_ccb);
 
 out:
 	/* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */
 	cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 }
 
 /*
  * Generic Async Event handler.  Peripheral drivers usually
  * filter out the events that require personal attention,
  * and leave the rest to this function.
  */
 void
 cam_periph_async(struct cam_periph *periph, u_int32_t code,
 		 struct cam_path *path, void *arg)
 {
 	switch (code) {
 	case AC_LOST_DEVICE:
 		cam_periph_invalidate(periph);
 		break; 
 	default:
 		break;
 	}
 }
 
 void
 cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle)
 {
 	struct ccb_getdevstats cgds;
 
 	xpt_setup_ccb(&cgds.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	cgds.ccb_h.func_code = XPT_GDEV_STATS;
 	xpt_action((union ccb *)&cgds);
 	cam_periph_freeze_after_event(periph, &cgds.last_reset, bus_settle);
 }
 
 void
 cam_periph_freeze_after_event(struct cam_periph *periph,
 			      struct timeval* event_time, u_int duration_ms)
 {
 	struct timeval delta;
 	struct timeval duration_tv;
 
 	if (!timevalisset(event_time))
 		return;
 
 	microtime(&delta);
 	timevalsub(&delta, event_time);
 	duration_tv.tv_sec = duration_ms / 1000;
 	duration_tv.tv_usec = (duration_ms % 1000) * 1000;
 	if (timevalcmp(&delta, &duration_tv, <)) {
 		timevalsub(&duration_tv, &delta);
 
 		duration_ms = duration_tv.tv_sec * 1000;
 		duration_ms += duration_tv.tv_usec / 1000;
 		cam_freeze_devq(periph->path); 
 		cam_release_devq(periph->path,
 				RELSIM_RELEASE_AFTER_TIMEOUT,
 				/*reduction*/0,
 				/*timeout*/duration_ms,
 				/*getcount_only*/0);
 	}
 
 }
 
 static int
 camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	struct cam_periph *periph;
 	int error;
 
 	switch (ccb->csio.scsi_status) {
 	case SCSI_STATUS_OK:
 	case SCSI_STATUS_COND_MET:
 	case SCSI_STATUS_INTERMED:
 	case SCSI_STATUS_INTERMED_COND_MET:
 		error = 0;
 		break;
 	case SCSI_STATUS_CMD_TERMINATED:
 	case SCSI_STATUS_CHECK_COND:
 		error = camperiphscsisenseerror(ccb, orig_ccb,
 					        camflags,
 					        sense_flags,
 					        openings,
 					        relsim_flags,
 					        timeout,
 					        action,
 					        action_string);
 		break;
 	case SCSI_STATUS_QUEUE_FULL:
 	{
 		/* no decrement */
 		struct ccb_getdevstats cgds;
 
 		/*
 		 * First off, find out what the current
 		 * transaction counts are.
 		 */
 		xpt_setup_ccb(&cgds.ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		cgds.ccb_h.func_code = XPT_GDEV_STATS;
 		xpt_action((union ccb *)&cgds);
 
 		/*
 		 * If we were the only transaction active, treat
 		 * the QUEUE FULL as if it were a BUSY condition.
 		 */
 		if (cgds.dev_active != 0) {
 			int total_openings;
 
 			/*
 		 	 * Reduce the number of openings to
 			 * be 1 less than the amount it took
 			 * to get a queue full bounded by the
 			 * minimum allowed tag count for this
 			 * device.
 		 	 */
 			total_openings = cgds.dev_active + cgds.dev_openings;
 			*openings = cgds.dev_active;
 			if (*openings < cgds.mintags)
 				*openings = cgds.mintags;
 			if (*openings < total_openings)
 				*relsim_flags = RELSIM_ADJUST_OPENINGS;
 			else {
 				/*
 				 * Some devices report queue full for
 				 * temporary resource shortages.  For
 				 * this reason, we allow a minimum
 				 * tag count to be entered via a
 				 * quirk entry to prevent the queue
 				 * count on these devices from falling
 				 * to a pessimisticly low value.  We
 				 * still wait for the next successful
 				 * completion, however, before queueing
 				 * more transactions to the device.
 				 */
 				*relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
 			}
 			*timeout = 0;
 			error = ERESTART;
 			*action &= ~SSQ_PRINT_SENSE;
 			break;
 		}
 		/* FALLTHROUGH */
 	}
 	case SCSI_STATUS_BUSY:
 		/*
 		 * Restart the queue after either another
 		 * command completes or a 1 second timeout.
 		 */
 		periph = xpt_path_periph(ccb->ccb_h.path);
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			error = EIO;
 			*action_string = "Periph was invalidated";
 		} else if ((sense_flags & SF_RETRY_BUSY) != 0 ||
 		    ccb->ccb_h.retry_count > 0) {
 			if ((sense_flags & SF_RETRY_BUSY) == 0)
 				ccb->ccb_h.retry_count--;
 			error = ERESTART;
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
 				      | RELSIM_RELEASE_AFTER_CMDCMPLT;
 			*timeout = 1000;
 		} else {
 			error = EIO;
 			*action_string = "Retries exhausted";
 		}
 		break;
 	case SCSI_STATUS_RESERV_CONFLICT:
 	default:
 		error = EIO;
 		break;
 	}
 	return (error);
 }
 
 static int
 camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	struct cam_periph *periph;
 	union ccb *orig_ccb = ccb;
 	int error, recoveryccb;
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 	if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL)
 		biotrack(ccb->csio.bio, __func__);
 #endif
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone);
 	if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) {
 		/*
 		 * If error recovery is already in progress, don't attempt
 		 * to process this error, but requeue it unconditionally
 		 * and attempt to process it once error recovery has
 		 * completed.  This failed command is probably related to
 		 * the error that caused the currently active error recovery
 		 * action so our  current recovery efforts should also
 		 * address this command.  Be aware that the error recovery
 		 * code assumes that only one recovery action is in progress
 		 * on a particular peripheral instance at any given time
 		 * (e.g. only one saved CCB for error recovery) so it is
 		 * imperitive that we don't violate this assumption.
 		 */
 		error = ERESTART;
 		*action &= ~SSQ_PRINT_SENSE;
 	} else {
 		scsi_sense_action err_action;
 		struct ccb_getdev cgd;
 
 		/*
 		 * Grab the inquiry data for this device.
 		 */
 		xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
 		cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)&cgd);
 
 		err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
 		    sense_flags);
 		error = err_action & SS_ERRMASK;
 
 		/*
 		 * Do not autostart sequential access devices
 		 * to avoid unexpected tape loading.
 		 */
 		if ((err_action & SS_MASK) == SS_START &&
 		    SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) {
 			*action_string = "Will not autostart a "
 			    "sequential access device";
 			goto sense_error_done;
 		}
 
 		/*
 		 * Avoid recovery recursion if recovery action is the same.
 		 */
 		if ((err_action & SS_MASK) >= SS_START && recoveryccb) {
 			if (((err_action & SS_MASK) == SS_START &&
 			     ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) ||
 			    ((err_action & SS_MASK) == SS_TUR &&
 			     (ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) {
 				err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
 				*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				*timeout = 500;
 			}
 		}
 
 		/*
 		 * If the recovery action will consume a retry,
 		 * make sure we actually have retries available.
 		 */
 		if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
 		 	if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0)
 		 		ccb->ccb_h.retry_count--;
 			else {
 				*action_string = "Retries exhausted";
 				goto sense_error_done;
 			}
 		}
 
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Do common portions of commands that
 			 * use recovery CCBs.
 			 */
 			orig_ccb = xpt_alloc_ccb_nowait();
 			if (orig_ccb == NULL) {
 				*action_string = "Can't allocate recovery CCB";
 				goto sense_error_done;
 			}
 			/*
 			 * Clear freeze flag for original request here, as
 			 * this freeze will be dropped as part of ERESTART.
 			 */
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 			bcopy(ccb, orig_ccb, sizeof(*orig_ccb));
 		}
 
 		switch (err_action & SS_MASK) {
 		case SS_NOP:
 			*action_string = "No recovery action needed";
 			error = 0;
 			break;
 		case SS_RETRY:
 			*action_string = "Retrying command (per sense data)";
 			error = ERESTART;
 			break;
 		case SS_FAIL:
 			*action_string = "Unretryable error";
 			break;
 		case SS_START:
 		{
 			int le;
 
 			/*
 			 * Send a start unit command to the device, and
 			 * then retry the command.
 			 */
 			*action_string = "Attempting to start unit";
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 
 			/*
 			 * Check for removable media and set
 			 * load/eject flag appropriately.
 			 */
 			if (SID_IS_REMOVABLE(&cgd.inq_data))
 				le = TRUE;
 			else
 				le = FALSE;
 
 			scsi_start_stop(&ccb->csio,
 					/*retries*/1,
 					camperiphdone,
 					MSG_SIMPLE_Q_TAG,
 					/*start*/TRUE,
 					/*load/eject*/le,
 					/*immediate*/FALSE,
 					SSD_FULL_SIZE,
 					/*timeout*/50000);
 			break;
 		}
 		case SS_TUR:
 		{
 			/*
 			 * Send a Test Unit Ready to the device.
 			 * If the 'many' flag is set, we send 120
 			 * test unit ready commands, one every half 
 			 * second.  Otherwise, we just send one TUR.
 			 * We only want to do this if the retry 
 			 * count has not been exhausted.
 			 */
 			int retries;
 
 			if ((err_action & SSQ_MANY) != 0 && (periph->flags &
 			     CAM_PERIPH_RECOVERY_WAIT_FAILED) == 0) {
 				periph->flags |= CAM_PERIPH_RECOVERY_WAIT;
 				*action_string = "Polling device for readiness";
 				retries = 120;
 			} else {
 				*action_string = "Testing device for readiness";
 				retries = 1;
 			}
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 			scsi_test_unit_ready(&ccb->csio,
 					     retries,
 					     camperiphdone,
 					     MSG_SIMPLE_Q_TAG,
 					     SSD_FULL_SIZE,
 					     /*timeout*/5000);
 
 			/*
 			 * Accomplish our 500ms delay by deferring
 			 * the release of our device queue appropriately.
 			 */
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 			*timeout = 500;
 			break;
 		}
 		default:
 			panic("Unhandled error action %x", err_action);
 		}
 		
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Drop the priority, so that the recovery
 			 * CCB is the first to execute.  Freeze the queue
 			 * after this command is sent so that we can
 			 * restore the old csio and have it queued in
 			 * the proper order before we release normal 
 			 * transactions to the device.
 			 */
 			ccb->ccb_h.pinfo.priority--;
 			ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			ccb->ccb_h.saved_ccb_ptr = orig_ccb;
 			error = ERESTART;
 			*orig = orig_ccb;
 		}
 
 sense_error_done:
 		*action = err_action;
 	}
 	return (error);
 }
 
 /*
  * Generic error handler.  Peripheral drivers usually filter
  * out the errors that they handle in a unique manner, then
  * call this function.
  */
 int
 cam_periph_error(union ccb *ccb, cam_flags camflags,
 		 u_int32_t sense_flags)
 {
 	struct cam_path *newpath;
 	union ccb  *orig_ccb, *scan_ccb;
 	struct cam_periph *periph;
 	const char *action_string;
 	cam_status  status;
 	int	    frozen, error, openings, devctl_err;
 	u_int32_t   action, relsim_flags, timeout;
 
 	action = SSQ_PRINT_SENSE;
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	action_string = NULL;
 	status = ccb->ccb_h.status;
 	frozen = (status & CAM_DEV_QFRZN) != 0;
 	status &= CAM_STATUS_MASK;
 	devctl_err = openings = relsim_flags = timeout = 0;
 	orig_ccb = ccb;
 
 	/* Filter the errors that should be reported via devctl */
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 	case CAM_REQ_ABORTED:
 	case CAM_REQ_CMP_ERR:
 	case CAM_REQ_TERMIO:
 	case CAM_UNREC_HBA_ERROR:
 	case CAM_DATA_RUN_ERR:
 	case CAM_SCSI_STATUS_ERROR:
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_SMP_STATUS_ERROR:
 		devctl_err++;
 		break;
 	default:
 		break;
 	}
 
 	switch (status) {
 	case CAM_REQ_CMP:
 		error = 0;
 		action &= ~SSQ_PRINT_SENSE;
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		error = camperiphscsistatuserror(ccb, &orig_ccb,
 		    camflags, sense_flags, &openings, &relsim_flags,
 		    &timeout, &action, &action_string);
 		break;
 	case CAM_AUTOSENSE_FAIL:
 		error = EIO;	/* we have to kill the command */
 		break;
 	case CAM_UA_ABORT:
 	case CAM_UA_TERMIO:
 	case CAM_MSG_REJECT_REC:
 		/* XXX Don't know that these are correct */
 		error = EIO;
 		break;
 	case CAM_SEL_TIMEOUT:
 		if ((camflags & CAM_RETRY_SELTO) != 0) {
 			if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0) {
 				ccb->ccb_h.retry_count--;
 				error = ERESTART;
 
 				/*
 				 * Wait a bit to give the device
 				 * time to recover before we try again.
 				 */
 				relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				timeout = periph_selto_delay;
 				break;
 			}
 			action_string = "Retries exhausted";
 		}
 		/* FALLTHROUGH */
 	case CAM_DEV_NOT_THERE:
 		error = ENXIO;
 		action = SSQ_LOST;
 		break;
 	case CAM_REQ_INVALID:
 	case CAM_PATH_INVALID:
 	case CAM_NO_HBA:
 	case CAM_PROVIDE_FAIL:
 	case CAM_REQ_TOO_BIG:
 	case CAM_LUN_INVALID:
 	case CAM_TID_INVALID:
 	case CAM_FUNC_NOTAVAIL:
 		error = EINVAL;
 		break;
 	case CAM_SCSI_BUS_RESET:
 	case CAM_BDR_SENT:
 		/*
 		 * Commands that repeatedly timeout and cause these
 		 * kinds of error recovery actions, should return
 		 * CAM_CMD_TIMEOUT, which allows us to safely assume
 		 * that this command was an innocent bystander to
 		 * these events and should be unconditionally
 		 * retried.
 		 */
 	case CAM_REQUEUE_REQ:
 		/* Unconditional requeue if device is still there */
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			action_string = "Periph was invalidated";
 			error = EIO;
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			error = ERESTART;
 			action &= ~SSQ_PRINT_SENSE;
 		}
 		break;
 	case CAM_RESRC_UNAVAIL:
 		/* Wait a bit for the resource shortage to abate. */
 		timeout = periph_noresrc_delay;
 		/* FALLTHROUGH */
 	case CAM_BUSY:
 		if (timeout == 0) {
 			/* Wait a bit for the busy condition to abate. */
 			timeout = periph_busy_delay;
 		}
 		relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 		/* FALLTHROUGH */
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_REQ_CMP_ERR:
 	case CAM_CMD_TIMEOUT:
 	case CAM_UNEXP_BUSFREE:
 	case CAM_UNCOR_PARITY:
 	case CAM_DATA_RUN_ERR:
 	default:
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			error = EIO;
 			action_string = "Periph was invalidated";
 		} else if (ccb->ccb_h.retry_count == 0) {
 			error = EIO;
 			action_string = "Retries exhausted";
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			ccb->ccb_h.retry_count--;
 			error = ERESTART;
 		}
 		break;
 	}
 
 	if ((sense_flags & SF_PRINT_ALWAYS) ||
 	    CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO))
 		action |= SSQ_PRINT_SENSE;
 	else if (sense_flags & SF_NO_PRINT)
 		action &= ~SSQ_PRINT_SENSE;
 	if ((action & SSQ_PRINT_SENSE) != 0)
 		cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL);
 	if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) {
 		if (error != ERESTART) {
 			if (action_string == NULL)
 				action_string = "Unretryable error";
 			xpt_print(ccb->ccb_h.path, "Error %d, %s\n",
 			    error, action_string);
 		} else if (action_string != NULL)
 			xpt_print(ccb->ccb_h.path, "%s\n", action_string);
 		else {
 			xpt_print(ccb->ccb_h.path,
 			    "Retrying command, %d more tries remain\n",
 			    ccb->ccb_h.retry_count);
 		}
 	}
 
 	if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0))
 		cam_periph_devctl_notify(orig_ccb);
 
 	if ((action & SSQ_LOST) != 0) {
 		lun_id_t lun_id;
 
 		/*
 		 * For a selection timeout, we consider all of the LUNs on
 		 * the target to be gone.  If the status is CAM_DEV_NOT_THERE,
 		 * then we only get rid of the device(s) specified by the
 		 * path in the original CCB.
 		 */
 		if (status == CAM_SEL_TIMEOUT)
 			lun_id = CAM_LUN_WILDCARD;
 		else
 			lun_id = xpt_path_lun_id(ccb->ccb_h.path);
 
 		/* Should we do more if we can't create the path?? */
 		if (xpt_create_path(&newpath, periph,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    lun_id) == CAM_REQ_CMP) {
 			/*
 			 * Let peripheral drivers know that this
 			 * device has gone away.
 			 */
 			xpt_async(AC_LOST_DEVICE, newpath, NULL);
 			xpt_free_path(newpath);
 		}
 	}
 
 	/* Broadcast UNIT ATTENTIONs to all periphs. */
 	if ((action & SSQ_UA) != 0)
 		xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb);
 
 	/* Rescan target on "Reported LUNs data has changed" */
 	if ((action & SSQ_RESCAN) != 0) {
 		if (xpt_create_path(&newpath, NULL,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    CAM_LUN_WILDCARD) == CAM_REQ_CMP) {
 			scan_ccb = xpt_alloc_ccb_nowait();
 			if (scan_ccb != NULL) {
 				scan_ccb->ccb_h.path = newpath;
 				scan_ccb->ccb_h.func_code = XPT_SCAN_TGT;
 				scan_ccb->crcn.flags = 0;
 				xpt_rescan(scan_ccb);
 			} else {
 				xpt_print(newpath,
 				    "Can't allocate CCB to rescan target\n");
 				xpt_free_path(newpath);
 			}
 		}
 	}
 
 	/* Attempt a retry */
 	if (error == ERESTART || error == 0) {
 		if (frozen != 0)
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		if (error == ERESTART)
 			xpt_action(ccb);
 		if (frozen != 0)
 			cam_release_devq(ccb->ccb_h.path,
 					 relsim_flags,
 					 openings,
 					 timeout,
 					 /*getcount_only*/0);
 	}
 
 	return (error);
 }
 
 #define CAM_PERIPH_DEVD_MSG_SIZE	256
 
 static void
 cam_periph_devctl_notify(union ccb *ccb)
 {
 	struct cam_periph *periph;
 	struct ccb_getdev *cgd;
 	struct sbuf sb;
 	int serr, sk, asc, ascq;
 	char *sbmsg, *type;
 
 	sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
 	if (sbmsg == NULL)
 		return;
 
 	sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
 	    periph->unit_number);
 
 	sbuf_printf(&sb, "serial=\"");
 	if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
 		xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
 		    CAM_PRIORITY_NORMAL);
 		cgd->ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)cgd);
 
 		if (cgd->ccb_h.status == CAM_REQ_CMP)
 			sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
 		xpt_free_ccb((union ccb *)cgd);
 	}
 	sbuf_printf(&sb, "\" ");
 	sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
 
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 		sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
 		type = "timeout";
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
 		if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
 			sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
 			    serr, sk, asc, ascq);
 		type = "error";
 		break;
 	case CAM_ATA_STATUS_ERROR:
 		sbuf_printf(&sb, "RES=\"");
 		ata_res_sbuf(&ccb->ataio.res, &sb);
 		sbuf_printf(&sb, "\" ");
 		type = "error";
 		break;
 	default:
 		type = "error";
 		break;
 	}
 
 	if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 		sbuf_printf(&sb, "CDB=\"");
 		scsi_cdb_sbuf(scsiio_cdb_ptr(&ccb->csio), &sb);
 		sbuf_printf(&sb, "\" ");
 	} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 		sbuf_printf(&sb, "ACB=\"");
 		ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
 		sbuf_printf(&sb, "\" ");
 	}
 
 	if (sbuf_finish(&sb) == 0)
 		devctl_notify("CAM", "periph", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 	free(sbmsg, M_CAMPERIPH);
 }
 
 /*
  * Sysctl to force an invalidation of the drive right now. Can be
  * called with CTLFLAG_MPSAFE since we take periph lock.
  */
 int
 cam_periph_invalidate_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct cam_periph *periph;
 	int error, value;
 
 	periph = arg1;
 	value = 0;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL || value != 1)
 		return (error);
 
 	cam_periph_lock(periph);
 	cam_periph_invalidate(periph);
 	cam_periph_unlock(periph);
 
 	return (0);
 }
Index: head/sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- head/sys/dev/nvme/nvme_ctrlr.c	(revision 366910)
+++ head/sys/dev/nvme/nvme_ctrlr.c	(revision 366911)
@@ -1,1654 +1,1652 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2012-2016 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cam.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/uio.h>
 #include <sys/sbuf.h>
 #include <sys/endian.h>
 #include <machine/stdarg.h>
 #include <vm/vm.h>
 
 #include "nvme_private.h"
 
 #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */
 
 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 						struct nvme_async_event_request *aer);
 
 static void
 nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
 {
 	struct sbuf sb;
 	va_list ap;
 	int error;
 
 	if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
 		return;
 	sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
 	va_start(ap, msg);
 	sbuf_vprintf(&sb, msg, ap);
 	va_end(ap);
 	error = sbuf_finish(&sb);
 	if (error == 0)
 		printf("%s\n", sbuf_data(&sb));
 
 	sbuf_clear(&sb);
 	sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev));
 	va_start(ap, msg);
 	sbuf_vprintf(&sb, msg, ap);
 	va_end(ap);
 	sbuf_printf(&sb, "\"");
 	error = sbuf_finish(&sb);
 	if (error == 0)
 		devctl_notify("nvme", "controller", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 }
 
 static int
 nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
 {
 	struct nvme_qpair	*qpair;
 	uint32_t		num_entries;
 	int			error;
 
 	qpair = &ctrlr->adminq;
 	qpair->id = 0;
 	qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
 	qpair->domain = ctrlr->domain;
 
 	num_entries = NVME_ADMIN_ENTRIES;
 	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
 	/*
 	 * If admin_entries was overridden to an invalid value, revert it
 	 *  back to our default value.
 	 */
 	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
 	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
 		nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
 		    "specified\n", num_entries);
 		num_entries = NVME_ADMIN_ENTRIES;
 	}
 
 	/*
 	 * The admin queue's max xfer size is treated differently than the
 	 *  max I/O xfer size.  16KB is sufficient here - maybe even less?
 	 */
 	error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
 	     ctrlr);
 	return (error);
 }
 
 #define QP(ctrlr, c)	((c) * (ctrlr)->num_io_queues / mp_ncpus)
 
 static int
 nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_qpair	*qpair;
 	uint32_t		cap_lo;
 	uint16_t		mqes;
 	int			c, error, i, n;
 	int			num_entries, num_trackers, max_entries;
 
 	/*
 	 * NVMe spec sets a hard limit of 64K max entries, but devices may
 	 * specify a smaller limit, so we need to check the MQES field in the
 	 * capabilities register. We have to cap the number of entries to the
 	 * current stride allows for in BAR 0/1, otherwise the remainder entries
 	 * are inaccessable. MQES should reflect this, and this is just a
 	 * fail-safe.
 	 */
 	max_entries =
 	    (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
 	    (1 << (ctrlr->dstrd + 1));
 	num_entries = NVME_IO_ENTRIES;
 	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
 	cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 	mqes = NVME_CAP_LO_MQES(cap_lo);
 	num_entries = min(num_entries, mqes + 1);
 	num_entries = min(num_entries, max_entries);
 
 	num_trackers = NVME_IO_TRACKERS;
 	TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
 
 	num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
 	num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
 	/*
 	 * No need to have more trackers than entries in the submit queue.  Note
 	 * also that for a queue size of N, we can only have (N-1) commands
 	 * outstanding, hence the "-1" here.
 	 */
 	num_trackers = min(num_trackers, (num_entries-1));
 
 	/*
 	 * Our best estimate for the maximum number of I/Os that we should
 	 * normally have in flight at one time. This should be viewed as a hint,
 	 * not a hard limit and will need to be revisited when the upper layers
 	 * of the storage system grows multi-queue support.
 	 */
 	ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
 
 	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
 	    M_NVME, M_ZERO | M_WAITOK);
 
 	for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
 		qpair = &ctrlr->ioq[i];
 
 		/*
 		 * Admin queue has ID=0. IO queues start at ID=1 -
 		 *  hence the 'i+1' here.
 		 */
 		qpair->id = i + 1;
 		if (ctrlr->num_io_queues > 1) {
 			/* Find number of CPUs served by this queue. */
 			for (n = 1; QP(ctrlr, c + n) == i; n++)
 				;
 			/* Shuffle multiple NVMe devices between CPUs. */
 			qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
 			qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
 		} else {
 			qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
 			qpair->domain = ctrlr->domain;
 		}
 
 		/*
 		 * For I/O queues, use the controller-wide max_xfer_size
 		 *  calculated in nvme_attach().
 		 */
 		error = nvme_qpair_construct(qpair, num_entries, num_trackers,
 		    ctrlr);
 		if (error)
 			return (error);
 
 		/*
 		 * Do not bother binding interrupts if we only have one I/O
 		 *  interrupt thread for this controller.
 		 */
 		if (ctrlr->num_io_queues > 1)
 			bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
 	}
 
 	return (0);
 }
 
 static void
 nvme_ctrlr_fail(struct nvme_controller *ctrlr)
 {
 	int i;
 
 	ctrlr->is_failed = true;
 	nvme_admin_qpair_disable(&ctrlr->adminq);
 	nvme_qpair_fail(&ctrlr->adminq);
 	if (ctrlr->ioq != NULL) {
 		for (i = 0; i < ctrlr->num_io_queues; i++) {
 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
 			nvme_qpair_fail(&ctrlr->ioq[i]);
 		}
 	}
 	nvme_notify_fail_consumers(ctrlr);
 }
 
 void
 nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 static void
 nvme_ctrlr_fail_req_task(void *arg, int pending)
 {
 	struct nvme_controller	*ctrlr = arg;
 	struct nvme_request	*req;
 
 	mtx_lock(&ctrlr->lock);
 	while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
 		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
 		mtx_unlock(&ctrlr->lock);
 		nvme_qpair_manual_complete_request(req->qpair, req,
 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
 		mtx_lock(&ctrlr->lock);
 	}
 	mtx_unlock(&ctrlr->lock);
 }
 
 static int
 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
 {
 	int ms_waited;
 	uint32_t csts;
 
 	ms_waited = 0;
 	while (1) {
 		csts = nvme_mmio_read_4(ctrlr, csts);
 		if (csts == 0xffffffff)		/* Hot unplug. */
 			return (ENXIO);
 		if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)
 		    == desired_val)
 			break;
 		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
 			nvme_printf(ctrlr, "controller ready did not become %d "
 			    "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
 			return (ENXIO);
 		}
 		DELAY(1000);
 	}
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_disable(struct nvme_controller *ctrlr)
 {
 	uint32_t cc;
 	uint32_t csts;
 	uint8_t  en, rdy;
 	int err;
 
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	csts = nvme_mmio_read_4(ctrlr, csts);
 
 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
 
 	/*
 	 * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1
 	 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when
 	 * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY
 	 * isn't the desired value. Short circuit if we're already disabled.
 	 */
 	if (en == 1) {
 		if (rdy == 0) {
 			/* EN == 1, wait for  RDY == 1 or fail */
 			err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
 			if (err != 0)
 				return (err);
 		}
 	} else {
 		/* EN == 0 already wait for RDY == 0 */
 		if (rdy == 0)
 			return (0);
 		else
 			return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
 	}
 
 	cc &= ~NVME_CC_REG_EN_MASK;
 	nvme_mmio_write_4(ctrlr, cc, cc);
 	/*
 	 * Some drives have issues with accessing the mmio after we
 	 * disable, so delay for a bit after we write the bit to
 	 * cope with these issues.
 	 */
 	if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
 		pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000);
 	return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
 }
 
 static int
 nvme_ctrlr_enable(struct nvme_controller *ctrlr)
 {
 	uint32_t	cc;
 	uint32_t	csts;
 	uint32_t	aqa;
 	uint32_t	qsize;
 	uint8_t		en, rdy;
 	int		err;
 
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	csts = nvme_mmio_read_4(ctrlr, csts);
 
 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
 
 	/*
 	 * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
 	 */
 	if (en == 1) {
 		if (rdy == 1)
 			return (0);
 		else
 			return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
 	} else {
 		/* EN == 0 already wait for RDY == 0 or fail */
 		err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
 		if (err != 0)
 			return (err);
 	}
 
 	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
 	DELAY(5000);
 	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
 	DELAY(5000);
 
 	/* acqs and asqs are 0-based. */
 	qsize = ctrlr->adminq.num_entries - 1;
 
 	aqa = 0;
 	aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
 	aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
 	nvme_mmio_write_4(ctrlr, aqa, aqa);
 	DELAY(5000);
 
 	/* Initialization values for CC */
 	cc = 0;
 	cc |= 1 << NVME_CC_REG_EN_SHIFT;
 	cc |= 0 << NVME_CC_REG_CSS_SHIFT;
 	cc |= 0 << NVME_CC_REG_AMS_SHIFT;
 	cc |= 0 << NVME_CC_REG_SHN_SHIFT;
 	cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
 	cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
 
 	/* This evaluates to 0, which is according to spec. */
 	cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT;
 
 	nvme_mmio_write_4(ctrlr, cc, cc);
 
 	return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
 }
 
 static void
 nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
 {
 	int i;
 
 	nvme_admin_qpair_disable(&ctrlr->adminq);
 	/*
 	 * I/O queues are not allocated before the initial HW
 	 *  reset, so do not try to disable them.  Use is_initialized
 	 *  to determine if this is the initial HW reset.
 	 */
 	if (ctrlr->is_initialized) {
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
 	}
 }
 
 int
 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 {
 	int err;
 
 	nvme_ctrlr_disable_qpairs(ctrlr);
 
 	DELAY(100*1000);
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
 		return err;
 	return (nvme_ctrlr_enable(ctrlr));
 }
 
 void
 nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 {
 	int cmpset;
 
 	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
 
 	if (cmpset == 0 || ctrlr->is_failed)
 		/*
 		 * Controller is already resetting or has failed.  Return
 		 *  immediately since there is no need to kick off another
 		 *  reset in these cases.
 		 */
 		return;
 
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
 nvme_ctrlr_identify(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 
 	status.done = 0;
 	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
 	    nvme_completion_poll_cb, &status);
 	nvme_completion_poll(&status);
 	if (nvme_completion_is_error(&status.cpl)) {
 		nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
 		return (ENXIO);
 	}
 
 	/* Convert data to host endian */
 	nvme_controller_data_swapbytes(&ctrlr->cdata);
 
 	/*
 	 * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
 	 *  controller supports.
 	 */
 	if (ctrlr->cdata.mdts > 0)
 		ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
 		    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	int					cq_allocated, sq_allocated;
 
 	status.done = 0;
 	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
 	    nvme_completion_poll_cb, &status);
 	nvme_completion_poll(&status);
 	if (nvme_completion_is_error(&status.cpl)) {
 		nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Data in cdw0 is 0-based.
 	 * Lower 16-bits indicate number of submission queues allocated.
 	 * Upper 16-bits indicate number of completion queues allocated.
 	 */
 	sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
 	cq_allocated = (status.cpl.cdw0 >> 16) + 1;
 
 	/*
 	 * Controller may allocate more queues than we requested,
 	 *  so use the minimum of the number requested and what was
 	 *  actually allocated.
 	 */
 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
 	if (ctrlr->num_io_queues > vm_ndomains)
 		ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_qpair			*qpair;
 	int					i;
 
 	for (i = 0; i < ctrlr->num_io_queues; i++) {
 		qpair = &ctrlr->ioq[i];
 
 		status.done = 0;
 		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		nvme_completion_poll(&status);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
 			return (ENXIO);
 		}
 
 		status.done = 0;
 		nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		nvme_completion_poll(&status);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
 			return (ENXIO);
 		}
 	}
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_qpair			*qpair;
 
 	for (int i = 0; i < ctrlr->num_io_queues; i++) {
 		qpair = &ctrlr->ioq[i];
 
 		status.done = 0;
 		nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		nvme_completion_poll(&status);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
 			return (ENXIO);
 		}
 
 		status.done = 0;
 		nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		nvme_completion_poll(&status);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
 			return (ENXIO);
 		}
 	}
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
 {
 	struct nvme_namespace	*ns;
 	uint32_t 		i;
 
 	for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
 		ns = &ctrlr->ns[i];
 		nvme_ns_construct(ns, i+1, ctrlr);
 	}
 
 	return (0);
 }
 
 static bool
 is_log_page_id_valid(uint8_t page_id)
 {
 
 	switch (page_id) {
 	case NVME_LOG_ERROR:
 	case NVME_LOG_HEALTH_INFORMATION:
 	case NVME_LOG_FIRMWARE_SLOT:
 	case NVME_LOG_CHANGED_NAMESPACE:
 	case NVME_LOG_COMMAND_EFFECT:
 	case NVME_LOG_RES_NOTIFICATION:
 	case NVME_LOG_SANITIZE_STATUS:
 		return (true);
 	}
 
 	return (false);
 }
 
 static uint32_t
 nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
 {
 	uint32_t	log_page_size;
 
 	switch (page_id) {
 	case NVME_LOG_ERROR:
 		log_page_size = min(
 		    sizeof(struct nvme_error_information_entry) *
 		    (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
 		break;
 	case NVME_LOG_HEALTH_INFORMATION:
 		log_page_size = sizeof(struct nvme_health_information_page);
 		break;
 	case NVME_LOG_FIRMWARE_SLOT:
 		log_page_size = sizeof(struct nvme_firmware_page);
 		break;
 	case NVME_LOG_CHANGED_NAMESPACE:
 		log_page_size = sizeof(struct nvme_ns_list);
 		break;
 	case NVME_LOG_COMMAND_EFFECT:
 		log_page_size = sizeof(struct nvme_command_effects_page);
 		break;
 	case NVME_LOG_RES_NOTIFICATION:
 		log_page_size = sizeof(struct nvme_res_notification_page);
 		break;
 	case NVME_LOG_SANITIZE_STATUS:
 		log_page_size = sizeof(struct nvme_sanitize_status_page);
 		break;
 	default:
 		log_page_size = 0;
 		break;
 	}
 
 	return (log_page_size);
 }
 
 static void
 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
     uint8_t state)
 {
 
 	if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "available spare space below threshold");
 
 	if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "temperature above threshold");
 
 	if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "device reliability degraded");
 
 	if (state & NVME_CRIT_WARN_ST_READ_ONLY)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "media placed in read only mode");
 
 	if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "volatile memory backup device failed");
 
 	if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
 		nvme_ctrlr_devctl_log(ctrlr, "critical",
 		    "unknown critical warning(s): state = 0x%02x", state);
 }
 
 static void
 nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request		*aer = arg;
 	struct nvme_health_information_page	*health_info;
 	struct nvme_ns_list			*nsl;
 	struct nvme_error_information_entry	*err;
 	int i;
 
 	/*
 	 * If the log page fetch for some reason completed with an error,
 	 *  don't pass log page data to the consumers.  In practice, this case
 	 *  should never happen.
 	 */
 	if (nvme_completion_is_error(cpl))
 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
 		    aer->log_page_id, NULL, 0);
 	else {
 		/* Convert data to host endian */
 		switch (aer->log_page_id) {
 		case NVME_LOG_ERROR:
 			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
 			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
 				nvme_error_information_entry_swapbytes(err++);
 			break;
 		case NVME_LOG_HEALTH_INFORMATION:
 			nvme_health_information_page_swapbytes(
 			    (struct nvme_health_information_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_FIRMWARE_SLOT:
 			nvme_firmware_page_swapbytes(
 			    (struct nvme_firmware_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_CHANGED_NAMESPACE:
 			nvme_ns_list_swapbytes(
 			    (struct nvme_ns_list *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_COMMAND_EFFECT:
 			nvme_command_effects_page_swapbytes(
 			    (struct nvme_command_effects_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_RES_NOTIFICATION:
 			nvme_res_notification_page_swapbytes(
 			    (struct nvme_res_notification_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_SANITIZE_STATUS:
 			nvme_sanitize_status_page_swapbytes(
 			    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
 			break;
 		case INTEL_LOG_TEMP_STATS:
 			intel_log_temp_stats_swapbytes(
 			    (struct intel_log_temp_stats *)aer->log_page_buffer);
 			break;
 		default:
 			break;
 		}
 
 		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
 			health_info = (struct nvme_health_information_page *)
 			    aer->log_page_buffer;
 			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
 			    health_info->critical_warning);
 			/*
 			 * Critical warnings reported through the
 			 *  SMART/health log page are persistent, so
 			 *  clear the associated bits in the async event
 			 *  config so that we do not receive repeated
 			 *  notifications for the same event.
 			 */
 			aer->ctrlr->async_event_config &=
 			    ~health_info->critical_warning;
 			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
 			    aer->ctrlr->async_event_config, NULL, NULL);
 		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
 		    !nvme_use_nvd) {
 			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
 			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
 				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
 					break;
 				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
 			}
 		}
 
 		/*
 		 * Pass the cpl data from the original async event completion,
 		 *  not the log page fetch.
 		 */
 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
 		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
 	}
 
 	/*
 	 * Repost another asynchronous event request to replace the one
 	 *  that just completed.
 	 */
 	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
 }
 
 static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request	*aer = arg;
 
 	if (nvme_completion_is_error(cpl)) {
 		/*
 		 *  Do not retry failed async event requests.  This avoids
 		 *  infinite loops where a new async event request is submitted
 		 *  to replace the one just failed, only to fail again and
 		 *  perpetuate the loop.
 		 */
 		return;
 	}
 
 	/* Associated log page is in bits 23:16 of completion entry dw0. */
 	aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
 
 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
 	    " page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8,
 	    aer->log_page_id);
 
 	if (is_log_page_id_valid(aer->log_page_id)) {
 		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
 		    aer->log_page_id);
 		memcpy(&aer->cpl, cpl, sizeof(*cpl));
 		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
 		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
 		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
 		    aer);
 		/* Wait to notify consumers until after log page is fetched. */
 	} else {
 		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
 		    NULL, 0);
 
 		/*
 		 * Repost another asynchronous event request to replace the one
 		 *  that just completed.
 		 */
 		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
 	}
 }
 
 static void
 nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
     struct nvme_async_event_request *aer)
 {
 	struct nvme_request *req;
 
 	aer->ctrlr = ctrlr;
 	req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
 	aer->req = req;
 
 	/*
 	 * Disable timeout here, since asynchronous event requests should by
 	 *  nature never be timed out.
 	 */
 	req->timeout = false;
 	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
 	nvme_ctrlr_submit_admin_request(ctrlr, req);
 }
 
 static void
 nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_async_event_request		*aer;
 	uint32_t				i;
 
 	ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
 	    NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
 	    NVME_CRIT_WARN_ST_READ_ONLY |
 	    NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
 	if (ctrlr->cdata.ver >= NVME_REV(1, 2))
 		ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE |
 		    NVME_ASYNC_EVENT_FW_ACTIVATE;
 
 	status.done = 0;
 	nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
 	    0, NULL, 0, nvme_completion_poll_cb, &status);
 	nvme_completion_poll(&status);
 	if (nvme_completion_is_error(&status.cpl) ||
 	    (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
 	    (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
 		nvme_printf(ctrlr, "temperature threshold not supported\n");
 	} else
 		ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
 
 	nvme_ctrlr_cmd_set_async_event_config(ctrlr,
 	    ctrlr->async_event_config, NULL, NULL);
 
 	/* aerl is a zero-based value, so we need to add 1 here. */
 	ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
 
 	for (i = 0; i < ctrlr->num_aers; i++) {
 		aer = &ctrlr->aer[i];
 		nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
 	}
 }
 
 static void
 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
 {
 
 	ctrlr->int_coal_time = 0;
 	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
 	    &ctrlr->int_coal_time);
 
 	ctrlr->int_coal_threshold = 0;
 	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
 	    &ctrlr->int_coal_threshold);
 
 	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
 	    ctrlr->int_coal_threshold, NULL, NULL);
 }
 
 static void
 nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
 {
 	struct nvme_hmb_chunk *hmbc;
 	int i;
 
 	if (ctrlr->hmb_desc_paddr) {
 		bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
 		bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
 		    ctrlr->hmb_desc_map);
 		ctrlr->hmb_desc_paddr = 0;
 	}
 	if (ctrlr->hmb_desc_tag) {
 		bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
 		ctrlr->hmb_desc_tag = NULL;
 	}
 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
 		hmbc = &ctrlr->hmb_chunks[i];
 		bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
 		bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
 		    hmbc->hmbc_map);
 	}
 	ctrlr->hmb_nchunks = 0;
 	if (ctrlr->hmb_tag) {
 		bus_dma_tag_destroy(ctrlr->hmb_tag);
 		ctrlr->hmb_tag = NULL;
 	}
 	if (ctrlr->hmb_chunks) {
 		free(ctrlr->hmb_chunks, M_NVME);
 		ctrlr->hmb_chunks = NULL;
 	}
 }
 
 static void
 nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
 {
 	struct nvme_hmb_chunk *hmbc;
 	size_t pref, min, minc, size;
 	int err, i;
 	uint64_t max;
 
 	/* Limit HMB to 5% of RAM size per device by default. */
 	max = (uint64_t)physmem * PAGE_SIZE / 20;
 	TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
 
 	min = (long long unsigned)ctrlr->cdata.hmmin * 4096;
 	if (max == 0 || max < min)
 		return;
 	pref = MIN((long long unsigned)ctrlr->cdata.hmpre * 4096, max);
 	minc = MAX(ctrlr->cdata.hmminds * 4096, PAGE_SIZE);
 	if (min > 0 && ctrlr->cdata.hmmaxd > 0)
 		minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
 	ctrlr->hmb_chunk = pref;
 
 again:
 	ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, PAGE_SIZE);
 	ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
 	if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
 		ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
 	ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
 	    ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
 	    PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
 	    ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
 	if (err != 0) {
 		nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
 		nvme_ctrlr_hmb_free(ctrlr);
 		return;
 	}
 
 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
 		hmbc = &ctrlr->hmb_chunks[i];
 		if (bus_dmamem_alloc(ctrlr->hmb_tag,
 		    (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
 		    &hmbc->hmbc_map)) {
 			nvme_printf(ctrlr, "failed to alloc HMB\n");
 			break;
 		}
 		if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
 		    hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
 		    &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
 			bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
 			    hmbc->hmbc_map);
 			nvme_printf(ctrlr, "failed to load HMB\n");
 			break;
 		}
 		bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	}
 
 	if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
 	    ctrlr->hmb_chunk / 2 >= minc) {
 		ctrlr->hmb_nchunks = i;
 		nvme_ctrlr_hmb_free(ctrlr);
 		ctrlr->hmb_chunk /= 2;
 		goto again;
 	}
 	ctrlr->hmb_nchunks = i;
 	if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
 		nvme_ctrlr_hmb_free(ctrlr);
 		return;
 	}
 
 	size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
 	    16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
 	    size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
 	if (err != 0) {
 		nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
 		nvme_ctrlr_hmb_free(ctrlr);
 		return;
 	}
 	if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
 	    (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
 	    &ctrlr->hmb_desc_map)) {
 		nvme_printf(ctrlr, "failed to alloc HMB desc\n");
 		nvme_ctrlr_hmb_free(ctrlr);
 		return;
 	}
 	if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
 	    ctrlr->hmb_desc_vaddr, size, nvme_single_map,
 	    &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
 		bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
 		    ctrlr->hmb_desc_map);
 		nvme_printf(ctrlr, "failed to load HMB desc\n");
 		nvme_ctrlr_hmb_free(ctrlr);
 		return;
 	}
 
 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
 		ctrlr->hmb_desc_vaddr[i].addr =
 		    htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
 		ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / 4096);
 	}
 	bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
 	    BUS_DMASYNC_PREWRITE);
 
 	nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
 	    (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
 	    / 1024 / 1024);
 }
 
 static void
 nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
 {
 	struct nvme_completion_poll_status	status;
 	uint32_t cdw11;
 
 	cdw11 = 0;
 	if (enable)
 		cdw11 |= 1;
 	if (memret)
 		cdw11 |= 2;
 	status.done = 0;
 	nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
 	    ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 4096, ctrlr->hmb_desc_paddr,
 	    ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0,
 	    nvme_completion_poll_cb, &status);
 	nvme_completion_poll(&status);
 	if (nvme_completion_is_error(&status.cpl))
 		nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
 }
 
 static void
 nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 {
 	struct nvme_controller *ctrlr = ctrlr_arg;
 	uint32_t old_num_io_queues;
 	int i;
 
 	/*
 	 * Only reset adminq here when we are restarting the
 	 *  controller after a reset.  During initialization,
 	 *  we have already submitted admin commands to get
 	 *  the number of I/O queues supported, so cannot reset
 	 *  the adminq again here.
 	 */
 	if (resetting)
 		nvme_qpair_reset(&ctrlr->adminq);
 
 	if (ctrlr->ioq != NULL) {
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_qpair_reset(&ctrlr->ioq[i]);
 	}
 
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
 	/*
 	 * If it was a reset on initialization command timeout, just
 	 * return here, letting initialization code fail gracefully.
 	 */
 	if (resetting && !ctrlr->is_initialized)
 		return;
 
 	if (nvme_ctrlr_identify(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	/*
 	 * The number of qpairs are determined during controller initialization,
 	 *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
 	 *  HW limit.  We call SET_FEATURES again here so that it gets called
 	 *  after any reset for controllers that depend on the driver to
 	 *  explicit specify how many queues it will use.  This value should
 	 *  never change between resets, so panic if somehow that does happen.
 	 */
 	if (resetting) {
 		old_num_io_queues = ctrlr->num_io_queues;
 		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 			nvme_ctrlr_fail(ctrlr);
 			return;
 		}
 
 		if (old_num_io_queues != ctrlr->num_io_queues) {
 			panic("num_io_queues changed from %u to %u",
 			      old_num_io_queues, ctrlr->num_io_queues);
 		}
 	}
 
 	if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
 		nvme_ctrlr_hmb_alloc(ctrlr);
 		if (ctrlr->hmb_nchunks > 0)
 			nvme_ctrlr_hmb_enable(ctrlr, true, false);
 	} else if (ctrlr->hmb_nchunks > 0)
 		nvme_ctrlr_hmb_enable(ctrlr, true, true);
 
 	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	nvme_ctrlr_configure_aer(ctrlr);
 	nvme_ctrlr_configure_int_coalescing(ctrlr);
 
 	for (i = 0; i < ctrlr->num_io_queues; i++)
 		nvme_io_qpair_enable(&ctrlr->ioq[i]);
 }
 
 void
 nvme_ctrlr_start_config_hook(void *arg)
 {
 	struct nvme_controller *ctrlr = arg;
 
 	/*
 	 * Reset controller twice to ensure we do a transition from cc.en==1 to
 	 * cc.en==0.  This is because we don't really know what status the
 	 * controller was left in when boot handed off to OS.  Linux doesn't do
 	 * this, however. If we adopt that policy, see also nvme_ctrlr_resume().
 	 */
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 fail:
 		nvme_ctrlr_fail(ctrlr);
 		config_intrhook_disestablish(&ctrlr->config_hook);
 		return;
 	}
 
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 
 	nvme_qpair_reset(&ctrlr->adminq);
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
 	if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
 	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
 		nvme_ctrlr_start(ctrlr, false);
 	else
 		goto fail;
 
 	nvme_sysctl_initialize_ctrlr(ctrlr);
 	config_intrhook_disestablish(&ctrlr->config_hook);
 
 	ctrlr->is_initialized = 1;
 	nvme_notify_new_controller(ctrlr);
 }
 
 static void
 nvme_ctrlr_reset_task(void *arg, int pending)
 {
 	struct nvme_controller	*ctrlr = arg;
 	int			status;
 
 	nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
 	status = nvme_ctrlr_hw_reset(ctrlr);
 	/*
 	 * Use pause instead of DELAY, so that we yield to any nvme interrupt
 	 *  handlers on this CPU that were blocked on a qpair lock. We want
 	 *  all nvme interrupts completed before proceeding with restarting the
 	 *  controller.
 	 *
 	 * XXX - any way to guarantee the interrupt handlers have quiesced?
 	 */
 	pause("nvmereset", hz / 10);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr, true);
 	else
 		nvme_ctrlr_fail(ctrlr);
 
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
 
 /*
  * Poll all the queues enabled on the device for completion.
  */
 void
 nvme_ctrlr_poll(struct nvme_controller *ctrlr)
 {
 	int i;
 
 	nvme_qpair_process_completions(&ctrlr->adminq);
 
 	for (i = 0; i < ctrlr->num_io_queues; i++)
 		if (ctrlr->ioq && ctrlr->ioq[i].cpl)
 			nvme_qpair_process_completions(&ctrlr->ioq[i]);
 }
 
 /*
  * Poll the single-vector interrupt case: num_io_queues will be 1 and
  * there's only a single vector. While we're polling, we mask further
  * interrupts in the controller.
  */
 void
 nvme_ctrlr_intx_handler(void *arg)
 {
 	struct nvme_controller *ctrlr = arg;
 
 	nvme_mmio_write_4(ctrlr, intms, 1);
 	nvme_ctrlr_poll(ctrlr);
 	nvme_mmio_write_4(ctrlr, intmc, 1);
 }
 
 static void
 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_pt_command *pt = arg;
 	struct mtx *mtx = pt->driver_lock;
 	uint16_t status;
 
 	bzero(&pt->cpl, sizeof(pt->cpl));
 	pt->cpl.cdw0 = cpl->cdw0;
 
 	status = cpl->status;
 	status &= ~NVME_STATUS_P_MASK;
 	pt->cpl.status = status;
 
 	mtx_lock(mtx);
 	pt->driver_lock = NULL;
 	wakeup(pt);
 	mtx_unlock(mtx);
 }
 
 int
 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
     struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
     int is_admin_cmd)
 {
 	struct nvme_request	*req;
 	struct mtx		*mtx;
 	struct buf		*buf = NULL;
 	int			ret = 0;
 	vm_offset_t		addr, end;
 
 	if (pt->len > 0) {
 		/*
 		 * vmapbuf calls vm_fault_quick_hold_pages which only maps full
 		 * pages. Ensure this request has fewer than MAXPHYS bytes when
 		 * extended to full pages.
 		 */
 		addr = (vm_offset_t)pt->buf;
 		end = round_page(addr + pt->len);
 		addr = trunc_page(addr);
 		if (end - addr > MAXPHYS)
 			return EIO;
 
 		if (pt->len > ctrlr->max_xfer_size) {
 			nvme_printf(ctrlr, "pt->len (%d) "
 			    "exceeds max_xfer_size (%d)\n", pt->len,
 			    ctrlr->max_xfer_size);
 			return EIO;
 		}
 		if (is_user_buffer) {
 			/*
 			 * Ensure the user buffer is wired for the duration of
 			 *  this pass-through command.
 			 */
 			PHOLD(curproc);
 			buf = uma_zalloc(pbuf_zone, M_WAITOK);
-			buf->b_data = pt->buf;
-			buf->b_bufsize = pt->len;
 			buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
-			if (vmapbuf(buf, 1) < 0) {
+			if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
 				ret = EFAULT;
 				goto err;
 			}
 			req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 
 			    nvme_pt_done, pt);
 		} else
 			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
 			    nvme_pt_done, pt);
 	} else
 		req = nvme_allocate_request_null(nvme_pt_done, pt);
 
 	/* Assume user space already converted to little-endian */
 	req->cmd.opc = pt->cmd.opc;
 	req->cmd.fuse = pt->cmd.fuse;
 	req->cmd.rsvd2 = pt->cmd.rsvd2;
 	req->cmd.rsvd3 = pt->cmd.rsvd3;
 	req->cmd.cdw10 = pt->cmd.cdw10;
 	req->cmd.cdw11 = pt->cmd.cdw11;
 	req->cmd.cdw12 = pt->cmd.cdw12;
 	req->cmd.cdw13 = pt->cmd.cdw13;
 	req->cmd.cdw14 = pt->cmd.cdw14;
 	req->cmd.cdw15 = pt->cmd.cdw15;
 
 	req->cmd.nsid = htole32(nsid);
 
 	mtx = mtx_pool_find(mtxpool_sleep, pt);
 	pt->driver_lock = mtx;
 
 	if (is_admin_cmd)
 		nvme_ctrlr_submit_admin_request(ctrlr, req);
 	else
 		nvme_ctrlr_submit_io_request(ctrlr, req);
 
 	mtx_lock(mtx);
 	while (pt->driver_lock != NULL)
 		mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
 	mtx_unlock(mtx);
 
 err:
 	if (buf != NULL) {
 		uma_zfree(pbuf_zone, buf);
 		PRELE(curproc);
 	}
 
 	return (ret);
 }
 
 static int
 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
     struct thread *td)
 {
 	struct nvme_controller			*ctrlr;
 	struct nvme_pt_command			*pt;
 
 	ctrlr = cdev->si_drv1;
 
 	switch (cmd) {
 	case NVME_RESET_CONTROLLER:
 		nvme_ctrlr_reset(ctrlr);
 		break;
 	case NVME_PASSTHROUGH_CMD:
 		pt = (struct nvme_pt_command *)arg;
 		return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
 		    1 /* is_user_buffer */, 1 /* is_admin_cmd */));
 	case NVME_GET_NSID:
 	{
 		struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
 		strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
 		    sizeof(gnsid->cdev));
 		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
 		gnsid->nsid = 0;
 		break;
 	}
 	case NVME_GET_MAX_XFER_SIZE:
 		*(uint64_t *)arg = ctrlr->max_xfer_size;
 		break;
 	default:
 		return (ENOTTY);
 	}
 
 	return (0);
 }
 
 static struct cdevsw nvme_ctrlr_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_ioctl =	nvme_ctrlr_ioctl
 };
 
 int
 nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 {
 	struct make_dev_args	md_args;
 	uint32_t	cap_lo;
 	uint32_t	cap_hi;
 	uint32_t	to;
 	uint8_t		mpsmin;
 	int		status, timeout_period;
 
 	ctrlr->dev = dev;
 
 	mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
 	if (bus_get_domain(dev, &ctrlr->domain) != 0)
 		ctrlr->domain = 0;
 
 	cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
 	ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
 
 	mpsmin = NVME_CAP_HI_MPSMIN(cap_hi);
 	ctrlr->min_page_size = 1 << (12 + mpsmin);
 
 	/* Get ready timeout value from controller, in units of 500ms. */
 	cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 	to = NVME_CAP_LO_TO(cap_lo) + 1;
 	ctrlr->ready_timeout_in_ms = to * 500;
 
 	timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
 	TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
 	timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
 	timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
 	ctrlr->timeout_period = timeout_period;
 
 	nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
 	TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
 
 	ctrlr->enable_aborts = 0;
 	TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
 
 	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
 	if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
 		return (ENXIO);
 
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
 	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
 
 	ctrlr->is_resetting = 0;
 	ctrlr->is_initialized = 0;
 	ctrlr->notification_sent = 0;
 	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
 	STAILQ_INIT(&ctrlr->fail_req);
 	ctrlr->is_failed = false;
 
 	make_dev_args_init(&md_args);
 	md_args.mda_devsw = &nvme_ctrlr_cdevsw;
 	md_args.mda_uid = UID_ROOT;
 	md_args.mda_gid = GID_WHEEL;
 	md_args.mda_mode = 0600;
 	md_args.mda_unit = device_get_unit(dev);
 	md_args.mda_si_drv1 = (void *)ctrlr;
 	status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
 	    device_get_unit(dev));
 	if (status != 0)
 		return (ENXIO);
 
 	return (0);
 }
 
 void
 nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 {
 	int	gone, i;
 
 	if (ctrlr->resource == NULL)
 		goto nores;
 
 	/*
 	 * Check whether it is a hot unplug or a clean driver detach.
 	 * If device is not there any more, skip any shutdown commands.
 	 */
 	gone = (nvme_mmio_read_4(ctrlr, csts) == 0xffffffff);
 	if (gone)
 		nvme_ctrlr_fail(ctrlr);
 	else
 		nvme_notify_fail_consumers(ctrlr);
 
 	for (i = 0; i < NVME_MAX_NAMESPACES; i++)
 		nvme_ns_destruct(&ctrlr->ns[i]);
 
 	if (ctrlr->cdev)
 		destroy_dev(ctrlr->cdev);
 
 	if (ctrlr->is_initialized) {
 		if (!gone) {
 			if (ctrlr->hmb_nchunks > 0)
 				nvme_ctrlr_hmb_enable(ctrlr, false, false);
 			nvme_ctrlr_delete_qpairs(ctrlr);
 		}
 		nvme_ctrlr_hmb_free(ctrlr);
 	}
 	if (ctrlr->ioq != NULL) {
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_io_qpair_destroy(&ctrlr->ioq[i]);
 		free(ctrlr->ioq, M_NVME);
 	}
 	nvme_admin_qpair_destroy(&ctrlr->adminq);
 
 	/*
 	 *  Notify the controller of a shutdown, even though this is due to
 	 *   a driver unload, not a system shutdown (this path is not invoked
 	 *   during shutdown).  This ensures the controller receives a
 	 *   shutdown notification in case the system is shutdown before
 	 *   reloading the driver.
 	 */
 	if (!gone)
 		nvme_ctrlr_shutdown(ctrlr);
 
 	if (!gone)
 		nvme_ctrlr_disable(ctrlr);
 
 	if (ctrlr->taskqueue)
 		taskqueue_free(ctrlr->taskqueue);
 
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 
 	if (ctrlr->res)
 		bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
 		    rman_get_rid(ctrlr->res), ctrlr->res);
 
 	if (ctrlr->bar4_resource != NULL) {
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
 	}
 
 	bus_release_resource(dev, SYS_RES_MEMORY,
 	    ctrlr->resource_id, ctrlr->resource);
 
 nores:
 	mtx_destroy(&ctrlr->lock);
 }
 
 void
 nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
 {
 	uint32_t	cc;
 	uint32_t	csts;
 	int		ticks = 0, timeout;
 
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
 	cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
 	nvme_mmio_write_4(ctrlr, cc, cc);
 
 	timeout = ctrlr->cdata.rtd3e == 0 ? 5 * hz :
 	    ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000;
 	while (1) {
 		csts = nvme_mmio_read_4(ctrlr, csts);
 		if (csts == 0xffffffff)		/* Hot unplug. */
 			break;
 		if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
 			break;
 		if (ticks++ > timeout) {
 			nvme_printf(ctrlr, "did not complete shutdown within"
 			    " %d ticks of notification\n", timeout);
 			break;
 		}
 		pause("nvme shn", 1);
 	}
 }
 
 void
 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 
 	nvme_qpair_submit_request(&ctrlr->adminq, req);
 }
 
 void
 nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 	struct nvme_qpair       *qpair;
 
 	qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
 	nvme_qpair_submit_request(qpair, req);
 }
 
 device_t
 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
 {
 
 	return (ctrlr->dev);
 }
 
 const struct nvme_controller_data *
 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
 {
 
 	return (&ctrlr->cdata);
 }
 
 int
 nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
 {
 	int to = hz;
 
 	/*
 	 * Can't touch failed controllers, so it's already suspended.
 	 */
 	if (ctrlr->is_failed)
 		return (0);
 
 	/*
 	 * We don't want the reset taskqueue running, since it does similar
 	 * things, so prevent it from running after we start. Wait for any reset
 	 * that may have been started to complete. The reset process we follow
 	 * will ensure that any new I/O will queue and be given to the hardware
 	 * after we resume (though there should be none).
 	 */
 	while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
 		pause("nvmesusp", 1);
 	if (to <= 0) {
 		nvme_printf(ctrlr,
 		    "Competing reset task didn't finish. Try again later.\n");
 		return (EWOULDBLOCK);
 	}
 
 	if (ctrlr->hmb_nchunks > 0)
 		nvme_ctrlr_hmb_enable(ctrlr, false, false);
 
 	/*
 	 * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
 	 * delete the hardware I/O queues, and then shutdown. This properly
 	 * flushes any metadata the drive may have stored so it can survive
 	 * having its power removed and prevents the unsafe shutdown count from
 	 * incriminating. Once we delete the qpairs, we have to disable them
 	 * before shutting down. The delay is out of paranoia in
 	 * nvme_ctrlr_hw_reset, and is repeated here (though we should have no
 	 * pending I/O that the delay copes with).
 	 */
 	nvme_ctrlr_delete_qpairs(ctrlr);
 	nvme_ctrlr_disable_qpairs(ctrlr);
 	DELAY(100*1000);
 	nvme_ctrlr_shutdown(ctrlr);
 
 	return (0);
 }
 
 int
 nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 {
 
 	/*
 	 * Can't touch failed controllers, so nothing to do to resume.
 	 */
 	if (ctrlr->is_failed)
 		return (0);
 
 	/*
 	 * Have to reset the hardware twice, just like we do on attach. See
 	 * nmve_attach() for why.
 	 */
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 
 	/*
 	 * Now that we've reset the hardware, we can restart the controller. Any
 	 * I/O that was pending is requeued. Any admin commands are aborted with
 	 * an error. Once we've restarted, take the controller out of reset.
 	 */
 	nvme_ctrlr_start(ctrlr, true);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 
 	return (0);
 fail:
 	/*
 	 * Since we can't bring the controller out of reset, announce and fail
 	 * the controller. However, we have to return success for the resume
 	 * itself, due to questionable APIs.
 	 */
 	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 	nvme_ctrlr_fail(ctrlr);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 	return (0);
 }
Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c	(revision 366910)
+++ head/sys/kern/vfs_bio.c	(revision 366911)
@@ -1,5479 +1,5478 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/bitset.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/syscallsubr.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 struct bufqueue {
 	struct mtx_padalign	bq_lock;
 	TAILQ_HEAD(, buf)	bq_queue;
 	uint8_t			bq_index;
 	uint16_t		bq_subqueue;
 	int			bq_len;
 } __aligned(CACHE_LINE_SIZE);
 
 #define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
 #define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
 #define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
 #define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
 
 struct bufdomain {
 	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
 	struct bufqueue bd_dirtyq;
 	struct bufqueue	*bd_cleanq;
 	struct mtx_padalign bd_run_lock;
 	/* Constants */
 	long		bd_maxbufspace;
 	long		bd_hibufspace;
 	long 		bd_lobufspace;
 	long 		bd_bufspacethresh;
 	int		bd_hifreebuffers;
 	int		bd_lofreebuffers;
 	int		bd_hidirtybuffers;
 	int		bd_lodirtybuffers;
 	int		bd_dirtybufthresh;
 	int		bd_lim;
 	/* atomics */
 	int		bd_wanted;
 	int __aligned(CACHE_LINE_SIZE)	bd_numdirtybuffers;
 	int __aligned(CACHE_LINE_SIZE)	bd_running;
 	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
 	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
 } __aligned(CACHE_LINE_SIZE);
 
 #define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
 #define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
 #define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
 #define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
 #define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
 #define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
 #define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
 #define	BD_DOMAIN(bd)		(bd - bdomain)
 
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t __read_mostly unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
 
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_range(struct buf *bp);
 static void vfs_vmio_invalidate(struct buf *bp);
 static void vfs_vmio_truncate(struct buf *bp, int npages);
 static void vfs_vmio_extend(struct buf *bp, int npages, int size);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
 		void (*)(struct buf *));
 static int buf_flush(struct vnode *vp, struct bufdomain *, int);
 static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
 static void buf_daemon(void);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 static inline struct bufdomain *bufdomain(struct buf *);
 static void bq_remove(struct bufqueue *bq, struct buf *bp);
 static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
 static int buf_recycle(struct bufdomain *, bool kva);
 static void bq_init(struct bufqueue *bq, int qindex, int cpu,
 	    const char *lockname);
 static void bd_init(struct bufdomain *bd);
 static int bd_flushall(struct bufdomain *bd);
 static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
 
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
 static counter_u64_t bufkvaspace;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
     __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
     __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
     __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
 SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
     __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
     "Bufspace consumed before waking the daemon to free some");
 static counter_u64_t buffreekvacnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
 static counter_u64_t bufdefragcnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS,
     &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS,
     &recursiveflushes, 0, "Number of flushes skipped due to being recursive");
 static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
     __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
     __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
     __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
     "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
     __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
    "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
     __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
    "Threshold for clean buffer recycling");
 static counter_u64_t getnewbufcalls;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
    &getnewbufcalls, "Number of calls to getnewbuf");
 static counter_u64_t getnewbufrestarts;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
     &getnewbufrestarts,
     "Number of times getnewbuf has had to restart a buffer acquisition");
 static counter_u64_t mappingrestarts;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
     &mappingrestarts,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static counter_u64_t numbufallocfails;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
     &numbufallocfails, "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static counter_u64_t notbufdflushes;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS,
     &barrierwrites, 0, "Number of barrier writes");
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
 int maxbcachebuf = MAXBCACHEBUF;
 SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
     "Maximum size of a buffer cache block");
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign __exclusive_cache_line bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign __exclusive_cache_line rbreqlock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign __exclusive_cache_line bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /*
  * Synchronization for bwillwrite() waiters.
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
  */
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	4	/* not an queue index, but mark for sentinel */
 
 /* Maximum number of buffer domains. */
 #define	BUF_DOMAINS	8
 
 struct bufdomainset bdlodirty;		/* Domains > lodirty */
 struct bufdomainset bdhidirty;		/* Domains > hidirty */
 
 /* Configured number of clean queues. */
 static int __read_mostly buf_domains;
 
 BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
 struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
 struct bufqueue __exclusive_cache_line bqempty;
 
 /*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	mtx_lock(&rbreqlock);
 	if (arg1 == &hirunningspace) {
 		if (value < lorunningspace)
 			error = EINVAL;
 		else
 			hirunningspace = value;
 	} else {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		if (value > hirunningspace)
 			error = EINVAL;
 		else
 			lorunningspace = value;
 	}
 	mtx_unlock(&rbreqlock);
 	return (error);
 }
 
 static int
 sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int value;
 	int i;
 
 	value = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	*(int *)arg1 = value;
 	for (i = 0; i < buf_domains; i++)
 		*(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
 		    value / buf_domains;
 
 	return (error);
 }
 
 static int
 sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 	int i;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	*(long *)arg1 = value;
 	for (i = 0; i < buf_domains; i++)
 		*(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
 		    value / buf_domains;
 
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
 	int i;
 
 	lvalue = 0;
 	for (i = 0; i < buf_domains; i++)
 		lvalue += bdomain[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	ivalue = lvalue;
 	return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
 #else
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int i;
 
 	lvalue = 0;
 	for (i = 0; i < buf_domains; i++)
 		lvalue += bdomain[i].bd_bufspace;
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
 static int
 sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
 {
 	int value;
 	int i;
 
 	value = 0;
 	for (i = 0; i < buf_domains; i++)
 		value += bdomain[i].bd_numdirtybuffers;
 	return (sysctl_handle_int(oidp, &value, 0, req));
 }
 
 /*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait) {
 		bdirtywait = 0;
 		wakeup(&bdirtywait);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bd_clear:
  *
  *	Clear a domain from the appropriate bitsets when dirtybuffers
  *	is decremented.
  */
 static void
 bd_clear(struct bufdomain *bd)
 {
 
 	mtx_lock(&bdirtylock);
 	if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
 		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
 	if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
 		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bd_set:
  *
  *	Set a domain in the appropriate bitsets when dirtybuffers
  *	is incremented.
  */
 static void
 bd_set(struct bufdomain *bd)
 {
 
 	mtx_lock(&bdirtylock);
 	if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
 		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
 	if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
 		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
 bdirtysub(struct buf *bp)
 {
 	struct bufdomain *bd;
 	int num;
 
 	bd = bufdomain(bp);
 	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
 	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bdirtywakeup();
 	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
 		bd_clear(bd);
 }
 
 /*
  *	bdirtyadd:
  *
  *	Increment the numdirtybuffers count by one and wakeup the buf 
  *	daemon if needed.
  */
 static void
 bdirtyadd(struct buf *bp)
 {
 	struct bufdomain *bd;
 	int num;
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
 	bd = bufdomain(bp);
 	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
 	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bd_wakeup();
 	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
 		bd_set(bd);
 }
 
 /*
  *	bufspace_daemon_wakeup:
  *
  *	Wakeup the daemons responsible for freeing clean bufs.
  */
 static void
 bufspace_daemon_wakeup(struct bufdomain *bd)
 {
 
 	/*
 	 * avoid the lock if the daemon is running.
 	 */
 	if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
 		BD_RUN_LOCK(bd);
 		atomic_store_int(&bd->bd_running, 1);
 		wakeup(&bd->bd_running);
 		BD_RUN_UNLOCK(bd);
 	}
 }
 
 /*
  *	bufspace_daemon_wait:
  *
  *	Sleep until the domain falls below a limit or one second passes.
  */
 static void
 bufspace_daemon_wait(struct bufdomain *bd)
 {
 	/*
 	 * Re-check our limits and sleep.  bd_running must be
 	 * cleared prior to checking the limits to avoid missed
 	 * wakeups.  The waker will adjust one of bufspace or
 	 * freebuffers prior to checking bd_running.
 	 */
 	BD_RUN_LOCK(bd);
 	atomic_store_int(&bd->bd_running, 0);
 	if (bd->bd_bufspace < bd->bd_bufspacethresh &&
 	    bd->bd_freebuffers > bd->bd_lofreebuffers) {
 		msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
 		    "-", hz);
 	} else {
 		/* Avoid spurious wakeups while running. */
 		atomic_store_int(&bd->bd_running, 1);
 		BD_RUN_UNLOCK(bd);
 	}
 }
 
 /*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
 	struct bufdomain *bd;
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 	bd = bufdomain(bp);
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bd->bd_bufspace, -diff);
 	} else if (diff > 0) {
 		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bd->bd_bufspacethresh &&
 		    space + diff >= bd->bd_bufspacethresh)
 			bufspace_daemon_wakeup(bd);
 	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	bufspace_reserve:
  *
  *	Reserve bufspace before calling allocbuf().  metadata has a
  *	different space limit than data.
  */
 static int
 bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
 {
 	long limit, new;
 	long space;
 
 	if (metadata)
 		limit = bd->bd_maxbufspace;
 	else
 		limit = bd->bd_hibufspace;
 	space = atomic_fetchadd_long(&bd->bd_bufspace, size);
 	new = space + size;
 	if (new > limit) {
 		atomic_subtract_long(&bd->bd_bufspace, size);
 		return (ENOSPC);
 	}
 
 	/* Wake up the daemon on the transition. */
 	if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
 		bufspace_daemon_wakeup(bd);
 
 	return (0);
 }
 
 /*
  *	bufspace_release:
  *
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
 bufspace_release(struct bufdomain *bd, int size)
 {
 
 	atomic_subtract_long(&bd->bd_bufspace, size);
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
  *	supplied.  bd_wanted must be set prior to polling for space.  The
  *	operation must be re-tried on return.
  */
 static void
 bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
     int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
 
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	BD_LOCK(bd);
 	while (bd->bd_wanted) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			BD_UNLOCK(bd);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 			    (td->td_pflags & TDP_NORUNNINGBUF);
 
 			/*
 			 * Play bufdaemon.  The getnewbuf() function
 			 * may be called while the thread owns lock
 			 * for another dirty buffer for the same
 			 * vnode, which makes it impossible to use
 			 * VOP_FSYNC() there, due to the buffer lock
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, bd, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			BD_LOCK(bd);
 			if (fl != 0)
 				continue;
 			if (bd->bd_wanted == 0)
 				break;
 		}
 		error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
 	BD_UNLOCK(bd);
 }
 
 /*
  *	bufspace_daemon:
  *
  *	buffer space management daemon.  Tries to maintain some marginal
  *	amount of free buffer space so that requesting processes neither
  *	block nor work to reclaim buffers.
  */
 static void
 bufspace_daemon(void *arg)
 {
 	struct bufdomain *bd;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
 	    SHUTDOWN_PRI_LAST + 100);
 
 	bd = arg;
 	for (;;) {
 		kthread_suspend_check();
 
 		/*
 		 * Free buffers from the clean queue until we meet our
 		 * targets.
 		 *
 		 * Theory of operation:  The buffer cache is most efficient
 		 * when some free buffer headers and space are always
 		 * available to getnewbuf().  This daemon attempts to prevent
 		 * the excessive blocking and synchronization associated
 		 * with shortfall.  It goes through three phases according
 		 * demand:
 		 *
 		 * 1)	The daemon wakes up voluntarily once per-second
 		 *	during idle periods when the counters are below
 		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
 		 *
 		 * 2)	The daemon wakes up as we cross the thresholds
 		 *	ahead of any potential blocking.  This may bounce
 		 *	slightly according to the rate of consumption and
 		 *	release.
 		 *
 		 * 3)	The daemon and consumers are starved for working
 		 *	clean buffers.  This is the 'bufspace' sleep below
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
 		while (bd->bd_bufspace > bd->bd_lobufspace ||
 		    bd->bd_freebuffers < bd->bd_hifreebuffers) {
 			if (buf_recycle(bd, false) != 0) {
 				if (bd_flushall(bd))
 					continue;
 				/*
 				 * Speedup dirty if we've run out of clean
 				 * buffers.  This is possible in particular
 				 * because softdep may held many bufs locked
 				 * pending writes to other bufs which are
 				 * marked for delayed write, exhausting
 				 * clean space until they are written.
 				 */
 				bd_speedup();
 				BD_LOCK(bd);
 				if (bd->bd_wanted) {
 					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 					    PRIBIO|PDROP, "bufspace", hz/10);
 				} else
 					BD_UNLOCK(bd);
 			}
 			maybe_yield();
 		}
 		bufspace_daemon_wait(bd);
 	}
 }
 
 /*
  *	bufmallocadjust:
  *
  *	Adjust the reported bufspace for a malloc managed buffer, possibly
  *	waking any waiters.
  */
 static void
 bufmallocadjust(struct buf *bp, int bufsize)
 {
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0)
 		atomic_subtract_long(&bufmallocspace, -diff);
 	else
 		atomic_add_long(&bufmallocspace, diff);
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up processes that are waiting on asynchronous writes to fall
  *	below lorunningspace.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq) {
 		runningbufreq = 0;
 		wakeup(&runningbufreq);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Decrement the outstanding write count according.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long space, bspace;
 
 	bspace = bp->b_runningbufspace;
 	if (bspace == 0)
 		return;
 	space = atomic_fetchadd_long(&runningbufspace, -bspace);
 	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
 	    space, bspace));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Only acquire the lock and wakeup on the transition from exceeding
 	 * the threshold to falling below it.
 	 */
 	if (space < lorunningspace)
 		return;
 	if (space - bspace > lorunningspace)
 		return;
 	runningwakeup();
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
  *	running.  This routine is used in async-write situations to
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
  *	This does NOT turn an async write into a sync write.  It waits  
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  This function clears the B_CACHE
  *	bit if the newly extended portion of the buffer does not contain
  *	valid data.
  */
 static __inline void
 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
     vm_offset_t size, vm_page_t m)
 {
 
 	/*
 	 * This function and its results are protected by higher level
 	 * synchronization requiring vnode and buf locks to page in and
 	 * validate pages.
 	 */
 	if (bp->b_flags & B_CACHE) {
 		int base = (foff + off) & PAGE_MASK;
 		if (vm_page_is_valid(m, base, size) == 0)
 			bp->b_flags &= ~B_CACHE;
 	}
 }
 
 /* Wake up the buffer daemon if necessary */
 static void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request == 0) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
 	mtx_unlock(&bdlock);
 }
 
 /*
  * Adjust the maxbcachbuf tunable.
  */
 static void
 maxbcachebuf_adjust(void)
 {
 	int i;
 
 	/*
 	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
 	 */
 	i = 2;
 	while (i * 2 <= maxbcachebuf)
 		i *= 2;
 	maxbcachebuf = i;
 	if (maxbcachebuf < MAXBSIZE)
 		maxbcachebuf = MAXBSIZE;
 	if (maxbcachebuf > MAXPHYS)
 		maxbcachebuf = MAXPHYS;
 	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
 		printf("maxbcachebuf=%d\n", maxbcachebuf);
 }
 
 /*
  * bd_speedup - speedup the buffer cache flushing code
  */
 void
 bd_speedup(void)
 {
 	int needwake;
 
 	mtx_lock(&bdlock);
 	needwake = 0;
 	if (bd_speedupreq == 0 || bd_request == 0)
 		needwake = 1;
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (needwake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more then once.  We CANNOT write to the memory area
  * being reserved at this time.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	maxbcachebuf_adjust();
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	if (nswbuf == 0) {
 		nswbuf = min(nbuf / 4, 256);
 		if (nswbuf < NSWBUF_MIN)
 			nswbuf = NSWBUF_MIN;
 	}
 
 	/*
 	 * Reserve space for the buffer cache buffers
 	 */
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /* Initialize the buffer subsystem.  Called before use of any buffers. */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	KASSERT(maxbcachebuf >= MAXBSIZE,
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
 	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_NONE;
 		bp->b_domain = -1;
 		bp->b_subqueue = mp_maxid + 1;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		bq_insert(&bqempty, bp, false);
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by metadata.  hibufspace is the nominal maximum
 	 * used by most other requests.  The differential is required to 
 	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system. XXX This is less true with vmem.  We could use
 	 * PAGE_SIZE.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
 	lobufspace = (hibufspace / 20) * 19; /* 95% */
 	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
 	hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently into
 	 * the kernel space.  Even though this is accounted for in the buffer
 	 * allocation, we don't want the malloced region to grow uncontrolled.
 	 * The malloc scheme improves memory utilization significantly on
 	 * average (small) directories.
 	 */
 	maxbufmallocspace = hibufspace / 20;
 
 	/*
 	 * Reduce the chance of a deadlock occurring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
 	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
 	 * buffer space assuming BKVASIZE'd buffers.
 	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 	/*
 	 * lofreebuffers should be sufficient to avoid stalling waiting on
 	 * buf headers under heavy utilization.  The bufs in per-cpu caches
 	 * are counted as free but will be unavailable to threads executing
 	 * on other cpus.
 	 *
 	 * hifreebuffers is the free target for the bufspace daemon.  This
 	 * should be set appropriately to limit work per-iteration.
 	 */
 	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
 	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	/* Setup the kva and free list allocators. */
 	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
 	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
 	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
 	/*
 	 * Size the clean queue according to the amount of buffer space.
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
 	buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
 	for (i = 0 ; i < buf_domains; i++) {
 		struct bufdomain *bd;
 
 		bd = &bdomain[i];
 		bd_init(bd);
 		bd->bd_freebuffers = nbuf / buf_domains;
 		bd->bd_hifreebuffers = hifreebuffers / buf_domains;
 		bd->bd_lofreebuffers = lofreebuffers / buf_domains;
 		bd->bd_bufspace = 0;
 		bd->bd_maxbufspace = maxbufspace / buf_domains;
 		bd->bd_hibufspace = hibufspace / buf_domains;
 		bd->bd_lobufspace = lobufspace / buf_domains;
 		bd->bd_bufspacethresh = bufspacethresh / buf_domains;
 		bd->bd_numdirtybuffers = 0;
 		bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
 		bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
 		bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
 		/* Don't allow more than 2% of bufs in the per-cpu caches. */
 		bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
 	}
 	getnewbufcalls = counter_u64_alloc(M_WAITOK);
 	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
 	mappingrestarts = counter_u64_alloc(M_WAITOK);
 	numbufallocfails = counter_u64_alloc(M_WAITOK);
 	notbufdflushes = counter_u64_alloc(M_WAITOK);
 	buffreekvacnt = counter_u64_alloc(M_WAITOK);
 	bufdefragcnt = counter_u64_alloc(M_WAITOK);
 	bufkvaspace = counter_u64_alloc(M_WAITOK);
 }
 
 #ifdef INVARIANTS
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_kvabase != unmapped_buf,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(bp->b_data != unmapped_buf,
 	    ("mapped buf: b_data was not updated %p", bp));
 	KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
 	    MAXPHYS, ("b_data + b_offset unmapped %p", bp));
 }
 
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_data == unmapped_buf,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 static int
 isbufbusy(struct buf *bp)
 {
 	if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
 	    ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
 		return (1);
 	return (0);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  */
 void
 bufshutdown(int show_busybufs)
 {
 	static int first_buf_printf = 1;
 	struct buf *bp;
 	int iter, nbusy, pbusy;
 #ifndef PREEMPTION
 	int subiter;
 #endif
 
 	/*
 	 * Sync filesystems for shutdown
 	 */
 	wdog_kern_pat(WD_LASTVAL);
 	kern_sync(curthread);
 
 	/*
 	 * With soft updates, some buffers that are
 	 * written will be remarked as dirty until other
 	 * buffers are written.
 	 */
 	for (iter = pbusy = 0; iter < 20; iter++) {
 		nbusy = 0;
 		for (bp = &buf[nbuf]; --bp >= buf; )
 			if (isbufbusy(bp))
 				nbusy++;
 		if (nbusy == 0) {
 			if (first_buf_printf)
 				printf("All buffers synced.");
 			break;
 		}
 		if (first_buf_printf) {
 			printf("Syncing disks, buffers remaining... ");
 			first_buf_printf = 0;
 		}
 		printf("%d ", nbusy);
 		if (nbusy < pbusy)
 			iter = 0;
 		pbusy = nbusy;
 
 		wdog_kern_pat(WD_LASTVAL);
 		kern_sync(curthread);
 
 #ifdef PREEMPTION
 		/*
 		 * Spin for a while to allow interrupt threads to run.
 		 */
 		DELAY(50000 * iter);
 #else
 		/*
 		 * Context switch several times to allow interrupt
 		 * threads to run.
 		 */
 		for (subiter = 0; subiter < 50 * iter; subiter++) {
 			thread_lock(curthread);
 			mi_switch(SW_VOL);
 			DELAY(1000);
 		}
 #endif
 	}
 	printf("\n");
 	/*
 	 * Count only busy local buffers to prevent forcing 
 	 * a fsck if we're just a client of a wedged NFS server
 	 */
 	nbusy = 0;
 	for (bp = &buf[nbuf]; --bp >= buf; ) {
 		if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
 			if (bp->b_dev == NULL) {
 				TAILQ_REMOVE(&mountlist,
 				    bp->b_vp->v_mount, mnt_list);
 				continue;
 			}
 #endif
 			nbusy++;
 			if (show_busybufs > 0) {
 				printf(
 	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
 				    nbusy, bp, bp->b_vp, bp->b_flags,
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 				BUF_LOCKPRINTINFO(bp);
 				if (show_busybufs > 1)
 					vn_printf(bp->b_vp,
 					    "vnode content: ");
 			}
 		}
 	}
 	if (nbusy) {
 		/*
 		 * Failed to sync all blocks. Indicate this and don't
 		 * unmount filesystems (thus forcing an fsck on reboot).
 		 */
 		printf("Giving up on %d buffers\n", nbusy);
 		DELAY(5000000);	/* 5 seconds */
 	} else {
 		if (!first_buf_printf)
 			printf("Final sync complete\n");
 		/*
 		 * Unmount filesystems
 		 */
 		if (!KERNEL_PANICKED())
 			vfs_unmountall();
 	}
 	swapoff_all();
 	DELAY(100000);		/* wait for console output to finish */
 }
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 static inline struct bufdomain *
 bufdomain(struct buf *bp)
 {
 
 	return (&bdomain[bp->b_domain]);
 }
 
 static struct bufqueue *
 bufqueue(struct buf *bp)
 {
 
 	switch (bp->b_qindex) {
 	case QUEUE_NONE:
 		/* FALLTHROUGH */
 	case QUEUE_SENTINEL:
 		return (NULL);
 	case QUEUE_EMPTY:
 		return (&bqempty);
 	case QUEUE_DIRTY:
 		return (&bufdomain(bp)->bd_dirtyq);
 	case QUEUE_CLEAN:
 		return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
 	default:
 		break;
 	}
 	panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
 }
 
 /*
  * Return the locked bufqueue that bp is a member of.
  */
 static struct bufqueue *
 bufqueue_acquire(struct buf *bp)
 {
 	struct bufqueue *bq, *nbq;
 
 	/*
 	 * bp can be pushed from a per-cpu queue to the
 	 * cleanq while we're waiting on the lock.  Retry
 	 * if the queues don't match.
 	 */
 	bq = bufqueue(bp);
 	BQ_LOCK(bq);
 	for (;;) {
 		nbq = bufqueue(bp);
 		if (bq == nbq)
 			break;
 		BQ_UNLOCK(bq);
 		BQ_LOCK(nbq);
 		bq = nbq;
 	}
 	return (bq);
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the appropriate free list.  Requires a
  *	locked buffer on entry and buffer is unlocked before return.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct bufdomain *bd;
 	struct bufqueue *bq;
 
 	KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
 	    ("binsfree: Invalid qindex %d", qindex));
 	BUF_ASSERT_XLOCKED(bp);
 
 	/*
 	 * Handle delayed bremfree() processing.
 	 */
 	if (bp->b_flags & B_REMFREE) {
 		if (bp->b_qindex == qindex) {
 			bp->b_flags |= B_REUSE;
 			bp->b_flags &= ~B_REMFREE;
 			BUF_UNLOCK(bp);
 			return;
 		}
 		bq = bufqueue_acquire(bp);
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 	}
 	bd = bufdomain(bp);
 	if (qindex == QUEUE_CLEAN) {
 		if (bd->bd_lim != 0)
 			bq = &bd->bd_subq[PCPU_GET(cpuid)];
 		else
 			bq = bd->bd_cleanq;
 	} else
 		bq = &bd->bd_dirtyq;
 	bq_insert(bq, bp, true);
 }
 
 /*
  * buf_free:
  *
  *	Free a buffer to the buf zone once it no longer has valid contents.
  */
 static void
 buf_free(struct buf *bp)
 {
 
 	if (bp->b_flags & B_REMFREE)
 		bremfreef(bp);
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("losing buffer 1");
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
 	atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 }
 
 /*
  * buf_import:
  *
  *	Import bufs into the uma cache from the buf list.  The system still
  *	expects a static array of bufs and much of the synchronization
  *	around bufs assumes type stable storage.  As a result, UMA is used
  *	only as a per-cpu cache of bufs still maintained on a global list.
  */
 static int
 buf_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct buf *bp;
 	int i;
 
 	BQ_LOCK(&bqempty);
 	for (i = 0; i < cnt; i++) {
 		bp = TAILQ_FIRST(&bqempty.bq_queue);
 		if (bp == NULL)
 			break;
 		bq_remove(&bqempty, bp);
 		store[i] = bp;
 	}
 	BQ_UNLOCK(&bqempty);
 
 	return (i);
 }
 
 /*
  * buf_release:
  *
  *	Release bufs from the uma cache back to the buffer queues.
  */
 static void
 buf_release(void *arg, void **store, int cnt)
 {
 	struct bufqueue *bq;
 	struct buf *bp;
         int i;
 
 	bq = &bqempty;
 	BQ_LOCK(bq);
         for (i = 0; i < cnt; i++) {
 		bp = store[i];
 		/* Inline bq_insert() to batch locking. */
 		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 		bp->b_flags &= ~(B_AGE | B_REUSE);
 		bq->bq_len++;
 		bp->b_qindex = bq->bq_index;
 	}
 	BQ_UNLOCK(bq);
 }
 
 /*
  * buf_alloc:
  *
  *	Allocate an empty buffer header.
  */
 static struct buf *
 buf_alloc(struct bufdomain *bd)
 {
 	struct buf *bp;
 	int freebufs, error;
 
 	/*
 	 * We can only run out of bufs in the buf zone if the average buf
 	 * is less than BKVASIZE.  In this case the actual wait/block will
 	 * come from buf_reycle() failing to flush one of these small bufs.
 	 */
 	bp = NULL;
 	freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
 	if (freebufs > 0)
 		bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
 		atomic_add_int(&bd->bd_freebuffers, 1);
 		bufspace_daemon_wakeup(bd);
 		counter_u64_add(numbufallocfails, 1);
 		return (NULL);
 	}
 	/*
 	 * Wake-up the bufspace daemon on transition below threshold.
 	 */
 	if (freebufs == bd->bd_lofreebuffers)
 		bufspace_daemon_wakeup(bd);
 
 	error = BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
 	KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp,
 	    error));
 	(void)error;
 
 	KASSERT(bp->b_vp == NULL,
 	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
 	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
 	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 	KASSERT(bp->b_npages == 0,
 	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
 	bp->b_domain = BD_DOMAIN(bd);
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	bp->b_vflags = 0;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_bufobj = NULL;
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 	LIST_INIT(&bp->b_dep);
 
 	return (bp);
 }
 
 /*
  *	buf_recycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  */
 static int
 buf_recycle(struct bufdomain *bd, bool kva)
 {
 	struct bufqueue *bq;
 	struct buf *bp, *nbp;
 
 	if (kva)
 		counter_u64_add(bufdefragcnt, 1);
 	nbp = NULL;
 	bq = bd->bd_cleanq;
 	BQ_LOCK(bq);
 	KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
 	    ("buf_recycle: Locks don't match"));
 	nbp = TAILQ_FIRST(&bq->bq_queue);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * release the bqlock).
 		 */
 		nbp = TAILQ_NEXT(bp, b_freelist);
 
 		/*
 		 * If we are defragging then we need a buffer with 
 		 * some kva to reclaim.
 		 */
 		if (kva && bp->b_kvasize == 0)
 			continue;
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		/*
 		 * Implement a second chance algorithm for frequently
 		 * accessed buffers.
 		 */
 		if ((bp->b_flags & B_REUSE) != 0) {
 			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 			TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 			bp->b_flags &= ~B_REUSE;
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == QUEUE_CLEAN,
 		    ("buf_recycle: inconsistent queue %d bp %p",
 		    bp->b_qindex, bp));
 		KASSERT(bp->b_domain == BD_DOMAIN(bd),
 		    ("getnewbuf: queue domain %d doesn't match request %d",
 		    bp->b_domain, (int)BD_DOMAIN(bd)));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 
 		/*
 		 * Requeue the background write buffer with error and
 		 * restart the scan.
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
 			BQ_LOCK(bq);
 			nbp = TAILQ_FIRST(&bq->bq_queue);
 			continue;
 		}
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
 	bd->bd_wanted = 1;
 	BQ_UNLOCK(bq);
 
 	return (ENOBUFS);
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list.
  *
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
 }
 
 /*
  *	bremfreef:
  *
  *	Force an immediate removal from a free list.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	struct bufqueue *bq;
 
 	bq = bufqueue_acquire(bp);
 	bq_remove(bq, bp);
 	BQ_UNLOCK(bq);
 }
 
 static void
 bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname)
 {
 
 	mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF);
 	TAILQ_INIT(&bq->bq_queue);
 	bq->bq_len = 0;
 	bq->bq_index = qindex;
 	bq->bq_subqueue = subqueue;
 }
 
 static void
 bd_init(struct bufdomain *bd)
 {
 	int i;
 
 	bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
 	bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
 	bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
 	for (i = 0; i <= mp_maxid; i++)
 		bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
 		    "bufq clean subqueue lock");
 	mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF);
 }
 
 /*
  *	bq_remove:
  *
  *	Removes a buffer from the free list, must be called with the
  *	correct qlock held.
  */
 static void
 bq_remove(struct bufqueue *bq, struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bq_remove: buffer %p not on a queue.", bp));
 	KASSERT(bufqueue(bp) == bq,
 	    ("bq_remove: Remove buffer %p from wrong queue.", bp));
 
 	BQ_ASSERT_LOCKED(bq);
 	if (bp->b_qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 	KASSERT(bq->bq_len >= 1,
 	    ("queue %d underflow", bp->b_qindex));
 	TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 	bq->bq_len--;
 	bp->b_qindex = QUEUE_NONE;
 	bp->b_flags &= ~(B_REMFREE | B_REUSE);
 }
 
 static void
 bd_flush(struct bufdomain *bd, struct bufqueue *bq)
 {
 	struct buf *bp;
 
 	BQ_ASSERT_LOCKED(bq);
 	if (bq != bd->bd_cleanq) {
 		BD_LOCK(bd);
 		while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) {
 			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
 			TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp,
 			    b_freelist);
 			bp->b_subqueue = bd->bd_cleanq->bq_subqueue;
 		}
 		bd->bd_cleanq->bq_len += bq->bq_len;
 		bq->bq_len = 0;
 	}
 	if (bd->bd_wanted) {
 		bd->bd_wanted = 0;
 		wakeup(&bd->bd_wanted);
 	}
 	if (bq != bd->bd_cleanq)
 		BD_UNLOCK(bd);
 }
 
 static int
 bd_flushall(struct bufdomain *bd)
 {
 	struct bufqueue *bq;
 	int flushed;
 	int i;
 
 	if (bd->bd_lim == 0)
 		return (0);
 	flushed = 0;
 	for (i = 0; i <= mp_maxid; i++) {
 		bq = &bd->bd_subq[i];
 		if (bq->bq_len == 0)
 			continue;
 		BQ_LOCK(bq);
 		bd_flush(bd, bq);
 		BQ_UNLOCK(bq);
 		flushed++;
 	}
 
 	return (flushed);
 }
 
 static void
 bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock)
 {
 	struct bufdomain *bd;
 
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("bq_insert: free buffer %p onto another queue?", bp);
 
 	bd = bufdomain(bp);
 	if (bp->b_flags & B_AGE) {
 		/* Place this buf directly on the real queue. */
 		if (bq->bq_index == QUEUE_CLEAN)
 			bq = bd->bd_cleanq;
 		BQ_LOCK(bq);
 		TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist);
 	} else {
 		BQ_LOCK(bq);
 		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 	}
 	bp->b_flags &= ~(B_AGE | B_REUSE);
 	bq->bq_len++;
 	bp->b_qindex = bq->bq_index;
 	bp->b_subqueue = bq->bq_subqueue;
 
 	/*
 	 * Unlock before we notify so that we don't wakeup a waiter that
 	 * fails a trylock on the buf and sleeps again.
 	 */
 	if (unlock)
 		BUF_UNLOCK(bp);
 
 	if (bp->b_qindex == QUEUE_CLEAN) {
 		/*
 		 * Flush the per-cpu queue and notify any waiters.
 		 */
 		if (bd->bd_wanted || (bq != bd->bd_cleanq &&
 		    bq->bq_len >= bd->bd_lim))
 			bd_flush(bd, bq);
 	}
 	BQ_UNLOCK(bq);
 }
 
 /*
  *	bufkva_free:
  *
  *	Free the kva allocation for a buffer.
  *
  */
 static void
 bufkva_free(struct buf *bp)
 {
 
 #ifdef INVARIANTS
 	if (bp->b_kvasize == 0) {
 		KASSERT(bp->b_kvabase == unmapped_buf &&
 		    bp->b_data == unmapped_buf,
 		    ("Leaked KVA space on %p", bp));
 	} else if (buf_mapped(bp))
 		BUF_CHECK_MAPPED(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 #endif
 	if (bp->b_kvasize == 0)
 		return;
 
 	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
 	counter_u64_add(bufkvaspace, -bp->b_kvasize);
 	counter_u64_add(buffreekvacnt, 1);
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_kvasize = 0;
 }
 
 /*
  *	bufkva_alloc:
  *
  *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
  */
 static int
 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 	int error;
 
 	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
 	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
 
 	bufkva_free(bp);
 
 	addr = 0;
 	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
 	if (error != 0) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		return (error);
 	}
 	bp->b_kvabase = (caddr_t)addr;
 	bp->b_kvasize = maxsize;
 	counter_u64_add(bufkvaspace, bp->b_kvasize);
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 	}
 	return (0);
 }
 
 /*
  *	bufkva_reclaim:
  *
  *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
  *	callback that fires to avoid returning failure.
  */
 static void
 bufkva_reclaim(vmem_t *vmem, int flags)
 {
 	bool done;
 	int q;
 	int i;
 
 	done = false;
 	for (i = 0; i < 5; i++) {
 		for (q = 0; q < buf_domains; q++)
 			if (buf_recycle(&bdomain[q], true) != 0)
 				done = true;
 		if (done)
 			break;
 	}
 	return;
 }
 
 /*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
  */
 static void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt,
     struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *))
 {
 	struct buf *rabp;
 	struct thread *td;
 	int i;
 
 	td = curthread;
 
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 		if ((rabp->b_flags & B_CACHE) != 0) {
 			brelse(rabp);
 			continue;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, rabp, 0);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		td->td_ru.ru_inblock++;
 		rabp->b_flags |= B_ASYNC;
 		rabp->b_flags &= ~B_INVAL;
 		if ((flags & GB_CKHASH) != 0) {
 			rabp->b_flags |= B_CKHASH;
 			rabp->b_ckhashcalc = ckhashfunc;
 		}
 		rabp->b_ioflags &= ~BIO_ERROR;
 		rabp->b_iocmd = BIO_READ;
 		if (rabp->b_rcred == NOCRED && cred != NOCRED)
 			rabp->b_rcred = crhold(cred);
 		vfs_busy_pages(rabp, 0);
 		BUF_KERNPROC(rabp);
 		rabp->b_iooffset = dbtob(rabp->b_blkno);
 		bstrategy(rabp);
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  * is set, the buffer is valid and we do not have to do anything, see
  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
  *
  * Always return a NULL buffer pointer (in bpp) when returning an error.
  *
  * The blkno parameter is the logical block being requested. Normally
  * the mapping of logical block number to disk block address is done
  * by calling VOP_BMAP(). However, if the mapping is already known, the
  * disk block address can be passed using the dblkno parameter. If the
  * disk block address is not known, then the same value should be passed
  * for blkno and dblkno.
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size,
     daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags,
     void (*ckhashfunc)(struct buf *), struct buf **bpp)
 {
 	struct buf *bp;
 	struct thread *td;
 	int error, readwait, rv;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	td = curthread;
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags
 	 * are specified.
 	 */
 	error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp);
 	if (error != 0) {
 		*bpp = NULL;
 		return (error);
 	}
 	KASSERT(blkno == bp->b_lblkno,
 	    ("getblkx returned buffer for blkno %jd instead of blkno %jd",
 	    (intmax_t)bp->b_lblkno, (intmax_t)blkno));
 	flags &= ~GB_NOSPARSE;
 	*bpp = bp;
 
 	/*
 	 * If not found in cache, do some I/O
 	 */
 	readwait = 0;
 	if ((bp->b_flags & B_CACHE) == 0) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			racct_add_buf(td->td_proc, bp, 0);
 			PROC_UNLOCK(td->td_proc);
 		}
 #endif /* RACCT */
 		td->td_ru.ru_inblock++;
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		if ((flags & GB_CKHASH) != 0) {
 			bp->b_flags |= B_CKHASH;
 			bp->b_ckhashcalc = ckhashfunc;
 		}
 		if ((flags & GB_CVTENXIO) != 0)
 			bp->b_xflags |= BX_CVTENXIO;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		++readwait;
 	}
 
 	/*
 	 * Attempt to initiate asynchronous I/O on read-ahead blocks.
 	 */
 	breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc);
 
 	rv = 0;
 	if (readwait) {
 		rv = bufwait(bp);
 		if (rv != 0) {
 			brelse(bp);
 			*bpp = NULL;
 		}
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here 
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
 		bp->b_flags |= B_INVAL | B_RELBUF;
 		bp->b_flags &= ~B_CACHE;
 		brelse(bp);
 		return (ENXIO);
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		atomic_add_long(&barrierwrites, 1);
 
 	oldflags = bp->b_flags;
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(curproc);
 		racct_add_buf(curproc, bp, 1);
 		PROC_UNLOCK(curproc);
 	}
 #endif /* RACCT */
 	curthread->td_ru.ru_oublock++;
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	buf_track(bp, __func__);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			break;
 		}
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
 	buf_track(bp, __func__);
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty
 	 * pages.
 	 *
 	 * Mark the buffer pages as clean.  We need to do this here to
 	 * satisfy the vnode_pager and the pageout daemon, so that it
 	 * thinks that the pages have been "cleaned".  Note that since
 	 * the pages are in a delayed write buffer -- the VFS layer
 	 * "will" see that the pages get written out on the next sync,
 	 * or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn buffer into delayed write request.  We must clear BIO_READ and
  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  *	itself to properly update it in the dirty/clean lists.  We mark it
  *	B_DONE to ensure that any asynchronization of the buffer properly
  *	clears B_DONE ( else a panic will occur later ).  
  *
  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be on QUEUE_NONE.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	bp->b_flags &= ~(B_RELBUF);
 	bp->b_iocmd = BIO_WRITE;
 
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
 		bdirtyadd(bp);
 	}
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI for buffer.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be on QUEUE_NONE.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub(bp);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
 	 */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write.  Start output on a buffer, but do not wait for
  *	it to complete.  The buffer is released when the output completes.
  *
  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  *	B_INVAL buffers.  Not us.
  */
 void
 bawrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_ASYNC;
 	(void) bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Asynchronous barrier write.  Start output on a buffer, but do not
  *	wait for it to complete.  Place a write barrier after this write so
  *	that this buffer and all buffers written before it are committed to
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_ASYNC | B_BARRIER;
 	(void) bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	Called prior to the locking of any vnodes when we are expecting to
  *	write.  We do not want to starve the buffer cache with too many
  *	dirty buffers so we block here.  By blocking prior to the locking
  *	of any vnodes we attempt to avoid the situation where a locked vnode
  *	prevents the various system daemons from flushing related buffers.
  */
 void
 bwillwrite(void)
 {
 
 	if (buf_dirty_count_severe()) {
 		mtx_lock(&bdirtylock);
 		while (buf_dirty_count_severe()) {
 			bdirtywait = 1;
 			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 			    "flswai", 0);
 		}
 		mtx_unlock(&bdirtylock);
 	}
 }
 
 /*
  * Return true if we have too many dirty buffers.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  */
 void
 brelse(struct buf *bp)
 {
 	struct mount *v_mnt;
 	int qindex;
 
 	/*
 	 * Many functions erroneously call brelse with a NULL bp under rare
 	 * error conditions. Simply return when called with a NULL bp.
 	 */
 	if (bp == NULL)
 		return;
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
 	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		bdirty(bp);
 	}
 
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    (bp->b_flags & B_INVALONERR)) {
 		/*
 		 * Forced invalidation of dirty buffer contents, to be used
 		 * after a failed write in the rare case that the loss of the
 		 * contents is acceptable.  The buffer is invalidated and
 		 * freed.
 		 */
 		bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
 		bp->b_flags &= ~(B_ASYNC | B_CACHE);
 	}
 
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
 	    !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  All errors except ENXIO (which
 		 * means the device is gone) are treated as being
 		 * transient.
 		 *
 		 * XXX Treating EIO as transient is not correct; the
 		 * contract with the local storage device drivers is that
 		 * they will only return EIO once the I/O is no longer
 		 * retriable.  Network I/O also respects this through the
 		 * guarantees of TCP and/or the internal retries of NFS.
 		 * ENOMEM might be transient, but we also have no way of
 		 * knowing when its ok to retry/reschedule.  In general,
 		 * this entire case should be made obsolete through better
 		 * error handling/recovery and resource scheduling.
 		 *
 		 * Do this also for buffers that failed with ENXIO, but have
 		 * non-empty dependencies - the soft updates code might need
 		 * to access the buffer to untangle them.
 		 *
 		 * Must clear BIO_ERROR to prevent pages from being scrapped.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed read I/O, or we were asked to free or not
 		 * cache the buffer, or we failed to write to a device that's
 		 * no longer present.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub(bp);
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate() 
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 * 
 	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
 	 * if B_DELWRI is set.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 
 	v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL;
 
 	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
 	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
 	    (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 ||
 	    vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) {
 		vfs_vmio_invalidate(bp);
 		allocbuf(bp, 0);
 	}
 
 	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
 		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	buf_track(bp, __func__);
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		buf_free(bp);
 		return;
 	}
 	/* buffers with junk contents */
 	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT);
 	bp->b_xflags &= ~(BX_CVTENXIO);
 	/* binsfree unlocks bp. */
 	binsfree(bp, qindex);
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	qindex = QUEUE_NONE;
 	if (BUF_LOCKRECURSED(bp)) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 	bp->b_xflags &= ~(BX_CVTENXIO);
 
 	if (bp->b_flags & B_MANAGED) {
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		goto out;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
 	    BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		qindex = QUEUE_DIRTY;
 	} else {
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
 		if ((bp->b_flags & B_NOREUSE) != 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	buf_track(bp, __func__);
 	/* binsfree unlocks bp. */
 	binsfree(bp, qindex);
 	return;
 
 out:
 	buf_track(bp, __func__);
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
  * restore bogus pages.
  */
 static void
 vfs_vmio_iodone(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	vm_object_t obj;
 	struct vnode *vp __unused;
 	int i, iosize, resid;
 	bool bogus;
 
 	obj = bp->b_bufobj->bo_object;
 	KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages,
 	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
 	    blockcount_read(&obj->paging_in_progress), bp->b_npages));
 
 	vp = bp->b_vp;
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vp->v_object != NULL, vp);
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
 
 	bogus = false;
 	iosize = bp->b_bcount - bp->b_resid;
 	for (i = 0; i < bp->b_npages; i++) {
 		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 		if (resid > iosize)
 			resid = iosize;
 
 		/*
 		 * cleanup bogus pages, restoring the originals
 		 */
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			bogus = true;
 			m = vm_page_relookup(obj, OFF_TO_IDX(foff));
 			if (m == NULL)
 				panic("biodone: page disappeared!");
 			bp->b_pages[i] = m;
 		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we 
 			 * only need to do this here in the read case.
 			 */
 			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
 			    resid)) == 0, ("vfs_vmio_iodone: page %p "
 			    "has unexpected dirty bits", m));
 			vfs_page_set_valid(bp, foff, m);
 		}
 		KASSERT(OFF_TO_IDX(foff) == m->pindex,
 		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
 		    (intmax_t)foff, (uintmax_t)m->pindex));
 
 		vm_page_sunbusy(m);
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		iosize -= resid;
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  * Perform page invalidation when a buffer is released.  The fully invalid
  * pages will be reclaimed later in vfs_vmio_truncate().
  */
 static void
 vfs_vmio_invalidate(struct buf *bp)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int flags, i, resid, poffset, presid;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/*
 	 * Get the base offset and length of the buffer.  Note that 
 	 * in the VMIO case if the buffer block size is not
 	 * page-aligned then b_data pointer may not be page-aligned.
 	 * But our b_pages[] array *IS* page aligned.
 	 *
 	 * block sizes less then DEV_BSIZE (usually 512) are not 
 	 * supported due to the page granularity bits (m->valid,
 	 * m->dirty, etc...). 
 	 *
 	 * See man buf(9) for more information
 	 */
 	flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
 	obj = bp->b_bufobj->bo_object;
 	resid = bp->b_bufsize;
 	poffset = bp->b_offset & PAGE_MASK;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page)
 			panic("vfs_vmio_invalidate: Unexpected bogus page.");
 		bp->b_pages[i] = NULL;
 
 		presid = resid > (PAGE_SIZE - poffset) ?
 		    (PAGE_SIZE - poffset) : resid;
 		KASSERT(presid >= 0, ("brelse: extra page"));
 		vm_page_busy_acquire(m, VM_ALLOC_SBUSY);
 		if (pmap_page_wired_mappings(m) == 0)
 			vm_page_set_invalid(m, poffset, presid);
 		vm_page_sunbusy(m);
 		vm_page_release_locked(m, flags);
 		resid -= presid;
 		poffset = 0;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = 0;
 }
 
 /*
  * Page-granular truncation of an existing VMIO buffer.
  */
 static void
 vfs_vmio_truncate(struct buf *bp, int desiredpages)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int flags, i;
 
 	if (bp->b_npages == desiredpages)
 		return;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
 		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 
 	/*
 	 * The object lock is needed only if we will attempt to free pages.
 	 */
 	flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
 	if ((bp->b_flags & B_DIRECT) != 0) {
 		flags |= VPR_TRYFREE;
 		obj = bp->b_bufobj->bo_object;
 		VM_OBJECT_WLOCK(obj);
 	} else {
 		obj = NULL;
 	}
 	for (i = desiredpages; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
 		bp->b_pages[i] = NULL;
 		if (obj != NULL)
 			vm_page_release_locked(m, flags);
 		else
 			vm_page_release(m, flags);
 	}
 	if (obj != NULL)
 		VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = desiredpages;
 }
 
 /*
  * Byte granular extension of VMIO buffers.
  */
 static void
 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
 {
 	/*
 	 * We are growing the buffer, possibly in a 
 	 * byte-granular fashion.
 	 */
 	vm_object_t obj;
 	vm_offset_t toff;
 	vm_offset_t tinc;
 	vm_page_t m;
 
 	/*
 	 * Step 1, bring in the VM pages from the object, allocating
 	 * them if necessary.  We must clear B_CACHE if these pages
 	 * are not valid for the range covered by the buffer.
 	 */
 	obj = bp->b_bufobj->bo_object;
 	if (bp->b_npages < desiredpages) {
 		/*
 		 * We must allocate system pages since blocking
 		 * here could interfere with paging I/O, no
 		 * matter which process we are.
 		 *
 		 * Only exclusive busy can be tested here.
 		 * Blocking on shared busy might lead to
 		 * deadlocks once allocbuf() is called after
 		 * pages are vfs_busy_pages().
 		 */
 		(void)vm_page_grab_pages_unlocked(obj,
 		    OFF_TO_IDX(bp->b_offset) + bp->b_npages,
 		    VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_NOBUSY | VM_ALLOC_WIRED,
 		    &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages);
 		bp->b_npages = desiredpages;
 	}
 
 	/*
 	 * Step 2.  We've loaded the pages into the buffer,
 	 * we have to figure out if we can still have B_CACHE
 	 * set.  Note that B_CACHE is set according to the
 	 * byte-granular range ( bcount and size ), not the
 	 * aligned range ( newbsize ).
 	 *
 	 * The VM test is against m->valid, which is DEV_BSIZE
 	 * aligned.  Needless to say, the validity of the data
 	 * needs to also be DEV_BSIZE aligned.  Note that this
 	 * fails with NFS if the server or some other client
 	 * extends the file's EOF.  If our buffer is resized, 
 	 * B_CACHE may remain set! XXX
 	 */
 	toff = bp->b_bcount;
 	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 	while ((bp->b_flags & B_CACHE) && toff < size) {
 		vm_pindex_t pi;
 
 		if (tinc > (size - toff))
 			tinc = size - toff;
 		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
 		m = bp->b_pages[pi];
 		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
 		toff += tinc;
 		tinc = PAGE_SIZE;
 	}
 
 	/*
 	 * Step 3, fixup the KVA pmap.
 	 */
 	if (buf_mapped(bp))
 		bpmap_qenter(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 }
 
 /*
  * Check to see if a block at a particular lbn is available for a clustered
  * write.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *bpa;
 	int match;
 
 	match = 0;
 
 	/* If the buf isn't in core skip it */
 	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
 		return (0);
 
 	/* If the buf is busy we don't want to wait for it */
 	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/* Only cluster with valid clusterable delayed write buffers */
 	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
 	    (B_DELWRI | B_CLUSTEROK))
 		goto done;
 
 	if (bpa->b_bufsize != size)
 		goto done;
 
 	/*
 	 * Check to see if it is in the expected place on disk and that the
 	 * block has been mapped.
 	 */
 	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
 		match = 1;
 done:
 	BUF_UNLOCK(bpa);
 	return (match);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better then the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather then at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  *	getnewbuf_kva:
  *
  *	Allocate KVA for an empty buf header according to gbflags.
  */
 static int
 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
 {
 
 	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
 		/*
 		 * In order to keep fragmentation sane we only allocate kva
 		 * in BKVASIZE chunks.  XXX with vmem we can do page size.
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 		if (maxsize != bp->b_kvasize &&
 		    bufkva_alloc(bp, maxsize, gbflags))
 			return (ENOSPC);
 	}
 	return (0);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	The caller is responsible for releasing the reserved bufspace after
  *	allocbuf() is called.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
 {
 	struct bufdomain *bd;
 	struct buf *bp;
 	bool metadata, reserved;
 
 	bp = NULL;
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = true;
 	else
 		metadata = false;
 	if (vp == NULL)
 		bd = &bdomain[0];
 	else
 		bd = &bdomain[vp->v_bufobj.bo_domain];
 
 	counter_u64_add(getnewbufcalls, 1);
 	reserved = false;
 	do {
 		if (reserved == false &&
 		    bufspace_reserve(bd, maxsize, metadata) != 0) {
 			counter_u64_add(getnewbufrestarts, 1);
 			continue;
 		}
 		reserved = true;
 		if ((bp = buf_alloc(bd)) == NULL) {
 			counter_u64_add(getnewbufrestarts, 1);
 			continue;
 		}
 		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
 			return (bp);
 		break;
 	} while (buf_recycle(bd, false) == 0);
 
 	if (reserved)
 		bufspace_release(bd, maxsize);
 	if (bp != NULL) {
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 	}
 	bufspace_wait(bd, vp, gbflags, slpflag, slptimeo);
 
 	return (NULL);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
 buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
 {
 	int flushed;
 
 	flushed = flushbufqueues(vp, bd, target, 0);
 	if (flushed == 0) {
 		/*
 		 * Could not find any buffers without rollback
 		 * dependencies, so just write the first one
 		 * in the hopes of eventually making progress.
 		 */
 		if (vp != NULL && target > 2)
 			target /= 2;
 		flushbufqueues(vp, bd, target, 1);
 	}
 	return (flushed);
 }
 
 static void
 buf_daemon()
 {
 	struct bufdomain *bd;
 	int speedupreq;
 	int lodirty;
 	int i;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
 	    SHUTDOWN_PRI_LAST + 100);
 
 	/*
 	 * Start the buf clean daemons as children threads.
 	 */
 	for (i = 0 ; i < buf_domains; i++) {
 		int error;
 
 		error = kthread_add((void (*)(void *))bufspace_daemon,
 		    &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
 		if (error)
 			panic("error %d spawning bufspace daemon", error);
 	}
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kthread_suspend_check();
 
 		/*
 		 * Save speedupreq for this pass and reset to capture new
 		 * requests.
 		 */
 		speedupreq = bd_speedupreq;
 		bd_speedupreq = 0;
 
 		/*
 		 * Flush each domain sequentially according to its level and
 		 * the speedup request.
 		 */
 		for (i = 0; i < buf_domains; i++) {
 			bd = &bdomain[i];
 			if (speedupreq)
 				lodirty = bd->bd_numdirtybuffers / 2;
 			else
 				lodirty = bd->bd_lodirtybuffers;
 			while (bd->bd_numdirtybuffers > lodirty) {
 				if (buf_flush(NULL, bd,
 				    bd->bd_numdirtybuffers - lodirty) == 0)
 					break;
 				kern_yield(PRI_USER);
 			}
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second and
 		 * then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush a buffer in the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of write them, which NFS is 
  *	particularly sensitive to.
  */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS,
     &flushwithdeps, 0,
     "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
 flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
     int flushdeps)
 {
 	struct bufqueue *bq;
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int error;
 	bool unlock;
 
 	flushed = 0;
 	bq = &bd->bd_dirtyq;
 	bp = NULL;
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	BQ_LOCK(bq);
 	TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
 	BQ_UNLOCK(bq);
 	while (flushed != target) {
 		maybe_yield();
 		BQ_LOCK(bq);
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel,
 			    b_freelist);
 		} else {
 			BQ_UNLOCK(bq);
 			break;
 		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
 		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
 		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
 		    bp->b_vp != lvp)) {
 			BQ_UNLOCK(bq);
 			continue;
 		}
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		BQ_UNLOCK(bq);
 		if (error != 0)
 			continue;
 
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (lvp == NULL) {
 			unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		} else {
 			ASSERT_VOP_LOCKED(vp, "getbuf");
 			unlock = false;
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 		}
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
 			} else {
 				bremfree(bp);
 				bwrite(bp);
 				counter_u64_add(notbufdflushes, 1);
 			}
 			vn_finished_write(mp);
 			if (unlock)
 				VOP_UNLOCK(vp);
 			flushwithdeps += hasdeps;
 			flushed++;
 
 			/*
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
 			if (curproc == bufdaemonproc &&
 			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	BQ_LOCK(bq);
 	TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
 	BQ_UNLOCK(bq);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	return (gbincore_unlocked(bo, blkno));
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 bool
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t toff, tinc, size;
 	vm_page_t m, n;
 	vm_ooffset_t off;
 	int valid;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	if (incore(&vp->v_bufobj, blkno))
 		return (true);
 	if (vp->v_mount == NULL)
 		return (false);
 	obj = vp->v_object;
 	if (obj == NULL)
 		return (false);
 
 	size = PAGE_SIZE;
 	if (size > vp->v_mount->mnt_stat.f_iosize)
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 		m = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff));
 recheck:
 		if (m == NULL)
 			return (false);
 
 		tinc = size;
 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 		/*
 		 * Consider page validity only if page mapping didn't change
 		 * during the check.
 		 */
 		valid = vm_page_is_valid(m,
 		    (vm_offset_t)((toff + off) & PAGE_MASK), tinc);
 		n = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff));
 		if (m != n) {
 			m = n;
 			goto recheck;
 		}
 		if (!valid)
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intevention.
  *
  * Note that while we only really need to clean through to b_bcount, we
  * just go ahead and clean through to b_bufsize.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff, noff, eoff;
 	vm_page_t m;
 	int i;
 
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	vfs_busy_pages_acquire(bp);
 	vfs_setdirty_range(bp);
 	for (i = 0; i < bp->b_npages; i++) {
 		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		eoff = noff;
 		if (eoff > bp->b_offset + bp->b_bufsize)
 			eoff = bp->b_offset + bp->b_bufsize;
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 		foff = noff;
 	}
 	vfs_busy_pages_release(bp);
 }
 
 static void
 vfs_setdirty_range(struct buf *bp)
 {
 	vm_offset_t boffset;
 	vm_offset_t eoffset;
 	int i;
 
 	/*
 	 * test the pages to see if they have been modified directly
 	 * by users through the VM system.
 	 */
 	for (i = 0; i < bp->b_npages; i++)
 		vm_page_test_dirty(bp->b_pages[i]);
 
 	/*
 	 * Calculate the encompassing dirty range, boffset and eoffset,
 	 * (eoffset - boffset) bytes.
 	 */
 
 	for (i = 0; i < bp->b_npages; i++) {
 		if (bp->b_pages[i]->dirty)
 			break;
 	}
 	boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 	for (i = bp->b_npages - 1; i >= 0; --i) {
 		if (bp->b_pages[i]->dirty) {
 			break;
 		}
 	}
 	eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 	/*
 	 * Fit it to the buffer.
 	 */
 
 	if (eoffset > bp->b_bcount)
 		eoffset = bp->b_bcount;
 
 	/*
 	 * If we have a good dirty range, merge with the existing
 	 * dirty range.
 	 */
 
 	if (boffset < eoffset) {
 		if (bp->b_dirtyoff > boffset)
 			bp->b_dirtyoff = boffset;
 		if (bp->b_dirtyend < eoffset)
 			bp->b_dirtyend = eoffset;
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.
  * If an unmapped buffer is provided but a mapped buffer is requested, take
  * also care to properly setup mappings between pages and KVA.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	need_mapping = bp->b_data == unmapped_buf &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = bp->b_kvabase == unmapped_buf &&
 	    bp->b_data == unmapped_buf &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && bp->b_kvabase != unmapped_buf) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 	offset = blkno * bsize;
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
 		if ((gbflags & GB_NOWAIT_BD) != 0) {
 			/*
 			 * XXXKIB: defragmentation cannot
 			 * succeed, not sure what else to do.
 			 */
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		counter_u64_add(mappingrestarts, 1);
 		bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
 		/* b_offset is handled by bpmap_qenter. */
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 		bpmap_qenter(bp);
 	}
 }
 
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	int error;
 
 	error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp);
 	if (error != 0)
 		return (NULL);
 	return (bp);
 }
 
 /*
  *	getblkx:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  *
  *	The blkno parameter is the logical block being requested. Normally
  *	the mapping of logical block number to disk block address is done
  *	by calling VOP_BMAP(). However, if the mapping is already known, the
  *	disk block address can be passed using the dblkno parameter. If the
  *	disk block address is not known, then the same value should be passed
  *	for blkno and dblkno.
  */
 int
 getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag,
     int slptimeo, int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	daddr_t d_blkno;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > maxbcachebuf)
 		panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
 		    maxbcachebuf);
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 	d_blkno = dblkno;
 
 	/* Attempt lockless lookup first. */
 	bp = gbincore_unlocked(bo, blkno);
 	if (bp == NULL)
 		goto newbuf_unlocked;
 
 	error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0,
 	    0);
 	if (error != 0)
 		goto loop;
 
 	/* Verify buf identify has not changed since lookup. */
 	if (bp->b_bufobj == bo && bp->b_lblkno == blkno)
 		goto foundbuf_fastpath;
 
 	/* It changed, fallback to locked lookup. */
 	BUF_UNLOCK_RAW(bp);
 
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_INTERLOCK |
 		    ((flags & GB_LOCK_NOWAIT) ? LK_NOWAIT : LK_SLEEPFAIL);
 
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error != 0)
 			return (error);
 
 foundbuf_fastpath:
 		/* If recursed, assume caller knows the rules. */
 		if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due 
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 newbuf_unlocked:
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return (EEXIST);
 
 		bsize = vn_isdisk(vp) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
 		if ((flags & GB_NOSPARSE) != 0 && vmio &&
 		    !vn_isdisk(vp)) {
 			error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0);
 			KASSERT(error != EOPNOTSUPP,
 			    ("GB_NOSPARSE from fs not supporting bmap, vp %p",
 			    vp));
 			if (error != 0)
 				return (error);
 			if (d_blkno == -1)
 				return (EJUSTRETURN);
 		}
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return (ETIMEDOUT);
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
 			 *
 			 * There's an issue on low memory, 4BSD+non-preempt
 			 * systems (eg MIPS routers with 32MB RAM) where buffer
 			 * exhaustion occurs without sleeping for buffer
 			 * reclaimation.  This just sticks in a loop and
 			 * constantly attempts to allocate a buffer, which
 			 * hits exhaustion and tries to wakeup bufdaemon.
 			 * This never happens because we never yield.
 			 *
 			 * The real solution is to identify and fix these cases
 			 * so we aren't effectively busy-waiting in a loop
 			 * until the reclaimation path has cycles to run.
 			 */
 			kern_yield(PRI_USER);
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			bufspace_release(bufdomain(bp), maxsize);
 			brelse(bp);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_lblkno = blkno;
 		bp->b_blkno = d_blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bufspace_release(bufdomain(bp), maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 end:
 	buf_track(bp, __func__);
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	*bpp = bp;
 	return (0);
 }
 
 /*
  * Get an empty, disassociated buffer of given size.  The buffer is initially
  * set to B_INVAL.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *bp;
 	int maxsize;
 
 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 	while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
 		if ((flags & GB_NOWAIT_BD) &&
 		    (curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(bp, size);
 	bufspace_release(bufdomain(bp), maxsize);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	return (bp);
 }
 
 /*
  * Truncate the backing store for a non-vmio buffer.
  */
 static void
 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
 {
 
 	if (bp->b_flags & B_MALLOC) {
 		/*
 		 * malloced buffers are not shrunk
 		 */
 		if (newbsize == 0) {
 			bufmallocadjust(bp, 0);
 			free(bp->b_data, M_BIOBUF);
 			bp->b_data = bp->b_kvabase;
 			bp->b_flags &= ~B_MALLOC;
 		}
 		return;
 	}
 	vm_hold_free_pages(bp, newbsize);
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * Extend the backing for a non-VMIO buffer.
  */
 static void
 vfs_nonvmio_extend(struct buf *bp, int newbsize)
 {
 	caddr_t origbuf;
 	int origbufsize;
 
 	/*
 	 * We only use malloced memory on the first allocation.
 	 * and revert to page-allocated memory when the buffer
 	 * grows.
 	 *
 	 * There is a potential smp race here that could lead
 	 * to bufmallocspace slightly passing the max.  It
 	 * is probably extremely rare and not worth worrying
 	 * over.
 	 */
 	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
 	    bufmallocspace < maxbufmallocspace) {
 		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
 		bp->b_flags |= B_MALLOC;
 		bufmallocadjust(bp, newbsize);
 		return;
 	}
 
 	/*
 	 * If the buffer is growing on its other-than-first
 	 * allocation then we revert to the page-allocation
 	 * scheme.
 	 */
 	origbuf = NULL;
 	origbufsize = 0;
 	if (bp->b_flags & B_MALLOC) {
 		origbuf = bp->b_data;
 		origbufsize = bp->b_bufsize;
 		bp->b_data = bp->b_kvabase;
 		bufmallocadjust(bp, 0);
 		bp->b_flags &= ~B_MALLOC;
 		newbsize = round_page(newbsize);
 	}
 	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
 	    (vm_offset_t) bp->b_data + newbsize);
 	if (origbuf != NULL) {
 		bcopy(origbuf, bp->b_data, origbufsize);
 		free(origbuf, M_BIOBUF);
 	}
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!! 
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize;
 
 	if (bp->b_bcount == size)
 		return (1);
 
 	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	newbsize = roundup2(size, DEV_BSIZE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		if ((bp->b_flags & B_MALLOC) == 0)
 			newbsize = round_page(newbsize);
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		if (newbsize < bp->b_bufsize)
 			vfs_nonvmio_truncate(bp, newbsize);
 		else if (newbsize > bp->b_bufsize)
 			vfs_nonvmio_extend(bp, newbsize);
 	} else {
 		int desiredpages;
 
 		desiredpages = (size == 0) ? 0 :
 		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize)
 			vfs_vmio_truncate(bp, desiredpages);
 		/* XXX This looks as if it should be newbsize > b_bufsize */
 		else if (size > bp->b_bcount)
 			vfs_vmio_extend(bp, desiredpages, size);
 		bufspace_adjust(bp, newbsize);
 	}
 	bp->b_bcount = size;		/* requested buffer size. */
 	return (1);
 }
 
 extern int inflight_transient_maps;
 
 static struct bio_queue nondump_bios;
 
 void
 biodone(struct bio *bp)
 {
 	struct mtx *mtxp;
 	void (*done)(struct bio *);
 	vm_offset_t start, end;
 
 	biotrack(bp, __func__);
 
 	/*
 	 * Avoid completing I/O when dumping after a panic since that may
 	 * result in a deadlock in the filesystem or pager code.  Note that
 	 * this doesn't affect dumps that were started manually since we aim
 	 * to keep the system usable after it has been resumed.
 	 */
 	if (__predict_false(dumping && SCHEDULER_STOPPED())) {
 		TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue);
 		return;
 	}
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 		start = trunc_page((vm_offset_t)bp->bio_data);
 		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
 		bp->bio_data = unmapped_buf;
 		pmap_qremove(start, atop(end - start));
 		vmem_free(transient_arena, start, end - start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 	done = bp->bio_done;
 	if (done == NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, bp);
 		mtx_lock(mtxp);
 		bp->bio_flags |= BIO_DONE;
 		wakeup(bp);
 		mtx_unlock(mtxp);
 	} else
 		done(bp);
 }
 
 /*
  * Wait for a BIO to finish.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	while ((bp->bio_flags & BIO_DONE) == 0)
 		msleep(bp, mtxp, PRIBIO, wchan, 0);
 	mtx_unlock(mtxp);
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	if (!(bp->bio_flags & BIO_ERROR))
 		return (0);
 	return (EIO);
 }
 
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error) {
 		bp->bio_error = error;
 		bp->bio_flags |= BIO_ERROR;
 	}
 	if (stat != NULL)
 		devstat_end_transaction_bio(stat, bp);
 	biodone(bp);
 }
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 void
 biotrack_buf(struct bio *bp, const char *location)
 {
 
 	buf_track(bp->bio_track_bp, location);
 }
 #endif
 
 /*
  *	bufwait:
  *
  *	Wait for buffer I/O completion, returning error status.  The buffer
  *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
  *	error and cleared.
  */
 int
 bufwait(struct buf *bp)
 {
 	if (bp->b_iocmd == BIO_READ)
 		bwait(bp, PRIBIO, "biord");
 	else
 		bwait(bp, PRIBIO, "biowr");
 	if (bp->b_flags & B_EINTR) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if (bp->b_ioflags & BIO_ERROR) {
 		return (bp->b_error ? bp->b_error : EIO);
 	} else {
 		return (0);
 	}
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
  *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
  *	assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
  *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the biodone routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	struct bufobj *dropobj;
 	void    (*biodone)(struct buf *);
 
 	buf_track(bp, __func__);
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	dropobj = NULL;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 
 	runningbufwakeup(bp);
 	if (bp->b_iocmd == BIO_WRITE)
 		dropobj = bp->b_bufobj;
 	/* call optional completion function if requested */
 	if (bp->b_iodone != NULL) {
 		biodone = bp->b_iodone;
 		bp->b_iodone = NULL;
 		(*biodone) (bp);
 		if (dropobj)
 			bufobj_wdrop(dropobj);
 		return;
 	}
 	if (bp->b_flags & B_VMIO) {
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
 		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		if (bp->b_iocmd == BIO_READ &&
 		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
 		    !(bp->b_ioflags & BIO_ERROR))
 			bp->b_flags |= B_CACHE;
 		vfs_vmio_iodone(bp);
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 	if ((bp->b_flags & B_CKHASH) != 0) {
 		KASSERT(bp->b_iocmd == BIO_READ,
 		    ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd));
 		KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp));
 		(*bp->b_ckhashcalc)(bp);
 	}
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
 	 * here in the async case. The sync case always needs to do a wakeup.
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
 		    (bp->b_ioflags & BIO_ERROR))
 			brelse(bp);
 		else
 			bqrelse(bp);
 	} else
 		bdone(bp);
 	if (dropobj)
 		bufobj_wdrop(dropobj);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistent.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			if (buf_mapped(bp)) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Set the valid bits in a page based on the supplied offset.   The
  *	range is restricted to the buffer's size.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t eoff;
 
 	/*
 	 * Compute the end offset, eoff, such that [off, eoff) does not span a
 	 * page boundary and eoff is not greater than the end of the buffer.
 	 * The end of the buffer, in this case, is our file EOF, not the
 	 * allocation size of the buffer.
 	 */
 	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 	if (eoff > bp->b_offset + bp->b_bcount)
 		eoff = bp->b_offset + bp->b_bcount;
 
 	/*
 	 * Set valid range.  This is typically the entire buffer and thus the
 	 * entire page.
 	 */
 	if (eoff > off)
 		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Set the valid bits and clear the dirty bits in a page based on the
  *	supplied offset.   The range is restricted to the buffer's size.
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t soff, eoff;
 
 	/*
 	 * Start and end offsets in buffer.  eoff - soff may not cross a
 	 * page boundary or cross the end of the buffer.  The end of the
 	 * buffer, in this case, is our file EOF, not the allocation size
 	 * of the buffer.
 	 */
 	soff = off;
 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	if (eoff > bp->b_offset + bp->b_bcount)
 		eoff = bp->b_offset + bp->b_bcount;
 
 	/*
 	 * Set valid range.  This is typically the entire buffer and thus the
 	 * entire page.
 	 */
 	if (eoff > soff) {
 		vm_page_set_validclean(
 		    m,
 		   (vm_offset_t) (soff & PAGE_MASK),
 		   (vm_offset_t) (eoff - soff)
 		);
 	}
 }
 
 /*
  * Acquire a shared busy on all pages in the buf.
  */
 void
 vfs_busy_pages_acquire(struct buf *bp)
 {
 	int i;
 
 	for (i = 0; i < bp->b_npages; i++)
 		vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY);
 }
 
 void
 vfs_busy_pages_release(struct buf *bp)
 {
 	int i;
 
 	for (i = 0; i < bp->b_npages; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 	bool bogus;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	if ((bp->b_flags & B_CLUSTER) == 0) {
 		vm_object_pip_add(obj, bp->b_npages);
 		vfs_busy_pages_acquire(bp);
 	}
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_range(bp);
 	bogus = false;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		vm_page_assert_sbusied(m);
 
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in 
 		 * partially instantiated buffers.  Partially 
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (vm_page_all_valid(m) &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus = true;
 		}
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Set the range within the buffer to valid.  The range is
  *	relative to the beginning of the buffer, b_offset.  Note that
  *	b_offset itself may be offset from the beginning of the first
  *	page.
  */
 void
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	int i, n;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	/*
 	 * Fixup base to be relative to beginning of first page.
 	 * Set initial n to be the maximum number of bytes in the
 	 * first page that can be validated.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	n = PAGE_SIZE - (base & PAGE_MASK);
 
 	/*
 	 * Busy may not be strictly necessary here because the pages are
 	 * unlikely to be fully valid and the vnode lock will synchronize
 	 * their access via getpages.  It is grabbed for consistency with
 	 * other page validation.
 	 */
 	vfs_busy_pages_acquire(bp);
 	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 		m = bp->b_pages[i];
 		if (n > size)
 			n = size;
 		vm_page_set_valid_range(m, base & PAGE_MASK, n);
 		base += n;
 		size -= n;
 		n = PAGE_SIZE;
 	}
 	vfs_busy_pages_release(bp);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  */
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask, sa, ea, slide;
 
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	vfs_busy_pages_acquire(bp);
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE,
 		    roundup2(ea - sa, DEV_BSIZE));
 	}
 	vfs_busy_pages_release(bp);
 	bp->b_resid = 0;
 }
 
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	vm_page_t m;
 	int i, n;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 	} else {
 		BUF_CHECK_UNMAPPED(bp);
 		n = PAGE_SIZE - (base & PAGE_MASK);
 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 			m = bp->b_pages[i];
 			if (n > size)
 				n = size;
 			pmap_zero_page_area(m, base & PAGE_MASK, n);
 			base += n;
 			size -= n;
 			n = PAGE_SIZE;
 		}
 	}
 }
 
 /*
  * Update buffer flags based on I/O request parameters, optionally releasing the
  * buffer.  If it's VMIO or direct I/O, the buffer pages are released to the VM,
  * where they may be placed on a page queue (VMIO) or freed immediately (direct
  * I/O).  Otherwise the buffer is released to the cache.
  */
 static void
 b_io_dismiss(struct buf *bp, int ioflag, bool release)
 {
 
 	KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0,
 	    ("buf %p non-VMIO noreuse", bp));
 
 	if ((ioflag & IO_DIRECT) != 0)
 		bp->b_flags |= B_DIRECT;
 	if ((ioflag & IO_EXT) != 0)
 		bp->b_xflags |= BX_ALTDATA;
 	if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) {
 		bp->b_flags |= B_RELBUF;
 		if ((ioflag & IO_NOREUSE) != 0)
 			bp->b_flags |= B_NOREUSE;
 		if (release)
 			brelse(bp);
 	} else if (release)
 		bqrelse(bp);
 }
 
 void
 vfs_bio_brelse(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, true);
 }
 
 void
 vfs_bio_set_flags(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, false);
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	int index;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.
 		 */
 		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) |
 		    VM_ALLOC_WAITOK);
 		pmap_qenter(pg, &p, 1);
 		bp->b_pages[index] = p;
 	}
 	bp->b_npages = index;
 }
 
 /* Return pages associated with this buf to the vm system */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t from;
 	vm_page_t p;
 	int index, newnpages;
 
 	BUF_CHECK_MAPPED(bp);
 
 	from = round_page((vm_offset_t)bp->b_data + newbsize);
 	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (bp->b_npages > newnpages)
 		pmap_qremove(from, bp->b_npages - newnpages);
 	for (index = newnpages; index < bp->b_npages; index++) {
 		p = bp->b_pages[index];
 		bp->b_pages[index] = NULL;
 		vm_page_unwire_noq(p);
 		vm_page_free(p);
 	}
 	bp->b_npages = newnpages;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * All requests are (re)mapped into kernel VA space.
  * Notice that we use b_bufsize for the size of the buffer
  * to be mapped.  b_bcount might be modified by the driver.
  *
  * Note that even if the caller determines that the address space should
  * be valid, a race or a smaller-file mapped into a larger space may
  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
  * check the return value.
  *
  * This function only works with pager buffers.
  */
 int
-vmapbuf(struct buf *bp, int mapbuf)
+vmapbuf(struct buf *bp, void *uaddr, size_t len, int mapbuf)
 {
 	vm_prot_t prot;
 	int pidx;
 
-	if (bp->b_bufsize < 0)
-		return (-1);
 	prot = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
-	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
+	    (vm_offset_t)uaddr, len, prot, bp->b_pages,
 	    btoc(MAXPHYS))) < 0)
 		return (-1);
+	bp->b_bufsize = len;
 	bp->b_npages = pidx;
-	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+	bp->b_offset = ((vm_offset_t)uaddr) & PAGE_MASK;
 	if (mapbuf || !unmapped_buf_allowed) {
 		pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
 		bp->b_data = bp->b_kvabase + bp->b_offset;
 	} else
 		bp->b_data = unmapped_buf;
 	return(0);
 }
 
 /*
  * Free the io map PTEs associated with this IO operation.
  * We also invalidate the TLB entries and restore the original b_addr.
  *
  * This function only works with pager buffers.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int npages;
 
 	npages = bp->b_npages;
 	if (buf_mapped(bp))
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
 	vm_page_unhold_pages(bp->b_pages, npages);
 
 	bp->b_data = unmapped_buf;
 }
 
 void
 bdone(struct buf *bp)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(mtxp);
 }
 
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *mtxp;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	while ((bp->b_flags & B_DONE) == 0)
 		msleep(bp, mtxp, pri, wchan, 0);
 	mtx_unlock(mtxp);
 }
 
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 
 	return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i __unused;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  * Initialize a struct bufobj before use.  Memory is assumed zero filled.
  */
 void
 bufobj_init(struct bufobj *bo, void *private)
 {
 	static volatile int bufobj_cleanq;
 
         bo->bo_domain =
             atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
         bo->bo_private = private;
         TAILQ_INIT(&bo->bo_clean.bv_hd);
         TAILQ_INIT(&bo->bo_dirty.bv_hd);
 }
 
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 	BO_LOCK(bo);
 	bo->bo_numoutput++;
 	BO_UNLOCK(bo);
 }
 
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Set bio_data or bio_ma for struct bio from the struct buf.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	if (!buf_mapped(bp)) {
 		KASSERT(unmapped_buf_allowed, ("unmapped"));
 		bip->bio_ma = bp->b_pages;
 		bip->bio_ma_n = bp->b_npages;
 		bip->bio_data = unmapped_buf;
 		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 		bip->bio_flags |= BIO_UNMAPPED;
 		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 		    PAGE_SIZE == bp->b_npages,
 		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 		    (long long)bip->bio_length, bip->bio_ma_n));
 	} else {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 	}
 }
 
 /*
  * The MIPS pmap code currently doesn't handle aliased pages.
  * The VIPT caches may not handle page aliasing themselves, leading
  * to data corruption.
  *
  * As such, this code makes a system extremely unhappy if said
  * system doesn't support unaliasing the above situation in hardware.
  * Some "recent" systems (eg some mips24k/mips74k cores) don't enable
  * this feature at build time, so it has to be handled in software.
  *
  * Once the MIPS pmap/cache code grows to support this function on
  * earlier chips, it should be flipped back off.
  */
 #ifdef	__mips__
 static int buf_pager_relbuf = 1;
 #else
 static int buf_pager_relbuf = 0;
 #endif
 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
     &buf_pager_relbuf, 0,
     "Make buffer pager release buffers after reading");
 
 /*
  * The buffer pager.  It uses buffer reads to validate pages.
  *
  * In contrast to the generic local pager from vm/vnode_pager.c, this
  * pager correctly and easily handles volumes where the underlying
  * device block size is greater than the machine page size.  The
  * buffer cache transparently extends the requested page run to be
  * aligned at the block boundary, and does the necessary bogus page
  * replacements in the addends to avoid obliterating already valid
  * pages.
  *
  * The only non-trivial issue is that the exclusive busy state for
  * pages, which is assumed by the vm_pager_getpages() interface, is
  * incompatible with the VMIO buffer cache's desire to share-busy the
  * pages.  This function performs a trivial downgrade of the pages'
  * state before reading buffers, and a less trivial upgrade from the
  * shared-busy to excl-busy state after the read.
  */
 int
 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
     int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
     vbg_get_blksize_t get_blksize)
 {
 	vm_page_t m;
 	vm_object_t object;
 	struct buf *bp;
 	struct mount *mp;
 	daddr_t lbn, lbnp;
 	vm_ooffset_t la, lb, poff, poffe;
 	long bsize;
 	int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
 	bool redo, lpart;
 
 	object = vp->v_object;
 	mp = vp->v_mount;
 	error = 0;
 	la = IDX_TO_OFF(ma[count - 1]->pindex);
 	if (la >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 
 	/*
 	 * Change the meaning of la from where the last requested page starts
 	 * to where it ends, because that's the end of the requested region
 	 * and the start of the potential read-ahead region.
 	 */
 	la += PAGE_SIZE;
 	lpart = la > object->un_pager.vnp.vnp_size;
 	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
 
 	/*
 	 * Calculate read-ahead, behind and total pages.
 	 */
 	pgsin = count;
 	lb = IDX_TO_OFF(ma[0]->pindex);
 	pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
 	pgsin += pgsin_b;
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
 	if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
 		pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
 		    PAGE_SIZE) - la);
 	pgsin += pgsin_a;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, pgsin);
 
 	br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
 	    != 0) ? GB_UNMAPPED : 0;
 again:
 	for (i = 0; i < count; i++) {
 		if (ma[i] != bogus_page)
 			vm_page_busy_downgrade(ma[i]);
 	}
 
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 		if (m == bogus_page)
 			continue;
 
 		/*
 		 * Pages are shared busy and the object lock is not
 		 * owned, which together allow for the pages'
 		 * invalidation.  The racy test for validity avoids
 		 * useless creation of the buffer for the most typical
 		 * case when invalidation is not used in redo or for
 		 * parallel read.  The shared->excl upgrade loop at
 		 * the end of the function catches the race in a
 		 * reliable way (protected by the object lock).
 		 */
 		if (vm_page_all_valid(m))
 			continue;
 
 		poff = IDX_TO_OFF(m->pindex);
 		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
 		for (; poff < poffe; poff += bsize) {
 			lbn = get_lblkno(vp, poff);
 			if (lbn == lbnp)
 				goto next_page;
 			lbnp = lbn;
 
 			bsize = get_blksize(vp, lbn);
 			error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
 			    br_flags, &bp);
 			if (error != 0)
 				goto end_pages;
 			if (bp->b_rcred == curthread->td_ucred) {
 				crfree(bp->b_rcred);
 				bp->b_rcred = NOCRED;
 			}
 			if (LIST_EMPTY(&bp->b_dep)) {
 				/*
 				 * Invalidation clears m->valid, but
 				 * may leave B_CACHE flag if the
 				 * buffer existed at the invalidation
 				 * time.  In this case, recycle the
 				 * buffer to do real read on next
 				 * bread() after redo.
 				 *
 				 * Otherwise B_RELBUF is not strictly
 				 * necessary, enable to reduce buf
 				 * cache pressure.
 				 */
 				if (buf_pager_relbuf ||
 				    !vm_page_all_valid(m))
 					bp->b_flags |= B_RELBUF;
 
 				bp->b_flags &= ~B_NOCACHE;
 				brelse(bp);
 			} else {
 				bqrelse(bp);
 			}
 		}
 		KASSERT(1 /* racy, enable for debugging */ ||
 		    vm_page_all_valid(m) || i == count - 1,
 		    ("buf %d %p invalid", i, m));
 		if (i == count - 1 && lpart) {
 			if (!vm_page_none_valid(m) &&
 			    !vm_page_all_valid(m))
 				vm_page_zero_invalid(m, TRUE);
 		}
 next_page:;
 	}
 end_pages:
 
 	redo = false;
 	for (i = 0; i < count; i++) {
 		if (ma[i] == bogus_page)
 			continue;
 		if (vm_page_busy_tryupgrade(ma[i]) == 0) {
 			vm_page_sunbusy(ma[i]);
 			ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex,
 			    VM_ALLOC_NORMAL);
 		}
 
 		/*
 		 * Since the pages were only sbusy while neither the
 		 * buffer nor the object lock was held by us, or
 		 * reallocated while vm_page_grab() slept for busy
 		 * relinguish, they could have been invalidated.
 		 * Recheck the valid bits and re-read as needed.
 		 *
 		 * Note that the last page is made fully valid in the
 		 * read loop, and partial validity for the page at
 		 * index count - 1 could mean that the page was
 		 * invalidated or removed, so we must restart for
 		 * safety as well.
 		 */
 		if (!vm_page_all_valid(ma[i]))
 			redo = true;
 	}
 	if (redo && error == 0)
 		goto again;
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /* DDB command to show buffer data */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
 #ifdef FULL_BUF_TRACKING
 	uint32_t i, j;
 #endif
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS,
 	    (u_int)bp->b_xflags, PRINT_BUF_XFLAGS);
 	db_printf("b_vflags=0x%b b_ioflags0x%b\n",
 	    (u_int)bp->b_vflags, PRINT_BUF_VFLAGS,
 	    (u_int)bp->b_ioflags, PRINT_BIO_FLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_vp = %p, b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first);
 	db_printf("b_kvabase = %p, b_kvasize = %d\n",
 	    bp->b_kvabase, bp->b_kvasize);
 	if (bp->b_npages) {
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			if (m != NULL)
 				db_printf("(%p, 0x%lx, 0x%lx)", m->object,
 				    (u_long)m->pindex,
 				    (u_long)VM_PAGE_TO_PHYS(m));
 			else
 				db_printf("( ??? )");
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 	BUF_LOCKPRINTINFO(bp);
 #if defined(FULL_BUF_TRACKING)
 	db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
 
 	i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
 	for (j = 1; j <= BUF_TRACKING_SIZE; j++) {
 		if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL)
 			continue;
 		db_printf(" %2u: %s\n", j,
 		    bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
 	}
 #elif defined(BUF_TRACKING)
 	db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
 #endif
 	db_printf(" ");
 }
 
 DB_SHOW_COMMAND(bufqueues, bufqueues)
 {
 	struct bufdomain *bd;
 	struct buf *bp;
 	long total;
 	int i, j, cnt;
 
 	db_printf("bqempty: %d\n", bqempty.bq_len);
 
 	for (i = 0; i < buf_domains; i++) {
 		bd = &bdomain[i];
 		db_printf("Buf domain %d\n", i);
 		db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
 		db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
 		db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers);
 		db_printf("\n");
 		db_printf("\tbufspace\t%ld\n", bd->bd_bufspace);
 		db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace);
 		db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace);
 		db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
 		db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
 		db_printf("\n");
 		db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
 		db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
 		db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
 		db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
 		db_printf("\n");
 		total = 0;
 		TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist)
 			total += bp->b_bufsize;
 		db_printf("\tcleanq count\t%d (%ld)\n",
 		    bd->bd_cleanq->bq_len, total);
 		total = 0;
 		TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist)
 			total += bp->b_bufsize;
 		db_printf("\tdirtyq count\t%d (%ld)\n",
 		    bd->bd_dirtyq.bq_len, total);
 		db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
 		db_printf("\tlim\t\t%d\n", bd->bd_lim);
 		db_printf("\tCPU ");
 		for (j = 0; j <= mp_maxid; j++)
 			db_printf("%d, ", bd->bd_subq[j].bq_len);
 		db_printf("\n");
 		cnt = 0;
 		total = 0;
 		for (j = 0; j < nbuf; j++)
 			if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
 				cnt++;
 				total += buf[j].b_bufsize;
 			}
 		db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
 		cnt = 0;
 		total = 0;
 		for (j = 0; j < nbuf; j++)
 			if (buf[j].b_domain == i) {
 				cnt++;
 				total += buf[j].b_bufsize;
 			}
 		db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
 	}
 }
 
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int i;
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (BUF_ISLOCKED(bp)) {
 			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 			db_printf("\n");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *vp;
 	struct buf *bp;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	vp = (struct vnode *)addr;
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (bp->b_qindex == QUEUE_EMPTY)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: head/sys/sys/buf.h
===================================================================
--- head/sys/sys/buf.h	(revision 366910)
+++ head/sys/sys/buf.h	(revision 366911)
@@ -1,599 +1,599 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_BUF_H_
 #define	_SYS_BUF_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <vm/uma.h>
 
 struct bio;
 struct buf;
 struct bufobj;
 struct mount;
 struct vnode;
 struct uio;
 
 /*
  * To avoid including <ufs/ffs/softdep.h> 
  */   
 LIST_HEAD(workhead, worklist);
 /*
  * These are currently used only by the soft dependency code, hence
  * are stored once in a global variable. If other subsystems wanted
  * to use these hooks, a pointer to a set of bio_ops could be added
  * to each buffer.
  */
 extern struct bio_ops {
 	void	(*io_start)(struct buf *);
 	void	(*io_complete)(struct buf *);
 	void	(*io_deallocate)(struct buf *);
 	int	(*io_countdeps)(struct buf *, int);
 } bioops;
 
 struct vm_object;
 struct vm_page;
 
 typedef uint32_t b_xflags_t;
 
 /*
  * The buffer header describes an I/O operation in the kernel.
  *
  * NOTES:
  *	b_bufsize, b_bcount.  b_bufsize is the allocation size of the
  *	buffer, either DEV_BSIZE or PAGE_SIZE aligned.  b_bcount is the
  *	originally requested buffer size and can serve as a bounds check
  *	against EOF.  For most, but not all uses, b_bcount == b_bufsize.
  *
  *	b_dirtyoff, b_dirtyend.  Buffers support piecemeal, unaligned
  *	ranges of dirty data that need to be written to backing store.
  *	The range is typically clipped at b_bcount ( not b_bufsize ).
  *
  *	b_resid.  Number of bytes remaining in I/O.  After an I/O operation
  *	completes, b_resid is usually 0 indicating 100% success.
  *
  *	All fields are protected by the buffer lock except those marked:
  *		V - Protected by owning bufobj lock
  *		Q - Protected by the buf queue lock
  *		D - Protected by an dependency implementation specific lock
  */
 struct buf {
 	struct bufobj	*b_bufobj;
 	long		b_bcount;
 	void		*b_caller1;
 	caddr_t		b_data;
 	int		b_error;
 	uint16_t	b_iocmd;	/* BIO_* bio_cmd from bio.h */
 	uint16_t	b_ioflags;	/* BIO_* bio_flags from bio.h */
 	off_t		b_iooffset;
 	long		b_resid;
 	void	(*b_iodone)(struct buf *);
 	void	(*b_ckhashcalc)(struct buf *);
 	uint64_t	b_ckhash;	/* B_CKHASH requested check-hash */
 	daddr_t b_blkno;		/* Underlying physical block number. */
 	off_t	b_offset;		/* Offset into file. */
 	TAILQ_ENTRY(buf) b_bobufs;	/* (V) Buffer's associated vnode. */
 	uint32_t	b_vflags;	/* (V) BV_* flags */
 	uint8_t		b_qindex;	/* (Q) buffer queue index */
 	uint8_t		b_domain;	/* (Q) buf domain this resides in */
 	uint16_t	b_subqueue;	/* (Q) per-cpu q if any */
 	uint32_t	b_flags;	/* B_* flags. */
 	b_xflags_t b_xflags;		/* extra flags */
 	struct lock b_lock;		/* Buffer lock */
 	long	b_bufsize;		/* Allocated buffer size. */
 	int	b_runningbufspace;	/* when I/O is running, pipelining */
 	int	b_kvasize;		/* size of kva for buffer */
 	int	b_dirtyoff;		/* Offset in buffer of dirty region. */
 	int	b_dirtyend;		/* Offset of end of dirty region. */
 	caddr_t	b_kvabase;		/* base kva for buffer */
 	daddr_t b_lblkno;		/* Logical block number. */
 	struct	vnode *b_vp;		/* Device vnode. */
 	struct	ucred *b_rcred;		/* Read credentials reference. */
 	struct	ucred *b_wcred;		/* Write credentials reference. */
 	union {
 		TAILQ_ENTRY(buf) b_freelist; /* (Q) */
 		struct {
 			void	(*b_pgiodone)(void *, vm_page_t *, int, int);
 			int	b_pgbefore;
 			int	b_pgafter;
 		};
 	};
 	union	cluster_info {
 		TAILQ_HEAD(cluster_list_head, buf) cluster_head;
 		TAILQ_ENTRY(buf) cluster_entry;
 	} b_cluster;
 	struct	vm_page *b_pages[btoc(MAXPHYS)];
 	int		b_npages;
 	struct	workhead b_dep;		/* (D) List of filesystem dependencies. */
 	void	*b_fsprivate1;
 	void	*b_fsprivate2;
 	void	*b_fsprivate3;
 
 #if defined(FULL_BUF_TRACKING)
 #define BUF_TRACKING_SIZE	32
 #define BUF_TRACKING_ENTRY(x)	((x) & (BUF_TRACKING_SIZE - 1))
 	const char	*b_io_tracking[BUF_TRACKING_SIZE];
 	uint32_t	b_io_tcnt;
 #elif defined(BUF_TRACKING)
 	const char	*b_io_tracking;
 #endif
 };
 
 #define b_object	b_bufobj->bo_object
 
 /*
  * These flags are kept in b_flags.
  *
  * Notes:
  *
  *	B_ASYNC		VOP calls on bp's are usually async whether or not
  *			B_ASYNC is set, but some subsystems, such as NFS, like 
  *			to know what is best for the caller so they can
  *			optimize the I/O.
  *
  *	B_PAGING	Indicates that bp is being used by the paging system or
  *			some paging system and that the bp is not linked into
  *			the b_vp's clean/dirty linked lists or ref counts.
  *			Buffer vp reassignments are illegal in this case.
  *
  *	B_CACHE		This may only be set if the buffer is entirely valid.
  *			The situation where B_DELWRI is set and B_CACHE is
  *			clear MUST be committed to disk by getblk() so 
  *			B_DELWRI can also be cleared.  See the comments for
  *			getblk() in kern/vfs_bio.c.  If B_CACHE is clear,
  *			the caller is expected to clear BIO_ERROR and B_INVAL,
  *			set BIO_READ, and initiate an I/O.
  *
  *			The 'entire buffer' is defined to be the range from
  *			0 through b_bcount.
  *
  *	B_MALLOC	Request that the buffer be allocated from the malloc
  *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
  *
  *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
  *			by filesystems that allow clustering when the buffer
  *			is fully dirty and indicates that it may be clustered
  *			with other adjacent dirty buffers.  Note the clustering
  *			may not be used with the stage 1 data write under NFS
  *			but may be used for the commit rpc portion.
  *
  *	B_INVALONERR	This flag is set on dirty buffers.  It specifies that a
  *			write error should forcibly invalidate the buffer
  *			contents.  This flag should be used with caution, as it
  *			discards data.  It is incompatible with B_ASYNC.
  *
  *	B_VMIO		Indicates that the buffer is tied into an VM object.
  *			The buffer's data is always PAGE_SIZE aligned even
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
  *			always at least DEV_BSIZE aligned, though ).
  *
  *	B_DIRECT	Hint that we should attempt to completely free
  *			the pages underlying the buffer.  B_DIRECT is
  *			sticky until the buffer is released and typically
  *			only has an effect when B_RELBUF is also set.
  *
  */
 
 #define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
 #define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress. */
 #define	B_ASYNC		0x00000004	/* Start I/O, do not wait. */
 #define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
 #define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
 #define	B_CACHE		0x00000020	/* Bread found us in the cache. */
 #define	B_VALIDSUSPWRT	0x00000040	/* Valid write during suspension. */
 #define	B_DELWRI	0x00000080	/* Delay I/O until buffer reused. */
 #define	B_CKHASH	0x00000100	/* checksum hash calculated on read */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
 #define	B_NOREUSE	0x00000800	/* Contents not reused once released. */
 #define	B_REUSE		0x00001000	/* Contents reused, second chance. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceding first. */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
 #define	B_MALLOC	0x00010000	/* malloced b_data */
 #define	B_CLUSTEROK	0x00020000	/* Pagein op, so swap() can count it. */
 #define	B_INVALONERR	0x00040000	/* Invalidate on write error. */
 #define	B_00080000	0x00080000	/* Available flag. */
 #define	B_00100000	0x00100000	/* Available flag. */
 #define	B_00200000	0x00200000	/* Available flag. */
 #define	B_RELBUF	0x00400000	/* Release VMIO buffer. */
 #define	B_FS_FLAG1	0x00800000	/* Available flag for FS use. */
 #define	B_NOCOPY	0x01000000	/* Don't copy-on-write this buf. */
 #define	B_INFREECNT	0x02000000	/* buf is counted in numfreebufs */
 #define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
 #define B_MANAGED	0x08000000	/* Managed by FS. */
 #define B_RAM		0x10000000	/* Read ahead mark (flag) */
 #define B_VMIO		0x20000000	/* VMIO flag */
 #define B_CLUSTER	0x40000000	/* pagein op, so swap() can count it */
 #define B_REMFREE	0x80000000	/* Delayed bremfree */
 
 #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
 	"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \
 	"\24b19\23invalonerr\22clusterok\21malloc\20nocache\17b14\16inval" \
 	"\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \
 	"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
 
 /*
  * These flags are kept in b_xflags.
  *
  * BX_FSPRIV reserves a set of eight flags that may be used by individual
  * filesystems for their own purpose. Their specific definitions are
  * found in the header files for each filesystem that uses them.
  */
 #define	BX_VNDIRTY	0x00000001	/* On vnode dirty list */
 #define	BX_VNCLEAN	0x00000002	/* On vnode clean list */
 #define	BX_CVTENXIO	0x00000004	/* Convert errors to ENXIO */
 #define	BX_BKGRDWRITE	0x00000010	/* Do writes in background */
 #define	BX_BKGRDMARKER	0x00000020	/* Mark buffer for splay tree */
 #define	BX_ALTDATA	0x00000040	/* Holds extended data */
 #define	BX_FSPRIV	0x00FF0000	/* Filesystem-specific flags mask */
 
 #define	PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\3cvtenxio" \
 	"\2clean\1dirty"
 
 #define	NOOFFSET	(-1LL)		/* No buffer offset calculated yet */
 
 /*
  * These flags are kept in b_vflags.
  */
 #define	BV_SCANNED	0x00000001	/* VOP_FSYNC funcs mark written bufs */
 #define	BV_BKGRDINPROG	0x00000002	/* Background write in progress */
 #define	BV_BKGRDWAIT	0x00000004	/* Background write waiting */
 #define	BV_BKGRDERR	0x00000008	/* Error from background write */
 
 #define	PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned"
 
 #ifdef _KERNEL
 
 #ifndef NSWBUF_MIN
 #define	NSWBUF_MIN	16
 #endif
 
 /*
  * Buffer locking
  */
 extern const char *buf_wmesg;		/* Default buffer lock message */
 #define BUF_WMESG "bufwait"
 #include <sys/proc.h>			/* XXX for curthread */
 #include <sys/mutex.h>
 
 /*
  * Initialize a lock.
  */
 #define BUF_LOCKINIT(bp)						\
 	lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW)
 /*
  *
  * Get a lock sleeping non-interruptably until it becomes available.
  */
 #define	BUF_LOCK(bp, locktype, interlock)				\
 	_lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock),	\
 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,		\
 	    LOCK_FILE, LOCK_LINE)
 
 /*
  * Get a lock sleeping with specified interruptably and timeout.
  */
 #define	BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo)	\
 	_lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK,	\
 	    (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo),	\
 	    LOCK_FILE, LOCK_LINE)
 
 /*
  * Release a lock. Only the acquiring process may free the lock unless
  * it has been handed off to biodone.
  */
 #define	BUF_UNLOCK(bp) do {						\
 	KASSERT(((bp)->b_flags & B_REMFREE) == 0,			\
 	    ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp)));	\
 									\
 	BUF_UNLOCK_RAW((bp));						\
 } while (0)
 #define	BUF_UNLOCK_RAW(bp) do {						\
 	(void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL,		\
 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,		\
 	    LOCK_FILE, LOCK_LINE);					\
 } while (0)
 
 /*
  * Check if a buffer lock is recursed.
  */
 #define	BUF_LOCKRECURSED(bp)						\
 	lockmgr_recursed(&(bp)->b_lock)
 
 /*
  * Check if a buffer lock is currently held.
  */
 #define	BUF_ISLOCKED(bp)						\
 	lockstatus(&(bp)->b_lock)
 /*
  * Free a buffer lock.
  */
 #define BUF_LOCKFREE(bp) 						\
 	lockdestroy(&(bp)->b_lock)
 
 /*
  * Print informations on a buffer lock.
  */
 #define BUF_LOCKPRINTINFO(bp) 						\
 	lockmgr_printinfo(&(bp)->b_lock)
 
 /*
  * Buffer lock assertions.
  */
 #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT)
 #define	BUF_ASSERT_LOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_SLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_XLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_UNLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE)
 #else
 #define	BUF_ASSERT_LOCKED(bp)
 #define	BUF_ASSERT_SLOCKED(bp)
 #define	BUF_ASSERT_XLOCKED(bp)
 #define	BUF_ASSERT_UNLOCKED(bp)
 #endif
 
 #ifdef _SYS_PROC_H_	/* Avoid #include <sys/proc.h> pollution */
 /*
  * When initiating asynchronous I/O, change ownership of the lock to the
  * kernel. Once done, the lock may legally released by biodone. The
  * original owning process can no longer acquire it recursively, but must
  * wait until the I/O is completed and the lock has been freed by biodone.
  */
 #define	BUF_KERNPROC(bp)						\
 	_lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE)
 #endif
 
 #endif /* _KERNEL */
 
 struct buf_queue_head {
 	TAILQ_HEAD(buf_queue, buf) queue;
 	daddr_t last_pblkno;
 	struct	buf *insert_point;
 	struct	buf *switch_point;
 };
 
 /*
  * This structure describes a clustered I/O. 
  */
 struct cluster_save {
 	long	bs_bcount;		/* Saved b_bcount. */
 	long	bs_bufsize;		/* Saved b_bufsize. */
 	int	bs_nchildren;		/* Number of associated buffers. */
 	struct buf **bs_children;	/* List of associated buffers. */
 };
 
 #ifdef _KERNEL
 
 static __inline int
 bwrite(struct buf *bp)
 {
 
 	KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL,
 	    ("bwrite: no bop_write bp=%p", bp));
 	return (BO_WRITE(bp->b_bufobj, bp));
 }
 
 static __inline void
 bstrategy(struct buf *bp)
 {
 
 	KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops != NULL,
 	    ("bstrategy: no bo_ops bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL,
 	    ("bstrategy: no bop_strategy bp=%p", bp));
 	BO_STRATEGY(bp->b_bufobj, bp);
 }
 
 static __inline void
 buf_start(struct buf *bp)
 {
 	if (bioops.io_start)
 		(*bioops.io_start)(bp);
 }
 
 static __inline void
 buf_complete(struct buf *bp)
 {
 	if (bioops.io_complete)
 		(*bioops.io_complete)(bp);
 }
 
 static __inline void
 buf_deallocate(struct buf *bp)
 {
 	if (bioops.io_deallocate)
 		(*bioops.io_deallocate)(bp);
 }
 
 static __inline int
 buf_countdeps(struct buf *bp, int i)
 {
 	if (bioops.io_countdeps)
 		return ((*bioops.io_countdeps)(bp, i));
 	else
 		return (0);
 }
 
 static __inline void
 buf_track(struct buf *bp __unused, const char *location __unused)
 {
 
 #if defined(FULL_BUF_TRACKING)
 	bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location;
 #elif defined(BUF_TRACKING)
 	bp->b_io_tracking = location;
 #endif
 }
 
 #endif /* _KERNEL */
 
 /*
  * Zero out the buffer's data area.
  */
 #define	clrbuf(bp) {							\
 	bzero((bp)->b_data, (u_int)(bp)->b_bcount);			\
 	(bp)->b_resid = 0;						\
 }
 
 /*
  * Flags for getblk's last parameter.
  */
 #define	GB_LOCK_NOWAIT	0x0001		/* Fail if we block on a buf lock. */
 #define	GB_NOCREAT	0x0002		/* Don't create a buf if not found. */
 #define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon. */
 #define	GB_UNMAPPED	0x0008		/* Do not mmap buffer pages. */
 #define	GB_KVAALLOC	0x0010		/* But allocate KVA. */
 #define	GB_CKHASH	0x0020		/* If reading, calc checksum hash */
 #define	GB_NOSPARSE	0x0040		/* Do not instantiate holes */
 #define	GB_CVTENXIO	0x0080		/* Convert errors to ENXIO */
 
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
 extern long	maxswzone;		/* Max KVA for swap structures */
 extern long	maxbcache;		/* Max KVA for buffer cache */
 extern int	maxbcachebuf;		/* Max buffer cache block size */
 extern long	runningbufspace;
 extern long	hibufspace;
 extern int	dirtybufthresh;
 extern int	bdwriteskip;
 extern int	dirtybufferflushes;
 extern int	altbufferflushes;
 extern int	nswbuf;			/* Number of swap I/O buffer headers. */
 extern caddr_t	unmapped_buf;	/* Data address for unmapped buffers. */
 
 static inline int
 buf_mapped(struct buf *bp)
 {
 
 	return (bp->b_data != unmapped_buf);
 }
 
 void	runningbufwakeup(struct buf *);
 void	waitrunningbufspace(void);
 caddr_t	kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
 void	bufinit(void);
 void	bufshutdown(int);
 void	bdata2bio(struct buf *bp, struct bio *bip);
 void	bwillwrite(void);
 int	buf_dirty_count_severe(void);
 void	bremfree(struct buf *);
 void	bremfreef(struct buf *);	/* XXX Force bremfree, only for nfs. */
 #define bread(vp, blkno, size, cred, bpp) \
 	    breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, 0, \
 		NULL, bpp)
 #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \
 	    breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, \
 		gbflags, NULL, bpp)
 #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \
 	    breadn_flags(vp, blkno, blkno, size, rablkno, rabsize, cnt, cred, \
 		0, NULL, bpp)
 int	breadn_flags(struct vnode *, daddr_t, daddr_t, int, daddr_t *, int *, 
 	    int, struct ucred *, int, void (*)(struct buf *), struct buf **);
 void	bdwrite(struct buf *);
 void	bawrite(struct buf *);
 void	babarrierwrite(struct buf *);
 int	bbarrierwrite(struct buf *);
 void	bdirty(struct buf *);
 void	bundirty(struct buf *);
 void	bufstrategy(struct bufobj *, struct buf *);
 void	brelse(struct buf *);
 void	bqrelse(struct buf *);
 int	vfs_bio_awrite(struct buf *);
 void	vfs_busy_pages_acquire(struct buf *bp);
 void	vfs_busy_pages_release(struct buf *bp);
 struct buf *incore(struct bufobj *, daddr_t);
 bool	inmem(struct vnode *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *gbincore_unlocked(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
 int	getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size,
 	    int slpflag, int slptimeo, int flags, struct buf **bpp);
 struct buf *geteblk(int, int);
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
 void	bd_speedup(void);
 
 extern uma_zone_t pbuf_zone;
 uma_zone_t pbuf_zsecond_create(const char *name, int max);
 
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, int, struct buf **);
 int	cluster_wbuild(struct vnode *, long, daddr_t, int, int);
 void	cluster_write(struct vnode *, struct buf *, u_quad_t, int, int);
 void	vfs_bio_brelse(struct buf *bp, int ioflags);
 void	vfs_bio_bzero_buf(struct buf *bp, int base, int size);
 void	vfs_bio_clrbuf(struct buf *);
 void	vfs_bio_set_flags(struct buf *bp, int ioflags);
 void	vfs_bio_set_valid(struct buf *, int base, int size);
 void	vfs_busy_pages(struct buf *, int clear_modify);
 void	vfs_unbusy_pages(struct buf *);
-int	vmapbuf(struct buf *, int);
+int	vmapbuf(struct buf *, void *, size_t, int);
 void	vunmapbuf(struct buf *);
 void	brelvp(struct buf *);
 void	bgetvp(struct vnode *, struct buf *);
 void	pbgetbo(struct bufobj *bo, struct buf *bp);
 void	pbgetvp(struct vnode *, struct buf *);
 void	pbrelbo(struct buf *);
 void	pbrelvp(struct buf *);
 int	allocbuf(struct buf *bp, int size);
 void	reassignbuf(struct buf *);
 void	bwait(struct buf *, u_char, const char *);
 void	bdone(struct buf *);
 
 typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t);
 typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t);
 int	vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count,
 	    int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
 	    vbg_get_blksize_t get_blksize);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF_H_ */
Index: head/sys/ufs/ffs/ffs_rawread.c
===================================================================
--- head/sys/ufs/ffs/ffs_rawread.c	(revision 366910)
+++ head/sys/ufs/ffs/ffs_rawread.c	(revision 366911)
@@ -1,471 +1,468 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2003 Tor Egge
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/filio.h>
 #include <sys/ttycom.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/rwlock.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 static int ffs_rawread_readahead(struct vnode *vp,
 				 caddr_t udata,
 				 off_t offset,
 				 size_t len,
 				 struct thread *td,
 				 struct buf *bp);
 static int ffs_rawread_main(struct vnode *vp,
 			    struct uio *uio);
 
 static int ffs_rawread_sync(struct vnode *vp);
 
 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 
 SYSCTL_DECL(_vfs_ffs);
 
 static uma_zone_t ffsraw_pbuf_zone;
 
 static int allowrawread = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
 	   "Flag to enable raw reads");
 
 static int rawreadahead = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
 	   "Flag to enable readahead for long raw reads");
 
 static void
 ffs_rawread_setup(void *arg __unused)
 {
 
 	ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf",
 	    (nswbuf > 100 ) ?  (nswbuf - (nswbuf >> 4)) : nswbuf - 8);
 }
 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);
 
 static int
 ffs_rawread_sync(struct vnode *vp)
 {
 	int error;
 	int upgraded;
 	struct bufobj *bo;
 	struct mount *mp;
 	vm_object_t obj;
 
 	/* Check for dirty mmap, pending writes and dirty buffers */
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	VI_LOCK(vp);
 	if (bo->bo_numoutput > 0 ||
 	    bo->bo_dirty.bv_cnt > 0 ||
 	    ((obj = vp->v_object) != NULL &&
 	     vm_object_mightbedirty(obj))) {
 		VI_UNLOCK(vp);
 		BO_UNLOCK(bo);
 		
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 				upgraded = 1;
 			else
 				upgraded = 0;
 			VOP_UNLOCK(vp);
 			(void) vn_start_write(vp, &mp, V_WAIT);
 			VOP_LOCK(vp, LK_EXCLUSIVE);
 		} else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			upgraded = 1;
 			/* Upgrade to exclusive lock, this might block */
 			VOP_LOCK(vp, LK_UPGRADE);
 		} else
 			upgraded = 0;
 			
 		
 		VI_LOCK(vp);
 		/* Check if vnode was reclaimed while unlocked. */
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			if (upgraded != 0)
 				VOP_LOCK(vp, LK_DOWNGRADE);
 			vn_finished_write(mp);
 			return (EIO);
 		}
 		/* Attempt to msync mmap() regions to clean dirty mmap */ 
 		if ((obj = vp->v_object) != NULL &&
 		    vm_object_mightbedirty(obj)) {
 			VI_UNLOCK(vp);
 			VM_OBJECT_WLOCK(obj);
 			vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_WUNLOCK(obj);
 		} else
 			VI_UNLOCK(vp);
 
 		/* Wait for pending writes to complete */
 		BO_LOCK(bo);
 		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
 		if (error != 0) {
 			/* XXX: can't happen with a zero timeout ??? */
 			BO_UNLOCK(bo);
 			if (upgraded != 0)
 				VOP_LOCK(vp, LK_DOWNGRADE);
 			vn_finished_write(mp);
 			return (error);
 		}
 		/* Flush dirty buffers */
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
 			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
 				if (upgraded != 0)
 					VOP_LOCK(vp, LK_DOWNGRADE);
 				vn_finished_write(mp);
 				return (error);
 			}
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("ffs_rawread_sync: dirty bufs");
 		}
 		BO_UNLOCK(bo);
 		if (upgraded != 0)
 			VOP_LOCK(vp, LK_DOWNGRADE);
 		vn_finished_write(mp);
 	} else {
 		VI_UNLOCK(vp);
 		BO_UNLOCK(bo);
 	}
 	return 0;
 }
 
 static int
 ffs_rawread_readahead(struct vnode *vp,
 		      caddr_t udata,
 		      off_t offset,
 		      size_t len,
 		      struct thread *td,
 		      struct buf *bp)
 {
 	int error;
 	u_int iolen;
 	off_t blockno;
 	int blockoff;
 	int bsize;
 	struct vnode *dp;
 	int bforwards;
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	ip = VTOI(vp);
 	dp = ITODEVVP(ip);
 
 	iolen = ((vm_offset_t) udata) & PAGE_MASK;
 	bp->b_bcount = len;
 	if (bp->b_bcount + iolen > bp->b_kvasize) {
 		bp->b_bcount = bp->b_kvasize;
 		if (iolen != 0)
 			bp->b_bcount -= PAGE_SIZE;
 	}
 	bp->b_flags = 0;	/* XXX necessary ? */
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = bdone;
-	bp->b_data = udata;
 	blockno = offset / bsize;
 	blockoff = (offset % bsize) / DEV_BSIZE;
 	if ((daddr_t) blockno != blockno) {
 		return EINVAL; /* blockno overflow */
 	}
 
 	bp->b_lblkno = bp->b_blkno = blockno;
 
 	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
 	if (error != 0)
 		return error;
 	if (blkno == -1) {
 		/* Fill holes with NULs to preserve semantics */
 		
 		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
 			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
-		bp->b_bufsize = bp->b_bcount;
 		
-		if (vmapbuf(bp, 1) < 0)
+		if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
 			return EFAULT;
 		
 		maybe_yield();
 		bzero(bp->b_data, bp->b_bufsize);
 
 		/* Mark operation completed (similar to bufdone()) */
 
 		bp->b_resid = 0;
 		bp->b_flags |= B_DONE;
 		return 0;
 	}
 	bp->b_blkno = blkno + blockoff;
 	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;
 
 	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
 		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
-	bp->b_bufsize = bp->b_bcount;
 
-	if (vmapbuf(bp, 1) < 0)
+	if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
 		return EFAULT;
 
 	BO_STRATEGY(&dp->v_bufobj, bp);
 	return 0;
 }
 
 static int
 ffs_rawread_main(struct vnode *vp,
 		 struct uio *uio)
 {
 	int error, nerror;
 	struct buf *bp, *nbp, *tbp;
 	u_int iolen;
 	caddr_t udata;
 	long resid;
 	off_t offset;
 	struct thread *td;
 
 	td = uio->uio_td ? uio->uio_td : curthread;
 	udata = uio->uio_iov->iov_base;
 	resid = uio->uio_resid;
 	offset = uio->uio_offset;
 
 	/*
 	 * keep the process from being swapped
 	 */
 	PHOLD(td->td_proc);
 
 	error = 0;
 	nerror = 0;
 
 	bp = NULL;
 	nbp = NULL;
 
 	while (resid > 0) {
 		
 		if (bp == NULL) { /* Setup first read */
 			bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK);
 			pbgetvp(vp, bp);
 			error = ffs_rawread_readahead(vp, udata, offset,
 						     resid, td, bp);
 			if (error != 0)
 				break;
 			
 			if (resid > bp->b_bufsize) { /* Setup fist readahead */
 				if (rawreadahead != 0) 
 					nbp = uma_zalloc(ffsraw_pbuf_zone,
 					    M_NOWAIT);
 				else
 					nbp = NULL;
 				if (nbp != NULL) {
 					pbgetvp(vp, nbp);
 					
 					nerror = ffs_rawread_readahead(vp, 
 								       udata +
 								       bp->b_bufsize,
 								       offset +
 								       bp->b_bufsize,
 								       resid -
 								       bp->b_bufsize,
 								       td,
 								       nbp);
 					if (nerror) {
 						pbrelvp(nbp);
 						uma_zfree(ffsraw_pbuf_zone,
 						    nbp);
 						nbp = NULL;
 					}
 				}
 			}
 		}
 		
 		bwait(bp, PRIBIO, "rawrd");
 		vunmapbuf(bp);
 		
 		iolen = bp->b_bcount - bp->b_resid;
 		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
 			nerror = 0;	/* Ignore possible beyond EOF error */
 			break; /* EOF */
 		}
 		
 		if ((bp->b_ioflags & BIO_ERROR) != 0) {
 			error = bp->b_error;
 			break;
 		}
 		resid -= iolen;
 		udata += iolen;
 		offset += iolen;
 		if (iolen < bp->b_bufsize) {
 			/* Incomplete read.  Try to read remaining part */
 			error = ffs_rawread_readahead(vp,
 						      udata,
 						      offset,
 						      bp->b_bufsize - iolen,
 						      td,
 						      bp);
 			if (error != 0)
 				break;
 		} else if (nbp != NULL) { /* Complete read with readahead */
 			
 			tbp = bp;
 			bp = nbp;
 			nbp = tbp;
 			
 			if (resid <= bp->b_bufsize) { /* No more readaheads */
 				pbrelvp(nbp);
 				uma_zfree(ffsraw_pbuf_zone, nbp);
 				nbp = NULL;
 			} else { /* Setup next readahead */
 				nerror = ffs_rawread_readahead(vp,
 							       udata +
 							       bp->b_bufsize,
 							       offset +
 							       bp->b_bufsize,
 							       resid -
 							       bp->b_bufsize,
 							       td,
 							       nbp);
 				if (nerror != 0) {
 					pbrelvp(nbp);
 					uma_zfree(ffsraw_pbuf_zone, nbp);
 					nbp = NULL;
 				}
 			}
 		} else if (nerror != 0) {/* Deferred Readahead error */
 			break;		
 		}  else if (resid > 0) { /* More to read, no readahead */
 			error = ffs_rawread_readahead(vp, udata, offset,
 						      resid, td, bp);
 			if (error != 0)
 				break;
 		}
 	}
 
 	if (bp != NULL) {
 		pbrelvp(bp);
 		uma_zfree(ffsraw_pbuf_zone, bp);
 	}
 	if (nbp != NULL) {			/* Run down readahead buffer */
 		bwait(nbp, PRIBIO, "rawrd");
 		vunmapbuf(nbp);
 		pbrelvp(nbp);
 		uma_zfree(ffsraw_pbuf_zone, nbp);
 	}
 
 	if (error == 0)
 		error = nerror;
 	PRELE(td->td_proc);
 	uio->uio_iov->iov_base = udata;
 	uio->uio_resid = resid;
 	uio->uio_offset = offset;
 	return error;
 }
 
 int
 ffs_rawread(struct vnode *vp,
 	    struct uio *uio,
 	    int *workdone)
 {
 	if (allowrawread != 0 &&
 	    uio->uio_iovcnt == 1 && 
 	    uio->uio_segflg == UIO_USERSPACE &&
 	    uio->uio_resid == uio->uio_iov->iov_len &&
 	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
 	     TDP_DEADLKTREAT) == 0) {
 		int secsize;		/* Media sector size */
 		off_t filebytes;	/* Bytes left of file */
 		int blockbytes;		/* Bytes left of file in full blocks */
 		int partialbytes;	/* Bytes in last partial block */
 		int skipbytes;		/* Bytes not to read in ffs_rawread */
 		struct inode *ip;
 		int error;
 		
 
 		/* Only handle sector aligned reads */
 		ip = VTOI(vp);
 		secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
 		if ((uio->uio_offset & (secsize - 1)) == 0 &&
 		    (uio->uio_resid & (secsize - 1)) == 0) {
 			
 			/* Sync dirty pages and buffers if needed */
 			error = ffs_rawread_sync(vp);
 			if (error != 0)
 				return error;
 			
 			/* Check for end of file */
 			if (ip->i_size > uio->uio_offset) {
 				filebytes = ip->i_size - uio->uio_offset;
 
 				/* No special eof handling needed ? */
 				if (uio->uio_resid <= filebytes) {
 					*workdone = 1;
 					return ffs_rawread_main(vp, uio);
 				}
 				
 				partialbytes = ((unsigned int) ip->i_size) %
 				    ITOFS(ip)->fs_bsize;
 				blockbytes = (int) filebytes - partialbytes;
 				if (blockbytes > 0) {
 					skipbytes = uio->uio_resid -
 						blockbytes;
 					uio->uio_resid = blockbytes;
 					error = ffs_rawread_main(vp, uio);
 					uio->uio_resid += skipbytes;
 					if (error != 0)
 						return error;
 					/* Read remaining part using buffer */
 				}
 			}
 		}
 	}
 	*workdone = 0;
 	return 0;
 }