Index: head/sys/amd64/conf/GENERIC
===================================================================
--- head/sys/amd64/conf/GENERIC	(revision 308154)
+++ head/sys/amd64/conf/GENERIC	(revision 308155)
@@ -1,369 +1,371 @@
 #
 # GENERIC -- Generic kernel configuration file for FreeBSD/amd64
 #
 # For more information on this file, please read the config(5) manual page,
 # and/or the handbook section on Kernel Configuration Files:
 #
 #    http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
 #
 # The handbook is also available locally in /usr/share/doc/handbook
 # if you've installed the doc distribution, otherwise always see the
 # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
 # latest information.
 #
 # An exhaustive list of options and more detailed explanations of the
 # device lines is also present in the ../../conf/NOTES and NOTES files.
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
 # $FreeBSD$
 
 cpu		HAMMER
 ident		GENERIC
 
 makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
 makeoptions	WITH_CTF=1		# Run ctfconvert(1) for DTrace support
 
 options 	SCHED_ULE		# ULE scheduler
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
 options 	IPSEC			# IP (v4/v6) security
 options 	TCP_OFFLOAD		# TCP offload
 options 	TCP_HHOOK		# hhook(9) framework for TCP
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
 options 	UFS_ACL			# Support for access control lists
 options 	UFS_DIRHASH		# Improve performance on big directories
 options 	UFS_GJOURNAL		# Enable gjournal-based UFS journaling
 options 	QUOTA			# Enable disk quotas for UFS
 options 	MD_ROOT			# MD is a potential root device
 options 	NFSCL			# Network Filesystem Client
 options 	NFSD			# Network Filesystem Server
 options 	NFSLOCKD		# Network Lock Manager
 options 	NFS_ROOT		# NFS usable as /, requires NFSCL
 options 	MSDOSFS			# MSDOS Filesystem
 options 	CD9660			# ISO 9660 Filesystem
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_PART_GPT		# GUID Partition Tables.
 options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD32	# Compatible with i386 binaries
 options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
 options 	COMPAT_FREEBSD5		# Compatible with FreeBSD5
 options 	COMPAT_FREEBSD6		# Compatible with FreeBSD6
 options 	COMPAT_FREEBSD7		# Compatible with FreeBSD7
 options 	COMPAT_FREEBSD9		# Compatible with FreeBSD9
 options 	COMPAT_FREEBSD10	# Compatible with FreeBSD10
 options 	SCSI_DELAY=5000		# Delay (in ms) before probing SCSI
 options 	KTRACE			# ktrace(1) support
 options 	STACK			# stack(9) support
 options 	SYSVSHM			# SYSV-style shared memory
 options 	SYSVMSG			# SYSV-style message queues
 options 	SYSVSEM			# SYSV-style semaphores
 options 	_KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
 options 	PRINTF_BUFR_SIZE=128	# Prevent printf output being interspersed.
 options 	KBD_INSTALL_CDEV	# install a CDEV entry in /dev
 options 	HWPMC_HOOKS		# Necessary kernel hooks for hwpmc(4)
 options 	AUDIT			# Security event auditing
 options 	CAPABILITY_MODE		# Capsicum capability mode
 options 	CAPABILITIES		# Capsicum capabilities
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_FRAME		# Ensure frames are compiled in
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
 options 	DDB_CTF			# Kernel ELF linker loads CTF data
 options 	INCLUDE_CONFIG_FILE	# Include this file in kernel
 options 	RACCT			# Resource accounting framework
 options 	RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default
 options 	RCTL			# Resource limits
 
 # Debugging support.  Always need this:
 options 	KDB			# Enable kernel debugger support.
 options 	KDB_TRACE		# Print a stack trace for a panic.
 # For full debugger support use (turn off in stable branch):
+options 	BUF_TRACKING		# Track buffer history
 options 	DDB			# Support DDB.
+options 	FULL_BUF_TRACKING	# Track more buffer history
 options 	GDB			# Support remote GDB.
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
 options 	WITNESS			# Enable checks to detect deadlocks and cycles
 options 	WITNESS_SKIPSPIN	# Don't run witness on spinlocks for speed
 options 	MALLOC_DEBUG_MAXZONES=8	# Separate malloc(9) zones
 
 # Make an SMP-capable kernel by default
 options 	SMP			# Symmetric MultiProcessor Kernel
 options 	DEVICE_NUMA		# I/O Device Affinity
 
 # CPU frequency control
 device		cpufreq
 
 # Bus support.
 device		acpi
 options 	ACPI_DMAR
 device		pci
 options 	PCI_HP			# PCI-Express native HotPlug
 options		PCI_IOV			# PCI SR-IOV support
 
 # Floppy drives
 device		fdc
 
 # ATA controllers
 device		ahci			# AHCI-compatible SATA controllers
 device		ata			# Legacy ATA/SATA controllers
 device		mvs			# Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA
 device		siis			# SiliconImage SiI3124/SiI3132/SiI3531 SATA
 
 # SCSI Controllers
 device		ahc			# AHA2940 and onboard AIC7xxx devices
 options 	AHC_REG_PRETTY_PRINT	# Print register bitfields in debug
 					# output.  Adds ~128k to driver.
 device		ahd			# AHA39320/29320 and onboard AIC79xx devices
 options 	AHD_REG_PRETTY_PRINT	# Print register bitfields in debug
 					# output.  Adds ~215k to driver.
 device		esp			# AMD Am53C974 (Tekram DC-390(T))
 device		hptiop			# Highpoint RocketRaid 3xxx series
 device		isp			# Qlogic family
 #device		ispfw			# Firmware for QLogic HBAs- normally a module
 device		mpt			# LSI-Logic MPT-Fusion
 device		mps			# LSI-Logic MPT-Fusion 2
 device		mpr			# LSI-Logic MPT-Fusion 3
 #device		ncr			# NCR/Symbios Logic
 device		sym			# NCR/Symbios Logic (newer chipsets + those of `ncr')
 device		trm			# Tekram DC395U/UW/F DC315U adapters
 
 device		adv			# Advansys SCSI adapters
 device		adw			# Advansys wide SCSI adapters
 device		aic			# Adaptec 15[012]x SCSI adapters, AIC-6[23]60.
 device		bt			# Buslogic/Mylex MultiMaster SCSI adapters
 device		isci			# Intel C600 SAS controller
 
 # ATA/SCSI peripherals
 device		scbus			# SCSI bus (required for ATA/SCSI)
 device		ch			# SCSI media changers
 device		da			# Direct Access (disks)
 device		sa			# Sequential Access (tape etc)
 device		cd			# CD
 device		pass			# Passthrough device (direct ATA/SCSI access)
 device		ses			# Enclosure Services (SES and SAF-TE)
 #device		ctl			# CAM Target Layer
 
 # RAID controllers interfaced to the SCSI subsystem
 device		amr			# AMI MegaRAID
 device		arcmsr			# Areca SATA II RAID
 device		ciss			# Compaq Smart RAID 5*
 device		dpt			# DPT Smartcache III, IV - See NOTES for options
 device		hptmv			# Highpoint RocketRAID 182x
 device		hptnr			# Highpoint DC7280, R750
 device		hptrr			# Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx
 device		hpt27xx			# Highpoint RocketRAID 27xx
 device		iir			# Intel Integrated RAID
 device		ips			# IBM (Adaptec) ServeRAID
 device		mly			# Mylex AcceleRAID/eXtremeRAID
 device		twa			# 3ware 9000 series PATA/SATA RAID
 device		tws			# LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
 
 # RAID controllers
 device		aac			# Adaptec FSA RAID
 device		aacp			# SCSI passthrough for aac (requires CAM)
 device		aacraid			# Adaptec by PMC RAID
 device		ida			# Compaq Smart RAID
 device		mfi			# LSI MegaRAID SAS
 device		mlx			# Mylex DAC960 family
 device		mrsas			# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 device		pmspcv			# PMC-Sierra SAS/SATA Controller driver
 #XXX pointer/int warnings
 #device		pst			# Promise Supertrak SX6000
 device		twe			# 3ware ATA RAID
 
 # NVM Express (NVMe) support
 device		nvme			# base NVMe driver
 device		nvd			# expose NVMe namespaces as disks, depends on nvme
 
 # atkbdc0 controls both the keyboard and the PS/2 mouse
 device		atkbdc			# AT keyboard controller
 device		atkbd			# AT keyboard
 device		psm			# PS/2 mouse
 
 device		kbdmux			# keyboard multiplexer
 
 device		vga			# VGA video card driver
 options 	VESA			# Add support for VESA BIOS Extensions (VBE)
 
 device		splash			# Splash screen and screen saver support
 
 # syscons is the default console driver, resembling an SCO console
 device		sc
 options 	SC_PIXEL_MODE		# add support for the raster text mode
 
 # vt is the new video console driver
 device		vt
 device		vt_vga
 device		vt_efifb
 
 device		agp			# support several AGP chipsets
 
 # PCCARD (PCMCIA) support
 # PCMCIA and cardbus bridge support
 device		cbb			# cardbus (yenta) bridge
 device		pccard			# PC Card (16-bit) bus
 device		cardbus			# CardBus (32-bit) bus
 
 # Serial (COM) ports
 device		uart			# Generic UART driver
 
 # Parallel port
 device		ppc
 device		ppbus			# Parallel port bus (required)
 device		lpt			# Printer
 device		ppi			# Parallel port interface device
 #device		vpo			# Requires scbus and da
 
 device		puc			# Multi I/O cards and multi-channel UARTs
 
 # PCI Ethernet NICs.
 device		bxe			# Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE
 device		de			# DEC/Intel DC21x4x (``Tulip'')
 device		em			# Intel PRO/1000 Gigabit Ethernet Family
 device		igb			# Intel PRO/1000 PCIE Server Gigabit Family
 device		ix			# Intel PRO/10GbE PCIE PF Ethernet
 device		ixv			# Intel PRO/10GbE PCIE VF Ethernet
 device		ixl			# Intel XL710 40Gbe PCIE Ethernet
 device		ixlv			# Intel XL710 40Gbe VF PCIE Ethernet
 device		le			# AMD Am7900 LANCE and Am79C9xx PCnet
 device		ti			# Alteon Networks Tigon I/II gigabit Ethernet
 device		txp			# 3Com 3cR990 (``Typhoon'')
 device		vx			# 3Com 3c590, 3c595 (``Vortex'')
 
 # PCI Ethernet NICs that use the common MII bus controller code.
 # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs!
 device		miibus			# MII bus support
 device		ae			# Attansic/Atheros L2 FastEthernet
 device		age			# Attansic/Atheros L1 Gigabit Ethernet
 device		alc			# Atheros AR8131/AR8132 Ethernet
 device		ale			# Atheros AR8121/AR8113/AR8114 Ethernet
 device		bce			# Broadcom BCM5706/BCM5708 Gigabit Ethernet
 device		bfe			# Broadcom BCM440x 10/100 Ethernet
 device		bge			# Broadcom BCM570xx Gigabit Ethernet
 device		cas			# Sun Cassini/Cassini+ and NS DP83065 Saturn
 device		dc			# DEC/Intel 21143 and various workalikes
 device		et			# Agere ET1310 10/100/Gigabit Ethernet
 device		fxp			# Intel EtherExpress PRO/100B (82557, 82558)
 device		gem			# Sun GEM/Sun ERI/Apple GMAC
 device		hme			# Sun HME (Happy Meal Ethernet)
 device		jme			# JMicron JMC250 Gigabit/JMC260 Fast Ethernet
 device		lge			# Level 1 LXT1001 gigabit Ethernet
 device		msk			# Marvell/SysKonnect Yukon II Gigabit Ethernet
 device		nfe			# nVidia nForce MCP on-board Ethernet
 device		nge			# NatSemi DP83820 gigabit Ethernet
 device		pcn			# AMD Am79C97x PCI 10/100 (precedence over 'le')
 device		re			# RealTek 8139C+/8169/8169S/8110S
 device		rl			# RealTek 8129/8139
 device		sf			# Adaptec AIC-6915 (``Starfire'')
 device		sge			# Silicon Integrated Systems SiS190/191
 device		sis			# Silicon Integrated Systems SiS 900/SiS 7016
 device		sk			# SysKonnect SK-984x & SK-982x gigabit Ethernet
 device		ste			# Sundance ST201 (D-Link DFE-550TX)
 device		stge			# Sundance/Tamarack TC9021 gigabit Ethernet
 device		tl			# Texas Instruments ThunderLAN
 device		tx			# SMC EtherPower II (83c170 ``EPIC'')
 device		vge			# VIA VT612x gigabit Ethernet
 device		vr			# VIA Rhine, Rhine II
 device		wb			# Winbond W89C840F
 device		xl			# 3Com 3c90x (``Boomerang'', ``Cyclone'')
 
 # Wireless NICs
 device		wlan			# 802.11 support
 options 	IEEE80211_DEBUG		# enable debug msgs
 options 	IEEE80211_AMPDU_AGE	# age frames in AMPDU reorder q's
 options 	IEEE80211_SUPPORT_MESH	# enable 802.11s draft support
 device		wlan_wep		# 802.11 WEP support
 device		wlan_ccmp		# 802.11 CCMP support
 device		wlan_tkip		# 802.11 TKIP support
 device		wlan_amrr		# AMRR transmit rate control algorithm
 device		an			# Aironet 4500/4800 802.11 wireless NICs.
 device		ath			# Atheros NICs
 device		ath_pci			# Atheros pci/cardbus glue
 device		ath_hal			# pci/cardbus chip support
 options 	AH_SUPPORT_AR5416	# enable AR5416 tx/rx descriptors
 options 	AH_AR5416_INTERRUPT_MITIGATION # AR5416 interrupt mitigation
 options 	ATH_ENABLE_11N		# Enable 802.11n support for AR5416 and later
 device		ath_rate_sample		# SampleRate tx rate control for ath
 #device		bwi			# Broadcom BCM430x/BCM431x wireless NICs.
 #device		bwn			# Broadcom BCM43xx wireless NICs.
 device		ipw			# Intel 2100 wireless NICs.
 device		iwi			# Intel 2200BG/2225BG/2915ABG wireless NICs.
 device		iwn			# Intel 4965/1000/5000/6000 wireless NICs.
 device		malo			# Marvell Libertas wireless NICs.
 device		mwl			# Marvell 88W8363 802.11n wireless NICs.
 device		ral			# Ralink Technology RT2500 wireless NICs.
 device		wi			# WaveLAN/Intersil/Symbol 802.11 wireless NICs.
 device		wpi			# Intel 3945ABG wireless NICs.
 
 # Pseudo devices.
 device		loop			# Network loopback
 device		random			# Entropy device
 device		padlock_rng		# VIA Padlock RNG
 device		rdrand_rng		# Intel Bull Mountain RNG
 device		ether			# Ethernet support
 device		vlan			# 802.1Q VLAN support
 device		tun			# Packet tunnel.
 device		md			# Memory "disks"
 device		gif			# IPv6 and IPv4 tunneling
 device		firmware		# firmware assist module
 
 # The `bpf' device enables the Berkeley Packet Filter.
 # Be aware of the administrative consequences of enabling this!
 # Note that 'bpf' is required for DHCP.
 device		bpf			# Berkeley packet filter
 
 # USB support
 options 	USB_DEBUG		# enable debug msgs
 device		uhci			# UHCI PCI->USB interface
 device		ohci			# OHCI PCI->USB interface
 device		ehci			# EHCI PCI->USB interface (USB 2.0)
 device		xhci			# XHCI PCI->USB interface (USB 3.0)
 device		usb			# USB Bus (required)
 device		ukbd			# Keyboard
 device		umass			# Disks/Mass storage - Requires scbus and da
 
 # Sound support
 device		sound			# Generic sound driver (required)
 device		snd_cmi			# CMedia CMI8338/CMI8738
 device		snd_csa			# Crystal Semiconductor CS461x/428x
 device		snd_emu10kx		# Creative SoundBlaster Live! and Audigy
 device		snd_es137x		# Ensoniq AudioPCI ES137x
 device		snd_hda			# Intel High Definition Audio
 device		snd_ich			# Intel, NVidia and other ICH AC'97 Audio
 device		snd_via8233		# VIA VT8233x Audio
 
 # MMC/SD
 device		mmc			# MMC/SD bus
 device		mmcsd			# MMC/SD memory card
 device		sdhci			# Generic PCI SD Host Controller
 
 # VirtIO support
 device		virtio			# Generic VirtIO bus (required)
 device		virtio_pci		# VirtIO PCI device
 device		vtnet			# VirtIO Ethernet device
 device		virtio_blk		# VirtIO Block device
 device		virtio_scsi		# VirtIO SCSI device
 device		virtio_balloon		# VirtIO Memory Balloon device
 
 # Hyper-V drivers and enhancement support
 device		hyperv			# HyperV drivers 
 
 # Xen HVM Guest Optimizations
 # NOTE: XENHVM depends on xenpci.  They must be added or removed together.
 options 	XENHVM			# Xen HVM kernel infrastructure
 device		xenpci			# Xen HVM Hypervisor services driver
 
 # VMware support
 device		vmx			# VMware VMXNET3 Ethernet
 
 # Netmap provides direct access to TX/RX rings on supported NICs
 device		netmap			# netmap(4) support
 
 # The crypto framework is required by IPSEC
 device		crypto			# Required by IPSEC
Index: head/sys/amd64/conf/GENERIC-NODEBUG
===================================================================
--- head/sys/amd64/conf/GENERIC-NODEBUG	(revision 308154)
+++ head/sys/amd64/conf/GENERIC-NODEBUG	(revision 308155)
@@ -1,38 +1,40 @@
 #
 # GENERIC-NODEBUG -- WITNESS- and INVARIANTS-free kernel configuration file
 #		     for FreeBSD/amd64
 #
 # This configuration file removes several debugging options, including
 # WITNESS and INVARIANTS checking, which are known to have significant
 # performance impact on running systems.  When benchmarking new features
 # this kernel should be used instead of the standard GENERIC.
 # This kernel configuration should never appear outside of the HEAD
 # of the FreeBSD tree.
 #
 # For more information on this file, please read the config(5) manual page,
 # and/or the handbook section on Kernel Configuration Files:
 #
 #    http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
 #
 # The handbook is also available locally in /usr/share/doc/handbook
 # if you've installed the doc distribution, otherwise always see the
 # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
 # latest information.
 #
 # An exhaustive list of options and more detailed explanations of the
 # device lines is also present in the ../../conf/NOTES and NOTES files.
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
 # $FreeBSD$
 
 include GENERIC
 
 ident   GENERIC-NODEBUG
 
 nooptions       INVARIANTS
 nooptions       INVARIANT_SUPPORT
 nooptions       WITNESS
 nooptions       WITNESS_SKIPSPIN
+nooptions       BUF_TRACKING
 nooptions       DEADLKRES
+nooptions       FULL_BUF_TRACKING
 
Index: head/sys/cam/cam_ccb.h
===================================================================
--- head/sys/cam/cam_ccb.h	(revision 308154)
+++ head/sys/cam/cam_ccb.h	(revision 308155)
@@ -1,1435 +1,1441 @@
 /*-
  * Data structures and definitions for CAM Control Blocks (CCBs).
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _CAM_CAM_CCB_H
 #define _CAM_CAM_CCB_H 1
 
 #include <sys/queue.h>
 #include <sys/cdefs.h>
 #include <sys/time.h>
 #include <sys/limits.h>
 #ifndef _KERNEL
 #include <sys/callout.h>
 #endif
 #include <cam/cam_debug.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/ata/ata_all.h>
 #include <cam/nvme/nvme_all.h>
 
 /* General allocation length definitions for CCB structures */
 #define	IOCDBLEN	CAM_MAX_CDBLEN	/* Space for CDB bytes/pointer */
 #define	VUHBALEN	14		/* Vendor Unique HBA length */
 #define	SIM_IDLEN	16		/* ASCII string len for SIM ID */
 #define	HBA_IDLEN	16		/* ASCII string len for HBA ID */
 #define	DEV_IDLEN	16		/* ASCII string len for device names */
 #define CCB_PERIPH_PRIV_SIZE 	2	/* size of peripheral private area */
 #define CCB_SIM_PRIV_SIZE 	2	/* size of sim private area */
 
 /* Struct definitions for CAM control blocks */
 
 /* Common CCB header */
 /* CAM CCB flags */
 typedef enum {
 	CAM_CDB_POINTER		= 0x00000001,/* The CDB field is a pointer    */
 	CAM_QUEUE_ENABLE	= 0x00000002,/* SIM queue actions are enabled */
 	CAM_CDB_LINKED		= 0x00000004,/* CCB contains a linked CDB     */
 	CAM_NEGOTIATE		= 0x00000008,/*
 					      * Perform transport negotiation
 					      * with this command.
 					      */
 	CAM_DATA_ISPHYS		= 0x00000010,/* Data type with physical addrs */
 	CAM_DIS_AUTOSENSE	= 0x00000020,/* Disable autosense feature     */
 	CAM_DIR_BOTH		= 0x00000000,/* Data direction (00:IN/OUT)    */
 	CAM_DIR_IN		= 0x00000040,/* Data direction (01:DATA IN)   */
 	CAM_DIR_OUT		= 0x00000080,/* Data direction (10:DATA OUT)  */
 	CAM_DIR_NONE		= 0x000000C0,/* Data direction (11:no data)   */
 	CAM_DIR_MASK		= 0x000000C0,/* Data direction Mask	      */
 	CAM_DATA_VADDR		= 0x00000000,/* Data type (000:Virtual)       */
 	CAM_DATA_PADDR		= 0x00000010,/* Data type (001:Physical)      */
 	CAM_DATA_SG		= 0x00040000,/* Data type (010:sglist)        */
 	CAM_DATA_SG_PADDR	= 0x00040010,/* Data type (011:sglist phys)   */
 	CAM_DATA_BIO		= 0x00200000,/* Data type (100:bio)           */
 	CAM_DATA_MASK		= 0x00240010,/* Data type mask                */
 	CAM_SOFT_RST_OP		= 0x00000100,/* Use Soft reset alternative    */
 	CAM_ENG_SYNC		= 0x00000200,/* Flush resid bytes on complete */
 	CAM_DEV_QFRZDIS		= 0x00000400,/* Disable DEV Q freezing	      */
 	CAM_DEV_QFREEZE		= 0x00000800,/* Freeze DEV Q on execution     */
 	CAM_HIGH_POWER		= 0x00001000,/* Command takes a lot of power  */
 	CAM_SENSE_PTR		= 0x00002000,/* Sense data is a pointer	      */
 	CAM_SENSE_PHYS		= 0x00004000,/* Sense pointer is physical addr*/
 	CAM_TAG_ACTION_VALID	= 0x00008000,/* Use the tag action in this ccb*/
 	CAM_PASS_ERR_RECOVER	= 0x00010000,/* Pass driver does err. recovery*/
 	CAM_DIS_DISCONNECT	= 0x00020000,/* Disable disconnect	      */
 	CAM_MSG_BUF_PHYS	= 0x00080000,/* Message buffer ptr is physical*/
 	CAM_SNS_BUF_PHYS	= 0x00100000,/* Autosense data ptr is physical*/
 	CAM_CDB_PHYS		= 0x00400000,/* CDB pointer is physical	      */
 	CAM_ENG_SGLIST		= 0x00800000,/* SG list is for the HBA engine */
 
 /* Phase cognizant mode flags */
 	CAM_DIS_AUTOSRP		= 0x01000000,/* Disable autosave/restore ptrs */
 	CAM_DIS_AUTODISC	= 0x02000000,/* Disable auto disconnect	      */
 	CAM_TGT_CCB_AVAIL	= 0x04000000,/* Target CCB available	      */
 	CAM_TGT_PHASE_MODE	= 0x08000000,/* The SIM runs in phase mode    */
 	CAM_MSGB_VALID		= 0x10000000,/* Message buffer valid	      */
 	CAM_STATUS_VALID	= 0x20000000,/* Status buffer valid	      */
 	CAM_DATAB_VALID		= 0x40000000,/* Data buffer valid	      */
 
 /* Host target Mode flags (values alias the phase cognizant flags above) */
 	CAM_SEND_SENSE		= 0x08000000,/* Send sense data with status   */
 	CAM_TERM_IO		= 0x10000000,/* Terminate I/O Message sup.    */
 	CAM_DISCONNECT		= 0x20000000,/* Disconnects are mandatory     */
 	CAM_SEND_STATUS		= 0x40000000,/* Send status after data phase  */
 
 	CAM_UNLOCKED		= 0x80000000 /* Call callback without lock.   */
 } ccb_flags;
 
 typedef enum {	/* Extended CCB flags; presumably carried in ccb_hdr.xflags -- confirm */
 	CAM_USER_DATA_ADDR	= 0x00000002,/* Userspace data pointers */
 	CAM_SG_FORMAT_IOVEC	= 0x00000004,/* iovec instead of busdma S/G*/
 	CAM_UNMAPPED_BUF	= 0x00000008 /* use unmapped I/O */
 } ccb_xflags;
 
 /* XPT Opcodes for xpt_action */
 typedef enum {
 /* Function code flags are bits greater than 0xff */
 	XPT_FC_QUEUED		= 0x100,
 				/* Non-immediate function code */
 	XPT_FC_USER_CCB		= 0x200,
 	XPT_FC_XPT_ONLY		= 0x400,
 				/* Only for the transport layer device */
 	XPT_FC_DEV_QUEUED	= 0x800 | XPT_FC_QUEUED,
 				/* Passes through the device queues */
 /* Common function commands: 0x00->0x0F */
 	XPT_NOOP 		= 0x00,
 				/* Execute Nothing */
 	XPT_SCSI_IO		= 0x01 | XPT_FC_DEV_QUEUED,
 				/* Execute the requested I/O operation */
 	XPT_GDEV_TYPE		= 0x02,
 				/* Get type information for specified device */
 	XPT_GDEVLIST		= 0x03,
 				/* Get a list of peripheral devices */
 	XPT_PATH_INQ		= 0x04,
 				/* Path routing inquiry */
 	XPT_REL_SIMQ		= 0x05,
 				/* Release a frozen device queue */
 	XPT_SASYNC_CB		= 0x06,
 				/* Set Asynchronous Callback Parameters */
 	XPT_SDEV_TYPE		= 0x07,
 				/* Set device type information */
 	XPT_SCAN_BUS		= 0x08 | XPT_FC_QUEUED | XPT_FC_USER_CCB
 				       | XPT_FC_XPT_ONLY,
 				/* (Re)Scan the SCSI Bus */
 	XPT_DEV_MATCH		= 0x09 | XPT_FC_XPT_ONLY,
 				/* Get EDT entries matching the given pattern */
 	XPT_DEBUG		= 0x0a,
 				/* Turn on debugging for a bus, target or lun */
 	XPT_PATH_STATS		= 0x0b,
 				/* Path statistics (error counts, etc.) */
 	XPT_GDEV_STATS		= 0x0c,
 				/* Device statistics (error counts, etc.) */
 	XPT_DEV_ADVINFO		= 0x0e,
 				/* Get/Set Device advanced information */
 	XPT_ASYNC		= 0x0f | XPT_FC_QUEUED | XPT_FC_USER_CCB
 				       | XPT_FC_XPT_ONLY,
 				/* Asynchronous event */
 /* SCSI Control Functions: 0x10->0x1F */
 	XPT_ABORT		= 0x10,
 				/* Abort the specified CCB */
 	XPT_RESET_BUS		= 0x11 | XPT_FC_XPT_ONLY,
 				/* Reset the specified SCSI bus */
 	XPT_RESET_DEV		= 0x12 | XPT_FC_DEV_QUEUED,
 				/* Bus Device Reset the specified SCSI device */
 	XPT_TERM_IO		= 0x13,
 				/* Terminate the I/O process */
 	XPT_SCAN_LUN		= 0x14 | XPT_FC_QUEUED | XPT_FC_USER_CCB
 				       | XPT_FC_XPT_ONLY,
 				/* Scan Logical Unit */
 	XPT_GET_TRAN_SETTINGS	= 0x15,
 				/*
 				 * Get default/user transfer settings
 				 * for the target
 				 */
 	XPT_SET_TRAN_SETTINGS	= 0x16,
 				/*
 				 * Set transfer rate/width
 				 * negotiation settings
 				 */
 	XPT_CALC_GEOMETRY	= 0x17,
 				/*
 				 * Calculate the geometry parameters for
 				 * a device given the sector size and
 				 * volume size.
 				 */
 	XPT_ATA_IO		= 0x18 | XPT_FC_DEV_QUEUED,
 				/* Execute the requested ATA I/O operation */
 
 	XPT_GET_SIM_KNOB_OLD	= 0x18, /* Compat only; same base value as XPT_ATA_IO */
 
 	XPT_SET_SIM_KNOB	= 0x19,
 				/*
 				 * Set SIM specific knob values.
 				 */
 
 	XPT_GET_SIM_KNOB	= 0x1a,
 				/*
 				 * Get SIM specific knob values.
 				 */
 
 	XPT_SMP_IO		= 0x1b | XPT_FC_DEV_QUEUED,
 				/* Serial Management Protocol */
 
 	XPT_NVME_IO		= 0x1c | XPT_FC_DEV_QUEUED,
 				/* Execute the requested NVMe I/O operation */
 
 	XPT_MMCSD_IO		= 0x1d | XPT_FC_DEV_QUEUED,
 				/* Placeholder for MMC / SD / SDIO I/O stuff */
 
 	XPT_SCAN_TGT		= 0x1E | XPT_FC_QUEUED | XPT_FC_USER_CCB
 				       | XPT_FC_XPT_ONLY,
 				/* Scan Target */
 
 /* HBA engine commands 0x20->0x2F */
 	XPT_ENG_INQ		= 0x20 | XPT_FC_XPT_ONLY,
 				/* HBA engine feature inquiry */
 	XPT_ENG_EXEC		= 0x21 | XPT_FC_DEV_QUEUED,
 				/* HBA execute engine request */
 
 /* Target mode commands: 0x30->0x3F */
 	XPT_EN_LUN		= 0x30,
 				/* Enable LUN as a target */
 	XPT_TARGET_IO		= 0x31 | XPT_FC_DEV_QUEUED,
 				/* Execute target I/O request */
 	XPT_ACCEPT_TARGET_IO	= 0x32 | XPT_FC_QUEUED | XPT_FC_USER_CCB,
 				/* Accept Host Target Mode CDB */
 	XPT_CONT_TARGET_IO	= 0x33 | XPT_FC_DEV_QUEUED,
 				/* Continue Host Target I/O Connection */
 	XPT_IMMED_NOTIFY	= 0x34 | XPT_FC_QUEUED | XPT_FC_USER_CCB,
 				/* Notify Host Target driver of event (obsolete) */
 	XPT_NOTIFY_ACK		= 0x35,
 				/* Acknowledgement of event (obsolete) */
 	XPT_IMMEDIATE_NOTIFY	= 0x36 | XPT_FC_QUEUED | XPT_FC_USER_CCB,
 				/* Notify Host Target driver of event */
 	XPT_NOTIFY_ACKNOWLEDGE	= 0x37 | XPT_FC_QUEUED | XPT_FC_USER_CCB,
 				/* Acknowledgement of event */
 	XPT_REPROBE_LUN		= 0x38 | XPT_FC_QUEUED | XPT_FC_USER_CCB,
 				/* Query device capacity and notify GEOM */
 
 /* Vendor Unique codes: 0x80->0x8F */
 	XPT_VUNIQUE		= 0x80
 } xpt_opcode;
 
 #define XPT_FC_GROUP_MASK		0xF0	/* high nibble of the low opcode byte */
 #define XPT_FC_GROUP(op) ((op) & XPT_FC_GROUP_MASK)	/* yields one of XPT_FC_GROUP_* for the opcodes above */
 #define XPT_FC_GROUP_COMMON		0x00
 #define XPT_FC_GROUP_SCSI_CONTROL	0x10
 #define XPT_FC_GROUP_HBA_ENGINE		0x20
 #define XPT_FC_GROUP_TMODE		0x30
 #define XPT_FC_GROUP_VENDOR_UNIQUE	0x80
 
 #define XPT_FC_IS_DEV_QUEUED(ccb) 	\
     (((ccb)->ccb_h.func_code & XPT_FC_DEV_QUEUED) == XPT_FC_DEV_QUEUED) /* needs every DEV_QUEUED bit, which includes XPT_FC_QUEUED */
 #define XPT_FC_IS_QUEUED(ccb) 	\
     (((ccb)->ccb_h.func_code & XPT_FC_QUEUED) != 0) /* therefore also true for DEV_QUEUED opcodes */
 
 typedef enum {	/* Protocol identifiers, e.g. the type of ccb_getdev.protocol */
 	PROTO_UNKNOWN,
 	PROTO_UNSPECIFIED,
 	PROTO_SCSI,	/* Small Computer System Interface */
 	PROTO_ATA,	/* AT Attachment */
 	PROTO_ATAPI,	/* AT Attachment Packet Interface */
 	PROTO_SATAPM,	/* SATA Port Multiplier */
 	PROTO_SEMB,	/* SATA Enclosure Management Bridge */
 	PROTO_NVME,	/* NVM Express (NVMe) */
 } cam_proto;
 
 typedef enum {	/* Transport identifiers tested by the XPORT_IS_* macros */
 	XPORT_UNKNOWN,
 	XPORT_UNSPECIFIED,
 	XPORT_SPI,	/* SCSI Parallel Interface */
 	XPORT_FC,	/* Fibre Channel */
 	XPORT_SSA,	/* Serial Storage Architecture */
 	XPORT_USB,	/* Universal Serial Bus */
 	XPORT_PPB,	/* Parallel Port Bus */
 	XPORT_ATA,	/* AT Attachment */
 	XPORT_SAS,	/* Serial Attached SCSI */
 	XPORT_SATA,	/* Serial AT Attachment */
 	XPORT_ISCSI,	/* iSCSI */
 	XPORT_SRP,	/* SCSI RDMA Protocol */
 	XPORT_NVME,	/* NVMe over PCIe */
 } cam_xport;
 
 #define XPORT_IS_NVME(t)	((t) == XPORT_NVME)
 #define XPORT_IS_ATA(t)		((t) == XPORT_ATA || (t) == XPORT_SATA)
 #define XPORT_IS_SCSI(t)	((t) != XPORT_UNKNOWN && \
 				 (t) != XPORT_UNSPECIFIED && \
 				 !XPORT_IS_ATA(t) && !XPORT_IS_NVME(t)) /* i.e. every remaining transport counts as SCSI */
 #define XPORT_DEVSTAT_TYPE(t)	(XPORT_IS_ATA(t) ? DEVSTAT_TYPE_IF_IDE : \
 				 XPORT_IS_SCSI(t) ? DEVSTAT_TYPE_IF_SCSI : \
 				 DEVSTAT_TYPE_IF_OTHER) /* NVMe, unknown and unspecified end up here */
 
 #define PROTO_VERSION_UNKNOWN (UINT_MAX - 1)	/* sentinels sit at the top of the unsigned range */
 #define PROTO_VERSION_UNSPECIFIED UINT_MAX
 #define XPORT_VERSION_UNKNOWN (UINT_MAX - 1)	/* same two sentinel values as PROTO_VERSION_* */
 #define XPORT_VERSION_UNSPECIFIED UINT_MAX
 
 typedef union {	/* Linkage for a ccb_hdr; one member per <sys/queue.h> list flavour */
 	LIST_ENTRY(ccb_hdr) le;
 	SLIST_ENTRY(ccb_hdr) sle;
 	TAILQ_ENTRY(ccb_hdr) tqe;
 	STAILQ_ENTRY(ccb_hdr) stqe;
 } camq_entry;
 
 typedef union {	/* One private slot, viewable as a pointer, an integer or raw bytes */
 	void		*ptr;
 	u_long		field;
 	u_int8_t	bytes[sizeof(uintptr_t)];
 } ccb_priv_entry;
 
 typedef union {	/* Peripheral private area: CCB_PERIPH_PRIV_SIZE slots, also addressable bytewise */
 	ccb_priv_entry	entries[CCB_PERIPH_PRIV_SIZE];
 	u_int8_t	bytes[CCB_PERIPH_PRIV_SIZE * sizeof(ccb_priv_entry)];
 } ccb_ppriv_area;
 
 typedef union {	/* SIM private area: CCB_SIM_PRIV_SIZE slots, also addressable bytewise */
 	ccb_priv_entry	entries[CCB_SIM_PRIV_SIZE];
 	u_int8_t	bytes[CCB_SIM_PRIV_SIZE * sizeof(ccb_priv_entry)];
 } ccb_spriv_area;
 
 typedef struct {	/* Embedded in struct ccb_hdr as the 'qos' member */
 	struct timeval	*etime;	/* NOTE(review): meaning not visible in this header -- confirm */
 	uintptr_t	sim_data;
 	uintptr_t	periph_data;
 } ccb_qos_area;
 
 struct ccb_hdr {
 	cam_pinfo	pinfo;		/* Info for priority scheduling */
 	camq_entry	xpt_links;	/* For chaining in the XPT layer */	
 	camq_entry	sim_links;	/* For chaining in the SIM layer */	
 	camq_entry	periph_links;	/* For chaining in the type driver */
 	u_int32_t	retry_count;
 	void		(*cbfcnp)(struct cam_periph *, union ccb *);
 					/* Callback on completion function */
 	xpt_opcode	func_code;	/* XPT function code */
 	u_int32_t	status;		/* Status returned by CAM subsystem */
 	struct		cam_path *path;	/* Compiled path for this ccb */
 	path_id_t	path_id;	/* Path ID for the request */
 	target_id_t	target_id;	/* Target device ID */
 	lun_id_t	target_lun;	/* Target LUN number */
 	u_int32_t	flags;		/* ccb_flags */
 	u_int32_t	xflags;		/* Extended flags */
 	ccb_ppriv_area	periph_priv;	/* Peripheral private area */
 	ccb_spriv_area	sim_priv;	/* SIM private area */
 	ccb_qos_area	qos;
 	u_int32_t	timeout;	/* Hard timeout value in milliseconds */
 	struct timeval	softtimeout;	/* Soft timeout value in sec + usec */
 };
 
 /* Get Device Information CCB */
 struct ccb_getdev {
 	struct	  ccb_hdr ccb_h;
 	cam_proto protocol;	/* one of the PROTO_* values */
 	struct scsi_inquiry_data inq_data;
 	struct ata_params ident_data;
 	u_int8_t  serial_num[252];
 	u_int8_t  inq_flags;
 	u_int8_t  serial_num_len;	/* presumably valid bytes in serial_num[] -- confirm */
 	const struct nvme_controller_data	*nvme_cdata;
 	const struct nvme_namespace_data	*nvme_data;
 };
 
 /* Device Statistics CCB */
 struct ccb_getdevstats {
 	struct	ccb_hdr	ccb_h;
 	int	dev_openings;	/* Space left for more work on device*/	
 	int	dev_active;	/* Transactions running on the device */
 	int	allocated;	/* CCBs allocated for the device */
 	int	queued;		/* CCBs queued to be sent to the device */
 	int	held;		/*
 				 * CCBs held by peripheral drivers
 				 * for this device
 				 */
 	int	maxtags;	/*
 				 * Boundary conditions for number of
 				 * tagged operations
 				 */
 	int	mintags;	/* presumably the lower bound paired with maxtags -- confirm */
 	struct	timeval last_reset;	/* Time of last bus reset/loop init */
 };
 
 typedef enum {	/* Type of the 'status' member of struct ccb_getdevlist */
 	CAM_GDEVLIST_LAST_DEVICE,
 	CAM_GDEVLIST_LIST_CHANGED,
 	CAM_GDEVLIST_MORE_DEVS,
 	CAM_GDEVLIST_ERROR
 } ccb_getdevlist_status_e;
 
 struct ccb_getdevlist {
 	struct ccb_hdr		ccb_h;
 	char 			periph_name[DEV_IDLEN];
 	u_int32_t		unit_number;
 	unsigned int		generation;
 	u_int32_t		index;
 	ccb_getdevlist_status_e	status;	/* one of CAM_GDEVLIST_* */
 };
 
 typedef enum {
 	PERIPH_MATCH_NONE	= 0x000,
 	PERIPH_MATCH_PATH	= 0x001,
 	PERIPH_MATCH_TARGET	= 0x002,
 	PERIPH_MATCH_LUN	= 0x004,
 	PERIPH_MATCH_NAME	= 0x008,
 	PERIPH_MATCH_UNIT	= 0x010,
 	PERIPH_MATCH_ANY	= 0x01f
 } periph_pattern_flags;
 
 struct periph_match_pattern {
 	char			periph_name[DEV_IDLEN];
 	u_int32_t		unit_number;
 	path_id_t		path_id;
 	target_id_t		target_id;
 	lun_id_t		target_lun;
 	periph_pattern_flags	flags;
 };
 
 /*
  * Flag bits for the "flags" member of struct device_match_pattern.
  * Note that DEV_MATCH_ANY (0x00f) is the union of the PATH, TARGET, LUN
  * and INQUIRY bits only; it does not include DEV_MATCH_DEVID (0x010).
  */
 typedef enum {
 	DEV_MATCH_NONE		= 0x000,
 	DEV_MATCH_PATH		= 0x001,
 	DEV_MATCH_TARGET	= 0x002,
 	DEV_MATCH_LUN		= 0x004,
 	DEV_MATCH_INQUIRY	= 0x008,
 	DEV_MATCH_DEVID		= 0x010,
 	DEV_MATCH_ANY		= 0x00f
 } dev_pattern_flags;
 
 struct device_id_match_pattern {
 	uint8_t id_len;
 	uint8_t id[256];
 };
 
 struct device_match_pattern {
 	path_id_t					path_id;
 	target_id_t					target_id;
 	lun_id_t					target_lun;
 	dev_pattern_flags				flags;
 	union {
 		struct scsi_static_inquiry_pattern	inq_pat;
 		struct device_id_match_pattern		devid_pat;
 	} data;	
 };
 
 typedef enum {
 	BUS_MATCH_NONE		= 0x000,
 	BUS_MATCH_PATH		= 0x001,
 	BUS_MATCH_NAME		= 0x002,
 	BUS_MATCH_UNIT		= 0x004,
 	BUS_MATCH_BUS_ID	= 0x008,
 	BUS_MATCH_ANY		= 0x00f
 } bus_pattern_flags;
 
 struct bus_match_pattern {
 	path_id_t		path_id;
 	char			dev_name[DEV_IDLEN];
 	u_int32_t		unit_number;
 	u_int32_t		bus_id;
 	bus_pattern_flags	flags;
 };
 
 union match_pattern {
 	struct periph_match_pattern	periph_pattern;
 	struct device_match_pattern	device_pattern;
 	struct bus_match_pattern	bus_pattern;
 };
 
 typedef enum {
 	DEV_MATCH_PERIPH,
 	DEV_MATCH_DEVICE,
 	DEV_MATCH_BUS
 } dev_match_type;
 
 struct dev_match_pattern {
 	dev_match_type		type;
 	union match_pattern	pattern;
 };
 
 struct periph_match_result {
 	char			periph_name[DEV_IDLEN];
 	u_int32_t		unit_number;
 	path_id_t		path_id;
 	target_id_t		target_id;
 	lun_id_t		target_lun;
 };
 
 typedef enum {
 	DEV_RESULT_NOFLAG		= 0x00,
 	DEV_RESULT_UNCONFIGURED		= 0x01
 } dev_result_flags;
 
 struct device_match_result {
 	path_id_t			path_id;
 	target_id_t			target_id;
 	lun_id_t			target_lun;
 	cam_proto			protocol;
 	struct scsi_inquiry_data	inq_data;
 	struct ata_params		ident_data;
 	dev_result_flags		flags;
 };
 
 struct bus_match_result {
 	path_id_t	path_id;
 	char		dev_name[DEV_IDLEN];
 	u_int32_t	unit_number;
 	u_int32_t	bus_id;
 };
 
 union match_result {
 	struct periph_match_result	periph_result;
 	struct device_match_result	device_result;
 	struct bus_match_result		bus_result;
 };
 
 struct dev_match_result {
 	dev_match_type		type;
 	union match_result	result;
 };
 
 typedef enum {
 	CAM_DEV_MATCH_LAST,
 	CAM_DEV_MATCH_MORE,
 	CAM_DEV_MATCH_LIST_CHANGED,
 	CAM_DEV_MATCH_SIZE_ERROR,
 	CAM_DEV_MATCH_ERROR
 } ccb_dev_match_status;
 
 typedef enum {
 	CAM_DEV_POS_NONE	= 0x000,
 	CAM_DEV_POS_BUS		= 0x001,
 	CAM_DEV_POS_TARGET	= 0x002,
 	CAM_DEV_POS_DEVICE	= 0x004,
 	CAM_DEV_POS_PERIPH	= 0x008,
 	CAM_DEV_POS_PDPTR	= 0x010,
 	CAM_DEV_POS_TYPEMASK	= 0xf00,
 	CAM_DEV_POS_EDT		= 0x100,
 	CAM_DEV_POS_PDRV	= 0x200
 } dev_pos_type;
 
 struct ccb_dm_cookie {
 	void 	*bus;
 	void	*target;	
 	void	*device;
 	void	*periph;
 	void	*pdrv;
 };
 
 struct ccb_dev_position {
 	u_int			generations[4];
 #define	CAM_BUS_GENERATION	0x00
 #define CAM_TARGET_GENERATION	0x01
 #define CAM_DEV_GENERATION	0x02
 #define CAM_PERIPH_GENERATION	0x03
 	dev_pos_type		position_type;
 	struct ccb_dm_cookie	cookie;
 };
 
 struct ccb_dev_match {
 	struct ccb_hdr			ccb_h;
 	ccb_dev_match_status		status;
 	u_int32_t			num_patterns;
 	u_int32_t			pattern_buf_len;
 	struct dev_match_pattern	*patterns;
 	u_int32_t			num_matches;
 	u_int32_t			match_buf_len;
 	struct dev_match_result		*matches;
 	struct ccb_dev_position		pos;
 };
 
 /*
  * Definitions for the path inquiry CCB fields.
  */
 #define CAM_VERSION	0x19	/* Hex value for current version */
 
 typedef enum {
 	PI_MDP_ABLE	= 0x80,	/* Supports MDP message */
 	PI_WIDE_32	= 0x40,	/* Supports 32 bit wide SCSI */
 	PI_WIDE_16	= 0x20, /* Supports 16 bit wide SCSI */
 	PI_SDTR_ABLE	= 0x10,	/* Supports SDTR message */
 	PI_LINKED_CDB	= 0x08, /* Supports linked CDBs */
 	PI_SATAPM	= 0x04,	/* Supports SATA PM */
 	PI_TAG_ABLE	= 0x02,	/* Supports tag queue messages */
 	PI_SOFT_RST	= 0x01	/* Supports soft reset alternative */
 } pi_inqflag;
 
 typedef enum {
 	PIT_PROCESSOR	= 0x80,	/* Target mode processor mode */
 	PIT_PHASE	= 0x40,	/* Target mode phase cog. mode */
 	PIT_DISCONNECT	= 0x20,	/* Disconnects supported in target mode */
 	PIT_TERM_IO	= 0x10,	/* Terminate I/O message supported in TM */
 	PIT_GRP_6	= 0x08,	/* Group 6 commands supported */
 	PIT_GRP_7	= 0x04	/* Group 7 commands supported */
 } pi_tmflag;
 
 typedef enum {
 	PIM_ATA_EXT	= 0x200,/* ATA requests can understand ata_ext requests */
 	PIM_EXTLUNS	= 0x100,/* 64bit extended LUNs supported */
 	PIM_SCANHILO	= 0x80,	/* Bus scans from high ID to low ID */
 	PIM_NOREMOVE	= 0x40,	/* Removeable devices not included in scan */
 	PIM_NOINITIATOR	= 0x20,	/* Initiator role not supported. */
 	PIM_NOBUSRESET	= 0x10,	/* User has disabled initial BUS RESET */
 	PIM_NO_6_BYTE	= 0x08,	/* Do not send 6-byte commands */
 	PIM_SEQSCAN	= 0x04,	/* Do bus scans sequentially, not in parallel */
 	PIM_UNMAPPED	= 0x02,
 	PIM_NOSCAN	= 0x01	/* SIM does its own scanning */
 } pi_miscflag;
 
 /* Path Inquiry CCB */
 struct ccb_pathinq_settings_spi {
 	u_int8_t ppr_options;
 };
 
 struct ccb_pathinq_settings_fc {
 	u_int64_t wwnn;		/* world wide node name */
 	u_int64_t wwpn;		/* world wide port name */
 	u_int32_t port;		/* 24 bit port id, if known */
 	u_int32_t bitrate;	/* Mbps */
 };
 
 struct ccb_pathinq_settings_sas {
 	u_int32_t bitrate;	/* Mbps */
 };
 
 struct ccb_pathinq_settings_nvme {
 	uint16_t nsid;		/* Namespace ID for this path */
 };
 
 #define	PATHINQ_SETTINGS_SIZE	128
 
 struct ccb_pathinq {
 	struct 	    ccb_hdr ccb_h;
 	u_int8_t    version_num;	/* Version number for the SIM/HBA */
 	u_int8_t    hba_inquiry;	/* Mimic of INQ byte 7 for the HBA */
 	u_int16_t   target_sprt;	/* Flags for target mode support */
 	u_int32_t   hba_misc;		/* Misc HBA features */
 	u_int16_t   hba_eng_cnt;	/* HBA engine count */
 					/* Vendor Unique capabilities */
 	u_int8_t    vuhba_flags[VUHBALEN];
 	u_int32_t   max_target;		/* Maximum supported Target */
 	u_int32_t   max_lun;		/* Maximum supported Lun */
 	u_int32_t   async_flags;	/* Installed Async handlers */
 	path_id_t   hpath_id;		/* Highest Path ID in the subsystem */
 	target_id_t initiator_id;	/* ID of the HBA on the SCSI bus */
 	char	    sim_vid[SIM_IDLEN];	/* Vendor ID of the SIM */
 	char	    hba_vid[HBA_IDLEN];	/* Vendor ID of the HBA */
 	char 	    dev_name[DEV_IDLEN];/* Device name for SIM */
 	u_int32_t   unit_number;	/* Unit number for SIM */
 	u_int32_t   bus_id;		/* Bus ID for SIM */
 	u_int32_t   base_transfer_speed;/* Base bus speed in KB/sec */
 	cam_proto   protocol;
 	u_int	    protocol_version;
 	cam_xport   transport;
 	u_int	    transport_version;
 	union {
 		struct ccb_pathinq_settings_spi spi;
 		struct ccb_pathinq_settings_fc fc;
 		struct ccb_pathinq_settings_sas sas;
 		struct ccb_pathinq_settings_nvme nvme;
 		char ccb_pathinq_settings_opaque[PATHINQ_SETTINGS_SIZE];
 	} xport_specific;
 	u_int		maxio;		/* Max supported I/O size, in bytes. */
 	u_int16_t	hba_vendor;	/* HBA vendor ID */
 	u_int16_t	hba_device;	/* HBA device ID */
 	u_int16_t	hba_subvendor;	/* HBA subvendor ID */
 	u_int16_t	hba_subdevice;	/* HBA subdevice ID */
 };
 
 /* Path Statistics CCB */
 struct ccb_pathstats {
 	struct	ccb_hdr	ccb_h;
 	struct	timeval last_reset;	/* Time of last bus reset/loop init */
 };
 
 typedef enum {
 	SMP_FLAG_NONE		= 0x00,
 	SMP_FLAG_REQ_SG		= 0x01,
 	SMP_FLAG_RSP_SG		= 0x02
 } ccb_smp_pass_flags;
 
 /*
  * Serial Management Protocol CCB
  * XXX Currently the semantics for this CCB are that it is executed either
  * by the addressed device, or that device's parent (i.e. an expander for
  * any device on an expander) if the addressed device doesn't support SMP.
  * Later, once we have the ability to probe SMP-only devices and put them
  * in CAM's topology, the CCB will only be executed by the addressed device
  * if possible.
  */
 struct ccb_smpio {
 	struct ccb_hdr		ccb_h;
 	uint8_t			*smp_request;
 	int			smp_request_len;
 	uint16_t		smp_request_sglist_cnt;
 	uint8_t			*smp_response;
 	int			smp_response_len;
 	uint16_t		smp_response_sglist_cnt;
 	ccb_smp_pass_flags	flags;
 };
 
 typedef union {
 	u_int8_t *sense_ptr;		/*
 					 * Pointer to storage
 					 * for sense information
 					 */
 	                                /* Storage Area for sense information */
 	struct	 scsi_sense_data sense_buf;
 } sense_t;
 
 typedef union {
 	u_int8_t  *cdb_ptr;		/* Pointer to the CDB bytes to send */
 					/* Area for the CDB send */
 	u_int8_t  cdb_bytes[IOCDBLEN];
 } cdb_t;
 
 /*
  * SCSI I/O Request CCB used for the XPT_SCSI_IO and XPT_CONT_TARGET_IO
  * function codes.
  */
 struct ccb_scsiio {
 	struct	   ccb_hdr ccb_h;
 	union	   ccb *next_ccb;	/* Ptr for next CCB for action */
 	u_int8_t   *req_map;		/* Ptr to mapping info */
 	u_int8_t   *data_ptr;		/* Ptr to the data buf/SG list */
 	u_int32_t  dxfer_len;		/* Data transfer length */
 					/* Autosense storage */	
 	struct     scsi_sense_data sense_data;
 	u_int8_t   sense_len;		/* Number of bytes to autosense */
 	u_int8_t   cdb_len;		/* Number of bytes for the CDB */
 	u_int16_t  sglist_cnt;		/* Number of SG list entries */
 	u_int8_t   scsi_status;		/* Returned SCSI status */
 	u_int8_t   sense_resid;		/* Autosense resid length: 2's comp */
 	u_int32_t  resid;		/* Transfer residual length: 2's comp */
 	cdb_t	   cdb_io;		/* Union for CDB bytes/pointer */
 	u_int8_t   *msg_ptr;		/* Pointer to the message buffer */
 	u_int16_t  msg_len;		/* Number of bytes for the Message */
 	u_int8_t   tag_action;		/* What to do for tag queueing */
 	/*
 	 * The tag action should be either the define below (to send a
 	 * non-tagged transaction) or one of the defined scsi tag messages
 	 * from scsi_message.h.
 	 */
 #define		CAM_TAG_ACTION_NONE	0x00
 	u_int	   tag_id;		/* tag id from initiator (target mode) */
 	u_int	   init_id;		/* initiator id of who selected */
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	struct bio *bio;		/* Associated bio */
+#endif
 };
 
 /*
  * Return the address of the CDB for a SCSI I/O CCB: the external pointer
  * when CAM_CDB_POINTER is set in the header flags, otherwise the byte
  * array stored inline in the CCB.
  */
 static __inline uint8_t *
 scsiio_cdb_ptr(struct ccb_scsiio *ccb)
 {
 	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0)
 		return (ccb->cdb_io.cdb_ptr);
 	return (ccb->cdb_io.cdb_bytes);
 }
 
 /*
  * ATA I/O Request CCB used for the XPT_ATA_IO function code.
  */
 struct ccb_ataio {
 	struct	   ccb_hdr ccb_h;
 	union	   ccb *next_ccb;	/* Ptr for next CCB for action */
 	struct ata_cmd	cmd;		/* ATA command register set */
 	struct ata_res	res;		/* ATA result register set */
 	u_int8_t   *data_ptr;		/* Ptr to the data buf/SG list */
 	u_int32_t  dxfer_len;		/* Data transfer length */
 	u_int32_t  resid;		/* Transfer residual length: 2's comp */
 	u_int8_t   ata_flags;		/* Flags for the rest of the buffer */
 #define ATA_FLAG_AUX 0x1
 	uint32_t   aux;
 	uint32_t   unused;
 };
 
 struct ccb_accept_tio {
 	struct	   ccb_hdr ccb_h;
 	cdb_t	   cdb_io;		/* Union for CDB bytes/pointer */
 	u_int8_t   cdb_len;		/* Number of bytes for the CDB */
 	u_int8_t   tag_action;		/* What to do for tag queueing */
 	u_int8_t   sense_len;		/* Number of bytes of Sense Data */
 	u_int      tag_id;		/* tag id from initiator (target mode) */
 	u_int      init_id;		/* initiator id of who selected */
 					/* Sense data (length in sense_len) */
 	struct     scsi_sense_data sense_data;
 };
 
 /* Release SIM Queue */
 struct ccb_relsim {
 	struct ccb_hdr ccb_h;
 	u_int32_t      release_flags;
 #define RELSIM_ADJUST_OPENINGS		0x01
 #define RELSIM_RELEASE_AFTER_TIMEOUT	0x02
 #define RELSIM_RELEASE_AFTER_CMDCMPLT	0x04
 #define RELSIM_RELEASE_AFTER_QEMPTY	0x08
 	u_int32_t      openings;
 	u_int32_t      release_timeout;	/* Abstract argument. */
 	u_int32_t      qfrozen_cnt;
 };
 
 /*
  * NVMe I/O Request CCB used for the XPT_NVME_IO function code.
  */
 struct ccb_nvmeio {
 	struct	   ccb_hdr ccb_h;
 	union	   ccb *next_ccb;	/* Ptr for next CCB for action */
 	struct nvme_command cmd;	/* NVME command, per NVME standard */
 	struct nvme_completion cpl;	/* NVME completion, per NVME standard */
 	uint8_t   *data_ptr;		/* Ptr to the data buf/SG list */
 	uint32_t  dxfer_len;		/* Data transfer length */
 	uint32_t  resid;		/* Transfer residual length: 2's comp unused ?*/
 };
 
 /*
  * Definitions for the asynchronous callback CCB fields.
  */
 typedef enum {
 	AC_UNIT_ATTENTION	= 0x4000,/* Device reported UNIT ATTENTION */
 	AC_ADVINFO_CHANGED	= 0x2000,/* Advance info might have changes */
 	AC_CONTRACT		= 0x1000,/* A contractual callback */
 	AC_GETDEV_CHANGED	= 0x800,/* Getdev info might have changed */
 	AC_INQ_CHANGED		= 0x400,/* Inquiry info might have changed */
 	AC_TRANSFER_NEG		= 0x200,/* New transfer settings in effect */
 	AC_LOST_DEVICE		= 0x100,/* A device went away */
 	AC_FOUND_DEVICE		= 0x080,/* A new device was found */
 	AC_PATH_DEREGISTERED	= 0x040,/* A path has de-registered */
 	AC_PATH_REGISTERED	= 0x020,/* A new path has been registered */
 	AC_SENT_BDR		= 0x010,/* A BDR message was sent to target */
 	AC_SCSI_AEN		= 0x008,/* A SCSI AEN has been received */
 	AC_UNSOL_RESEL		= 0x002,/* Unsolicited reselection occurred */
 	AC_BUS_RESET		= 0x001	/* A SCSI bus reset occurred */
 } ac_code;
 
 typedef void ac_callback_t (void *softc, u_int32_t code,
 			    struct cam_path *path, void *args);
 
 /*
  * Generic Asynchronous callbacks.
  *
  * Generic arguments passed back which are then interpreted according to a
  * per-system contract number.
  */
 #define	AC_CONTRACT_DATA_MAX (128 - sizeof (u_int64_t))
 struct ac_contract {
 	u_int64_t	contract_number;
 	u_int8_t	contract_data[AC_CONTRACT_DATA_MAX];
 };
 
 #define	AC_CONTRACT_DEV_CHG	1
 struct ac_device_changed {
 	u_int64_t	wwpn;
 	u_int32_t	port;
 	target_id_t	target;
 	u_int8_t	arrived;
 };
 
 /* Set Asynchronous Callback CCB */
 struct ccb_setasync {
 	struct ccb_hdr	 ccb_h;
 	u_int32_t	 event_enable;	/* Async Event enables */	
 	ac_callback_t	*callback;
 	void		*callback_arg;
 };
 
 /* Set Device Type CCB */
 struct ccb_setdev {
 	struct	   ccb_hdr ccb_h;
 	u_int8_t   dev_type;	/* Value for dev type field in EDT */
 };
 
 /* SCSI Control Functions */
 
 /* Abort XPT request CCB */
 struct ccb_abort {
 	struct 	ccb_hdr ccb_h;
 	union	ccb *abort_ccb;	/* Pointer to CCB to abort */
 };
 
 /* Reset SCSI Bus CCB */
 struct ccb_resetbus {
 	struct	ccb_hdr ccb_h;
 };
 
 /* Reset SCSI Device CCB */
 struct ccb_resetdev {
 	struct	ccb_hdr ccb_h;
 };
 
 /* Terminate I/O Process Request CCB */
 struct ccb_termio {
 	struct	ccb_hdr ccb_h;
 	union	ccb *termio_ccb;	/* Pointer to CCB to terminate */
 };
 
 typedef enum {
 	CTS_TYPE_CURRENT_SETTINGS,
 	CTS_TYPE_USER_SETTINGS
 } cts_type;
 
 struct ccb_trans_settings_scsi
 {
 	u_int	valid;	/* Which fields to honor */
 #define	CTS_SCSI_VALID_TQ		0x01
 	u_int	flags;
 #define	CTS_SCSI_FLAGS_TAG_ENB		0x01
 };
 
 struct ccb_trans_settings_ata
 {
 	u_int	valid;	/* Which fields to honor */
 #define	CTS_ATA_VALID_TQ		0x01
 	u_int	flags;
 #define	CTS_ATA_FLAGS_TAG_ENB		0x01
 };
 
 struct ccb_trans_settings_spi
 {
 	u_int	  valid;	/* Which fields to honor */
 #define	CTS_SPI_VALID_SYNC_RATE		0x01
 #define	CTS_SPI_VALID_SYNC_OFFSET	0x02
 #define	CTS_SPI_VALID_BUS_WIDTH		0x04
 #define	CTS_SPI_VALID_DISC		0x08
 #define CTS_SPI_VALID_PPR_OPTIONS	0x10
 	u_int	flags;
 #define	CTS_SPI_FLAGS_DISC_ENB		0x01
 	u_int	sync_period;
 	u_int	sync_offset;
 	u_int	bus_width;
 	u_int	ppr_options;
 };
 
 struct ccb_trans_settings_fc {
 	u_int     	valid;		/* Which fields to honor */
 #define	CTS_FC_VALID_WWNN		0x8000
 #define	CTS_FC_VALID_WWPN		0x4000
 #define	CTS_FC_VALID_PORT		0x2000
 #define	CTS_FC_VALID_SPEED		0x1000
 	u_int64_t	wwnn;		/* world wide node name */
 	u_int64_t 	wwpn;		/* world wide port name */
 	u_int32_t 	port;		/* 24 bit port id, if known */
 	u_int32_t 	bitrate;	/* Mbps */
 };
 
 struct ccb_trans_settings_sas {
 	u_int     	valid;		/* Which fields to honor */
 #define	CTS_SAS_VALID_SPEED		0x1000
 	u_int32_t 	bitrate;	/* Mbps */
 };
 
 struct ccb_trans_settings_pata {
 	u_int     	valid;		/* Which fields to honor */
 #define	CTS_ATA_VALID_MODE		0x01
 #define	CTS_ATA_VALID_BYTECOUNT		0x02
 #define	CTS_ATA_VALID_ATAPI		0x20
 #define	CTS_ATA_VALID_CAPS		0x40
 	int		mode;		/* Mode */
 	u_int 		bytecount;	/* Length of PIO transaction */
 	u_int 		atapi;		/* Length of ATAPI CDB */
 	u_int 		caps;		/* Device and host SATA caps. */
 #define	CTS_ATA_CAPS_H			0x0000ffff
 #define	CTS_ATA_CAPS_H_DMA48		0x00000001 /* 48-bit DMA */
 #define	CTS_ATA_CAPS_D			0xffff0000
 };
 
 struct ccb_trans_settings_sata {
 	u_int     	valid;		/* Which fields to honor */
 #define	CTS_SATA_VALID_MODE		0x01
 #define	CTS_SATA_VALID_BYTECOUNT	0x02
 #define	CTS_SATA_VALID_REVISION		0x04
 #define	CTS_SATA_VALID_PM		0x08
 #define	CTS_SATA_VALID_TAGS		0x10
 #define	CTS_SATA_VALID_ATAPI		0x20
 #define	CTS_SATA_VALID_CAPS		0x40
 	int		mode;		/* Legacy PATA mode */
 	u_int 		bytecount;	/* Length of PIO transaction */
 	int		revision;	/* SATA revision */
 	u_int 		pm_present;	/* PM is present (XPT->SIM) */
 	u_int 		tags;		/* Number of allowed tags */
 	u_int 		atapi;		/* Length of ATAPI CDB */
 	u_int 		caps;		/* Device and host SATA caps. */
 #define	CTS_SATA_CAPS_H			0x0000ffff
 #define	CTS_SATA_CAPS_H_PMREQ		0x00000001
 #define	CTS_SATA_CAPS_H_APST		0x00000002
 #define	CTS_SATA_CAPS_H_DMAAA		0x00000010 /* Auto-activation */
 #define	CTS_SATA_CAPS_H_AN		0x00000020 /* Async. notification */
 #define	CTS_SATA_CAPS_D			0xffff0000
 #define	CTS_SATA_CAPS_D_PMREQ		0x00010000
 #define	CTS_SATA_CAPS_D_APST		0x00020000
 };
 
 /*
  * NVMe settings.  Used as the "nvme" member of both the proto_specific and
  * the xport_specific unions of struct ccb_trans_settings.
  */
 struct ccb_trans_settings_nvme 
 {
 	u_int     	valid;		/* Which fields to honor */
 #define CTS_NVME_VALID_SPEC	0x01
 #define CTS_NVME_VALID_CAPS	0x02
 	u_int		spec_major;	/* Major version of spec supported */
 	u_int		spec_minor;	/* Minor version of spec supported */
 	u_int		spec_tiny;	/* Tiny version of spec supported */
 	u_int		max_xfer;	/* Max transfer size (0 -> unlimited) */
 	u_int		caps;
 };
 	
 /* Get/Set transfer rate/width/disconnection/tag queueing settings */
 struct ccb_trans_settings {
 	struct	  ccb_hdr ccb_h;
 	cts_type  type;		/* Current or User settings */
 	cam_proto protocol;
 	u_int	  protocol_version;
 	cam_xport transport;
 	u_int	  transport_version;
 	union {
 		u_int  valid;	/* Which fields to honor */
 		struct ccb_trans_settings_ata ata;
 		struct ccb_trans_settings_scsi scsi;
 		struct ccb_trans_settings_nvme nvme;
 	} proto_specific;
 	union {
 		u_int  valid;	/* Which fields to honor */
 		struct ccb_trans_settings_spi spi;
 		struct ccb_trans_settings_fc fc;
 		struct ccb_trans_settings_sas sas;
 		struct ccb_trans_settings_pata ata;
 		struct ccb_trans_settings_sata sata;
 		struct ccb_trans_settings_nvme nvme;
 	} xport_specific;
 };
 
 
 /*
  * Calculate the geometry parameters for a device
  * given the block size and volume size in blocks.
  */
 struct ccb_calc_geometry {
 	struct	  ccb_hdr ccb_h;
 	u_int32_t block_size;
 	u_int64_t volume_size;
 	u_int32_t cylinders;		
 	u_int8_t  heads;
 	u_int8_t  secs_per_track;
 };
 
 /*
  * Set or get SIM (and transport) specific knobs
  */
 
 #define	KNOB_VALID_ADDRESS	0x1
 #define	KNOB_VALID_ROLE		0x2
 
 
 #define	KNOB_ROLE_NONE		0x0
 #define	KNOB_ROLE_INITIATOR	0x1
 #define	KNOB_ROLE_TARGET	0x2
 #define	KNOB_ROLE_BOTH		0x3
 
 struct ccb_sim_knob_settings_spi {
 	u_int		valid;
 	u_int		initiator_id;
 	u_int		role;
 };
 
 struct ccb_sim_knob_settings_fc {
 	u_int		valid;
 	u_int64_t	wwnn;		/* world wide node name */
 	u_int64_t 	wwpn;		/* world wide port name */
 	u_int		role;
 };
 
 struct ccb_sim_knob_settings_sas {
 	u_int		valid;
 	u_int64_t	wwnn;		/* world wide node name */
 	u_int		role;
 };
 #define	KNOB_SETTINGS_SIZE	128
 
 struct ccb_sim_knob {
 	struct	  ccb_hdr ccb_h;
 	union {
 		u_int  valid;	/* Which fields to honor */
 		struct ccb_sim_knob_settings_spi spi;
 		struct ccb_sim_knob_settings_fc fc;
 		struct ccb_sim_knob_settings_sas sas;
 		char pad[KNOB_SETTINGS_SIZE];
 	} xport_specific;
 };
 
 /*
  * Rescan the given bus, or bus/target/lun
  */
 struct ccb_rescan {
 	struct	ccb_hdr ccb_h;
 	cam_flags	flags;
 };
 
 /*
  * Turn on debugging for the given bus, bus/target, or bus/target/lun.
  */
 struct ccb_debug {
 	struct	ccb_hdr ccb_h;
 	cam_debug_flags flags;
 };
 
 /* Target mode structures. */
 
 struct ccb_en_lun {
 	struct	  ccb_hdr ccb_h;
 	u_int16_t grp6_len;		/* Group 6 VU CDB length */
 	u_int16_t grp7_len;		/* Group 7 VU CDB length */
 	u_int8_t  enable;
 };
 
 /* old, barely used immediate notify, binary compatibility */
 struct ccb_immed_notify {
 	struct	  ccb_hdr ccb_h;
 	struct    scsi_sense_data sense_data;
 	u_int8_t  sense_len;		/* Number of bytes in sense buffer */
 	u_int8_t  initiator_id;		/* Id of initiator that selected */
 	u_int8_t  message_args[7];	/* Message Arguments */
 };
 
 struct ccb_notify_ack {
 	struct	  ccb_hdr ccb_h;
 	u_int16_t seq_id;		/* Sequence identifier */
 	u_int8_t  event;		/* Event flags */
 };
 
 struct ccb_immediate_notify {
 	struct    ccb_hdr ccb_h;
 	u_int     tag_id;		/* Tag for immediate notify */
 	u_int     seq_id;		/* Tag for target of notify */
 	u_int     initiator_id;		/* Initiator Identifier */
 	u_int     arg;			/* Function specific */
 };
 
 struct ccb_notify_acknowledge {
 	struct    ccb_hdr ccb_h;
 	u_int     tag_id;		/* Tag for immediate notify */
 	u_int     seq_id;		/* Tag for target of notify */
 	u_int     initiator_id;		/* Initiator Identifier */
 	u_int     arg;			/* Response information */
 	/*
 	 * Lower byte of arg is one of RESPONSE CODE values defined below
 	 * (subset of response codes from SPL-4 and FCP-4 specifications),
 	 * upper 3 bytes is code-specific ADDITIONAL RESPONSE INFORMATION.
 	 */
 #define	CAM_RSP_TMF_COMPLETE		0x00
 #define	CAM_RSP_TMF_REJECTED		0x04
 #define	CAM_RSP_TMF_FAILED		0x05
 #define	CAM_RSP_TMF_SUCCEEDED		0x08
 #define	CAM_RSP_TMF_INCORRECT_LUN	0x09
 };
 
 /* HBA engine structures. */
 
 typedef enum {
 	EIT_BUFFER,	/* Engine type: buffer memory */
 	EIT_LOSSLESS,	/* Engine type: lossless compression */
 	EIT_LOSSY,	/* Engine type: lossy compression */
 	EIT_ENCRYPT	/* Engine type: encryption */
 } ei_type;
 
 typedef enum {
 	EAD_VUNIQUE,	/* Engine algorithm ID: vendor unique */
 	EAD_LZ1V1,	/* Engine algorithm ID: LZ1 var.1 */
 	EAD_LZ2V1,	/* Engine algorithm ID: LZ2 var.1 */
 	EAD_LZ2V2	/* Engine algorithm ID: LZ2 var.2 */
 } ei_algo;
 
 /* HBA engine inquiry CCB. */
 struct ccb_eng_inq {
 	struct	  ccb_hdr ccb_h;
 	u_int16_t eng_num;	/* The engine number for this inquiry */
 	ei_type   eng_type;	/* Returned engine type */
 	ei_algo   eng_algo;	/* Returned engine algorithm type */
 	/*
 	 * NB: the member name below is misspelled ("memeory").  It is part
 	 * of the structure's interface, so renaming it would break users.
 	 */
 	u_int32_t eng_memeory;	/* Returned engine memory size */
 };
 
 /*
  * NOTE(review): the inline comment below requires this structure to match
  * the size of the SCSI I/O CCB.  struct ccb_scsiio carries an extra bio
  * pointer when BUF_TRACKING or FULL_BUF_TRACKING is defined -- confirm
  * whether the size requirement is still relevant and still met.
  */
 struct ccb_eng_exec {	/* This structure must match SCSIIO size */
 	struct	  ccb_hdr ccb_h;
 	u_int8_t  *pdrv_ptr;	/* Ptr used by the peripheral driver */
 	u_int8_t  *req_map;	/* Ptr for mapping info on the req. */
 	u_int8_t  *data_ptr;	/* Pointer to the data buf/SG list */
 	u_int32_t dxfer_len;	/* Data transfer length */
 	u_int8_t  *engdata_ptr;	/* Pointer to the engine buffer data */
 	u_int16_t sglist_cnt;	/* Num of scatter gather list entries */
 	u_int32_t dmax_len;	/* Destination data maximum length */
 	u_int32_t dest_len;	/* Destination data length */
 	int32_t	  src_resid;	/* Source residual length: 2's comp */
 	u_int32_t timeout;	/* Timeout value */
 	u_int16_t eng_num;	/* Engine number for this request */
 	u_int16_t vu_flags;	/* Vendor Unique flags */
 };
 
 /*
  * Definitions for the timeout field in the SCSI I/O CCB.
  */
 #define	CAM_TIME_DEFAULT	0x00000000	/* Use SIM default value */
 #define	CAM_TIME_INFINITY	0xFFFFFFFF	/* Infinite timeout */
 
 #define	CAM_SUCCESS	0	/* For signaling general success */
 #define	CAM_FAILURE	1	/* For signaling general failure */
 
 #define CAM_FALSE	0
 #define CAM_TRUE	1
 
 #define XPT_CCB_INVALID	-1	/* for signaling a bad CCB to free */
 
 /*
  * CCB for working with advanced device information.  This operates in a fashion
  * similar to XPT_GDEV_TYPE.  Specify the target in ccb_h, the buffer
  * type requested, and provide a buffer size/buffer to write to.  If the
  * buffer is too small, provsiz will be larger than bufsiz.
  */
 struct ccb_dev_advinfo {
 	struct ccb_hdr ccb_h;
 	uint32_t flags;
 #define	CDAI_FLAG_NONE		0x0	/* No flags set */
 #define	CDAI_FLAG_STORE		0x1	/* If set, action becomes store */
 	uint32_t buftype;		/* IN: Type of data being requested */
 	/* NB: buftype is interpreted on a per-transport basis */
 #define	CDAI_TYPE_SCSI_DEVID	1
 #define	CDAI_TYPE_SERIAL_NUM	2
 #define	CDAI_TYPE_PHYS_PATH	3
 #define	CDAI_TYPE_RCAPLONG	4
 #define	CDAI_TYPE_EXT_INQ	5
 	off_t bufsiz;			/* IN: Size of external buffer */
 #define	CAM_SCSI_DEVID_MAXLEN	65536	/* length in buffer is an uint16_t */
 	off_t provsiz;			/* OUT: Size required/used */
 	uint8_t *buf;			/* IN/OUT: Buffer for requested data */
 };
 
 /*
  * CCB for sending async events
  */
 struct ccb_async {
 	struct ccb_hdr ccb_h;
 	uint32_t async_code;
 	off_t async_arg_size;
 	void *async_arg_ptr;
 };
 
 /*
  * Union of all CCB types for kernel space allocation.  This union should
  * never be used for manipulating CCBs - its only use is for the allocation
  * and deallocation of raw CCB space and is the return type of xpt_ccb_alloc
  * and the argument to xpt_ccb_free.
  */
 union ccb {
 	struct	ccb_hdr			ccb_h;	/* For convenience */
 	struct	ccb_scsiio		csio;
 	struct	ccb_getdev		cgd;
 	struct	ccb_getdevlist		cgdl;
 	struct	ccb_pathinq		cpi;
 	struct	ccb_relsim		crs;
 	struct	ccb_setasync		csa;
 	struct	ccb_setdev		csd;
 	struct	ccb_pathstats		cpis;
 	struct	ccb_getdevstats		cgds;
 	struct	ccb_dev_match		cdm;
 	struct	ccb_trans_settings	cts;
 	struct	ccb_calc_geometry	ccg;	
 	struct	ccb_sim_knob		knob;	
 	struct	ccb_abort		cab;
 	struct	ccb_resetbus		crb;
 	struct	ccb_resetdev		crd;
 	struct	ccb_termio		tio;
 	struct	ccb_accept_tio		atio;
 	struct	ccb_scsiio		ctio;
 	struct	ccb_en_lun		cel;
 	struct	ccb_immed_notify	cin;
 	struct	ccb_notify_ack		cna;
 	struct	ccb_immediate_notify	cin1;
 	struct	ccb_notify_acknowledge	cna2;
 	struct	ccb_eng_inq		cei;
 	struct	ccb_eng_exec		cee;
 	struct	ccb_smpio		smpio;
 	struct 	ccb_rescan		crcn;
 	struct  ccb_debug		cdbg;
 	struct	ccb_ataio		ataio;
 	struct	ccb_dev_advinfo		cdai;
 	struct	ccb_async		casync;
 	struct	ccb_nvmeio		nvmeio;
 };
 
 /*
  * Zero the part of *ccbp that lies beyond its first sizeof(ccb_h) bytes:
  * the bzero() starts sizeof((ccbp)->ccb_h) bytes into the object and covers
  * the remaining sizeof(*(ccbp)) - sizeof((ccbp)->ccb_h) bytes.  The header
  * is preserved only when ccb_h is the first member, as it is in the CCB
  * structures defined above.
  */
 #define CCB_CLEAR_ALL_EXCEPT_HDR(ccbp)			\
 	bzero((char *)(ccbp) + sizeof((ccbp)->ccb_h),	\
 	    sizeof(*(ccbp)) - sizeof((ccbp)->ccb_h))
 
 __BEGIN_DECLS
 static __inline void
 cam_fill_csio(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int8_t tag_action,
 	      u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int8_t sense_len, u_int8_t cdb_len,
 	      u_int32_t timeout);
 
 static __inline void
 cam_fill_nvmeio(struct ccb_nvmeio *nvmeio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int32_t timeout);
 
 static __inline void
 cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int tag_action, u_int tag_id,
 	      u_int init_id, u_int scsi_status, u_int8_t *data_ptr,
 	      u_int32_t dxfer_len, u_int32_t timeout);
 
 static __inline void
 cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int tag_action,
 	      u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int32_t timeout);
 
 static __inline void
 cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries, 
 	       void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags,
 	       uint8_t *smp_request, int smp_request_len,
 	       uint8_t *smp_response, int smp_response_len,
 	       uint32_t timeout);
 
 /*
  * Fill in a SCSI I/O CCB for the XPT_SCSI_IO function code.  Sets the header
  * function code, flags, retry count, completion callback and timeout, clears
  * the header xflags, and stores the data pointer, transfer length, sense
  * length, CDB length and tag action.  When BUF_TRACKING or FULL_BUF_TRACKING
  * is defined, the bio pointer is also reset to NULL.  The CDB bytes
  * themselves are not written here.
  */
 static __inline void
 cam_fill_csio(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int8_t tag_action,
 	      u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int8_t sense_len, u_int8_t cdb_len,
 	      u_int32_t timeout)
 {
 	csio->ccb_h.func_code = XPT_SCSI_IO;
 	csio->ccb_h.flags = flags;
 	csio->ccb_h.xflags = 0;
 	csio->ccb_h.retry_count = retries;	
 	csio->ccb_h.cbfcnp = cbfcnp;
 	csio->ccb_h.timeout = timeout;
 	csio->data_ptr = data_ptr;
 	csio->dxfer_len = dxfer_len;
 	csio->sense_len = sense_len;
 	csio->cdb_len = cdb_len;
 	csio->tag_action = tag_action;
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	csio->bio = NULL;
+#endif
 }
 
 /*
  * Fill in a SCSI I/O CCB for the XPT_CONT_TARGET_IO function code.  The
  * header receives the function code, flags, retry count, completion
  * callback and timeout, and its xflags are cleared.  The request itself
  * receives the data pointer, transfer length, SCSI status, tag action,
  * tag id and initiator id.  No other members are modified.
  */
 static __inline void
 cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int tag_action, u_int tag_id,
 	      u_int init_id, u_int scsi_status, u_int8_t *data_ptr,
 	      u_int32_t dxfer_len, u_int32_t timeout)
 {
 	/* Data buffer description. */
 	csio->dxfer_len = dxfer_len;
 	csio->data_ptr = data_ptr;
 	/* Status, tagging and initiator identification. */
 	csio->scsi_status = scsi_status;
 	csio->init_id = init_id;
 	csio->tag_id = tag_id;
 	csio->tag_action = tag_action;
 	/* Common CCB header. */
 	csio->ccb_h.timeout = timeout;
 	csio->ccb_h.cbfcnp = cbfcnp;
 	csio->ccb_h.retry_count = retries;
 	csio->ccb_h.xflags = 0;
 	csio->ccb_h.flags = flags;
 	csio->ccb_h.func_code = XPT_CONT_TARGET_IO;
 }
 
 /*
  * Fill in an ATA I/O CCB for the XPT_ATA_IO function code.  Stores the
  * header function code, flags, retry count, completion callback and
  * timeout, records the data pointer and transfer length, and clears
  * ata_flags.  The tag_action argument is accepted but not used.
  */
 static __inline void
 cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int tag_action __unused,
 	      u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int32_t timeout)
 {
 	/* Request payload. */
 	ataio->ata_flags = 0;
 	ataio->dxfer_len = dxfer_len;
 	ataio->data_ptr = data_ptr;
 	/* Common CCB header. */
 	ataio->ccb_h.timeout = timeout;
 	ataio->ccb_h.cbfcnp = cbfcnp;
 	ataio->ccb_h.retry_count = retries;
 	ataio->ccb_h.flags = flags;
 	ataio->ccb_h.func_code = XPT_ATA_IO;
 }
 
 /*
  * Fill in an SMP I/O CCB for the XPT_SMP_IO function code.  Stores the
  * header function code, flags, retry count, completion callback and
  * timeout, plus the request/response buffer pointers and their lengths.
  * When built with _KERNEL, first asserts that the direction bits of flags
  * equal CAM_DIR_BOTH, that both buffers are non-NULL and that both lengths
  * are non-zero.
  */
 static __inline void
 cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries, 
 	       void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags,
 	       uint8_t *smp_request, int smp_request_len,
 	       uint8_t *smp_response, int smp_response_len,
 	       uint32_t timeout)
 {
 #ifdef _KERNEL
 	/* Argument sanity checks, compiled in for the kernel only. */
 	KASSERT((flags & CAM_DIR_MASK) == CAM_DIR_BOTH,
 	    ("direction != CAM_DIR_BOTH"));
 	KASSERT(smp_request != NULL && smp_response != NULL,
 	    ("need valid request and response buffers"));
 	KASSERT(smp_request_len != 0 && smp_response_len != 0,
 	    ("need non-zero request and response lengths"));
 #endif /* _KERNEL */
 	/* Request and response buffers. */
 	smpio->smp_response_len = smp_response_len;
 	smpio->smp_response = smp_response;
 	smpio->smp_request_len = smp_request_len;
 	smpio->smp_request = smp_request;
 	/* Common CCB header. */
 	smpio->ccb_h.timeout = timeout;
 	smpio->ccb_h.cbfcnp = cbfcnp;
 	smpio->ccb_h.retry_count = retries;
 	smpio->ccb_h.flags = flags;
 	smpio->ccb_h.func_code = XPT_SMP_IO;
 }
 
 /*
  * Replace the CAM_STATUS_MASK bits of the CCB header status with the given
  * status value; bits outside the mask that are already set are kept.
  */
 static __inline void
 cam_set_ccbstatus(union ccb *ccb, cam_status status)
 {
 	ccb->ccb_h.status = (ccb->ccb_h.status & ~CAM_STATUS_MASK) | status;
 }
 
 /*
  * Extract the CAM_STATUS_MASK portion of the CCB header status and return
  * it as a cam_status value.
  */
 static __inline cam_status
 cam_ccb_status(union ccb *ccb)
 {
 	cam_status code;

 	code = (cam_status)(ccb->ccb_h.status & CAM_STATUS_MASK);
 	return (code);
 }
 
 void cam_calc_geometry(struct ccb_calc_geometry *ccg, int extended);
 
 /*
  * Fill in an NVMe I/O CCB for the XPT_NVME_IO function code.  Stores the
  * header function code, flags, retry count, completion callback and
  * timeout, and records the data pointer and transfer length.  The command
  * and completion members are not written here.
  */
 static __inline void
 cam_fill_nvmeio(struct ccb_nvmeio *nvmeio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int32_t flags, u_int8_t *data_ptr, u_int32_t dxfer_len,
 	      u_int32_t timeout)
 {
 	/* Request payload. */
 	nvmeio->dxfer_len = dxfer_len;
 	nvmeio->data_ptr = data_ptr;
 	/* Common CCB header. */
 	nvmeio->ccb_h.timeout = timeout;
 	nvmeio->ccb_h.cbfcnp = cbfcnp;
 	nvmeio->ccb_h.retry_count = retries;
 	nvmeio->ccb_h.flags = flags;
 	nvmeio->ccb_h.func_code = XPT_NVME_IO;
 }
 __END_DECLS
 
 #endif /* _CAM_CAM_CCB_H */
Index: head/sys/cam/cam_periph.c
===================================================================
--- head/sys/cam/cam_periph.c	(revision 308154)
+++ head/sys/cam/cam_periph.c	(revision 308155)
@@ -1,1944 +1,1949 @@
 /*-
  * Common functions for CAM "type" (peripheral) drivers.
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/devicestat.h>
 #include <sys/bus.h>
 #include <sys/sbuf.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_pass.h>
 
 static	u_int		camperiphnextunit(struct periph_driver *p_drv,
 					  u_int newunit, int wired,
 					  path_id_t pathid, target_id_t target,
 					  lun_id_t lun);
 static	u_int		camperiphunit(struct periph_driver *p_drv,
 				      path_id_t pathid, target_id_t target,
 				      lun_id_t lun); 
 static	void		camperiphdone(struct cam_periph *periph, 
 					union ccb *done_ccb);
 static  void		camperiphfree(struct cam_periph *periph);
 static int		camperiphscsistatuserror(union ccb *ccb,
 					        union ccb **orig_ccb,
 						 cam_flags camflags,
 						 u_int32_t sense_flags,
 						 int *openings,
 						 u_int32_t *relsim_flags,
 						 u_int32_t *timeout,
 						 u_int32_t  *action,
 						 const char **action_string);
 static	int		camperiphscsisenseerror(union ccb *ccb,
 					        union ccb **orig_ccb,
 					        cam_flags camflags,
 					        u_int32_t sense_flags,
 					        int *openings,
 					        u_int32_t *relsim_flags,
 					        u_int32_t *timeout,
 					        u_int32_t *action,
 					        const char **action_string);
 static void		cam_periph_devctl_notify(union ccb *ccb);
 
 /* Number of drivers stored in periph_drivers (NULL terminator excluded). */
 static int nperiph_drivers;
 /*
  * Highest level handed to periphdriver_init() so far (0 before the first
  * call).  Drivers flagged CAM_PERIPH_DRV_EARLY are initialized at level 1,
  * all other drivers at level 2.
  */
 static int initialized = 0;
 /*
  * NULL-terminated array of registered peripheral drivers.  It is replaced
  * by periphdriver_register() while the topology (bus) lock is held.
  */
 struct periph_driver **periph_drivers;
 
 /* malloc(9) type used for every allocation made in this file. */
 static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers");
 
 /*
  * Loader tunables with their defaults.
  * NOTE(review): the consumers are outside this part of the file; presumably
  * these are retry delays in milliseconds for selection timeouts, resource
  * shortages and busy status respectively -- confirm against the users.
  */
 static int periph_selto_delay = 1000;
 TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay);
 static int periph_noresrc_delay = 500;
 TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay);
 static int periph_busy_delay = 500;
 TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay);
 
 
 /*
  * Register a peripheral driver.  "data" points to the struct periph_driver
  * to append to the global, NULL-terminated periph_drivers array.
  *
  * A replacement array one slot larger is allocated before the topology lock
  * is taken.  The driver count is then re-checked under the lock, and the
  * allocation is thrown away and redone if another registration changed the
  * count in between.  Once the array is published the old one is freed.  If
  * the init level that applies to this driver has already been reached (see
  * periphdriver_init()), its init routine is called right away.
  */
 void
 periphdriver_register(void *data)
 {
 	struct periph_driver *drv = (struct periph_driver *)data;
 	struct periph_driver **grown, **previous;
 	int slots;
 
 	for (;;) {
 		/* Room for the existing drivers, the new one and the NULL. */
 		slots = nperiph_drivers + 2;
 		grown = malloc(sizeof(*grown) * slots, M_CAMPERIPH,
 		    M_WAITOK);
 		xpt_lock_buses();
 		if (slots == nperiph_drivers + 2)
 			break;
 		/*
 		 * Lost race against itself; go around.
 		 */
 		xpt_unlock_buses();
 		free(grown, M_CAMPERIPH);
 	}
 
 	/* Still holding the topology lock: publish the larger array. */
 	if (periph_drivers != NULL)
 		bcopy(periph_drivers, grown,
 		    sizeof(*grown) * nperiph_drivers);
 	grown[nperiph_drivers] = drv;
 	grown[nperiph_drivers + 1] = NULL;
 	previous = periph_drivers;
 	periph_drivers = grown;
 	nperiph_drivers++;
 	xpt_unlock_buses();
 
 	if (previous != NULL)
 		free(previous, M_CAMPERIPH);
 
 	/* If driver marked as early or it is late now, initialize it. */
 	if (initialized > 1 ||
 	    (initialized > 0 && (drv->flags & CAM_PERIPH_DRV_EARLY) != 0))
 		(*drv->init)();
 }
 
 /*
  * Record that init level "level" has been reached and run the init routine
  * of every registered driver that belongs to the current level: level 1 for
  * drivers flagged CAM_PERIPH_DRV_EARLY, level 2 for all others.
  */
 void
 periphdriver_init(int level)
 {
 	int idx, wanted;
 
 	initialized = max(initialized, level);
 
 	/*
 	 * Index the global array afresh on every access rather than caching
 	 * a pointer into it, exactly as the array may be replaced by a
 	 * registration.
 	 */
 	idx = 0;
 	while (periph_drivers[idx] != NULL) {
 		if ((periph_drivers[idx]->flags & CAM_PERIPH_DRV_EARLY) != 0)
 			wanted = 1;
 		else
 			wanted = 2;
 		if (wanted == initialized)
 			(*periph_drivers[idx]->init)();
 		idx++;
 	}
 }
 
 /*
  * Allocate a new peripheral instance of driver "name" for "path", link it
  * into that driver's unit list, add it to the transport layer and run the
  * driver's constructor on it.
  *
  * periph_oninvalidate, periph_dtor and periph_start are stored in the new
  * instance; periph_ctor is called on it with "arg".  ac_callback and code
  * are only used when an invalidated instance of the same driver is still
  * attached to the path: they are saved on that old instance as a deferred
  * callback.
  *
  * Returns CAM_REQ_INPROG when the request was deferred that way,
  * CAM_REQ_INVALID when an instance already exists and cannot take a
  * deferral, or when "name" is not a registered driver, CAM_RESRC_UNAVAIL
  * when the instance cannot be allocated, and otherwise the status of the
  * last step attempted (path creation, xpt_add_periph() or the constructor).
  *
  * init_level counts the setup stages completed so far; the switch at the
  * "failure" label uses it to undo exactly those stages.
  */
 cam_status
 cam_periph_alloc(periph_ctor_t *periph_ctor,
 		 periph_oninv_t *periph_oninvalidate,
 		 periph_dtor_t *periph_dtor, periph_start_t *periph_start,
 		 char *name, cam_periph_type type, struct cam_path *path,
 		 ac_callback_t *ac_callback, ac_code code, void *arg)
 {
 	struct		periph_driver **p_drv;
 	struct		cam_sim *sim;
 	struct		cam_periph *periph;
 	struct		cam_periph *cur_periph;
 	path_id_t	path_id;
 	target_id_t	target_id;
 	lun_id_t	lun_id;
 	cam_status	status;
 	u_int		init_level;
 
 	init_level = 0;
 	/*
 	 * Handle Hot-Plug scenarios.  If there is already a peripheral
 	 * of our type assigned to this path, we are likely waiting for
 	 * final close on an old, invalidated, peripheral.  If this is
 	 * the case, queue up a deferred call to the peripheral's async
 	 * handler.  If it looks like a mistaken re-allocation, complain.
 	 */
 	if ((periph = cam_periph_find(path, name)) != NULL) {
 
 		if ((periph->flags & CAM_PERIPH_INVALID) != 0
 		 && (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) {
 			periph->flags |= CAM_PERIPH_NEW_DEV_FOUND;
 			periph->deferred_callback = ac_callback;
 			periph->deferred_ac = code;
 			return (CAM_REQ_INPROG);
 		} else {
 			printf("cam_periph_alloc: attempt to re-allocate "
 			       "valid device %s%d rejected flags %#x "
 			       "refcount %d\n", periph->periph_name,
 			       periph->unit_number, periph->flags,
 			       periph->refcount);
 		}
 		return (CAM_REQ_INVALID);
 	}
 	
 	periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH,
 					     M_NOWAIT|M_ZERO);
 
 	if (periph == NULL)
 		return (CAM_RESRC_UNAVAIL);
 	
 	/* Stage 1: the periph structure exists. */
 	init_level++;
 
 
 	sim = xpt_path_sim(path);
 	path_id = xpt_path_path_id(path);
 	target_id = xpt_path_target_id(path);
 	lun_id = xpt_path_lun_id(path);
 	periph->periph_start = periph_start;
 	periph->periph_dtor = periph_dtor;
 	periph->periph_oninval = periph_oninvalidate;
 	periph->type = type;
 	periph->periph_name = name;
 	periph->scheduled_priority = CAM_PRIORITY_NONE;
 	periph->immediate_priority = CAM_PRIORITY_NONE;
 	periph->refcount = 1;		/* Dropped by invalidation. */
 	periph->sim = sim;
 	SLIST_INIT(&periph->ccb_list);
 	/*
 	 * Create a path of our own for the same bus/target/lun; from here
 	 * on the local "path" refers to that new path, not the caller's.
 	 */
 	status = xpt_create_path(&path, periph, path_id, target_id, lun_id);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 	periph->path = path;
 
 	/* Locate the driver by name; the driver array needs the bus lock. */
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("cam_periph_alloc: invalid periph name '%s'\n", name);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		free(periph, M_CAMPERIPH);
 		return (CAM_REQ_INVALID);
 	}
 	/* Pick a unit number and insert so the list stays sorted by unit. */
 	periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id);
 	cur_periph = TAILQ_FIRST(&(*p_drv)->units);
 	while (cur_periph != NULL
 	    && cur_periph->unit_number < periph->unit_number)
 		cur_periph = TAILQ_NEXT(cur_periph, unit_links);
 	if (cur_periph != NULL) {
 		KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list"));
 		TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links);
 	} else {
 		TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links);
 		(*p_drv)->generation++;
 	}
 	xpt_unlock_buses();
 
 	/* Stage 2: the periph has a path and is on the driver's unit list. */
 	init_level++;
 
 	status = xpt_add_periph(periph);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 
 	/* Stage 3: the periph has been added with xpt_add_periph(). */
 	init_level++;
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n"));
 
 	status = periph_ctor(periph, arg);
 
 	/* Stage 4: the driver's constructor succeeded. */
 	if (status == CAM_REQ_CMP)
 		init_level++;
 
 	/* Reached on success as well; level 4 simply has nothing to undo. */
 failure:
 	switch (init_level) {
 	case 4:
 		/* Initialized successfully */
 		break;
 	case 3:
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 		xpt_remove_periph(periph);
 		/* FALLTHROUGH */
 	case 2:
 		xpt_lock_buses();
 		TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		/* FALLTHROUGH */
 	case 1:
 		free(periph, M_CAMPERIPH);
 		/* FALLTHROUGH */
 	case 0:
 		/* No cleanup to perform. */
 		break;
 	default:
 		panic("%s: Unknown init level", __func__);
 	}
 	return(status);
 }
 
 /*
  * Find a peripheral structure with the specified path, target, lun, 
  * and (optionally) type.  If the name is NULL, this function will return
  * the first peripheral driver that matches the specified path.
  */
 struct cam_periph *
 cam_periph_find(struct cam_path *path, char *name)
 {
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 
 		if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0))
 			continue;
 
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) == 0) {
 				xpt_unlock_buses();
 				cam_periph_assert(periph, MA_OWNED);
 				return(periph);
 			}
 		}
 		if (name != NULL) {
 			xpt_unlock_buses();
 			return(NULL);
 		}
 	}
 	xpt_unlock_buses();
 	return(NULL);
 }
 
 /*
  * Find peripheral driver instances attached to the specified path.
  *
  * Builds a comma-separated list of "<periph_name><unit_number>" entries,
  * one for every instance whose path compares equal to "path", copies it
  * into "sb" with sbuf_cpy() and returns the number of entries.
  *
  * The list is assembled in a local fixed-length sbuf that starts at 16
  * bytes.  Whenever it overflows, the bus lock is dropped, the size is
  * doubled and the whole scan is restarted.
  */
 int
 cam_periph_list(struct cam_path *path, struct sbuf *sb)
 {
 	struct sbuf local_sb;
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 	int count;
 	int sbuf_alloc_len;
 
 	sbuf_alloc_len = 16;
 retry:
 	sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN);
 	count = 0;
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) != 0)
 				continue;
 
 			/* Separate entries with a comma. */
 			if (sbuf_len(&local_sb) != 0)
 				sbuf_cat(&local_sb, ",");
 
 			sbuf_printf(&local_sb, "%s%d", periph->periph_name,
 				    periph->unit_number);
 
 			/* Buffer too small: start over with twice the room. */
 			if (sbuf_error(&local_sb) == ENOMEM) {
 				sbuf_alloc_len *= 2;
 				xpt_unlock_buses();
 				sbuf_delete(&local_sb);
 				goto retry;
 			}
 			count++;
 		}
 	}
 	xpt_unlock_buses();
 	sbuf_finish(&local_sb);
 	sbuf_cpy(sb, sbuf_data(&local_sb));
 	sbuf_delete(&local_sb);
 	return (count);
 }
 
 /*
  * Take a reference on "periph" unless it is NULL or has been invalidated.
  * Returns CAM_REQ_CMP when the reference count was incremented and
  * CAM_REQ_CMP_ERR otherwise.  The flags test and the increment are done
  * with the topology lock held.
  */
 cam_status
 cam_periph_acquire(struct cam_periph *periph)
 {
 	cam_status result;
 
 	if (periph == NULL)
 		return (CAM_REQ_CMP_ERR);
 
 	xpt_lock_buses();
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
 		result = CAM_REQ_CMP_ERR;
 	} else {
 		periph->refcount++;
 		result = CAM_REQ_CMP;
 	}
 	xpt_unlock_buses();
 
 	return (result);
 }
 
 /*
  * Unconditionally take an additional reference on "periph".  Unlike
  * cam_periph_acquire() this neither accepts NULL nor checks
  * CAM_PERIPH_INVALID; it only asserts that at least one reference is
  * already held.  The increment is done with the topology lock held.
  */
 void
 cam_periph_doacquire(struct cam_periph *periph)
 {
 
 	xpt_lock_buses();
 	KASSERT(periph->refcount >= 1,
 	    ("cam_periph_doacquire() with refcount == %d", periph->refcount));
 	periph->refcount++;
 	xpt_unlock_buses();
 }
 
 /*
  * Drop one reference on "periph" and free the instance when that was the
  * last one.  The peripheral's lock must be owned (asserted here).  This
  * function takes no topology lock itself; cam_periph_release_locked()
  * calls it between xpt_lock_buses() and xpt_unlock_buses().
  */
 void
 cam_periph_release_locked_buses(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->refcount >= 1, ("periph->refcount >= 1"));
 	periph->refcount--;
 	if (periph->refcount == 0)
 		camperiphfree(periph);
 }
 
 /*
  * Drop one reference on "periph", taking the topology lock around the
  * release.  A NULL "periph" is ignored.
  */
 void
 cam_periph_release_locked(struct cam_periph *periph)
 {
 
 	if (periph != NULL) {
 		xpt_lock_buses();
 		cam_periph_release_locked_buses(periph);
 		xpt_unlock_buses();
 	}
 }
 
 /*
  * Drop one reference on "periph" from a context that does not own the
  * peripheral's mutex (asserted): the mutex is acquired around the release.
  * A NULL "periph" is ignored.
  */
 void
 cam_periph_release(struct cam_periph *periph)
 {
 	struct mtx *periph_mtx;
 
 	if (periph != NULL) {
 		cam_periph_assert(periph, MA_NOTOWNED);
 		periph_mtx = cam_periph_mtx(periph);
 		mtx_lock(periph_mtx);
 		cam_periph_release_locked(periph);
 		mtx_unlock(periph_mtx);
 	}
 }
 
 /*
  * Take the peripheral's CAM_PERIPH_LOCKED hold flag, sleeping at "priority"
  * while another holder has it.  A reference on the peripheral is taken
  * first and kept on success; cam_periph_unhold() clears the flag and drops
  * that reference.
  *
  * Returns 0 on success, ENXIO when the peripheral cannot be acquired or is
  * found invalidated after a sleep, or the error returned by
  * cam_periph_sleep().  On every failure the reference is dropped again.
  */
 int
 cam_periph_hold(struct cam_periph *periph, int priority)
 {
 	int error;
 
 	/*
 	 * Increment the reference count on the peripheral
 	 * while we wait for our lock attempt to succeed
 	 * to ensure the peripheral doesn't disappear out
 	 * from under us while we sleep.
 	 */
 
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP)
 		return (ENXIO);
 
 	cam_periph_assert(periph, MA_OWNED);
 	while ((periph->flags & CAM_PERIPH_LOCKED) != 0) {
 		/* Ask the current holder to wake us up in unhold. */
 		periph->flags |= CAM_PERIPH_LOCK_WANTED;
 		if ((error = cam_periph_sleep(periph, periph, priority,
 		    "caplck", 0)) != 0) {
 			cam_periph_release_locked(periph);
 			return (error);
 		}
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			cam_periph_release_locked(periph);
 			return (ENXIO);
 		}
 	}
 
 	periph->flags |= CAM_PERIPH_LOCKED;
 	return (0);
 }
 
 /*
  * Undo cam_periph_hold(): clear CAM_PERIPH_LOCKED, wake up any thread that
  * flagged CAM_PERIPH_LOCK_WANTED while waiting for the hold, and drop the
  * reference taken by cam_periph_hold().  The peripheral's lock must be
  * owned (asserted).
  */
 void
 cam_periph_unhold(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	periph->flags &= ~CAM_PERIPH_LOCKED;
 	if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) {
 		periph->flags &= ~CAM_PERIPH_LOCK_WANTED;
 		wakeup(periph);
 	}
 
 	cam_periph_release_locked(periph);
 }
 
 /*
  * Look for the next unit number that is not currently in use for this
  * peripheral type starting at "newunit".  Also exclude unit numbers that
  * are reserved for future "hardwiring" unless we already know that this
  * is a potential wired device.  Only assume that the device is "wired" the
  * first time through the loop since after that we'll be looking at unit
  * numbers that did not match a wiring entry.
  *
  * pathid, target and lun are only used in the diagnostic printed when a
  * wired unit number turns out to be taken already.  Returns the unit
  * number chosen.
  */
 static u_int
 camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired,
 		  path_id_t pathid, target_id_t target, lun_id_t lun)
 {
 	struct	cam_periph *periph;
 	char	*periph_name;
 	int	i, val, dunit, r;
 	const char *dname, *strval;
 
 	periph_name = p_drv->driver_name;
 	for (;;newunit++) {
 
 		/* Is "newunit" already used by an existing instance? */
 		for (periph = TAILQ_FIRST(&p_drv->units);
 		     periph != NULL && periph->unit_number != newunit;
 		     periph = TAILQ_NEXT(periph, unit_links))
 			;
 
 		if (periph != NULL && periph->unit_number == newunit) {
 			if (wired != 0) {
 				xpt_print(periph->path, "Duplicate Wired "
 				    "Device entry!\n");
 				xpt_print(periph->path, "Second device (%s "
 				    "device at scbus%d target %d lun %d) will "
 				    "not be wired\n", periph_name, pathid,
 				    target, lun);
 				wired = 0;
 			}
 			continue;
 		}
 		/* A free unit number requested by a wiring entry is final. */
 		if (wired)
 			break;
 
 		/*
 		 * Don't match entries like "da 4" as a wired down
 		 * device, but do match entries like "da 4 target 5"
 		 * or even "da 4 scbus 1". 
 		 */
 		i = 0;
 		dname = periph_name;
 		for (;;) {
 			r = resource_find_dev(&i, dname, &dunit, NULL, NULL);
 			if (r != 0)
 				break;
 			/* if no "target" and no specific scbus, skip */
 			if (resource_int_value(dname, dunit, "target", &val) &&
 			    (resource_string_value(dname, dunit, "at",&strval)||
 			     strcmp(strval, "scbus") == 0))
 				continue;
 			if (newunit == dunit)
 				break;
 		}
 		/*
 		 * r != 0 means the resource scan ended without finding an
 		 * entry that reserves "newunit", so it can be used.
 		 */
 		if (r != 0)
 			break;
 	}
 	return (newunit);
 }
 
 /*
  * Choose the unit number for a new instance of driver "p_drv" at
  * pathid/target/lun.
  *
  * The driver's resource entries are scanned for one whose "at" (compared
  * against "scbus<pathid>"), "target" and "lun" values, where present, all
  * match this device.  "wired" counts how many of those values matched.  If
  * such an entry is found its unit number is the starting point, otherwise
  * the search starts at 0.  The final number comes from camperiphnextunit().
  */
 static u_int
 camperiphunit(struct periph_driver *p_drv, path_id_t pathid,
 	      target_id_t target, lun_id_t lun)
 {
 	u_int	unit;
 	int	wired, i, val, dunit;
 	const char *dname, *strval;
 	char	pathbuf[32], *periph_name;
 
 	periph_name = p_drv->driver_name;
 	snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid);
 	unit = 0;
 	i = 0;
 	dname = periph_name;
 	/* "wired" is reset to 0 for every resource entry examined. */
 	for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0;
 	     wired = 0) {
 		if (resource_string_value(dname, dunit, "at", &strval) == 0) {
 			if (strcmp(strval, pathbuf) != 0)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "target", &val) == 0) {
 			if (val != target)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "lun", &val) == 0) {
 			if (val != lun)
 				continue;
 			wired++;
 		}
 		/* At least one value was present and none mismatched. */
 		if (wired != 0) {
 			unit = dunit;
 			break;
 		}
 	}
 
 	/*
 	 * Either start from 0 looking for the next unit or from
 	 * the unit number given in the resource config.  This way,
 	 * if we have wildcard matches, we don't return the same
 	 * unit number twice.
 	 */
 	unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun);
 
 	return (unit);
 }
 
 /*
  * Mark "periph" invalid.  The peripheral's lock must be owned (asserted).
  *
  * Does nothing if the peripheral is already invalid.  Otherwise the
  * peripheral is denounced (if it had been announced and the system is not
  * rebooting), CAM_PERIPH_INVALID is set, CAM_PERIPH_NEW_DEV_FOUND is
  * cleared, the driver's invalidation callback is run if there is one, and
  * one reference is dropped.
  */
 void
 cam_periph_invalidate(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	/*
 	 * We only call this routine the first time a peripheral is
 	 * invalidated.
 	 */
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0)
 		return;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n"));
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
 		xpt_denounce_periph(periph);
 	periph->flags |= CAM_PERIPH_INVALID;
 	periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND;
 	if (periph->periph_oninval != NULL)
 		periph->periph_oninval(periph);
 	cam_periph_release_locked(periph);
 }
 
 /*
  * Destroy a peripheral whose last reference has been dropped.
  *
  * The first lock operation here is xpt_unlock_buses() and the last one is
  * xpt_lock_buses(), so the function expects to be entered with the topology
  * lock held and returns with it held again.  The peripheral's own lock must
  * be owned as well (asserted).
  *
  * Order of operations: flag the periph CAM_PERIPH_FREE, run the driver's
  * destructor without the topology lock, unlink the periph from the driver's
  * unit list and remove it with xpt_remove_periph() under the topology lock,
  * deliver a deferred async callback if one was recorded, then free the path
  * and the structure.  If no driver matches the periph's name, a message is
  * printed and nothing is freed.
  */
 static void
 camperiphfree(struct cam_periph *periph)
 {
 	struct periph_driver **p_drv;
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating",
 	    periph->periph_name, periph->unit_number));
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("camperiphfree: attempt to free non-existant periph\n");
 		return;
 	}
 
 	/*
 	 * We need to set this flag before dropping the topology lock, to
 	 * let anyone who is traversing the list know that this peripheral
 	 * is about to be freed, and there will be no more reference count
 	 * checks.
 	 */
 	periph->flags |= CAM_PERIPH_FREE;
 
 	/*
 	 * The peripheral destructor semantics dictate calling with only the
 	 * SIM mutex held.  Since it might sleep, it should not be called
 	 * with the topology lock held.
 	 */
 	xpt_unlock_buses();
 
 	/*
 	 * We need to call the peripheral destructor prior to removing the
 	 * peripheral from the list.  Otherwise, we risk running into a
 	 * scenario where the peripheral unit number may get reused
 	 * (because it has been removed from the list), but some resources
 	 * used by the peripheral are still hanging around.  In particular,
 	 * the devfs nodes used by some peripherals like the pass(4) driver
 	 * aren't fully cleaned up until the destructor is run.  If the
 	 * unit number is reused before the devfs instance is fully gone,
 	 * devfs will panic.
 	 */
 	if (periph->periph_dtor != NULL)
 		periph->periph_dtor(periph);
 
 	/*
 	 * The peripheral list is protected by the topology lock.
 	 */
 	xpt_lock_buses();
 
 	TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
 	(*p_drv)->generation++;
 
 	xpt_remove_periph(periph);
 
 	xpt_unlock_buses();
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
 		xpt_print(periph->path, "Periph destroyed\n");
 	else
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 
 	/*
 	 * A new device was found on this path while the old instance was
 	 * still around (see cam_periph_alloc()).  Deliver the saved async
 	 * event now, using a CCB built on the stack as its argument where
 	 * the event code calls for one.
 	 */
 	if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
 		union ccb ccb;
 		void *arg;
 
 		switch (periph->deferred_ac) {
 		case AC_FOUND_DEVICE:
 			ccb.ccb_h.func_code = XPT_GDEV_TYPE;
 			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 			xpt_action(&ccb);
 			arg = &ccb;
 			break;
 		case AC_PATH_REGISTERED:
 			ccb.ccb_h.func_code = XPT_PATH_INQ;
 			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 			xpt_action(&ccb);
 			arg = &ccb;
 			break;
 		default:
 			arg = NULL;
 			break;
 		}
 		periph->deferred_callback(NULL, periph->deferred_ac,
 					  periph->path, arg);
 	}
 	xpt_free_path(periph->path);
 	free(periph, M_CAMPERIPH);
 	/* Retake the topology lock before returning to the caller. */
 	xpt_lock_buses();
 }
 
 /*
  * Map user virtual pointers into kernel virtual address space, so we can
  * access the memory.  This is now a generic function that centralizes most
  * of the sanity checks on the data flags, if any.
  * This also only works for up to MAXPHYS memory.  Since we use
  * buffers to map stuff in and out, we're limited to the buffer size.
  *
  * ccb     - request whose data pointer(s) are to be mapped; which fields
  *           are mapped depends on ccb->ccb_h.func_code.  On success the
  *           pointers in the CCB are replaced by the kernel addresses.
  * mapinfo - receives the buffers used for the mapping and the count of
  *           buffers in use (num_bufs_used is incremented per buffer).
  *           Pass it to cam_periph_unmapmem() afterwards.
  * maxmap  - largest mapping allowed per buffer, page offset included.  0
  *           selects DFLTPHYS and values above MAXPHYS are clamped.
  *           XPT_DEV_MATCH and XPT_DEV_ADVINFO always use MAXPHYS.
  *
  * Returns 0 on success (also when there is nothing to map), EINVAL for an
  * unsupported function code, a zero-length match buffer or a data type
  * other than CAM_DATA_VADDR, E2BIG when a buffer exceeds maxmap, and EACCES
  * when vmapbuf() fails.  On failure nothing is left mapped, the CCB's
  * pointers hold the user addresses again and mapinfo->num_bufs_used is
  * back at the value it had on entry.
  */
 int
 cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo,
     u_int maxmap)
 {
 	int numbufs, i, j;
 	int flags[CAM_PERIPH_MAXMAPS];
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
 	u_int32_t lengths[CAM_PERIPH_MAXMAPS];
 	u_int32_t dirs[CAM_PERIPH_MAXMAPS];
 
 	if (maxmap == 0)
 		maxmap = DFLTPHYS;	/* traditional default */
 	else if (maxmap > MAXPHYS)
 		maxmap = MAXPHYS;	/* for safety */
 	/* Collect the pointer(s), length(s) and direction(s) to map. */
 	switch(ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
 		if (ccb->cdm.match_buf_len == 0) {
 			printf("cam_periph_mapmem: invalid match buffer "
 			       "length 0\n");
 			return(EINVAL);
 		}
 		if (ccb->cdm.pattern_buf_len > 0) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
 			lengths[0] = ccb->cdm.pattern_buf_len;
 			dirs[0] = CAM_DIR_OUT;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[1] = ccb->cdm.match_buf_len;
 			dirs[1] = CAM_DIR_IN;
 			numbufs = 2;
 		} else {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[0] = ccb->cdm.match_buf_len;
 			dirs[0] = CAM_DIR_IN;
 			numbufs = 1;
 		}
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->csio.data_ptr;
 		lengths[0] = ccb->csio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_ATA_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->ataio.data_ptr;
 		lengths[0] = ccb->ataio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_SMP_IO:
 		data_ptrs[0] = &ccb->smpio.smp_request;
 		lengths[0] = ccb->smpio.smp_request_len;
 		dirs[0] = CAM_DIR_OUT;
 		data_ptrs[1] = &ccb->smpio.smp_response;
 		lengths[1] = ccb->smpio.smp_response_len;
 		dirs[1] = CAM_DIR_IN;
 		numbufs = 2;
 		break;
 	case XPT_DEV_ADVINFO:
 		if (ccb->cdai.bufsiz == 0)
 			return (0);
 
 		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
 		lengths[0] = ccb->cdai.bufsiz;
 		dirs[0] = CAM_DIR_IN;
 		numbufs = 1;
 
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	default:
 		return(EINVAL);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * Check the transfer length and permissions first, so we don't
 	 * have to unmap any previously mapped buffers.
 	 */
 	for (i = 0; i < numbufs; i++) {
 
 		flags[i] = 0;
 
 		/*
 		 * The userland data pointer passed in may not be page
 		 * aligned.  vmapbuf() truncates the address to a page
 		 * boundary, so if the address isn't page aligned, we'll
 		 * need enough space for the given transfer length, plus
 		 * whatever extra space is necessary to make it to the page
 		 * boundary.
 		 */
 		if ((lengths[i] +
 		    (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)) > maxmap){
 			printf("cam_periph_mapmem: attempt to map %lu bytes, "
 			       "which is greater than %lu\n",
 			       (long)(lengths[i] +
 			       (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)),
 			       (u_long)maxmap);
 			return(E2BIG);
 		}
 
 		if (dirs[i] & CAM_DIR_OUT) {
 			flags[i] = BIO_WRITE;
 		}
 
 		/* If both direction bits are set, BIO_READ is what is kept. */
 		if (dirs[i] & CAM_DIR_IN) {
 			flags[i] = BIO_READ;
 		}
 
 	}
 
 	/*
 	 * This keeps the kernel stack of current thread from getting
 	 * swapped.  In low-memory situations where the kernel stack might
 	 * otherwise get swapped out, this holds it and allows the thread
 	 * to make progress and release the kernel mapped pages sooner.
 	 *
 	 * XXX KDM should I use P_NOSWAP instead?
 	 */
 	PHOLD(curproc);
 
 	for (i = 0; i < numbufs; i++) {
 		/*
 		 * Get the buffer.
 		 */
 		mapinfo->bp[i] = getpbuf(NULL);
 
 		/* put our pointer in the data slot */
 		mapinfo->bp[i]->b_data = *data_ptrs[i];
 
 		/* save the user's data address */
 		mapinfo->bp[i]->b_caller1 = *data_ptrs[i];
 
 		/* set the transfer length, we know it's < MAXPHYS */
 		mapinfo->bp[i]->b_bufsize = lengths[i];
 
 		/* set the direction */
 		mapinfo->bp[i]->b_iocmd = flags[i];
 
 		/*
 		 * Map the buffer into kernel memory.
 		 *
 		 * Note that useracc() alone is not a  sufficient test.
 		 * vmapbuf() can still fail due to a smaller file mapped
 		 * into a larger area of VM, or if userland races against
 		 * vmapbuf() after the useracc() check.
 		 */
 		if (vmapbuf(mapinfo->bp[i], 1) < 0) {
 			/* Undo the buffers mapped so far ... */
 			for (j = 0; j < i; ++j) {
 				*data_ptrs[j] = mapinfo->bp[j]->b_caller1;
 				vunmapbuf(mapinfo->bp[j]);
 				relpbuf(mapinfo->bp[j], NULL);
 			}
 			/* ... and release the one that failed to map. */
 			relpbuf(mapinfo->bp[i], NULL);
 			/*
 			 * Take back the counts added for the buffers just
 			 * released.  Otherwise a later call to
 			 * cam_periph_unmapmem() would unmap and release
 			 * them a second time and drop the process hold
 			 * twice.
 			 */
 			mapinfo->num_bufs_used -= i;
 			PRELE(curproc);
 			return(EACCES);
 		}
 
 		/* set our pointer to the new mapped area */
 		*data_ptrs[i] = mapinfo->bp[i]->b_data;
 
 		mapinfo->num_bufs_used++;
 	}
 
 	/*
 	 * Now that we've gotten this far, change ownership to the kernel
 	 * of the buffers so that we don't run afoul of returning to user
 	 * space with locks (on the buffer) held.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		BUF_KERNPROC(mapinfo->bp[i]);
 	}
 
 
 	return(0);
 }
 
 /*
  * Unmap memory segments mapped into kernel virtual address space by
  * cam_periph_mapmem().
  *
  * For each buffer recorded in "mapinfo" (at most as many as the CCB's
  * function code can carry) the user's original address is written back
  * into the CCB, the buffer is unmapped and released; finally the hold on
  * the current process is dropped.  Nothing is done when
  * mapinfo->num_bufs_used is not positive.
  */
 void
 cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
 {
 	int numbufs, i;
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
 
 	if (mapinfo->num_bufs_used <= 0) {
 		/* nothing to free and the process wasn't held. */
 		return;
 	}
 
 	/* Same pointer selection as in cam_periph_mapmem(). */
 	switch (ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
 		numbufs = min(mapinfo->num_bufs_used, 2);
 
 		/* One buffer means only the match buffer was mapped. */
 		if (numbufs == 1) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
 		} else {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
 		}
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		data_ptrs[0] = &ccb->csio.data_ptr;
 		numbufs = min(mapinfo->num_bufs_used, 1);
 		break;
 	case XPT_ATA_IO:
 		data_ptrs[0] = &ccb->ataio.data_ptr;
 		numbufs = min(mapinfo->num_bufs_used, 1);
 		break;
 	case XPT_SMP_IO:
 		numbufs = min(mapinfo->num_bufs_used, 2);
 		data_ptrs[0] = &ccb->smpio.smp_request;
 		data_ptrs[1] = &ccb->smpio.smp_response;
 		break;
 	case XPT_DEV_ADVINFO:
 		numbufs = min(mapinfo->num_bufs_used, 1);
 		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
 		break;
 	default:
 		/*
 		 * NOTE(review): this path drops the process hold but does
 		 * not unmap or release the recorded buffers -- confirm it
 		 * cannot be reached with buffers still mapped.
 		 */
 		/* allow ourselves to be swapped once again */
 		PRELE(curproc);
 		return;
 		break; /* NOTREACHED */ 
 	}
 
 	for (i = 0; i < numbufs; i++) {
 		/* Set the user's pointer back to the original value */
 		*data_ptrs[i] = mapinfo->bp[i]->b_caller1;
 
 		/* unmap the buffer */
 		vunmapbuf(mapinfo->bp[i]);
 
 		/* release the buffer */
 		relpbuf(mapinfo->bp[i], NULL);
 	}
 
 	/* allow ourselves to be swapped once again */
 	PRELE(curproc);
 }
 
 /*
  * Handle the ioctls common to peripheral drivers.
  *
  * Only CAMGETPASSTHRU is handled: the device list is walked with
  * XPT_GDEVLIST requests until an entry whose name starts with "pass" is
  * found, and the resulting CCB is copied to "addr".  If the end of the
  * list is reached without a match, the name is set to the empty string and
  * the unit number to 0 before copying.  Any other command yields ENOTTY.
  *
  * error_routine is not used by this function.  Returns 0 or ENOTTY.
  */
 int
 cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr,
 		 int (*error_routine)(union ccb *ccb, 
 				      cam_flags camflags,
 				      u_int32_t sense_flags))
 {
 	union ccb 	     *ccb;
 	int 		     error;
 	int		     found;
 
 	error = found = 0;
 
 	switch(cmd){
 	case CAMGETPASSTHRU:
 		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 		xpt_setup_ccb(&ccb->ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		ccb->ccb_h.func_code = XPT_GDEVLIST;
 
 		/*
 		 * Basically, the point of this is that we go through
 		 * getting the list of devices, until we find a passthrough
 		 * device.  In the current version of the CAM code, the
 		 * only way to determine what type of device we're dealing
 		 * with is by its name.
 		 */
 		while (found == 0) {
 			/* (Re)start the scan from the first list entry. */
 			ccb->cgdl.index = 0;
 			ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS;
 			while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) {
 
 				/* we want the next device in the list */
 				xpt_action(ccb);
 				if (strncmp(ccb->cgdl.periph_name, 
 				    "pass", 4) == 0){
 					found = 1;
 					break;
 				}
 			}
 			if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) &&
 			    (found == 0)) {
 				ccb->cgdl.periph_name[0] = '\0';
 				ccb->cgdl.unit_number = 0;
 				break;
 			}
 		}
 
 		/* copy the result back out */	
 		bcopy(ccb, addr, sizeof(union ccb));
 
 		/* and release the ccb */
 		xpt_release_ccb(ccb);
 
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return(error);
 }
 
 /*
  * Completion callback installed by cam_periph_done() once a CCB has
  * completed.  It doubles as the "done" marker that cam_periph_ccbwait()
  * waits for, and panics if the same CCB is completed a second time.
  */
 static void
 cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	panic("%s: already done with ccb %p", __func__, done_ccb);
 }
 
 /*
  * Completion callback used by cam_periph_runccb().  Marks the CCB as done
  * by pointing its callback at cam_periph_done_panic(), then wakes up the
  * thread sleeping on the callback field in cam_periph_ccbwait().  The
  * path's lock must be owned (asserted).
  */
 static void
 cam_periph_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	/* Caller will release the CCB */
 	xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED);
 	done_ccb->ccb_h.cbfcnp = cam_periph_done_panic;
 	wakeup(&done_ccb->ccb_h.cbfcnp);
 }
 
 /*
  * Wait for "ccb" to complete.  For function codes with XPT_FC_QUEUED set,
  * sleep on the CCB's callback field until cam_periph_done() has replaced
  * the callback with cam_periph_done_panic().  Other function codes are not
  * waited for.  In both cases assert that the CCB's queue index is
  * CAM_UNQUEUED_INDEX and that its status is no longer CAM_REQ_INPROG.
  */
 static void
 cam_periph_ccbwait(union ccb *ccb)
 {
 
 	if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) {
 		while (ccb->ccb_h.cbfcnp != cam_periph_done_panic)
 			xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp,
 			    PRIBIO, "cbwait", 0);
 	}
 	KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG,
 	    ("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, "
 	     "status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code,
 	     ccb->ccb_h.status, ccb->ccb_h.pinfo.index));
 }
 
 /*
  * Submit "ccb" and wait for it to complete.
  *
  * ccb           - request to run; its path's lock must be owned and
  *                 CAM_UNLOCKED must not be set in its flags (asserted).
  * error_routine - optional handler called when the CCB completes with a
  *                 status other than CAM_REQ_CMP.  While it returns
  *                 ERESTART the wait is repeated.
  * camflags, sense_flags - passed through to error_routine.
  * ds            - optional devstat structure.  For XPT_SCSI_IO and
  *                 XPT_ATA_IO requests the transaction is recorded in it.
  *
  * Returns 0 when the CCB completed with CAM_REQ_CMP or when no
  * error_routine was given; otherwise the value error_routine returned.  If
  * the final status has CAM_DEV_QFRZN set, cam_release_devq() is called and
  * the bit is cleared.
  */
 int
 cam_periph_runccb(union ccb *ccb,
 		  int (*error_routine)(union ccb *ccb,
 				       cam_flags camflags,
 				       u_int32_t sense_flags),
 		  cam_flags camflags, u_int32_t sense_flags,
 		  struct devstat *ds)
 {
 	struct bintime *starttime;
 	struct bintime ltime;
 	int error;
  
 	starttime = NULL;
 	xpt_path_assert(ccb->ccb_h.path, MA_OWNED);
 	KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0,
 	    ("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb,
 	     ccb->ccb_h.func_code, ccb->ccb_h.flags));
 
 	/*
 	 * If the user has supplied a stats structure, and if we understand
 	 * this particular type of ccb, record the transaction start.
 	 */
 	if ((ds != NULL) && (ccb->ccb_h.func_code == XPT_SCSI_IO ||
 	    ccb->ccb_h.func_code == XPT_ATA_IO)) {
 		starttime = &ltime;
 		binuptime(starttime);
 		devstat_start_transaction(ds, starttime);
 	}
 
 	ccb->ccb_h.cbfcnp = cam_periph_done;
 	xpt_action(ccb);
  
 	do {
 		cam_periph_ccbwait(ccb);
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 			error = 0;
 		else if (error_routine != NULL) {
 			/*
 			 * Re-arm the completion callback before calling the
 			 * error routine, so that a CCB it requeues can be
 			 * waited for again.
 			 */
 			ccb->ccb_h.cbfcnp = cam_periph_done;
 			error = (*error_routine)(ccb, camflags, sense_flags);
 		} else
 			error = 0;
 
 	} while (error == ERESTART);
           
 	if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 		cam_release_devq(ccb->ccb_h.path,
 				 /* relsim_flags */0,
 				 /* openings */0,
 				 /* timeout */0,
 				 /* getcount_only */ FALSE);
 		ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 	}
 
 	/*
 	 * Record the end of the transaction: bytes actually transferred
 	 * (requested length minus residual) and the data direction.
 	 */
 	if (ds != NULL) {
 		if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			devstat_end_transaction(ds,
 					ccb->csio.dxfer_len - ccb->csio.resid,
 					ccb->csio.tag_action & 0x3,
 					((ccb->ccb_h.flags & CAM_DIR_MASK) ==
 					CAM_DIR_NONE) ?  DEVSTAT_NO_DATA : 
 					(ccb->ccb_h.flags & CAM_DIR_OUT) ?
 					DEVSTAT_WRITE : 
 					DEVSTAT_READ, NULL, starttime);
 		} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 			devstat_end_transaction(ds,
 					ccb->ataio.dxfer_len - ccb->ataio.resid,
 					0, /* Not used in ATA */
 					((ccb->ccb_h.flags & CAM_DIR_MASK) ==
 					CAM_DIR_NONE) ?  DEVSTAT_NO_DATA : 
 					(ccb->ccb_h.flags & CAM_DIR_OUT) ?
 					DEVSTAT_WRITE : 
 					DEVSTAT_READ, NULL, starttime);
 		}
 	}
 
 	return(error);
 }
 
 void
 cam_freeze_devq(struct cam_path *path)
 {
 	struct ccb_hdr ccb_h;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n"));
 	xpt_setup_ccb(&ccb_h, path, /*priority*/1);
 	ccb_h.func_code = XPT_NOOP;
 	ccb_h.flags = CAM_DEV_QFREEZE;
 	xpt_action((union ccb *)&ccb_h);
 }
 
 /*
  * Issue an XPT_REL_SIMQ request on 'path' and return the frozen count
  * reported back in the CCB.
  *
  * relsim_flags, openings and arg are copied into the request's
  * release_flags, openings and release_timeout fields respectively.
  * When getcount_only is non-zero the CCB is sent with CAM_DEV_QFREEZE
  * set; otherwise its flags are zero.
  */
 u_int32_t
 cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
 		 u_int32_t openings, u_int32_t arg,
 		 int getcount_only)
 {
 	struct ccb_relsim relsim_ccb;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n",
 	    relsim_flags, openings, arg, getcount_only));
 
 	xpt_setup_ccb(&relsim_ccb.ccb_h, path, CAM_PRIORITY_NORMAL);
 	relsim_ccb.ccb_h.func_code = XPT_REL_SIMQ;
 	if (getcount_only)
 		relsim_ccb.ccb_h.flags = CAM_DEV_QFREEZE;
 	else
 		relsim_ccb.ccb_h.flags = 0;
 
 	relsim_ccb.release_timeout = arg;
 	relsim_ccb.openings = openings;
 	relsim_ccb.release_flags = relsim_flags;
 
 	xpt_action((union ccb *)&relsim_ccb);
 
 	return (relsim_ccb.qfrozen_cnt);
 }
 
 /*
  * A recovery CCB remembers the saved copy of the original request in the
  * CCB header field ppriv_ptr0; this alias names that use.
  */
 #define saved_ccb_ptr ppriv_ptr0
 /*
  * Completion callback for the recovery commands (START STOP UNIT and
  * TEST UNIT READY) that camperiphscsisenseerror() builds in place of a
  * failed request.
  *
  * On failure of the recovery command it either retries it (a START STOP
  * UNIT rejected with ASC/ASCQ 0x24/0x00 is retried once without the
  * load/eject bit; otherwise cam_periph_error() decides).  When no further
  * retry of the recovery command is pending, the saved original CCB is
  * copied back over done_ccb, the saved copy is freed, and the original
  * request is resubmitted.
  *
  * Every path ends by releasing one device-queue freeze, matching the
  * CAM_DEV_QFREEZE flag the recovery CCB was sent with.
  */
 static void
 camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	union ccb      *saved_ccb;
 	cam_status	status;
 	struct scsi_start_stop_unit *scsi_cmd;
 	int    error_code, sense_key, asc, ascq;
 
 	/* View the CDB bytes as a START STOP UNIT command to test its opcode. */
 	scsi_cmd = (struct scsi_start_stop_unit *)
 	    &done_ccb->csio.cdb_io.cdb_bytes;
 	status = done_ccb->ccb_h.status;
 
 	if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (scsi_extract_sense_ccb(done_ccb,
 		    &error_code, &sense_key, &asc, &ascq)) {
 			/*
 			 * If the error is "invalid field in CDB",
 			 * and the load/eject flag is set, turn the
 			 * flag off and try again.  This is just in
 			 * case the drive in question barfs on the
 			 * load eject flag.  The CAM code should set
 			 * the load/eject flag by default for
 			 * removable media.
 			 */
 			if ((scsi_cmd->opcode == START_STOP_UNIT) &&
 			    ((scsi_cmd->how & SSS_LOEJ) != 0) &&
 			     (asc == 0x24) && (ascq == 0x00)) {
 				scsi_cmd->how &= ~SSS_LOEJ;
 				/* Drop the freeze reported with the error. */
 				if (status & CAM_DEV_QFRZN) {
 					cam_release_devq(done_ccb->ccb_h.path,
 					    0, 0, 0, 0);
 					done_ccb->ccb_h.status &=
 					    ~CAM_DEV_QFRZN;
 				}
 				xpt_action(done_ccb);
 				goto out;
 			}
 		}
 		/* ERESTART means cam_periph_error() already requeued the CCB. */
 		if (cam_periph_error(done_ccb,
 		    0, SF_RETRY_UA | SF_NO_PRINT, NULL) == ERESTART)
 			goto out;
 		if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) {
 			cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 			done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		}
 	} else {
 		/*
 		 * If we have successfully taken a device from the not
 		 * ready to ready state, re-scan the device and re-get
 		 * the inquiry information.  Many devices (mostly disks)
 		 * don't properly report their inquiry information unless
 		 * they are spun up.
 		 */
 		if (scsi_cmd->opcode == START_STOP_UNIT)
 			xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL);
 	}
 
 	/*
 	 * Perform the final retry with the original CCB so that final
 	 * error processing is performed by the owner of the CCB.
 	 */
 	saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr;
 	bcopy(saved_ccb, done_ccb, sizeof(*done_ccb));
 	xpt_free_ccb(saved_ccb);
 	/*
 	 * Recovery is finished unless the restored CCB is itself a
 	 * recovery CCB (its callback is still this function).
 	 */
 	if (done_ccb->ccb_h.cbfcnp != camperiphdone)
 		periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG;
 	xpt_action(done_ccb);
 
 out:
 	/* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */
 	cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 }
 
 /*
  * Default handler for asynchronous events.  Peripheral drivers deal with
  * the events they care about themselves and pass everything else here.
  *
  * The only event acted upon is AC_LOST_DEVICE, which invalidates the
  * peripheral; every other code is ignored.  'path' and 'arg' are unused.
  */
 void
 cam_periph_async(struct cam_periph *periph, u_int32_t code,
 		 struct cam_path *path, void *arg)
 {
 	if (code == AC_LOST_DEVICE)
 		cam_periph_invalidate(periph);
 }
 
 /*
  * Query the device statistics for the peripheral's path (XPT_GDEV_STATS)
  * and hand the reported last_reset time to
  * cam_periph_freeze_after_event() together with bus_settle, the settle
  * period in milliseconds.
  */
 void
 cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle)
 {
 	struct ccb_getdevstats devstats;
 	union ccb *query;
 
 	xpt_setup_ccb(&devstats.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	devstats.ccb_h.func_code = XPT_GDEV_STATS;
 
 	query = (union ccb *)&devstats;
 	xpt_action(query);
 
 	cam_periph_freeze_after_event(periph, &devstats.last_reset, bus_settle);
 }
 
 /*
  * Keep the peripheral's device queue frozen until duration_ms
  * milliseconds have passed since *event_time.
  *
  * Nothing is done when event_time is unset or when the interval has
  * already elapsed.  Otherwise the queue is frozen and a timed release
  * (RELSIM_RELEASE_AFTER_TIMEOUT) is scheduled for the remaining time.
  */
 void
 cam_periph_freeze_after_event(struct cam_periph *periph,
 			      struct timeval* event_time, u_int duration_ms)
 {
 	struct timeval elapsed;
 	struct timeval remaining;
 	u_int remaining_ms;
 
 	if (!timevalisset(event_time))
 		return;
 
 	/* elapsed = now - event_time */
 	microtime(&elapsed);
 	timevalsub(&elapsed, event_time);
 
 	/* Start with the full requested interval expressed as a timeval. */
 	remaining.tv_sec = duration_ms / 1000;
 	remaining.tv_usec = (duration_ms % 1000) * 1000;
 
 	/* The settle period is already over: no freeze needed. */
 	if (timevalcmp(&elapsed, &remaining, >=))
 		return;
 
 	/* remaining = interval - elapsed, converted back to milliseconds. */
 	timevalsub(&remaining, &elapsed);
 	remaining_ms = remaining.tv_sec * 1000;
 	remaining_ms += remaining.tv_usec / 1000;
 
 	cam_freeze_devq(periph->path);
 	cam_release_devq(periph->path,
 			RELSIM_RELEASE_AFTER_TIMEOUT,
 			/*reduction*/0,
 			/*timeout*/remaining_ms,
 			/*getcount_only*/0);
 }
 
 /*
  * Map the SCSI status byte of a failed SCSI I/O CCB to an errno value and
  * to the queue-release parameters the caller should use.
  *
  * ccb            - the failed request (csio.scsi_status is examined).
  * orig_ccb       - passed through to camperiphscsisenseerror().
  * camflags       - passed through to camperiphscsisenseerror().
  * sense_flags    - SF_* flags; SF_RETRY_BUSY makes BUSY retry without
  *                  consuming a retry.
  * openings       - out: new opening count after QUEUE FULL.
  * relsim_flags   - out: RELSIM_* flags for the queue release.
  * timeout        - out: release timeout in milliseconds.
  * action         - in/out: SSQ_PRINT_SENSE is cleared for QUEUE FULL
  *                  retries.
  * action_string  - passed through to camperiphscsisenseerror().
  *
  * Returns 0 for good statuses, ERESTART when the command should be
  * retried, EIO otherwise (or whatever camperiphscsisenseerror() returns
  * for CHECK CONDITION / COMMAND TERMINATED).
  */
 static int
 camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	int error;
 
 	switch (ccb->csio.scsi_status) {
 	case SCSI_STATUS_OK:
 	case SCSI_STATUS_COND_MET:
 	case SCSI_STATUS_INTERMED:
 	case SCSI_STATUS_INTERMED_COND_MET:
 		error = 0;
 		break;
 	case SCSI_STATUS_CMD_TERMINATED:
 	case SCSI_STATUS_CHECK_COND:
 		error = camperiphscsisenseerror(ccb, orig_ccb,
 					        camflags,
 					        sense_flags,
 					        openings,
 					        relsim_flags,
 					        timeout,
 					        action,
 					        action_string);
 		break;
 	case SCSI_STATUS_QUEUE_FULL:
 	{
 		/* no decrement */
 		struct ccb_getdevstats cgds;
 
 		/*
 		 * First off, find out what the current
 		 * transaction counts are.
 		 */
 		xpt_setup_ccb(&cgds.ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		cgds.ccb_h.func_code = XPT_GDEV_STATS;
 		xpt_action((union ccb *)&cgds);
 
 		/*
 		 * If we were the only transaction active, treat
 		 * the QUEUE FULL as if it were a BUSY condition.
 		 */
 		if (cgds.dev_active != 0) {
 			int total_openings;
 
 			/*
 			 * Reduce the number of openings to
 			 * be 1 less than the amount it took
 			 * to get a queue full bounded by the
 			 * minimum allowed tag count for this
 			 * device.
 			 */
 			total_openings = cgds.dev_active + cgds.dev_openings;
 			*openings = cgds.dev_active;
 			if (*openings < cgds.mintags)
 				*openings = cgds.mintags;
 			if (*openings < total_openings)
 				*relsim_flags = RELSIM_ADJUST_OPENINGS;
 			else {
 				/*
 				 * Some devices report queue full for
 				 * temporary resource shortages.  For
 				 * this reason, we allow a minimum
 				 * tag count to be entered via a
 				 * quirk entry to prevent the queue
 				 * count on these devices from falling
 				 * to a pessimistically low value.  We
 				 * still wait for the next successful
 				 * completion, however, before queueing
 				 * more transactions to the device.
 				 */
 				*relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
 			}
 			*timeout = 0;
 			error = ERESTART;
 			*action &= ~SSQ_PRINT_SENSE;
 			break;
 		}
 		/* FALLTHROUGH */
 	}
 	case SCSI_STATUS_BUSY:
 		/*
 		 * Restart the queue after either another
 		 * command completes or a 1 second timeout.
 		 *
 		 * A retry is consumed only when SF_RETRY_BUSY is not set
 		 * and one is actually available.  The counter must never
 		 * be decremented at zero: it is unsigned and would wrap,
 		 * making an exhausted request look like it has retries
 		 * left.
 		 */
 		if ((sense_flags & SF_RETRY_BUSY) != 0 ||
 		    ccb->ccb_h.retry_count > 0) {
 			if ((sense_flags & SF_RETRY_BUSY) == 0)
 				ccb->ccb_h.retry_count--;
 			error = ERESTART;
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
 				      | RELSIM_RELEASE_AFTER_CMDCMPLT;
 			*timeout = 1000;
 		} else {
 			error = EIO;
 		}
 		break;
 	case SCSI_STATUS_RESERV_CONFLICT:
 	default:
 		error = EIO;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Decide how to react to a SCSI command that completed with sense data.
  *
  * The action is chosen by scsi_error_action() from the sense data and the
  * device's inquiry data.  For SS_START and SS_TUR the failed CCB is saved
  * into a freshly allocated copy and the CCB itself is rewritten in place
  * into a recovery command (START STOP UNIT or TEST UNIT READY) whose
  * completion callback is camperiphdone(); that callback later restores
  * and resubmits the saved request.
  *
  * ccb            - the failed request; may be rewritten into a recovery
  *                  command.
  * orig           - out: set to the saved copy when a recovery command is
  *                  built, otherwise left untouched.
  * camflags       - not referenced in this function body.
  * sense_flags    - SF_* flags forwarded to scsi_error_action().
  * openings       - not referenced in this function body.
  * relsim_flags,
  * timeout        - out: queue release policy for delayed retries.
  * action         - in/out: receives the selected scsi_sense_action, or
  *                  has SSQ_PRINT_SENSE cleared when recovery is already
  *                  running.
  * action_string  - out: human-readable description of the decision.
  *
  * Returns 0, ERESTART, or the errno encoded in the sense action.
  */
 static int
 camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	struct cam_periph *periph;
 	union ccb *orig_ccb = ccb;
 	int error, recoveryccb;
 
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL)
+		biotrack(ccb->csio.bio, __func__);
+#endif
+
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	/* A CCB whose callback is camperiphdone is itself a recovery command. */
 	recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone);
 	if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) {
 		/*
 		 * If error recovery is already in progress, don't attempt
 		 * to process this error, but requeue it unconditionally
 		 * and attempt to process it once error recovery has
 		 * completed.  This failed command is probably related to
 		 * the error that caused the currently active error recovery
 		 * action so our current recovery efforts should also
 		 * address this command.  Be aware that the error recovery
 		 * code assumes that only one recovery action is in progress
 		 * on a particular peripheral instance at any given time
 		 * (e.g. only one saved CCB for error recovery) so it is
 		 * imperative that we don't violate this assumption.
 		 */
 		error = ERESTART;
 		*action &= ~SSQ_PRINT_SENSE;
 	} else {
 		scsi_sense_action err_action;
 		struct ccb_getdev cgd;
 
 		/*
 		 * Grab the inquiry data for this device.
 		 */
 		xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
 		cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)&cgd);
 
 		err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
 		    sense_flags);
 		/* Default errno; the switch below may override it. */
 		error = err_action & SS_ERRMASK;
 
 		/*
 		 * Do not autostart sequential access devices
 		 * to avoid unexpected tape loading.
 		 */
 		if ((err_action & SS_MASK) == SS_START &&
 		    SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) {
 			*action_string = "Will not autostart a "
 			    "sequential access device";
 			goto sense_error_done;
 		}
 
 		/*
 		 * Avoid recovery recursion if recovery action is the same.
 		 */
 		if ((err_action & SS_MASK) >= SS_START && recoveryccb) {
 			if (((err_action & SS_MASK) == SS_START &&
 			     ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) ||
 			    ((err_action & SS_MASK) == SS_TUR &&
 			     (ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) {
 				/* Plain delayed retry of the recovery command. */
 				err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
 				*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				*timeout = 500;
 			}
 		}
 
 		/*
 		 * If the recovery action will consume a retry,
 		 * make sure we actually have retries available.
 		 */
 		if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
 		 	if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0)
 		 		ccb->ccb_h.retry_count--;
 			else {
 				*action_string = "Retries exhausted";
 				goto sense_error_done;
 			}
 		}
 
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Do common portions of commands that
 			 * use recovery CCBs.
 			 */
 			orig_ccb = xpt_alloc_ccb_nowait();
 			if (orig_ccb == NULL) {
 				*action_string = "Can't allocate recovery CCB";
 				goto sense_error_done;
 			}
 			/*
 			 * Clear freeze flag for original request here, as
 			 * this freeze will be dropped as part of ERESTART.
 			 */
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 			/* Save the failed request before it is overwritten. */
 			bcopy(ccb, orig_ccb, sizeof(*orig_ccb));
 		}
 
 		switch (err_action & SS_MASK) {
 		case SS_NOP:
 			*action_string = "No recovery action needed";
 			error = 0;
 			break;
 		case SS_RETRY:
 			*action_string = "Retrying command (per sense data)";
 			error = ERESTART;
 			break;
 		case SS_FAIL:
 			/* error keeps the errno taken from err_action above. */
 			*action_string = "Unretryable error";
 			break;
 		case SS_START:
 		{
 			int le;
 
 			/*
 			 * Send a start unit command to the device, and
 			 * then retry the command.
 			 */
 			*action_string = "Attempting to start unit";
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 
 			/*
 			 * Check for removable media and set
 			 * load/eject flag appropriately.
 			 */
 			if (SID_IS_REMOVABLE(&cgd.inq_data))
 				le = TRUE;
 			else
 				le = FALSE;
 
 			scsi_start_stop(&ccb->csio,
 					/*retries*/1,
 					camperiphdone,
 					MSG_SIMPLE_Q_TAG,
 					/*start*/TRUE,
 					/*load/eject*/le,
 					/*immediate*/FALSE,
 					SSD_FULL_SIZE,
 					/*timeout*/50000);
 			break;
 		}
 		case SS_TUR:
 		{
 			/*
 			 * Send a Test Unit Ready to the device.
 			 * If the 'many' flag is set, we send 120
 			 * test unit ready commands, one every half 
 			 * second.  Otherwise, we just send one TUR.
 			 * We only want to do this if the retry 
 			 * count has not been exhausted.
 			 */
 			int retries;
 
 			if ((err_action & SSQ_MANY) != 0) {
 				*action_string = "Polling device for readiness";
 				retries = 120;
 			} else {
 				*action_string = "Testing device for readiness";
 				retries = 1;
 			}
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 			scsi_test_unit_ready(&ccb->csio,
 					     retries,
 					     camperiphdone,
 					     MSG_SIMPLE_Q_TAG,
 					     SSD_FULL_SIZE,
 					     /*timeout*/5000);
 
 			/*
 			 * Accomplish our 500ms delay by deferring
 			 * the release of our device queue appropriately.
 			 */
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 			*timeout = 500;
 			break;
 		}
 		default:
 			panic("Unhandled error action %x", err_action);
 		}
 		
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Drop the priority, so that the recovery
 			 * CCB is the first to execute.  Freeze the queue
 			 * after this command is sent so that we can
 			 * restore the old csio and have it queued in
 			 * the proper order before we release normal 
 			 * transactions to the device.
 			 */
 			ccb->ccb_h.pinfo.priority--;
 			ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			/* camperiphdone() restores the request from this copy. */
 			ccb->ccb_h.saved_ccb_ptr = orig_ccb;
 			error = ERESTART;
 			*orig = orig_ccb;
 		}
 
 sense_error_done:
 		*action = err_action;
 	}
 	return (error);
 }
 
 /*
  * Generic error handler.  Peripheral drivers usually filter
  * out the errors that they handle in a unique manner, then
  * call this function.
  *
  * ccb          - the completed request whose status is being examined.
  * camflags     - CAM_* behaviour flags (CAM_RETRY_SELTO is honoured here).
  * sense_flags  - SF_* flags controlling retries and printing.
  * save_ccb     - not referenced in this function body.
  *
  * Returns 0 when the status is not an error, ERESTART when the request
  * has been resubmitted via xpt_action(), or an errno (EIO, ENXIO,
  * EINVAL, or a value supplied by the SCSI status handler) when the
  * request has failed for good.
  *
  * Side effects visible below: may print the error, send a devctl
  * notification, announce AC_LOST_DEVICE / AC_UNIT_ATTENTION, request a
  * target rescan, and release the device queue if the CCB reported it
  * frozen.
  */
 int
 cam_periph_error(union ccb *ccb, cam_flags camflags,
 		 u_int32_t sense_flags, union ccb *save_ccb)
 {
 	struct cam_path *newpath;
 	union ccb  *orig_ccb, *scan_ccb;
 	struct cam_periph *periph;
 	const char *action_string;
 	cam_status  status;
 	int	    frozen, error, openings, devctl_err;
 	u_int32_t   action, relsim_flags, timeout;
 
 	action = SSQ_PRINT_SENSE;
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	action_string = NULL;
 	status = ccb->ccb_h.status;
 	/* Remember the freeze bit before masking it out of status. */
 	frozen = (status & CAM_DEV_QFRZN) != 0;
 	status &= CAM_STATUS_MASK;
 	devctl_err = openings = relsim_flags = timeout = 0;
 	/*
 	 * orig_ccb is replaced by the saved copy if the SCSI handler turns
 	 * ccb into a recovery command; reporting below uses orig_ccb.
 	 */
 	orig_ccb = ccb;
 
 	/* Filter the errors that should be reported via devctl */
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 	case CAM_REQ_ABORTED:
 	case CAM_REQ_CMP_ERR:
 	case CAM_REQ_TERMIO:
 	case CAM_UNREC_HBA_ERROR:
 	case CAM_DATA_RUN_ERR:
 	case CAM_SCSI_STATUS_ERROR:
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_SMP_STATUS_ERROR:
 		devctl_err++;
 		break;
 	default:
 		break;
 	}
 
 	switch (status) {
 	case CAM_REQ_CMP:
 		error = 0;
 		action &= ~SSQ_PRINT_SENSE;
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		error = camperiphscsistatuserror(ccb, &orig_ccb,
 		    camflags, sense_flags, &openings, &relsim_flags,
 		    &timeout, &action, &action_string);
 		break;
 	case CAM_AUTOSENSE_FAIL:
 		error = EIO;	/* we have to kill the command */
 		break;
 	case CAM_UA_ABORT:
 	case CAM_UA_TERMIO:
 	case CAM_MSG_REJECT_REC:
 		/* XXX Don't know that these are correct */
 		error = EIO;
 		break;
 	case CAM_SEL_TIMEOUT:
 		if ((camflags & CAM_RETRY_SELTO) != 0) {
 			if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0) {
 				ccb->ccb_h.retry_count--;
 				error = ERESTART;
 
 				/*
 				 * Wait a bit to give the device
 				 * time to recover before we try again.
 				 */
 				relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				timeout = periph_selto_delay;
 				break;
 			}
 			action_string = "Retries exhausted";
 		}
 		/* FALLTHROUGH */
 	case CAM_DEV_NOT_THERE:
 		error = ENXIO;
 		/* SSQ_LOST triggers the AC_LOST_DEVICE announcement below. */
 		action = SSQ_LOST;
 		break;
 	case CAM_REQ_INVALID:
 	case CAM_PATH_INVALID:
 	case CAM_NO_HBA:
 	case CAM_PROVIDE_FAIL:
 	case CAM_REQ_TOO_BIG:
 	case CAM_LUN_INVALID:
 	case CAM_TID_INVALID:
 	case CAM_FUNC_NOTAVAIL:
 		error = EINVAL;
 		break;
 	case CAM_SCSI_BUS_RESET:
 	case CAM_BDR_SENT:
 		/*
 		 * Commands that repeatedly timeout and cause these
 		 * kinds of error recovery actions, should return
 		 * CAM_CMD_TIMEOUT, which allows us to safely assume
 		 * that this command was an innocent bystander to
 		 * these events and should be unconditionally
 		 * retried.
 		 */
 	case CAM_REQUEUE_REQ:
 		/* Unconditional requeue if device is still there */
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			action_string = "Periph was invalidated";
 			error = EIO;
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			error = ERESTART;
 			action &= ~SSQ_PRINT_SENSE;
 		}
 		break;
 	case CAM_RESRC_UNAVAIL:
 		/* Wait a bit for the resource shortage to abate. */
 		timeout = periph_noresrc_delay;
 		/* FALLTHROUGH */
 	case CAM_BUSY:
 		if (timeout == 0) {
 			/* Wait a bit for the busy condition to abate. */
 			timeout = periph_busy_delay;
 		}
 		relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 		/* FALLTHROUGH */
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_REQ_CMP_ERR:
 	case CAM_CMD_TIMEOUT:
 	case CAM_UNEXP_BUSFREE:
 	case CAM_UNCOR_PARITY:
 	case CAM_DATA_RUN_ERR:
 	default:
 		/* Generic policy: retry while retries remain and are allowed. */
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			error = EIO;
 			action_string = "Periph was invalidated";
 		} else if (ccb->ccb_h.retry_count == 0) {
 			error = EIO;
 			action_string = "Retries exhausted";
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			ccb->ccb_h.retry_count--;
 			error = ERESTART;
 		}
 		break;
 	}
 
 	/*
 	 * Printing policy: SF_PRINT_ALWAYS or path debugging forces output,
 	 * otherwise SF_NO_PRINT suppresses it.
 	 */
 	if ((sense_flags & SF_PRINT_ALWAYS) ||
 	    CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO))
 		action |= SSQ_PRINT_SENSE;
 	else if (sense_flags & SF_NO_PRINT)
 		action &= ~SSQ_PRINT_SENSE;
 	if ((action & SSQ_PRINT_SENSE) != 0)
 		cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL);
 	if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) {
 		if (error != ERESTART) {
 			if (action_string == NULL)
 				action_string = "Unretryable error";
 			xpt_print(ccb->ccb_h.path, "Error %d, %s\n",
 			    error, action_string);
 		} else if (action_string != NULL)
 			xpt_print(ccb->ccb_h.path, "%s\n", action_string);
 		else
 			xpt_print(ccb->ccb_h.path, "Retrying command\n");
 	}
 
 	if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0))
 		cam_periph_devctl_notify(orig_ccb);
 
 	if ((action & SSQ_LOST) != 0) {
 		lun_id_t lun_id;
 
 		/*
 		 * For a selection timeout, we consider all of the LUNs on
 		 * the target to be gone.  If the status is CAM_DEV_NOT_THERE,
 		 * then we only get rid of the device(s) specified by the
 		 * path in the original CCB.
 		 */
 		if (status == CAM_SEL_TIMEOUT)
 			lun_id = CAM_LUN_WILDCARD;
 		else
 			lun_id = xpt_path_lun_id(ccb->ccb_h.path);
 
 		/* Should we do more if we can't create the path?? */
 		if (xpt_create_path(&newpath, periph,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    lun_id) == CAM_REQ_CMP) {
 
 			/*
 			 * Let peripheral drivers know that this
 			 * device has gone away.
 			 */
 			xpt_async(AC_LOST_DEVICE, newpath, NULL);
 			xpt_free_path(newpath);
 		}
 	}
 
 	/* Broadcast UNIT ATTENTIONs to all periphs. */
 	if ((action & SSQ_UA) != 0)
 		xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb);
 
 	/* Rescan target on "Reported LUNs data has changed" */
 	if ((action & SSQ_RESCAN) != 0) {
 		if (xpt_create_path(&newpath, NULL,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    CAM_LUN_WILDCARD) == CAM_REQ_CMP) {
 
 			scan_ccb = xpt_alloc_ccb_nowait();
 			if (scan_ccb != NULL) {
 				/*
 				 * newpath is handed to xpt_rescan() with the
 				 * CCB and is not freed here on this branch.
 				 */
 				scan_ccb->ccb_h.path = newpath;
 				scan_ccb->ccb_h.func_code = XPT_SCAN_TGT;
 				scan_ccb->crcn.flags = 0;
 				xpt_rescan(scan_ccb);
 			} else {
 				xpt_print(newpath,
 				    "Can't allocate CCB to rescan target\n");
 				xpt_free_path(newpath);
 			}
 		}
 	}
 
 	/* Attempt a retry */
 	if (error == ERESTART || error == 0) {
 		/*
 		 * The freeze bit is cleared before resubmitting and the
 		 * queue is released only after xpt_action() has been
 		 * called, using the release policy chosen above.
 		 */
 		if (frozen != 0)
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		if (error == ERESTART)
 			xpt_action(ccb);
 		if (frozen != 0)
 			cam_release_devq(ccb->ccb_h.path,
 					 relsim_flags,
 					 openings,
 					 timeout,
 					 /*getcount_only*/0);
 	}
 
 	return (error);
 }
 
 #define CAM_PERIPH_DEVD_MSG_SIZE	256
 
 static void
 cam_periph_devctl_notify(union ccb *ccb)
 {
 	struct cam_periph *periph;
 	struct ccb_getdev *cgd;
 	struct sbuf sb;
 	int serr, sk, asc, ascq;
 	char *sbmsg, *type;
 
 	sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
 	if (sbmsg == NULL)
 		return;
 
 	sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
 	    periph->unit_number);
 
 	sbuf_printf(&sb, "serial=\"");
 	if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
 		xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
 		    CAM_PRIORITY_NORMAL);
 		cgd->ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)cgd);
 
 		if (cgd->ccb_h.status == CAM_REQ_CMP)
 			sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
 		xpt_free_ccb((union ccb *)cgd);
 	}
 	sbuf_printf(&sb, "\" ");
 	sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
 
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 		sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
 		type = "timeout";
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
 		if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
 			sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
 			    serr, sk, asc, ascq);
 		type = "error";
 		break;
 	case CAM_ATA_STATUS_ERROR:
 		sbuf_printf(&sb, "RES=\"");
 		ata_res_sbuf(&ccb->ataio.res, &sb);
 		sbuf_printf(&sb, "\" ");
 		type = "error";
 		break;
 	default:
 		type = "error";
 		break;
 	}
 
 	if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 		sbuf_printf(&sb, "CDB=\"");
 		if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0)
 			scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_ptr, &sb);
 		else
 			scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_bytes, &sb);
 		sbuf_printf(&sb, "\" ");
 	} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 		sbuf_printf(&sb, "ACB=\"");
 		ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
 		sbuf_printf(&sb, "\" ");
 	}
 
 	if (sbuf_finish(&sb) == 0)
 		devctl_notify("CAM", "periph", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 	free(sbmsg, M_CAMPERIPH);
 }
 
Index: head/sys/cam/cam_xpt.c
===================================================================
--- head/sys/cam/cam_xpt.c	(revision 308154)
+++ head/sys/cam/cam_xpt.c	(revision 308155)
@@ -1,5407 +1,5424 @@
 /*-
  * Implementation of the Common Access Method Transport (XPT) layer.
  *
  * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/time.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/interrupt.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/kthread.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_compat.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_pass.h>
 
 #include <machine/md_var.h>	/* geometry translation */
 #include <machine/stdarg.h>	/* for xpt_print below */
 
 #include "opt_cam.h"
 
 /*
  * This is the maximum number of high powered commands (e.g. start unit)
  * that can be outstanding at a particular time.
  */
 #ifndef CAM_MAX_HIGHPOWER
 #define CAM_MAX_HIGHPOWER  4
 #endif
 
 /* Datastructures internal to the xpt layer */
 MALLOC_DEFINE(M_CAMXPT, "CAM XPT", "CAM XPT buffers");
 MALLOC_DEFINE(M_CAMDEV, "CAM DEV", "CAM devices");
 MALLOC_DEFINE(M_CAMCCB, "CAM CCB", "CAM CCBs");
 MALLOC_DEFINE(M_CAMPATH, "CAM path", "CAM paths");
 
 /*
  * Object for deferring XPT actions to a taskqueue: a task plus two
  * opaque arguments for its handler.
  */
 struct xpt_task {
 	struct task	task;
 	void		*data1;		/* handler-defined pointer argument */
 	uintptr_t	data2;		/* handler-defined integer argument */
 };
 
 /*
  * State of the XPT layer.  The file declares a single static instance
  * of this structure, xsoftc.
  */
 struct xpt_softc {
 	/* Exported read-only as sysctl kern.cam.xpt_generation. */
 	uint32_t		xpt_generation;
 
 	/* number of high powered commands that can go through right now */
 	struct mtx		xpt_highpower_lock;
 	STAILQ_HEAD(highpowerlist, cam_ed)	highpowerq;
 	int			num_highpower;
 
 	/* queue for handling async rescan requests. */
 	TAILQ_HEAD(, ccb_hdr) ccb_scanq;
 	int buses_to_config;
 	int buses_config_done;
 
 	/* Registered busses */
 	TAILQ_HEAD(,cam_eb)	xpt_busses;
 	u_int			bus_generation;
 
 	struct intr_config_hook	*xpt_config_hook;
 
 	/* Tunable kern.cam.boot_delay: "Bus registration wait time". */
 	int			boot_delay;
 	struct callout 		boot_callout;
 
 	/* Initialized via MTX_SYSINIT as the "XPT topology lock". */
 	struct mtx		xpt_topo_lock;
 	struct mtx		xpt_lock;
 	struct taskqueue	*xpt_taskq;
 };
 
 /*
  * Return value of the device-matching helpers (xptbusmatch() and
  * friends).  The low nibble (DM_RET_FLAG_MASK) holds flag bits and the
  * high nibble (DM_RET_ACTION_MASK) holds one action code.
  */
 typedef enum {
 	DM_RET_COPY		= 0x01,	/* flag bit */
 	DM_RET_FLAG_MASK	= 0x0f,
 	DM_RET_NONE		= 0x00,	/* action codes follow */
 	DM_RET_STOP		= 0x10,
 	DM_RET_DESCEND		= 0x20,
 	DM_RET_ERROR		= 0x30,
 	DM_RET_ACTION_MASK	= 0xf0
 } dev_match_ret;
 
 /*
  * Levels of the bus / target / device / peripheral hierarchy, used as
  * the 'depth' of a struct xpt_traverse_config.
  */
 typedef enum {
 	XPT_DEPTH_BUS,
 	XPT_DEPTH_TARGET,
 	XPT_DEPTH_DEVICE,
 	XPT_DEPTH_PERIPH
 } xpt_traverse_depth;
 
 /*
  * Parameters for a traversal: a depth, a callback and its argument.
  * NOTE(review): tr_func is stored untyped; presumably it is one of the
  * xpt_*func_t callback types matching 'depth' -- confirm against the
  * traversal code.
  */
 struct xpt_traverse_config {
 	xpt_traverse_depth	depth;
 	void			*tr_func;
 	void			*tr_arg;
 };
 
 typedef	int	xpt_busfunc_t (struct cam_eb *bus, void *arg);
 typedef	int	xpt_targetfunc_t (struct cam_et *target, void *arg);
 typedef	int	xpt_devicefunc_t (struct cam_ed *device, void *arg);
 typedef	int	xpt_periphfunc_t (struct cam_periph *periph, void *arg);
 typedef int	xpt_pdrvfunc_t (struct periph_driver **pdrv, void *arg);
 
 /* Transport layer configuration information */
 static struct xpt_softc xsoftc;
 
 MTX_SYSINIT(xpt_topo_init, &xsoftc.xpt_topo_lock, "XPT topology lock", MTX_DEF);
 
 SYSCTL_INT(_kern_cam, OID_AUTO, boot_delay, CTLFLAG_RDTUN,
            &xsoftc.boot_delay, 0, "Bus registration wait time");
 SYSCTL_UINT(_kern_cam, OID_AUTO, xpt_generation, CTLFLAG_RD,
 	    &xsoftc.xpt_generation, 0, "CAM peripheral generation count");
 
 /*
  * One completion queue: a list of CCB headers protected by its own
  * mutex.  The file keeps an array of these, cam_doneqs[MAXCPU], and
  * exports the number in use as sysctl kern.cam.num_doneqs.
  */
 struct cam_doneq {
 	struct mtx_padalign	cam_doneq_mtx;
 	STAILQ_HEAD(, ccb_hdr)	cam_doneq;
 	int			cam_doneq_sleep;
 };
 
 static struct cam_doneq cam_doneqs[MAXCPU];
 static int cam_num_doneqs;
 static struct proc *cam_proc;
 
 SYSCTL_INT(_kern_cam, OID_AUTO, num_doneqs, CTLFLAG_RDTUN,
            &cam_num_doneqs, 0, "Number of completion queues/threads");
 
 struct cam_periph *xpt_periph;
 
 static periph_init_t xpt_periph_init;
 
 static struct periph_driver xpt_driver =
 {
 	xpt_periph_init, "xpt",
 	TAILQ_HEAD_INITIALIZER(xpt_driver.units), /* generation */ 0,
 	CAM_PERIPH_DRV_EARLY
 };
 
 PERIPHDRIVER_DECLARE(xpt, xpt_driver);
 
 static d_open_t xptopen;
 static d_close_t xptclose;
 static d_ioctl_t xptioctl;
 static d_ioctl_t xptdoioctl;
 
 static struct cdevsw xpt_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_open =	xptopen,
 	.d_close =	xptclose,
 	.d_ioctl =	xptioctl,
 	.d_name =	"xpt",
 };
 
 /* Storage for debugging datastructures */
 struct cam_path *cam_dpath;
 u_int32_t cam_dflags = CAM_DEBUG_FLAGS;
 SYSCTL_UINT(_kern_cam, OID_AUTO, dflags, CTLFLAG_RWTUN,
 	&cam_dflags, 0, "Enabled debug flags");
 u_int32_t cam_debug_delay = CAM_DEBUG_DELAY;
 SYSCTL_UINT(_kern_cam, OID_AUTO, debug_delay, CTLFLAG_RWTUN,
 	&cam_debug_delay, 0, "Delay in us after each debug message");
 
 /* Our boot-time initialization hook */
 static int cam_module_event_handler(module_t, int /*modeventtype_t*/, void *);
 
 static moduledata_t cam_moduledata = {
 	"cam",
 	cam_module_event_handler,
 	NULL
 };
 
 static int	xpt_init(void *);
 
 DECLARE_MODULE(cam, cam_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
 MODULE_VERSION(cam, 1);
 
 
 static void		xpt_async_bcast(struct async_list *async_head,
 					u_int32_t async_code,
 					struct cam_path *path,
 					void *async_arg);
 static path_id_t xptnextfreepathid(void);
 static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus);
 static union ccb *xpt_get_ccb(struct cam_periph *periph);
 static union ccb *xpt_get_ccb_nowait(struct cam_periph *periph);
 static void	 xpt_run_allocq(struct cam_periph *periph, int sleep);
 static void	 xpt_run_allocq_task(void *context, int pending);
 static void	 xpt_run_devq(struct cam_devq *devq);
 /*
  * Prototypes for file-local (static) helpers.  Several of them, for
  * example xpt_schedule_devq()'s helper device_is_queued(), xptregister(),
  * xptbusmatch(), xptdevicematch() and xptperiphmatch(), are defined
  * further down in this file.
  */
 static timeout_t xpt_release_devq_timeout;
 static void	 xpt_release_simq_timeout(void *arg) __unused;
 static void	 xpt_acquire_bus(struct cam_eb *bus);
 static void	 xpt_release_bus(struct cam_eb *bus);
 static uint32_t	 xpt_freeze_devq_device(struct cam_ed *dev, u_int count);
 static int	 xpt_release_devq_device(struct cam_ed *dev, u_int count,
 		    int run_queue);
 static struct cam_et*
 		 xpt_alloc_target(struct cam_eb *bus, target_id_t target_id);
 static void	 xpt_acquire_target(struct cam_et *target);
 static void	 xpt_release_target(struct cam_et *target);
 static struct cam_eb*
 		 xpt_find_bus(path_id_t path_id);
 static struct cam_et*
 		 xpt_find_target(struct cam_eb *bus, target_id_t target_id);
 static struct cam_ed*
 		 xpt_find_device(struct cam_et *target, lun_id_t lun_id);
 static void	 xpt_config(void *arg);
 static int	 xpt_schedule_dev(struct camq *queue, cam_pinfo *dev_pinfo,
 				 u_int32_t new_priority);
 static xpt_devicefunc_t xptpassannouncefunc;
 static void	 xptaction(struct cam_sim *sim, union ccb *work_ccb);
 static void	 xptpoll(struct cam_sim *sim);
 static void	 camisr_runqueue(void);
 static void	 xpt_done_process(struct ccb_hdr *ccb_h);
 static void	 xpt_done_td(void *);
 /* Pattern matchers used when walking the bus/device/peripheral lists. */
 static dev_match_ret	xptbusmatch(struct dev_match_pattern *patterns,
 				    u_int num_patterns, struct cam_eb *bus);
 static dev_match_ret	xptdevicematch(struct dev_match_pattern *patterns,
 				       u_int num_patterns,
 				       struct cam_ed *device);
 static dev_match_ret	xptperiphmatch(struct dev_match_pattern *patterns,
 				       u_int num_patterns,
 				       struct cam_periph *periph);
 static xpt_busfunc_t	xptedtbusfunc;
 static xpt_targetfunc_t	xptedttargetfunc;
 static xpt_devicefunc_t	xptedtdevicefunc;
 static xpt_periphfunc_t	xptedtperiphfunc;
 static xpt_pdrvfunc_t	xptplistpdrvfunc;
 static xpt_periphfunc_t	xptplistperiphfunc;
 static int		xptedtmatch(struct ccb_dev_match *cdm);
 static int		xptperiphlistmatch(struct ccb_dev_match *cdm);
 /* Traversal helpers; each takes a callback (tr_func) and an opaque arg. */
 static int		xptbustraverse(struct cam_eb *start_bus,
 				       xpt_busfunc_t *tr_func, void *arg);
 static int		xpttargettraverse(struct cam_eb *bus,
 					  struct cam_et *start_target,
 					  xpt_targetfunc_t *tr_func, void *arg);
 static int		xptdevicetraverse(struct cam_et *target,
 					  struct cam_ed *start_device,
 					  xpt_devicefunc_t *tr_func, void *arg);
 static int		xptperiphtraverse(struct cam_ed *device,
 					  struct cam_periph *start_periph,
 					  xpt_periphfunc_t *tr_func, void *arg);
 static int		xptpdrvtraverse(struct periph_driver **start_pdrv,
 					xpt_pdrvfunc_t *tr_func, void *arg);
 static int		xptpdperiphtraverse(struct periph_driver **pdrv,
 					    struct cam_periph *start_periph,
 					    xpt_periphfunc_t *tr_func,
 					    void *arg);
 static xpt_busfunc_t	xptdefbusfunc;
 static xpt_targetfunc_t	xptdeftargetfunc;
 static xpt_devicefunc_t	xptdefdevicefunc;
 static xpt_periphfunc_t	xptdefperiphfunc;
 static void		xpt_finishconfig_task(void *context, int pending);
 static void		xpt_dev_async_default(u_int32_t async_code,
 					      struct cam_eb *bus,
 					      struct cam_et *target,
 					      struct cam_ed *device,
 					      void *async_arg);
 static struct cam_ed *	xpt_alloc_device_default(struct cam_eb *bus,
 						 struct cam_et *target,
 						 lun_id_t lun_id);
 static xpt_devicefunc_t	xptsetasyncfunc;
 static xpt_busfunc_t	xptsetasyncbusfunc;
 static cam_status	xptregister(struct cam_periph *periph,
 				    void *arg);
 static const char *	xpt_action_name(uint32_t action);
 static __inline int device_is_queued(struct cam_ed *device);
 
 /*
  * Schedule a device on the devq send queue when it can actually issue
  * work: it must have CCBs queued, have device openings available and
  * must not be frozen.  The caller must hold devq->send_mtx.
  *
  * Returns whatever xpt_schedule_dev() returns when the device is
  * eligible, and 0 otherwise.
  */
 static __inline int
 xpt_schedule_devq(struct cam_devq *devq, struct cam_ed *dev)
 {
 
 	mtx_assert(&devq->send_mtx, MA_OWNED);
 
 	/* Not eligible: nothing queued, no openings, or queue frozen. */
 	if (dev->ccbq.queue.entries <= 0 ||
 	    dev->ccbq.dev_openings <= 0 ||
 	    dev->ccbq.queue.qfrozen_cnt != 0)
 		return (0);
 
 	/*
 	 * A device waiting for controller resources is prioritized by
 	 * the highest priority CCB it has enqueued.
 	 */
 	return (xpt_schedule_dev(&devq->send_queue, &dev->devq_entry,
 	    CAMQ_GET_PRIO(&dev->ccbq.queue)));
 }
 
 /*
  * Report whether the device's devq entry currently holds a queue slot,
  * i.e. its index differs from CAM_UNQUEUED_INDEX.  Returns non-zero if
  * so, 0 otherwise.
  */
 static __inline int
 device_is_queued(struct cam_ed *device)
 {
 	int queued;
 
 	queued = (device->devq_entry.index != CAM_UNQUEUED_INDEX);
 	return (queued);
 }
 
 /*
  * Peripheral-driver init hook for the xpt pseudo device: create the
  * "xpt0" character device node (root:operator, mode 0600) backed by
  * xpt_cdevsw.
  *
  * The parameter list is spelled (void) so that the definition is a real
  * prototype rather than an old-style unspecified-argument declarator.
  */
 static void
 xpt_periph_init(void)
 {
 	make_dev(&xpt_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "xpt0");
 }
 
 /*
  * Open entry point for the xpt device node.
  *
  * The node may only be opened for both reading and writing (EPERM
  * otherwise) and never in non-blocking mode (ENODEV, with a console
  * message).  Returns 0 on success.
  */
 static int
 xptopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 
 	/* Both FREAD and FWRITE have to be requested. */
 	if (!(flags & FWRITE) || !(flags & FREAD))
 		return (EPERM);
 
 	/* Everything else is fine unless non-blocking access was asked for. */
 	if (!(flags & O_NONBLOCK))
 		return (0);
 
 	printf("%s: can't do nonblocking access\n", devtoname(dev));
 	return (ENODEV);
 }
 
 /*
  * Close entry point for the xpt device node.  There is nothing to tear
  * down, so this always succeeds.
  */
 static int
 xptclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
 	const int ok = 0;
 
 	return (ok);
 }
 
 /*
  * ioctl entry point for the xpt device node.
  *
  * The xpt softc lock is deliberately not taken here: the xpt device is
  * only a back door to other devices and SIMs, so the appropriate SIM
  * lock is grabbed once the bus/SIM has been located.
  *
  * The request is first given to xptdoioctl(); only if that reports
  * ENOTTY is it retried through cam_compat_ioctl(), which is handed
  * xptdoioctl as its callback.
  */
 static int
 xptioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
 	int rv;
 
 	rv = xptdoioctl(dev, cmd, addr, flag, td);
 	if (rv != ENOTTY)
 		return (rv);
 
 	return (cam_compat_ioctl(dev, cmd, addr, flag, td, xptdoioctl));
 }
 	
 /*
  * Worker for the xpt device ioctls.
  *
  * Handles two commands:
  *   CAMIOCOMMAND   - run a restricted set of CCB function codes that the
  *                    caller passes in through addr (a union ccb).
  *   CAMGETPASSTHRU - look up the "pass" peripheral attached to the same
  *                    device as the peripheral named in the CCB.
  *
  * Returns 0 on success or an errno value; ENOTTY is returned for any
  * command not handled here (xptioctl() uses that to try the
  * compatibility path).
  */
 static int
 xptdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
 	int error;
 
 	error = 0;
 
 	switch(cmd) {
 	/*
 	 * For the transport layer CAMIOCOMMAND ioctl, we really only want
 	 * to accept CCB types that don't quite make sense to send through a
 	 * passthrough driver. XPT_PATH_INQ is an exception to this, as stated
 	 * in the CAM spec.
 	 */
 	case CAMIOCOMMAND: {
 		union ccb *ccb;
 		union ccb *inccb;
 		struct cam_eb *bus;
 
 		inccb = (union ccb *)addr;
 
 		/*
 		 * Look the bus up first; every exit path below that returns
 		 * or breaks after this point pairs it with xpt_release_bus().
 		 */
 		bus = xpt_find_bus(inccb->ccb_h.path_id);
 		if (bus == NULL)
 			return (EINVAL);
 
 		/*
 		 * Validate the target/lun wildcards for the scan and reset
 		 * requests before doing any work.
 		 */
 		switch (inccb->ccb_h.func_code) {
 		case XPT_SCAN_BUS:
 		case XPT_RESET_BUS:
 			/* Bus-wide operations need both wildcards. */
 			if (inccb->ccb_h.target_id != CAM_TARGET_WILDCARD ||
 			    inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
 				xpt_release_bus(bus);
 				return (EINVAL);
 			}
 			break;
 		case XPT_SCAN_TGT:
 			/* A target scan needs a real target and a wildcard lun. */
 			if (inccb->ccb_h.target_id == CAM_TARGET_WILDCARD ||
 			    inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
 				xpt_release_bus(bus);
 				return (EINVAL);
 			}
 			break;
 		default:
 			break;
 		}
 
 		switch(inccb->ccb_h.func_code) {
 		case XPT_SCAN_BUS:
 		case XPT_RESET_BUS:
 		case XPT_PATH_INQ:
 		case XPT_ENG_INQ:
 		case XPT_SCAN_LUN:
 		case XPT_SCAN_TGT:
 
 			/* These are run on a freshly allocated kernel CCB. */
 			ccb = xpt_alloc_ccb();
 
 			/*
 			 * Create a path using the bus, target, and lun the
 			 * user passed in.
 			 */
 			if (xpt_create_path(&ccb->ccb_h.path, NULL,
 					    inccb->ccb_h.path_id,
 					    inccb->ccb_h.target_id,
 					    inccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP){
 				error = EINVAL;
 				xpt_free_ccb(ccb);
 				break;
 			}
 			/* Ensure all of our fields are correct */
 			xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path,
 				      inccb->ccb_h.pinfo.priority);
 			xpt_merge_ccb(ccb, inccb);
 			xpt_path_lock(ccb->ccb_h.path);
 			cam_periph_runccb(ccb, NULL, 0, 0, NULL);
 			xpt_path_unlock(ccb->ccb_h.path);
 			/* Hand the completed CCB back to the caller's buffer. */
 			bcopy(ccb, inccb, sizeof(union ccb));
 			xpt_free_path(ccb->ccb_h.path);
 			xpt_free_ccb(ccb);
 			break;
 
 		case XPT_DEBUG: {
 			union ccb ccb;
 
 			/*
 			 * This is an immediate CCB, so it's okay to
 			 * allocate it on the stack.
 			 */
 
 			/*
 			 * Create a path using the bus, target, and lun the
 			 * user passed in.
 			 */
 			if (xpt_create_path(&ccb.ccb_h.path, NULL,
 					    inccb->ccb_h.path_id,
 					    inccb->ccb_h.target_id,
 					    inccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP){
 				error = EINVAL;
 				break;
 			}
 			/* Ensure all of our fields are correct */
 			xpt_setup_ccb(&ccb.ccb_h, ccb.ccb_h.path,
 				      inccb->ccb_h.pinfo.priority);
 			xpt_merge_ccb(&ccb, inccb);
 			xpt_action(&ccb);
 			bcopy(&ccb, inccb, sizeof(union ccb));
 			xpt_free_path(ccb.ccb_h.path);
 			break;
 
 		}
 		case XPT_DEV_MATCH: {
 			struct cam_periph_map_info mapinfo;
 			struct cam_path *old_path;
 
 			/*
 			 * We can't deal with physical addresses for this
 			 * type of transaction.
 			 */
 			if ((inccb->ccb_h.flags & CAM_DATA_MASK) !=
 			    CAM_DATA_VADDR) {
 				error = EINVAL;
 				break;
 			}
 
 			/*
 			 * Save this in case the caller had it set to
 			 * something in particular.
 			 */
 			old_path = inccb->ccb_h.path;
 
 			/*
 			 * We really don't need a path for the matching
 			 * code.  The path is needed because of the
 			 * debugging statements in xpt_action().  They
 			 * assume that the CCB has a valid path.
 			 */
 			inccb->ccb_h.path = xpt_periph->path;
 
 			bzero(&mapinfo, sizeof(mapinfo));
 
 			/*
 			 * Map the pattern and match buffers into kernel
 			 * virtual address space.
 			 */
 			error = cam_periph_mapmem(inccb, &mapinfo, MAXPHYS);
 
 			if (error) {
 				/* Restore the caller's path before bailing out. */
 				inccb->ccb_h.path = old_path;
 				break;
 			}
 
 			/*
 			 * This is an immediate CCB, we can send it on directly.
 			 */
 			xpt_action(inccb);
 
 			/*
 			 * Map the buffers back into user space.
 			 */
 			cam_periph_unmapmem(inccb, &mapinfo);
 
 			inccb->ccb_h.path = old_path;
 
 			error = 0;
 			break;
 		}
 		default:
 			/* Any other function code is refused on this node. */
 			error = ENOTSUP;
 			break;
 		}
 		/* Drop the bus obtained from xpt_find_bus() above. */
 		xpt_release_bus(bus);
 		break;
 	}
 	/*
 	 * This is the getpassthru ioctl. It takes a XPT_GDEVLIST ccb as input,
 	 * with the peripheral driver name and unit number filled in.  The other
 	 * fields don't really matter as input.  The passthrough driver name
 	 * ("pass"), and unit number are passed back in the ccb.  The current
 	 * device generation number, and the index into the device peripheral
 	 * driver list, and the status are also passed back.  Note that
 	 * since we do everything in one pass, unlike the XPT_GDEVLIST ccb,
 	 * we never return a status of CAM_GDEVLIST_LIST_CHANGED.  It is
 	 * (or rather should be) impossible for the device peripheral driver
 	 * list to change since we look at the whole thing in one pass, and
 	 * we do it with lock protection.
 	 *
 	 */
 	case CAMGETPASSTHRU: {
 		union ccb *ccb;
 		struct cam_periph *periph;
 		struct periph_driver **p_drv;
 		char   *name;
 		u_int unit;
 		int base_periph_found;
 
 		ccb = (union ccb *)addr;
 		unit = ccb->cgdl.unit_number;
 		name = ccb->cgdl.periph_name;
 		base_periph_found = 0;
 
 		/*
 		 * Sanity check -- make sure we don't get a null peripheral
 		 * driver name.
 		 */
 		if (*ccb->cgdl.periph_name == '\0') {
 			error = EINVAL;
 			break;
 		}
 
 		/* Keep the list from changing while we traverse it */
 		xpt_lock_buses();
 
 		/*
 		 * first find our driver in the list of drivers
 		 * NOTE(review): name comes straight from the caller's CCB and
 		 * is compared with strcmp(); presumably it is NUL-terminated
 		 * within periph_name -- confirm against the callers.
 		 */
 		for (p_drv = periph_drivers; *p_drv != NULL; p_drv++)
 			if (strcmp((*p_drv)->driver_name, name) == 0)
 				break;
 
 		if (*p_drv == NULL) {
 			/* No driver by that name: report the error in the CCB. */
 			xpt_unlock_buses();
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			ccb->cgdl.status = CAM_GDEVLIST_ERROR;
 			*ccb->cgdl.periph_name = '\0';
 			ccb->cgdl.unit_number = 0;
 			error = ENOENT;
 			break;
 		}
 
 		/*
 		 * Run through every peripheral instance of this driver
 		 * and check to see whether it matches the unit passed
 		 * in by the user.  If it does, get out of the loops and
 		 * find the passthrough driver associated with that
 		 * peripheral driver.
 		 */
 		for (periph = TAILQ_FIRST(&(*p_drv)->units); periph != NULL;
 		     periph = TAILQ_NEXT(periph, unit_links)) {
 
 			if (periph->unit_number == unit)
 				break;
 		}
 		/*
 		 * If we found the peripheral driver that the user passed
 		 * in, go through all of the peripheral drivers for that
 		 * particular device and look for a passthrough driver.
 		 */
 		if (periph != NULL) {
 			struct cam_ed *device;
 			int i;
 
 			base_periph_found = 1;
 			device = periph->path->device;
 			/*
 			 * periph is reused as the cursor here, so it ends up
 			 * NULL when no "pass" instance is attached.
 			 */
 			for (i = 0, periph = SLIST_FIRST(&device->periphs);
 			     periph != NULL;
 			     periph = SLIST_NEXT(periph, periph_links), i++) {
 				/*
 				 * Check to see whether we have a
 				 * passthrough device or not.
 				 */
 				if (strcmp(periph->periph_name, "pass") == 0) {
 					/*
 					 * Fill in the getdevlist fields.
 					 */
 					strcpy(ccb->cgdl.periph_name,
 					       periph->periph_name);
 					ccb->cgdl.unit_number =
 						periph->unit_number;
 					if (SLIST_NEXT(periph, periph_links))
 						ccb->cgdl.status =
 							CAM_GDEVLIST_MORE_DEVS;
 					else
 						ccb->cgdl.status =
 						       CAM_GDEVLIST_LAST_DEVICE;
 					ccb->cgdl.generation =
 						device->generation;
 					ccb->cgdl.index = i;
 					/*
 					 * Fill in some CCB header fields
 					 * that the user may want.
 					 */
 					ccb->ccb_h.path_id =
 						periph->path->bus->path_id;
 					ccb->ccb_h.target_id =
 						periph->path->target->target_id;
 					ccb->ccb_h.target_lun =
 						periph->path->device->lun_id;
 					ccb->ccb_h.status = CAM_REQ_CMP;
 					break;
 				}
 			}
 		}
 
 		/*
 		 * If the periph is null here, one of two things has
 		 * happened.  The first possibility is that we couldn't
 		 * find the unit number of the particular peripheral driver
 		 * that the user is asking about.  e.g. the user asks for
 		 * the passthrough driver for "da11".  We find the list of
 		 * "da" peripherals all right, but there is no unit 11.
 		 * The other possibility is that we went through the list
 		 * of peripheral drivers attached to the device structure,
 		 * but didn't find one with the name "pass".  Either way,
 		 * we return ENOENT, since we couldn't find something.
 		 */
 		if (periph == NULL) {
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			ccb->cgdl.status = CAM_GDEVLIST_ERROR;
 			*ccb->cgdl.periph_name = '\0';
 			ccb->cgdl.unit_number = 0;
 			error = ENOENT;
 			/*
 			 * If the base peripheral was found but no "pass"
 			 * instance is attached to its device, the pass
 			 * driver is most likely missing from the kernel
 			 * configuration, so print a hint about how to add it.
 			 */
 			if (base_periph_found == 1) {
 				printf("xptioctl: pass driver is not in the "
 				       "kernel\n");
 				printf("xptioctl: put \"device pass\" in "
 				       "your kernel config file\n");
 			}
 		}
 		xpt_unlock_buses();
 		break;
 		}
 	default:
 		/* Unknown command; xptioctl() may retry via the compat path. */
 		error = ENOTTY;
 		break;
 	}
 
 	return(error);
 }
 
 /*
  * Module event handler for the CAM transport layer.
  *
  * MOD_LOAD runs xpt_init() and returns its result; MOD_UNLOAD is always
  * refused with EBUSY; every other event yields EOPNOTSUPP.
  */
 static int
 cam_module_event_handler(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (xpt_init(NULL));
 	if (what == MOD_UNLOAD)
 		return (EBUSY);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Search the cam_xpt_proto_set linker set for the entry whose proto
  * field equals the requested protocol.  Returns the first match, or
  * NULL when no entry matches.
  */
 static struct xpt_proto *
 xpt_proto_find(cam_proto proto)
 {
 	struct xpt_proto **iter;
 	struct xpt_proto *found;
 
 	found = NULL;
 	SET_FOREACH(iter, cam_xpt_proto_set) {
 		if ((*iter)->proto != proto)
 			continue;
 		found = *iter;
 		break;
 	}
 
 	return (found);
 }
 
 static void
 xpt_rescan_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	if (done_ccb->ccb_h.ppriv_ptr1 == NULL) {
 		xpt_free_path(done_ccb->ccb_h.path);
 		xpt_free_ccb(done_ccb);
 	} else {
 		done_ccb->ccb_h.cbfcnp = done_ccb->ccb_h.ppriv_ptr1;
 		(*done_ccb->ccb_h.cbfcnp)(periph, done_ccb);
 	}
 	xpt_release_boot();
 }
 
 /* thread to handle bus rescans */
 static void
 xpt_scanner_thread(void *dummy)
 {
 	union ccb	*ccb;
 	struct cam_path	 path;
 
 	xpt_lock_buses();
 	for (;;) {
 		if (TAILQ_EMPTY(&xsoftc.ccb_scanq))
 			msleep(&xsoftc.ccb_scanq, &xsoftc.xpt_topo_lock, PRIBIO,
 			       "-", 0);
 		if ((ccb = (union ccb *)TAILQ_FIRST(&xsoftc.ccb_scanq)) != NULL) {
 			TAILQ_REMOVE(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
 			xpt_unlock_buses();
 
 			/*
 			 * Since lock can be dropped inside and path freed
 			 * by completion callback even before return here,
 			 * take our own path copy for reference.
 			 */
 			xpt_copy_path(&path, ccb->ccb_h.path);
 			xpt_path_lock(&path);
 			xpt_action(ccb);
 			xpt_path_unlock(&path);
 			xpt_release_path(&path);
 
 			xpt_lock_buses();
 		}
 	}
 }
 
 void
 xpt_rescan(union ccb *ccb)
 {
 	struct ccb_hdr *hdr;
 
 	/* Prepare request */
 	if (ccb->ccb_h.path->target->target_id == CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_BUS;
 	else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_TGT;
 	else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
 	    ccb->ccb_h.path->device->lun_id != CAM_LUN_WILDCARD)
 		ccb->ccb_h.func_code = XPT_SCAN_LUN;
 	else {
 		xpt_print(ccb->ccb_h.path, "illegal scan path\n");
 		xpt_free_path(ccb->ccb_h.path);
 		xpt_free_ccb(ccb);
 		return;
 	}
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_rescan: func %#x %s\n", ccb->ccb_h.func_code,
  		xpt_action_name(ccb->ccb_h.func_code)));
 
 	ccb->ccb_h.ppriv_ptr1 = ccb->ccb_h.cbfcnp;
 	ccb->ccb_h.cbfcnp = xpt_rescan_done;
 	xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_XPT);
 	/* Don't make duplicate entries for the same paths. */
 	xpt_lock_buses();
 	if (ccb->ccb_h.ppriv_ptr1 == NULL) {
 		TAILQ_FOREACH(hdr, &xsoftc.ccb_scanq, sim_links.tqe) {
 			if (xpt_path_comp(hdr->path, ccb->ccb_h.path) == 0) {
 				wakeup(&xsoftc.ccb_scanq);
 				xpt_unlock_buses();
 				xpt_print(ccb->ccb_h.path, "rescan already queued\n");
 				xpt_free_path(ccb->ccb_h.path);
 				xpt_free_ccb(ccb);
 				return;
 			}
 		}
 	}
 	TAILQ_INSERT_TAIL(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
 	xsoftc.buses_to_config++;
 	wakeup(&xsoftc.ccb_scanq);
 	xpt_unlock_buses();
 }
 
 /* Functions accessed by the peripheral drivers */
 /*
  * One-time initialization of the CAM transport layer.
  *
  * Sets up the xsoftc queues, locks and task queue, registers the XPT
  * itself as a SIM/bus, allocates the "xpt" peripheral, starts the
  * completion-queue threads and finally registers xpt_config() as an
  * interrupt config hook.
  *
  * Returns 0 on success, or ENOMEM/EINVAL when a required resource could
  * not be set up.
  *
  * NOTE(review): the error paths do not unwind resources acquired
  * earlier in the function (mutexes, task queue, devq, sim); confirm
  * whether that is acceptable for a load-once module.
  */
 static int
 xpt_init(void *dummy)
 {
 	struct cam_sim *xpt_sim;
 	struct cam_path *path;
 	struct cam_devq *devq;
 	cam_status status;
 	int error, i;
 
 	TAILQ_INIT(&xsoftc.xpt_busses);
 	TAILQ_INIT(&xsoftc.ccb_scanq);
 	STAILQ_INIT(&xsoftc.highpowerq);
 	xsoftc.num_highpower = CAM_MAX_HIGHPOWER;
 
 	mtx_init(&xsoftc.xpt_lock, "XPT lock", NULL, MTX_DEF);
 	mtx_init(&xsoftc.xpt_highpower_lock, "XPT highpower lock", NULL, MTX_DEF);
 	xsoftc.xpt_taskq = taskqueue_create("CAM XPT task", M_WAITOK,
 	    taskqueue_thread_enqueue, /*context*/&xsoftc.xpt_taskq);
 
 #ifdef CAM_BOOT_DELAY
 	/*
 	 * Override this value at compile time to assist our users
 	 * who don't use loader to boot a kernel.
 	 */
 	xsoftc.boot_delay = CAM_BOOT_DELAY;
 #endif
 	/*
 	 * The xpt layer is, itself, the equivalent of a SIM.
 	 * Allow 16 ccbs in the ccb pool for it.  This should
 	 * give decent parallelism when we probe busses and
 	 * perform other XPT functions.
 	 */
 	devq = cam_simq_alloc(16);
 	/* Do not hand a failed queue allocation on to cam_sim_alloc(). */
 	if (devq == NULL)
 		return (ENOMEM);
 	xpt_sim = cam_sim_alloc(xptaction,
 				xptpoll,
 				"xpt",
 				/*softc*/NULL,
 				/*unit*/0,
 				/*mtx*/&xsoftc.xpt_lock,
 				/*max_dev_transactions*/0,
 				/*max_tagged_dev_transactions*/0,
 				devq);
 	if (xpt_sim == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&xsoftc.xpt_lock);
 	if ((status = xpt_bus_register(xpt_sim, NULL, 0)) != CAM_SUCCESS) {
 		mtx_unlock(&xsoftc.xpt_lock);
 		printf("xpt_init: xpt_bus_register failed with status %#x,"
 		       " failing attach\n", status);
 		return (EINVAL);
 	}
 	mtx_unlock(&xsoftc.xpt_lock);
 
 	/*
 	 * Looking at the XPT from the SIM layer, the XPT is
 	 * the equivalent of a peripheral driver.  Allocate
 	 * a peripheral driver entry for us.
 	 */
 	if ((status = xpt_create_path(&path, NULL, CAM_XPT_PATH_ID,
 				      CAM_TARGET_WILDCARD,
 				      CAM_LUN_WILDCARD)) != CAM_REQ_CMP) {
 		printf("xpt_init: xpt_create_path failed with status %#x,"
 		       " failing attach\n", status);
 		return (EINVAL);
 	}
 	xpt_path_lock(path);
 	cam_periph_alloc(xptregister, NULL, NULL, NULL, "xpt", CAM_PERIPH_BIO,
 			 path, NULL, 0, xpt_sim);
 	xpt_path_unlock(path);
 	xpt_free_path(path);
 
 	/* Clamp the number of completion queues to [1 + ncpus/6, MAXCPU]. */
 	if (cam_num_doneqs < 1)
 		cam_num_doneqs = 1 + mp_ncpus / 6;
 	else if (cam_num_doneqs > MAXCPU)
 		cam_num_doneqs = MAXCPU;
 	for (i = 0; i < cam_num_doneqs; i++) {
 		mtx_init(&cam_doneqs[i].cam_doneq_mtx, "CAM doneq", NULL,
 		    MTX_DEF);
 		STAILQ_INIT(&cam_doneqs[i].cam_doneq);
 		error = kproc_kthread_add(xpt_done_td, &cam_doneqs[i],
 		    &cam_proc, NULL, 0, 0, "cam", "doneq%d", i);
 		if (error != 0) {
 			/* Keep only the queues whose thread did start. */
 			cam_num_doneqs = i;
 			break;
 		}
 	}
 	if (cam_num_doneqs < 1) {
 		printf("xpt_init: Cannot init completion queues "
 		       "- failing attach\n");
 		return (ENOMEM);
 	}
 	/*
 	 * Register a callback for when interrupts are enabled.
 	 */
 	xsoftc.xpt_config_hook = malloc(sizeof(*xsoftc.xpt_config_hook),
 	    M_CAMXPT, M_NOWAIT | M_ZERO);
 	if (xsoftc.xpt_config_hook == NULL) {
 		printf("xpt_init: Cannot malloc config hook "
 		       "- failing attach\n");
 		return (ENOMEM);
 	}
 	xsoftc.xpt_config_hook->ich_func = xpt_config;
 	if (config_intrhook_establish(xsoftc.xpt_config_hook) != 0) {
 		free (xsoftc.xpt_config_hook, M_CAMXPT);
 		/* Do not leave a dangling pointer to the freed hook. */
 		xsoftc.xpt_config_hook = NULL;
 		printf("xpt_init: config_intrhook_establish failed "
 		       "- failing attach\n");
 		/*
 		 * NOTE(review): despite the message, 0 is still returned
 		 * here as before; confirm whether an error should be
 		 * propagated instead.
 		 */
 	}
 
 	return (0);
 }
 
 /*
  * Registration callback for the "xpt" peripheral.
  *
  * arg is the struct cam_sim passed by xpt_init().  The new peripheral is
  * recorded both as the SIM's softc and in the global xpt_periph; the
  * peripheral itself gets no softc.  Returns CAM_REQ_CMP, or
  * CAM_REQ_CMP_ERR when periph is NULL.
  */
 static cam_status
 xptregister(struct cam_periph *periph, void *arg)
 {
 	struct cam_sim *sim = (struct cam_sim *)arg;
 
 	if (periph != NULL) {
 		sim->softc = periph;
 		xpt_periph = periph;
 		periph->softc = NULL;
 		return (CAM_REQ_CMP);
 	}
 
 	printf("xptregister: periph was NULL!!\n");
 	return (CAM_REQ_CMP_ERR);
 }
 
 int32_t
 xpt_add_periph(struct cam_periph *periph)
 {
 	struct cam_ed *device;
 	int32_t	 status;
 
 	TASK_INIT(&periph->periph_run_task, 0, xpt_run_allocq_task, periph);
 	device = periph->path->device;
 	status = CAM_REQ_CMP;
 	if (device != NULL) {
 		mtx_lock(&device->target->bus->eb_mtx);
 		device->generation++;
 		SLIST_INSERT_HEAD(&device->periphs, periph, periph_links);
 		mtx_unlock(&device->target->bus->eb_mtx);
 		atomic_add_32(&xsoftc.xpt_generation, 1);
 	}
 
 	return (status);
 }
 
 void
 xpt_remove_periph(struct cam_periph *periph)
 {
 	struct cam_ed *device;
 
 	device = periph->path->device;
 	if (device != NULL) {
 		mtx_lock(&device->target->bus->eb_mtx);
 		device->generation++;
 		SLIST_REMOVE(&device->periphs, periph, cam_periph, periph_links);
 		mtx_unlock(&device->target->bus->eb_mtx);
 		atomic_add_32(&xsoftc.xpt_generation, 1);
 	}
 }
 
 
 void
 xpt_announce_periph(struct cam_periph *periph, char *announce_string)
 {
 	struct	cam_path *path = periph->path;
 	struct  xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 	periph->flags |= CAM_PERIPH_ANNOUNCED;
 
 	printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	       periph->periph_name, periph->unit_number,
 	       path->bus->sim->sim_name,
 	       path->bus->sim->unit_number,
 	       path->bus->sim->bus_id,
 	       path->bus->path_id,
 	       path->target->target_id,
 	       (uintmax_t)path->device->lun_id);
 	printf("%s%d: ", periph->periph_name, periph->unit_number);
 	proto = xpt_proto_find(path->device->protocol);
 	if (proto)
 		proto->ops->announce(path->device);
 	else
 		printf("%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0) {
 		/* Don't wrap the screen  - print only the first 60 chars */
 		printf("%s%d: Serial Number %.60s\n", periph->periph_name,
 		       periph->unit_number, path->device->serial_num);
 	}
 	/* Announce transport details. */
 	path->bus->xport->ops->announce(periph);
 	/* Announce command queueing. */
 	if (path->device->inq_flags & SID_CmdQue
 	 || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) {
 		printf("%s%d: Command Queueing enabled\n",
 		       periph->periph_name, periph->unit_number);
 	}
 	/* Announce caller's details if they've passed in. */
 	if (announce_string != NULL)
 		printf("%s%d: %s\n", periph->periph_name,
 		       periph->unit_number, announce_string);
 }
 
 /*
  * Print the quirk bits of a peripheral using the kernel's %b format
  * with the supplied bit description string.  Nothing is printed when no
  * quirk bit is set.
  */
 void
 xpt_announce_quirks(struct cam_periph *periph, int quirks, char *bit_string)
 {
 
 	if (quirks == 0)
 		return;
 
 	printf("%s%d: quirks=0x%b\n", periph->periph_name,
 	    periph->unit_number, quirks, bit_string);
 }
 
 void
 xpt_denounce_periph(struct cam_periph *periph)
 {
 	struct	cam_path *path = periph->path;
 	struct  xpt_proto *proto;
 
 	cam_periph_assert(periph, MA_OWNED);
 	printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
 	       periph->periph_name, periph->unit_number,
 	       path->bus->sim->sim_name,
 	       path->bus->sim->unit_number,
 	       path->bus->sim->bus_id,
 	       path->bus->path_id,
 	       path->target->target_id,
 	       (uintmax_t)path->device->lun_id);
 	printf("%s%d: ", periph->periph_name, periph->unit_number);
 	proto = xpt_proto_find(path->device->protocol);
 	if (proto)
 		proto->ops->denounce(path->device);
 	else
 		printf("%s%d: Unknown protocol device %d\n",
 		    periph->periph_name, periph->unit_number,
 		    path->device->protocol);
 	if (path->device->serial_num_len > 0)
 		printf(" s/n %.60s", path->device->serial_num);
 	printf(" detached\n");
 }
 
 
 int
 xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path)
 {
 	int ret = -1, l;
 	struct ccb_dev_advinfo cdai;
 	struct scsi_vpd_id_descriptor *idd;
 
 	xpt_path_assert(path, MA_OWNED);
 
 	memset(&cdai, 0, sizeof(cdai));
 	xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL);
 	cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
 	cdai.bufsiz = len;
 
 	if (!strcmp(attr, "GEOM::ident"))
 		cdai.buftype = CDAI_TYPE_SERIAL_NUM;
 	else if (!strcmp(attr, "GEOM::physpath"))
 		cdai.buftype = CDAI_TYPE_PHYS_PATH;
 	else if (strcmp(attr, "GEOM::lunid") == 0 ||
 		 strcmp(attr, "GEOM::lunname") == 0) {
 		cdai.buftype = CDAI_TYPE_SCSI_DEVID;
 		cdai.bufsiz = CAM_SCSI_DEVID_MAXLEN;
 	} else
 		goto out;
 
 	cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT|M_ZERO);
 	if (cdai.buf == NULL) {
 		ret = ENOMEM;
 		goto out;
 	}
 	xpt_action((union ccb *)&cdai); /* can only be synchronous */
 	if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
 		cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
 	if (cdai.provsiz == 0)
 		goto out;
 	if (cdai.buftype == CDAI_TYPE_SCSI_DEVID) {
 		if (strcmp(attr, "GEOM::lunid") == 0) {
 			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
 			    cdai.provsiz, scsi_devid_is_lun_naa);
 			if (idd == NULL)
 				idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
 				    cdai.provsiz, scsi_devid_is_lun_eui64);
 		} else
 			idd = NULL;
 		if (idd == NULL)
 			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
 			    cdai.provsiz, scsi_devid_is_lun_t10);
 		if (idd == NULL)
 			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
 			    cdai.provsiz, scsi_devid_is_lun_name);
 		if (idd == NULL)
 			goto out;
 		ret = 0;
 		if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_ASCII) {
 			if (idd->length < len) {
 				for (l = 0; l < idd->length; l++)
 					buf[l] = idd->identifier[l] ?
 					    idd->identifier[l] : ' ';
 				buf[l] = 0;
 			} else
 				ret = EFAULT;
 		} else if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_UTF8) {
 			l = strnlen(idd->identifier, idd->length);
 			if (l < len) {
 				bcopy(idd->identifier, buf, l);
 				buf[l] = 0;
 			} else
 				ret = EFAULT;
 		} else {
 			if (idd->length * 2 < len) {
 				for (l = 0; l < idd->length; l++)
 					sprintf(buf + l * 2, "%02x",
 					    idd->identifier[l]);
 			} else
 				ret = EFAULT;
 		}
 	} else {
 		ret = 0;
 		if (strlcpy(buf, cdai.buf, len) >= len)
 			ret = EFAULT;
 	}
 
 out:
 	if (cdai.buf != NULL)
 		free(cdai.buf, M_CAMXPT);
 	return ret;
 }
 
 /*
  * Match a single bus against any number of match patterns.
  *
  * Returns a dev_match_ret combining an action (DM_RET_DESCEND or
  * DM_RET_STOP) with DM_RET_COPY when the bus matched at least one bus
  * pattern.  DM_RET_ERROR is returned when bus is NULL.  With no
  * patterns at all, every bus matches and the caller is told to descend.
  */
 static dev_match_ret
 xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns,
 	    struct cam_eb *bus)
 {
 	dev_match_ret retval;
 	u_int i;
 
 	retval = DM_RET_NONE;
 
 	/*
 	 * If we aren't given something to match against, that's an error.
 	 */
 	if (bus == NULL)
 		return(DM_RET_ERROR);
 
 	/*
 	 * If there are no match entries, then this bus matches no
 	 * matter what.
 	 */
 	if ((patterns == NULL) || (num_patterns == 0))
 		return(DM_RET_DESCEND | DM_RET_COPY);
 
 	for (i = 0; i < num_patterns; i++) {
 		struct bus_match_pattern *cur_pattern;
 
 		/*
 		 * If the pattern in question isn't for a bus node, we
 		 * aren't interested.  However, we do indicate to the
 		 * calling routine that we should continue descending the
 		 * tree, since the user wants to match against lower-level
 		 * EDT elements.
 		 */
 		if (patterns[i].type != DEV_MATCH_BUS) {
 			if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 				retval |= DM_RET_DESCEND;
 			continue;
 		}
 
 		cur_pattern = &patterns[i].pattern.bus_pattern;
 
 		/*
 		 * If they want to match any bus node, we give them any
 		 * bus node.
 		 */
 		if (cur_pattern->flags == BUS_MATCH_ANY) {
 			/* set the copy flag */
 			retval |= DM_RET_COPY;
 
 			/*
 			 * If we've already decided on an action, go ahead
 			 * and return.
 			 * NOTE(review): when no action has been chosen yet,
 			 * control falls through to the field comparisons
 			 * below with the copy flag already set -- confirm
 			 * that this is intended.
 			 */
 			if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE)
 				return(retval);
 		}
 
 		/*
 		 * Not sure why someone would do this...
 		 */
 		if (cur_pattern->flags == BUS_MATCH_NONE)
 			continue;
 
 		/* Each requested field must agree, or the pattern is skipped. */
 		if (((cur_pattern->flags & BUS_MATCH_PATH) != 0)
 		 && (cur_pattern->path_id != bus->path_id))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_BUS_ID) != 0)
 		 && (cur_pattern->bus_id != bus->sim->bus_id))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_UNIT) != 0)
 		 && (cur_pattern->unit_number != bus->sim->unit_number))
 			continue;
 
 		if (((cur_pattern->flags & BUS_MATCH_NAME) != 0)
 		 && (strncmp(cur_pattern->dev_name, bus->sim->sim_name,
 			     DEV_IDLEN) != 0))
 			continue;
 
 		/*
 		 * If we get to this point, the user definitely wants
 		 * information on this bus.  So tell the caller to copy the
 		 * data out.
 		 */
 		retval |= DM_RET_COPY;
 
 		/*
 		 * If the return action has been set to descend, then we
 		 * know that we've already seen a non-bus matching
 		 * expression, therefore we need to further descend the tree.
 		 * This won't change by continuing around the loop, so we
 		 * go ahead and return.  If we haven't seen a non-bus
 		 * matching expression, we keep going around the loop until
 		 * we exhaust the matching expressions.  We'll set the stop
 		 * flag once we fall out of the loop.
 		 */
 		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
 			return(retval);
 	}
 
 	/*
 	 * If the return action hasn't been set to descend yet, that means
 	 * we haven't seen anything other than bus matching patterns.  So
 	 * tell the caller to stop descending the tree -- the user doesn't
 	 * want to match against lower level tree elements.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 		retval |= DM_RET_STOP;
 
 	return(retval);
 }
 
 /*
  * Match a single device against any number of match patterns.
  *
  * Returns a dev_match_ret combining an action (DM_RET_DESCEND or
  * DM_RET_STOP) with DM_RET_COPY when the device matched at least one
  * device pattern.  DM_RET_ERROR is returned when device is NULL or when
  * a pattern asks for both inquiry and device ID matching.  With no
  * patterns at all, every device matches and the caller is told to
  * descend.
  */
 static dev_match_ret
 xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns,
 	       struct cam_ed *device)
 {
 	dev_match_ret retval;
 	u_int i;
 
 	retval = DM_RET_NONE;
 
 	/*
 	 * If we aren't given something to match against, that's an error.
 	 */
 	if (device == NULL)
 		return(DM_RET_ERROR);
 
 	/*
 	 * If there are no match entries, then this device matches no
 	 * matter what.
 	 */
 	if ((patterns == NULL) || (num_patterns == 0))
 		return(DM_RET_DESCEND | DM_RET_COPY);
 
 	for (i = 0; i < num_patterns; i++) {
 		struct device_match_pattern *cur_pattern;
 		struct scsi_vpd_device_id *device_id_page;
 
 		/*
 		 * If the pattern in question isn't for a device node, we
 		 * aren't interested.  Only a peripheral pattern makes us
 		 * ask the caller to descend further (bus patterns do not).
 		 */
 		if (patterns[i].type != DEV_MATCH_DEVICE) {
 			if ((patterns[i].type == DEV_MATCH_PERIPH)
 			 && ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE))
 				retval |= DM_RET_DESCEND;
 			continue;
 		}
 
 		cur_pattern = &patterns[i].pattern.device_pattern;
 
 		/* Error out if mutually exclusive options are specified. */
 		if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
 		 == (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
 			return(DM_RET_ERROR);
 
 		/*
 		 * If they want to match any device node, we give them any
 		 * device node.
 		 */
 		if (cur_pattern->flags == DEV_MATCH_ANY)
 			goto copy_dev_node;
 
 		/*
 		 * Not sure why someone would do this...
 		 */
 		if (cur_pattern->flags == DEV_MATCH_NONE)
 			continue;
 
 		/* Each requested field must agree, or the pattern is skipped. */
 		if (((cur_pattern->flags & DEV_MATCH_PATH) != 0)
 		 && (cur_pattern->path_id != device->target->bus->path_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_TARGET) != 0)
 		 && (cur_pattern->target_id != device->target->target_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_LUN) != 0)
 		 && (cur_pattern->target_lun != device->lun_id))
 			continue;
 
 		if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0)
 		 && (cam_quirkmatch((caddr_t)&device->inq_data,
 				    (caddr_t)&cur_pattern->data.inq_pat,
 				    1, sizeof(cur_pattern->data.inq_pat),
 				    scsi_static_inquiry_match) == NULL))
 			continue;
 
 		/*
 		 * Device ID matching: skip the pattern when the stored
 		 * device ID is shorter than the page header or when
 		 * scsi_devid_match() reports a mismatch on the descriptor
 		 * list that follows the header.
 		 */
 		device_id_page = (struct scsi_vpd_device_id *)device->device_id;
 		if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0)
 		 && (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN
 		  || scsi_devid_match((uint8_t *)device_id_page->desc_list,
 				      device->device_id_len
 				    - SVPD_DEVICE_ID_HDR_LEN,
 				      cur_pattern->data.devid_pat.id,
 				      cur_pattern->data.devid_pat.id_len) != 0))
 			continue;
 
 copy_dev_node:
 		/*
 		 * If we get to this point, the user definitely wants
 		 * information on this device.  So tell the caller to copy
 		 * the data out.
 		 */
 		retval |= DM_RET_COPY;
 
 		/*
 		 * If the return action has been set to descend, then we
 		 * know that we've already seen a peripheral matching
 		 * expression, therefore we need to further descend the tree.
 		 * This won't change by continuing around the loop, so we
 		 * go ahead and return.  If we haven't seen a peripheral
 		 * matching expression, we keep going around the loop until
 		 * we exhaust the matching expressions.  We'll set the stop
 		 * flag once we fall out of the loop.
 		 */
 		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
 			return(retval);
 	}
 
 	/*
 	 * If the return action hasn't been set to descend yet, that means
 	 * we haven't seen any peripheral matching patterns.  So tell the
 	 * caller to stop descending the tree -- the user doesn't want to
 	 * match against lower level tree elements.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
 		retval |= DM_RET_STOP;
 
 	return(retval);
 }
 
 /*
  * Decide whether a single peripheral satisfies any of the supplied match
  * patterns.
  *
  * Only patterns of type DEV_MATCH_PERIPH are examined.  Every non-error
  * result carries the DM_RET_STOP action, since nothing lives below a
  * peripheral in the tree; DM_RET_COPY is added when the peripheral is
  * selected.  A NULL peripheral yields DM_RET_ERROR, and an absent or empty
  * pattern list selects every peripheral.
  */
 static dev_match_ret
 xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns,
 	       struct cam_periph *periph)
 {
 	struct periph_match_pattern *pat;
 	u_int idx;
 
 	/* Without a peripheral there is nothing to compare. */
 	if (periph == NULL)
 		return (DM_RET_ERROR);
 
 	/* No patterns at all means "give me everything". */
 	if (patterns == NULL || num_patterns == 0)
 		return (DM_RET_STOP | DM_RET_COPY);
 
 	for (idx = 0; idx < num_patterns; idx++) {
 		/* Patterns aimed at other node types are not our business. */
 		if (patterns[idx].type != DEV_MATCH_PERIPH)
 			continue;
 
 		pat = &patterns[idx].pattern.periph_pattern;
 
 		/* A match-anything pattern selects the peripheral outright. */
 		if (pat->flags == PERIPH_MATCH_ANY)
 			return (DM_RET_STOP | DM_RET_COPY);
 
 		/* A match-nothing pattern can never select anything. */
 		if (pat->flags == PERIPH_MATCH_NONE)
 			continue;
 
 		/* Each requested criterion must agree, checked in turn. */
 		if ((pat->flags & PERIPH_MATCH_PATH) != 0 &&
 		    pat->path_id != periph->path->bus->path_id)
 			continue;
 
 		/*
 		 * The target and device pointers of a path may be NULL (the
 		 * xpt peripheral uses a wildcard target and device), in
 		 * which case a target or lun criterion cannot be met.
 		 */
 		if ((pat->flags & PERIPH_MATCH_TARGET) != 0 &&
 		    (periph->path->target == NULL ||
 		     pat->target_id != periph->path->target->target_id))
 			continue;
 
 		if ((pat->flags & PERIPH_MATCH_LUN) != 0 &&
 		    (periph->path->device == NULL ||
 		     pat->target_lun != periph->path->device->lun_id))
 			continue;
 
 		if ((pat->flags & PERIPH_MATCH_UNIT) != 0 &&
 		    pat->unit_number != periph->unit_number)
 			continue;
 
 		if ((pat->flags & PERIPH_MATCH_NAME) != 0 &&
 		    strncmp(pat->periph_name, periph->periph_name,
 			    DEV_IDLEN) != 0)
 			continue;
 
 		/*
 		 * Every criterion in this pattern agreed, so the caller
 		 * should copy this peripheral out.
 		 */
 		return (DM_RET_STOP | DM_RET_COPY);
 	}
 
 	/* None of the patterns selected this peripheral. */
 	return (DM_RET_STOP);
 }
 
 /*
  * Bus-level callback used while traversing the EDT for a device match
  * request.
  *
  * arg is the struct ccb_dev_match being serviced.  The bus is matched
  * against the request's patterns (or treated as "descend" when the saved
  * position lies below this bus), copied into the result buffer when
  * selected, and then its targets are traversed.
  *
  * Returns 0 when the traversal must stop (cdm->status is set to the
  * reason), 1 when the caller should move on to the next bus, or the value
  * returned by the target traversal below this bus.
  */
 static int
 xptedtbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	struct cam_et *target;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	/*
 	 * If our position is for something deeper in the tree, that means
 	 * that we've already seen this node.  So, we keep going down.
 	 */
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target != NULL))
 		retval = DM_RET_DESCEND;
 	else
 		retval = xptbusmatch(cdm->patterns, cdm->num_patterns, bus);
 
 	/*
 	 * If we got an error, bail out of the search.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this bus out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS;
 
 			cdm->pos.cookie.bus = bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_BUS;
 		cdm->matches[j].result.bus_result.path_id = bus->path_id;
 		cdm->matches[j].result.bus_result.bus_id = bus->sim->bus_id;
 		cdm->matches[j].result.bus_result.unit_number =
 			bus->sim->unit_number;
 		/*
 		 * strncpy() leaves the destination unterminated when the
 		 * source fills it completely, so copy at most
 		 * DEV_IDLEN - 1 bytes and terminate explicitly.  The
 		 * name handed back to the requester is then always a
 		 * valid C string.
 		 */
 		strncpy(cdm->matches[j].result.bus_result.dev_name,
 			bus->sim->sim_name, DEV_IDLEN - 1);
 		cdm->matches[j].result.bus_result.dev_name[DEV_IDLEN - 1] =
 			'\0';
 	}
 
 	/*
 	 * If the user is only interested in busses, there's no
 	 * reason to descend to the next level in the tree.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
 		return(1);
 
 	/*
 	 * If there is a target generation recorded, check it to
 	 * make sure the target list hasn't changed.
 	 */
 	mtx_lock(&bus->eb_mtx);
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target != NULL)) {
 		if ((cdm->pos.generations[CAM_TARGET_GENERATION] !=
 		    bus->generation)) {
 			mtx_unlock(&bus->eb_mtx);
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return (0);
 		}
 		/*
 		 * Resume at the saved target.  The reference taken here is
 		 * dropped by the traversal routine once it is done with
 		 * that target.
 		 */
 		target = (struct cam_et *)cdm->pos.cookie.target;
 		target->refcount++;
 	} else
 		target = NULL;
 	mtx_unlock(&bus->eb_mtx);
 
 	return (xpttargettraverse(bus, target, xptedttargetfunc, arg));
 }
 
 /*
  * Target-level callback for the EDT match traversal.
  *
  * Targets are never matched themselves; this routine only works out where
  * the device traversal under the target should begin.  When the saved
  * position names a device under this very target, the recorded device list
  * generation is verified and the traversal resumes at that device.
  * Otherwise it starts from the head of the target's device list.
  *
  * Returns 0 with cdm->status set when the device list has changed,
  * otherwise the value returned by the device traversal.
  */
 static int
 xptedttargetfunc(struct cam_et *target, void *arg)
 {
 	struct ccb_dev_match *cdm = (struct ccb_dev_match *)arg;
 	struct cam_eb *bus = target->bus;
 	struct cam_ed *resume_dev = NULL;
 	int resuming;
 
 	mtx_lock(&bus->eb_mtx);
 	resuming = (cdm->pos.position_type & CAM_DEV_POS_BUS) &&
 	    cdm->pos.cookie.bus == bus &&
 	    (cdm->pos.position_type & CAM_DEV_POS_TARGET) &&
 	    cdm->pos.cookie.target == target &&
 	    (cdm->pos.position_type & CAM_DEV_POS_DEVICE) &&
 	    cdm->pos.cookie.device != NULL;
 	if (resuming) {
 		/* A stale generation means the saved device can't be trusted. */
 		if (cdm->pos.generations[CAM_DEV_GENERATION] !=
 		    target->generation) {
 			mtx_unlock(&bus->eb_mtx);
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return (0);
 		}
 		/* Hold the saved device for the traversal to consume. */
 		resume_dev = (struct cam_ed *)cdm->pos.cookie.device;
 		resume_dev->refcount++;
 	}
 	mtx_unlock(&bus->eb_mtx);
 
 	return (xptdevicetraverse(target, resume_dev, xptedtdevicefunc, arg));
 }
 
 /*
  * Device-level callback for the EDT match traversal.
  *
  * arg is the struct ccb_dev_match being serviced.  The device is matched
  * against the request's patterns (or treated as "descend" when the saved
  * position names a peripheral under this device), copied into the result
  * buffer when selected, and then its peripherals are traversed.
  *
  * Returns 0 when the traversal must stop (cdm->status is set to the
  * reason), 1 when the caller should move on to the next device, or the
  * value returned by the peripheral traversal below this device.
  */
 static int
 xptedtdevicefunc(struct cam_ed *device, void *arg)
 {
 	struct cam_eb *bus;
 	struct cam_periph *periph;
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 	bus = device->target->bus;
 
 	/*
 	 * If our position is for something deeper in the tree, that means
 	 * that we've already seen this node.  So, we keep going down.
 	 */
 	if ((cdm->pos.position_type & CAM_DEV_POS_DEVICE)
 	 && (cdm->pos.cookie.device == device)
 	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
 	 && (cdm->pos.cookie.periph != NULL))
 		retval = DM_RET_DESCEND;
 	else
 		retval = xptdevicematch(cdm->patterns, cdm->num_patterns,
 					device);
 
 	/*
 	 * If we got an error, bail out of the search.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this device out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			/*
 			 * Record the full bus/target/device position along
 			 * with the generation of each list, so that a later
 			 * request can detect list changes before resuming.
 			 */
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
 				CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE;
 
 			cdm->pos.cookie.bus = device->target->bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->pos.cookie.target = device->target;
 			cdm->pos.generations[CAM_TARGET_GENERATION] =
 				device->target->bus->generation;
 			cdm->pos.cookie.device = device;
 			cdm->pos.generations[CAM_DEV_GENERATION] =
 				device->target->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_DEVICE;
 		cdm->matches[j].result.device_result.path_id =
 			device->target->bus->path_id;
 		cdm->matches[j].result.device_result.target_id =
 			device->target->target_id;
 		cdm->matches[j].result.device_result.target_lun =
 			device->lun_id;
 		cdm->matches[j].result.device_result.protocol =
 			device->protocol;
 		/* Both identification blobs are copied unconditionally. */
 		bcopy(&device->inq_data,
 		      &cdm->matches[j].result.device_result.inq_data,
 		      sizeof(struct scsi_inquiry_data));
 		bcopy(&device->ident_data,
 		      &cdm->matches[j].result.device_result.ident_data,
 		      sizeof(struct ata_params));
 
 		/* Let the user know whether this device is unconfigured */
 		if (device->flags & CAM_DEV_UNCONFIGURED)
 			cdm->matches[j].result.device_result.flags =
 				DEV_RESULT_UNCONFIGURED;
 		else
 			cdm->matches[j].result.device_result.flags =
 				DEV_RESULT_NOFLAG;
 	}
 
 	/*
 	 * If the user isn't interested in peripherals, don't descend
 	 * the tree any further.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
 		return(1);
 
 	/*
 	 * If there is a peripheral list generation recorded, make sure
 	 * it hasn't changed.
 	 */
 	xpt_lock_buses();
 	mtx_lock(&bus->eb_mtx);
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
 	 && (cdm->pos.cookie.bus == bus)
 	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
 	 && (cdm->pos.cookie.target == device->target)
 	 && (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
 	 && (cdm->pos.cookie.device == device)
 	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
 	 && (cdm->pos.cookie.periph != NULL)) {
 		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
 		    device->generation) {
 			/* Drop the locks in the reverse order of acquisition. */
 			mtx_unlock(&bus->eb_mtx);
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return(0);
 		}
 		/*
 		 * Resume at the saved peripheral; the reference taken here
 		 * is handed to the peripheral traversal.
 		 */
 		periph = (struct cam_periph *)cdm->pos.cookie.periph;
 		periph->refcount++;
 	} else
 		periph = NULL;
 	mtx_unlock(&bus->eb_mtx);
 	xpt_unlock_buses();
 
 	return (xptperiphtraverse(device, periph, xptedtperiphfunc, arg));
 }
 
 /*
  * Peripheral-level callback for the EDT match traversal.
  *
  * arg is the struct ccb_dev_match being serviced.  The peripheral is
  * matched against the request's patterns and, when selected, copied into
  * the result buffer.  If the buffer is full, the complete
  * bus/target/device/peripheral position and the generation of each list
  * are saved so that a later request can resume here.
  *
  * Returns 0 when the traversal must stop (cdm->status is set to the
  * reason) and 1 when the caller should move on to the next peripheral.
  */
 static int
 xptedtperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
 
 	/*
 	 * If we got an error, bail out of the search.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this peripheral out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
 				CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE |
 				CAM_DEV_POS_PERIPH;
 
 			cdm->pos.cookie.bus = periph->path->bus;
 			cdm->pos.generations[CAM_BUS_GENERATION]=
 				xsoftc.bus_generation;
 			cdm->pos.cookie.target = periph->path->target;
 			cdm->pos.generations[CAM_TARGET_GENERATION] =
 				periph->path->bus->generation;
 			cdm->pos.cookie.device = periph->path->device;
 			cdm->pos.generations[CAM_DEV_GENERATION] =
 				periph->path->target->generation;
 			cdm->pos.cookie.periph = periph;
 			cdm->pos.generations[CAM_PERIPH_GENERATION] =
 				periph->path->device->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_PERIPH;
 		cdm->matches[j].result.periph_result.path_id =
 			periph->path->bus->path_id;
 		cdm->matches[j].result.periph_result.target_id =
 			periph->path->target->target_id;
 		cdm->matches[j].result.periph_result.target_lun =
 			periph->path->device->lun_id;
 		cdm->matches[j].result.periph_result.unit_number =
 			periph->unit_number;
 		/*
 		 * strncpy() leaves the destination unterminated when the
 		 * source fills it completely, so copy at most
 		 * DEV_IDLEN - 1 bytes and terminate explicitly.  The
 		 * name handed back to the requester is then always a
 		 * valid C string.
 		 */
 		strncpy(cdm->matches[j].result.periph_result.periph_name,
 			periph->periph_name, DEV_IDLEN - 1);
 		cdm->matches[j].result.periph_result.periph_name[DEV_IDLEN - 1] =
 			'\0';
 	}
 
 	return(1);
 }
 
 /*
  * Entry point for servicing a device match request by walking the EDT.
  *
  * The match count is reset, the saved bus position (if any) is validated
  * against the current bus list generation, and the bus traversal is started
  * from that bus or from the head of the list.
  *
  * Returns 0 when the walk stopped early, in which case cdm->status has
  * already been set by whoever stopped it, and 1 when the whole EDT was
  * covered, in which case cdm->status becomes CAM_DEV_MATCH_LAST.
  */
 static int
 xptedtmatch(struct ccb_dev_match *cdm)
 {
 	struct cam_eb *resume_bus = NULL;
 	int finished;
 
 	cdm->num_matches = 0;
 
 	xpt_lock_buses();
 	if ((cdm->pos.position_type & CAM_DEV_POS_BUS) &&
 	    cdm->pos.cookie.bus != NULL) {
 		/*
 		 * A changed bus list invalidates the saved position; the
 		 * user has to start over from scratch.
 		 */
 		if (cdm->pos.generations[CAM_BUS_GENERATION] !=
 		    xsoftc.bus_generation) {
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return (0);
 		}
 		/* Hold the saved bus for the traversal to consume. */
 		resume_bus = (struct cam_eb *)cdm->pos.cookie.bus;
 		resume_bus->refcount++;
 	}
 	xpt_unlock_buses();
 
 	finished = xptbustraverse(resume_bus, xptedtbusfunc, cdm);
 	if (finished == 1)
 		cdm->status = CAM_DEV_MATCH_LAST;
 
 	return (finished);
 }
 
 /*
  * Peripheral-driver-level callback for the peripheral list match traversal.
  *
  * Works out where the walk over this driver's peripherals should start.
  * When the saved position names a peripheral of this very driver, the
  * recorded generation is checked against the driver's and the walk resumes
  * at that peripheral; otherwise it begins at the head of the driver's list.
  *
  * Returns 0 with cdm->status set when the list has changed, otherwise the
  * value returned by the peripheral traversal.
  */
 static int
 xptplistpdrvfunc(struct periph_driver **pdrv, void *arg)
 {
 	struct ccb_dev_match *cdm = (struct ccb_dev_match *)arg;
 	struct cam_periph *resume_periph = NULL;
 	int resuming;
 
 	xpt_lock_buses();
 	resuming = (cdm->pos.position_type & CAM_DEV_POS_PDPTR) &&
 	    cdm->pos.cookie.pdrv == pdrv &&
 	    (cdm->pos.position_type & CAM_DEV_POS_PERIPH) &&
 	    cdm->pos.cookie.periph != NULL;
 	if (resuming) {
 		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
 		    (*pdrv)->generation) {
 			xpt_unlock_buses();
 			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
 			return (0);
 		}
 		/* Hold the saved peripheral for the traversal to consume. */
 		resume_periph = (struct cam_periph *)cdm->pos.cookie.periph;
 		resume_periph->refcount++;
 	}
 	xpt_unlock_buses();
 
 	return (xptpdperiphtraverse(pdrv, resume_periph, xptplistperiphfunc,
 	    arg));
 }
 
 /*
  * Peripheral-level callback for the peripheral driver list match traversal.
  *
  * arg is the struct ccb_dev_match being serviced.  The peripheral is
  * matched against the request's patterns and, when selected, copied into
  * the result buffer.  If the buffer is full, the owning peripheral driver
  * entry is looked up by name and saved together with the peripheral and
  * the driver's generation so that a later request can resume here.
  *
  * Returns 0 when the traversal must stop (cdm->status is set to the
  * reason) and 1 when the caller should move on to the next peripheral.
  */
 static int
 xptplistperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct ccb_dev_match *cdm;
 	dev_match_ret retval;
 
 	cdm = (struct ccb_dev_match *)arg;
 
 	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
 
 	/*
 	 * If we got an error, bail out of the search.
 	 */
 	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
 		cdm->status = CAM_DEV_MATCH_ERROR;
 		return(0);
 	}
 
 	/*
 	 * If the copy flag is set, copy this peripheral out.
 	 */
 	if (retval & DM_RET_COPY) {
 		int spaceleft, j;
 
 		spaceleft = cdm->match_buf_len - (cdm->num_matches *
 			sizeof(struct dev_match_result));
 
 		/*
 		 * If we don't have enough space to put in another
 		 * match result, save our position and tell the
 		 * user there are more devices to check.
 		 */
 		if (spaceleft < sizeof(struct dev_match_result)) {
 			struct periph_driver **pdrv;
 
 			bzero(&cdm->pos, sizeof(cdm->pos));
 			cdm->pos.position_type =
 				CAM_DEV_POS_PDRV | CAM_DEV_POS_PDPTR |
 				CAM_DEV_POS_PERIPH;
 
 			/*
 			 * This may look a bit non-sensical, but it is
 			 * actually quite logical.  There are very few
 			 * peripheral drivers, and bloating every peripheral
 			 * structure with a pointer back to its parent
 			 * peripheral driver linker set entry would cost
 			 * more in the long run than doing this quick lookup.
 			 */
 			for (pdrv = periph_drivers; *pdrv != NULL; pdrv++) {
 				if (strcmp((*pdrv)->driver_name,
 				    periph->periph_name) == 0)
 					break;
 			}
 
 			/* No driver carries this peripheral's name. */
 			if (*pdrv == NULL) {
 				cdm->status = CAM_DEV_MATCH_ERROR;
 				return(0);
 			}
 
 			cdm->pos.cookie.pdrv = pdrv;
 			/*
 			 * The periph generation slot does double duty, as
 			 * does the periph pointer slot.  They are used for
 			 * both edt and pdrv lookups and positioning.
 			 */
 			cdm->pos.cookie.periph = periph;
 			cdm->pos.generations[CAM_PERIPH_GENERATION] =
 				(*pdrv)->generation;
 			cdm->status = CAM_DEV_MATCH_MORE;
 			return(0);
 		}
 
 		j = cdm->num_matches;
 		cdm->num_matches++;
 		cdm->matches[j].type = DEV_MATCH_PERIPH;
 		cdm->matches[j].result.periph_result.path_id =
 			periph->path->bus->path_id;
 
 		/*
 		 * The transport layer peripheral doesn't have a target or
 		 * lun.
 		 */
 		if (periph->path->target)
 			cdm->matches[j].result.periph_result.target_id =
 				periph->path->target->target_id;
 		else
 			cdm->matches[j].result.periph_result.target_id =
 				CAM_TARGET_WILDCARD;
 
 		if (periph->path->device)
 			cdm->matches[j].result.periph_result.target_lun =
 				periph->path->device->lun_id;
 		else
 			cdm->matches[j].result.periph_result.target_lun =
 				CAM_LUN_WILDCARD;
 
 		cdm->matches[j].result.periph_result.unit_number =
 			periph->unit_number;
 		/*
 		 * strncpy() leaves the destination unterminated when the
 		 * source fills it completely, so copy at most
 		 * DEV_IDLEN - 1 bytes and terminate explicitly.  The
 		 * name handed back to the requester is then always a
 		 * valid C string.
 		 */
 		strncpy(cdm->matches[j].result.periph_result.periph_name,
 			periph->periph_name, DEV_IDLEN - 1);
 		cdm->matches[j].result.periph_result.periph_name[DEV_IDLEN - 1] =
 			'\0';
 	}
 
 	return(1);
 }
 
 /*
  * Entry point for servicing a device match request by walking the
  * peripheral driver list instead of the EDT.
  *
  * Returns 0 when the walk stopped early, in which case cdm->status has
  * already been set by whoever stopped it, and 1 when every driver was
  * covered, in which case cdm->status becomes CAM_DEV_MATCH_LAST.
  */
 static int
 xptperiphlistmatch(struct ccb_dev_match *cdm)
 {
 	struct periph_driver **first_pdrv = NULL;
 	int finished;
 
 	cdm->num_matches = 0;
 
 	/*
 	 * The EDT variant of this routine validates the bus list generation
 	 * at this point.  Nothing comparable is needed here: peripheral
 	 * driver types live in a linker set and therefore cannot come or go
 	 * without a recompile.  Only the starting entry has to be chosen,
 	 * namely the saved one if the position carries it.
 	 */
 	if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR) &&
 	    cdm->pos.cookie.pdrv != NULL)
 		first_pdrv = (struct periph_driver **)cdm->pos.cookie.pdrv;
 
 	finished = xptpdrvtraverse(first_pdrv, xptplistpdrvfunc, cdm);
 
 	/*
 	 * A zero result means a callback stopped the walk and filled in the
 	 * status itself; only a complete walk is reported from here.
 	 */
 	if (finished == 1)
 		cdm->status = CAM_DEV_MATCH_LAST;
 
 	return (finished);
 }
 
 /*
  * Call tr_func on each bus in the global bus list, beginning at start_bus
  * or, when that is NULL, at the head of the list.
  *
  * A non-NULL start_bus is used as-is and released here like any other
  * visited bus, so it is expected to arrive with a reference already taken.
  * Each bus is referenced before it is visited and released afterwards.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg)
 {
 	struct cam_eb *cur, *following;
 	int rv = 1;
 
 	cur = start_bus;
 	if (cur == NULL) {
 		/* Start from the list head, pinning it while locked. */
 		xpt_lock_buses();
 		cur = TAILQ_FIRST(&xsoftc.xpt_busses);
 		if (cur != NULL)
 			cur->refcount++;
 		xpt_unlock_buses();
 	}
 
 	while (cur != NULL) {
 		rv = tr_func(cur, arg);
 		if (rv == 0) {
 			xpt_release_bus(cur);
 			break;
 		}
 
 		/* Pin the successor before letting go of the current bus. */
 		xpt_lock_buses();
 		following = TAILQ_NEXT(cur, links);
 		if (following != NULL)
 			following->refcount++;
 		xpt_unlock_buses();
 
 		xpt_release_bus(cur);
 		cur = following;
 	}
 
 	return (rv);
 }
 
 /*
  * Call tr_func on each target of a bus, beginning at start_target or, when
  * that is NULL, at the head of the bus's target list.
  *
  * A non-NULL start_target is used as-is and released here like any other
  * visited target, so it is expected to arrive with a reference already
  * taken.  Each target is referenced before it is visited and released
  * afterwards.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target,
 		  xpt_targetfunc_t *tr_func, void *arg)
 {
 	struct cam_et *cur, *following;
 	int rv = 1;
 
 	cur = start_target;
 	if (cur == NULL) {
 		/* Start from the list head, pinning it while locked. */
 		mtx_lock(&bus->eb_mtx);
 		cur = TAILQ_FIRST(&bus->et_entries);
 		if (cur != NULL)
 			cur->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 	}
 
 	while (cur != NULL) {
 		rv = tr_func(cur, arg);
 		if (rv == 0) {
 			xpt_release_target(cur);
 			break;
 		}
 
 		/* Pin the successor before letting go of the current one. */
 		mtx_lock(&bus->eb_mtx);
 		following = TAILQ_NEXT(cur, links);
 		if (following != NULL)
 			following->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 
 		xpt_release_target(cur);
 		cur = following;
 	}
 
 	return (rv);
 }
 
 /*
  * Call tr_func on each device of a target, beginning at start_device or,
  * when that is NULL, at the head of the target's device list.
  *
  * A non-NULL start_device is used as-is and released here like any other
  * visited device, so it is expected to arrive with a reference already
  * taken.  The callback runs with the device's own mutex held; the list
  * itself is walked under the owning bus's mutex.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device,
 		  xpt_devicefunc_t *tr_func, void *arg)
 {
 	struct cam_eb *bus = target->bus;
 	struct cam_ed *cur, *following;
 	int rv = 1;
 
 	cur = start_device;
 	if (cur == NULL) {
 		/* Start from the list head, pinning it while locked. */
 		mtx_lock(&bus->eb_mtx);
 		cur = TAILQ_FIRST(&target->ed_entries);
 		if (cur != NULL)
 			cur->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 	}
 
 	while (cur != NULL) {
 		mtx_lock(&cur->device_mtx);
 		rv = tr_func(cur, arg);
 		mtx_unlock(&cur->device_mtx);
 		if (rv == 0) {
 			xpt_release_device(cur);
 			break;
 		}
 
 		/* Pin the successor before letting go of the current one. */
 		mtx_lock(&bus->eb_mtx);
 		following = TAILQ_NEXT(cur, links);
 		if (following != NULL)
 			following->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 
 		xpt_release_device(cur);
 		cur = following;
 	}
 
 	return (rv);
 }
 
 /*
  * Call tr_func on each peripheral attached to a device, beginning at
  * start_periph or, when that is NULL, at the first peripheral on the
  * device's list that is not flagged CAM_PERIPH_FREE.
  *
  * A non-NULL start_periph is used as-is and released here like any other
  * visited peripheral, so it is expected to arrive with a reference already
  * taken.  Peripherals flagged CAM_PERIPH_FREE are skipped.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph,
 		  xpt_periphfunc_t *tr_func, void *arg)
 {
 	struct cam_eb *bus;
 	struct cam_periph *periph, *next_periph;
 	int retval;
 
 	retval = 1;
 
 	bus = device->target->bus;
 	if (start_periph)
 		periph = start_periph;
 	else {
 		/* The list is inspected with both the bus-list and bus locks. */
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		periph = SLIST_FIRST(&device->periphs);
 		while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
 			periph = SLIST_NEXT(periph, periph_links);
 		if (periph == NULL) {
 			mtx_unlock(&bus->eb_mtx);
 			xpt_unlock_buses();
 			return (retval);
 		}
 		/* Pin the first peripheral before dropping the locks. */
 		periph->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_unlock_buses();
 	}
 	for (; periph != NULL; periph = next_periph) {
 		retval = tr_func(periph, arg);
 		if (retval == 0) {
 			/*
 			 * NOTE(review): the _locked release variant presumably
 			 * relies on the caller holding the device lock, as
 			 * xptdevicetraverse() does around its callback --
 			 * confirm for other callers.
 			 */
 			cam_periph_release_locked(periph);
 			break;
 		}
 		/* Pin the successor before releasing the current peripheral. */
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		next_periph = SLIST_NEXT(periph, periph_links);
 		while (next_periph != NULL &&
 		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
 			next_periph = SLIST_NEXT(next_periph, periph_links);
 		if (next_periph)
 			next_periph->refcount++;
 		mtx_unlock(&bus->eb_mtx);
 		xpt_unlock_buses();
 		cam_periph_release_locked(periph);
 	}
 	return(retval);
 }
 
 /*
  * Call tr_func on each entry of the NULL-terminated peripheral driver
  * array, beginning at start_pdrv or, when that is NULL, at the first entry.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xptpdrvtraverse(struct periph_driver **start_pdrv,
 		xpt_pdrvfunc_t *tr_func, void *arg)
 {
 	struct periph_driver **entry;
 	int rv = 1;
 
 	/*
 	 * No locking or reference counting is done here, unlike in the
 	 * other traversal routines: the peripheral driver list is a linker
 	 * set and so cannot change at runtime.  Should it ever become
 	 * something that can change while the system is running, this walk
 	 * needs to be reworked along the lines of the other traversals.
 	 */
 	entry = (start_pdrv != NULL) ? start_pdrv : periph_drivers;
 	while (*entry != NULL) {
 		rv = tr_func(entry, arg);
 		if (rv == 0)
 			break;
 		entry++;
 	}
 
 	return (rv);
 }
 
 /*
  * Call tr_func on each peripheral instance of one peripheral driver,
  * beginning at start_periph or, when that is NULL, at the first unit on
  * the driver's list that is not flagged CAM_PERIPH_FREE.
  *
  * A non-NULL start_periph is used as-is and released here like any other
  * visited peripheral, so it is expected to arrive with a reference already
  * taken.  Peripherals flagged CAM_PERIPH_FREE are skipped.  The callback
  * runs between cam_periph_lock() and cam_periph_unlock() on the peripheral
  * being visited.
  *
  * Returns the last value produced by tr_func (0 ends the walk early), or 1
  * when there was nothing to visit.
  */
 static int
 xptpdperiphtraverse(struct periph_driver **pdrv,
 		    struct cam_periph *start_periph,
 		    xpt_periphfunc_t *tr_func, void *arg)
 {
 	struct cam_periph *periph, *next_periph;
 	int retval;
 
 	retval = 1;
 
 	if (start_periph)
 		periph = start_periph;
 	else {
 		xpt_lock_buses();
 		periph = TAILQ_FIRST(&(*pdrv)->units);
 		while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
 			periph = TAILQ_NEXT(periph, unit_links);
 		if (periph == NULL) {
 			xpt_unlock_buses();
 			return (retval);
 		}
 		/* Pin the first peripheral before dropping the lock. */
 		periph->refcount++;
 		xpt_unlock_buses();
 	}
 	for (; periph != NULL; periph = next_periph) {
 		cam_periph_lock(periph);
 		retval = tr_func(periph, arg);
 		cam_periph_unlock(periph);
 		if (retval == 0) {
 			/* The periph lock was dropped above: plain release. */
 			cam_periph_release(periph);
 			break;
 		}
 		/* Pin the successor before releasing the current peripheral. */
 		xpt_lock_buses();
 		next_periph = TAILQ_NEXT(periph, unit_links);
 		while (next_periph != NULL &&
 		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
 			next_periph = TAILQ_NEXT(next_periph, unit_links);
 		if (next_periph)
 			next_periph->refcount++;
 		xpt_unlock_buses();
 		cam_periph_release(periph);
 	}
 	return(retval);
 }
 
 static int
 xptdefbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_BUS) {
 		xpt_busfunc_t *tr_func;
 
 		tr_func = (xpt_busfunc_t *)tr_config->tr_func;
 
 		return(tr_func(bus, tr_config->tr_arg));
 	} else
 		return(xpttargettraverse(bus, NULL, xptdeftargetfunc, arg));
 }
 
 static int
 xptdeftargetfunc(struct cam_et *target, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_TARGET) {
 		xpt_targetfunc_t *tr_func;
 
 		tr_func = (xpt_targetfunc_t *)tr_config->tr_func;
 
 		return(tr_func(target, tr_config->tr_arg));
 	} else
 		return(xptdevicetraverse(target, NULL, xptdefdevicefunc, arg));
 }
 
 static int
 xptdefdevicefunc(struct cam_ed *device, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	if (tr_config->depth == XPT_DEPTH_DEVICE) {
 		xpt_devicefunc_t *tr_func;
 
 		tr_func = (xpt_devicefunc_t *)tr_config->tr_func;
 
 		return(tr_func(device, tr_config->tr_arg));
 	} else
 		return(xptperiphtraverse(device, NULL, xptdefperiphfunc, arg));
 }
 
 static int
 xptdefperiphfunc(struct cam_periph *periph, void *arg)
 {
 	struct xpt_traverse_config *tr_config;
 	xpt_periphfunc_t *tr_func;
 
 	tr_config = (struct xpt_traverse_config *)arg;
 
 	tr_func = (xpt_periphfunc_t *)tr_config->tr_func;
 
 	/*
 	 * Unlike the other default functions, we don't check for depth
 	 * here.  The peripheral driver level is the last level in the EDT,
 	 * so if we're here, we should execute the function in question.
 	 */
 	return(tr_func(periph, tr_config->tr_arg));
 }
 
 /*
  * Run tr_func, with arg as its argument, on every bus in the EDT.
  * Returns the result of the underlying bus traversal.
  */
 static int
 xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg)
 {
 	struct xpt_traverse_config cfg;
 
 	/* Stop descending at the bus level and call the user there. */
 	cfg.tr_arg = arg;
 	cfg.tr_func = tr_func;
 	cfg.depth = XPT_DEPTH_BUS;
 
 	return (xptbustraverse(NULL, xptdefbusfunc, &cfg));
 }
 
 /*
  * Run tr_func, with arg as its argument, on every device in the EDT.
  * Returns the result of the underlying bus traversal.
  */
 static int
 xpt_for_all_devices(xpt_devicefunc_t *tr_func, void *arg)
 {
 	struct xpt_traverse_config cfg;
 
 	/* Descend through busses and targets; call the user per device. */
 	cfg.tr_arg = arg;
 	cfg.tr_func = tr_func;
 	cfg.depth = XPT_DEPTH_DEVICE;
 
 	return (xptbustraverse(NULL, xptdefbusfunc, &cfg));
 }
 
 static int
 xptsetasyncfunc(struct cam_ed *device, void *arg)
 {
 	struct cam_path path;
 	struct ccb_getdev cgd;
 	struct ccb_setasync *csa = (struct ccb_setasync *)arg;
 
 	/*
 	 * Don't report unconfigured devices (Wildcard devs,
 	 * devices only for target mode, device instances
 	 * that have been invalidated but are waiting for
 	 * their last reference count to be released).
 	 */
 	if ((device->flags & CAM_DEV_UNCONFIGURED) != 0)
 		return (1);
 
 	xpt_compile_path(&path,
 			 NULL,
 			 device->target->bus->path_id,
 			 device->target->target_id,
 			 device->lun_id);
 	xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL);
 	cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 	xpt_action((union ccb *)&cgd);
 	csa->callback(csa->callback_arg,
 			    AC_FOUND_DEVICE,
 			    &path, &cgd);
 	xpt_release_path(&path);
 
 	return(1);
 }
 
 static int
 xptsetasyncbusfunc(struct cam_eb *bus, void *arg)
 {
 	struct cam_path path;
 	struct ccb_pathinq cpi;
 	struct ccb_setasync *csa = (struct ccb_setasync *)arg;
 
 	xpt_compile_path(&path, /*periph*/NULL,
 			 bus->path_id,
 			 CAM_TARGET_WILDCARD,
 			 CAM_LUN_WILDCARD);
 	xpt_path_lock(&path);
 	xpt_setup_ccb(&cpi.ccb_h, &path, CAM_PRIORITY_NORMAL);
 	cpi.ccb_h.func_code = XPT_PATH_INQ;
 	xpt_action((union ccb *)&cpi);
 	csa->callback(csa->callback_arg,
 			    AC_PATH_REGISTERED,
 			    &path, &cpi);
 	xpt_path_unlock(&path);
 	xpt_release_path(&path);
 
 	return(1);
 }
 
 void
 xpt_action(union ccb *start_ccb)
 {
 
 	CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));
 
 	start_ccb->ccb_h.status = CAM_REQ_INPROG;
 	(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
 }
 
 void
 xpt_action_default(union ccb *start_ccb)
 {
 	struct cam_path *path;
 	struct cam_sim *sim;
 	int lock;
 
 	path = start_ccb->ccb_h.path;
 	CAM_DEBUG(path, CAM_DEBUG_TRACE,
 	    ("xpt_action_default: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));
 
 	switch (start_ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO:
 	{
 		struct cam_ed *device;
 
 		/*
 		 * For the sake of compatibility with SCSI-1
 		 * devices that may not understand the identify
 		 * message, we include lun information in the
 		 * second byte of all commands.  SCSI-1 specifies
 		 * that luns are a 3 bit value and reserves only 3
 		 * bits for lun information in the CDB.  Later
 		 * revisions of the SCSI spec allow for more than 8
 		 * luns, but have deprecated lun information in the
 		 * CDB.  So, if the lun won't fit, we must omit.
 		 *
 		 * Also be aware that during initial probing for devices,
 		 * the inquiry information is unknown but initialized to 0.
 		 * This means that this code will be exercised while probing
 		 * devices with an ANSI revision greater than 2.
 		 */
 		device = path->device;
 		if (device->protocol_version <= SCSI_REV_2
 		 && start_ccb->ccb_h.target_lun < 8
 		 && (start_ccb->ccb_h.flags & CAM_CDB_POINTER) == 0) {
 
 			start_ccb->csio.cdb_io.cdb_bytes[1] |=
 			    start_ccb->ccb_h.target_lun << 5;
 		}
 		start_ccb->csio.scsi_status = SCSI_STATUS_OK;
 	}
 	/* FALLTHROUGH */
 	case XPT_TARGET_IO:
 	case XPT_CONT_TARGET_IO:
 		start_ccb->csio.sense_resid = 0;
 		start_ccb->csio.resid = 0;
 		/* FALLTHROUGH */
 	case XPT_ATA_IO:
 		if (start_ccb->ccb_h.func_code == XPT_ATA_IO)
 			start_ccb->ataio.resid = 0;
 		/* FALLTHROUGH */
 	case XPT_NVME_IO:
 		if (start_ccb->ccb_h.func_code == XPT_NVME_IO)
 			start_ccb->nvmeio.resid = 0;
 		/* FALLTHROUGH */
 	case XPT_RESET_DEV:
 	case XPT_ENG_EXEC:
 	case XPT_SMP_IO:
 	{
 		struct cam_devq *devq;
 
 		devq = path->bus->sim->devq;
 		mtx_lock(&devq->send_mtx);
 		cam_ccbq_insert_ccb(&path->device->ccbq, start_ccb);
 		if (xpt_schedule_devq(devq, path->device) != 0)
 			xpt_run_devq(devq);
 		mtx_unlock(&devq->send_mtx);
 		break;
 	}
 	case XPT_CALC_GEOMETRY:
 		/* Filter out garbage */
 		if (start_ccb->ccg.block_size == 0
 		 || start_ccb->ccg.volume_size == 0) {
 			start_ccb->ccg.cylinders = 0;
 			start_ccb->ccg.heads = 0;
 			start_ccb->ccg.secs_per_track = 0;
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 			break;
 		}
 #if defined(PC98) || defined(__sparc64__)
 		/*
 		 * In a PC-98 system, geometry translation depends on
 		 * the "real" device geometry obtained from mode page 4.
 		 * SCSI geometry translation is performed in the
 		 * initialization routine of the SCSI BIOS and the result
 		 * stored in host memory.  If the translation is available
 		 * in host memory, use it.  If not, rely on the default
 		 * translation the device driver performs.
 		 * For sparc64, we may need to adjust the geometry of large
 		 * disks in order to fit the limitations of the 16-bit
 		 * fields of the VTOC8 disk label.
 		 */
 		if (scsi_da_bios_params(&start_ccb->ccg) != 0) {
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 			break;
 		}
 #endif
 		goto call_sim;
 	case XPT_ABORT:
 	{
 		union ccb* abort_ccb;
 
 		abort_ccb = start_ccb->cab.abort_ccb;
 		if (XPT_FC_IS_DEV_QUEUED(abort_ccb)) {
 			struct cam_ed *device;
 			struct cam_devq *devq;
 
 			device = abort_ccb->ccb_h.path->device;
 			devq = device->sim->devq;
 
 			mtx_lock(&devq->send_mtx);
 			if (abort_ccb->ccb_h.pinfo.index > 0) {
 				cam_ccbq_remove_ccb(&device->ccbq, abort_ccb);
 				abort_ccb->ccb_h.status =
 				    CAM_REQ_ABORTED|CAM_DEV_QFRZN;
 				xpt_freeze_devq_device(device, 1);
 				mtx_unlock(&devq->send_mtx);
 				xpt_done(abort_ccb);
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				break;
 			}
 			mtx_unlock(&devq->send_mtx);
 
 			if (abort_ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX
 			 && (abort_ccb->ccb_h.status & CAM_SIM_QUEUED) == 0) {
 				/*
 				 * We've caught this ccb en route to
 				 * the SIM.  Flag it for abort and the
 				 * SIM will do so just before starting
 				 * real work on the CCB.
 				 */
 				abort_ccb->ccb_h.status =
 				    CAM_REQ_ABORTED|CAM_DEV_QFRZN;
 				xpt_freeze_devq(abort_ccb->ccb_h.path, 1);
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				break;
 			}
 		}
 		if (XPT_FC_IS_QUEUED(abort_ccb)
 		 && (abort_ccb->ccb_h.pinfo.index == CAM_DONEQ_INDEX)) {
 			/*
 			 * It's already completed but waiting
 			 * for our SWI to get to it.
 			 */
 			start_ccb->ccb_h.status = CAM_UA_ABORT;
 			break;
 		}
 		/*
 		 * If we weren't able to take care of the abort request
 		 * in the XPT, pass the request down to the SIM for processing.
 		 */
 	}
 	/* FALLTHROUGH */
 	case XPT_ACCEPT_TARGET_IO:
 	case XPT_EN_LUN:
 	case XPT_IMMED_NOTIFY:
 	case XPT_NOTIFY_ACK:
 	case XPT_RESET_BUS:
 	case XPT_IMMEDIATE_NOTIFY:
 	case XPT_NOTIFY_ACKNOWLEDGE:
 	case XPT_GET_SIM_KNOB_OLD:
 	case XPT_GET_SIM_KNOB:
 	case XPT_SET_SIM_KNOB:
 	case XPT_GET_TRAN_SETTINGS:
 	case XPT_SET_TRAN_SETTINGS:
 	case XPT_PATH_INQ:
 call_sim:
 		sim = path->bus->sim;
 		lock = (mtx_owned(sim->mtx) == 0);
 		if (lock)
 			CAM_SIM_LOCK(sim);
 		CAM_DEBUG(path, CAM_DEBUG_TRACE,
 		    ("sim->sim_action: func=%#x\n", start_ccb->ccb_h.func_code));
 		(*(sim->sim_action))(sim, start_ccb);
 		CAM_DEBUG(path, CAM_DEBUG_TRACE,
 		    ("sim->sim_action: status=%#x\n", start_ccb->ccb_h.status));
 		if (lock)
 			CAM_SIM_UNLOCK(sim);
 		break;
 	case XPT_PATH_STATS:
 		start_ccb->cpis.last_reset = path->bus->last_reset;
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_GDEV_TYPE:
 	{
 		struct cam_ed *dev;
 
 		dev = path->device;
 		if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) {
 			start_ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 		} else {
 			struct ccb_getdev *cgd;
 
 			cgd = &start_ccb->cgd;
 			cgd->protocol = dev->protocol;
 			cgd->inq_data = dev->inq_data;
 			cgd->ident_data = dev->ident_data;
 			cgd->inq_flags = dev->inq_flags;
 			cgd->nvme_data = dev->nvme_data;
 			cgd->nvme_cdata = dev->nvme_cdata;
 			cgd->ccb_h.status = CAM_REQ_CMP;
 			cgd->serial_num_len = dev->serial_num_len;
 			if ((dev->serial_num_len > 0)
 			 && (dev->serial_num != NULL))
 				bcopy(dev->serial_num, cgd->serial_num,
 				      dev->serial_num_len);
 		}
 		break;
 	}
 	case XPT_GDEV_STATS:
 	{
 		struct cam_ed *dev;
 
 		dev = path->device;
 		if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) {
 			start_ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 		} else {
 			struct ccb_getdevstats *cgds;
 			struct cam_eb *bus;
 			struct cam_et *tar;
 			struct cam_devq *devq;
 
 			cgds = &start_ccb->cgds;
 			bus = path->bus;
 			tar = path->target;
 			devq = bus->sim->devq;
 			mtx_lock(&devq->send_mtx);
 			cgds->dev_openings = dev->ccbq.dev_openings;
 			cgds->dev_active = dev->ccbq.dev_active;
 			cgds->allocated = dev->ccbq.allocated;
 			cgds->queued = cam_ccbq_pending_ccb_count(&dev->ccbq);
 			cgds->held = cgds->allocated - cgds->dev_active -
 			    cgds->queued;
 			cgds->last_reset = tar->last_reset;
 			cgds->maxtags = dev->maxtags;
 			cgds->mintags = dev->mintags;
 			if (timevalcmp(&tar->last_reset, &bus->last_reset, <))
 				cgds->last_reset = bus->last_reset;
 			mtx_unlock(&devq->send_mtx);
 			cgds->ccb_h.status = CAM_REQ_CMP;
 		}
 		break;
 	}
 	case XPT_GDEVLIST:
 	{
 		struct cam_periph	*nperiph;
 		struct periph_list	*periph_head;
 		struct ccb_getdevlist	*cgdl;
 		u_int			i;
 		struct cam_ed		*device;
 		int			found;
 
 
 		found = 0;
 
 		/*
 		 * Don't want anyone mucking with our data.
 		 */
 		device = path->device;
 		periph_head = &device->periphs;
 		cgdl = &start_ccb->cgdl;
 
 		/*
 		 * Check and see if the list has changed since the user
 		 * last requested a list member.  If so, tell them that the
 		 * list has changed, and therefore they need to start over
 		 * from the beginning.
 		 */
 		if ((cgdl->index != 0) &&
 		    (cgdl->generation != device->generation)) {
 			cgdl->status = CAM_GDEVLIST_LIST_CHANGED;
 			break;
 		}
 
 		/*
 		 * Traverse the list of peripherals and attempt to find
 		 * the requested peripheral.
 		 */
 		for (nperiph = SLIST_FIRST(periph_head), i = 0;
 		     (nperiph != NULL) && (i <= cgdl->index);
 		     nperiph = SLIST_NEXT(nperiph, periph_links), i++) {
 			if (i == cgdl->index) {
 				strncpy(cgdl->periph_name,
 					nperiph->periph_name,
 					DEV_IDLEN);
 				cgdl->unit_number = nperiph->unit_number;
 				found = 1;
 			}
 		}
 		if (found == 0) {
 			cgdl->status = CAM_GDEVLIST_ERROR;
 			break;
 		}
 
 		if (nperiph == NULL)
 			cgdl->status = CAM_GDEVLIST_LAST_DEVICE;
 		else
 			cgdl->status = CAM_GDEVLIST_MORE_DEVS;
 
 		cgdl->index++;
 		cgdl->generation = device->generation;
 
 		cgdl->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_DEV_MATCH:
 	{
 		dev_pos_type position_type;
 		struct ccb_dev_match *cdm;
 
 		cdm = &start_ccb->cdm;
 
 		/*
 		 * There are two ways of getting at information in the EDT.
 		 * The first way is via the primary EDT tree.  It starts
 		 * with a list of busses, then a list of targets on a bus,
 		 * then devices/luns on a target, and then peripherals on a
 		 * device/lun.  The "other" way is by the peripheral driver
 		 * lists.  The peripheral driver lists are organized by
 		 * peripheral driver.  (obviously)  So it makes sense to
 		 * use the peripheral driver list if the user is looking
 		 * for something like "da1", or all "da" devices.  If the
 		 * user is looking for something on a particular bus/target
 		 * or lun, it's generally better to go through the EDT tree.
 		 */
 
 		if (cdm->pos.position_type != CAM_DEV_POS_NONE)
 			position_type = cdm->pos.position_type;
 		else {
 			u_int i;
 
 			position_type = CAM_DEV_POS_NONE;
 
 			for (i = 0; i < cdm->num_patterns; i++) {
 				if ((cdm->patterns[i].type == DEV_MATCH_BUS)
 				 ||(cdm->patterns[i].type == DEV_MATCH_DEVICE)){
 					position_type = CAM_DEV_POS_EDT;
 					break;
 				}
 			}
 
 			if (cdm->num_patterns == 0)
 				position_type = CAM_DEV_POS_EDT;
 			else if (position_type == CAM_DEV_POS_NONE)
 				position_type = CAM_DEV_POS_PDRV;
 		}
 
 		switch(position_type & CAM_DEV_POS_TYPEMASK) {
 		case CAM_DEV_POS_EDT:
 			xptedtmatch(cdm);
 			break;
 		case CAM_DEV_POS_PDRV:
 			xptperiphlistmatch(cdm);
 			break;
 		default:
 			cdm->status = CAM_DEV_MATCH_ERROR;
 			break;
 		}
 
 		if (cdm->status == CAM_DEV_MATCH_ERROR)
 			start_ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 		else
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 
 		break;
 	}
 	case XPT_SASYNC_CB:
 	{
 		struct ccb_setasync *csa;
 		struct async_node *cur_entry;
 		struct async_list *async_head;
 		u_int32_t added;
 
 		csa = &start_ccb->csa;
 		added = csa->event_enable;
 		async_head = &path->device->asyncs;
 
 		/*
 		 * If there is already an entry for us, simply
 		 * update it.
 		 */
 		cur_entry = SLIST_FIRST(async_head);
 		while (cur_entry != NULL) {
 			if ((cur_entry->callback_arg == csa->callback_arg)
 			 && (cur_entry->callback == csa->callback))
 				break;
 			cur_entry = SLIST_NEXT(cur_entry, links);
 		}
 
 		if (cur_entry != NULL) {
 		 	/*
 			 * If the request has no flags set,
 			 * remove the entry.
 			 */
 			added &= ~cur_entry->event_enable;
 			if (csa->event_enable == 0) {
 				SLIST_REMOVE(async_head, cur_entry,
 					     async_node, links);
 				xpt_release_device(path->device);
 				free(cur_entry, M_CAMXPT);
 			} else {
 				cur_entry->event_enable = csa->event_enable;
 			}
 			csa->event_enable = added;
 		} else {
 			cur_entry = malloc(sizeof(*cur_entry), M_CAMXPT,
 					   M_NOWAIT);
 			if (cur_entry == NULL) {
 				csa->ccb_h.status = CAM_RESRC_UNAVAIL;
 				break;
 			}
 			cur_entry->event_enable = csa->event_enable;
 			cur_entry->event_lock =
 			    mtx_owned(path->bus->sim->mtx) ? 1 : 0;
 			cur_entry->callback_arg = csa->callback_arg;
 			cur_entry->callback = csa->callback;
 			SLIST_INSERT_HEAD(async_head, cur_entry, links);
 			xpt_acquire_device(path->device);
 		}
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_REL_SIMQ:
 	{
 		struct ccb_relsim *crs;
 		struct cam_ed *dev;
 
 		crs = &start_ccb->crs;
 		dev = path->device;
 		if (dev == NULL) {
 
 			crs->ccb_h.status = CAM_DEV_NOT_THERE;
 			break;
 		}
 
 		if ((crs->release_flags & RELSIM_ADJUST_OPENINGS) != 0) {
 
 			/* Don't ever go below one opening */
 			if (crs->openings > 0) {
 				xpt_dev_ccbq_resize(path, crs->openings);
 				if (bootverbose) {
 					xpt_print(path,
 					    "number of openings is now %d\n",
 					    crs->openings);
 				}
 			}
 		}
 
 		mtx_lock(&dev->sim->devq->send_mtx);
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_TIMEOUT) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
 
 				/*
 				 * Just extend the old timeout and decrement
 				 * the freeze count so that a single timeout
 				 * is sufficient for releasing the queue.
 				 */
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 				callout_stop(&dev->callout);
 			} else {
 
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 
 			callout_reset_sbt(&dev->callout,
 			    SBT_1MS * crs->release_timeout, 0,
 			    xpt_release_devq_timeout, dev, 0);
 
 			dev->flags |= CAM_DEV_REL_TIMEOUT_PENDING;
 
 		}
 
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_CMDCMPLT) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0) {
 				/*
 				 * Decrement the freeze count so that a single
 				 * completion is still sufficient to unfreeze
 				 * the queue.
 				 */
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 			} else {
 
 				dev->flags |= CAM_DEV_REL_ON_COMPLETE;
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 		}
 
 		if ((crs->release_flags & RELSIM_RELEASE_AFTER_QEMPTY) != 0) {
 
 			if ((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
 			 || (dev->ccbq.dev_active == 0)) {
 
 				start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
 			} else {
 
 				dev->flags |= CAM_DEV_REL_ON_QUEUE_EMPTY;
 				start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			}
 		}
 		mtx_unlock(&dev->sim->devq->send_mtx);
 
 		if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) == 0)
 			xpt_release_devq(path, /*count*/1, /*run_queue*/TRUE);
 		start_ccb->crs.qfrozen_cnt = dev->ccbq.queue.qfrozen_cnt;
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_DEBUG: {
 		struct cam_path *oldpath;
 
 		/* Check that all request bits are supported. */
 		if (start_ccb->cdbg.flags & ~(CAM_DEBUG_COMPILE)) {
 			start_ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
 			break;
 		}
 
 		cam_dflags = CAM_DEBUG_NONE;
 		if (cam_dpath != NULL) {
 			oldpath = cam_dpath;
 			cam_dpath = NULL;
 			xpt_free_path(oldpath);
 		}
 		if (start_ccb->cdbg.flags != CAM_DEBUG_NONE) {
 			if (xpt_create_path(&cam_dpath, NULL,
 					    start_ccb->ccb_h.path_id,
 					    start_ccb->ccb_h.target_id,
 					    start_ccb->ccb_h.target_lun) !=
 					    CAM_REQ_CMP) {
 				start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 			} else {
 				cam_dflags = start_ccb->cdbg.flags;
 				start_ccb->ccb_h.status = CAM_REQ_CMP;
 				xpt_print(cam_dpath, "debugging flags now %x\n",
 				    cam_dflags);
 			}
 		} else
 			start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_NOOP:
 		if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0)
 			xpt_freeze_devq(path, 1);
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_REPROBE_LUN:
 		xpt_async(AC_INQ_CHANGED, path, NULL);
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(start_ccb);
 		break;
 	default:
 	case XPT_SDEV_TYPE:
 	case XPT_TERM_IO:
 	case XPT_ENG_INQ:
 		/* XXX Implement */
 		xpt_print_path(start_ccb->ccb_h.path);
 		printf("%s: CCB type %#x %s not supported\n", __func__,
 		    start_ccb->ccb_h.func_code,
 		    xpt_action_name(start_ccb->ccb_h.func_code));
 		start_ccb->ccb_h.status = CAM_PROVIDE_FAIL;
 		if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) {
 			xpt_done(start_ccb);
 		}
 		break;
 	}
 	CAM_DEBUG(path, CAM_DEBUG_TRACE,
 	    ("xpt_action_default: func= %#x %s status %#x\n",
 		start_ccb->ccb_h.func_code,
  		xpt_action_name(start_ccb->ccb_h.func_code),
 		start_ccb->ccb_h.status));
 }
 
 /*
  * Submit start_ccb and wait for it by polling instead of relying on
  * interrupts: the SIM's sim_poll entry point and camisr_runqueue() are
  * invoked in a loop until the CCB leaves the CAM_REQ_INPROG state or the
  * polling budget runs out.
  *
  * The device mutex is released on entry and re-acquired before returning,
  * so the caller is expected to hold it.
  *
  * On return start_ccb->ccb_h.status is whatever the request completed
  * with, CAM_CMD_TIMEOUT if the budget expired after submission, or
  * CAM_RESRC_UNAVAIL if the budget was exhausted (or was zero) before the
  * request could be submitted.
  */
 void
 xpt_polled_action(union ccb *start_ccb)
 {
 	u_int32_t timeout;
 	struct	  cam_sim *sim;
 	struct	  cam_devq *devq;
 	struct	  cam_ed *dev;
 
 	/*
 	 * The budget is counted in polling iterations, each separated by
 	 * DELAY(100).  Presumably ccb_h.timeout is in milliseconds and
 	 * DELAY() takes microseconds, hence the factor of 10 -- confirm.
 	 */
 	timeout = start_ccb->ccb_h.timeout * 10;
 	sim = start_ccb->ccb_h.path->bus->sim;
 	devq = sim->devq;
 	dev = start_ccb->ccb_h.path->device;
 
 	mtx_unlock(&dev->device_mtx);
 
 	/*
 	 * Steal an opening so that no other queued requests
 	 * can get it before us while we simulate interrupts.
 	 */
 	mtx_lock(&devq->send_mtx);
 	dev->ccbq.dev_openings--;
 	/*
 	 * Poll until both the SIM and the device have room for us.
 	 * NOTE(review): timeout is unsigned; if it is 0 on entry and no
 	 * opening is available, --timeout wraps around instead of expiring.
 	 */
 	while((devq->send_openings <= 0 || dev->ccbq.dev_openings < 0) &&
 	    (--timeout > 0)) {
 		mtx_unlock(&devq->send_mtx);
 		DELAY(100);
 		CAM_SIM_LOCK(sim);
 		(*(sim->sim_poll))(sim);
 		CAM_SIM_UNLOCK(sim);
 		camisr_runqueue();
 		mtx_lock(&devq->send_mtx);
 	}
 	/* Give the stolen opening back; xpt_action() accounts for it itself. */
 	dev->ccbq.dev_openings++;
 	mtx_unlock(&devq->send_mtx);
 
 	if (timeout != 0) {
 		/* Budget remains: submit, then poll for completion. */
 		xpt_action(start_ccb);
 		while(--timeout > 0) {
 			CAM_SIM_LOCK(sim);
 			(*(sim->sim_poll))(sim);
 			CAM_SIM_UNLOCK(sim);
 			camisr_runqueue();
 			if ((start_ccb->ccb_h.status  & CAM_STATUS_MASK)
 			    != CAM_REQ_INPROG)
 				break;
 			DELAY(100);
 		}
 		if (timeout == 0) {
 			/*
 			 * XXX Is it worth adding a sim_timeout entry
 			 * point so we can attempt recovery?  If
 			 * this is only used for dumps, I don't think
 			 * it is.
 			 */
 			start_ccb->ccb_h.status = CAM_CMD_TIMEOUT;
 		}
 	} else {
 		/* Never obtained an opening within the budget. */
 		start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 	}
 
 	mtx_lock(&dev->device_mtx);
 }
 
 /*
  * Ask for a CCB to be handed to the peripheral driver at the given
  * priority.  The request only takes effect when new_priority is
  * numerically lower than the priority already recorded for the periph;
  * in that case the allocation queue is run right away (non-sleeping).
  * The periph lock must be held.
  */
 void
 xpt_schedule(struct cam_periph *periph, u_int32_t new_priority)
 {
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("xpt_schedule\n"));
 	cam_periph_assert(periph, MA_OWNED);
 
 	/* An equal or weaker request changes nothing. */
 	if (new_priority >= periph->scheduled_priority)
 		return;
 
 	periph->scheduled_priority = new_priority;
 	xpt_run_allocq(periph, 0);
 }
 
 
 /*
  * Place an entry on a priority queue, or re-prioritize it if it is
  * already there.
  *
  * Returns 1 when the entry was newly inserted or had its priority
  * lowered (numerically), meaning the caller should run the queue.
  * Returns 0 when the entry was already queued at an equal or better
  * priority, in which case the queue was left untouched.
  */
 static int
 xpt_schedule_dev(struct camq *queue, cam_pinfo *pinfo,
 		 u_int32_t new_priority)
 {
 	int more_urgent;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_schedule_dev\n"));
 
 	/* Compare against the priority the entry had on entry. */
 	more_urgent = (new_priority < pinfo->priority);
 
 	if (pinfo->index == CAM_UNQUEUED_INDEX) {
 		/* Not queued yet: adopt the better priority and insert. */
 		if (more_urgent)
 			pinfo->priority = new_priority;
 
 		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 				("Inserting onto queue\n"));
 		pinfo->generation = ++queue->generation;
 		camq_insert(queue, pinfo);
 		return (1);
 	}
 
 	/* Already queued and nothing to improve. */
 	if (!more_urgent)
 		return (0);
 
 	/* Already queued: just move it to its new position. */
 	camq_change_priority(queue, pinfo->index, new_priority);
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 			("changed priority to %d\n",
 			 new_priority));
 	return (1);
 }
 
 /*
  * Taskqueue handler that runs the allocation queue for a periph in a
  * context where sleeping for a CCB is allowed.  It clears the
  * CAM_PERIPH_RUN_TASK flag, runs the queue with sleep enabled and then
  * calls cam_periph_release() on the periph.  'pending' is unused.
  */
 static void
 xpt_run_allocq_task(void *context, int pending)
 {
 	struct cam_periph *periph;
 
 	periph = (struct cam_periph *)context;
 
 	cam_periph_lock(periph);
 	periph->flags &= ~CAM_PERIPH_RUN_TASK;
 	xpt_run_allocq(periph, /*sleep*/1);
 	cam_periph_unlock(periph);
 
 	cam_periph_release(periph);
 }
 
 /*
  * Hand CCBs to a peripheral driver for as long as it has outstanding
  * requests (scheduled or immediate) and is allowed to allocate.
  *
  * periph - peripheral to service; its lock must be held.
  * sleep  - non-zero if the blocking xpt_get_ccb() may be used when the
  *          non-blocking allocation fails; otherwise the work is deferred
  *          to periph_run_task on xsoftc.xpt_taskq.
  *
  * periph_allocating acts as a re-entrancy guard: a nested call returns
  * immediately and leaves the work to the outer invocation.
  */
 static void
 xpt_run_allocq(struct cam_periph *periph, int sleep)
 {
 	struct cam_ed	*device;
 	union ccb	*ccb;
 	uint32_t	 prio;
 
 	cam_periph_assert(periph, MA_OWNED);
 	if (periph->periph_allocating)
 		return;
 	periph->periph_allocating = 1;
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_allocq(%p)\n", periph));
 	device = periph->path->device;
 	ccb = NULL;
 restart:
 	/*
 	 * Serve the smaller of the two pending priority values.  Keep going
 	 * while something is pending and either the periph holds fewer CCBs
 	 * than the device's total openings (a CCB already in hand is not
 	 * counted) or the request is at CAM_PRIORITY_OOB or below.
 	 */
 	while ((prio = min(periph->scheduled_priority,
 	    periph->immediate_priority)) != CAM_PRIORITY_NONE &&
 	    (periph->periph_allocated - (ccb != NULL ? 1 : 0) <
 	     device->ccbq.total_openings || prio <= CAM_PRIORITY_OOB)) {
 
 		if (ccb == NULL &&
 		    (ccb = xpt_get_ccb_nowait(periph)) == NULL) {
 			if (sleep) {
 				/*
 				 * Blocking allocation; re-evaluate the loop
 				 * condition afterwards with the CCB in hand.
 				 */
 				ccb = xpt_get_ccb(periph);
 				goto restart;
 			}
 			/* A deferred run is already pending. */
 			if (periph->flags & CAM_PERIPH_RUN_TASK)
 				break;
 			/*
 			 * Defer to the task; xpt_run_allocq_task() calls
 			 * cam_periph_release() to balance this acquire.
 			 */
 			cam_periph_doacquire(periph);
 			periph->flags |= CAM_PERIPH_RUN_TASK;
 			taskqueue_enqueue(xsoftc.xpt_taskq,
 			    &periph->periph_run_task);
 			break;
 		}
 		xpt_setup_ccb(&ccb->ccb_h, periph->path, prio);
 		if (prio == periph->immediate_priority) {
 			/* Queue the CCB on ccb_list and wake the sleeper. */
 			periph->immediate_priority = CAM_PRIORITY_NONE;
 			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 					("waking cam_periph_getccb()\n"));
 			SLIST_INSERT_HEAD(&periph->ccb_list, &ccb->ccb_h,
 					  periph_links.sle);
 			wakeup(&periph->ccb_list);
 		} else {
 			/* Scheduled request: pass the CCB to the driver. */
 			periph->scheduled_priority = CAM_PRIORITY_NONE;
 			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 					("calling periph_start()\n"));
 			periph->periph_start(periph, ccb);
 		}
 		ccb = NULL;
 	}
 	/* A CCB obtained by the sleeping path may have gone unused. */
 	if (ccb != NULL)
 		xpt_release_ccb(ccb);
 	periph->periph_allocating = 0;
 }
 
 /*
  * Dispatch queued CCBs from the devices on a device queue to their SIMs.
  *
  * devq->send_mtx is dropped around each SIM call and re-taken afterwards,
  * so the caller must hold it on entry; it is held again on return.
  *
  * The send queue's qfrozen_cnt is bumped for the duration of the call and
  * the loop only runs while it is <= 1, i.e. while this invocation's own
  * increment is the only freeze in effect.
  */
 static void
 xpt_run_devq(struct cam_devq *devq)
 {
 	int lock;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_devq\n"));
 
 	devq->send_queue.qfrozen_cnt++;
 	while ((devq->send_queue.entries > 0)
 	    && (devq->send_openings > 0)
 	    && (devq->send_queue.qfrozen_cnt <= 1)) {
 		struct	cam_ed *device;
 		union ccb *work_ccb;
 		struct	cam_sim *sim;
 		struct xpt_proto *proto;
 
 		/* Take the device at the head of the send queue. */
 		device = (struct cam_ed *)camq_remove(&devq->send_queue,
 							   CAMQ_HEAD);
 		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
 				("running device %p\n", device));
 
 		work_ccb = cam_ccbq_peek_ccb(&device->ccbq, CAMQ_HEAD);
 		if (work_ccb == NULL) {
 			printf("device on run queue with no ccbs???\n");
 			continue;
 		}
 
 		if ((work_ccb->ccb_h.flags & CAM_HIGH_POWER) != 0) {
 
 			mtx_lock(&xsoftc.xpt_highpower_lock);
 		 	if (xsoftc.num_highpower <= 0) {
 				/*
 				 * We got a high power command, but we
 				 * don't have any available slots.  Freeze
 				 * the device queue until we have a slot
 				 * available.
 				 */
 				xpt_freeze_devq_device(device, 1);
 				STAILQ_INSERT_TAIL(&xsoftc.highpowerq, device,
 						   highpowerq_entry);
 
 				mtx_unlock(&xsoftc.xpt_highpower_lock);
 				continue;
 			} else {
 				/*
 				 * Consume a high power slot while
 				 * this ccb runs.
 				 */
 				xsoftc.num_highpower--;
 			}
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 		}
 		/* Move the CCB from the pending queue to the active set. */
 		cam_ccbq_remove_ccb(&device->ccbq, work_ccb);
 		cam_ccbq_send_ccb(&device->ccbq, work_ccb);
 		devq->send_openings--;
 		devq->send_active++;
 		/* Offer the device to the send queue again before unlocking. */
 		xpt_schedule_devq(devq, device);
 		mtx_unlock(&devq->send_mtx);
 
 		if ((work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) {
 			/*
 			 * The client wants to freeze the queue
 			 * after this CCB is sent.
 			 */
 			xpt_freeze_devq(work_ccb->ccb_h.path, 1);
 		}
 
 		/* In Target mode, the peripheral driver knows best... */
 		if (work_ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			if ((device->inq_flags & SID_CmdQue) != 0
 			 && work_ccb->csio.tag_action != CAM_TAG_ACTION_NONE)
 				work_ccb->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 			else
 				/*
 				 * Clear this in case of a retried CCB that
 				 * failed due to a rejected tag.
 				 */
 				work_ccb->ccb_h.flags &= ~CAM_TAG_ACTION_VALID;
 		}
 
 		KASSERT(device == work_ccb->ccb_h.path->device,
 		    ("device (%p) / path->device (%p) mismatch",
 			device, work_ccb->ccb_h.path->device));
 		/* Let the protocol layer trace the outgoing CCB, if it can. */
 		proto = xpt_proto_find(device->protocol);
 		if (proto && proto->ops->debug_out)
 			proto->ops->debug_out(work_ccb);
 
 		/*
 		 * Device queues can be shared among multiple SIM instances
 		 * that reside on different busses.  Use the SIM from the
 		 * queued device, rather than the one from the calling bus.
 		 */
 		sim = device->sim;
 		/* Only take the SIM lock if this thread does not own it. */
 		lock = (mtx_owned(sim->mtx) == 0);
 		if (lock)
 			CAM_SIM_LOCK(sim);
 		/*
 		 * Record the submission time in the CCB.
 		 * XXX uintptr_t is too small on 32-bit platforms.
 		 */
 		work_ccb->ccb_h.qos.sim_data = sbinuptime();
 		(*(sim->sim_action))(sim, work_ccb);
 		if (lock)
 			CAM_SIM_UNLOCK(sim);
 		mtx_lock(&devq->send_mtx);
 	}
 	devq->send_queue.qfrozen_cnt--;
 }
 
 /*
  * This function merges stuff from the slave ccb into the master ccb, while
  * keeping important fields in the master ccb constant.
  */
 void
 xpt_merge_ccb(union ccb *master_ccb, union ccb *slave_ccb)
 {
 
 	/*
 	 * Pull fields that are valid for peripheral drivers to set
 	 * into the master CCB along with the CCB "payload".
 	 */
 	master_ccb->ccb_h.retry_count = slave_ccb->ccb_h.retry_count;
 	master_ccb->ccb_h.func_code = slave_ccb->ccb_h.func_code;
 	master_ccb->ccb_h.timeout = slave_ccb->ccb_h.timeout;
 	master_ccb->ccb_h.flags = slave_ccb->ccb_h.flags;
 	bcopy(&(&slave_ccb->ccb_h)[1], &(&master_ccb->ccb_h)[1],
 	      sizeof(union ccb) - sizeof(struct ccb_hdr));
 }
 
 /*
  * Initialize a CCB header for use on the given path: record the path and
  * its bus/target/lun identifiers, the queueing priority and the caller's
  * flags, mark the CCB as unqueued and clear xflags.  When the path has a
  * device, the CCB is also stamped with a fresh generation number from
  * that device's CCB queue.
  */
 void
 xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path,
 		    u_int32_t priority, u_int32_t flags)
 {
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n"));
 
 	/* Addressing. */
 	ccb_h->path = path;
 	ccb_h->path_id = path->bus->path_id;
 	ccb_h->target_id = (path->target != NULL) ?
 	    path->target->target_id : CAM_TARGET_WILDCARD;
 	if (path->device == NULL) {
 		/*
 		 * NOTE(review): the target wildcard is used for the LUN here,
 		 * while xpt_path_lun_id() returns CAM_LUN_WILDCARD -- confirm
 		 * the two constants are interchangeable.
 		 */
 		ccb_h->target_lun = CAM_TARGET_WILDCARD;
 	} else {
 		ccb_h->target_lun = path->device->lun_id;
 		ccb_h->pinfo.generation = ++path->device->ccbq.queue.generation;
 	}
 
 	/* Queueing state. */
 	ccb_h->pinfo.priority = priority;
 	ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
 
 	/* Flags. */
 	ccb_h->flags = flags;
 	ccb_h->xflags = 0;
 }
 
 /*
  * Convenience wrapper around xpt_setup_ccb_flags() that initializes the
  * CCB header with no flags set.
  */
 void
 xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority)
 {
 
 	xpt_setup_ccb_flags(ccb_h, path, priority, 0);
 }
 
 /* Path manipulation functions */
 /*
  * Allocate a path and compile it for the given bus/target/lun.
  *
  * On success the new path is stored in *new_path_ptr and CAM_REQ_CMP is
  * returned.  If compiling the path fails, the allocation is freed,
  * *new_path_ptr is set to NULL and the failure status is returned.  If
  * the allocation itself fails, CAM_RESRC_UNAVAIL is returned and
  * *new_path_ptr is left untouched.
  */
 cam_status
 xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph,
 		path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
 {
 	struct cam_path *new_path;
 	cam_status rc;
 
 	new_path = (struct cam_path *)malloc(sizeof(*new_path), M_CAMPATH,
 	    M_NOWAIT);
 	if (new_path == NULL)
 		return (CAM_RESRC_UNAVAIL);
 
 	rc = xpt_compile_path(new_path, perph, path_id, target_id, lun_id);
 	if (rc != CAM_REQ_CMP) {
 		free(new_path, M_CAMPATH);
 		new_path = NULL;
 	}
 
 	*new_path_ptr = new_path;
 	return (rc);
 }
 
 /*
  * Same as xpt_create_path(); this entry point simply forwards all of its
  * arguments.
  */
 cam_status
 xpt_create_path_unlocked(struct cam_path **new_path_ptr,
 			 struct cam_periph *periph, path_id_t path_id,
 			 target_id_t target_id, lun_id_t lun_id)
 {
 	cam_status rc;
 
 	rc = xpt_create_path(new_path_ptr, periph, path_id, target_id, lun_id);
 	return (rc);
 }
 
 /*
  * Fill in a caller-supplied path structure for the given bus/target/lun,
  * creating the target and device entries if they do not exist yet.
  *
  * Returns CAM_REQ_CMP on success, CAM_PATH_INVALID if no bus matches
  * path_id, or CAM_RESRC_UNAVAIL if a target or device could not be
  * allocated.  new_path is only written on success; on failure whatever
  * bus/target/device was obtained along the way is released again.
  */
 cam_status
 xpt_compile_path(struct cam_path *new_path, struct cam_periph *perph,
 		 path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
 {
 	struct	     cam_eb *bus;
 	struct	     cam_et *target;
 	struct	     cam_ed *device;
 	cam_status   status;
 
 	status = CAM_REQ_CMP;	/* Completed without error */
 	target = NULL;		/* Wildcarded */
 	device = NULL;		/* Wildcarded */
 
 	/*
 	 * We will potentially modify the EDT, so block interrupts
 	 * that may attempt to create cam paths.
 	 * NOTE(review): no interrupt blocking is visible below; the bus
 	 * list lock and the per-bus eb_mtx are what is actually taken --
 	 * this comment looks stale, confirm.
 	 */
 	bus = xpt_find_bus(path_id);
 	if (bus == NULL) {
 		status = CAM_PATH_INVALID;
 	} else {
 		/*
 		 * Target lookup/creation runs under both the bus list lock
 		 * and the bus mutex; device lookup/creation only under the
 		 * bus mutex.
 		 */
 		xpt_lock_buses();
 		mtx_lock(&bus->eb_mtx);
 		target = xpt_find_target(bus, target_id);
 		if (target == NULL) {
 			/* Create one */
 			struct cam_et *new_target;
 
 			new_target = xpt_alloc_target(bus, target_id);
 			if (new_target == NULL) {
 				status = CAM_RESRC_UNAVAIL;
 			} else {
 				target = new_target;
 			}
 		}
 		xpt_unlock_buses();
 		if (target != NULL) {
 			device = xpt_find_device(target, lun_id);
 			if (device == NULL) {
 				/* Create one */
 				struct cam_ed *new_device;
 
 				/* Device allocation is transport-specific. */
 				new_device =
 				    (*(bus->xport->ops->alloc_device))(bus,
 								       target,
 								       lun_id);
 				if (new_device == NULL) {
 					status = CAM_RESRC_UNAVAIL;
 				} else {
 					device = new_device;
 				}
 			}
 		}
 		mtx_unlock(&bus->eb_mtx);
 	}
 
 	/*
 	 * Only touch the user's data if we are successful.
 	 */
 	if (status == CAM_REQ_CMP) {
 		new_path->periph = perph;
 		new_path->bus = bus;
 		new_path->target = target;
 		new_path->device = device;
 		CAM_DEBUG(new_path, CAM_DEBUG_TRACE, ("xpt_compile_path\n"));
 	} else {
 		/* Undo in the reverse order of acquisition. */
 		if (device != NULL)
 			xpt_release_device(device);
 		if (target != NULL)
 			xpt_release_target(target);
 		if (bus != NULL)
 			xpt_release_bus(bus);
 	}
 	return (status);
 }
 
 /*
  * Allocate a new path and make it a copy of an existing one via
  * xpt_copy_path().  Returns CAM_REQ_CMP with the copy stored in
  * *new_path_ptr, or CAM_RESRC_UNAVAIL (leaving *new_path_ptr untouched)
  * if the allocation fails.
  */
 cam_status
 xpt_clone_path(struct cam_path **new_path_ptr, struct cam_path *path)
 {
 	struct cam_path *copy;
 
 	copy = (struct cam_path *)malloc(sizeof(*copy), M_CAMPATH, M_NOWAIT);
 	if (copy != NULL) {
 		xpt_copy_path(copy, path);
 		*new_path_ptr = copy;
 		return (CAM_REQ_CMP);
 	}
 	return (CAM_RESRC_UNAVAIL);
 }
 
 /*
  * Copy one path structure into another and call the matching acquire
  * routine on each bus, target and device pointer that is set, so the
  * copy holds its own references.
  */
 void
 xpt_copy_path(struct cam_path *new_path, struct cam_path *path)
 {
 
 	*new_path = *path;
 
 	/* The copy now carries the same pointers as the source. */
 	if (new_path->bus != NULL)
 		xpt_acquire_bus(new_path->bus);
 
 	if (new_path->target != NULL)
 		xpt_acquire_target(new_path->target);
 
 	if (new_path->device != NULL)
 		xpt_acquire_device(new_path->device);
 }
 
 /*
  * Release the device, target and bus held by a path, in that order, and
  * clear the corresponding pointers.  The path structure itself is not
  * freed.
  */
 void
 xpt_release_path(struct cam_path *path)
 {
 	struct cam_ed *dev;
 	struct cam_et *tgt;
 	struct cam_eb *bus;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_path\n"));
 
 	dev = path->device;
 	if (dev != NULL) {
 		xpt_release_device(dev);
 		path->device = NULL;
 	}
 
 	tgt = path->target;
 	if (tgt != NULL) {
 		xpt_release_target(tgt);
 		path->target = NULL;
 	}
 
 	bus = path->bus;
 	if (bus != NULL) {
 		xpt_release_bus(bus);
 		path->bus = NULL;
 	}
 }
 
 /*
  * Release everything a path refers to (see xpt_release_path()) and then
  * free the path structure itself.
  */
 void
 xpt_free_path(struct cam_path *path)
 {
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_free_path\n"));
 
 	/* Drop the references first, then the memory. */
 	xpt_release_path(path);
 	free(path, M_CAMPATH);
 }
 
 /*
  * Report the reference counts of the objects a path points at.  Each
  * output pointer is optional (NULL means "not interested"); a component
  * that is absent from the path is reported as 0.  The bus and periph
  * counts are read with the bus list lock held, the target and device
  * counts without it.
  */
 void
 xpt_path_counts(struct cam_path *path, uint32_t *bus_ref,
     uint32_t *periph_ref, uint32_t *target_ref, uint32_t *device_ref)
 {
 
 	xpt_lock_buses();
 	if (bus_ref != NULL)
 		*bus_ref = path->bus ? path->bus->refcount : 0;
 	if (periph_ref != NULL)
 		*periph_ref = path->periph ? path->periph->refcount : 0;
 	xpt_unlock_buses();
 
 	if (target_ref != NULL)
 		*target_ref = path->target ? path->target->refcount : 0;
 	if (device_ref != NULL)
 		*device_ref = path->device ? path->device->refcount : 0;
 }
 
 /*
  * Compare two paths component by component (bus, target, device).
  *
  * Returns -1 if they differ in a component where neither side is a
  * wildcard, 0 for an exact match, 1 if the match relied on wildcards in
  * path1, and 2 if it relied on wildcards in path2.  A path2 wildcard at
  * a later level overrides an earlier result of 1.
  */
 int
 xpt_path_comp(struct cam_path *path1, struct cam_path *path2)
 {
 	int result;
 
 	result = 0;
 
 	/* Bus level. */
 	if (path1->bus != path2->bus) {
 		if (path1->bus->path_id == CAM_BUS_WILDCARD)
 			result = 1;
 		else if (path2->bus->path_id != CAM_BUS_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	/* Target level. */
 	if (path1->target != path2->target) {
 		if (path1->target->target_id == CAM_TARGET_WILDCARD)
 			result = (result == 0) ? 1 : result;
 		else if (path2->target->target_id != CAM_TARGET_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	/* Device (LUN) level. */
 	if (path1->device != path2->device) {
 		if (path1->device->lun_id == CAM_LUN_WILDCARD)
 			result = (result == 0) ? 1 : result;
 		else if (path2->device->lun_id != CAM_LUN_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	return (result);
 }
 
 /*
  * Compare a path against a device, using the device's own target and
  * that target's bus as the other side of the comparison.
  *
  * Returns -1 on mismatch, 0 for an exact match, 1 if the match relied on
  * wildcards in the path, and 2 if it relied on wildcards on the device
  * side.
  */
 int
 xpt_path_comp_dev(struct cam_path *path, struct cam_ed *dev)
 {
 	int result;
 
 	result = 0;
 
 	/* Bus level. */
 	if (path->bus != dev->target->bus) {
 		if (path->bus->path_id == CAM_BUS_WILDCARD)
 			result = 1;
 		else if (dev->target->bus->path_id != CAM_BUS_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	/* Target level. */
 	if (path->target != dev->target) {
 		if (path->target->target_id == CAM_TARGET_WILDCARD)
 			result = (result == 0) ? 1 : result;
 		else if (dev->target->target_id != CAM_TARGET_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	/* Device (LUN) level. */
 	if (path->device != dev) {
 		if (path->device->lun_id == CAM_LUN_WILDCARD)
 			result = (result == 0) ? 1 : result;
 		else if (dev->lun_id != CAM_LUN_WILDCARD)
 			return (-1);
 		else
 			result = 2;
 	}
 
 	return (result);
 }
 
 void
 xpt_print_path(struct cam_path *path)
 {
 
 	if (path == NULL)
 		printf("(nopath): ");
 	else {
 		if (path->periph != NULL)
 			printf("(%s%d:", path->periph->periph_name,
 			       path->periph->unit_number);
 		else
 			printf("(noperiph:");
 
 		if (path->bus != NULL)
 			printf("%s%d:%d:", path->bus->sim->sim_name,
 			       path->bus->sim->unit_number,
 			       path->bus->sim->bus_id);
 		else
 			printf("nobus:");
 
 		if (path->target != NULL)
 			printf("%d:", path->target->target_id);
 		else
 			printf("X:");
 
 		if (path->device != NULL)
 			printf("%jx): ", (uintmax_t)path->device->lun_id);
 		else
 			printf("X): ");
 	}
 }
 
 void
 xpt_print_device(struct cam_ed *device)
 {
 
 	if (device == NULL)
 		printf("(nopath): ");
 	else {
 		printf("(noperiph:%s%d:%d:%d:%jx): ", device->sim->sim_name,
 		       device->sim->unit_number,
 		       device->sim->bus_id,
 		       device->target->target_id,
 		       (uintmax_t)device->lun_id);
 	}
 }
 
 /*
  * printf-style console output prefixed with the path description
  * produced by xpt_print_path().
  */
 void
 xpt_print(struct cam_path *path, const char *fmt, ...)
 {
 	va_list args;
 
 	/* Prefix first, then the caller's formatted message. */
 	xpt_print_path(path);
 
 	va_start(args, fmt);
 	vprintf(fmt, args);
 	va_end(args);
 }
 
 /*
  * Format the same path prefix that xpt_print_path() prints into a
  * caller-supplied buffer of str_len bytes, using an sbuf built on top of
  * that buffer.  Returns the value of sbuf_len() on the finished sbuf.
  */
 int
 xpt_path_string(struct cam_path *path, char *str, size_t str_len)
 {
 	struct sbuf sb;
 
 	sbuf_new(&sb, str, str_len, 0);
 
 	if (path != NULL) {
 		/* Peripheral name and unit. */
 		if (path->periph == NULL)
 			sbuf_printf(&sb, "(noperiph:");
 		else
 			sbuf_printf(&sb, "(%s%d:", path->periph->periph_name,
 				    path->periph->unit_number);
 
 		/* SIM name, SIM unit and bus number. */
 		if (path->bus == NULL)
 			sbuf_printf(&sb, "nobus:");
 		else
 			sbuf_printf(&sb, "%s%d:%d:", path->bus->sim->sim_name,
 				    path->bus->sim->unit_number,
 				    path->bus->sim->bus_id);
 
 		/* Target id. */
 		if (path->target == NULL)
 			sbuf_printf(&sb, "X:");
 		else
 			sbuf_printf(&sb, "%d:", path->target->target_id);
 
 		/* LUN, in hex. */
 		if (path->device == NULL)
 			sbuf_printf(&sb, "X): ");
 		else
 			sbuf_printf(&sb, "%jx): ",
 			    (uintmax_t)path->device->lun_id);
 	} else {
 		sbuf_printf(&sb, "(nopath): ");
 	}
 
 	sbuf_finish(&sb);
 	return (sbuf_len(&sb));
 }
 
 path_id_t
 xpt_path_path_id(struct cam_path *path)
 {
 	return(path->bus->path_id);
 }
 
 /*
  * Return the target id of a path, or CAM_TARGET_WILDCARD when the path
  * has no target.
  */
 target_id_t
 xpt_path_target_id(struct cam_path *path)
 {
 
 	if (path->target == NULL)
 		return (CAM_TARGET_WILDCARD);
 	return (path->target->target_id);
 }
 
 /*
  * Return the LUN of a path, or CAM_LUN_WILDCARD when the path has no
  * device.
  */
 lun_id_t
 xpt_path_lun_id(struct cam_path *path)
 {
 
 	if (path->device == NULL)
 		return (CAM_LUN_WILDCARD);
 	return (path->device->lun_id);
 }
 
 struct cam_sim *
 xpt_path_sim(struct cam_path *path)
 {
 
 	return (path->bus->sim);
 }
 
 /*
  * Return the peripheral recorded in a path (may be NULL, as
  * xpt_print_path() allows for).
  */
 struct cam_periph*
 xpt_path_periph(struct cam_path *path)
 {
 	struct cam_periph *owner;
 
 	owner = path->periph;
 	return (owner);
 }
 
 /*
  * Release a CAM control block for the caller.  Remit the cost of the structure
  * to the device referenced by the path.  If the this device had no 'credits'
  * and peripheral drivers have registered async callbacks for this notification
  * call them now.
  */
 void
 xpt_release_ccb(union ccb *free_ccb)
 {
 	struct	 cam_ed *device;
 	struct	 cam_periph *periph;
 
 	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_release_ccb\n"));
 	xpt_path_assert(free_ccb->ccb_h.path, MA_OWNED);
 	device = free_ccb->ccb_h.path->device;
 	periph = free_ccb->ccb_h.path->periph;
 
 	xpt_free_ccb(free_ccb);
 	periph->periph_allocated--;
 	cam_ccbq_release_opening(&device->ccbq);
 	xpt_run_allocq(periph, 0);
 }
 
 /* Functions accessed by SIM drivers */
 
 /*
  * Operations of the fallback transport: every entry points at the
  * corresponding *_default handler of this file.
  */
 static struct xpt_xport_ops xport_default_ops = {
 	.alloc_device = xpt_alloc_device_default,
 	.action = xpt_action_default,
 	.async = xpt_dev_async_default,
 };
 /*
  * The fallback transport itself, tagged XPORT_UNKNOWN.  xpt_bus_register()
  * assigns it to every new bus before the first XPT_PATH_INQ is issued.
  */
 static struct xpt_xport xport_default = {
 	.xport = XPORT_UNKNOWN,
 	.name = "unknown",
 	.ops = &xport_default_ops,
 };
 
 /*
  * NOTE(review): presumably adds xport_default to the cam_xpt_xport_set
  * that xpt_bus_register() iterates -- confirm against the macro.
  */
 CAM_XPT_XPORT(xport_default);
 
 /*
  * Register a SIM with the CAM framework.
  *
  * A cam_eb entry is allocated for the new bus, given a path_id (which may
  * be influenced by "hard wiring" information specified by the user) and
  * inserted into the global bus list in ascending path_id order.  The SIM
  * is then sent an XPT_PATH_INQ so that a matching transport can replace
  * the default one, AC_PATH_REGISTERED is broadcast and, unless the SIM
  * reports PIM_NOSCAN, a bus scan is requested.
  *
  * sim    - the SIM being registered; its lock must be held by the caller.
  * parent - not used by this function.
  * bus    - bus number within the SIM, stored in sim->bus_id.
  *
  * Returns CAM_SUCCESS, or CAM_RESRC_UNAVAIL when the bus entry or its
  * wildcard path cannot be set up.
  */
 int32_t
 xpt_bus_register(struct cam_sim *sim, device_t parent, u_int32_t bus)
 {
 	struct cam_eb *new_bus;
 	struct cam_eb *old_bus;
 	struct ccb_pathinq cpi;
 	struct cam_path *path;
 	cam_status status;
 
 	mtx_assert(sim->mtx, MA_OWNED);
 
 	sim->bus_id = bus;
 	new_bus = (struct cam_eb *)malloc(sizeof(*new_bus),
 					  M_CAMXPT, M_NOWAIT|M_ZERO);
 	if (new_bus == NULL) {
 		/* Couldn't satisfy request */
 		return (CAM_RESRC_UNAVAIL);
 	}
 
 	mtx_init(&new_bus->eb_mtx, "CAM bus lock", NULL, MTX_DEF);
 	TAILQ_INIT(&new_bus->et_entries);
 	cam_sim_hold(sim);
 	new_bus->sim = sim;
 	timevalclear(&new_bus->last_reset);
 	new_bus->flags = 0;
 	new_bus->refcount = 1;	/* Held until a bus_deregister event */
 	new_bus->generation = 0;
 
 	/* Pick a path_id and insert the bus keeping the list sorted by it. */
 	xpt_lock_buses();
 	sim->path_id = new_bus->path_id =
 	    xptpathid(sim->sim_name, sim->unit_number, sim->bus_id);
 	old_bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 	while (old_bus != NULL
 	    && old_bus->path_id < new_bus->path_id)
 		old_bus = TAILQ_NEXT(old_bus, links);
 	if (old_bus != NULL)
 		TAILQ_INSERT_BEFORE(old_bus, new_bus, links);
 	else
 		TAILQ_INSERT_TAIL(&xsoftc.xpt_busses, new_bus, links);
 	xsoftc.bus_generation++;
 	xpt_unlock_buses();
 
 	/*
 	 * Set a default transport so that a PATH_INQ can be issued to
 	 * the SIM.  This will then allow for probing and attaching of
 	 * a more appropriate transport.
 	 */
 	new_bus->xport = &xport_default;
 
 	/*
 	 * Start from a known value: the failure branch below frees "path",
 	 * and it must never hand free() an indeterminate pointer should
 	 * xpt_create_path() fail before storing anything through &path.
 	 */
 	path = NULL;
 	status = xpt_create_path(&path, /*periph*/NULL, sim->path_id,
 				  CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 	if (status != CAM_REQ_CMP) {
 		xpt_release_bus(new_bus);
 		free(path, M_CAMXPT);
 		return (CAM_RESRC_UNAVAIL);
 	}
 
 	xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NORMAL);
 	cpi.ccb_h.func_code = XPT_PATH_INQ;
 	xpt_action((union ccb *)&cpi);
 
 	if (cpi.ccb_h.status == CAM_REQ_CMP) {
 		struct xpt_xport **xpt;
 
 		/* Switch to the transport the SIM reported, if we know it. */
 		SET_FOREACH(xpt, cam_xpt_xport_set) {
 			if ((*xpt)->xport == cpi.transport) {
 				new_bus->xport = *xpt;
 				break;
 			}
 		}
 		/*
 		 * The default transport was installed above, so this is
 		 * purely defensive.  The path is fully constructed at this
 		 * point and is disposed of with xpt_free_path(), like on
 		 * every other exit of this function.
 		 */
 		if (new_bus->xport == NULL) {
 			xpt_print_path(path);
 			printf("No transport found for %d\n", cpi.transport);
 			xpt_free_path(path);
 			xpt_release_bus(new_bus);
 			return (CAM_RESRC_UNAVAIL);
 		}
 	}
 
 	/* Notify interested parties */
 	if (sim->path_id != CAM_XPT_PATH_ID) {
 
 		xpt_async(AC_PATH_REGISTERED, path, &cpi);
 		if ((cpi.hba_misc & PIM_NOSCAN) == 0) {
 			union	ccb *scan_ccb;
 
 			/* Initiate bus rescan. */
 			scan_ccb = xpt_alloc_ccb_nowait();
 			if (scan_ccb != NULL) {
 				/* The scan CCB takes over the path. */
 				scan_ccb->ccb_h.path = path;
 				scan_ccb->ccb_h.func_code = XPT_SCAN_BUS;
 				scan_ccb->crcn.flags = 0;
 				xpt_rescan(scan_ccb);
 			} else {
 				xpt_print(path,
 					  "Can't allocate CCB to scan bus\n");
 				xpt_free_path(path);
 			}
 		} else
 			xpt_free_path(path);
 	} else
 		xpt_free_path(path);
 	return (CAM_SUCCESS);
 }
 
 /*
  * Deregister the bus identified by pathid.  AC_LOST_DEVICE and
  * AC_PATH_DEREGISTERED are sent on a wildcard path for the bus, then the
  * reference held since registration is dropped.  Returns CAM_REQ_CMP, or
  * the status of xpt_compile_path() if the wildcard path cannot be built.
  */
 int32_t
 xpt_bus_deregister(path_id_t pathid)
 {
 	struct cam_path wildcard_path;
 	cam_status rc;
 
 	rc = xpt_compile_path(&wildcard_path, NULL, pathid,
 	    CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 	if (rc == CAM_REQ_CMP) {
 		xpt_async(AC_LOST_DEVICE, &wildcard_path, NULL);
 		xpt_async(AC_PATH_DEREGISTERED, &wildcard_path, NULL);
 
 		/* Release the reference count held while registered. */
 		xpt_release_bus(wildcard_path.bus);
 		xpt_release_path(&wildcard_path);
 	}
 	return (rc);
 }
 
 /*
  * Return the lowest path ID that is neither used by a registered bus nor
  * wired to an "scbus" unit through an "at" hint.  The topology lock must
  * be held.
  */
 static path_id_t
 xptnextfreepathid(void)
 {
 	struct cam_eb *cursor;
 	const char *hint;
 	path_id_t candidate;
 
 	mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
 	candidate = 0;
 	cursor = TAILQ_FIRST(&xsoftc.xpt_busses);
 	for (;;) {
 		/* Step past every bus that already occupies the candidate. */
 		while (cursor != NULL && cursor->path_id <= candidate) {
 			if (cursor->path_id == candidate)
 				candidate++;
 			cursor = TAILQ_NEXT(cursor, links);
 		}
 
 		/*
 		 * Done unless this ID is reserved for a bus that may be
 		 * registered in the future; in that case try the next one.
 		 */
 		if (resource_string_value("scbus", candidate, "at",
 		    &hint) != 0)
 			break;
 		++candidate;
 	}
 	return (candidate);
 }
 
 /*
  * Work out the path ID for bus sim_bus of SIM <sim_name><sim_unit>.
  * "xpt0" bus 0 always gets CAM_XPT_PATH_ID.  Otherwise an "scbus" unit
  * wired to this SIM through an "at" hint is used when its "bus" hint
  * matches (a missing "bus" hint matches bus 0); failing that, the next
  * free path ID is handed out.
  */
 static path_id_t
 xptpathid(const char *sim_name, int sim_unit, int sim_bus)
 {
 	char wired_name[32];
 	const char *dname;
 	path_id_t pathid;
 	int cursor, dunit, wired_bus;
 
 	snprintf(wired_name, sizeof(wired_name), "%s%d", sim_name, sim_unit);
 	if (sim_bus == 0 && strcmp(wired_name, "xpt0") == 0)
 		return (CAM_XPT_PATH_ID);
 
 	pathid = CAM_XPT_PATH_ID;
 	cursor = 0;
 	while (resource_find_match(&cursor, &dname, &dunit, "at",
 	    wired_name) == 0) {
 		/* Only wired (non-negative) "scbus" units are of interest. */
 		if (strcmp(dname, "scbus") != 0 || dunit < 0)
 			continue;
 
 		if (resource_int_value("scbus", dunit, "bus",
 		    &wired_bus) == 0) {
 			if (sim_bus != wired_bus)
 				continue;
 			pathid = dunit;
 		} else if (sim_bus == 0) {
 			/* Unspecified matches bus 0 */
 			pathid = dunit;
 		} else {
 			printf("Ambiguous scbus configuration for %s%d "
 			       "bus %d, cannot wire down.  The kernel "
 			       "config entry for scbus%d should "
 			       "specify a controller bus.\n"
 			       "Scbus will be assigned dynamically.\n",
 			       sim_name, sim_unit, sim_bus, dunit);
 		}
 		break;
 	}
 
 	if (pathid != CAM_XPT_PATH_ID)
 		return (pathid);
 	return (xptnextfreepathid());
 }
 
 /*
  * Map an async code to its symbolic name; unrecognized codes yield
  * "AC_UNKNOWN".
  */
 static const char *
 xpt_async_string(u_int32_t async_code)
 {
 	const char *name;
 
 	switch (async_code) {
 	case AC_BUS_RESET:
 		name = "AC_BUS_RESET";
 		break;
 	case AC_UNSOL_RESEL:
 		name = "AC_UNSOL_RESEL";
 		break;
 	case AC_SCSI_AEN:
 		name = "AC_SCSI_AEN";
 		break;
 	case AC_SENT_BDR:
 		name = "AC_SENT_BDR";
 		break;
 	case AC_PATH_REGISTERED:
 		name = "AC_PATH_REGISTERED";
 		break;
 	case AC_PATH_DEREGISTERED:
 		name = "AC_PATH_DEREGISTERED";
 		break;
 	case AC_FOUND_DEVICE:
 		name = "AC_FOUND_DEVICE";
 		break;
 	case AC_LOST_DEVICE:
 		name = "AC_LOST_DEVICE";
 		break;
 	case AC_TRANSFER_NEG:
 		name = "AC_TRANSFER_NEG";
 		break;
 	case AC_INQ_CHANGED:
 		name = "AC_INQ_CHANGED";
 		break;
 	case AC_GETDEV_CHANGED:
 		name = "AC_GETDEV_CHANGED";
 		break;
 	case AC_CONTRACT:
 		name = "AC_CONTRACT";
 		break;
 	case AC_ADVINFO_CHANGED:
 		name = "AC_ADVINFO_CHANGED";
 		break;
 	case AC_UNIT_ATTENTION:
 		name = "AC_UNIT_ATTENTION";
 		break;
 	default:
 		name = "AC_UNKNOWN";
 		break;
 	}
 	return (name);
 }
 
 /*
  * Return the size of the argument that accompanies an async code:
  * a positive byte count for codes whose argument is a structure, -1 for
  * AC_ADVINFO_CHANGED, and 0 for every other code.
  */
 static int
 xpt_async_size(u_int32_t async_code)
 {
 
 	switch (async_code) {
 	case AC_PATH_REGISTERED:
 		return (sizeof(struct ccb_pathinq));
 	case AC_FOUND_DEVICE:
 		return (sizeof(struct ccb_getdev));
 	case AC_TRANSFER_NEG:
 		return (sizeof(struct ccb_trans_settings));
 	case AC_CONTRACT:
 		return (sizeof(struct ac_contract));
 	case AC_UNIT_ATTENTION:
 		return (sizeof(struct ccb_scsiio));
 	case AC_ADVINFO_CHANGED:
 		return (-1);
 	default:
 		/* All remaining codes carry no sized argument. */
 		return (0);
 	}
 }
 
 /*
  * Per-device step of async delivery, used as a device traversal callback.
  * "arg" is the XPT_ASYNC CCB carrying the originating path, the async
  * code and its argument.  A device is skipped when it is not the path's
  * device and neither side has a wildcard LUN.  Otherwise the async
  * handler of the device's bus transport is invoked and the callbacks
  * registered on the device are run.  Always returns 1.
  */
 static int
 xpt_async_process_dev(struct cam_ed *device, void *arg)
 {
 	union ccb *ccb = arg;
 	struct cam_path *path = ccb->ccb_h.path;
 	void *async_arg = ccb->casync.async_arg_ptr;
 	u_int32_t async_code = ccb->casync.async_code;
 	int relock;
 
 	if (path->device != device
 	 && path->device->lun_id != CAM_LUN_WILDCARD
 	 && device->lun_id != CAM_LUN_WILDCARD)
 		return (1);
 
 	/*
 	 * The async callback could free the device.
 	 * If it is a broadcast async, it doesn't hold
 	 * device reference, so take our own reference.
 	 */
 	xpt_acquire_device(device);
 
 	/*
 	 * If async for specific device is to be delivered to
 	 * the wildcard client, take the specific device lock.
 	 * XXX: We may need a way for client to specify it.
 	 */
 	if ((device->lun_id == CAM_LUN_WILDCARD &&
 	     path->device->lun_id != CAM_LUN_WILDCARD) ||
 	    (device->target->target_id == CAM_TARGET_WILDCARD &&
 	     path->target->target_id != CAM_TARGET_WILDCARD) ||
 	    (device->target->bus->path_id == CAM_BUS_WILDCARD &&
 	     path->target->bus->path_id != CAM_BUS_WILDCARD)) {
 		/* Swap the traversed device's lock for the path's lock. */
 		mtx_unlock(&device->device_mtx);
 		xpt_path_lock(path);
 		relock = 1;
 	} else
 		relock = 0;
 
 	(*(device->target->bus->xport->ops->async))(async_code,
 	    device->target->bus, device->target, device, async_arg);
 	xpt_async_bcast(&device->asyncs, async_code, path, async_arg);
 
 	/* Restore the lock state the caller expects. */
 	if (relock) {
 		xpt_path_unlock(path);
 		mtx_lock(&device->device_mtx);
 	}
 	xpt_release_device(device);
 	return (1);
 }
 
 /*
  * Per-target step of async delivery, used as a target traversal callback.
  * Targets that are not the path's target are skipped unless one side is
  * the wildcard target.  For AC_SENT_BDR the target's last reset time is
  * refreshed before the target's devices are visited.
  */
 static int
 xpt_async_process_tgt(struct cam_et *target, void *arg)
 {
 	union ccb *async_ccb = arg;
 	struct cam_et *wanted = async_ccb->ccb_h.path->target;
 
 	if (wanted != target &&
 	    wanted->target_id != CAM_TARGET_WILDCARD &&
 	    target->target_id != CAM_TARGET_WILDCARD)
 		return (1);
 
 	/* Update our notion of when the last reset occurred */
 	if (async_ccb->casync.async_code == AC_SENT_BDR)
 		microtime(&target->last_reset);
 
 	return (xptdevicetraverse(target, NULL, xpt_async_process_dev,
 	    async_ccb));
 }
 
 /*
  * Completion handler installed by xpt_async() on the XPT_ASYNC CCB.
  * Delivers the event to the targets of the path's bus, additionally
  * delivers it through xpt_periph's device when the event is not on
  * xpt_periph's own bus, releases the device or SIM queue, and finally
  * frees the copied argument (if any), the path and the CCB.
  */
 static void
 xpt_async_process(struct cam_periph *periph, union ccb *ccb)
 {
 	struct cam_eb *bus;
 	struct cam_path *path;
 	void *async_arg;
 	u_int32_t async_code;
 
 	path = ccb->ccb_h.path;
 	async_code = ccb->casync.async_code;
 	async_arg = ccb->casync.async_arg_ptr;
 	CAM_DEBUG(path, CAM_DEBUG_TRACE | CAM_DEBUG_INFO,
 	    ("xpt_async(%s)\n", xpt_async_string(async_code)));
 	bus = path->bus;
 
 	if (async_code == AC_BUS_RESET) {
 		/* Update our notion of when the last reset occurred */
 		microtime(&bus->last_reset);
 	}
 
 	xpttargettraverse(bus, NULL, xpt_async_process_tgt, ccb);
 
 	/*
 	 * If this wasn't a fully wildcarded async, tell all
 	 * clients that want all async events.
 	 */
 	if (bus != xpt_periph->path->bus) {
 		xpt_path_lock(xpt_periph->path);
 		xpt_async_process_dev(xpt_periph->path->device, ccb);
 		xpt_path_unlock(xpt_periph->path);
 	}
 
 	/* Same device-vs-SIM choice as the freeze done in xpt_async(). */
 	if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
 		xpt_release_devq(path, 1, TRUE);
 	else
 		xpt_release_simq(path->bus->sim, TRUE);
 	/* Only a positive size means the argument was copied by xpt_async(). */
 	if (ccb->casync.async_arg_size > 0)
 		free(async_arg, M_CAMXPT);
 	xpt_free_path(path);
 	xpt_free_ccb(ccb);
 }
 
 /*
  * Invoke every callback on async_head whose event mask includes
  * async_code.  Entries flagged with event_lock are called with the SIM
  * lock of the path's device held.
  */
 static void
 xpt_async_bcast(struct async_list *async_head,
 		u_int32_t async_code,
 		struct cam_path *path, void *async_arg)
 {
 	struct async_node *node, *following;
 	int need_sim_lock;
 
 	for (node = SLIST_FIRST(async_head); node != NULL; node = following) {
 		/*
 		 * Remember the successor first: the callback is allowed
 		 * to delete its own async callback entry.
 		 */
 		following = SLIST_NEXT(node, links);
 		if ((node->event_enable & async_code) == 0)
 			continue;
 
 		need_sim_lock = node->event_lock;
 		if (need_sim_lock)
 			CAM_SIM_LOCK(path->device->sim);
 		node->callback(node->callback_arg, async_code, path,
 		    async_arg);
 		if (need_sim_lock)
 			CAM_SIM_UNLOCK(path->device->sim);
 	}
 }
 
 /*
  * Queue an asynchronous event for delivery.
  *
  * An XPT_ASYNC CCB is allocated without sleeping, given a clone of "path"
  * and xpt_async_process() as its completion callback, and handed to
  * xpt_done().  The device queue (for a non-wildcard device) or the SIM
  * queue is frozen here; xpt_async_process() releases it again.
  *
  * async_code - the AC_* event being reported.
  * path       - path the event applies to.
  * async_arg  - optional event argument; copied when xpt_async_size()
  *              reports a positive size, passed by pointer when it
  *              reports a negative one.
  *
  * If any allocation fails a message is printed and the event is dropped.
  */
 void
 xpt_async(u_int32_t async_code, struct cam_path *path, void *async_arg)
 {
 	union ccb *ccb;
 	int size;
 
 	ccb = xpt_alloc_ccb_nowait();
 	if (ccb == NULL) {
 		xpt_print(path, "Can't allocate CCB to send %s\n",
 		    xpt_async_string(async_code));
 		return;
 	}
 
 	if (xpt_clone_path(&ccb->ccb_h.path, path) != CAM_REQ_CMP) {
 		xpt_print(path, "Can't allocate path to send %s\n",
 		    xpt_async_string(async_code));
 		xpt_free_ccb(ccb);
 		return;
 	}
 	ccb->ccb_h.path->periph = NULL;
 	ccb->ccb_h.func_code = XPT_ASYNC;
 	ccb->ccb_h.cbfcnp = xpt_async_process;
 	ccb->ccb_h.flags |= CAM_UNLOCKED;
 	ccb->casync.async_code = async_code;
 	ccb->casync.async_arg_size = 0;
 	size = xpt_async_size(async_code);
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_async: func %#x %s aync_code %d %s\n",
 		ccb->ccb_h.func_code,
 		xpt_action_name(ccb->ccb_h.func_code),
 		async_code,
 		xpt_async_string(async_code)));
 	if (size > 0 && async_arg != NULL) {
 		/* Take a private copy of the argument for delivery. */
 		ccb->casync.async_arg_ptr = malloc(size, M_CAMXPT, M_NOWAIT);
 		if (ccb->casync.async_arg_ptr == NULL) {
 			xpt_print(path, "Can't allocate argument to send %s\n",
 			    xpt_async_string(async_code));
 			xpt_free_path(ccb->ccb_h.path);
 			xpt_free_ccb(ccb);
 			return;
 		}
 		memcpy(ccb->casync.async_arg_ptr, async_arg, size);
 		ccb->casync.async_arg_size = size;
 	} else if (size < 0) {
 		/* Negative size: hand the caller's pointer through as is. */
 		ccb->casync.async_arg_ptr = async_arg;
 		ccb->casync.async_arg_size = size;
 	}
 	if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
 		xpt_freeze_devq(path, 1);
 	else
 		xpt_freeze_simq(path->bus->sim, 1);
 	xpt_done(ccb);
 }
 
 /*
  * Async handler of the default transport.  It only reports that it was
  * called, and only for real (non-wildcard) target/LUN pairs.
  */
 static void
 xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus,
 		      struct cam_et *target, struct cam_ed *device,
 		      void *async_arg)
 {
 
 	/* Wildcard targets and wildcard LUNs are silently ignored. */
 	if (target->target_id != CAM_TARGET_WILDCARD &&
 	    device->lun_id != CAM_LUN_WILDCARD)
 		printf("%s called\n", __func__);
 }
 
 /*
  * Add "count" to the device's queue freeze count and take the device off
  * the send queue if it is currently queued.  The devq send lock must be
  * held.  Returns the new freeze count.
  */
 static uint32_t
 xpt_freeze_devq_device(struct cam_ed *dev, u_int count)
 {
 	struct cam_devq	*sendq_owner = dev->sim->devq;
 	uint32_t frozen;
 
 	mtx_assert(&sendq_owner->send_mtx, MA_OWNED);
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
 	    ("xpt_freeze_devq_device(%d) %u->%u\n", count,
 	    dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt + count));
 
 	dev->ccbq.queue.qfrozen_cnt += count;
 	frozen = dev->ccbq.queue.qfrozen_cnt;
 
 	/* Remove frozen device from sendq. */
 	if (device_is_queued(dev))
 		camq_remove(&sendq_owner->send_queue, dev->devq_entry.index);
 	return (frozen);
 }
 
 /*
  * Freeze the queue of the path's device "count" more times, taking the
  * devq send lock around the update.  Returns the new freeze count.
  */
 u_int32_t
 xpt_freeze_devq(struct cam_path *path, u_int count)
 {
 	struct cam_devq	*devq = path->device->sim->devq;
 	uint32_t	 frozen;
 
 	mtx_lock(&devq->send_mtx);
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_freeze_devq(%d)\n", count));
 	frozen = xpt_freeze_devq_device(path->device, count);
 	mtx_unlock(&devq->send_mtx);
 
 	return (frozen);
 }
 
 /*
  * Add "count" to the freeze count of the SIM's send queue under the devq
  * send lock.  Returns the new freeze count.
  */
 u_int32_t
 xpt_freeze_simq(struct cam_sim *sim, u_int count)
 {
 	struct cam_devq	*devq = sim->devq;
 	uint32_t	 frozen;
 
 	mtx_lock(&devq->send_mtx);
 	devq->send_queue.qfrozen_cnt += count;
 	frozen = devq->send_queue.qfrozen_cnt;
 	mtx_unlock(&devq->send_mtx);
 
 	return (frozen);
 }
 
 /*
  * Timeout handler that drops one freeze from the device passed in "arg"
  * and runs the devq when that unfroze the device.  Expects the devq send
  * lock to be held on entry (asserted).
  */
 static void
 xpt_release_devq_timeout(void *arg)
 {
 	struct cam_ed *dev = arg;
 	struct cam_devq *devq;
 
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_timeout\n"));
 	devq = dev->sim->devq;
 	mtx_assert(&devq->send_mtx, MA_OWNED);
 
 	if (xpt_release_devq_device(dev, /*count*/1, /*run_queue*/TRUE) != 0)
 		xpt_run_devq(devq);
 }
 
 /*
  * Drop "count" freezes from the queue of the path's device.  When that
  * leaves the device unfrozen and run_queue is set, the devq is run.
  */
 void
 xpt_release_devq(struct cam_path *path, u_int count, int run_queue)
 {
 	struct cam_ed *dev = path->device;
 	struct cam_devq *devq = dev->sim->devq;
 	int must_run;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_devq(%d, %d)\n",
 	    count, run_queue));
 
 	mtx_lock(&devq->send_mtx);
 	must_run = xpt_release_devq_device(dev, count, run_queue);
 	if (must_run)
 		xpt_run_devq(devq);
 	mtx_unlock(&devq->send_mtx);
 }
 
 /*
  * Subtract "count" freezes from the device queue; a request larger than
  * the current freeze count is clamped to it.  When the count reaches
  * zero the release-on-completion flag is cleared, a pending release
  * timeout is stopped and the device is scheduled on the devq.
  *
  * The devq send lock must be held.  Returns run_queue if the device
  * became unfrozen, 0 otherwise.
  */
 static int
 xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue)
 {
 
 	mtx_assert(&dev->sim->devq->send_mtx, MA_OWNED);
 	CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
 	    ("xpt_release_devq_device(%d, %d) %u->%u\n", count, run_queue,
 	    dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt - count));
 	if (count > dev->ccbq.queue.qfrozen_cnt) {
 #ifdef INVARIANTS
 		printf("xpt_release_devq(): requested %u > present %u\n",
 		    count, dev->ccbq.queue.qfrozen_cnt);
 #endif
 		/* Never let the freeze count drop below zero. */
 		count = dev->ccbq.queue.qfrozen_cnt;
 	}
 	dev->ccbq.queue.qfrozen_cnt -= count;
 	if (dev->ccbq.queue.qfrozen_cnt == 0) {
 		/*
 		 * No longer need to wait for a successful
 		 * command completion.
 		 */
 		dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
 		/*
 		 * Remove any timeouts that might be scheduled
 		 * to release this queue.
 		 */
 		if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
 			callout_stop(&dev->callout);
 			dev->flags &= ~CAM_DEV_REL_TIMEOUT_PENDING;
 		}
 		/*
 		 * Now that we are unfrozen schedule the
 		 * device so any pending transactions are
 		 * run.
 		 */
 		xpt_schedule_devq(dev->sim->devq, dev);
 	} else
 		run_queue = 0;
 	return (run_queue);
 }
 
 /*
  * Drop one freeze from the SIM's send queue.  When the freeze count is
  * zero afterwards, a pending SIM release timeout is stopped and, if
  * run_queue is set, the send queue is run.  Releasing a queue that is not
  * frozen leaves the count alone (and is reported under INVARIANTS).
  */
 void
 xpt_release_simq(struct cam_sim *sim, int run_queue)
 {
 	struct cam_devq	*devq;
 
 	devq = sim->devq;
 	mtx_lock(&devq->send_mtx);
 	if (devq->send_queue.qfrozen_cnt <= 0) {
 #ifdef INVARIANTS
 		printf("xpt_release_simq: requested 1 > present %u\n",
 		    devq->send_queue.qfrozen_cnt);
 #endif
 	} else
 		devq->send_queue.qfrozen_cnt--;
 	if (devq->send_queue.qfrozen_cnt == 0) {
 		/*
 		 * If there is a timeout scheduled to release this
 		 * sim queue, remove it.  The queue frozen count is
 		 * already at 0.
 		 */
 		if ((sim->flags & CAM_SIM_REL_TIMEOUT_PENDING) != 0){
 			callout_stop(&sim->callout);
 			sim->flags &= ~CAM_SIM_REL_TIMEOUT_PENDING;
 		}
 		if (run_queue) {
 			/*
 			 * Now that we are unfrozen run the send queue.
 			 */
 			xpt_run_devq(sim->devq);
 		}
 	}
 	mtx_unlock(&devq->send_mtx);
 }
 
 /*
  * Timeout-style wrapper: release one freeze on the SIM passed in "arg"
  * and run its queue.
  *
  * XXX Appears to be unused.
  */
 static void
 xpt_release_simq_timeout(void *arg)
 {
 
 	xpt_release_simq((struct cam_sim *)arg, /* run_queue */ TRUE);
 }
 
 /*
  * Hand a completed CCB to one of the done queues.  CCBs whose function
  * code lacks XPT_FC_QUEUED are ignored.  Otherwise the time spent in the
  * SIM is recorded, the CCB is appended to the done queue selected by
  * hashing its path, target and LUN, and a waiter on that queue is woken
  * if the queue was marked sleeping and was empty.
  */
 void
 xpt_done(union ccb *done_ccb)
 {
 	struct cam_doneq *queue;
 	int	run, hash;
 
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (done_ccb->ccb_h.func_code == XPT_SCSI_IO &&
+	    done_ccb->csio.bio != NULL)
+		biotrack(done_ccb->csio.bio, __func__);
+#endif
+
 	CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_done: func= %#x %s status %#x\n",
 		done_ccb->ccb_h.func_code,
 		xpt_action_name(done_ccb->ccb_h.func_code),
 		done_ccb->ccb_h.status));
 	if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
 		return;
 
 	/* Store the time the ccb was in the sim */
 	done_ccb->ccb_h.qos.sim_data = sbinuptime() - done_ccb->ccb_h.qos.sim_data;
 	/* The same path/target/LUN triple always maps to the same queue. */
 	hash = (done_ccb->ccb_h.path_id + done_ccb->ccb_h.target_id +
 	    done_ccb->ccb_h.target_lun) % cam_num_doneqs;
 	queue = &cam_doneqs[hash];
 	mtx_lock(&queue->cam_doneq_mtx);
 	/* Decide about the wakeup before the insert makes the queue non-empty. */
 	run = (queue->cam_doneq_sleep && STAILQ_EMPTY(&queue->cam_doneq));
 	STAILQ_INSERT_TAIL(&queue->cam_doneq, &done_ccb->ccb_h, sim_links.stqe);
 	done_ccb->ccb_h.pinfo.index = CAM_DONEQ_INDEX;
 	mtx_unlock(&queue->cam_doneq_mtx);
 	if (run)
 		wakeup(&queue->cam_doneq);
 }
 
 void
 xpt_done_direct(union ccb *done_ccb)
 {
 
 	CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_done_direct: status %#x\n", done_ccb->ccb_h.status));
 	if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
 		return;
 
 	/* Store the time the ccb was in the sim */
 	done_ccb->ccb_h.qos.sim_data = sbinuptime() - done_ccb->ccb_h.qos.sim_data;
 	xpt_done_process(&done_ccb->ccb_h);
 }
 
 /*
  * Allocate a zeroed CCB, sleeping until memory is available.
  */
 union ccb *
 xpt_alloc_ccb()
 {
 
 	return (malloc(sizeof(union ccb), M_CAMCCB, M_WAITOK | M_ZERO));
 }
 
 /*
  * Allocate a zeroed CCB without sleeping; returns NULL when no memory is
  * available.
  */
 union ccb *
 xpt_alloc_ccb_nowait()
 {
 
 	return (malloc(sizeof(union ccb), M_CAMCCB, M_NOWAIT | M_ZERO));
 }
 
 /*
  * Give a CCB back to the M_CAMCCB allocation pool.
  */
 void
 xpt_free_ccb(union ccb *free_ccb)
 {
 
 	free(free_ccb, M_CAMCCB);
 }
 
 
 
 /* Private XPT functions */
 
 /*
  * Get a zeroed CAM control block for the peripheral without sleeping.
  * On success the peripheral's allocation count is bumped and an opening
  * is taken from the CCB queue of the peripheral's device.  Returns NULL
  * if the allocation fails.
  */
 static union ccb *
 xpt_get_ccb_nowait(struct cam_periph *periph)
 {
 	union ccb *ccb;
 
 	ccb = malloc(sizeof(*ccb), M_CAMCCB, M_NOWAIT | M_ZERO);
 	if (ccb != NULL) {
 		periph->periph_allocated++;
 		cam_ccbq_take_opening(&periph->path->device->ccbq);
 	}
 	return (ccb);
 }
 
 /*
  * Get a zeroed CAM control block for the peripheral, sleeping for memory
  * if necessary.  The peripheral lock is dropped around the allocation and
  * re-taken before the accounting is updated.
  */
 static union ccb *
 xpt_get_ccb(struct cam_periph *periph)
 {
 	union ccb *ccb;
 
 	cam_periph_unlock(periph);
 	ccb = malloc(sizeof(*ccb), M_CAMCCB, M_WAITOK | M_ZERO);
 	cam_periph_lock(periph);
 
 	periph->periph_allocated++;
 	cam_ccbq_take_opening(&periph->path->device->ccbq);
 	return (ccb);
 }
 
 /*
  * Return a CCB of the requested priority from the peripheral's CCB list,
  * waiting until one is at the head of the list.  While waiting, a request
  * more urgent than the recorded immediate priority updates that priority
  * and runs the allocation queue; otherwise the caller sleeps on the list.
  * The peripheral lock must be held.
  */
 union ccb *
 cam_periph_getccb(struct cam_periph *periph, u_int32_t priority)
 {
 	struct ccb_hdr *ccb_h;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cam_periph_getccb\n"));
 	cam_periph_assert(periph, MA_OWNED);
 	for (;;) {
 		ccb_h = SLIST_FIRST(&periph->ccb_list);
 		if (ccb_h != NULL && ccb_h->pinfo.priority == priority)
 			break;
 
 		if (priority < periph->immediate_priority) {
 			periph->immediate_priority = priority;
 			xpt_run_allocq(periph, 0);
 		} else {
 			cam_periph_sleep(periph, &periph->ccb_list, PRIBIO,
 			    "cgticb", 0);
 		}
 	}
 	SLIST_REMOVE_HEAD(&periph->ccb_list, periph_links.sle);
 	return ((union ccb *)ccb_h);
 }
 
 /*
  * Take an additional reference on a bus, under the bus list lock.
  */
 static void
 xpt_acquire_bus(struct cam_eb *bus)
 {
 
 	xpt_lock_buses();
 	++bus->refcount;
 	xpt_unlock_buses();
 }
 
 /*
  * Drop a reference on a bus.  The last reference unlinks the bus from the
  * global bus list, releases its SIM and frees the bus.
  */
 static void
 xpt_release_bus(struct cam_eb *bus)
 {
 	int last_ref;
 
 	xpt_lock_buses();
 	KASSERT(bus->refcount >= 1, ("bus->refcount >= 1"));
 	bus->refcount--;
 	last_ref = (bus->refcount <= 0);
 	if (last_ref) {
 		TAILQ_REMOVE(&xsoftc.xpt_busses, bus, links);
 		xsoftc.bus_generation++;
 	}
 	xpt_unlock_buses();
 	if (!last_ref)
 		return;
 
 	/* Nobody can find the bus any more; tear it down. */
 	KASSERT(TAILQ_EMPTY(&bus->et_entries),
 	    ("destroying bus, but target list is not empty"));
 	cam_sim_release(bus->sim);
 	mtx_destroy(&bus->eb_mtx);
 	free(bus, M_CAMXPT);
 }
 
 /*
  * Allocate a target with the given ID on "bus" and insert it into the
  * bus's target list, which is kept sorted by target ID.  The new target
  * starts with one reference and holds a reference on the bus.  Both the
  * topology lock and the bus lock must be held.  Returns NULL if memory is
  * not available.
  */
 static struct cam_et *
 xpt_alloc_target(struct cam_eb *bus, target_id_t target_id)
 {
 	struct cam_et *target, *successor;
 
 	mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	target = malloc(sizeof(*target), M_CAMXPT, M_NOWAIT | M_ZERO);
 	if (target == NULL)
 		return (NULL);
 
 	target->bus = bus;
 	target->target_id = target_id;
 	target->refcount = 1;
 	target->generation = 0;
 	target->luns = NULL;
 	TAILQ_INIT(&target->ed_entries);
 	mtx_init(&target->luns_mtx, "CAM LUNs lock", NULL, MTX_DEF);
 	timevalclear(&target->last_reset);
 
 	/* The target pins its parent bus for as long as it exists. */
 	bus->refcount++;
 
 	/* Find the first target with an ID that is not smaller than ours. */
 	for (successor = TAILQ_FIRST(&bus->et_entries);
 	    successor != NULL && successor->target_id < target_id;
 	    successor = TAILQ_NEXT(successor, links))
 		continue;
 	if (successor == NULL)
 		TAILQ_INSERT_TAIL(&bus->et_entries, target, links);
 	else
 		TAILQ_INSERT_BEFORE(successor, target, links);
 	bus->generation++;
 	return (target);
 }
 
 /*
  * Take an additional reference on a target, under its bus's lock.
  */
 static void
 xpt_acquire_target(struct cam_et *target)
 {
 
 	mtx_lock(&target->bus->eb_mtx);
 	target->refcount++;
 	mtx_unlock(&target->bus->eb_mtx);
 }
 
 static void
 xpt_release_target(struct cam_et *target)
 {
 	struct cam_eb *bus = target->bus;
 
 	mtx_lock(&bus->eb_mtx);
 	if (--target->refcount > 0) {
 		mtx_unlock(&bus->eb_mtx);
 		return;
 	}
 	TAILQ_REMOVE(&bus->et_entries, target, links);
 	bus->generation++;
 	mtx_unlock(&bus->eb_mtx);
 	KASSERT(TAILQ_EMPTY(&target->ed_entries),
 	    ("destroying target, but device list is not empty"));
 	xpt_release_bus(bus);
 	mtx_destroy(&target->luns_mtx);
 	if (target->luns)
 		free(target->luns, M_CAMXPT);
 	free(target, M_CAMXPT);
 }
 
 /*
  * Device allocator of the default transport: a plain xpt_alloc_device()
  * whose result gets minimum and maximum tag counts of 1.  Returns NULL if
  * the allocation fails.
  */
 static struct cam_ed *
 xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target,
 			 lun_id_t lun_id)
 {
 	struct cam_ed *dev;
 
 	dev = xpt_alloc_device(bus, target, lun_id);
 	if (dev != NULL) {
 		dev->mintags = 1;
 		dev->maxtags = 1;
 	}
 	return (dev);
 }
 
 /*
  * Task handler that performs the final destruction of a device: its lock
  * is destroyed and its memory freed.  "context" is the device; "pending"
  * is not used.
  */
 static void
 xpt_destroy_device(void *context, int pending)
 {
 	struct cam_ed	*dev = context;
 
 	/*
 	 * NOTE(review): the lock is taken right before being destroyed,
 	 * presumably to wait out any remaining holder -- confirm.
 	 */
 	mtx_lock(&dev->device_mtx);
 	mtx_destroy(&dev->device_mtx);
 	free(dev, M_CAMDEV);
 }
 
 /*
  * Allocate and initialize a device for "lun_id" on "target" and insert it
  * into the target's device list, which is kept sorted by LUN.  The devq
  * of the bus's SIM is grown by one slot first.  The new device starts
  * unconfigured with one reference and holds a reference on its target.
  *
  * The bus lock must be held.  Returns NULL if the devq cannot be resized
  * or memory is not available.
  */
 struct cam_ed *
 xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id)
 {
 	struct cam_ed	*cur_device, *device;
 	struct cam_devq	*devq;
 	cam_status status;
 
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	/* Make space for us in the device queue on our bus */
 	devq = bus->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	status = cam_devq_resize(devq, devq->send_queue.array_size + 1);
 	mtx_unlock(&devq->send_mtx);
 	if (status != CAM_REQ_CMP)
 		return (NULL);
 
 	/*
 	 * NOTE(review): the failure returns below do not shrink the devq
 	 * back by the slot reserved above -- confirm this is intended.
 	 */
 	device = (struct cam_ed *)malloc(sizeof(*device),
 					 M_CAMDEV, M_NOWAIT|M_ZERO);
 	if (device == NULL)
 		return (NULL);
 
 	cam_init_pinfo(&device->devq_entry);
 	device->target = target;
 	device->lun_id = lun_id;
 	device->sim = bus->sim;
 	if (cam_ccbq_init(&device->ccbq,
 			  bus->sim->max_dev_openings) != 0) {
 		free(device, M_CAMDEV);
 		return (NULL);
 	}
 	SLIST_INIT(&device->asyncs);
 	SLIST_INIT(&device->periphs);
 	device->generation = 0;
 	device->flags = CAM_DEV_UNCONFIGURED;
 	device->tag_delay_count = 0;
 	device->tag_saved_openings = 0;
 	device->refcount = 1;
 	mtx_init(&device->device_mtx, "CAM device lock", NULL, MTX_DEF);
 	/* The release callout is tied to the devq send lock. */
 	callout_init_mtx(&device->callout, &devq->send_mtx, 0);
 	TASK_INIT(&device->device_destroy_task, 0, xpt_destroy_device, device);
 	/*
 	 * Hold a reference to our parent target so it
 	 * will not go away before we do.
 	 */
 	target->refcount++;
 
 	/* Insert before the first device whose LUN is not smaller than ours. */
 	cur_device = TAILQ_FIRST(&target->ed_entries);
 	while (cur_device != NULL && cur_device->lun_id < lun_id)
 		cur_device = TAILQ_NEXT(cur_device, links);
 	if (cur_device != NULL)
 		TAILQ_INSERT_BEFORE(cur_device, device, links);
 	else
 		TAILQ_INSERT_TAIL(&target->ed_entries, device, links);
 	target->generation++;
 	return (device);
 }
 
 /*
  * Take an additional reference on a device, under its bus's lock.
  */
 void
 xpt_acquire_device(struct cam_ed *device)
 {
 
 	mtx_lock(&device->target->bus->eb_mtx);
 	device->refcount++;
 	mtx_unlock(&device->target->bus->eb_mtx);
 }
 
 /*
  * Drop a reference on a device.  The last reference unlinks the device
  * from its target, gives its devq slot back, stops a pending release
  * timeout, drops the device's reference on the target and frees the
  * device's auxiliary buffers.  The device structure itself is destroyed
  * later by device_destroy_task on the XPT task queue.
  */
 void
 xpt_release_device(struct cam_ed *device)
 {
 	struct cam_eb *bus = device->target->bus;
 	struct cam_devq *devq;
 
 	mtx_lock(&bus->eb_mtx);
 	if (--device->refcount > 0) {
 		mtx_unlock(&bus->eb_mtx);
 		return;
 	}
 
 	TAILQ_REMOVE(&device->target->ed_entries, device,links);
 	device->target->generation++;
 	mtx_unlock(&bus->eb_mtx);
 
 	/* Release our slot in the devq */
 	devq = bus->sim->devq;
 	mtx_lock(&devq->send_mtx);
 	cam_devq_resize(devq, devq->send_queue.array_size - 1);
 	mtx_unlock(&devq->send_mtx);
 
 	KASSERT(SLIST_EMPTY(&device->periphs),
 	    ("destroying device, but periphs list is not empty"));
 	KASSERT(device->devq_entry.index == CAM_UNQUEUED_INDEX,
 	    ("destroying device while still queued for ccbs"));
 
 	if ((device->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0)
 		callout_stop(&device->callout);
 
 	xpt_release_target(device->target);
 
 	cam_ccbq_fini(&device->ccbq);
 	/*
 	 * Free allocated memory.  free(9) does nothing if the
 	 * supplied pointer is NULL, so it is safe to call without
 	 * checking.
 	 */
 	free(device->supported_vpds, M_CAMXPT);
 	free(device->device_id, M_CAMXPT);
 	free(device->ext_inq, M_CAMXPT);
 	free(device->physpath, M_CAMXPT);
 	free(device->rcap_buf, M_CAMXPT);
 	free(device->serial_num, M_CAMXPT);
 	/* Final teardown (lock and structure) runs from xpt_destroy_device(). */
 	taskqueue_enqueue(xsoftc.xpt_taskq, &device->device_destroy_task);
 }
 
 /*
  * Resize the CCB queue of the path's device to "newopenings" and return
  * the result of cam_ccbq_resize().  When tagged queueing is pending
  * (CAM_DEV_TAG_AFTER_COUNT) or enabled (SID_CmdQue), the new value is also
  * remembered in tag_saved_openings.
  */
 u_int32_t
 xpt_dev_ccbq_resize(struct cam_path *path, int newopenings)
 {
 	struct	cam_ed *dev = path->device;
 	int	rc;
 
 	mtx_lock(&dev->sim->devq->send_mtx);
 	rc = cam_ccbq_resize(&dev->ccbq, newopenings);
 	mtx_unlock(&dev->sim->devq->send_mtx);
 
 	if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) == 0 &&
 	    (dev->inq_flags & SID_CmdQue) == 0)
 		return (rc);
 	dev->tag_saved_openings = newopenings;
 	return (rc);
 }
 
 /*
  * Look up a bus by path ID.  A bus that is found is returned with an
  * extra reference; NULL is returned when there is no such bus.
  */
 static struct cam_eb *
 xpt_find_bus(path_id_t path_id)
 {
 	struct cam_eb *bus;
 
 	xpt_lock_buses();
 	bus = TAILQ_FIRST(&xsoftc.xpt_busses);
 	while (bus != NULL && bus->path_id != path_id)
 		bus = TAILQ_NEXT(bus, links);
 	if (bus != NULL)
 		bus->refcount++;
 	xpt_unlock_buses();
 	return (bus);
 }
 
 /*
  * Look up a target on "bus" by target ID.  A target that is found is
  * returned with an extra reference; NULL is returned otherwise.  The bus
  * lock must be held.
  */
 static struct cam_et *
 xpt_find_target(struct cam_eb *bus, target_id_t	target_id)
 {
 	struct cam_et *target;
 
 	mtx_assert(&bus->eb_mtx, MA_OWNED);
 	target = TAILQ_FIRST(&bus->et_entries);
 	while (target != NULL && target->target_id != target_id)
 		target = TAILQ_NEXT(target, links);
 	if (target != NULL)
 		target->refcount++;
 	return (target);
 }
 
 /*
  * Look up a device on "target" by LUN.  A device that is found is
  * returned with an extra reference; NULL is returned otherwise.  The lock
  * of the target's bus must be held.
  */
 static struct cam_ed *
 xpt_find_device(struct cam_et *target, lun_id_t lun_id)
 {
 	struct cam_ed *device;
 
 	mtx_assert(&target->bus->eb_mtx, MA_OWNED);
 	device = TAILQ_FIRST(&target->ed_entries);
 	while (device != NULL && device->lun_id != lun_id)
 		device = TAILQ_NEXT(device, links);
 	if (device != NULL)
 		device->refcount++;
 	return (device);
 }
 
 /*
  * Turn tagged queueing on for the path's device.  The device queue is
  * frozen, SID_CmdQue is set, the CCB queue is resized to the saved opening
  * count (or, if none was saved, to the smaller of the device's maxtags and
  * the SIM's tagged opening limit), AC_GETDEV_CHANGED is announced and an
  * XPT_REL_SIMQ with RELSIM_RELEASE_AFTER_QEMPTY is issued.
  */
 void
 xpt_start_tags(struct cam_path *path)
 {
 	struct ccb_relsim crs;
 	struct cam_ed *device;
 	struct cam_sim *sim;
 	int    newopenings;
 
 	device = path->device;
 	sim = path->bus->sim;
 	device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
 	xpt_freeze_devq(path, /*count*/1);
 	device->inq_flags |= SID_CmdQue;
 	if (device->tag_saved_openings != 0)
 		newopenings = device->tag_saved_openings;
 	else
 		newopenings = min(device->maxtags,
 				  sim->max_tagged_dev_openings);
 	xpt_dev_ccbq_resize(path, newopenings);
 	xpt_async(AC_GETDEV_CHANGED, path, NULL);
 	/* Ask for the freeze taken above to be released once the queue drains. */
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
 	crs.openings
 	    = crs.release_timeout
 	    = crs.qfrozen_cnt
 	    = 0;
 	xpt_action((union ccb *)&crs);
 }
 
 /*
  * Turn tagged queueing off for the path's device.  The device queue is
  * frozen, SID_CmdQue is cleared, the CCB queue is resized to the SIM's
  * untagged opening limit, AC_GETDEV_CHANGED is announced and an
  * XPT_REL_SIMQ with RELSIM_RELEASE_AFTER_QEMPTY is issued.
  */
 void
 xpt_stop_tags(struct cam_path *path)
 {
 	struct ccb_relsim crs;
 	struct cam_ed *device;
 	struct cam_sim *sim;
 
 	device = path->device;
 	sim = path->bus->sim;
 	device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
 	device->tag_delay_count = 0;
 	xpt_freeze_devq(path, /*count*/1);
 	device->inq_flags &= ~SID_CmdQue;
 	xpt_dev_ccbq_resize(path, sim->max_dev_openings);
 	xpt_async(AC_GETDEV_CHANGED, path, NULL);
 	/* Ask for the freeze taken above to be released once the queue drains. */
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
 	crs.openings
 	    = crs.release_timeout
 	    = crs.qfrozen_cnt
 	    = 0;
 	xpt_action((union ccb *)&crs);
 }
 
 /*
  * Callout handler armed by xpt_config(): drops the boot hold that
  * xpt_config() took.  "arg" is not used.
  */
 static void
 xpt_boot_delay(void *arg)
 {
 
 	(void)arg;
 	xpt_release_boot();
 }
 
 /*
  * Configuration entry point of the XPT layer.  Starts the CAM task queue
  * thread, sets up the debugging path when debug flags are configured,
  * runs the first pass of peripheral driver initialization, holds the boot
  * until the boot delay callout fires, and starts the rescan thread.
  * "arg" is not used.
  */
 static void
 xpt_config(void *arg)
 {
 	/*
 	 * Now that interrupts are enabled, go find our devices
 	 */
 	if (taskqueue_start_threads(&xsoftc.xpt_taskq, 1, PRIBIO, "CAM taskq"))
 		printf("xpt_config: failed to create taskqueue thread.\n");
 
 	/* Setup debugging path */
 	if (cam_dflags != CAM_DEBUG_NONE) {
 		if (xpt_create_path(&cam_dpath, NULL,
 				    CAM_DEBUG_BUS, CAM_DEBUG_TARGET,
 				    CAM_DEBUG_LUN) != CAM_REQ_CMP) {
 			printf("xpt_config: xpt_create_path() failed for debug"
 			       " target %d:%d:%d, debugging disabled\n",
 			       CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN);
 			cam_dflags = CAM_DEBUG_NONE;
 		}
 	} else
 		cam_dpath = NULL;
 
 	periphdriver_init(1);
 	/* This hold is dropped by xpt_boot_delay() when the callout fires. */
 	xpt_hold_boot();
 	callout_init(&xsoftc.boot_callout, 1);
 	callout_reset_sbt(&xsoftc.boot_callout, SBT_1MS * xsoftc.boot_delay, 0,
 	    xpt_boot_delay, NULL, 0);
 	/* Fire up rescan thread. */
 	if (kproc_kthread_add(xpt_scanner_thread, NULL, &cam_proc, NULL, 0, 0,
 	    "cam", "scanner")) {
 		printf("xpt_config: failed to create rescan thread.\n");
 	}
 }
 
 /*
  * Register one more party that must call xpt_release_boot() before the
  * boot-time configuration is considered finished.
  */
 void
 xpt_hold_boot(void)
 {
 
 	xpt_lock_buses();
 	++xsoftc.buses_to_config;
 	xpt_unlock_buses();
 }
 
 /*
  * Drop one boot hold.  When the last hold goes away and configuration has
  * not been finished yet, it is marked done and xpt_finishconfig_task() is
  * queued on the generic thread task queue.
  */
 void
 xpt_release_boot(void)
 {
 	struct	xpt_task *task;
 
 	xpt_lock_buses();
 	xsoftc.buses_to_config--;
 	if (xsoftc.buses_to_config != 0 || xsoftc.buses_config_done != 0) {
 		xpt_unlock_buses();
 		return;
 	}
 	xsoftc.buses_config_done = 1;
 	xpt_unlock_buses();
 
 	/* Call manually because we don't have any busses */
 	task = malloc(sizeof(*task), M_CAMXPT, M_NOWAIT);
 	if (task == NULL)
 		return;
 	TASK_INIT(&task->task, 0, xpt_finishconfig_task, task);
 	taskqueue_enqueue(taskqueue_thread, &task->task);
 }
 
 /*
  * Per-device callback: if the device has exactly one peripheral attached
  * and that peripheral's name starts with "pass", announce it.  This
  * ensures that the user sees some sort of announcement for every
  * peripheral in their system.  Always returns 1; the second argument is
  * not used.
  */
 static int
 xptpassannouncefunc(struct cam_ed *device, void *arg)
 {
 	struct cam_periph *first, *cur;
 	int nperiphs;
 
 	/* Count the peripherals attached to this device. */
 	nperiphs = 0;
 	cur = SLIST_FIRST(&device->periphs);
 	while (cur != NULL) {
 		nperiphs++;
 		cur = SLIST_NEXT(cur, periph_links);
 	}
 
 	first = SLIST_FIRST(&device->periphs);
 	if (nperiphs == 1 && strncmp(first->periph_name, "pass", 4) == 0)
 		xpt_announce_periph(first, NULL);
 
 	return (1);
 }
 
 /*
  * Taskqueue handler that finishes boot-time configuration.
  *
  * context is the heap-allocated task structure this handler was queued
  * with (see xpt_release_boot()); it is freed here.  pending is not used.
  */
 static void
 xpt_finishconfig_task(void *context, int pending)
 {
 
 	periphdriver_init(2);
 	/*
 	 * Check for devices with no "standard" peripheral driver
 	 * attached.  For any devices like that, announce the
 	 * passthrough driver so the user will see something.
 	 */
 	if (!bootverbose)
 		xpt_for_all_devices(xptpassannouncefunc, NULL);
 
 	/* Release our hook so that the boot can continue. */
 	config_intrhook_disestablish(xsoftc.xpt_config_hook);
 	free(xsoftc.xpt_config_hook, M_CAMXPT);
 	xsoftc.xpt_config_hook = NULL;
 
 	/* The task structure was allocated by whoever queued us. */
 	free(context, M_CAMXPT);
 }
 
 /*
  * Register cbfunc/cbarg as an async callback for the events in `event'.
  *
  * If path is NULL, a temporary wildcard path on the xpt bus is created,
  * locked for the duration of the request, and freed again before
  * returning.  Otherwise the caller's path is used as-is and no locking
  * is done here.
  *
  * On success, a registration that includes AC_FOUND_DEVICE is replayed
  * against all existing devices, and one that includes AC_PATH_REGISTERED
  * against all existing busses.
  *
  * Returns the status of path creation if that fails, otherwise the status
  * of the XPT_SASYNC_CB request.
  */
 cam_status
 xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg,
 		   struct cam_path *path)
 {
 	struct ccb_setasync csa;
 	cam_status status;
 	int own_path;
 
 	own_path = 0;
 	if (path == NULL) {
 		status = xpt_create_path(&path, /*periph*/NULL, CAM_XPT_PATH_ID,
 					 CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
 		if (status != CAM_REQ_CMP)
 			return (status);
 		xpt_path_lock(path);
 		own_path = 1;
 	}
 
 	xpt_setup_ccb(&csa.ccb_h, path, CAM_PRIORITY_NORMAL);
 	csa.ccb_h.func_code = XPT_SASYNC_CB;
 	csa.event_enable = event;
 	csa.callback = cbfunc;
 	csa.callback_arg = cbarg;
 	xpt_action((union ccb *)&csa);
 	status = csa.ccb_h.status;
 
 	CAM_DEBUG(csa.ccb_h.path, CAM_DEBUG_TRACE,
 	    ("xpt_register_async: func %p\n", cbfunc));
 
 	/* Tear down the temporary path, if we made one. */
 	if (own_path) {
 		xpt_path_unlock(path);
 		xpt_free_path(path);
 	}
 
 	if (status != CAM_REQ_CMP)
 		return (status);
 
 	/* Bring the new listener up to date with what already exists. */
 	if ((csa.event_enable & AC_FOUND_DEVICE) != 0)
 		xpt_for_all_devices(xptsetasyncfunc, &csa);
 	if ((csa.event_enable & AC_PATH_REGISTERED) != 0)
 		xpt_for_all_busses(xptsetasyncbusfunc, &csa);
 
 	return (status);
 }
 
 /*
  * Action routine for the xpt "controller".
  *
  * Only XPT_PATH_INQ is serviced: the path inquiry CCB is filled with
  * fixed/unspecified values plus the SIM's name, unit number and bus id,
  * and completed with CAM_REQ_CMP.  Every other function code is completed
  * with CAM_REQ_INVALID.  In both cases the CCB is handed to xpt_done().
  */
 static void
 xptaction(struct cam_sim *sim, union ccb *work_ccb)
 {
 	CAM_DEBUG(work_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xptaction\n"));
 
 	switch (work_ccb->ccb_h.func_code) {
 	/* Common cases first */
 	case XPT_PATH_INQ:		/* Path routing inquiry */
 	{
 		struct ccb_pathinq *cpi;
 
 		cpi = &work_ccb->cpi;
 		cpi->version_num = 1; /* XXX??? */
 		cpi->hba_inquiry = 0;
 		cpi->target_sprt = 0;
 		cpi->hba_misc = 0;
 		cpi->hba_eng_cnt = 0;
 		cpi->max_target = 0;
 		cpi->max_lun = 0;
 		cpi->initiator_id = 0;
 		strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strncpy(cpi->hba_vid, "", HBA_IDLEN);
 		strncpy(cpi->dev_name, sim->sim_name, DEV_IDLEN);
 		/*
 		 * strncpy() does not NUL-terminate when the source is
 		 * DEV_IDLEN bytes or longer.  sim_name comes from the SIM,
 		 * so make sure consumers always get a terminated string.
 		 */
 		cpi->dev_name[DEV_IDLEN - 1] = '\0';
 		cpi->unit_number = sim->unit_number;
 		cpi->bus_id = sim->bus_id;
 		cpi->base_transfer_speed = 0;
 		cpi->protocol = PROTO_UNSPECIFIED;
 		cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
 		cpi->transport = XPORT_UNSPECIFIED;
 		cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
 		cpi->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(work_ccb);
 		break;
 	}
 	default:
 		work_ccb->ccb_h.status = CAM_REQ_INVALID;
 		xpt_done(work_ccb);
 		break;
 	}
 }
 
 /*
  * The xpt as a "controller" has no interrupt sources, so polling
  * is a no-op.  The sim argument is not used.
  */
 static void
 xptpoll(struct cam_sim *sim)
 {
 }
 
 /* Acquire the xpt topology mutex (xsoftc.xpt_topo_lock). */
 void
 xpt_lock_buses(void)
 {
 	mtx_lock(&xsoftc.xpt_topo_lock);
 }
 
 /* Release the xpt topology mutex (xsoftc.xpt_topo_lock). */
 void
 xpt_unlock_buses(void)
 {
 	mtx_unlock(&xsoftc.xpt_topo_lock);
 }
 
 /*
  * Return the mutex associated with a path, which is the mutex embedded
  * in the path's device (device_mtx).
  */
 struct mtx *
 xpt_path_mtx(struct cam_path *path)
 {
 
 	return (&path->device->device_mtx);
 }
 
 /*
  * Finish processing of one completed CCB and hand it to its owner.
  *
  * In order, this:
  *  - records the completion against the associated bio when buffer
  *    tracking is compiled in (SCSI I/O CCBs only);
  *  - for CAM_HIGH_POWER CCBs, returns the high-power slot and releases the
  *    next device waiting on the high-power queue, if any;
  *  - honors CAM_RELEASE_SIMQ and CAM_DEV_QFRZDIS/CAM_DEV_QFRZN by
  *    releasing the SIM queue / device queue and clearing the status bits;
  *  - for CCBs without XPT_FC_USER_CCB set in func_code, updates the devq
  *    and per-device accounting under devq->send_mtx, performs any pending
  *    "release on queue empty" / "release on complete" device-queue
  *    releases, and re-runs the device queue;
  *  - calls the CCB's completion callback (cbfcnp).
  *
  * The callback is invoked with the path mutex held unless the CCB has
  * CAM_UNLOCKED set, in which case it is invoked with the mutex dropped.
  */
 static void
 xpt_done_process(struct ccb_hdr *ccb_h)
 {
 	struct cam_sim *sim;
 	struct cam_devq *devq;
 	struct mtx *mtx = NULL;
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	struct ccb_scsiio *csio;
+
+	if (ccb_h->func_code == XPT_SCSI_IO) {
+		csio = &((union ccb *)ccb_h)->csio;
+		if (csio->bio != NULL)
+			biotrack(csio->bio, __func__);
+	}
+#endif
 
 	if (ccb_h->flags & CAM_HIGH_POWER) {
 		struct highpowerlist	*hphead;
 		struct cam_ed		*device;
 
 		mtx_lock(&xsoftc.xpt_highpower_lock);
 		hphead = &xsoftc.highpowerq;
 
 		device = STAILQ_FIRST(hphead);
 
 		/*
 		 * Increment the count since this command is done.
 		 */
 		xsoftc.num_highpower++;
 
 		/*
 		 * Any high powered commands queued up?
 		 */
 		if (device != NULL) {
 
 			STAILQ_REMOVE_HEAD(hphead, highpowerq_entry);
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 
 			mtx_lock(&device->sim->devq->send_mtx);
 			xpt_release_devq_device(device,
 					 /*count*/1, /*runqueue*/TRUE);
 			mtx_unlock(&device->sim->devq->send_mtx);
 		} else
 			mtx_unlock(&xsoftc.xpt_highpower_lock);
 	}
 
 	sim = ccb_h->path->bus->sim;
 
 	if (ccb_h->status & CAM_RELEASE_SIMQ) {
 		xpt_release_simq(sim, /*run_queue*/FALSE);
 		ccb_h->status &= ~CAM_RELEASE_SIMQ;
 	}
 
 	if ((ccb_h->flags & CAM_DEV_QFRZDIS)
 	 && (ccb_h->status & CAM_DEV_QFRZN)) {
 		xpt_release_devq(ccb_h->path, /*count*/1, /*run_queue*/TRUE);
 		ccb_h->status &= ~CAM_DEV_QFRZN;
 	}
 
 	devq = sim->devq;
 	if ((ccb_h->func_code & XPT_FC_USER_CCB) == 0) {
 		struct cam_ed *dev = ccb_h->path->device;
 
 		mtx_lock(&devq->send_mtx);
 		devq->send_active--;
 		devq->send_openings++;
 		cam_ccbq_ccb_done(&dev->ccbq, (union ccb *)ccb_h);
 
 		if (((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
 		  && (dev->ccbq.dev_active == 0))) {
 			dev->flags &= ~CAM_DEV_REL_ON_QUEUE_EMPTY;
 			xpt_release_devq_device(dev, /*count*/1,
 					 /*run_queue*/FALSE);
 		}
 
 		if (((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0
 		  && (ccb_h->status&CAM_STATUS_MASK) != CAM_REQUEUE_REQ)) {
 			dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
 			xpt_release_devq_device(dev, /*count*/1,
 					 /*run_queue*/FALSE);
 		}
 
 		if (!device_is_queued(dev))
 			(void)xpt_schedule_devq(devq, dev);
 		xpt_run_devq(devq);
 		mtx_unlock(&devq->send_mtx);
 
 		/*
 		 * The flag is tested once without the path mutex and again
 		 * after taking it.  If it was set on the first test, the
 		 * mutex stays held past this block and is handled below.
 		 */
 		if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0) {
 			mtx = xpt_path_mtx(ccb_h->path);
 			mtx_lock(mtx);
 
 			if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0
 			 && (--dev->tag_delay_count == 0))
 				xpt_start_tags(ccb_h->path);
 		}
 	}
 
 	/*
 	 * Get the path mutex into the state the callback expects: held,
 	 * unless the CCB asked for an unlocked completion (CAM_UNLOCKED).
 	 */
 	if ((ccb_h->flags & CAM_UNLOCKED) == 0) {
 		if (mtx == NULL) {
 			mtx = xpt_path_mtx(ccb_h->path);
 			mtx_lock(mtx);
 		}
 	} else {
 		if (mtx != NULL) {
 			mtx_unlock(mtx);
 			mtx = NULL;
 		}
 	}
 
 	/* Call the peripheral driver's callback */
 	ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
 	(*ccb_h->cbfcnp)(ccb_h->path->periph, (union ccb *)ccb_h);
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 }
 
 void
 xpt_done_td(void *arg)
 {
 	struct cam_doneq *queue = arg;
 	struct ccb_hdr *ccb_h;
 	STAILQ_HEAD(, ccb_hdr)	doneq;
 
 	STAILQ_INIT(&doneq);
 	mtx_lock(&queue->cam_doneq_mtx);
 	while (1) {
 		while (STAILQ_EMPTY(&queue->cam_doneq)) {
 			queue->cam_doneq_sleep = 1;
 			msleep(&queue->cam_doneq, &queue->cam_doneq_mtx,
 			    PRIBIO, "-", 0);
 			queue->cam_doneq_sleep = 0;
 		}
 		STAILQ_CONCAT(&doneq, &queue->cam_doneq);
 		mtx_unlock(&queue->cam_doneq_mtx);
 
 		THREAD_NO_SLEEPING();
 		while ((ccb_h = STAILQ_FIRST(&doneq)) != NULL) {
 			STAILQ_REMOVE_HEAD(&doneq, sim_links.stqe);
 			xpt_done_process(ccb_h);
 		}
 		THREAD_SLEEPING_OK();
 
 		mtx_lock(&queue->cam_doneq_mtx);
 	}
 }
 
 /*
  * Drain every global completion queue in the calling context.
  *
  * Each entry is removed from its queue under the queue mutex and then
  * passed to xpt_done_process() with the mutex dropped.
  */
 static void
 camisr_runqueue(void)
 {
 	struct cam_doneq *dq;
 	struct ccb_hdr *hdr;
 	int qi;
 
 	/* Process global queues. */
 	for (qi = 0; qi < cam_num_doneqs; qi++) {
 		dq = &cam_doneqs[qi];
 		mtx_lock(&dq->cam_doneq_mtx);
 		for (;;) {
 			hdr = STAILQ_FIRST(&dq->cam_doneq);
 			if (hdr == NULL)
 				break;
 			STAILQ_REMOVE_HEAD(&dq->cam_doneq, sim_links.stqe);
 			mtx_unlock(&dq->cam_doneq_mtx);
 			xpt_done_process(hdr);
 			mtx_lock(&dq->cam_doneq_mtx);
 		}
 		mtx_unlock(&dq->cam_doneq_mtx);
 	}
 }
 
 /* One value-to-name pair; used for the XPT function code name table. */
 struct kv 
 {
 	uint32_t v;		/* numeric value */
 	const char *name;	/* printable name; NULL marks the table end */
 };
 
 /*
  * XPT function code -> name table, searched linearly by
  * xpt_action_name().  The entry with a NULL name terminates the table.
  */
 static struct kv map[] = {
 	{ XPT_NOOP, "XPT_NOOP" },
 	{ XPT_SCSI_IO, "XPT_SCSI_IO" },
 	{ XPT_GDEV_TYPE, "XPT_GDEV_TYPE" },
 	{ XPT_GDEVLIST, "XPT_GDEVLIST" },
 	{ XPT_PATH_INQ, "XPT_PATH_INQ" },
 	{ XPT_REL_SIMQ, "XPT_REL_SIMQ" },
 	{ XPT_SASYNC_CB, "XPT_SASYNC_CB" },
 	{ XPT_SDEV_TYPE, "XPT_SDEV_TYPE" },
 	{ XPT_SCAN_BUS, "XPT_SCAN_BUS" },
 	{ XPT_DEV_MATCH, "XPT_DEV_MATCH" },
 	{ XPT_DEBUG, "XPT_DEBUG" },
 	{ XPT_PATH_STATS, "XPT_PATH_STATS" },
 	{ XPT_GDEV_STATS, "XPT_GDEV_STATS" },
 	{ XPT_DEV_ADVINFO, "XPT_DEV_ADVINFO" },
 	{ XPT_ASYNC, "XPT_ASYNC" },
 	{ XPT_ABORT, "XPT_ABORT" },
 	{ XPT_RESET_BUS, "XPT_RESET_BUS" },
 	{ XPT_RESET_DEV, "XPT_RESET_DEV" },
 	{ XPT_TERM_IO, "XPT_TERM_IO" },
 	{ XPT_SCAN_LUN, "XPT_SCAN_LUN" },
 	{ XPT_GET_TRAN_SETTINGS, "XPT_GET_TRAN_SETTINGS" },
 	{ XPT_SET_TRAN_SETTINGS, "XPT_SET_TRAN_SETTINGS" },
 	{ XPT_CALC_GEOMETRY, "XPT_CALC_GEOMETRY" },
 	{ XPT_ATA_IO, "XPT_ATA_IO" },
 	{ XPT_GET_SIM_KNOB, "XPT_GET_SIM_KNOB" },
 	{ XPT_SET_SIM_KNOB, "XPT_SET_SIM_KNOB" },
 	{ XPT_NVME_IO, "XPT_NVME_IO" },
 	{ XPT_MMCSD_IO, "XPT_MMCSD_IO" },
 	{ XPT_SMP_IO, "XPT_SMP_IO" },
 	{ XPT_SCAN_TGT, "XPT_SCAN_TGT" },
 	{ XPT_ENG_INQ, "XPT_ENG_INQ" },
 	{ XPT_ENG_EXEC, "XPT_ENG_EXEC" },
 	{ XPT_EN_LUN, "XPT_EN_LUN" },
 	{ XPT_TARGET_IO, "XPT_TARGET_IO" },
 	{ XPT_ACCEPT_TARGET_IO, "XPT_ACCEPT_TARGET_IO" },
 	{ XPT_CONT_TARGET_IO, "XPT_CONT_TARGET_IO" },
 	{ XPT_IMMED_NOTIFY, "XPT_IMMED_NOTIFY" },
 	{ XPT_NOTIFY_ACK, "XPT_NOTIFY_ACK" },
 	{ XPT_IMMEDIATE_NOTIFY, "XPT_IMMEDIATE_NOTIFY" },
 	{ XPT_NOTIFY_ACKNOWLEDGE, "XPT_NOTIFY_ACKNOWLEDGE" },
 	{ 0, 0 }	/* terminator: NULL name */
 };
 
 /*
  * Translate an XPT function code into a printable name.
  *
  * Known codes return the constant string from the name table.  Unknown
  * codes are formatted as hex into a single static buffer, so that result
  * is overwritten by the next unknown lookup and is not safe against
  * concurrent callers.
  */
 static const char *
 xpt_action_name(uint32_t action) 
 {
 	static char buffer[32];	/* Only for unknown messages -- racy */
 	struct kv *entry;
 
 	for (entry = map; entry->name != NULL; entry++) {
 		if (entry->v == action)
 			return (entry->name);
 	}
 
 	snprintf(buffer, sizeof(buffer), "%#x", action);
 	return (buffer);
 }
Index: head/sys/cam/scsi/scsi_da.c
===================================================================
--- head/sys/cam/scsi/scsi_da.c	(revision 308154)
+++ head/sys/cam/scsi/scsi_da.c	(revision 308155)
@@ -1,5903 +1,5918 @@
 /*-
  * Implementation of SCSI Direct Access Peripheral driver for CAM.
  *
  * Copyright (c) 1997 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/cons.h>
 #include <sys/endian.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #endif /* _KERNEL */
 
 #ifndef _KERNEL
 #include <stdio.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_iosched.h>
 
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_da.h>
 
 #ifdef _KERNEL
 /*
  * State of a da peripheral instance (see da_softc.state): one of the
  * probe steps below, or DA_STATE_NORMAL.
  *
  * Note that there are probe ordering dependencies here.  The order isn't
  * controlled by this enumeration, but by explicit state transitions in
  * dastart() and dadone().  Here are some of the dependencies:
  * 
  * 1. RC should come first, before RC16, unless there is evidence that RC16
  *    is supported.
  * 2. BDC needs to come before any of the ATA probes, or the ZONE probe.
  * 3. The ATA probes should go in this order:
  *    ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE
  */
 typedef enum {
 	DA_STATE_PROBE_RC,
 	DA_STATE_PROBE_RC16,
 	DA_STATE_PROBE_LBP,
 	DA_STATE_PROBE_BLK_LIMITS,
 	DA_STATE_PROBE_BDC,
 	DA_STATE_PROBE_ATA,
 	DA_STATE_PROBE_ATA_LOGDIR,
 	DA_STATE_PROBE_ATA_IDDIR,
 	DA_STATE_PROBE_ATA_SUP,
 	DA_STATE_PROBE_ATA_ZONE,
 	DA_STATE_PROBE_ZONE,
 	DA_STATE_NORMAL
 } da_state;
 
 /*
  * Per-instance status and capability bits (see da_softc.flags).  Each
  * enumerator is a distinct bit; 0x000010 is currently unassigned.
  */
 typedef enum {
 	DA_FLAG_PACK_INVALID	= 0x000001,
 	DA_FLAG_NEW_PACK	= 0x000002,
 	DA_FLAG_PACK_LOCKED	= 0x000004,
 	DA_FLAG_PACK_REMOVABLE	= 0x000008,
 	DA_FLAG_NEED_OTAG	= 0x000020,
 	DA_FLAG_WAS_OTAG	= 0x000040,
 	DA_FLAG_RETRY_UA	= 0x000080,
 	DA_FLAG_OPEN		= 0x000100,
 	DA_FLAG_SCTX_INIT	= 0x000200,
 	DA_FLAG_CAN_RC16	= 0x000400,
 	DA_FLAG_PROBED		= 0x000800,
 	DA_FLAG_DIRTY		= 0x001000,
 	DA_FLAG_ANNOUNCED	= 0x002000,
 	DA_FLAG_CAN_ATA_DMA	= 0x004000,
 	DA_FLAG_CAN_ATA_LOG	= 0x008000,
 	DA_FLAG_CAN_ATA_IDLOG	= 0x010000,
 	DA_FLAG_CAN_ATA_SUPCAP	= 0x020000,
 	DA_FLAG_CAN_ATA_ZONE	= 0x040000
 } da_flags;
 
 /*
  * Device quirk bits (see da_softc.quirks and da_quirk_entry.quirks).
  * The bit names are mirrored in DA_Q_BIT_STRING; keep the two in sync
  * when adding a quirk.
  */
 typedef enum {
 	DA_Q_NONE		= 0x00,
 	DA_Q_NO_SYNC_CACHE	= 0x01,
 	DA_Q_NO_6_BYTE		= 0x02,
 	DA_Q_NO_PREVENT		= 0x04,
 	DA_Q_4K			= 0x08,
 	DA_Q_NO_RC16		= 0x10,
 	DA_Q_NO_UNMAP		= 0x20,
 	DA_Q_RETRY_BUSY		= 0x40,
 	DA_Q_SMR_DM		= 0x80
 } da_quirks;
 
 /*
  * Bit-name string for da_quirks, laid out in the encoding of the kernel
  * printf(9) "%b" conversion (presumably its consumer -- confirm at the
  * use site): the first character is the output base ("\020" == 16), and
  * each following entry is a 1-based bit number, written as one
  * octal-escaped character, followed by that bit's name.
  *
  * Bit 8 (DA_Q_SMR_DM, 0x80) must be written "\010".  "\008" is not an
  * octal escape: it is parsed as "\00" followed by the character '8',
  * which embeds a NUL and cuts the string off before the SMR_DM entry.
  */
 #define DA_Q_BIT_STRING		\
 	"\020"			\
 	"\001NO_SYNC_CACHE"	\
 	"\002NO_6_BYTE"		\
 	"\003NO_PREVENT"	\
 	"\0044K"		\
 	"\005NO_RC16"		\
 	"\006NO_UNMAP"		\
 	"\007RETRY_BUSY"	\
 	"\010SMR_DM"
 
 /*
  * Tag describing what a CCB issued by this driver is for.
  *
  * The low five bits (DA_CCB_TYPE_MASK) hold the CCB type;
  * DA_CCB_RETRY_UA is a separate flag bit outside that mask.  Type values
  * 0x08 and 0x09 are currently unassigned.
  * NOTE(review): presumably stored in the CCB through the ccb_state
  * private field -- confirm at the use sites.
  */
 typedef enum {
 	DA_CCB_PROBE_RC		= 0x01,
 	DA_CCB_PROBE_RC16	= 0x02,
 	DA_CCB_PROBE_LBP	= 0x03,
 	DA_CCB_PROBE_BLK_LIMITS	= 0x04,
 	DA_CCB_PROBE_BDC	= 0x05,
 	DA_CCB_PROBE_ATA	= 0x06,
 	DA_CCB_BUFFER_IO	= 0x07,
 	DA_CCB_DUMP		= 0x0A,
 	DA_CCB_DELETE		= 0x0B,
  	DA_CCB_TUR		= 0x0C,
 	DA_CCB_PROBE_ZONE	= 0x0D,
 	DA_CCB_PROBE_ATA_LOGDIR	= 0x0E,
 	DA_CCB_PROBE_ATA_IDDIR	= 0x0F,
 	DA_CCB_PROBE_ATA_SUP	= 0x10,
 	DA_CCB_PROBE_ATA_ZONE	= 0x11,
 	DA_CCB_TYPE_MASK	= 0x1F,
 	DA_CCB_RETRY_UA		= 0x20
 } da_ccb_state;
 
 /*
  * Delete (BIO_DELETE) methods.
  *
  * Order here is important for method choice
  *
  * We prefer ATA_TRIM as tests run against a Sandforce 2281 SSD attached to
  * LSI 2008 (mps) controller (FW: v12, Drv: v14) resulted 20% quicker deletes
  * using ATA_TRIM than the corresponding UNMAP results for a real world mysql
  * import taking 5mins.
  *
  * DA_DELETE_MIN and DA_DELETE_MAX alias the first and last real methods
  * (ATA_TRIM .. ZERO); NONE and DISABLE lie outside that range.  The
  * da_delete_functions, da_delete_method_names and da_delete_method_desc
  * arrays have one entry per enumerator in this same order.
  */
 typedef enum {
 	DA_DELETE_NONE,
 	DA_DELETE_DISABLE,
 	DA_DELETE_ATA_TRIM,
 	DA_DELETE_UNMAP,
 	DA_DELETE_WS16,
 	DA_DELETE_WS10,
 	DA_DELETE_ZERO,
 	DA_DELETE_MIN = DA_DELETE_ATA_TRIM,
 	DA_DELETE_MAX = DA_DELETE_ZERO
 } da_delete_methods;
 
 /*
  * Zoned-device mode of a drive (see da_softc.zone_mode).
  *
  * For SCSI, host managed drives show up as a separate device type.  For
  * ATA, host managed drives also have a different device signature.
  * XXX KDM figure out the ATA host managed signature.
  */
 typedef enum {
 	DA_ZONE_NONE		= 0x00,
 	DA_ZONE_DRIVE_MANAGED	= 0x01,
 	DA_ZONE_HOST_AWARE	= 0x02,
 	DA_ZONE_HOST_MANAGED	= 0x03
 } da_zone_mode;
 
 /*
  * How zone commands reach the drive (see da_softc.zone_interface).
  *
  * We distinguish between these interface cases in addition to the drive type:
  * o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC
  * o ATA drive behind a SCSI translation layer that does not know about
  *   ZBC/ZAC, and so needs to be managed via ATA passthrough.  In this
  *   case, we would need to share the ATA code with the ada(4) driver.
  * o SCSI drive.
  */
 typedef enum {
 	DA_ZONE_IF_SCSI,
 	DA_ZONE_IF_ATA_PASS,
 	DA_ZONE_IF_ATA_SAT,
 } da_zone_interface;
 
 /*
  * Zone-related flag bits (see da_softc.zone_flags).
  *
  * The *_SUP bits name individual zone operations; DA_ZONE_FLAG_SUP_MASK
  * is the union of all of them.  DA_ZONE_FLAG_SET_MASK is the union of the
  * three *_SET bits.
  * NOTE(review): the *_SET bits presumably record which of the
  * optimal/maximum zone counts in da_softc hold valid values -- confirm.
  */
 typedef enum {
 	DA_ZONE_FLAG_RZ_SUP		= 0x0001,
 	DA_ZONE_FLAG_OPEN_SUP		= 0x0002,
 	DA_ZONE_FLAG_CLOSE_SUP		= 0x0004,
 	DA_ZONE_FLAG_FINISH_SUP		= 0x0008,
 	DA_ZONE_FLAG_RWP_SUP		= 0x0010,
 	DA_ZONE_FLAG_SUP_MASK		= (DA_ZONE_FLAG_RZ_SUP |
 					   DA_ZONE_FLAG_OPEN_SUP |
 					   DA_ZONE_FLAG_CLOSE_SUP |
 					   DA_ZONE_FLAG_FINISH_SUP |
 					   DA_ZONE_FLAG_RWP_SUP),
 	DA_ZONE_FLAG_URSWRZ		= 0x0020,
 	DA_ZONE_FLAG_OPT_SEQ_SET	= 0x0040,
 	DA_ZONE_FLAG_OPT_NONSEQ_SET	= 0x0080,
 	DA_ZONE_FLAG_MAX_SEQ_SET	= 0x0100,
 	DA_ZONE_FLAG_SET_MASK		= (DA_ZONE_FLAG_OPT_SEQ_SET |
 					   DA_ZONE_FLAG_OPT_NONSEQ_SET |
 					   DA_ZONE_FLAG_MAX_SEQ_SET)
 } da_zone_flags;
 
 /*
  * Human-readable names for the individual DA_ZONE_FLAG_*_SUP bits, one
  * table entry per bit.
  */
 static struct da_zone_desc {
 	da_zone_flags value;	/* a single *_SUP flag bit */
 	const char *desc;	/* printable name of the operation */
 } da_zone_desc_table[] = {
 	{DA_ZONE_FLAG_RZ_SUP, "Report Zones" },
 	{DA_ZONE_FLAG_OPEN_SUP, "Open" },
 	{DA_ZONE_FLAG_CLOSE_SUP, "Close" },
 	{DA_ZONE_FLAG_FINISH_SUP, "Finish" },
 	{DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
 };
 
 /*
  * Signature shared by the delete-method handlers, followed by forward
  * declarations of the three handlers.
  */
 typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb,
 			      struct bio *bp);
 static da_delete_func_t da_delete_trim;
 static da_delete_func_t da_delete_unmap;
 static da_delete_func_t da_delete_ws;
 
 /*
  * Handler for each delete method, one entry per da_delete_methods
  * enumerator in enumerator order.  NONE and DISABLE have no handler;
  * WS16, WS10 and ZERO all share da_delete_ws.
  */
 static const void * da_delete_functions[] = {
 	NULL,
 	NULL,
 	da_delete_trim,
 	da_delete_unmap,
 	da_delete_ws,
 	da_delete_ws,
 	da_delete_ws
 };
 
 /*
  * Short names and longer descriptions of the delete methods, in the same
  * enumerator order as the handler table.
  */
 static const char *da_delete_method_names[] =
     { "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" };
 static const char *da_delete_method_desc[] =
     { "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP",
       "WRITE SAME(10) with UNMAP", "ZERO" };
 
 /*
  * Offsets into our private area for storing information.
  * NOTE(review): ccb_state presumably carries a da_ccb_state value and
  * ccb_bp the struct bio being serviced -- confirm at the use sites.
  */
 #define ccb_state	ppriv_field0
 #define ccb_bp		ppriv_ptr1
 
 /* Geometry and size parameters of a disk (see da_softc.params). */
 struct disk_params {
 	u_int8_t  heads;
 	u_int32_t cylinders;
 	u_int8_t  secs_per_track;
 	u_int32_t secsize;	/* Number of bytes/sector */
 	u_int64_t sectors;	/* total number sectors */
 	u_int     stripesize;
 	u_int     stripeoffset;
 };
 
 /*
  * Sizing constants for the UNMAP parameter buffer.  UNMAP_BUF_SIZE is one
  * header plus UNMAP_MAX_RANGES range descriptors:
  * 8 + 2048 * 16 = 32776 bytes (this is the size of da_softc.unmap_buf).
  */
 #define UNMAP_RANGE_MAX		0xffffffff
 #define UNMAP_HEAD_SIZE		8
 #define UNMAP_RANGE_SIZE	16
 #define UNMAP_MAX_RANGES	2048 /* Protocol Max is 4095 */
 #define UNMAP_BUF_SIZE		((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \
 				UNMAP_HEAD_SIZE)
 
 /*
  * Limits for the WRITE SAME and ATA TRIM delete methods.
  * ATA_TRIM_MAX_RANGES is derived from UNMAP_BUF_SIZE: the number of whole
  * (ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)-byte units that fit in the
  * buffer, times ATA_DSM_BLK_SIZE.
  */
 #define WS10_MAX_BLKS		0xffff
 #define WS16_MAX_BLKS		0xffffffff
 #define ATA_TRIM_MAX_RANGES	((UNMAP_BUF_SIZE / \
 	(ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE)
 
 /* NOTE(review): presumably a work flag requesting a TEST UNIT READY -- confirm. */
 #define DA_WORK_TUR		(1 << 16)
 
 /*
  * Per-instance state of the da peripheral driver.
  *
  * Groups, roughly in field order: I/O scheduling and pending-CCB
  * bookkeeping; probe state, flags and quirks; zone (ZBC/ZAC) information
  * and cached ATA log directories; delete-method limits and selection; disk
  * geometry and the disk object; sysctl and callout handles; and, when
  * CAM_IO_STATS is defined, error/timeout/invalidation counters.
  */
 struct da_softc {
 	struct   cam_iosched_softc *cam_iosched;
 	struct	 bio_queue_head delete_run_queue;
 	LIST_HEAD(, ccb_hdr) pending_ccbs;
 	int	 refcount;		/* Active xpt_action() calls */
 	da_state state;
 	da_flags flags;	
 	da_quirks quirks;
 	int	 minimum_cmd_size;
 	int	 error_inject;
 	int	 trim_max_ranges;
 	int	 delete_available;	/* Delete methods possibly available */
 	da_zone_mode 			zone_mode;
 	da_zone_interface		zone_interface;
 	da_zone_flags			zone_flags;
 	struct ata_gp_log_dir		ata_logdir;
 	int				valid_logdir_len;
 	struct ata_identify_log_pages	ata_iddir;
 	int				valid_iddir_len;
 	uint64_t			optimal_seq_zones;
 	uint64_t			optimal_nonseq_zones;
 	uint64_t			max_seq_zones;
 	u_int	 		maxio;
 	uint32_t		unmap_max_ranges;
 	uint32_t		unmap_max_lba; /* Max LBAs in UNMAP req */
 	uint64_t		ws_max_blks;
 	da_delete_methods	delete_method_pref;
 	da_delete_methods	delete_method;
 	da_delete_func_t	*delete_func;
 	int			unmappedio;
 	int			rotating;
 	struct	 disk_params params;
 	struct	 disk *disk;
 	union	 ccb saved_ccb;
 	struct task		sysctl_task;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	struct callout		sendordered_c;
 	uint64_t wwpn;
 	uint8_t	 unmap_buf[UNMAP_BUF_SIZE];
 	struct scsi_read_capacity_data_long rcaplong;
 	struct callout		mediapoll_c;
 #ifdef CAM_IO_STATS
 	struct sysctl_ctx_list	sysctl_stats_ctx;
 	struct sysctl_oid	*sysctl_stats_tree;
 	u_int	errors;
 	u_int	timeouts;
 	u_int	invalidations;
 #endif
 };
 
 /*
  * Set (enable != 0) or clear (enable == 0) the bit for delete_method in
  * softc->delete_available; the bit position is the method's enumerator
  * value.
  *
  * Wrapped in do { } while (0) so the macro expands to a single statement
  * and is safe in an unbraced if/else; arguments are parenthesized so
  * expression arguments expand correctly.  Invoke it with a trailing
  * semicolon, like a function call.
  */
 #define dadeleteflag(softc, delete_method, enable)			\
 	do {								\
 		if (enable) {						\
 			(softc)->delete_available |=			\
 			    (1 << (delete_method));			\
 		} else {						\
 			(softc)->delete_available &=			\
 			    ~(1 << (delete_method));			\
 		}							\
 	} while (0)
 
 /*
  * One quirk table entry: an inquiry pattern to match and the quirk bits
  * that apply to matching devices.
  */
 struct da_quirk_entry {
 	struct scsi_inquiry_pattern inq_pat;
 	da_quirks quirks;
 };
 
 /* Vendor strings shared by several quirk table entries. */
 static const char quantum[] = "QUANTUM";
 static const char microp[] = "MICROP";
 
 static struct da_quirk_entry da_quirk_table[] =
 {
 	/* SPI, FC devices */
 	{
 		/*
 		 * Fujitsu M2513A MO drives.
 		 * Tested devices: M2513A2 firmware versions 1200 & 1300.
 		 * (dip switch selects whether T_DIRECT or T_OPTICAL device)
 		 * Reported by: W.Scholten <whs@xs4all.nl>
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/* See above. */
 		{T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This particular Fujitsu drive doesn't like the
 		 * synchronize cache command.
 		 * Reported by: Tom Jackson <toj@gorilla.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This drive doesn't like the synchronize cache command
 		 * either.  Reported by: Matthew Jacob <mjacob@feral.com>
 		 * in NetBSD PR kern/6027, August 24, 1998.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This drive doesn't like the synchronize cache command
 		 * either.  Reported by: Hellmuth Michaelis (hm@kts.org)
 		 * (PR 8882).
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: Blaz Zupan <blaz@gold.amis.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: Blaz Zupan <blaz@gold.amis.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: walter@pelissero.de
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't work correctly with 6 byte reads/writes.
 		 * Returns illegal request, and points to byte 9 of the
 		 * 6-byte CDB.
 		 * Reported by:  Adam McDougall <bsdx@spawnet.com>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"},
 		/*quirks*/ DA_Q_NO_6_BYTE
 	},
 	{
 		/* See above. */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"},
 		/*quirks*/ DA_Q_NO_6_BYTE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: walter@pelissero.de
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * The CISS RAID controllers do not support SYNC_CACHE
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * The STEC SSDs sometimes hang on UNMAP.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"},
 		/*quirks*/ DA_Q_NO_UNMAP
 	},
 	{
 		/*
 		 * VMware returns BUSY status when storage has transient
 		 * connectivity problems, so better wait.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"},
 		/*quirks*/ DA_Q_RETRY_BUSY
 	},
 	/* USB mass storage devices supported by umass(4) */
 	{
 		/*
 		 * EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player
 		 * PR: kern/51675
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Power Quotient Int. (PQI) USB flash key
 		 * PR: kern/53067
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
  	{
  		/*
  		 * Creative Nomad MUVO mp3 player (USB)
  		 * PR: kern/53094
  		 */
  		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"},
  		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
  	},
 	{
 		/*
 		 * Jungsoft NEXDISK USB flash key
 		 * PR: kern/54737
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * FreeDik USB Mini Data Drive
 		 * PR: kern/54786
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Sigmatel USB Flash MP3 Player
 		 * PR: kern/57046
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * Neuros USB Digital Audio Computer
 		 * PR: kern/63645
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SEAGRAND NP-900 MP3 Player
 		 * PR: kern/64563
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * iRiver iFP MP3 player (with UMS Firmware)
 		 * PR: kern/54881, i386/63941, kern/66124
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
  	},
 	{
 		/*
 		 * Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01
 		 * PR: kern/70158
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * ZICPlay USB MP3 Player with FM
 		 * PR: kern/75057
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * TEAC USB floppy mechanisms
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Kingston DataTraveler II+ USB Pen-Drive.
 		 * Reported by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * USB DISK Pro PMAP
 		 * Reported by: jhs
 		 * PR: usb/96381
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Motorola E398 Mobile Phone (TransFlash memory card).
 		 * Reported by: Wojciech A. Koszek <dunstan@FreeBSD.czest.pl>
 		 * PR: usb/89889
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Qware BeatZkey! Pro
 		 * PR: usb/79164
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Time DPA20B 1GB MP3 Player
 		 * PR: usb/81846
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Samsung USB key 128Mb
 		 * PR: usb/90081
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Kingston DataTraveler 2.0 USB Flash memory.
 		 * PR: usb/89196
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Creative MUVO Slim mp3 player (USB)
 		 * PR: usb/86131
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 		},
 	{
 		/*
 		 * United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3)
 		 * PR: usb/80487
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SanDisk Micro Cruzer 128MB
 		 * PR: usb/75970
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * TOSHIBA TransMemory USB sticks
 		 * PR: kern/94660
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * PNY USB 3.0 Flash Drives
 		*/
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16
 	},
 	{
 		/*
 		 * PNY USB Flash keys
 		 * PR: usb/75578, usb/72344, usb/65436 
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Genesys 6-in-1 Card Reader
 		 * PR: usb/94647
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Rekam Digital CAMERA
 		 * PR: usb/98713
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * iRiver H10 MP3 player
 		 * PR: usb/102547
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * iRiver U10 MP3 player
 		 * PR: usb/92306
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * X-Micro Flash Disk
 		 * PR: usb/96901
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * EasyMP3 EM732X USB 2.0 Flash MP3 Player
 		 * PR: usb/96546
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*",
 		"1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Denver MP3 player
 		 * PR: usb/107101
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Philips USB Key Audio KEY013
 		 * PR: usb/68412
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * JNC MP3 Player
 		 * PR: usb/94439
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SAMSUNG MP0402H
 		 * PR: usb/108427
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * I/O Magic USB flash - Giga Bank
 		 * PR: usb/108810
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * JoyFly 128mb USB Flash Drive
 		 * PR: 96133
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * ChipsBnk usb stick
 		 * PR: 103702
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A
 		 * PR: 129858
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Samsung YP-U3 mp3-player
 		 * PR: 125398
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*",
 		 "2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Sony Cyber-Shot DSC cameras
 		 * PR: usb/137035
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
 	},
 	{
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3",
 		 "1.00"}, /*quirks*/ DA_Q_NO_PREVENT
 	},
 	{
 		/* At least several Transcend USB sticks lie on RC16. */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*",
 		 "*"}, /*quirks*/ DA_Q_NO_RC16
 	},
 	{
 		/*
 		 * I-O Data USB Flash Disk
 		 * PR: usb/211716
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*",
 		 "*"}, /*quirks*/ DA_Q_NO_RC16
 	},
 	/* ATA/SATA devices over SAS/USB/... */
 	{
 		/* Hitachi Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Thin Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Thin Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Olympus FE-210 camera
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * LG UP3S MP3 player
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Laser MP3-2GA13 MP3 player
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * LaCie external 250GB Hard drive design by Porsche
 		 * Submitted by: Ben Stuyts <ben@altesco.nl>
 		 * PR: 121474
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	/* SATA SSDs */
 	{
 		/*
 		 * Corsair Force 2 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Corsair Force 3 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
         {
 		/*
 		 * Corsair Neutron GTX SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Corsair Force GT & GS SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Crucial M4 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Crucial RealSSD C300 SSDs
 		 * 4k optimised
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*",
 		"*" }, /*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Intel 320 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Intel 330 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Intel 510 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Intel 520 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Intel X25-M Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Kingston E100 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Kingston HyperX 3k SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Marvell SSDs (entry taken from OpenSolaris)
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Agility 2 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Agility 3 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Deneva R Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Vertex 2 SSDs (inc pro series)
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Vertex 3 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * OCZ Vertex 4 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Samsung 830 Series SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Samsung 840 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Samsung 850 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Samsung 843T Series SSDs (MZ7WD*)
 		 * Samsung PM851 Series SSDs (MZ7TE*)
 		 * Samsung PM853T Series SSDs (MZ7GE*)
 		 * Samsung SM863 Series SSDs (MZ7KM*)
 		 * 4k optimised
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * SuperTalent TeraDrive CT SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * XceedIOPS SATA SSDs
 		 * 4k optimised
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Hama Innostor USB-Stick 
 		 */
 		{ T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" }, 
 		/*quirks*/DA_Q_NO_RC16
 	},
 	{
 		/*
 		 * Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
 		 * Drive Managed SATA hard drive.  This drive doesn't report
 		 * in firmware that it is a drive managed SMR drive.
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS0002*", "*" },
 		/*quirks*/DA_Q_SMR_DM
 	},
 	{
 		/*
 		 * MX-ES USB Drive by Mach Xtreme
 		 */
 		{ T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"},
 		/*quirks*/DA_Q_NO_RC16
 	},
 };
 
 /*
  * Forward declarations for the file-local entry points and helpers of the
  * da driver.  Names declared through a function typedef (disk_strategy_t,
  * dumper_t, periph_init_t, periph_ctor_t, timeout_t, ...) take their
  * signature from that typedef instead of a spelled-out prototype.
  */
 static	disk_strategy_t	dastrategy;
 static	dumper_t	dadump;
 static	periph_init_t	dainit;
 static	void		daasync(void *callback_arg, u_int32_t code,
 				struct cam_path *path, void *arg);
 /* sysctl tree setup and the per-node sysctl handlers */
 static	void		dasysctlinit(void *context, int pending);
 static	int		dasysctlsofttimeout(SYSCTL_HANDLER_ARGS);
 static	int		dacmdsizesysctl(SYSCTL_HANDLER_ARGS);
 static	int		dadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
 static	int		dazonemodesysctl(SYSCTL_HANDLER_ARGS);
 static	int		dazonesupsysctl(SYSCTL_HANDLER_ARGS);
 static	int		dadeletemaxsysctl(SYSCTL_HANDLER_ARGS);
 /* BIO_DELETE method selection helpers */
 static	void		dadeletemethodset(struct da_softc *softc,
 					  da_delete_methods delete_method);
 static	off_t		dadeletemaxsize(struct da_softc *softc,
 					da_delete_methods delete_method);
 static	void		dadeletemethodchoose(struct da_softc *softc,
 					     da_delete_methods default_method);
 static	void		daprobedone(struct cam_periph *periph, union ccb *ccb);
 
 /* CAM peripheral driver callbacks (handed to cam_periph_alloc() in daasync) */
 static	periph_ctor_t	daregister;
 static	periph_dtor_t	dacleanup;
 static	periph_start_t	dastart;
 static	periph_oninv_t	daoninvalidate;
 static	void		dazonedone(struct cam_periph *periph, union ccb *ccb);
 static	void		dadone(struct cam_periph *periph,
 			       union ccb *done_ccb);
 static  int		daerror(union ccb *ccb, u_int32_t cam_flags,
 				u_int32_t sense_flags);
 static void		daprevent(struct cam_periph *periph, int action);
 static void		dareprobe(struct cam_periph *periph);
 static void		dasetgeom(struct cam_periph *periph, uint32_t block_len,
 				  uint64_t maxsector,
 				  struct scsi_read_capacity_data_long *rcaplong,
 				  size_t rcap_size);
 static timeout_t	dasendorderedtag;
 static void		dashutdown(void *arg, int howto);
 static timeout_t	damediapoll;
 
 /*
  * Compile-time defaults for the driver-wide settings declared below.
  * Every definition is guarded by #ifndef, so a value supplied earlier in
  * the build takes precedence over the one given here.
  */
 #ifndef	DA_DEFAULT_POLL_PERIOD
 #define	DA_DEFAULT_POLL_PERIOD	3	/* Media polling period in seconds */
 #endif
 
 #ifndef DA_DEFAULT_TIMEOUT
 #define DA_DEFAULT_TIMEOUT 60	/* Timeout in seconds */
 #endif
 
 #ifndef DA_DEFAULT_SOFTTIMEOUT
 #define DA_DEFAULT_SOFTTIMEOUT	0	/* Initial da_default_softtimeout */
 #endif
 
 #ifndef	DA_DEFAULT_RETRY
 #define	DA_DEFAULT_RETRY	4	/* Normal I/O retry count */
 #endif
 
 #ifndef	DA_DEFAULT_SEND_ORDERED
 #define	DA_DEFAULT_SEND_ORDERED	1	/* Non-zero: send ordered tags */
 #endif
 
 /*
  * Driver-wide settings, initialised from the DA_DEFAULT_* macros and
  * exported under the kern.cam.da sysctl node.
  */
 static int da_poll_period = DA_DEFAULT_POLL_PERIOD;
 static int da_retry_count = DA_DEFAULT_RETRY;
 static int da_default_timeout = DA_DEFAULT_TIMEOUT;
 static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT;
 static int da_send_ordered = DA_DEFAULT_SEND_ORDERED;
 
 /* kern.cam.da: parent node for the knobs below. */
 static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0,
             "CAM Direct Access Disk driver");
 SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN,
            &da_poll_period, 0, "Media polling period in seconds");
 SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN,
            &da_retry_count, 0, "Normal I/O retry count");
 SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
            &da_default_timeout, 0, "Normal I/O timeout (in seconds)");
 SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
            &da_send_ordered, 0, "Send Ordered Tags");
 
 /*
  * The soft timeout is exposed through a handler (dasysctlsofttimeout)
  * rather than SYSCTL_INT, and has a separate loader tunable.
  * NOTE(review): the tunable stores the raw 64-bit value directly into
  * da_default_softtimeout (an sbintime_t) while the sysctl is described in
  * milliseconds -- confirm that both paths agree on units.
  */
 SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, dasysctlsofttimeout, "I",
     "Soft I/O timeout (ms)");
 TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout);
 
 /*
  * DA_ORDEREDTAG_INTERVAL determines how often, relative
  * to the default timeout, we check to see whether an ordered
  * tagged transaction is appropriate to prevent simple tag
  * starvation.  Since we'd like to ensure that there is at least
  * 1/2 of the timeout length left for a starved transaction to
  * complete after we've sent an ordered tag, we must poll at least
  * four times in every timeout period.  This takes care of the worst
  * case where a starved transaction starts during an interval that
  * passes the "don't send an ordered tag" test, so it takes
  * us two intervals to determine that a tag must be sent.
  *
  * The value may be overridden by defining the macro before this point.
  */
 #ifndef DA_ORDEREDTAG_INTERVAL
 #define DA_ORDEREDTAG_INTERVAL 4
 #endif
 
 /*
  * Peripheral driver descriptor for "da": dainit is the init routine, the
  * unit list starts out empty and the generation count starts at zero.
  */
 static struct periph_driver dadriver =
 {
 	dainit, "da",
 	TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
 };
 
 /* Declare the descriptor above as the "da" peripheral driver. */
 PERIPHDRIVER_DECLARE(da, dadriver);
 
 /* malloc(9) type used for this driver's "scsi_da buffers". */
 static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");
 
 static int
 daopen(struct disk *dp)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 	int error;
 
 	periph = (struct cam_periph *)dp->d_drv1;
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		return (ENXIO);
 	}
 
 	cam_periph_lock(periph);
 	if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
 		cam_periph_unlock(periph);
 		cam_periph_release(periph);
 		return (error);
 	}
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("daopen\n"));
 
 	softc = (struct da_softc *)periph->softc;
 	dareprobe(periph);
 
 	/* Wait for the disk size update.  */
 	error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO,
 	    "dareprobe", 0);
 	if (error != 0)
 		xpt_print(periph->path, "unable to retrieve capacity data\n");
 
 	if (periph->flags & CAM_PERIPH_INVALID)
 		error = ENXIO;
 
 	if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
 	    (softc->quirks & DA_Q_NO_PREVENT) == 0)
 		daprevent(periph, PR_PREVENT);
 
 	if (error == 0) {
 		softc->flags &= ~DA_FLAG_PACK_INVALID;
 		softc->flags |= DA_FLAG_OPEN;
 	}
 
 	cam_periph_unhold(periph);
 	cam_periph_unlock(periph);
 
 	if (error != 0)
 		cam_periph_release(periph);
 
 	return (error);
 }
 
 /*
  * Close entry point for the disk.
  *
  * If the periph can be held, flushes the device write cache (when the
  * pack is dirty, valid and not quirked with DA_Q_NO_SYNC_CACHE) and allows
  * medium removal again.  Then clears the open flag, waits for
  * softc->refcount to reach zero and releases a periph reference.
  * Always returns 0.
  */
 static int
 daclose(struct disk *dp)
 {
 	struct	cam_periph *periph;
 	struct	da_softc *softc;
 	union	ccb *ccb;
 	int error;
 
 	periph = (struct cam_periph *)dp->d_drv1;
 	softc = (struct da_softc *)periph->softc;
 	cam_periph_lock(periph);
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("daclose\n"));
 
 	/* The flush and unlock steps are skipped if the hold fails. */
 	if (cam_periph_hold(periph, PRIBIO) == 0) {
 
 		/* Flush disk cache. */
 		if ((softc->flags & DA_FLAG_DIRTY) != 0 &&
 		    (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 &&
 		    (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
 			ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 			scsi_synchronize_cache(&ccb->csio, /*retries*/1,
 			    /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG,
 			    /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE,
 			    5 * 60 * 1000);
 			/*
 			 * NOTE(review): the result is stored in 'error' but
 			 * never examined; the dirty flag is cleared whether
 			 * or not the flush succeeded.
 			 */
 			error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
 			    /*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
 			    softc->disk->d_devstat);
 			softc->flags &= ~DA_FLAG_DIRTY;
 			xpt_release_ccb(ccb);
 		}
 
 		/* Allow medium removal. */
 		if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
 		    (softc->quirks & DA_Q_NO_PREVENT) == 0)
 			daprevent(periph, PR_ALLOW);
 
 		cam_periph_unhold(periph);
 	}
 
 	/*
 	 * If we've got removable media, mark the blocksize as
 	 * unavailable, since it could change when new media is
 	 * inserted.
 	 */
 	if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0)
 		softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
 
 	softc->flags &= ~DA_FLAG_OPEN;
 	/* Re-check the reference count after each sleep until it is zero. */
 	while (softc->refcount != 0)
 		cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1);
 	cam_periph_unlock(periph);
 	cam_periph_release(periph);
 	return (0);
 }
 
 /*
  * Ask the I/O scheduler to schedule this periph, but only while the
  * driver is in DA_STATE_NORMAL; in any other state this is a no-op.
  */
 static void
 daschedule(struct cam_periph *periph)
 {
 	struct da_softc *softc;
 
 	softc = (struct da_softc *)periph->softc;
 	if (softc->state == DA_STATE_NORMAL)
 		cam_iosched_schedule(softc->cam_iosched, periph);
 }
 
 /*
  * Strategy entry point: accept one bio and queue it for the device.
  *
  * A bio aimed at an invalidated pack is completed immediately with ENXIO.
  * Otherwise it is handed to the I/O scheduler and the periph is scheduled
  * to run.  All of this happens under the periph lock.
  */
 static void
 dastrategy(struct bio *bp)
 {
 	struct cam_periph *periph = (struct cam_periph *)bp->bio_disk->d_drv1;
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 
 	cam_periph_lock(periph);
 
 	/* Refuse new work once the device has been made invalid. */
 	if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
 		cam_periph_unlock(periph);
 		biofinish(bp, NULL, ENXIO);
 		return;
 	}
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp));
 
 	/*
 	 * Zone commands can depend on commands issued before them and can
 	 * affect commands issued after them, so force them to be ordered.
 	 */
 	if (bp->bio_cmd == BIO_ZONE)
 		bp->bio_flags |= BIO_ORDERED;
 
 	/* Queue the request, then make sure the periph gets to run it. */
 	cam_iosched_queue_work(softc->cam_iosched, bp);
 	daschedule(periph);
 
 	cam_periph_unlock(periph);
 }
 
 /*
  * Kernel dump entry point.
  *
  * arg is the struct disk being dumped to.  With length > 0 the buffer at
  * 'virtual' is written to the device at byte 'offset' using a polled
  * SCSI write.  With length == 0 no data is written; instead the device
  * cache is synchronised (unless quirked with DA_Q_NO_SYNC_CACHE).
  * 'physical' is not used.
  *
  * Returns ENXIO if the pack is invalid, otherwise the result of
  * cam_periph_error() for the command issued (0 when nothing was issued).
  */
 static int
 dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct	    cam_periph *periph;
 	struct	    da_softc *softc;
 	u_int	    secsize;
 	struct	    ccb_scsiio csio;
 	struct	    disk *dp;
 	int	    error = 0;
 
 	dp = arg;
 	periph = dp->d_drv1;
 	softc = (struct da_softc *)periph->softc;
 	cam_periph_lock(periph);
 	secsize = softc->params.secsize;
 	
 	if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
 		cam_periph_unlock(periph);
 		return (ENXIO);
 	}
 
 	if (length > 0) {
 		/* The CCB lives on the stack and is run by polling. */
 		xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		csio.ccb_h.ccb_state = DA_CCB_DUMP;
 		/* Byte offset and length are converted to sector units. */
 		scsi_read_write(&csio,
 				/*retries*/0,
 				dadone,
 				MSG_ORDERED_Q_TAG,
 				/*read*/SCSI_RW_WRITE,
 				/*byte2*/0,
 				/*minimum_cmd_size*/ softc->minimum_cmd_size,
 				offset / secsize,
 				length / secsize,
 				/*data_ptr*/(u_int8_t *) virtual,
 				/*dxfer_len*/length,
 				/*sense_len*/SSD_FULL_SIZE,
 				da_default_timeout * 1000);
 		xpt_polled_action((union ccb *)&csio);
 
 		/* No recovery and no retries while dumping. */
 		error = cam_periph_error((union ccb *)&csio,
 		    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
 		/* Release the device queue if the command left it frozen. */
 		if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0,
 			    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
 		if (error != 0)
 			printf("Aborting dump due to I/O error.\n");
 		cam_periph_unlock(periph);
 		return (error);
 	}
 		
 	/*
 	 * Sync the disk cache contents to the physical media.
 	 */
 	if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
 
 		xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		csio.ccb_h.ccb_state = DA_CCB_DUMP;
 		scsi_synchronize_cache(&csio,
 				       /*retries*/0,
 				       /*cbfcnp*/dadone,
 				       MSG_SIMPLE_Q_TAG,
 				       /*begin_lba*/0,/* Cover the whole disk */
 				       /*lb_count*/0,
 				       SSD_FULL_SIZE,
 				       5 * 1000);
 		xpt_polled_action((union ccb *)&csio);
 
 		error = cam_periph_error((union ccb *)&csio,
 		    0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, NULL);
 		/* Release the device queue if the command left it frozen. */
 		if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0,
 			    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
 		if (error != 0)
 			xpt_print(periph->path, "Synchronize cache failed\n");
 	}
 	cam_periph_unlock(periph);
 	return (error);
 }
 
 static int
 dagetattr(struct bio *bp)
 {
 	int ret;
 	struct cam_periph *periph;
 
 	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
 	cam_periph_lock(periph);
 	ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
 	    periph->path);
 	cam_periph_unlock(periph);
 	if (ret == 0)
 		bp->bio_completed = bp->bio_length;
 	return ret;
 }
 
 /*
  * Driver initialisation routine.
  *
  * Installs a global async callback for AC_FOUND_DEVICE so the driver is
  * told about new devices.  If that succeeds and da_send_ordered is set,
  * also registers dashutdown() as a shutdown_post_sync event handler.
  * Failures are reported with printf() only.
  */
 static void
 dainit(void)
 {
 	cam_status status;
 
 	status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL);
 	if (status != CAM_REQ_CMP) {
 		printf("da: Failed to attach master async callback "
 		       "due to status 0x%x!\n", status);
 		return;
 	}
 
 	if (!da_send_ordered)
 		return;
 
 	/* Register our shutdown event handler. */
 	if (EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown,
 	    NULL, SHUTDOWN_PRI_DEFAULT) == NULL)
 		printf("dainit: shutdown event registration failed!\n");
 }
 
 /*
  * GEOM calls this back once it has finished cleaning up its resources
  * for the disk.  Releases a reference on the periph stored in d_drv1.
  */
 static void
 dadiskgonecb(struct disk *dp)
 {
 
 	cam_periph_release((struct cam_periph *)dp->d_drv1);
 }
 
 /*
  * Invalidation callback for the periph: stop listening for async events,
  * mark the pack invalid, fail all queued I/O and tell GEOM that the disk
  * is gone.
  */
 static void
 daoninvalidate(struct cam_periph *periph)
 {
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 
 	/* An empty event mask de-registers our async callback. */
 	xpt_register_async(0, daasync, periph, periph->path);
 
 	softc->flags |= DA_FLAG_PACK_INVALID;
 #ifdef CAM_IO_STATS
 	softc->invalidations++;
 #endif
 
 	/*
 	 * Complete everything still queued with ENXIO.
 	 * XXX Transactions already queued to the card should be
 	 *     handled with XPT_ABORT_CCB.
 	 */
 	cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
 
 	/*
 	 * Notify GEOM that the disk has gone away; it calls back when it
 	 * is done cleaning up its resources.
 	 */
 	disk_gone(softc->disk);
 }
 
 /*
  * Destructor callback for the periph: tears down the I/O scheduler, the
  * sysctl contexts, both callouts and the disk, then frees the softc.
  *
  * The periph lock is dropped on entry and re-taken before returning, so
  * the teardown calls in between run unlocked.
  */
 static void
 dacleanup(struct cam_periph *periph)
 {
 	struct da_softc *softc;
 
 	softc = (struct da_softc *)periph->softc;
 
 	cam_periph_unlock(periph);
 
 	cam_iosched_fini(softc->cam_iosched);
 
 	/*
 	 * If we can't free the sysctl tree, oh well...
 	 */
 	/* Only free the contexts if dasysctlinit() got far enough to set them up. */
 	if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) {
 #ifdef CAM_IO_STATS
 		if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
 			xpt_print(periph->path,
 			    "can't remove sysctl stats context\n");
 #endif
 		if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
 			xpt_print(periph->path,
 			    "can't remove sysctl context\n");
 	}
 
 	/* Wait for the callouts to finish before freeing the softc they use. */
 	callout_drain(&softc->mediapoll_c);
 	disk_destroy(softc->disk);
 	callout_drain(&softc->sendordered_c);
 	free(softc, M_DEVBUF);
 	cam_periph_lock(periph);
 }
 
 /*
  * Async event callback for the da driver.
  *
  * callback_arg is the periph the callback was registered for (dainit()
  * registers the global AC_FOUND_DEVICE callback with a NULL argument),
  * code is the AC_* event, path is the path the event applies to and arg
  * is event specific:
  *
  *   AC_FOUND_DEVICE     arg is a struct ccb_getdev; a new periph is
  *                       allocated for connected SCSI devices of type
  *                       direct-access, RBC, optical or host-managed ZBC.
  *                       This case returns without calling
  *                       cam_periph_async().
  *   AC_ADVINFO_CHANGED  arg is the buffer type; a physical path change is
  *                       propagated as a "GEOM::physpath" attribute change.
  *   AC_UNIT_ATTENTION   arg is the CCB carrying the sense data; capacity,
  *                       media and INQUIRY changes reported for CCBs that
  *                       are not our own trigger a reprobe or a media
  *                       change notification.
  *   AC_SCSI_AEN         queues a TUR work item if none is pending, then
  *                       falls through to the reset handling.
  *   AC_SENT_BDR,
  *   AC_BUS_RESET        mark the softc and every pending CCB so that the
  *                       expected unit attention is retried.
  *   AC_INQ_CHANGED      forces a reprobe.
  *
  * Every event except AC_FOUND_DEVICE is passed on to cam_periph_async()
  * exactly once, after the switch.
  */
 static void
 daasync(void *callback_arg, u_int32_t code,
 	struct cam_path *path, void *arg)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 
 	periph = (struct cam_periph *)callback_arg;
 	switch (code) {
 	case AC_FOUND_DEVICE:
 	{
 		struct ccb_getdev *cgd;
 		cam_status status;
 
 		cgd = (struct ccb_getdev *)arg;
 		if (cgd == NULL)
 			break;
 
 		/* Only attach to connected SCSI disk-like devices. */
 		if (cgd->protocol != PROTO_SCSI)
 			break;
 		if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
 			break;
 		if (SID_TYPE(&cgd->inq_data) != T_DIRECT
 		    && SID_TYPE(&cgd->inq_data) != T_RBC
 		    && SID_TYPE(&cgd->inq_data) != T_OPTICAL
 		    && SID_TYPE(&cgd->inq_data) != T_ZBC_HM)
 			break;
 
 		/*
 		 * Allocate a peripheral instance for
 		 * this device and start the probe
 		 * process.
 		 */
 		status = cam_periph_alloc(daregister, daoninvalidate,
 					  dacleanup, dastart,
 					  "da", CAM_PERIPH_BIO,
 					  path, daasync,
 					  AC_FOUND_DEVICE, cgd);
 
 		if (status != CAM_REQ_CMP
 		 && status != CAM_REQ_INPROG)
 			printf("daasync: Unable to attach to new device "
 				"due to status 0x%x\n", status);
 		return;
 	}
 	case AC_ADVINFO_CHANGED:
 	{
 		uintptr_t buftype;
 
 		buftype = (uintptr_t)arg;
 		if (buftype == CDAI_TYPE_PHYS_PATH) {
 			/* Use the function-level softc; no shadow copy. */
 			softc = periph->softc;
 			disk_attr_changed(softc->disk, "GEOM::physpath",
 					  M_NOWAIT);
 		}
 		break;
 	}
 	case AC_UNIT_ATTENTION:
 	{
 		union ccb *ccb;
 		int error_code, sense_key, asc, ascq;
 
 		softc = (struct da_softc *)periph->softc;
 		ccb = (union ccb *)arg;
 
 		/*
 		 * Handle all UNIT ATTENTIONs except our own,
 		 * as they will be handled by daerror().
 		 */
 		if (xpt_path_periph(ccb->ccb_h.path) != periph &&
 		    scsi_extract_sense_ccb(ccb,
 		     &error_code, &sense_key, &asc, &ascq)) {
 			if (asc == 0x2A && ascq == 0x09) {
 				xpt_print(ccb->ccb_h.path,
 				    "Capacity data has changed\n");
 				softc->flags &= ~DA_FLAG_PROBED;
 				dareprobe(periph);
 			} else if (asc == 0x28 && ascq == 0x00) {
 				softc->flags &= ~DA_FLAG_PROBED;
 				disk_media_changed(softc->disk, M_NOWAIT);
 			} else if (asc == 0x3F && ascq == 0x03) {
 				xpt_print(ccb->ccb_h.path,
 				    "INQUIRY data has changed\n");
 				softc->flags &= ~DA_FLAG_PROBED;
 				dareprobe(periph);
 			}
 		}
 		/*
 		 * The generic handler is invoked once, after the switch;
 		 * calling it here as well would run it twice for this event.
 		 */
 		break;
 	}
 	case AC_SCSI_AEN:
 		softc = (struct da_softc *)periph->softc;
 		/* Queue a TUR only if one is not already pending. */
 		if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
 			if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
 				cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
 				daschedule(periph);
 			}
 		}
 		/* FALLTHROUGH */
 	case AC_SENT_BDR:
 	case AC_BUS_RESET:
 	{
 		struct ccb_hdr *ccbh;
 
 		softc = (struct da_softc *)periph->softc;
 		/*
 		 * Don't fail on the expected unit attention
 		 * that will occur.
 		 */
 		softc->flags |= DA_FLAG_RETRY_UA;
 		LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
 			ccbh->ccb_state |= DA_CCB_RETRY_UA;
 		break;
 	}
 	case AC_INQ_CHANGED:
 		softc = (struct da_softc *)periph->softc;
 		softc->flags &= ~DA_FLAG_PROBED;
 		dareprobe(periph);
 		break;
 	default:
 		break;
 	}
 	cam_periph_async(periph, code, path, arg);
 }
 
 /*
  * Task callback that builds the per-unit sysctl tree for a da peripheral
  * (a node named after the unit number under the kern.cam.da static node).
  *
  * context is the struct cam_periph this task was queued for; pending is not
  * used.  The periph reference taken when the task was enqueued is released on
  * every exit path of this function.
  */
 static void
 dasysctlinit(void *context, int pending)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 	char tmpstr[80], tmpstr2[80];
 	struct ccb_trans_settings cts;
 
 	periph = (struct cam_periph *)context;
 	/*
 	 * periph was held for us when this task was enqueued
 	 */
 	if (periph->flags & CAM_PERIPH_INVALID) {
 		cam_periph_release(periph);
 		return;
 	}
 
 	softc = (struct da_softc *)periph->softc;
 	/* tmpstr is the node description, tmpstr2 the node name (unit number). */
 	snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number);
 	snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
 
 	sysctl_ctx_init(&softc->sysctl_ctx);
 	/* Record that sysctl_ctx has been initialized. */
 	softc->flags |= DA_FLAG_SCTX_INIT;
 	softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2,
 		CTLFLAG_RD, 0, tmpstr);
 	if (softc->sysctl_tree == NULL) {
 		printf("dasysctlinit: unable to allocate sysctl tree\n");
 		cam_periph_release(periph);
 		return;
 	}
 
 	/*
 	 * Now register the sysctl handler, so the user can change the value on
 	 * the fly.
 	 */
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RWTUN,
 		softc, 0, dadeletemethodsysctl, "A",
 		"BIO_DELETE execution method");
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "delete_max", CTLTYPE_U64 | CTLFLAG_RW,
 		softc, 0, dadeletemaxsysctl, "Q",
 		"Maximum BIO_DELETE size");
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW,
 		&softc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
 		"Minimum CDB size");
 
 	/* Read-only zoned-device information. */
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD,
 		softc, 0, dazonemodesysctl, "A",
 		"Zone Mode");
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD,
 		softc, 0, dazonesupsysctl, "A",
 		"Zone Support");
 	SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
 		"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
 		"Optimal Number of Open Sequential Write Preferred Zones");
 	SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
 		"optimal_nonseq_zones", CTLFLAG_RD,
 		&softc->optimal_nonseq_zones,
 		"Optimal Number of Non-Sequentially Written Sequential Write "
 		"Preferred Zones");
 	SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
 		"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
 		"Maximum Number of Open Sequential Write Required Zones");
 
 	/* The only writable plain integer leaf in this tree. */
 	SYSCTL_ADD_INT(&softc->sysctl_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_tree),
 		       OID_AUTO,
 		       "error_inject",
 		       CTLFLAG_RW,
 		       &softc->error_inject,
 		       0,
 		       "error_inject leaf");
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_tree),
 		       OID_AUTO,
 		       "unmapped_io",
 		       CTLFLAG_RD, 
 		       &softc->unmappedio,
 		       0,
 		       "Unmapped I/O leaf");
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_tree),
 		       OID_AUTO,
 		       "rotating",
 		       CTLFLAG_RD, 
 		       &softc->rotating,
 		       0,
 		       "Rotating media");
 
 	/*
 	 * Add some addressing info.
 	 */
 	memset(&cts, 0, sizeof (cts));
 	xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
 	cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 	cts.type = CTS_TYPE_CURRENT_SETTINGS;
 	/* The transport settings query is issued with the periph lock held. */
 	cam_periph_lock(periph);
 	xpt_action((union ccb *)&cts);
 	cam_periph_unlock(periph);
 	if (cts.ccb_h.status != CAM_REQ_CMP) {
 		/*
 		 * NOTE(review): this early return also skips the statistics
 		 * and I/O scheduler sysctls registered further down; confirm
 		 * that this is intended.
 		 */
 		cam_periph_release(periph);
 		return;
 	}
 	/* Only Fibre Channel SCSI devices with a valid WWPN get a wwpn leaf. */
 	if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) {
 		struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc;
 		if (fc->valid & CTS_FC_VALID_WWPN) {
 			softc->wwpn = fc->wwpn;
 			SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
 			    SYSCTL_CHILDREN(softc->sysctl_tree),
 			    OID_AUTO, "wwpn", CTLFLAG_RD,
 			    &softc->wwpn, "World Wide Port Name");
 		}
 	}
 
 #ifdef CAM_IO_STATS
 	/*
 	 * Now add some useful stats.
 	 * XXX These should live in cam_periph and be common to all periphs
 	 */
 	/*
 	 * NOTE(review): the stats nodes use sysctl_stats_ctx, which is not
 	 * passed to sysctl_ctx_init() in this function; confirm it is set up
 	 * elsewhere or that a zeroed context is acceptable.
 	 */
 	softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
 	    SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
 	    CTLFLAG_RD, 0, "Statistics");
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		       OID_AUTO,
 		       "errors",
 		       CTLFLAG_RD,
 		       &softc->errors,
 		       0,
 		       "Transport errors reported by the SIM");
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		       OID_AUTO,
 		       "timeouts",
 		       CTLFLAG_RD,
 		       &softc->timeouts,
 		       0,
 		       "Device timeouts reported by the SIM");
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		       OID_AUTO,
 		       "pack_invalidations",
 		       CTLFLAG_RD,
 		       &softc->invalidations,
 		       0,
 		       "Device pack invalidations");
 #endif
 
 	/* Let the I/O scheduler hang its own sysctls off this unit's tree. */
 	cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
 	    softc->sysctl_tree);
 
 	cam_periph_release(periph);
 }
 
 /*
  * Sysctl handler for "delete_max": reports the disk's current maximum
  * BIO_DELETE size and lets the user lower it.
  *
  * arg1 is the unit's da_softc.  A new value larger than the limit computed
  * by dadeletemaxsize() for the active delete method is refused with EINVAL.
  */
 static int
 dadeletemaxsysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct da_softc *sc;
 	uint64_t new_max;
 	int rc;
 
 	sc = (struct da_softc *)arg1;
 	new_max = sc->disk->d_delmaxsize;
 
 	rc = sysctl_handle_64(oidp, &new_max, 0, req);
 	/* Stop here on failure or when the request is a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/* The user may shrink the limit, never grow it past the computed one. */
 	if (new_max > dadeletemaxsize(sc, sc->delete_method))
 		return (EINVAL);
 
 	sc->disk->d_delmaxsize = new_max;
 	return (0);
 }
 
 /*
  * Sysctl handler for "minimum_cmd_size".
  *
  * arg1 points at the int holding the minimum CDB size.  A written value is
  * rounded up to the next permitted size (6, 10, 12 or 16) before it is
  * stored; anything above 12 becomes 16 and anything below 6 becomes 6.
  */
 static int
 dacmdsizesysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *sizep;
 	int cdb_size, rc;
 
 	sizep = (int *)arg1;
 	cdb_size = *sizep;
 
 	rc = sysctl_handle_int(oidp, &cdb_size, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/*
 	 * Snap the request to one of the acceptable sizes: 6, 10, 12 or 16.
 	 */
 	if (cdb_size <= 6)
 		cdb_size = 6;
 	else if (cdb_size <= 10)
 		cdb_size = 10;
 	else if (cdb_size <= 12)
 		cdb_size = 12;
 	else
 		cdb_size = 16;
 
 	*sizep = cdb_size;
 
 	return (0);
 }
 
 static int
 dasysctlsofttimeout(SYSCTL_HANDLER_ARGS)
 {
 	sbintime_t value;
 	int error;
 
 	value = da_default_softtimeout / SBT_1MS;
 
 	error = sysctl_handle_int(oidp, (int *)&value, 0, req);
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	/* XXX Should clip this to a reasonable level */
 	if (value > da_default_timeout * 1000)
 		return (EINVAL);
 
 	da_default_softtimeout = value * SBT_1MS;
 	return (0);
 }
 
 /*
  * Make delete_method the active BIO_DELETE method for this unit: record it,
  * refresh the disk's maximum delete size, select the matching handler from
  * da_delete_functions[], and set or clear DISKFLAG_CANDELETE accordingly.
  */
 static void
 dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method)
 {
 	struct disk *dp;
 
 	softc->delete_method = delete_method;
 
 	dp = softc->disk;
 	dp->d_delmaxsize = dadeletemaxsize(softc, delete_method);
 	softc->delete_func = da_delete_functions[delete_method];
 
 	/* Only methods ranked above DA_DELETE_DISABLE advertise delete support. */
 	if (softc->delete_method <= DA_DELETE_DISABLE)
 		dp->d_flags &= ~DISKFLAG_CANDELETE;
 	else
 		dp->d_flags |= DISKFLAG_CANDELETE;
 }
 
 /*
  * Compute the largest BIO_DELETE, in bytes, that delete_method can service
  * on this unit.
  *
  * The per-method block limit is capped at the number of sectors on the
  * device and then scaled by the sector size.  Methods without a known limit
  * yield 0.
  */
 static off_t
 dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method)
 {
 	off_t max_blocks;
 
 	switch (delete_method) {
 	case DA_DELETE_UNMAP:
 		max_blocks = (off_t)softc->unmap_max_lba;
 		break;
 	case DA_DELETE_ATA_TRIM:
 		max_blocks = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges;
 		break;
 	case DA_DELETE_WS16:
 		max_blocks = omin(softc->ws_max_blks, WS16_MAX_BLKS);
 		break;
 	case DA_DELETE_ZERO:
 	case DA_DELETE_WS10:
 		/* Both of these are bounded by the WRITE SAME(10) limit. */
 		max_blocks = omin(softc->ws_max_blks, WS10_MAX_BLKS);
 		break;
 	default:
 		return (0);
 	}
 
 	/* Never report more than the whole device. */
 	max_blocks = omin(max_blocks, softc->params.sectors);
 
 	return ((off_t)softc->params.secsize * max_blocks);
 }
 
 /*
  * Finish the probe sequence for a unit: choose the delete method, optionally
  * announce the available methods, release the probe CCB, switch the unit to
  * normal operation and drop the hold or reference kept for the probe.
  */
 static void
 daprobedone(struct cam_periph *periph, union ccb *ccb)
 {
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 
 	dadeletemethodchoose(softc, DA_DELETE_NONE);
 
 	/* With bootverbose, list the methods once, before the first announce. */
 	if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) {
 		char line[80];
 		int method, need_sep;
 
 		snprintf(line, sizeof(line), "Delete methods: <");
 		need_sep = 0;
 		for (method = 0; method <= DA_DELETE_MAX; method++) {
 			int selected = (method == softc->delete_method);
 
 			/* Show every available method plus the selected one. */
 			if ((softc->delete_available & (1 << method)) != 0 ||
 			    selected) {
 				if (need_sep)
 					strlcat(line, ",", sizeof(line));
 				strlcat(line, da_delete_method_names[method],
 				    sizeof(line));
 				if (selected)
 					strlcat(line, "(*)", sizeof(line));
 				need_sep = 1;
 			}
 		}
 		strlcat(line, ">", sizeof(line));
 		printf("%s%d: %s\n", periph->periph_name,
 		    periph->unit_number, line);
 	}
 
 	/*
 	 * The peripheral may have been invalidated by an error above or by
 	 * an external event, and it only goes away once the last lock is
 	 * removed.  The CCB release needs the peripheral, so release the CCB
 	 * first and give up the probe lock on the peripheral afterwards.
 	 */
 	xpt_release_ccb(ccb);
 	softc->state = DA_STATE_NORMAL;
 	softc->flags |= DA_FLAG_PROBED;
 	daschedule(periph);
 	wakeup(&softc->disk->d_mediasize);
 
 	/* First completion drops the hold; later re-probes drop a reference. */
 	if ((softc->flags & DA_FLAG_ANNOUNCED) != 0) {
 		cam_periph_release_locked(periph);
 	} else {
 		softc->flags |= DA_FLAG_ANNOUNCED;
 		cam_periph_unhold(periph);
 	}
 }
 
 /*
  * Select the delete method for a unit.
  *
  * Order of preference: the user's preferred method when the device offers it
  * (DA_DELETE_DISABLE is always accepted), then the first available method in
  * the predefined order with DA_DELETE_ZERO skipped, and finally
  * default_method.
  */
 static void
 dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method)
 {
 	int allowed, method, pref;
 
 	/* Honor the user's request whenever it can be satisfied. */
 	pref = softc->delete_method_pref;
 	allowed = softc->delete_available | (1 << DA_DELETE_DISABLE);
 	if ((allowed & (1 << pref)) != 0) {
 		dadeletemethodset(softc, pref);
 		return;
 	}
 
 	/* Otherwise walk the predefined order for the best performing delete. */
 	for (method = DA_DELETE_MIN; method <= DA_DELETE_MAX; method++) {
 		if (method != DA_DELETE_ZERO &&
 		    (softc->delete_available & (1 << method)) != 0) {
 			dadeletemethodset(softc, method);
 			return;
 		}
 	}
 
 	/* Nothing usable was found; fall back to the caller's default. */
 	dadeletemethodset(softc, default_method);
 }
 
 /*
  * Sysctl handler for "delete_method".
  *
  * On read, reports the name of the active delete method ("UNKNOWN" when the
  * stored value is out of range).  On write, the supplied name must match an
  * entry of da_delete_method_names[]; it is recorded as the user's preference
  * and the active method is re-chosen.  Returns EINVAL for an unknown name.
  */
 static int
 dadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	const char *p;
 	struct da_softc *softc;
 	int i, error, value;
 
 	softc = (struct da_softc *)arg1;
 
 	value = softc->delete_method;
 	if (value < 0 || value > DA_DELETE_MAX)
 		p = "UNKNOWN";
 	else
 		p = da_delete_method_names[value];
 	/* strlcpy() always NUL-terminates, unlike strncpy(). */
 	strlcpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* Map the user-supplied name back to a method index. */
 	for (i = 0; i <= DA_DELETE_MAX; i++) {
 		if (strcmp(buf, da_delete_method_names[i]) == 0)
 			break;
 	}
 	if (i > DA_DELETE_MAX)
 		return (EINVAL);
 
 	/*
 	 * Store the preference; dadeletemethodchoose() decides whether the
 	 * device can actually use it.
 	 */
 	softc->delete_method_pref = i;
 	dadeletemethodchoose(softc, DA_DELETE_NONE);
 	return (0);
 }
 
 /*
  * Sysctl handler for "zone_mode": reports the unit's zone mode as a
  * human-readable string.  arg1 is the unit's da_softc.
  */
 static int
 dazonemodesysctl(SYSCTL_HANDLER_ARGS)
 {
 	char tmpbuf[40];
 	const char *mode_name;
 	struct da_softc *softc;
 
 	softc = (struct da_softc *)arg1;
 
 	switch (softc->zone_mode) {
 	case DA_ZONE_DRIVE_MANAGED:
 		mode_name = "Drive Managed";
 		break;
 	case DA_ZONE_HOST_AWARE:
 		mode_name = "Host Aware";
 		break;
 	case DA_ZONE_HOST_MANAGED:
 		mode_name = "Host Managed";
 		break;
 	case DA_ZONE_NONE:
 	default:
 		/* Unrecognized modes are reported like DA_ZONE_NONE. */
 		mode_name = "Not Zoned";
 		break;
 	}
 	snprintf(tmpbuf, sizeof(tmpbuf), "%s", mode_name);
 
 	return (sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req));
 }
 
 static int
 dazonesupsysctl(SYSCTL_HANDLER_ARGS)
 {
 	char tmpbuf[180];
 	struct da_softc *softc;
 	struct sbuf sb;
 	int error, first;
 	unsigned int i;
 
 	softc = (struct da_softc *)arg1;
 
 	error = 0;
 	first = 1;
 	sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
 
 	for (i = 0; i < sizeof(da_zone_desc_table) /
 	     sizeof(da_zone_desc_table[0]); i++) {
 		if (softc->zone_flags & da_zone_desc_table[i].value) {
 			if (first == 0)
 				sbuf_printf(&sb, ", ");
 			else
 				first = 0;
 			sbuf_cat(&sb, da_zone_desc_table[i].desc);
 		}
 	}
 
 	if (first == 1)
 		sbuf_printf(&sb, "None");
 
 	sbuf_finish(&sb);
 
 	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 
 	return (error);
 }
 
 /*
  * Registration callback for a new da peripheral: allocate and initialize
  * the softc, apply device and SIM quirks, set up the disk structure, hand
  * it to the disk layer, register for async events and schedule the probe.
  *
  * arg is the struct ccb_getdev describing the device.  Returns CAM_REQ_CMP
  * on success and CAM_REQ_CMP_ERR on failure.  The periph lock is dropped
  * part-way through and re-taken before returning, so the function is
  * presumably entered with that lock held (confirm against the caller).
  */
 static cam_status
 daregister(struct cam_periph *periph, void *arg)
 {
 	struct da_softc *softc;
 	struct ccb_pathinq cpi;
 	struct ccb_getdev *cgd;
 	char tmpstr[80];
 	caddr_t match;
 
 	cgd = (struct ccb_getdev *)arg;
 	if (cgd == NULL) {
 		printf("daregister: no getdev CCB, can't register device\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF,
 	    M_NOWAIT|M_ZERO);
 
 	if (softc == NULL) {
 		printf("daregister: Unable to probe new device. "
 		       "Unable to allocate softc\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
 		printf("daregister: Unable to probe new device. "
 		       "Unable to allocate iosched memory\n");
 		free(softc, M_DEVBUF);
 		return(CAM_REQ_CMP_ERR);
 	}
 	
 	/* Initial state and default limits; the probe states follow later. */
 	LIST_INIT(&softc->pending_ccbs);
 	softc->state = DA_STATE_PROBE_RC;
 	bioq_init(&softc->delete_run_queue);
 	if (SID_IS_REMOVABLE(&cgd->inq_data))
 		softc->flags |= DA_FLAG_PACK_REMOVABLE;
 	softc->unmap_max_ranges = UNMAP_MAX_RANGES;
 	softc->unmap_max_lba = UNMAP_RANGE_MAX;
 	softc->ws_max_blks = WS16_MAX_BLKS;
 	softc->trim_max_ranges = ATA_TRIM_MAX_RANGES;
 	softc->rotating = 1;
 
 	periph->softc = softc;
 
 	/*
 	 * See if this device has any quirks.
 	 */
 	match = cam_quirkmatch((caddr_t)&cgd->inq_data,
 			       (caddr_t)da_quirk_table,
 			       nitems(da_quirk_table),
 			       sizeof(*da_quirk_table), scsi_inquiry_match);
 
 	if (match != NULL)
 		softc->quirks = ((struct da_quirk_entry *)match)->quirks;
 	else
 		softc->quirks = DA_Q_NONE;
 
 	/* Check if the SIM does not want 6 byte commands */
 	bzero(&cpi, sizeof(cpi));
 	xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	cpi.ccb_h.func_code = XPT_PATH_INQ;
 	xpt_action((union ccb *)&cpi);
 	if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
 		softc->quirks |= DA_Q_NO_6_BYTE;
 
 	/*
 	 * Zone mode: host managed comes from the inquiry device type, drive
 	 * managed from a quirk.  Host aware is not assigned here.
 	 */
 	if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM)
 		softc->zone_mode = DA_ZONE_HOST_MANAGED;
 	else if (softc->quirks & DA_Q_SMR_DM)
 		softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
 	else
 		softc->zone_mode = DA_ZONE_NONE;
 
 	/*
 	 * For zoned devices, choose how zone commands are sent.  A device
 	 * with the ATA Information VPD page uses SAT translation when it
 	 * also has the Zoned Block Device Characteristics page and ATA
 	 * passthrough otherwise; any other device uses SCSI.
 	 */
 	if (softc->zone_mode != DA_ZONE_NONE) {
 		if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
 			if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC))
 				softc->zone_interface = DA_ZONE_IF_ATA_SAT;
 			else
 				softc->zone_interface = DA_ZONE_IF_ATA_PASS;
 		} else
 			softc->zone_interface = DA_ZONE_IF_SCSI;
 	}
 
 	/* The sysctl tree is built later, from the task's dasysctlinit(). */
 	TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph);
 
 	/*
 	 * Take an exclusive refcount on the periph while dastart is called
 	 * to finish the probe.  The reference will be dropped in dadone at
 	 * the end of probe.
 	 */
 	(void)cam_periph_hold(periph, PRIBIO);
 
 	/*
 	 * Schedule a periodic event to occasionally send an
 	 * ordered tag to a device.
 	 */
 	callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
 	callout_reset(&softc->sendordered_c,
 	    (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
 	    dasendorderedtag, softc);
 
 	/*
 	 * The periph lock is dropped here and re-taken before returning,
 	 * either on the cam_periph_acquire() failure path or right after
 	 * disk_create().
 	 */
 	cam_periph_unlock(periph);
 	/*
 	 * RBC devices don't have to support READ(6), only READ(10).
 	 */
 	if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC)
 		softc->minimum_cmd_size = 10;
 	else
 		softc->minimum_cmd_size = 6;
 
 	/*
 	 * Load the user's default, if any.
 	 */
 	snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size",
 		 periph->unit_number);
 	TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size);
 
 	/*
 	 * 6, 10, 12 and 16 are the currently permissible values.
 	 */
 	/* Same rounding rule as dacmdsizesysctl() applies at run time. */
 	if (softc->minimum_cmd_size < 6)
 		softc->minimum_cmd_size = 6;
 	else if ((softc->minimum_cmd_size > 6)
 	      && (softc->minimum_cmd_size <= 10))
 		softc->minimum_cmd_size = 10;
 	else if ((softc->minimum_cmd_size > 10)
 	      && (softc->minimum_cmd_size <= 12))
 		softc->minimum_cmd_size = 12;
 	else if (softc->minimum_cmd_size > 12)
 		softc->minimum_cmd_size = 16;
 
 	/* Predict whether device may support READ CAPACITY(16). */
 	if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 &&
 	    (softc->quirks & DA_Q_NO_RC16) == 0) {
 		softc->flags |= DA_FLAG_CAN_RC16;
 		softc->state = DA_STATE_PROBE_RC16;
 	}
 
 	/*
 	 * Register this media as a disk.
 	 */
 	softc->disk = disk_alloc();
 	softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
 			  periph->unit_number, 0,
 			  DEVSTAT_BS_UNAVAILABLE,
 			  SID_TYPE(&cgd->inq_data) |
 			  XPORT_DEVSTAT_TYPE(cpi.transport),
 			  DEVSTAT_PRIORITY_DISK);
 	softc->disk->d_open = daopen;
 	softc->disk->d_close = daclose;
 	softc->disk->d_strategy = dastrategy;
 	softc->disk->d_dump = dadump;
 	softc->disk->d_getattr = dagetattr;
 	softc->disk->d_gone = dadiskgonecb;
 	softc->disk->d_name = "da";
 	softc->disk->d_drv1 = periph;
 	/*
 	 * Clamp the SIM's maximum I/O size.  cpi was zeroed before the path
 	 * inquiry, so a failed inquiry ends up with the DFLTPHYS default.
 	 */
 	if (cpi.maxio == 0)
 		softc->maxio = DFLTPHYS;	/* traditional default */
 	else if (cpi.maxio > MAXPHYS)
 		softc->maxio = MAXPHYS;		/* for safety */
 	else
 		softc->maxio = cpi.maxio;
 	softc->disk->d_maxsize = softc->maxio;
 	softc->disk->d_unit = periph->unit_number;
 	softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
 	if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0)
 		softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
 	if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
 		softc->unmappedio = 1;
 		softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
 		xpt_print(periph->path, "UNMAPPED\n");
 	}
 	/* Build the description as "<vendor> <product>" from inquiry data. */
 	cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
 	    sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
 	strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
 	cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
 	    cgd->inq_data.product, sizeof(cgd->inq_data.product),
 	    sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
 	softc->disk->d_hba_vendor = cpi.hba_vendor;
 	softc->disk->d_hba_device = cpi.hba_device;
 	softc->disk->d_hba_subvendor = cpi.hba_subvendor;
 	softc->disk->d_hba_subdevice = cpi.hba_subdevice;
 
 	/*
 	 * Acquire a reference to the periph before we register with GEOM.
 	 * We'll release this reference once GEOM calls us back (via
 	 * dadiskgonecb()) telling us that our provider has been freed.
 	 */
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);
 		/*
 		 * NOTE(review): softc, the iosched state and the disk from
 		 * disk_alloc() are not released on this path; verify that
 		 * periph teardown takes care of them.
 		 */
 		cam_periph_lock(periph);
 		return (CAM_REQ_CMP_ERR);
 	}
 
 	disk_create(softc->disk, DISK_VERSION);
 	cam_periph_lock(periph);
 
 	/*
 	 * Add async callbacks for events of interest.
 	 * I don't bother checking if this fails as,
 	 * in most cases, the system will function just
 	 * fine without them and the only alternative
 	 * would be to not attach the device on failure.
 	 */
 	xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
 	    AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION |
 	    AC_INQ_CHANGED, daasync, periph, periph->path);
 
 	/*
 	 * Emit an attribute changed notification just in case 
 	 * physical path information arrived before our async
 	 * event handler was registered, but after anyone attaching
 	 * to our disk device polled it.
 	 */
 	disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT);
 
 	/*
 	 * Schedule a periodic media polling events.
 	 */
 	/*
 	 * Polling is armed only for removable media on devices that do not
 	 * report AEN support, and only when da_poll_period is non-zero.
 	 */
 	callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0);
 	if ((softc->flags & DA_FLAG_PACK_REMOVABLE) &&
 	    (cgd->inq_flags & SID_AEN) == 0 &&
 	    da_poll_period != 0)
 		callout_reset(&softc->mediapoll_c, da_poll_period * hz,
 		    damediapoll, periph);
 
 	/* Kick off the probe state machine selected in softc->state. */
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 
 	return(CAM_REQ_CMP);
 }
 
 /*
  * Translate a DISK_ZONE_* zone management command into the matching ZBC OUT
  * service action.  Returns -1 for commands without a translation.
  */
 static int
 da_zone_bio_to_scsi(int disk_zone_cmd)
 {
 	int service_action;
 
 	switch (disk_zone_cmd) {
 	case DISK_ZONE_OPEN:
 		service_action = ZBC_OUT_SA_OPEN;
 		break;
 	case DISK_ZONE_CLOSE:
 		service_action = ZBC_OUT_SA_CLOSE;
 		break;
 	case DISK_ZONE_FINISH:
 		service_action = ZBC_OUT_SA_FINISH;
 		break;
 	case DISK_ZONE_RWP:
 		service_action = ZBC_OUT_SA_RWP;
 		break;
 	default:
 		service_action = -1;
 		break;
 	}
 
 	return (service_action);
 }
 
 static int
 da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
 	    int *queue_ccb)
 {
 	struct da_softc *softc;
 	int error;
 
 	error = 0;
 
 	if (bp->bio_cmd != BIO_ZONE) {
 		error = EINVAL;
 		goto bailout;
 	}
 
 	softc = periph->softc;
 
 	switch (bp->bio_zone.zone_cmd) {
 	case DISK_ZONE_OPEN:
 	case DISK_ZONE_CLOSE:
 	case DISK_ZONE_FINISH:
 	case DISK_ZONE_RWP: {
 		int zone_flags;
 		int zone_sa;
 		uint64_t lba;
 
 		zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd);
 		if (zone_sa == -1) {
 			xpt_print(periph->path, "Cannot translate zone "
 			    "cmd %#x to SCSI\n", bp->bio_zone.zone_cmd);
 			error = EINVAL;
 			goto bailout;
 		}
 
 		zone_flags = 0;
 		lba = bp->bio_zone.zone_params.rwp.id;
 
 		if (bp->bio_zone.zone_params.rwp.flags &
 		    DISK_ZONE_RWP_FLAG_ALL)
 			zone_flags |= ZBC_OUT_ALL;
 
 		if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
 			scsi_zbc_out(&ccb->csio,
 				     /*retries*/ da_retry_count,
 				     /*cbfcnp*/ dadone,
 				     /*tag_action*/ MSG_SIMPLE_Q_TAG,
 				     /*service_action*/ zone_sa,
 				     /*zone_id*/ lba,
 				     /*zone_flags*/ zone_flags,
 				     /*data_ptr*/ NULL,
 				     /*dxfer_len*/ 0,
 				     /*sense_len*/ SSD_FULL_SIZE,
 				     /*timeout*/ da_default_timeout * 1000);
 		} else {
 			/*
 			 * Note that in this case, even though we can
 			 * technically use NCQ, we don't bother for several
 			 * reasons:
 			 * 1. It hasn't been tested on a SAT layer that
 			 *    supports it.  This is new as of SAT-4.
 			 * 2. Even when there is a SAT layer that supports
 			 *    it, that SAT layer will also probably support
 			 *    ZBC -> ZAC translation, since they are both
 			 *    in the SAT-4 spec.
 			 * 3. Translation will likely be preferable to ATA
 			 *    passthrough.  LSI / Avago at least single
 			 *    steps ATA passthrough commands in the HBA,
 			 *    regardless of protocol, so unless that
 			 *    changes, there is a performance penalty for
 			 *    doing ATA passthrough no matter whether
 			 *    you're using NCQ/FPDMA, DMA or PIO.
 			 * 4. It requires a 32-byte CDB, which at least at
 			 *    this point in CAM requires a CDB pointer, which
 			 *    would require us to allocate an additional bit
 			 *    of storage separate from the CCB.
 			 */
 			error = scsi_ata_zac_mgmt_out(&ccb->csio,
 			    /*retries*/ da_retry_count,
 			    /*cbfcnp*/ dadone,
 			    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 			    /*use_ncq*/ 0,
 			    /*zm_action*/ zone_sa,
 			    /*zone_id*/ lba,
 			    /*zone_flags*/ zone_flags,
 			    /*data_ptr*/ NULL,
 			    /*dxfer_len*/ 0,
 			    /*cdb_storage*/ NULL,
 			    /*cdb_storage_len*/ 0,
 			    /*sense_len*/ SSD_FULL_SIZE,
 			    /*timeout*/ da_default_timeout * 1000);
 			if (error != 0) {
 				error = EINVAL;
 				xpt_print(periph->path,
 				    "scsi_ata_zac_mgmt_out() returned an "
 				    "error!");
 				goto bailout;
 			}
 		}
 		*queue_ccb = 1;
 
 		break;
 	}
 	case DISK_ZONE_REPORT_ZONES: {
 		uint8_t *rz_ptr;
 		uint32_t num_entries, alloc_size;
 		struct disk_zone_report *rep;
 
 		rep = &bp->bio_zone.zone_params.report;
 
 		num_entries = rep->entries_allocated;
 		if (num_entries == 0) {
 			xpt_print(periph->path, "No entries allocated for "
 			    "Report Zones request\n");
 			error = EINVAL;
 			goto bailout;
 		}
 		alloc_size = sizeof(struct scsi_report_zones_hdr) +
 		    (sizeof(struct scsi_report_zones_desc) * num_entries);
 		alloc_size = min(alloc_size, softc->disk->d_maxsize);
 		rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO);
 		if (rz_ptr == NULL) {
 			xpt_print(periph->path, "Unable to allocate memory "
 			   "for Report Zones request\n");
 			error = ENOMEM;
 			goto bailout;
 		}
 		
 		if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
 			scsi_zbc_in(&ccb->csio,
 				    /*retries*/ da_retry_count,
 				    /*cbcfnp*/ dadone,
 				    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 				    /*service_action*/ ZBC_IN_SA_REPORT_ZONES,
 				    /*zone_start_lba*/ rep->starting_id,
 				    /*zone_options*/ rep->rep_options,
 				    /*data_ptr*/ rz_ptr,
 				    /*dxfer_len*/ alloc_size,
 				    /*sense_len*/ SSD_FULL_SIZE,
 				    /*timeout*/ da_default_timeout * 1000);
 		} else {
 			/*
 			 * Note that in this case, even though we can
 			 * technically use NCQ, we don't bother for several
 			 * reasons:
 			 * 1. It hasn't been tested on a SAT layer that
 			 *    supports it.  This is new as of SAT-4.
 			 * 2. Even when there is a SAT layer that supports
 			 *    it, that SAT layer will also probably support
 			 *    ZBC -> ZAC translation, since they are both
 			 *    in the SAT-4 spec.
 			 * 3. Translation will likely be preferable to ATA
 			 *    passthrough.  LSI / Avago at least single
 			 *    steps ATA passthrough commands in the HBA,
 			 *    regardless of protocol, so unless that
 			 *    changes, there is a performance penalty for
 			 *    doing ATA passthrough no matter whether
 			 *    you're using NCQ/FPDMA, DMA or PIO.
 			 * 4. It requires a 32-byte CDB, which at least at
 			 *    this point in CAM requires a CDB pointer, which
 			 *    would require us to allocate an additional bit
 			 *    of storage separate from the CCB.
 			 */
 			error = scsi_ata_zac_mgmt_in(&ccb->csio,
 			    /*retries*/ da_retry_count,
 			    /*cbcfnp*/ dadone,
 			    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 			    /*use_ncq*/ 0,
 			    /*zm_action*/ ATA_ZM_REPORT_ZONES,
 			    /*zone_id*/ rep->starting_id,
 			    /*zone_flags*/ rep->rep_options,
 			    /*data_ptr*/ rz_ptr,
 			    /*dxfer_len*/ alloc_size,
 			    /*cdb_storage*/ NULL,
 			    /*cdb_storage_len*/ 0,
 			    /*sense_len*/ SSD_FULL_SIZE,
 			    /*timeout*/ da_default_timeout * 1000);
 			if (error != 0) {
 				error = EINVAL;
 				xpt_print(periph->path,
 				    "scsi_ata_zac_mgmt_in() returned an "
 				    "error!");
 				goto bailout;
 			}
 		}
 
 		/*
 		 * For BIO_ZONE, this isn't normally needed.  However, it
 		 * is used by devstat_end_transaction_bio() to determine
 		 * how much data was transferred.
 		 */
 		/*
 		 * XXX KDM we have a problem.  But I'm not sure how to fix
 		 * it.  devstat uses bio_bcount - bio_resid to calculate
 		 * the amount of data transferred.   The GEOM disk code
 		 * uses bio_length - bio_resid to calculate the amount of
 		 * data in bio_completed.  We have different structure
 		 * sizes above and below the ada(4) driver.  So, if we
 		 * use the sizes above, the amount transferred won't be
 		 * quite accurate for devstat.  If we use different sizes
 		 * for bio_bcount and bio_length (above and below
 		 * respectively), then the residual needs to match one or
 		 * the other.  Everything is calculated after the bio
 		 * leaves the driver, so changing the values around isn't
 		 * really an option.  For now, just set the count to the
 		 * passed in length.  This means that the calculations
 		 * above (e.g. bio_completed) will be correct, but the
 		 * amount of data reported to devstat will be slightly
 		 * under or overstated.
 		 */
 		bp->bio_bcount = bp->bio_length;
 
 		*queue_ccb = 1;
 
 		break;
 	}
 	case DISK_ZONE_GET_PARAMS: {
 		struct disk_zone_disk_params *params;
 
 		params = &bp->bio_zone.zone_params.disk_params;
 		bzero(params, sizeof(*params));
 
 		switch (softc->zone_mode) {
 		case DA_ZONE_DRIVE_MANAGED:
 			params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
 			break;
 		case DA_ZONE_HOST_AWARE:
 			params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
 			break;
 		case DA_ZONE_HOST_MANAGED:
 			params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
 			break;
 		default:
 		case DA_ZONE_NONE:
 			params->zone_mode = DISK_ZONE_MODE_NONE;
 			break;
 		}
 
 		if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ)
 			params->flags |= DISK_ZONE_DISK_URSWRZ;
 
 		if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) {
 			params->optimal_seq_zones = softc->optimal_seq_zones;
 			params->flags |= DISK_ZONE_OPT_SEQ_SET;
 		}
 
 		if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) {
 			params->optimal_nonseq_zones =
 			    softc->optimal_nonseq_zones;
 			params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
 		}
 
 		if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) {
 			params->max_seq_zones = softc->max_seq_zones;
 			params->flags |= DISK_ZONE_MAX_SEQ_SET;
 		}
 		if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP)
 			params->flags |= DISK_ZONE_RZ_SUP;
 
 		if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP)
 			params->flags |= DISK_ZONE_OPEN_SUP;
 
 		if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP)
 			params->flags |= DISK_ZONE_CLOSE_SUP;
 
 		if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP)
 			params->flags |= DISK_ZONE_FINISH_SUP;
 
 		if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP)
 			params->flags |= DISK_ZONE_RWP_SUP;
 		break;
 	}
 	default:
 		break;
 	}
 bailout:
 	return (error);
 }
 
 static void
 dastart(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct da_softc *softc;
 
 	softc = (struct da_softc *)periph->softc;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n"));
 
 skipstate:
 	switch (softc->state) {
 	case DA_STATE_NORMAL:
 	{
 		struct bio *bp;
 		uint8_t tag_code;
 
 more:
 		bp = cam_iosched_next_bio(softc->cam_iosched);
 		if (bp == NULL) {
 			if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
 				cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
 				scsi_test_unit_ready(&start_ccb->csio,
 				     /*retries*/ da_retry_count,
 				     dadone,
 				     MSG_SIMPLE_Q_TAG,
 				     SSD_FULL_SIZE,
 				     da_default_timeout * 1000);
 				start_ccb->ccb_h.ccb_bp = NULL;
 				start_ccb->ccb_h.ccb_state = DA_CCB_TUR;
 				xpt_action(start_ccb);
 			} else
 				xpt_release_ccb(start_ccb);
 			break;
 		}
 
 		if (bp->bio_cmd == BIO_DELETE) {
 			if (softc->delete_func != NULL) {
 				softc->delete_func(periph, start_ccb, bp);
 				goto out;
 			} else {
 				/* Not sure this is possible, but failsafe by lying and saying "sure, done." */
 				biofinish(bp, NULL, 0);
 				goto more;
 			}
 		}
 
 		if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
 			cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
 			cam_periph_release_locked(periph);	/* XXX is this still valid? I think so but unverified */
 		}
 
 		if ((bp->bio_flags & BIO_ORDERED) != 0 ||
 		    (softc->flags & DA_FLAG_NEED_OTAG) != 0) {
 			softc->flags &= ~DA_FLAG_NEED_OTAG;
 			softc->flags |= DA_FLAG_WAS_OTAG;
 			tag_code = MSG_ORDERED_Q_TAG;
 		} else {
 			tag_code = MSG_SIMPLE_Q_TAG;
 		}
 
 		switch (bp->bio_cmd) {
 		case BIO_WRITE:
 		case BIO_READ:
 		{
 			void *data_ptr;
 			int rw_op;
 
+			biotrack(bp, __func__);
+
 			if (bp->bio_cmd == BIO_WRITE) {
 				softc->flags |= DA_FLAG_DIRTY;
 				rw_op = SCSI_RW_WRITE;
 			} else {
 				rw_op = SCSI_RW_READ;
 			}
 
 			data_ptr = bp->bio_data;
 			if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
 				rw_op |= SCSI_RW_BIO;
 				data_ptr = bp;
 			}
 
 			scsi_read_write(&start_ccb->csio,
 					/*retries*/da_retry_count,
 					/*cbfcnp*/dadone,
 					/*tag_action*/tag_code,
 					rw_op,
 					/*byte2*/0,
 					softc->minimum_cmd_size,
 					/*lba*/bp->bio_pblkno,
 					/*block_count*/bp->bio_bcount /
 					softc->params.secsize,
 					data_ptr,
 					/*dxfer_len*/ bp->bio_bcount,
 					/*sense_len*/SSD_FULL_SIZE,
 					da_default_timeout * 1000);
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+			start_ccb->csio.bio = bp;
+#endif
 			break;
 		}
 		case BIO_FLUSH:
 			/*
 			 * BIO_FLUSH doesn't currently communicate
 			 * range data, so we synchronize the cache
 			 * over the whole disk.  We also force
 			 * ordered tag semantics the flush applies
 			 * to all previously queued I/O.
 			 */
 			scsi_synchronize_cache(&start_ccb->csio,
 					       /*retries*/1,
 					       /*cbfcnp*/dadone,
 					       MSG_ORDERED_Q_TAG,
 					       /*begin_lba*/0,
 					       /*lb_count*/0,
 					       SSD_FULL_SIZE,
 					       da_default_timeout*1000);
 			break;
 		case BIO_ZONE: {
 			int error, queue_ccb;
 
 			queue_ccb = 0;
 
 			error = da_zone_cmd(periph, start_ccb, bp,&queue_ccb);
 			if ((error != 0)
 			 || (queue_ccb == 0)) {
 				biofinish(bp, NULL, error);
 				xpt_release_ccb(start_ccb);
 				return;
 			}
 			break;
 		}
 		}
 		start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
 		start_ccb->ccb_h.flags |= CAM_UNLOCKED;
 		start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout);
 
 out:
 		LIST_INSERT_HEAD(&softc->pending_ccbs,
 				 &start_ccb->ccb_h, periph_links.le);
 
 		/* We expect a unit attention from this device */
 		if ((softc->flags & DA_FLAG_RETRY_UA) != 0) {
 			start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA;
 			softc->flags &= ~DA_FLAG_RETRY_UA;
 		}
 
 		start_ccb->ccb_h.ccb_bp = bp;
 		softc->refcount++;
 		cam_periph_unlock(periph);
 		xpt_action(start_ccb);
 		cam_periph_lock(periph);
 		softc->refcount--;
 
 		/* May have more work to do, so ensure we stay scheduled */
 		daschedule(periph);
 		break;
 	}
 	case DA_STATE_PROBE_RC:
 	{
 		struct scsi_read_capacity_data *rcap;
 
 		rcap = (struct scsi_read_capacity_data *)
 		    malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (rcap == NULL) {
 			printf("dastart: Couldn't malloc read_capacity data\n");
 			/* da_free_periph??? */
 			break;
 		}
 		scsi_read_capacity(&start_ccb->csio,
 				   /*retries*/da_retry_count,
 				   dadone,
 				   MSG_SIMPLE_Q_TAG,
 				   rcap,
 				   SSD_FULL_SIZE,
 				   /*timeout*/5000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_RC16:
 	{
 		struct scsi_read_capacity_data_long *rcaplong;
 
 		rcaplong = (struct scsi_read_capacity_data_long *)
 			malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (rcaplong == NULL) {
 			printf("dastart: Couldn't malloc read_capacity data\n");
 			/* da_free_periph??? */
 			break;
 		}
 		scsi_read_capacity_16(&start_ccb->csio,
 				      /*retries*/ da_retry_count,
 				      /*cbfcnp*/ dadone,
 				      /*tag_action*/ MSG_SIMPLE_Q_TAG,
 				      /*lba*/ 0,
 				      /*reladr*/ 0,
 				      /*pmi*/ 0,
 				      /*rcap_buf*/ (uint8_t *)rcaplong,
 				      /*rcap_buf_len*/ sizeof(*rcaplong),
 				      /*sense_len*/ SSD_FULL_SIZE,
 				      /*timeout*/ da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_LBP:
 	{
 		struct scsi_vpd_logical_block_prov *lbp;
 
 		if (!scsi_vpd_supported_page(periph, SVPD_LBP)) {
 			/*
 			 * If we get here we don't support any SBC-3 delete
 			 * methods with UNMAP as the Logical Block Provisioning
 			 * VPD page support is required for devices which
 			 * support it according to T10/1799-D Revision 31
 			 * however older revisions of the spec don't mandate
 			 * this so we currently don't remove these methods
 			 * from the available set.
 			 */
 			softc->state = DA_STATE_PROBE_BLK_LIMITS;
 			goto skipstate;
 		}
 
 		lbp = (struct scsi_vpd_logical_block_prov *)
 			malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO);
 
 		if (lbp == NULL) {
 			printf("dastart: Couldn't malloc lbp data\n");
 			/* da_free_periph??? */
 			break;
 		}
 
 		scsi_inquiry(&start_ccb->csio,
 			     /*retries*/da_retry_count,
 			     /*cbfcnp*/dadone,
 			     /*tag_action*/MSG_SIMPLE_Q_TAG,
 			     /*inq_buf*/(u_int8_t *)lbp,
 			     /*inq_len*/sizeof(*lbp),
 			     /*evpd*/TRUE,
 			     /*page_code*/SVPD_LBP,
 			     /*sense_len*/SSD_MIN_SIZE,
 			     /*timeout*/da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_BLK_LIMITS:
 	{
 		struct scsi_vpd_block_limits *block_limits;
 
 		if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) {
 			/* Not supported skip to next probe */
 			softc->state = DA_STATE_PROBE_BDC;
 			goto skipstate;
 		}
 
 		block_limits = (struct scsi_vpd_block_limits *)
 			malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO);
 
 		if (block_limits == NULL) {
 			printf("dastart: Couldn't malloc block_limits data\n");
 			/* da_free_periph??? */
 			break;
 		}
 
 		scsi_inquiry(&start_ccb->csio,
 			     /*retries*/da_retry_count,
 			     /*cbfcnp*/dadone,
 			     /*tag_action*/MSG_SIMPLE_Q_TAG,
 			     /*inq_buf*/(u_int8_t *)block_limits,
 			     /*inq_len*/sizeof(*block_limits),
 			     /*evpd*/TRUE,
 			     /*page_code*/SVPD_BLOCK_LIMITS,
 			     /*sense_len*/SSD_MIN_SIZE,
 			     /*timeout*/da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_BDC:
 	{
 		struct scsi_vpd_block_characteristics *bdc;
 
 		if (!scsi_vpd_supported_page(periph, SVPD_BDC)) {
 			softc->state = DA_STATE_PROBE_ATA;
 			goto skipstate;
 		}
 
 		bdc = (struct scsi_vpd_block_characteristics *)
 			malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
 
 		if (bdc == NULL) {
 			printf("dastart: Couldn't malloc bdc data\n");
 			/* da_free_periph??? */
 			break;
 		}
 
 		scsi_inquiry(&start_ccb->csio,
 			     /*retries*/da_retry_count,
 			     /*cbfcnp*/dadone,
 			     /*tag_action*/MSG_SIMPLE_Q_TAG,
 			     /*inq_buf*/(u_int8_t *)bdc,
 			     /*inq_len*/sizeof(*bdc),
 			     /*evpd*/TRUE,
 			     /*page_code*/SVPD_BDC,
 			     /*sense_len*/SSD_MIN_SIZE,
 			     /*timeout*/da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_ATA:
 	{
 		struct ata_params *ata_params;
 
 		if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
 			if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
 			 || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
 				/*
 				 * Note that if the ATA VPD page isn't
 				 * supported, we aren't talking to an ATA
 				 * device anyway.  Support for that VPD
 				 * page is mandatory for SCSI to ATA (SAT)
 				 * translation layers.
 				 */
 				softc->state = DA_STATE_PROBE_ZONE;
 				goto skipstate;
 			}
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		ata_params = (struct ata_params*)
 			malloc(sizeof(*ata_params), M_SCSIDA,M_NOWAIT|M_ZERO);
 
 		if (ata_params == NULL) {
 			xpt_print(periph->path, "Couldn't malloc ata_params "
 			    "data\n");
 			/* da_free_periph??? */
 			break;
 		}
 
 		scsi_ata_identify(&start_ccb->csio,
 				  /*retries*/da_retry_count,
 				  /*cbfcnp*/dadone,
                                   /*tag_action*/MSG_SIMPLE_Q_TAG,
 				  /*data_ptr*/(u_int8_t *)ata_params,
 				  /*dxfer_len*/sizeof(*ata_params),
 				  /*sense_len*/SSD_FULL_SIZE,
 				  /*timeout*/da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_ATA_LOGDIR:
 	{
 		struct ata_gp_log_dir *log_dir;
 		int retval;
 
 		retval = 0;
 
 		if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) {
 			/*
 			 * If we don't have log support, not much point in
 			 * trying to probe zone support.
 			 */
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		/*
 		 * If we have an ATA device (the SCSI ATA Information VPD
 		 * page should be present and the ATA identify should have
 		 * succeeded) and it supports logs, ask for the log directory.
 		 */
 
 		log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (log_dir == NULL) {
 			xpt_print(periph->path, "Couldn't malloc log_dir "
 			    "data\n");
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		retval = scsi_ata_read_log(&start_ccb->csio,
 		    /*retries*/ da_retry_count,
 		    /*cbfcnp*/ dadone,
 		    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 		    /*log_address*/ ATA_LOG_DIRECTORY,
 		    /*page_number*/ 0,
 		    /*block_count*/ 1,
 		    /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
 				 AP_PROTO_DMA : AP_PROTO_PIO_IN,
 		    /*data_ptr*/ (uint8_t *)log_dir,
 		    /*dxfer_len*/ sizeof(*log_dir),
 		    /*sense_len*/ SSD_FULL_SIZE,
 		    /*timeout*/ da_default_timeout * 1000);
 
 		if (retval != 0) {
 			xpt_print(periph->path, "scsi_ata_read_log() failed!");
 			free(log_dir, M_SCSIDA);
 			daprobedone(periph, start_ccb);
 			break;
 		}
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_ATA_IDDIR:
 	{
 		struct ata_identify_log_pages *id_dir;
 		int retval;
 
 		retval = 0;
 
 		/*
 		 * Check here to see whether the Identify Device log is
 		 * supported in the directory of logs.  If so, continue
 		 * with requesting the log of identify device pages.
 		 */
 		if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) {
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO);
 		if (id_dir == NULL) {
 			xpt_print(periph->path, "Couldn't malloc id_dir "
 			    "data\n");
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		retval = scsi_ata_read_log(&start_ccb->csio,
 		    /*retries*/ da_retry_count,
 		    /*cbfcnp*/ dadone,
 		    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 		    /*log_address*/ ATA_IDENTIFY_DATA_LOG,
 		    /*page_number*/ ATA_IDL_PAGE_LIST,
 		    /*block_count*/ 1,
 		    /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
 				 AP_PROTO_DMA : AP_PROTO_PIO_IN,
 		    /*data_ptr*/ (uint8_t *)id_dir,
 		    /*dxfer_len*/ sizeof(*id_dir),
 		    /*sense_len*/ SSD_FULL_SIZE,
 		    /*timeout*/ da_default_timeout * 1000);
 
 		if (retval != 0) {
 			xpt_print(periph->path, "scsi_ata_read_log() failed!");
 			free(id_dir, M_SCSIDA);
 			daprobedone(periph, start_ccb);
 			break;
 		}
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_ATA_SUP:
 	{
 		struct ata_identify_log_sup_cap *sup_cap;
 		int retval;
 
 		retval = 0;
 
 		/*
 		 * Check here to see whether the Supported Capabilities log
 		 * is in the list of Identify Device logs.
 		 */
 		if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) {
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (sup_cap == NULL) {
 			xpt_print(periph->path, "Couldn't malloc sup_cap "
 			    "data\n");
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		retval = scsi_ata_read_log(&start_ccb->csio,
 		    /*retries*/ da_retry_count,
 		    /*cbfcnp*/ dadone,
 		    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 		    /*log_address*/ ATA_IDENTIFY_DATA_LOG,
 		    /*page_number*/ ATA_IDL_SUP_CAP,
 		    /*block_count*/ 1,
 		    /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
 				 AP_PROTO_DMA : AP_PROTO_PIO_IN,
 		    /*data_ptr*/ (uint8_t *)sup_cap,
 		    /*dxfer_len*/ sizeof(*sup_cap),
 		    /*sense_len*/ SSD_FULL_SIZE,
 		    /*timeout*/ da_default_timeout * 1000);
 
 		if (retval != 0) {
 			xpt_print(periph->path, "scsi_ata_read_log() failed!");
 			free(sup_cap, M_SCSIDA);
 			daprobedone(periph, start_ccb);
 			break;
 
 		}
 
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE_ATA_ZONE:
 	{
 		struct ata_zoned_info_log *ata_zone;
 		int retval;
 
 		retval = 0;
 
 		/*
 		 * Check here to see whether the zoned device information
 		 * page is supported.  If so, continue on to request it.
 		 * If not, skip to DA_STATE_PROBE_LOG or done.
 		 */
 		if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) {
 			daprobedone(periph, start_ccb);
 			break;
 		}
 		ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA,
 				  M_NOWAIT|M_ZERO);
 		if (ata_zone == NULL) {
 			xpt_print(periph->path, "Couldn't malloc ata_zone "
 			    "data\n");
 			daprobedone(periph, start_ccb);
 			break;
 		}
 
 		retval = scsi_ata_read_log(&start_ccb->csio,
 		    /*retries*/ da_retry_count,
 		    /*cbfcnp*/ dadone,
 		    /*tag_action*/ MSG_SIMPLE_Q_TAG,
 		    /*log_address*/ ATA_IDENTIFY_DATA_LOG,
 		    /*page_number*/ ATA_IDL_ZDI,
 		    /*block_count*/ 1,
 		    /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
 				 AP_PROTO_DMA : AP_PROTO_PIO_IN,
 		    /*data_ptr*/ (uint8_t *)ata_zone,
 		    /*dxfer_len*/ sizeof(*ata_zone),
 		    /*sense_len*/ SSD_FULL_SIZE,
 		    /*timeout*/ da_default_timeout * 1000);
 
 		if (retval != 0) {
 			xpt_print(periph->path, "scsi_ata_read_log() failed!");
 			free(ata_zone, M_SCSIDA);
 			daprobedone(periph, start_ccb);
 			break;
 		}
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE;
 		xpt_action(start_ccb);
 
 		break;
 	}
 	case DA_STATE_PROBE_ZONE:
 	{
 		struct scsi_vpd_zoned_bdc *bdc;
 
 		/*
 		 * Note that this page will be supported for SCSI protocol
 		 * devices that support ZBC (SMR devices), as well as ATA
 		 * protocol devices that are behind a SAT (SCSI to ATA
 		 * Translation) layer that supports converting ZBC commands
 		 * to their ZAC equivalents.
 		 */
 		if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) {
 			daprobedone(periph, start_ccb);
 			break;
 		}
 		bdc = (struct scsi_vpd_zoned_bdc *)
 			malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
 
 		if (bdc == NULL) {
 			xpt_release_ccb(start_ccb);
 			xpt_print(periph->path, "Couldn't malloc zone VPD "
 			    "data\n");
 			break;
 		}
 		scsi_inquiry(&start_ccb->csio,
 			     /*retries*/da_retry_count,
 			     /*cbfcnp*/dadone,
 			     /*tag_action*/MSG_SIMPLE_Q_TAG,
 			     /*inq_buf*/(u_int8_t *)bdc,
 			     /*inq_len*/sizeof(*bdc),
 			     /*evpd*/TRUE,
 			     /*page_code*/SVPD_ZONED_BDC,
 			     /*sense_len*/SSD_FULL_SIZE,
 			     /*timeout*/da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE;
 		xpt_action(start_ccb);
 		break;
 	}
 	}
 }
 
 /*
  * In each of the methods below, while it's the caller's
  * responsibility to ensure the request will fit into a
  * single device request, we might have changed the delete
  * method due to the device incorrectly advertising either
  * its supported methods or limits.
  *
  * To prevent this causing further issues we validate the
  * request against the method's limits and warn when it does
  * not fit, which would otherwise be unnecessary.
  */
 /*
  * Build a SCSI UNMAP request in ccb for bp, folding in further trim
  * requests handed out by cam_iosched_next_trim() for as long as the
  * descriptor count and the running LBA total stay within
  * softc->unmap_max_ranges and softc->unmap_max_lba.
  *
  * Every bio other than bp that is consumed here is appended to
  * softc->delete_run_queue; a bio that would not fit is handed back with
  * cam_iosched_put_back_trim().  On return the ccb state is
  * DA_CCB_DELETE, CAM_UNLOCKED is set in the ccb flags and
  * cam_iosched_submit_trim() has been called.  The ccb is not dispatched
  * by this function.
  */
 static void
 da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
 {
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 	struct bio *bp1;
 	uint8_t *buf = softc->unmap_buf;
 	uint64_t lba, lastlba = (uint64_t)-1;
 	uint64_t totalcount = 0;
 	uint64_t count;
 	uint32_t lastcount = 0, c;
 	uint32_t off, ranges = 0;
 
 	/*
 	 * Currently this doesn't take the UNMAP
 	 * Granularity and Granularity Alignment
 	 * fields into account.
 	 *
 	 * This could result in both suboptimal unmap
 	 * requests as well as UNMAP calls unmapping
 	 * fewer LBA's than requested.
 	 */
 
 	bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
 	bp1 = bp;
 	do {
 		/*
 		 * Note: ada and da are different in how they store the
 		 * pending bp's in a trim. ada stores all of them in the
 		 * trim_req.bps. da stores all but the first one in the
 		 * delete_run_queue. ada then completes all the bps in
 		 * its adadone() loop. da completes all the bps in the
 		 * delete_run_queue in dadone, and relies on the biodone
 		 * after to complete. This should be reconciled since there's
 		 * no real reason to do it differently. XXX
 		 */
 		if (bp1 != bp)
 			bioq_insert_tail(&softc->delete_run_queue, bp1);
 		lba = bp1->bio_pblkno;
 		count = bp1->bio_bcount / softc->params.secsize;
 
 		/*
 		 * Try to extend the previous range: when this bio starts
 		 * exactly where the last descriptor ended, grow that
 		 * descriptor's block count instead of using a new one.
 		 */
 		if (lba == lastlba) {
 			c = omin(count, UNMAP_RANGE_MAX - lastcount);
 			lastcount += c;
 			off = ((ranges - 1) * UNMAP_RANGE_SIZE) +
 			      UNMAP_HEAD_SIZE;
 			scsi_ulto4b(lastcount, &buf[off + 8]);
 			count -= c;
 			lba += c;
 			totalcount += c;
 		}
 
 		/* Emit new descriptors for whatever is left of this bio. */
 		while (count > 0) {
 			c = omin(count, UNMAP_RANGE_MAX);
 			if (totalcount + c > softc->unmap_max_lba ||
 			    ranges >= softc->unmap_max_ranges) {
 				/*
 				 * Cast everything to uintmax_t so the
 				 * arguments match the conversion
 				 * specifiers regardless of the width of
 				 * long or of the softc limit fields.
 				 */
 				xpt_print(periph->path,
 				    "%s issuing short delete %ju > %ju "
 				    "|| %ju >= %ju\n",
 				    da_delete_method_desc[softc->delete_method],
 				    (uintmax_t)(totalcount + c),
 				    (uintmax_t)softc->unmap_max_lba,
 				    (uintmax_t)ranges,
 				    (uintmax_t)softc->unmap_max_ranges);
 				break;
 			}
 			off = (ranges * UNMAP_RANGE_SIZE) + UNMAP_HEAD_SIZE;
 			scsi_u64to8b(lba, &buf[off + 0]);
 			scsi_ulto4b(c, &buf[off + 8]);
 			lba += c;
 			totalcount += c;
 			ranges++;
 			count -= c;
 			lastcount = c;
 		}
 		lastlba = lba;
 		bp1 = cam_iosched_next_trim(softc->cam_iosched);
 		if (bp1 == NULL)
 			break;
 		/* Hand the next bio back if it cannot fit in this request. */
 		if (ranges >= softc->unmap_max_ranges ||
 		    totalcount + bp1->bio_bcount /
 		    softc->params.secsize > softc->unmap_max_lba) {
 			cam_iosched_put_back_trim(softc->cam_iosched, bp1);
 			break;
 		}
 	} while (1);
 	/*
 	 * Fill in the two 16-bit length fields at the start of the
 	 * parameter list.  NOTE(review): presumably the UNMAP data length
 	 * and block descriptor data length from SBC-3 -- confirm.
 	 */
 	scsi_ulto2b(ranges * 16 + 6, &buf[0]);
 	scsi_ulto2b(ranges * 16, &buf[2]);
 
 	scsi_unmap(&ccb->csio,
 		   /*retries*/da_retry_count,
 		   /*cbfcnp*/dadone,
 		   /*tag_action*/MSG_SIMPLE_Q_TAG,
 		   /*byte2*/0,
 		   /*data_ptr*/ buf,
 		   /*dxfer_len*/ ranges * 16 + 8,
 		   /*sense_len*/SSD_FULL_SIZE,
 		   da_default_timeout * 1000);
 	ccb->ccb_h.ccb_state = DA_CCB_DELETE;
 	ccb->ccb_h.flags |= CAM_UNLOCKED;
 	cam_iosched_submit_trim(softc->cam_iosched);
 }
 
 /*
  * Build an ATA TRIM request (issued through scsi_ata_trim()) in ccb for
  * bp, folding in further trim requests handed out by
  * cam_iosched_next_trim() while free range entries remain.
  *
  * Each range entry written to softc->unmap_buf is 8 bytes: bytes 0-5
  * hold the low 48 bits of the starting LBA, least significant byte
  * first, and bytes 6-7 hold the block count, least significant byte
  * first.  Every bio other than bp that is consumed here is appended to
  * softc->delete_run_queue; a bio that would not fit is handed back with
  * cam_iosched_put_back_trim().  The ccb is not dispatched by this
  * function.
  */
 static void
 da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
 {
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 	struct bio *bp1;
 	uint8_t *buf = softc->unmap_buf;
 	uint64_t lastlba = (uint64_t)-1;
 	uint64_t count;
 	uint64_t lba;
 	uint32_t lastcount = 0, c, requestcount;
 	int ranges = 0, off, block_count;
 
 	bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
 	bp1 = bp;
 	do {
 		if (bp1 != bp)//XXX imp XXX
 			bioq_insert_tail(&softc->delete_run_queue, bp1);
 		lba = bp1->bio_pblkno;
 		count = bp1->bio_bcount / softc->params.secsize;
 		requestcount = count;
 
 		/*
 		 * Try to extend the previous range: when this bio starts
 		 * exactly where the last entry ended, grow that entry's
 		 * block count instead of using a new one.
 		 */
 		if (lba == lastlba) {
 			c = omin(count, ATA_DSM_RANGE_MAX - lastcount);
 			lastcount += c;
 			off = (ranges - 1) * 8;
 			buf[off + 6] = lastcount & 0xff;
 			buf[off + 7] = (lastcount >> 8) & 0xff;
 			count -= c;
 			lba += c;
 		}
 
 		/* Emit new range entries for the rest of this bio. */
 		while (count > 0) {
 			c = omin(count, ATA_DSM_RANGE_MAX);
 			off = ranges * 8;
 
 			buf[off + 0] = lba & 0xff;
 			buf[off + 1] = (lba >> 8) & 0xff;
 			buf[off + 2] = (lba >> 16) & 0xff;
 			buf[off + 3] = (lba >> 24) & 0xff;
 			buf[off + 4] = (lba >> 32) & 0xff;
 			buf[off + 5] = (lba >> 40) & 0xff;
 			buf[off + 6] = c & 0xff;
 			buf[off + 7] = (c >> 8) & 0xff;
 			lba += c;
 			ranges++;
 			count -= c;
 			lastcount = c;
 			if (count != 0 && ranges == softc->trim_max_ranges) {
 				/*
 				 * Cast the arguments so they match the
 				 * conversion specifiers; the originals
 				 * were narrower than the long that %ld
 				 * expects.
 				 *
 				 * NOTE(review): ranges equals
 				 * trim_max_ranges here, so the second
 				 * value printed is always 0 -- confirm
 				 * whether the total capacity was meant.
 				 */
 				xpt_print(periph->path,
 				    "%s issuing short delete %ju > %jd\n",
 				    da_delete_method_desc[softc->delete_method],
 				    (uintmax_t)requestcount,
 				    (intmax_t)((softc->trim_max_ranges -
 				    ranges) * ATA_DSM_RANGE_MAX));
 				break;
 			}
 		}
 		lastlba = lba;
 		bp1 = cam_iosched_next_trim(softc->cam_iosched);
 		if (bp1 == NULL)
 			break;
 		/* Hand the next bio back if the free entries can't hold it. */
 		if (bp1->bio_bcount / softc->params.secsize >
 		    (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
 			cam_iosched_put_back_trim(softc->cam_iosched, bp1);
 			break;
 		}
 	} while (1);
 
 	/* The payload is sent in whole blocks of ATA_DSM_BLK_RANGES entries. */
 	block_count = howmany(ranges, ATA_DSM_BLK_RANGES);
 	scsi_ata_trim(&ccb->csio,
 		      /*retries*/da_retry_count,
 		      /*cbfcnp*/dadone,
 		      /*tag_action*/MSG_SIMPLE_Q_TAG,
 		      block_count,
 		      /*data_ptr*/buf,
 		      /*dxfer_len*/block_count * ATA_DSM_BLK_SIZE,
 		      /*sense_len*/SSD_FULL_SIZE,
 		      da_default_timeout * 1000);
 	ccb->ccb_h.ccb_state = DA_CCB_DELETE;
 	ccb->ccb_h.flags |= CAM_UNLOCKED;
 	cam_iosched_submit_trim(softc->cam_iosched);
 }
 
 /*
  * Build a SCSI WRITE SAME request in ccb that covers bp plus any
  * directly following trim requests handed out by
  * cam_iosched_next_trim() whose starting block continues the run.
  *
  * We calculate ws_max_blks here based off d_delmaxsize instead
  * of using softc->ws_max_blks as it is absolute max for the
  * device not the protocol max which may well be lower.
  *
  * Every bio other than bp that is consumed here is appended to
  * softc->delete_run_queue; a bio that is not contiguous or would exceed
  * ws_max_blks is handed back with cam_iosched_put_back_trim().  The ccb
  * is not dispatched by this function.
  */
 static void
 da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
 {
 	struct da_softc *softc;
 	struct bio *bp1;
 	uint64_t ws_max_blks;
 	uint64_t lba;
 	uint64_t count; /* forward compat with WS32 */
 
 	softc = (struct da_softc *)periph->softc;
 	ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize;
 	lba = bp->bio_pblkno;
 	count = 0;
 	bp1 = bp;
 	do {
 		if (bp1 != bp)//XXX imp XXX
 			bioq_insert_tail(&softc->delete_run_queue, bp1);
 		count += bp1->bio_bcount / softc->params.secsize;
 		if (count > ws_max_blks) {
 			/*
 			 * Both values are 64 bits wide; print them via
 			 * uintmax_t so the output is also right where
 			 * long is only 32 bits.
 			 */
 			xpt_print(periph->path,
 			    "%s issuing short delete %ju > %ju\n",
 			    da_delete_method_desc[softc->delete_method],
 			    (uintmax_t)count, (uintmax_t)ws_max_blks);
 			/* Clamp the request to what one command may carry. */
 			count = omin(count, ws_max_blks);
 			break;
 		}
 		bp1 = cam_iosched_next_trim(softc->cam_iosched);
 		if (bp1 == NULL)
 			break;
 		/* Only merge a bio that continues the run and still fits. */
 		if (lba + count != bp1->bio_pblkno ||
 		    count + bp1->bio_bcount /
 		    softc->params.secsize > ws_max_blks) {
 			cam_iosched_put_back_trim(softc->cam_iosched, bp1);
 			break;
 		}
 	} while (1);
 
 	/*
 	 * DA_DELETE_ZERO sends byte2 as 0, every other method sets
 	 * SWS_UNMAP; DA_DELETE_WS16 selects the 16 byte CDB, everything
 	 * else the 10 byte one.  The data buffer is one sector taken from
 	 * zero_region.
 	 */
 	scsi_write_same(&ccb->csio,
 			/*retries*/da_retry_count,
 			/*cbfcnp*/dadone,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*byte2*/softc->delete_method ==
 			    DA_DELETE_ZERO ? 0 : SWS_UNMAP,
 			softc->delete_method == DA_DELETE_WS16 ? 16 : 10,
 			/*lba*/lba,
 			/*block_count*/count,
 			/*data_ptr*/ __DECONST(void *, zero_region),
 			/*dxfer_len*/ softc->params.secsize,
 			/*sense_len*/SSD_FULL_SIZE,
 			da_default_timeout * 1000);
 	ccb->ccb_h.ccb_state = DA_CCB_DELETE;
 	ccb->ccb_h.flags |= CAM_UNLOCKED;
 	cam_iosched_submit_trim(softc->cam_iosched);
 }
 
 static int
 cmd6workaround(union ccb *ccb)
 {
 	struct scsi_rw_6 cmd6;
 	struct scsi_rw_10 *cmd10;
 	struct da_softc *softc;
 	u_int8_t *cdb;
 	struct bio *bp;
 	int frozen;
 
 	cdb = ccb->csio.cdb_io.cdb_bytes;
 	softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
 
 	if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
 		da_delete_methods old_method = softc->delete_method;
 
 		/*
 		 * Typically there are two reasons for failure here
 		 * 1. Delete method was detected as supported but isn't
 		 * 2. Delete failed due to invalid params e.g. too big
 		 *
 		 * While we will attempt to choose an alternative delete method
 		 * this may result in short deletes if the existing delete
 		 * requests from geom are big for the new method chosen.
 		 *
 		 * This method assumes that the error which triggered this
 		 * will not retry the io otherwise a panic will occur
 		 */
 		dadeleteflag(softc, old_method, 0);
 		dadeletemethodchoose(softc, DA_DELETE_DISABLE);
 		if (softc->delete_method == DA_DELETE_DISABLE)
 			xpt_print(ccb->ccb_h.path,
 				  "%s failed, disabling BIO_DELETE\n",
 				  da_delete_method_desc[old_method]);
 		else
 			xpt_print(ccb->ccb_h.path,
 				  "%s failed, switching to %s BIO_DELETE\n",
 				  da_delete_method_desc[old_method],
 				  da_delete_method_desc[softc->delete_method]);
 
 		while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL)
 			cam_iosched_queue_work(softc->cam_iosched, bp);
 		cam_iosched_queue_work(softc->cam_iosched,
 		    (struct bio *)ccb->ccb_h.ccb_bp);
 		ccb->ccb_h.ccb_bp = NULL;
 		return (0);
 	}
 
 	/* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */
 	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
 	    (*cdb == PREVENT_ALLOW) &&
 	    (softc->quirks & DA_Q_NO_PREVENT) == 0) {
 		if (bootverbose)
 			xpt_print(ccb->ccb_h.path,
 			    "PREVENT ALLOW MEDIUM REMOVAL not supported.\n");
 		softc->quirks |= DA_Q_NO_PREVENT;
 		return (0);
 	}
 
 	/* Detect unsupported SYNCHRONIZE CACHE(10). */
 	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
 	    (*cdb == SYNCHRONIZE_CACHE) &&
 	    (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
 		if (bootverbose)
 			xpt_print(ccb->ccb_h.path,
 			    "SYNCHRONIZE CACHE(10) not supported.\n");
 		softc->quirks |= DA_Q_NO_SYNC_CACHE;
 		softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE;
 		return (0);
 	}
 
 	/* Translation only possible if CDB is an array and cmd is R/W6 */
 	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
 	    (*cdb != READ_6 && *cdb != WRITE_6))
 		return 0;
 
 	xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
 	    "increasing minimum_cmd_size to 10.\n");
  	softc->minimum_cmd_size = 10;
 
 	bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
 	cmd10 = (struct scsi_rw_10 *)cdb;
 	cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
 	cmd10->byte2 = 0;
 	scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
 	cmd10->reserved = 0;
 	scsi_ulto2b(cmd6.length, cmd10->length);
 	cmd10->control = cmd6.control;
 	ccb->csio.cdb_len = sizeof(*cmd10);
 
 	/* Requeue request, unfreezing queue if necessary */
 	frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
  	ccb->ccb_h.status = CAM_REQUEUE_REQ;
 	xpt_action(ccb);
 	if (frozen) {
 		cam_release_devq(ccb->ccb_h.path,
 				 /*relsim_flags*/0,
 				 /*reduction*/0,
 				 /*timeout*/0,
 				 /*getcount_only*/0);
 	}
 	return (ERESTART);
 }
 
 static void
 dazonedone(struct cam_periph *periph, union ccb *ccb)
 {
 	struct da_softc *softc;
 	struct bio *bp;
 
 	softc = periph->softc;
 	bp = (struct bio *)ccb->ccb_h.ccb_bp;
 
 	switch (bp->bio_zone.zone_cmd) {
 	case DISK_ZONE_OPEN:
 	case DISK_ZONE_CLOSE:
 	case DISK_ZONE_FINISH:
 	case DISK_ZONE_RWP:
 		break;
 	case DISK_ZONE_REPORT_ZONES: {
 		uint32_t avail_len;
 		struct disk_zone_report *rep;
 		struct scsi_report_zones_hdr *hdr;
 		struct scsi_report_zones_desc *desc;
 		struct disk_zone_rep_entry *entry;
 		uint32_t num_alloced, hdr_len, num_avail;
 		uint32_t num_to_fill, i;
 		int ata;
 
 		rep = &bp->bio_zone.zone_params.report;
 		avail_len = ccb->csio.dxfer_len - ccb->csio.resid;
 		/*
 		 * Note that bio_resid isn't normally used for zone
 		 * commands, but it is used by devstat_end_transaction_bio()
 		 * to determine how much data was transferred.  Because
 		 * the size of the SCSI/ATA data structures is different
 		 * than the size of the BIO interface structures, the
 		 * amount of data actually transferred from the drive will
 		 * be different than the amount of data transferred to
 		 * the user.
 		 */
 		bp->bio_resid = ccb->csio.resid;
 		num_alloced = rep->entries_allocated;
 		hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr;
 		if (avail_len < sizeof(*hdr)) {
 			/*
 			 * Is there a better error than EIO here?  We asked
 			 * for at least the header, and we got less than
 			 * that.
 			 */
 			bp->bio_error = EIO;
 			bp->bio_flags |= BIO_ERROR;
 			bp->bio_resid = bp->bio_bcount;
 			break;
 		}
 
 		if (softc->zone_interface == DA_ZONE_IF_ATA_PASS)
 			ata = 1;
 		else
 			ata = 0;
 
 		hdr_len = ata ? le32dec(hdr->length) :
 				scsi_4btoul(hdr->length);
 		if (hdr_len > 0)
 			rep->entries_available = hdr_len / sizeof(*desc);
 		else
 			rep->entries_available = 0;
 		/*
 		 * NOTE: using the same values for the BIO version of the
 		 * same field as the SCSI/ATA values.  This means we could
 		 * get some additional values that aren't defined in bio.h
 		 * if more values of the same field are defined later.
 		 */
 		rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
 		rep->header.maximum_lba = ata ?  le64dec(hdr->maximum_lba) :
 					  scsi_8btou64(hdr->maximum_lba);
 		/*
 		 * If the drive reports no entries that match the query,
 		 * we're done.
 		 */
 		if (hdr_len == 0) {
 			rep->entries_filled = 0;
 			break;
 		}
 
 		num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
 				hdr_len / sizeof(*desc));
 		/*
 		 * If the drive didn't return any data, then we're done.
 		 */
 		if (num_avail == 0) {
 			rep->entries_filled = 0;
 			break;
 		}
 
 		num_to_fill = min(num_avail, rep->entries_allocated);
 		/*
 		 * If the user didn't allocate any entries for us to fill,
 		 * we're done.
 		 */
 		if (num_to_fill == 0) {
 			rep->entries_filled = 0;
 			break;
 		}
 
 		for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
 		     i < num_to_fill; i++, desc++, entry++) {
 			/*
 			 * NOTE: we're mapping the values here directly
 			 * from the SCSI/ATA bit definitions to the bio.h
 			 * definitons.  There is also a warning in
 			 * disk_zone.h, but the impact is that if
 			 * additional values are added in the SCSI/ATA
 			 * specs these will be visible to consumers of
 			 * this interface.
 			 */
 			entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
 			entry->zone_condition =
 			    (desc->zone_flags & SRZ_ZONE_COND_MASK) >>
 			    SRZ_ZONE_COND_SHIFT;
 			entry->zone_flags |= desc->zone_flags &
 			    (SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
 			entry->zone_length =
 			    ata ? le64dec(desc->zone_length) :
 				  scsi_8btou64(desc->zone_length);
 			entry->zone_start_lba =
 			    ata ? le64dec(desc->zone_start_lba) :
 				  scsi_8btou64(desc->zone_start_lba);
 			entry->write_pointer_lba =
 			    ata ? le64dec(desc->write_pointer_lba) :
 				  scsi_8btou64(desc->write_pointer_lba);
 		}
 		rep->entries_filled = num_to_fill;
 		break;
 	}
 	case DISK_ZONE_GET_PARAMS:
 	default:
 		/*
 		 * In theory we should not get a GET_PARAMS bio, since it
 		 * should be handled without queueing the command to the
 		 * drive.
 		 */
 		panic("%s: Invalid zone command %d", __func__,
 		    bp->bio_zone.zone_cmd);
 		break;
 	}
 
 	if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
 		free(ccb->csio.data_ptr, M_SCSIDA);
 }
 
 static void
 dadone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct da_softc *softc;
 	struct ccb_scsiio *csio;
 	u_int32_t  priority;
 	da_ccb_state state;
 
 	softc = (struct da_softc *)periph->softc;
 	priority = done_ccb->ccb_h.pinfo.priority;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n"));
 
 	csio = &done_ccb->csio;
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (csio->bio != NULL)
+		biotrack(csio->bio, __func__);
+#endif
 	state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
 	switch (state) {
 	case DA_CCB_BUFFER_IO:
 	case DA_CCB_DELETE:
 	{
 		struct bio *bp, *bp1;
 
 		cam_periph_lock(periph);
 		bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			int error;
 			int sf;
 
 			if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0)
 				sf = SF_RETRY_UA;
 			else
 				sf = 0;
 
 			error = daerror(done_ccb, CAM_RETRY_SELTO, sf);
 			if (error == ERESTART) {
 				/*
 				 * A retry was scheduled, so
 				 * just return.
 				 */
 				cam_periph_unlock(periph);
 				return;
 			}
 			bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
 			if (error != 0) {
 				int queued_error;
 
 				/*
 				 * return all queued I/O with EIO, so that
 				 * the client can retry these I/Os in the
 				 * proper order should it attempt to recover.
 				 */
 				queued_error = EIO;
 
 				if (error == ENXIO
 				 && (softc->flags & DA_FLAG_PACK_INVALID)== 0) {
 					/*
 					 * Catastrophic error.  Mark our pack as
 					 * invalid.
 					 */
 					/*
 					 * XXX See if this is really a media
 					 * XXX change first?
 					 */
 					xpt_print(periph->path,
 					    "Invalidating pack\n");
 					softc->flags |= DA_FLAG_PACK_INVALID;
 #ifdef CAM_IO_STATS
 					softc->invalidations++;
 #endif
 					queued_error = ENXIO;
 				}
 				cam_iosched_flush(softc->cam_iosched, NULL,
 					   queued_error);
 				if (bp != NULL) {
 					bp->bio_error = error;
 					bp->bio_resid = bp->bio_bcount;
 					bp->bio_flags |= BIO_ERROR;
 				}
 			} else if (bp != NULL) {
 				if (state == DA_CCB_DELETE)
 					bp->bio_resid = 0;
 				else
 					bp->bio_resid = csio->resid;
 				bp->bio_error = 0;
 				if (bp->bio_resid != 0)
 					bp->bio_flags |= BIO_ERROR;
 			}
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(done_ccb->ccb_h.path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 		} else if (bp != NULL) {
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				panic("REQ_CMP with QFRZN");
 			if (bp->bio_cmd == BIO_ZONE)
 				dazonedone(periph, done_ccb);
 			else if (state == DA_CCB_DELETE)
 				bp->bio_resid = 0;
 			else
 				bp->bio_resid = csio->resid;
 			if ((csio->resid > 0)
 			 && (bp->bio_cmd != BIO_ZONE))
 				bp->bio_flags |= BIO_ERROR;
 			if (softc->error_inject != 0) {
 				bp->bio_error = softc->error_inject;
 				bp->bio_resid = bp->bio_bcount;
 				bp->bio_flags |= BIO_ERROR;
 				softc->error_inject = 0;
 			}
 		}
 
+		biotrack(bp, __func__);
 		LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
 		if (LIST_EMPTY(&softc->pending_ccbs))
 			softc->flags |= DA_FLAG_WAS_OTAG;
 
 		cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
 		xpt_release_ccb(done_ccb);
 		if (state == DA_CCB_DELETE) {
 			TAILQ_HEAD(, bio) queue;
 
 			TAILQ_INIT(&queue);
 			TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue);
 			softc->delete_run_queue.insert_point = NULL;
 			/*
 			 * Normally, the xpt_release_ccb() above would make sure
 			 * that when we have more work to do, that work would
 			 * get kicked off. However, we specifically keep
 			 * delete_running set to 0 before the call above to
 			 * allow other I/O to progress when many BIO_DELETE
 			 * requests are pushed down. We set delete_running to 0
 			 * and call daschedule again so that we don't stall if
 			 * there are no other I/Os pending apart from BIO_DELETEs.
 			 */
 			cam_iosched_trim_done(softc->cam_iosched);
 			daschedule(periph);
 			cam_periph_unlock(periph);
 			while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
 				TAILQ_REMOVE(&queue, bp1, bio_queue);
 				bp1->bio_error = bp->bio_error;
 				if (bp->bio_flags & BIO_ERROR) {
 					bp1->bio_flags |= BIO_ERROR;
 					bp1->bio_resid = bp1->bio_bcount;
 				} else
 					bp1->bio_resid = 0;
 				biodone(bp1);
 			}
 		} else {
 			daschedule(periph);
 			cam_periph_unlock(periph);
 		}
 		if (bp != NULL)
 			biodone(bp);
 		return;
 	}
 	case DA_CCB_PROBE_RC:
 	case DA_CCB_PROBE_RC16:
 	{
 		struct	   scsi_read_capacity_data *rdcap;
 		struct     scsi_read_capacity_data_long *rcaplong;
 		char	   announce_buf[80];
 		int	   lbp;
 
 		lbp = 0;
 		rdcap = NULL;
 		rcaplong = NULL;
 		if (state == DA_CCB_PROBE_RC)
 			rdcap =(struct scsi_read_capacity_data *)csio->data_ptr;
 		else
 			rcaplong = (struct scsi_read_capacity_data_long *)
 				csio->data_ptr;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			struct disk_params *dp;
 			uint32_t block_size;
 			uint64_t maxsector;
 			u_int lalba;	/* Lowest aligned LBA. */
 
 			if (state == DA_CCB_PROBE_RC) {
 				block_size = scsi_4btoul(rdcap->length);
 				maxsector = scsi_4btoul(rdcap->addr);
 				lalba = 0;
 
 				/*
 				 * According to SBC-2, if the standard 10
 				 * byte READ CAPACITY command returns 2^32,
 				 * we should issue the 16 byte version of
 				 * the command, since the device in question
 				 * has more sectors than can be represented
 				 * with the short version of the command.
 				 */
 				if (maxsector == 0xffffffff) {
 					free(rdcap, M_SCSIDA);
 					xpt_release_ccb(done_ccb);
 					softc->state = DA_STATE_PROBE_RC16;
 					xpt_schedule(periph, priority);
 					return;
 				}
 			} else {
 				block_size = scsi_4btoul(rcaplong->length);
 				maxsector = scsi_8btou64(rcaplong->addr);
 				lalba = scsi_2btoul(rcaplong->lalba_lbp);
 			}
 
 			/*
 			 * Because GEOM code just will panic us if we
 			 * give them an 'illegal' value we'll avoid that
 			 * here.
 			 */
 			if (block_size == 0) {
 				block_size = 512;
 				if (maxsector == 0)
 					maxsector = -1;
 			}
 			if (block_size >= MAXPHYS) {
 				xpt_print(periph->path,
 				    "unsupportable block size %ju\n",
 				    (uintmax_t) block_size);
 				announce_buf[0] = '\0';
 				cam_periph_invalidate(periph);
 			} else {
 				/*
 				 * We pass rcaplong into dasetgeom(),
 				 * because it will only use it if it is
 				 * non-NULL.
 				 */
 				dasetgeom(periph, block_size, maxsector,
 					  rcaplong, sizeof(*rcaplong));
 				lbp = (lalba & SRC16_LBPME_A);
 				dp = &softc->params;
 				snprintf(announce_buf, sizeof(announce_buf),
 				    "%juMB (%ju %u byte sectors)",
 				    ((uintmax_t)dp->secsize * dp->sectors) /
 				     (1024 * 1024),
 				    (uintmax_t)dp->sectors, dp->secsize);
 			}
 		} else {
 			int	error;
 
 			announce_buf[0] = '\0';
 
 			/*
 			 * Retry any UNIT ATTENTION type errors.  They
 			 * are expected at boot.
 			 */
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART) {
 				/*
 				 * A retry was scheuled, so
 				 * just return.
 				 */
 				return;
 			} else if (error != 0) {
 				int asc, ascq;
 				int sense_key, error_code;
 				int have_sense;
 				cam_status status;
 				struct ccb_getdev cgd;
 
 				/* Don't wedge this device's queue */
 				status = done_ccb->ccb_h.status;
 				if ((status & CAM_DEV_QFRZN) != 0)
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 
 
 				xpt_setup_ccb(&cgd.ccb_h, 
 					      done_ccb->ccb_h.path,
 					      CAM_PRIORITY_NORMAL);
 				cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 				xpt_action((union ccb *)&cgd);
 
 				if (scsi_extract_sense_ccb(done_ccb,
 				    &error_code, &sense_key, &asc, &ascq))
 					have_sense = TRUE;
 				else
 					have_sense = FALSE;
 
 				/*
 				 * If we tried READ CAPACITY(16) and failed,
 				 * fallback to READ CAPACITY(10).
 				 */
 				if ((state == DA_CCB_PROBE_RC16) &&
 				    (softc->flags & DA_FLAG_CAN_RC16) &&
 				    (((csio->ccb_h.status & CAM_STATUS_MASK) ==
 					CAM_REQ_INVALID) ||
 				     ((have_sense) &&
 				      (error_code == SSD_CURRENT_ERROR) &&
 				      (sense_key == SSD_KEY_ILLEGAL_REQUEST)))) {
 					softc->flags &= ~DA_FLAG_CAN_RC16;
 					free(rdcap, M_SCSIDA);
 					xpt_release_ccb(done_ccb);
 					softc->state = DA_STATE_PROBE_RC;
 					xpt_schedule(periph, priority);
 					return;
 				}
 
 				/*
 				 * Attach to anything that claims to be a
 				 * direct access or optical disk device,
 				 * as long as it doesn't return a "Logical
 				 * unit not supported" (0x25) error.
 				 */
 				if ((have_sense) && (asc != 0x25)
 				 && (error_code == SSD_CURRENT_ERROR)) {
 					const char *sense_key_desc;
 					const char *asc_desc;
 
 					dasetgeom(periph, 512, -1, NULL, 0);
 					scsi_sense_desc(sense_key, asc, ascq,
 							&cgd.inq_data,
 							&sense_key_desc,
 							&asc_desc);
 					snprintf(announce_buf,
 					    sizeof(announce_buf),
 						"Attempt to query device "
 						"size failed: %s, %s",
 						sense_key_desc,
 						asc_desc);
 				} else { 
 					if (have_sense)
 						scsi_sense_print(
 							&done_ccb->csio);
 					else {
 						xpt_print(periph->path,
 						    "got CAM status %#x\n",
 						    done_ccb->ccb_h.status);
 					}
 
 					xpt_print(periph->path, "fatal error, "
 					    "failed to attach to device\n");
 
 					/*
 					 * Free up resources.
 					 */
 					cam_periph_invalidate(periph);
 				} 
 			}
 		}
 		free(csio->data_ptr, M_SCSIDA);
 		if (announce_buf[0] != '\0' &&
 		    ((softc->flags & DA_FLAG_ANNOUNCED) == 0)) {
 			/*
 			 * Create our sysctl variables, now that we know
 			 * we have successfully attached.
 			 */
 			/* increase the refcount */
 			if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
 				taskqueue_enqueue(taskqueue_thread,
 						  &softc->sysctl_task);
 				xpt_announce_periph(periph, announce_buf);
 				xpt_announce_quirks(periph, softc->quirks,
 				    DA_Q_BIT_STRING);
 			} else {
 				xpt_print(periph->path, "fatal error, "
 				    "could not acquire reference count\n");
 			}
 		}
 
 		/* We already probed the device. */
 		if (softc->flags & DA_FLAG_PROBED) {
 			daprobedone(periph, done_ccb);
 			return;
 		}
 
 		/* Ensure re-probe doesn't see old delete. */
 		softc->delete_available = 0;
 		dadeleteflag(softc, DA_DELETE_ZERO, 1);
 		if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) {
 			/*
 			 * Based on older SBC-3 spec revisions
 			 * any of the UNMAP methods "may" be
 			 * available via LBP given this flag so
 			 * we flag all of them as available and
 			 * then remove those which further
 			 * probes confirm aren't available
 			 * later.
 			 *
 			 * We could also check readcap(16) p_type
 			 * flag to exclude one or more invalid
 			 * write same (X) types here
 			 */
 			dadeleteflag(softc, DA_DELETE_WS16, 1);
 			dadeleteflag(softc, DA_DELETE_WS10, 1);
 			dadeleteflag(softc, DA_DELETE_UNMAP, 1);
 
 			xpt_release_ccb(done_ccb);
 			softc->state = DA_STATE_PROBE_LBP;
 			xpt_schedule(periph, priority);
 			return;
 		}
 
 		xpt_release_ccb(done_ccb);
 		softc->state = DA_STATE_PROBE_BDC;
 		xpt_schedule(periph, priority);
 		return;
 	}
 	case DA_CCB_PROBE_LBP:
 	{
 		struct scsi_vpd_logical_block_prov *lbp;
 
 		lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			/*
 			 * T10/1799-D Revision 31 states at least one of these
 			 * must be supported but we don't currently enforce this.
 			 */
 			dadeleteflag(softc, DA_DELETE_WS16,
 				     (lbp->flags & SVPD_LBP_WS16));
 			dadeleteflag(softc, DA_DELETE_WS10,
 				     (lbp->flags & SVPD_LBP_WS10));
 			dadeleteflag(softc, DA_DELETE_UNMAP,
 				     (lbp->flags & SVPD_LBP_UNMAP));
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 
 				/*
 				 * Failure indicates we don't support any SBC-3
 				 * delete methods with UNMAP
 				 */
 			}
 		}
 
 		free(lbp, M_SCSIDA);
 		xpt_release_ccb(done_ccb);
 		softc->state = DA_STATE_PROBE_BLK_LIMITS;
 		xpt_schedule(periph, priority);
 		return;
 	}
 	case DA_CCB_PROBE_BLK_LIMITS:
 	{
 		struct scsi_vpd_block_limits *block_limits;
 
 		block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			uint32_t max_txfer_len = scsi_4btoul(
 				block_limits->max_txfer_len);
 			uint32_t max_unmap_lba_cnt = scsi_4btoul(
 				block_limits->max_unmap_lba_cnt);
 			uint32_t max_unmap_blk_cnt = scsi_4btoul(
 				block_limits->max_unmap_blk_cnt);
 			uint64_t ws_max_blks = scsi_8btou64(
 				block_limits->max_write_same_length);
 
 			if (max_txfer_len != 0) {
 				softc->disk->d_maxsize = MIN(softc->maxio,
 				    (off_t)max_txfer_len * softc->params.secsize);
 			}
 
 			/*
 			 * We should already support UNMAP but we check lba
 			 * and block count to be sure
 			 */
 			if (max_unmap_lba_cnt != 0x00L &&
 			    max_unmap_blk_cnt != 0x00L) {
 				softc->unmap_max_lba = max_unmap_lba_cnt;
 				softc->unmap_max_ranges = min(max_unmap_blk_cnt,
 					UNMAP_MAX_RANGES);
 			} else {
 				/*
 				 * Unexpected UNMAP limits which means the
 				 * device doesn't actually support UNMAP
 				 */
 				dadeleteflag(softc, DA_DELETE_UNMAP, 0);
 			}
 
 			if (ws_max_blks != 0x00L)
 				softc->ws_max_blks = ws_max_blks;
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 
 				/*
 				 * Failure here doesn't mean UNMAP is not
 				 * supported as this is an optional page.
 				 */
 				softc->unmap_max_lba = 1;
 				softc->unmap_max_ranges = 1;
 			}
 		}
 
 		free(block_limits, M_SCSIDA);
 		xpt_release_ccb(done_ccb);
 		softc->state = DA_STATE_PROBE_BDC;
 		xpt_schedule(periph, priority);
 		return;
 	}
 	case DA_CCB_PROBE_BDC:
 	{
 		struct scsi_vpd_block_device_characteristics *bdc;
 
 		bdc = (struct scsi_vpd_block_device_characteristics *)
 		    csio->data_ptr;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			uint32_t valid_len;
 
 			/*
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
 			u_int16_t old_rate = softc->disk->d_rotation_rate;
 
 			valid_len = csio->dxfer_len - csio->resid;
 			if (SBDC_IS_PRESENT(bdc, valid_len,
 			    medium_rotation_rate)) {
 				softc->disk->d_rotation_rate =
 					scsi_2btoul(bdc->medium_rotation_rate);
 				if (softc->disk->d_rotation_rate ==
 				    SVPD_BDC_RATE_NON_ROTATING) {
 					cam_iosched_set_sort_queue(
 					    softc->cam_iosched, 0);
 					softc->rotating = 0;
 				}
 				if (softc->disk->d_rotation_rate != old_rate) {
 					disk_attr_changed(softc->disk,
 					    "GEOM::rotation_rate", M_NOWAIT);
 				}
 			}
 			if ((SBDC_IS_PRESENT(bdc, valid_len, flags))
 			 && (softc->zone_mode == DA_ZONE_NONE)) {
 				int ata_proto;
 
 				if (scsi_vpd_supported_page(periph,
 				    SVPD_ATA_INFORMATION))
 					ata_proto = 1;
 				else
 					ata_proto = 0;
 
 				/*
 				 * The Zoned field will only be set for
 				 * Drive Managed and Host Aware drives.  If
 				 * they are Host Managed, the device type
 				 * in the standard INQUIRY data should be
 				 * set to T_ZBC_HM (0x14).
 				 */
 				if ((bdc->flags & SVPD_ZBC_MASK) ==
 				     SVPD_HAW_ZBC) {
 					softc->zone_mode = DA_ZONE_HOST_AWARE;
 					softc->zone_interface = (ata_proto) ?
 					   DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
 				} else if ((bdc->flags & SVPD_ZBC_MASK) ==
 				     SVPD_DM_ZBC) {
 					softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
 					softc->zone_interface = (ata_proto) ?
 					   DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
 				} else if ((bdc->flags & SVPD_ZBC_MASK) != 
 					  SVPD_ZBC_NR) {
 					xpt_print(periph->path, "Unknown zoned "
 					    "type %#x",
 					    bdc->flags & SVPD_ZBC_MASK);
 				}
 			}
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 
 		free(bdc, M_SCSIDA);
 		xpt_release_ccb(done_ccb);
 		softc->state = DA_STATE_PROBE_ATA;
 		xpt_schedule(periph, priority);
 		return;
 	}
 	case DA_CCB_PROBE_ATA:
 	{
 		int i;
 		struct ata_params *ata_params;
 		int continue_probe;
 		int error;
 		int16_t *ptr;
 
 		ata_params = (struct ata_params *)csio->data_ptr;
 		ptr = (uint16_t *)ata_params;
 		continue_probe = 0;
 		error = 0;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			uint16_t old_rate;
 
 			for (i = 0; i < sizeof(*ata_params) / 2; i++)
 				ptr[i] = le16toh(ptr[i]);
 			if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM &&
 			    (softc->quirks & DA_Q_NO_UNMAP) == 0) {
 				dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1);
 				if (ata_params->max_dsm_blocks != 0)
 					softc->trim_max_ranges = min(
 					  softc->trim_max_ranges,
 					  ata_params->max_dsm_blocks *
 					  ATA_DSM_BLK_RANGES);
 			}
 			/*
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
 			old_rate = softc->disk->d_rotation_rate;
 			softc->disk->d_rotation_rate =
 			    ata_params->media_rotation_rate;
 			if (softc->disk->d_rotation_rate ==
 			    ATA_RATE_NON_ROTATING) {
 				cam_iosched_set_sort_queue(softc->cam_iosched, 0);
 				softc->rotating = 0;
 			}
 			if (softc->disk->d_rotation_rate != old_rate) {
 				disk_attr_changed(softc->disk,
 				    "GEOM::rotation_rate", M_NOWAIT);
 			}
 
 			if (ata_params->capabilities1 & ATA_SUPPORT_DMA)
 				softc->flags |= DA_FLAG_CAN_ATA_DMA;
 
 			if (ata_params->support.extension &
 			    ATA_SUPPORT_GENLOG)
 				softc->flags |= DA_FLAG_CAN_ATA_LOG;
 
 			/*
 			 * At this point, if we have a SATA host aware drive,
 			 * we communicate via ATA passthrough unless the
 			 * SAT layer supports ZBC -> ZAC translation.  In
 			 * that case,
 			 */
 			/*
 			 * XXX KDM figure out how to detect a host managed
 			 * SATA drive.
 			 */
 			if (softc->zone_mode == DA_ZONE_NONE) {
 				/*
 				 * Note that we don't override the zone
 				 * mode or interface if it has already been
 				 * set.  This is because it has either been
 				 * set as a quirk, or when we probed the
 				 * SCSI Block Device Characteristics page,
 				 * the zoned field was set.  The latter
 				 * means that the SAT layer supports ZBC to
 				 * ZAC translation, and we would prefer to
 				 * use that if it is available.
 				 */
 				if ((ata_params->support3 &
 				    ATA_SUPPORT_ZONE_MASK) ==
 				    ATA_SUPPORT_ZONE_HOST_AWARE) {
 					softc->zone_mode = DA_ZONE_HOST_AWARE;
 					softc->zone_interface =
 					    DA_ZONE_IF_ATA_PASS;
 				} else if ((ata_params->support3 &
 					    ATA_SUPPORT_ZONE_MASK) ==
 					    ATA_SUPPORT_ZONE_DEV_MANAGED) {
 					softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
 					softc->zone_interface =
 					    DA_ZONE_IF_ATA_PASS;
 				}
 			}
 
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				if ((done_ccb->ccb_h.status &
 				     CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 
 		free(ata_params, M_SCSIDA);
 		if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
 		 || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
 			/*
 			 * If the ATA IDENTIFY failed, we could be talking
 			 * to a SCSI drive, although that seems unlikely,
 			 * since the drive did report that it supported the 
 			 * ATA Information VPD page.  If the ATA IDENTIFY
 			 * succeeded, and the SAT layer doesn't support
 			 * ZBC -> ZAC translation, continue on to get the
 			 * directory of ATA logs, and complete the rest of
 			 * the ZAC probe.  If the SAT layer does support
 			 * ZBC -> ZAC translation, we want to use that,
 			 * and we'll probe the SCSI Zoned Block Device
 			 * Characteristics VPD page next.
 			 */
 			if ((error == 0)
 			 && (softc->flags & DA_FLAG_CAN_ATA_LOG)
 			 && (softc->zone_interface == DA_ZONE_IF_ATA_PASS))
 				softc->state = DA_STATE_PROBE_ATA_LOGDIR;
 			else
 				softc->state = DA_STATE_PROBE_ZONE;
 			continue_probe = 1;
 		}
 		if (continue_probe != 0) {
 			xpt_release_ccb(done_ccb);
 			xpt_schedule(periph, priority);
 			return;
 		} else
 			daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_PROBE_ATA_LOGDIR:
 	{
 		int error;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			error = 0;
 			softc->valid_logdir_len = 0;
 			bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
 			softc->valid_logdir_len =
 				csio->dxfer_len - csio->resid;
 			if (softc->valid_logdir_len > 0)
 				bcopy(csio->data_ptr, &softc->ata_logdir,
 				    min(softc->valid_logdir_len,
 					sizeof(softc->ata_logdir)));
 			/*
 			 * Figure out whether the Identify Device log is
 			 * supported.  The General Purpose log directory
 			 * has a header, and lists the number of pages
 			 * available for each GP log identified by the
 			 * offset into the list.
 			 */
 			if ((softc->valid_logdir_len >=
 			    ((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
 			 && (le16dec(softc->ata_logdir.header) == 
 			     ATA_GP_LOG_DIR_VERSION)
 			 && (le16dec(&softc->ata_logdir.num_pages[
 			     (ATA_IDENTIFY_DATA_LOG *
 			     sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
 				softc->flags |= DA_FLAG_CAN_ATA_IDLOG;
 			} else {
 				softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
 			}
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				/*
 				 * If we can't get the ATA log directory,
 				 * then ATA logs are effectively not
 				 * supported even if the bit is set in the
 				 * identify data.
 				 */ 
 				softc->flags &= ~(DA_FLAG_CAN_ATA_LOG |
 						  DA_FLAG_CAN_ATA_IDLOG);
 				if ((done_ccb->ccb_h.status &
 				     CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 
 		free(csio->data_ptr, M_SCSIDA);
 
 		if ((error == 0)
 		 && (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) {
 			softc->state = DA_STATE_PROBE_ATA_IDDIR;
 			xpt_release_ccb(done_ccb);
 			xpt_schedule(periph, priority);
 			return;
 		} 
 		daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_PROBE_ATA_IDDIR:
 	{
 		int error;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			off_t entries_offset, max_entries;
 			error = 0;
 
 			softc->valid_iddir_len = 0;
 			bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
 			softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP |
 					  DA_FLAG_CAN_ATA_ZONE);
 			softc->valid_iddir_len =
 				csio->dxfer_len - csio->resid;
 			if (softc->valid_iddir_len > 0)
 				bcopy(csio->data_ptr, &softc->ata_iddir,
 				    min(softc->valid_iddir_len,
 					sizeof(softc->ata_iddir)));
 
 			entries_offset =
 			    __offsetof(struct ata_identify_log_pages,entries);
 			max_entries = softc->valid_iddir_len - entries_offset;
 			if ((softc->valid_iddir_len > (entries_offset + 1))
 			 && (le64dec(softc->ata_iddir.header) ==
 			     ATA_IDLOG_REVISION)
 			 && (softc->ata_iddir.entry_count > 0)) {
 				int num_entries, i;
 
 				num_entries = softc->ata_iddir.entry_count;
 				num_entries = min(num_entries,
 				   softc->valid_iddir_len - entries_offset);
 				for (i = 0; i < num_entries &&
 				     i < max_entries; i++) {
 					if (softc->ata_iddir.entries[i] ==
 					    ATA_IDL_SUP_CAP)
 						softc->flags |=
 						    DA_FLAG_CAN_ATA_SUPCAP;
 					else if (softc->ata_iddir.entries[i]==
 						 ATA_IDL_ZDI)
 						softc->flags |=
 						    DA_FLAG_CAN_ATA_ZONE;
 
 					if ((softc->flags &
 					     DA_FLAG_CAN_ATA_SUPCAP)
 					 && (softc->flags &
 					     DA_FLAG_CAN_ATA_ZONE))
 						break;
 				}
 			}
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				/*
 				 * If we can't get the ATA Identify Data log
 				 * directory, then it effectively isn't
 				 * supported even if the ATA Log directory
 				 * a non-zero number of pages present for
 				 * this log.
 				 */
 				softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
 				if ((done_ccb->ccb_h.status &
 				     CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 
 		free(csio->data_ptr, M_SCSIDA);
 
 		if ((error == 0)
 		 && (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) {
 			softc->state = DA_STATE_PROBE_ATA_SUP;
 			xpt_release_ccb(done_ccb);
 			xpt_schedule(periph, priority);
 			return;
 		} 
 		daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_PROBE_ATA_SUP:
 	{
 		int error;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			uint32_t valid_len;
 			size_t needed_size;
 			struct ata_identify_log_sup_cap *sup_cap;
 			error = 0;
 
 			sup_cap = (struct ata_identify_log_sup_cap *)
 			    csio->data_ptr;
 			valid_len = csio->dxfer_len - csio->resid;
 			needed_size =
 			    __offsetof(struct ata_identify_log_sup_cap,
 			    sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
 			if (valid_len >= needed_size) {
 				uint64_t zoned, zac_cap;
 
 				zoned = le64dec(sup_cap->zoned_cap);
 				if (zoned & ATA_ZONED_VALID) {
 					/*
 					 * This should have already been
 					 * set, because this is also in the
 					 * ATA identify data.
 					 */
 					if ((zoned & ATA_ZONED_MASK) ==
 					    ATA_SUPPORT_ZONE_HOST_AWARE)
 						softc->zone_mode =
 						    DA_ZONE_HOST_AWARE;
 					else if ((zoned & ATA_ZONED_MASK) ==
 					    ATA_SUPPORT_ZONE_DEV_MANAGED)
 						softc->zone_mode =
 						    DA_ZONE_DRIVE_MANAGED;
 				}
 
 				zac_cap = le64dec(sup_cap->sup_zac_cap);
 				if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
 					if (zac_cap & ATA_REPORT_ZONES_SUP)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_RZ_SUP;
 					if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_OPEN_SUP;
 					if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_CLOSE_SUP;
 					if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_FINISH_SUP;
 					if (zac_cap & ATA_ND_RWP_SUP)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_RWP_SUP;
 				} else {
 					/*
 					 * This field was introduced in
 					 * ACS-4, r08 on April 28th, 2015.
 					 * If the drive firmware was written
 					 * to an earlier spec, it won't have
 					 * the field.  So, assume all
 					 * commands are supported.
 					 */ 
 					softc->zone_flags |=
 					    DA_ZONE_FLAG_SUP_MASK;
 				}
 					    
 			}
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				/*
 				 * If we can't get the ATA Identify Data
 				 * Supported Capabilities page, clear the
 				 * flag...
 				 */
 				softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP;
 				/*
 				 * And clear zone capabilities.
 				 */
 				softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK;
 				if ((done_ccb->ccb_h.status &
 				     CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 
 		free(csio->data_ptr, M_SCSIDA);
 
 		if ((error == 0)
 		 && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) {
 			softc->state = DA_STATE_PROBE_ATA_ZONE;
 			xpt_release_ccb(done_ccb);
 			xpt_schedule(periph, priority);
 			return;
 		} 
 		daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_PROBE_ATA_ZONE:
 	{
 		int error;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			struct ata_zoned_info_log *zi_log;
 			uint32_t valid_len;
 			size_t needed_size;
 
 			zi_log = (struct ata_zoned_info_log *)csio->data_ptr;
 
 			valid_len = csio->dxfer_len - csio->resid;
 			needed_size = __offsetof(struct ata_zoned_info_log,
 			    version_info) + 1 + sizeof(zi_log->version_info);
 			if (valid_len >= needed_size) {
 				uint64_t tmpvar;
 
 				tmpvar = le64dec(zi_log->zoned_cap);
 				if (tmpvar & ATA_ZDI_CAP_VALID) {
 					if (tmpvar & ATA_ZDI_CAP_URSWRZ)
 						softc->zone_flags |=
 						    DA_ZONE_FLAG_URSWRZ;
 					else
 						softc->zone_flags &=
 						    ~DA_ZONE_FLAG_URSWRZ;
 				}
 				tmpvar = le64dec(zi_log->optimal_seq_zones);
 				if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
 					softc->zone_flags |=
 					    DA_ZONE_FLAG_OPT_SEQ_SET;
 					softc->optimal_seq_zones = (tmpvar &
 					    ATA_ZDI_OPT_SEQ_MASK);
 				} else {
 					softc->zone_flags &=
 					    ~DA_ZONE_FLAG_OPT_SEQ_SET;
 					softc->optimal_seq_zones = 0;
 				}
 
 				tmpvar =le64dec(zi_log->optimal_nonseq_zones);
 				if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
 					softc->zone_flags |=
 					    DA_ZONE_FLAG_OPT_NONSEQ_SET;
 					softc->optimal_nonseq_zones =
 					    (tmpvar & ATA_ZDI_OPT_NS_MASK);
 				} else {
 					softc->zone_flags &=
 					    ~DA_ZONE_FLAG_OPT_NONSEQ_SET;
 					softc->optimal_nonseq_zones = 0;
 				}
 
 				tmpvar = le64dec(zi_log->max_seq_req_zones);
 				if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
 					softc->zone_flags |=
 					    DA_ZONE_FLAG_MAX_SEQ_SET;
 					softc->max_seq_zones =
 					    (tmpvar & ATA_ZDI_MAX_SEQ_MASK);
 				} else {
 					softc->zone_flags &=
 					    ~DA_ZONE_FLAG_MAX_SEQ_SET;
 					softc->max_seq_zones = 0;
 				}
 			}
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				softc->flags &= ~DA_FLAG_CAN_ATA_ZONE;
 				softc->flags &= ~DA_ZONE_FLAG_SET_MASK;
 
 				if ((done_ccb->ccb_h.status &
 				     CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 	
 		}
 		free(csio->data_ptr, M_SCSIDA);
 
 		daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_PROBE_ZONE:
 	{
 		int error;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			uint32_t valid_len;
 			size_t needed_len;
 			struct scsi_vpd_zoned_bdc *zoned_bdc;
 
 			error = 0;
 			zoned_bdc = (struct scsi_vpd_zoned_bdc *)
 				csio->data_ptr;
 			valid_len = csio->dxfer_len - csio->resid;
 			needed_len = __offsetof(struct scsi_vpd_zoned_bdc,
 			    max_seq_req_zones) + 1 +
 			    sizeof(zoned_bdc->max_seq_req_zones);
 			if ((valid_len >= needed_len)
 			 && (scsi_2btoul(zoned_bdc->page_length) >=
 			     SVPD_ZBDC_PL)) {
 				if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ)
 					softc->zone_flags |=
 					    DA_ZONE_FLAG_URSWRZ;
 				else
 					softc->zone_flags &= 
 					    ~DA_ZONE_FLAG_URSWRZ;
 				softc->optimal_seq_zones =
 				    scsi_4btoul(zoned_bdc->optimal_seq_zones);
 				softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET;
 				softc->optimal_nonseq_zones = scsi_4btoul(
 				    zoned_bdc->optimal_nonseq_zones);
 				softc->zone_flags |=
 				    DA_ZONE_FLAG_OPT_NONSEQ_SET;
 				softc->max_seq_zones =
 				    scsi_4btoul(zoned_bdc->max_seq_req_zones);
 				softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET;
 			}
 			/*
 			 * All of the zone commands are mandatory for SCSI
 			 * devices.
 			 *
 			 * XXX KDM this is valid as of September 2015.
 			 * Re-check this assumption once the SAT spec is
 			 * updated to support SCSI ZBC to ATA ZAC mapping.
 			 * Since ATA allows zone commands to be reported
 			 * as supported or not, this may not necessarily
 			 * be true for an ATA device behind a SAT (SCSI to
 			 * ATA Translation) layer.
 			 */
 			softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK;
 		} else {
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART)
 				return;
 			else if (error != 0) {
 				if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 					/* Don't wedge this device's queue */
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 				}
 			}
 		}
 		daprobedone(periph, done_ccb);
 		return;
 	}
 	case DA_CCB_DUMP:
 		/* No-op.  We're polling */
 		return;
 	case DA_CCB_TUR:
 	{
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 
 			if (daerror(done_ccb, CAM_RETRY_SELTO,
 			    SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) ==
 			    ERESTART)
 				return;
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(done_ccb->ccb_h.path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 		}
 		xpt_release_ccb(done_ccb);
 		cam_periph_release_locked(periph);
 		return;
 	}
 	default:
 		break;
 	}
 	xpt_release_ccb(done_ccb);
 }
 
 /*
  * Restart the READ CAPACITY probe sequence for this periph.
  *
  * Does nothing unless the softc is in DA_STATE_NORMAL (any other state
  * is treated as a probe already in progress).  Otherwise takes a periph
  * reference, selects DA_STATE_PROBE_RC16 when DA_FLAG_CAN_RC16 is set
  * (DA_STATE_PROBE_RC if not) and schedules the periph at
  * CAM_PRIORITY_DEV.
  *
  * NOTE(review): the reference taken here is presumably dropped when the
  * probe sequence finishes -- confirm against daprobedone().
  */
 static void
 dareprobe(struct cam_periph *periph)
 {
 	struct da_softc	  *softc;
 	cam_status status;
 
 	softc = (struct da_softc *)periph->softc;
 
 	/* Probe in progress; don't interfere. */
 	if (softc->state != DA_STATE_NORMAL)
 		return;
 
 	/*
 	 * NOTE(review): the acquire result is checked only by KASSERT; on
 	 * kernels where KASSERT compiles to nothing a failed acquire would
 	 * go unnoticed and the probe would still be scheduled -- confirm
 	 * that this cannot happen for a periph reaching this point.
 	 */
 	status = cam_periph_acquire(periph);
 	KASSERT(status == CAM_REQ_CMP,
 	    ("dareprobe: cam_periph_acquire failed"));
 
 	/* Start with the 16 byte READ CAPACITY when the flag allows it. */
 	if (softc->flags & DA_FLAG_CAN_RC16)
 		softc->state = DA_STATE_PROBE_RC16;
 	else
 		softc->state = DA_STATE_PROBE_RC;
 
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 }
 
 /*
  * Error handler for CCBs issued by the da driver.
  *
  * ccb		the failed CCB; its path is used to find the periph/softc.
  * cam_flags	passed through unchanged to cam_periph_error().
  * sense_flags	SF_* flags; this function may OR in SF_NO_PRINT,
  *		SF_RETRY_UA and SF_RETRY_BUSY before passing them on.
  *
  * Before handing the CCB to cam_periph_error() it applies da-specific
  * handling based on the CAM status and, when sense data can be
  * extracted, on the sense key / ASC / ASCQ:
  *   - CAM_REQ_INVALID or ILLEGAL REQUEST: try cmd6workaround().
  *   - UNIT ATTENTION 2Ah/09h or 3Fh/03h: log, clear DA_FLAG_PROBED and
  *     call dareprobe().
  *   - UNIT ATTENTION 28h/00h: clear DA_FLAG_PROBED and report a media
  *     change to the disk layer.
  *   - NOT READY with ASC 3Ah while DA_FLAG_PACK_INVALID is clear: set
  *     the flag and report the media as gone.
  *
  * When BUF_TRACKING or FULL_BUF_TRACKING is defined and the CCB has a
  * bio attached, the call is recorded first with biotrack().
  *
  * Returns ERESTART if cmd6workaround() returned ERESTART, otherwise
  * whatever cam_periph_error() returns.
  */
 static int
 daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
 {
 	struct da_softc	  *softc;
 	struct cam_periph *periph;
 	int error, error_code, sense_key, asc, ascq;
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (ccb->csio.bio != NULL)
+		biotrack(ccb->csio.bio, __func__);
+#endif
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	softc = (struct da_softc *)periph->softc;
 
  	/*
 	 * Automatically detect devices that do not support
  	 * READ(6)/WRITE(6) and upgrade to using 10 byte cdbs.
  	 */
 	error = 0;
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
 		error = cmd6workaround(ccb);
 	} else if (scsi_extract_sense_ccb(ccb,
 	    &error_code, &sense_key, &asc, &ascq)) {
 		if (sense_key == SSD_KEY_ILLEGAL_REQUEST)
  			error = cmd6workaround(ccb);
 		/*
 		 * If the target replied with CAPACITY DATA HAS CHANGED UA,
 		 * query the capacity and notify upper layers.
 		 */
 		else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
 		    asc == 0x2A && ascq == 0x09) {
 			xpt_print(periph->path, "Capacity data has changed\n");
 			softc->flags &= ~DA_FLAG_PROBED;
 			dareprobe(periph);
 			sense_flags |= SF_NO_PRINT;
 		} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
 		    asc == 0x28 && ascq == 0x00) {
 			/* Handled as a media change; no reprobe is started here. */
 			softc->flags &= ~DA_FLAG_PROBED;
 			disk_media_changed(softc->disk, M_NOWAIT);
 		} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
 		    asc == 0x3F && ascq == 0x03) {
 			xpt_print(periph->path, "INQUIRY data has changed\n");
 			softc->flags &= ~DA_FLAG_PROBED;
 			dareprobe(periph);
 			sense_flags |= SF_NO_PRINT;
 		} else if (sense_key == SSD_KEY_NOT_READY &&
 		    asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
 			/*
 			 * The DA_FLAG_PACK_INVALID test above means the
 			 * media-gone notification is sent only on the first
 			 * such error while the flag is clear.
 			 */
 			softc->flags |= DA_FLAG_PACK_INVALID;
 			disk_media_gone(softc->disk, M_NOWAIT);
 		}
 	}
 	/* Only cmd6workaround() can have set error at this point. */
 	if (error == ERESTART)
 		return (ERESTART);
 
 #ifdef CAM_IO_STATS
 	/* Per-device counters: timeouts versus other transport errors. */
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 		softc->timeouts++;
 		break;
 	case CAM_REQ_ABORTED:
 	case CAM_REQ_CMP_ERR:
 	case CAM_REQ_TERMIO:
 	case CAM_UNREC_HBA_ERROR:
 	case CAM_DATA_RUN_ERR:
 		softc->errors++;
 		break;
 	default:
 		break;
 	}
 #endif
 
 	/*
 	 * XXX
 	 * Until we have a better way of doing pack validation,
 	 * don't treat UAs as errors.
 	 */
 	sense_flags |= SF_RETRY_UA;
 
 	/* Devices carrying the DA_Q_RETRY_BUSY quirk also get SF_RETRY_BUSY. */
 	if (softc->quirks & DA_Q_RETRY_BUSY)
 		sense_flags |= SF_RETRY_BUSY;
 	return(cam_periph_error(ccb, cam_flags, sense_flags,
 				&softc->saved_ccb));
 }
 
 /*
  * Media poll callout handler; arg is the cam_periph.
  *
  * When no DA_WORK_TUR work is already flagged, no CCBs are pending and a
  * periph reference can be acquired, flag DA_WORK_TUR and schedule the
  * periph.  The callout is re-armed for da_poll_period seconds unless
  * da_poll_period is zero.
  */
 static void
 damediapoll(void *arg)
 {
 	struct cam_periph *periph;
 	struct da_softc *sc;
 
 	periph = arg;
 	sc = periph->softc;
 
 	/* Short-circuit order matters: only acquire when the device is idle. */
 	if (!cam_iosched_has_work_flags(sc->cam_iosched, DA_WORK_TUR) &&
 	    LIST_EMPTY(&sc->pending_ccbs) &&
 	    cam_periph_acquire(periph) == CAM_REQ_CMP) {
 		cam_iosched_set_work_flags(sc->cam_iosched, DA_WORK_TUR);
 		daschedule(periph);
 	}
 
 	/* Polling disabled: do not queue ourselves again. */
 	if (da_poll_period == 0)
 		return;
 	callout_schedule(&sc->mediapoll_c, da_poll_period * hz);
 }
 
 static void
 daprevent(struct cam_periph *periph, int action)
 {
 	struct	da_softc *softc;
 	union	ccb *ccb;		
 	int	error;
 		
 	softc = (struct da_softc *)periph->softc;
 
 	if (((action == PR_ALLOW)
 	  && (softc->flags & DA_FLAG_PACK_LOCKED) == 0)
 	 || ((action == PR_PREVENT)
 	  && (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) {
 		return;
 	}
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_prevent(&ccb->csio,
 		     /*retries*/1,
 		     /*cbcfp*/dadone,
 		     MSG_SIMPLE_Q_TAG,
 		     action,
 		     SSD_FULL_SIZE,
 		     5000);
 
 	error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO,
 	    SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat);
 
 	if (error == 0) {
 		if (action == PR_ALLOW)
 			softc->flags &= ~DA_FLAG_PACK_LOCKED;
 		else
 			softc->flags |= DA_FLAG_PACK_LOCKED;
 	}
 
 	xpt_release_ccb(ccb);
 }
 
 /*
  * Record the disk parameters for a periph and publish them to the disk.
  *
  * periph    - peripheral whose softc params / disk fields are updated.
  * block_len - logical block size in bytes (becomes params.secsize).
  * maxsector - highest sector number; params.sectors is maxsector + 1.
  * rcaplong  - optional long read-capacity data (may be NULL); supplies the
  *             lbppbe / lalba values and is stored in the EDT when it
  *             differs from the cached copy.
  * rcap_len  - number of valid bytes in rcaplong.
  *
  * Steps: compute stripe size/offset, ask the controller for a geometry
  * (falling back to 255 heads / 255 sectors per track), optionally store the
  * read capacity data via XPT_DEV_ADVINFO, copy everything into softc->disk
  * and call disk_resize().
  */
 static void
 dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector,
 	  struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len)
 {
 	struct ccb_calc_geometry ccg;
 	struct da_softc *softc;
 	struct disk_params *dp;
 	u_int lbppbe, lalba;
 	int error;
 
 	softc = (struct da_softc *)periph->softc;
 
 	dp = &softc->params;
 	dp->secsize = block_len;
 	dp->sectors = maxsector + 1;
 	if (rcaplong != NULL) {
 		lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
 		lalba = scsi_2btoul(rcaplong->lalba_lbp);
 		lalba &= SRC16_LALBA_A;
 	} else {
 		lbppbe = 0;
 		lalba = 0;
 	}
 
 	/*
 	 * Stripe size: block_len scaled by 2^lbppbe when the device reported
 	 * one, 4096 for devices with the DA_Q_4K quirk, otherwise none.
 	 */
 	if (lbppbe > 0) {
 		dp->stripesize = block_len << lbppbe;
 		dp->stripeoffset = (dp->stripesize - block_len * lalba) %
 		    dp->stripesize;
 	} else if (softc->quirks & DA_Q_4K) {
 		dp->stripesize = 4096;
 		dp->stripeoffset = 0;
 	} else {
 		dp->stripesize = 0;
 		dp->stripeoffset = 0;
 	}
 	/*
 	 * Have the controller provide us with a geometry
 	 * for this disk.  The only time the geometry
 	 * matters is when we boot and the controller
 	 * is the only one knowledgeable enough to come
 	 * up with something that will make this a bootable
 	 * device.
 	 */
 	xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	ccg.ccb_h.func_code = XPT_CALC_GEOMETRY;
 	ccg.block_size = dp->secsize;
 	ccg.volume_size = dp->sectors;
 	ccg.heads = 0;
 	ccg.secs_per_track = 0;
 	ccg.cylinders = 0;
 	xpt_action((union ccb*)&ccg);
 	if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		/*
 		 * We don't know what went wrong here- but just pick
 		 * a geometry so we don't have nasty things like divide
 		 * by zero.
 		 */
 		dp->heads = 255;
 		dp->secs_per_track = 255;
 		dp->cylinders = dp->sectors / (255 * 255);
 		if (dp->cylinders == 0) {
 			dp->cylinders = 1;
 		}
 	} else {
 		dp->heads = ccg.heads;
 		dp->secs_per_track = ccg.secs_per_track;
 		dp->cylinders = ccg.cylinders;
 	}
 
 	/*
 	 * If the user supplied a read capacity buffer, and if it is
 	 * different than the previous buffer, update the data in the EDT.
 	 * If it's the same, we don't bother.  This avoids sending an
 	 * update every time someone opens this device.
 	 */
 	if ((rcaplong != NULL)
 	 && (bcmp(rcaplong, &softc->rcaplong,
 		  min(sizeof(softc->rcaplong), rcap_len)) != 0)) {
 		struct ccb_dev_advinfo cdai;
 
 		xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
 		cdai.buftype = CDAI_TYPE_RCAPLONG;
 		cdai.flags = CDAI_FLAG_STORE;
 		cdai.bufsiz = rcap_len;
 		cdai.buf = (uint8_t *)rcaplong;
 		xpt_action((union ccb *)&cdai);
 		/* Release the device queue if the request left it frozen. */
 		if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
 		if (cdai.ccb_h.status != CAM_REQ_CMP) {
 			xpt_print(periph->path, "%s: failed to set read "
 				  "capacity advinfo\n", __func__);
 			/* Use cam_error_print() to decode the status */
 			cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS,
 					CAM_EPF_ALL);
 		} else {
 			/* Only cache the new data once the store succeeded. */
 			bcopy(rcaplong, &softc->rcaplong,
 			      min(sizeof(softc->rcaplong), rcap_len));
 		}
 	}
 
 	/* Publish the computed parameters to the disk and its devstat. */
 	softc->disk->d_sectorsize = softc->params.secsize;
 	softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors;
 	softc->disk->d_stripesize = softc->params.stripesize;
 	softc->disk->d_stripeoffset = softc->params.stripeoffset;
 	/* XXX: these are not actually "firmware" values, so they may be wrong */
 	softc->disk->d_fwsectors = softc->params.secs_per_track;
 	softc->disk->d_fwheads = softc->params.heads;
 	softc->disk->d_devstat->block_size = softc->params.secsize;
 	softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
 
 	/* A resize failure is only reported, not propagated. */
 	error = disk_resize(softc->disk, M_NOWAIT);
 	if (error != 0)
 		xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error);
 }
 
 /*
  * Periodic callout; arg is the da_softc.
  *
  * With da_send_ordered enabled and CCBs pending, request an ordered tag
  * (DA_FLAG_NEED_OTAG) unless DA_FLAG_WAS_OTAG is set, then clear
  * DA_FLAG_WAS_OTAG.  The callout always re-arms itself.
  */
 static void
 dasendorderedtag(void *arg)
 {
 	struct da_softc *sc;
 
 	sc = arg;
 
 	if (da_send_ordered && !LIST_EMPTY(&sc->pending_ccbs)) {
 		if (!(sc->flags & DA_FLAG_WAS_OTAG))
 			sc->flags |= DA_FLAG_NEED_OTAG;
 		sc->flags &= ~DA_FLAG_WAS_OTAG;
 	}
 
 	/* Re-arm for the next interval. */
 	callout_reset(&sc->sendordered_c,
 	    (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
 	    dasendorderedtag, sc);
 }
 
 /*
  * Step through all DA peripheral drivers, and if the device is still open,
  * sync the disk cache to physical media.
  *
  * With the scheduler stopped, open devices whose periph lock is not owned
  * are handled by calling dadump() with a NULL buffer instead; no lock is
  * taken and no CCB is run on that path.  Otherwise a SYNCHRONIZE CACHE is
  * run under the periph lock for every open device that does not have the
  * DA_Q_NO_SYNC_CACHE quirk.  The arg and howto parameters are not used.
  */
 static void
 dashutdown(void * arg, int howto)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 	union ccb *ccb;
 	int error;
 
 	CAM_PERIPH_FOREACH(periph, &dadriver) {
 		softc = (struct da_softc *)periph->softc;
 		if (SCHEDULER_STOPPED()) {
 			/* If we panicked with the lock held, do not recurse. */
 			if (!cam_periph_owned(periph) &&
 			    (softc->flags & DA_FLAG_OPEN)) {
 				dadump(softc->disk, NULL, 0, 0, 0);
 			}
 			continue;
 		}
 		cam_periph_lock(periph);
 
 		/*
 		 * We only sync the cache if the drive is still open, and
 		 * if the drive is capable of it..
 		 */
 		if (((softc->flags & DA_FLAG_OPEN) == 0)
 		 || (softc->quirks & DA_Q_NO_SYNC_CACHE)) {
 			cam_periph_unlock(periph);
 			continue;
 		}
 
 		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 		scsi_synchronize_cache(&ccb->csio,
 				       /*retries*/0,
 				       /*cbfcnp*/dadone,
 				       MSG_SIMPLE_Q_TAG,
 				       /*begin_lba*/0, /* whole disk */
 				       /*lb_count*/0,
 				       SSD_FULL_SIZE,
 				       60 * 60 * 1000);
 
 		/* No recovery and no retries; failure is only reported. */
 		error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
 		    /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR,
 		    softc->disk->d_devstat);
 		if (error != 0)
 			xpt_print(periph->path, "Synchronize cache failed\n");
 		xpt_release_ccb(ccb);
 		cam_periph_unlock(periph);
 	}
 }
 
 #else /* !_KERNEL */
 
 /*
  * XXX These are only left out of the kernel build to silence warnings.  If,
  * for some reason these functions are used in the kernel, the ifdefs should
  * be moved so they are included both in the kernel and userland.
  */
 /*
  * Fill in a CCB for a FORMAT UNIT command.
  *
  * csio       - SCSI I/O CCB to populate.
  * retries    - retry count handed to cam_fill_csio().
  * cbfcnp     - completion callback.
  * tag_action - tag action handed to cam_fill_csio().
  * byte2      - stored in the CDB's byte2 field as-is.
  * ileave     - interleave, stored as a 2-byte value in the CDB.
  * data_ptr   - parameter data buffer.
  * dxfer_len  - parameter data length; 0 selects CAM_DIR_NONE, anything
  *              else CAM_DIR_OUT.
  * sense_len  - sense buffer length.
  * timeout    - command timeout handed to cam_fill_csio().
  */
 void
 scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave,
 		 u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
 		 u_int32_t timeout)
 {
 	struct scsi_format_unit *scsi_cmd;
 
 	scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes;
 	/*
 	 * Clear the CDB first so that bytes not set explicitly below do not
 	 * carry over whatever the CCB held before.
 	 */
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = FORMAT_UNIT;
 	scsi_cmd->byte2 = byte2;
 	scsi_ulto2b(ileave, scsi_cmd->interleave);
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      sizeof(*scsi_cmd),
 		      timeout);
 }
 
 /*
  * Fill in a CCB for READ DEFECT DATA, choosing between the 10 and 12 byte
  * CDB.
  *
  * The 10 byte form is used only when minimum_cmd_size is at most 10,
  * addr_desc_index is 0 and dxfer_len does not exceed SRDD10_MAX_LENGTH;
  * every other combination gets the 12 byte form.  The data direction is
  * always CAM_DIR_IN.
  */
 void
 scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries,
 		  void (*cbfcnp)(struct cam_periph *, union ccb *),
 		  uint8_t tag_action, uint8_t list_format,
 		  uint32_t addr_desc_index, uint8_t *data_ptr,
 		  uint32_t dxfer_len, int minimum_cmd_size,
 		  uint8_t sense_len, uint32_t timeout)
 {
 	uint8_t cdb_len;
 
 	if ((minimum_cmd_size > 10)
 	 || (addr_desc_index != 0)
 	 || (dxfer_len > SRDD10_MAX_LENGTH)) {
 		/* Something does not fit the short form: 12 byte CDB. */
 		struct scsi_read_defect_data_12 *long_cdb;
 
 		long_cdb = (struct scsi_read_defect_data_12 *)
 			&csio->cdb_io.cdb_bytes;
 
 		cdb_len = sizeof(*long_cdb);
 		bzero(long_cdb, cdb_len);
 		long_cdb->opcode = READ_DEFECT_DATA_12;
 		long_cdb->format = list_format;
 		scsi_ulto4b(dxfer_len, long_cdb->alloc_length);
 		scsi_ulto4b(addr_desc_index,
 			    long_cdb->address_descriptor_index);
 	} else {
 		/* 10 byte CDB. */
 		struct scsi_read_defect_data_10 *short_cdb;
 
 		short_cdb = (struct scsi_read_defect_data_10 *)
 			&csio->cdb_io.cdb_bytes;
 
 		cdb_len = sizeof(*short_cdb);
 		bzero(short_cdb, cdb_len);
 		short_cdb->opcode = READ_DEFECT_DATA_10;
 		short_cdb->format = list_format;
 		scsi_ulto2b(dxfer_len, short_cdb->alloc_length);
 	}
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ CAM_DIR_IN,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      cdb_len,
 		      timeout);
 }
 
 /*
  * Fill in a CCB for a SANITIZE command.
  *
  * csio       - SCSI I/O CCB to populate.
  * retries    - retry count handed to cam_fill_csio().
  * cbfcnp     - completion callback.
  * tag_action - tag action handed to cam_fill_csio().
  * byte2      - stored in the CDB's byte2 field as-is.
  * control    - stored in the CDB's control field.
  *              NOTE(review): the parameter is 16 bits wide; if the CDB
  *              field is narrower the value is truncated on assignment.
  * data_ptr   - parameter data buffer.
  * dxfer_len  - parameter data length, also written to the CDB as a 2-byte
  *              value; 0 selects CAM_DIR_NONE, anything else CAM_DIR_OUT.
  * sense_len  - sense buffer length.
  * timeout    - command timeout handed to cam_fill_csio().
  */
 void
 scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int8_t tag_action, u_int8_t byte2, u_int16_t control,
 	      u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
 	      u_int32_t timeout)
 {
 	struct scsi_sanitize *scsi_cmd;
 
 	scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes;
 	/*
 	 * Clear the CDB first so that bytes not set explicitly below do not
 	 * carry over whatever the CCB held before.
 	 */
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = SANITIZE;
 	scsi_cmd->byte2 = byte2;
 	scsi_cmd->control = control;
 	scsi_ulto2b(dxfer_len, scsi_cmd->length);
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      sizeof(*scsi_cmd),
 		      timeout);
 }
 
 #endif /* _KERNEL */
 
 /*
  * Fill in a CCB for a ZBC OUT command.
  *
  * csio           - SCSI I/O CCB to populate.
  * retries        - retry count handed to cam_fill_csio().
  * cbfcnp         - completion callback.
  * tag_action     - tag action handed to cam_fill_csio().
  * service_action - stored in the CDB's service_action field.
  * zone_id        - stored as an 8-byte value in the CDB's zone_id field.
  * zone_flags     - stored in the CDB's zone_flags field.
  * data_ptr       - data buffer.
  * dxfer_len      - data length; 0 selects CAM_DIR_NONE, anything else
  *                  CAM_DIR_OUT.
  * sense_len      - sense buffer length.
  * timeout        - command timeout handed to cam_fill_csio().
  */
 void
 scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries,
 	     void (*cbfcnp)(struct cam_periph *, union ccb *),
 	     uint8_t tag_action, uint8_t service_action, uint64_t zone_id,
 	     uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len,
 	     uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_zbc_out *scsi_cmd;
 
 	scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes;
 	/*
 	 * Clear the CDB first so that bytes not set explicitly below do not
 	 * carry over whatever the CCB held before.
 	 */
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = ZBC_OUT;
 	scsi_cmd->service_action = service_action;
 	scsi_u64to8b(zone_id, scsi_cmd->zone_id);
 	scsi_cmd->zone_flags = zone_flags;
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      sizeof(*scsi_cmd),
 		      timeout);
 }
 
 /*
  * Fill in a CCB for a ZBC IN command.
  *
  * csio           - SCSI I/O CCB to populate.
  * retries        - retry count handed to cam_fill_csio().
  * cbfcnp         - completion callback.
  * tag_action     - tag action handed to cam_fill_csio().
  * service_action - stored in the CDB's service_action field.
  * zone_start_lba - stored as an 8-byte value in the CDB.
  * zone_options   - stored in the CDB's zone_options field.
  * data_ptr       - buffer receiving the returned data.
  * dxfer_len      - buffer length, also written to the CDB as the
  *                  allocation length; 0 selects CAM_DIR_NONE, anything
  *                  else CAM_DIR_IN.
  * sense_len      - sense buffer length.
  * timeout        - command timeout handed to cam_fill_csio().
  */
 void
 scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries,
 	    void (*cbfcnp)(struct cam_periph *, union ccb *),
 	    uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba,
 	    uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len,
 	    uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_zbc_in *scsi_cmd;
 
 	scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes;
 	/*
 	 * Clear the CDB first so that bytes not set explicitly below do not
 	 * carry over whatever the CCB held before.
 	 */
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = ZBC_IN;
 	scsi_cmd->service_action = service_action;
 	/*
 	 * Tell the device how much data may be returned.
 	 * NOTE(review): assumes struct scsi_zbc_in has a 4-byte `length'
 	 * member for the allocation length -- confirm against scsi_da.h.
 	 */
 	scsi_ulto4b(dxfer_len, scsi_cmd->length);
 	scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba);
 	scsi_cmd->zone_options = zone_options;
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ (dxfer_len > 0) ? CAM_DIR_IN : CAM_DIR_NONE,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      sizeof(*scsi_cmd),
 		      timeout);
 }
 
 /*
  * Build an ATA pass-through CCB for a ZAC management "out" command.
  *
  * use_ncq == 0 selects ATA_ZAC_MANAGEMENT_OUT (non-data or DMA protocol);
  * otherwise ATA_NCQ_NON_DATA is used when dxfer_len is 0 and
  * ATA_SEND_FPDMA_QUEUED when there is data.  zm_action (low 4 bits) and
  * zone_flags (shifted left by 8) travel in the features register for the
  * non-NCQ form and in the auxiliary register for the NCQ forms.
  *
  * Returns 1 without touching csio when an NCQ transfer is larger than
  * 65536 512-byte blocks; otherwise returns the result of scsi_ata_pass().
  * The sense_len argument is not forwarded: SSD_FULL_SIZE is always used.
  */
 int
 scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries,
 		      void (*cbfcnp)(struct cam_periph *, union ccb *),
 		      uint8_t tag_action, int use_ncq,
 		      uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
 		      uint8_t *data_ptr, uint32_t dxfer_len,
 		      uint8_t *cdb_storage, size_t cdb_storage_len,
 		      uint8_t sense_len, uint32_t timeout)
 {
 	uint8_t command_out, protocol, ata_flags;
 	uint16_t features_out;
 	uint32_t sectors_out, auxiliary;
 	int zac_bits;
 
 	/* Zone action in the low nibble, zone flags in bits 8-15. */
 	zac_bits = (zm_action & 0xf) | (zone_flags << 8);
 	ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
 
 	if (use_ncq != 0) {
 		protocol = AP_PROTO_FPDMA;
 		auxiliary = zac_bits;
 
 		if (dxfer_len != 0) {
 			command_out = ATA_SEND_FPDMA_QUEUED;
 			/*
 			 * Note that we're defaulting to normal priority,
 			 * and assuming that the SCSI to ATA translation
 			 * layer will insert the NCQ tag number in the tag
 			 * field.  That isn't clear in the SAT-4 spec (as
 			 * of rev 05).
 			 */
 			sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8;
 			ata_flags |= AP_FLAG_TLEN_FEAT | AP_FLAG_TDIR_TO_DEV;
 
 			/*
 			 * For SEND FPDMA QUEUED, the transfer length is
 			 * encoded in the FEATURE register, and 0 means
 			 * that 65536 512 byte blocks are to be transferred.
 			 * In practice, it seems unlikely that we'll see
 			 * a transfer that large, and it may confuse the
 			 * SAT layer, because generally that means that
 			 * 0 bytes should be transferred.
 			 */
 			if (dxfer_len == (65536 * 512))
 				features_out = 0;
 			else if (dxfer_len <= (65535 * 512))
 				features_out = ((dxfer_len >> 9) & 0xffff);
 			else
 				return (1);	/* The transfer is too big. */
 		} else {
 			command_out = ATA_NCQ_NON_DATA;
 			features_out = ATA_NCQ_ZAC_MGMT_OUT;
 			/*
 			 * We're assuming the SCSI to ATA translation layer
 			 * will set the NCQ tag number in the tag field.
 			 * That isn't clear from the SAT-4 spec (as of rev 05).
 			 */
 			sectors_out = 0;
 			ata_flags |= AP_FLAG_TLEN_NO_DATA;
 		}
 	} else {
 		command_out = ATA_ZAC_MANAGEMENT_OUT;
 		features_out = zac_bits;
 		auxiliary = 0;
 
 		if (dxfer_len != 0) {
 			protocol = AP_PROTO_DMA;
 			ata_flags |= AP_FLAG_TLEN_SECT_CNT |
 				     AP_FLAG_TDIR_TO_DEV;
 			sectors_out = ((dxfer_len >> 9) & 0xffff);
 		} else {
 			protocol = AP_PROTO_NON_DATA;
 			ata_flags |= AP_FLAG_TLEN_NO_DATA;
 			sectors_out = 0;
 		}
 	}
 
 	protocol |= AP_EXTEND;
 
 	return (scsi_ata_pass(csio,
 	    retries,
 	    cbfcnp,
 	    /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 	    tag_action,
 	    /*protocol*/ protocol,
 	    /*ata_flags*/ ata_flags,
 	    /*features*/ features_out,
 	    /*sector_count*/ sectors_out,
 	    /*lba*/ zone_id,
 	    /*command*/ command_out,
 	    /*device*/ 0,
 	    /*icc*/ 0,
 	    /*auxiliary*/ auxiliary,
 	    /*control*/ 0,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ dxfer_len,
 	    /*cdb_storage*/ cdb_storage,
 	    /*cdb_storage_len*/ cdb_storage_len,
 	    /*minimum_cmd_size*/ 0,
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout));
 }
 
 /*
  * Build an ATA pass-through CCB for a ZAC management "in" command.
  *
  * use_ncq == 0 selects ATA_ZAC_MANAGEMENT_IN with the DMA protocol;
  * otherwise ATA_RECV_FPDMA_QUEUED is used.  zm_action (low 4 bits) and
  * zone_flags (shifted left by 8) travel in the features register for the
  * non-NCQ form and in the auxiliary register for the NCQ form.  The data
  * direction is always CAM_DIR_IN and the transfer length handed to
  * scsi_ata_pass() is dxfer_len rounded down to a multiple of 512.
  *
  * Returns 1 without touching csio when an NCQ transfer is larger than
  * 65536 512-byte blocks; otherwise returns the result of scsi_ata_pass().
  *
  * NOTE(review): the sense_len argument is not forwarded (SSD_FULL_SIZE is
  * always used), and the non-NCQ path stores dxfer_len >> 9 in a 16-bit
  * variable without a range check -- confirm both are intended.
  */
 int
 scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     uint8_t tag_action, int use_ncq,
 		     uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
 		     uint8_t *data_ptr, uint32_t dxfer_len,
 		     uint8_t *cdb_storage, size_t cdb_storage_len,
 		     uint8_t sense_len, uint32_t timeout)
 {
 	uint8_t command_out, protocol;
 	uint16_t features_out, sectors_out;
 	uint32_t auxiliary;
 	int ata_flags;
 	int retval;
 
 	retval = 0;
 	ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS;
 
 	if (use_ncq == 0) {
 		command_out = ATA_ZAC_MANAGEMENT_IN;
 		/* XXX KDM put a macro here */
 		features_out = (zm_action & 0xf) | (zone_flags << 8);
 		sectors_out = dxfer_len >> 9; /* XXX KDM macro */
 		protocol = AP_PROTO_DMA;
 		ata_flags |= AP_FLAG_TLEN_SECT_CNT;
 		auxiliary = 0;
 	} else {
 		ata_flags |= AP_FLAG_TLEN_FEAT;
 
 		command_out = ATA_RECV_FPDMA_QUEUED;
 		sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8;
 
 		/*
 		 * For RECEIVE FPDMA QUEUED, the transfer length is
 		 * encoded in the FEATURE register, and 0 means
 		 * that 65536 512 byte blocks are to be transferred.
 		 * In practice, it seems unlikely that we'll see
 		 * a transfer that large, and it may confuse the
 		 * SAT layer, because generally that means that
 		 * 0 bytes should be transferred.
 		 */
 		if (dxfer_len == (65536 * 512)) {
 			features_out = 0;
 		} else if (dxfer_len <= (65535 * 512)) {
 			features_out = ((dxfer_len >> 9) & 0xffff);
 		} else {
 			/* The transfer is too big. */
 			retval = 1;
 			goto bailout;
 		}
 		/*
 		 * Two separate statements; the original joined them with a
 		 * stray comma operator.
 		 */
 		auxiliary = (zm_action & 0xf) | (zone_flags << 8);
 		protocol = AP_PROTO_FPDMA;
 	}
 
 	protocol |= AP_EXTEND;
 
 	retval = scsi_ata_pass(csio,
 	    retries,
 	    cbfcnp,
 	    /*flags*/ CAM_DIR_IN,
 	    tag_action,
 	    /*protocol*/ protocol,
 	    /*ata_flags*/ ata_flags,
 	    /*features*/ features_out,
 	    /*sector_count*/ sectors_out,
 	    /*lba*/ zone_id,
 	    /*command*/ command_out,
 	    /*device*/ 0,
 	    /*icc*/ 0,
 	    /*auxiliary*/ auxiliary,
 	    /*control*/ 0,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */
 	    /*cdb_storage*/ cdb_storage,
 	    /*cdb_storage_len*/ cdb_storage_len,
 	    /*minimum_cmd_size*/ 0,
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout);
 
 bailout:
 	return (retval);
 }
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options	(revision 308154)
+++ head/sys/conf/options	(revision 308155)
@@ -1,996 +1,998 @@
 # $FreeBSD$
 #
 #        On the handling of kernel options
 #
 # All kernel options should be listed in NOTES, with suitable
 # descriptions.  Negative options (options that make some code not
 # compile) should be commented out; LINT (generated from NOTES) should
 # compile as much code as possible.  Try to structure option-using
 # code so that a single option only switch code on, or only switch
 # code off, to make it possible to have a full compile-test.  If
 # necessary, you can check for COMPILING_LINT to get maximum code
 # coverage.
 #
 # All new options shall also be listed in either "conf/options" or
 # "conf/options.<machine>".  Options that affect a single source-file
 # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options
 # that affect multiple files should either go in "opt_global.h" if
 # this is a kernel-wide option (used just about everywhere), or in
 # "opt_<option-name-in-lower-case>.h" if it affects only some files.
 # Note that the effect of listing only an option without a
 # header-file-name in conf/options (and cousins) is that the last
 # convention is followed.
 #
 # This handling scheme is not yet fully implemented.
 #
 #
 # Format of this file:
 # Option name	filename
 #
 # If filename is missing, the default is
 # opt_<name-of-option-in-lower-case>.h
 
 AAC_DEBUG		opt_aac.h
 AACRAID_DEBUG		opt_aacraid.h
 AHC_ALLOW_MEMIO		opt_aic7xxx.h
 AHC_TMODE_ENABLE	opt_aic7xxx.h
 AHC_DUMP_EEPROM		opt_aic7xxx.h
 AHC_DEBUG		opt_aic7xxx.h
 AHC_DEBUG_OPTS		opt_aic7xxx.h
 AHC_REG_PRETTY_PRINT	opt_aic7xxx.h
 AHD_DEBUG		opt_aic79xx.h
 AHD_DEBUG_OPTS		opt_aic79xx.h
 AHD_TMODE_ENABLE	opt_aic79xx.h	
 AHD_REG_PRETTY_PRINT	opt_aic79xx.h
 ADW_ALLOW_MEMIO		opt_adw.h
 
 TWA_DEBUG		opt_twa.h
 TWA_FLASH_FIRMWARE	opt_twa.h
 
 # Debugging options.
 ALT_BREAK_TO_DEBUGGER	opt_kdb.h
 BREAK_TO_DEBUGGER	opt_kdb.h
+BUF_TRACKING		opt_global.h
 DDB
 DDB_BUFR_SIZE	opt_ddb.h
 DDB_CAPTURE_DEFAULTBUFSIZE	opt_ddb.h
 DDB_CAPTURE_MAXBUFSIZE	opt_ddb.h
 DDB_CTF		opt_ddb.h
 DDB_NUMSYM	opt_ddb.h
+FULL_BUF_TRACKING	opt_global.h
 GDB
 KDB		opt_global.h
 KDB_TRACE	opt_kdb.h
 KDB_UNATTENDED	opt_kdb.h
 KLD_DEBUG	opt_kld.h
 SYSCTL_DEBUG	opt_sysctl.h
 EARLY_PRINTF	opt_global.h
 TEXTDUMP_PREFERRED	opt_ddb.h
 TEXTDUMP_VERBOSE	opt_ddb.h
 NUM_CORE_FILES	opt_global.h
 
 # Miscellaneous options.
 ADAPTIVE_LOCKMGRS
 ALQ
 ALTERA_SDCARD_FAST_SIM	opt_altera_sdcard.h
 ATSE_CFI_HACK	opt_cfi.h
 AUDIT		opt_global.h
 BOOTHOWTO	opt_global.h
 BOOTVERBOSE	opt_global.h
 CALLOUT_PROFILING
 CAPABILITIES	opt_capsicum.h
 CAPABILITY_MODE	opt_capsicum.h
 COMPAT_43	opt_compat.h
 COMPAT_43TTY	opt_compat.h
 COMPAT_FREEBSD4	opt_compat.h
 COMPAT_FREEBSD5	opt_compat.h
 COMPAT_FREEBSD6	opt_compat.h
 COMPAT_FREEBSD7	opt_compat.h
 COMPAT_FREEBSD9	opt_compat.h
 COMPAT_FREEBSD10	opt_compat.h
 COMPAT_CLOUDABI32	opt_dontuse.h
 COMPAT_CLOUDABI64	opt_dontuse.h
 COMPAT_LINUXKPI	opt_compat.h
 COMPILING_LINT	opt_global.h
 CY_PCI_FASTINTR
 DEADLKRES	opt_watchdog.h
 DEVICE_NUMA
 EXT_RESOURCES	opt_global.h
 DIRECTIO
 FILEMON		opt_dontuse.h
 FFCLOCK
 FULL_PREEMPTION	opt_sched.h
 GZIO		opt_gzio.h
 IMAGACT_BINMISC		opt_dontuse.h
 IPI_PREEMPTION	opt_sched.h
 GEOM_AES	opt_geom.h
 GEOM_BDE	opt_geom.h
 GEOM_BSD	opt_geom.h
 GEOM_CACHE	opt_geom.h
 GEOM_CONCAT	opt_geom.h
 GEOM_ELI	opt_geom.h
 GEOM_FOX	opt_geom.h
 GEOM_GATE	opt_geom.h
 GEOM_JOURNAL	opt_geom.h
 GEOM_LABEL	opt_geom.h
 GEOM_LABEL_GPT	opt_geom.h
 GEOM_LINUX_LVM	opt_geom.h
 GEOM_MAP	opt_geom.h
 GEOM_MBR	opt_geom.h
 GEOM_MIRROR	opt_geom.h
 GEOM_MOUNTVER	opt_geom.h
 GEOM_MULTIPATH	opt_geom.h
 GEOM_NOP	opt_geom.h
 GEOM_PART_APM	opt_geom.h
 GEOM_PART_BSD	opt_geom.h
 GEOM_PART_BSD64	opt_geom.h
 GEOM_PART_EBR	opt_geom.h
 GEOM_PART_EBR_COMPAT	opt_geom.h
 GEOM_PART_GPT	opt_geom.h
 GEOM_PART_LDM	opt_geom.h
 GEOM_PART_MBR	opt_geom.h
 GEOM_PART_PC98	opt_geom.h
 GEOM_PART_VTOC8	opt_geom.h
 GEOM_PC98	opt_geom.h
 GEOM_RAID	opt_geom.h
 GEOM_RAID3	opt_geom.h
 GEOM_SHSEC	opt_geom.h
 GEOM_STRIPE	opt_geom.h
 GEOM_SUNLABEL	opt_geom.h
 GEOM_UZIP	opt_geom.h
 GEOM_UZIP_DEBUG	opt_geom.h
 GEOM_VINUM	opt_geom.h
 GEOM_VIRSTOR	opt_geom.h
 GEOM_VOL	opt_geom.h
 GEOM_ZERO	opt_geom.h
 IFLIB		opt_iflib.h
 KDTRACE_HOOKS	opt_global.h
 KDTRACE_FRAME	opt_kdtrace.h
 KN_HASHSIZE	opt_kqueue.h
 KSTACK_MAX_PAGES
 KSTACK_PAGES
 KSTACK_USAGE_PROF
 KTRACE
 KTRACE_REQUEST_POOL	opt_ktrace.h
 LIBICONV
 MAC		opt_global.h
 MAC_BIBA	opt_dontuse.h
 MAC_BSDEXTENDED	opt_dontuse.h
 MAC_IFOFF	opt_dontuse.h
 MAC_LOMAC	opt_dontuse.h
 MAC_MLS		opt_dontuse.h
 MAC_NONE	opt_dontuse.h
 MAC_PARTITION	opt_dontuse.h
 MAC_PORTACL	opt_dontuse.h
 MAC_SEEOTHERUIDS	opt_dontuse.h
 MAC_STATIC	opt_mac.h
 MAC_STUB	opt_dontuse.h
 MAC_TEST	opt_dontuse.h
 MD_ROOT		opt_md.h
 MD_ROOT_FSTYPE	opt_md.h
 MD_ROOT_SIZE	opt_md.h
 MFI_DEBUG	opt_mfi.h
 MFI_DECODE_LOG	opt_mfi.h
 MPROF_BUFFERS	opt_mprof.h
 MPROF_HASH_SIZE	opt_mprof.h
 NEW_PCIB	opt_global.h
 NO_ADAPTIVE_MUTEXES	opt_adaptive_mutexes.h
 NO_ADAPTIVE_RWLOCKS
 NO_ADAPTIVE_SX
 NO_EVENTTIMERS		opt_timer.h
 NO_SYSCTL_DESCR	opt_global.h
 NSWBUF_MIN	opt_swap.h
 MBUF_PACKET_ZONE_DISABLE	opt_global.h
 PANIC_REBOOT_WAIT_TIME	opt_panic.h
 PCI_HP		opt_pci.h
 PCI_IOV		opt_global.h
 PPC_DEBUG	opt_ppc.h
 PPC_PROBE_CHIPSET	opt_ppc.h
 PPS_SYNC	opt_ntp.h
 PREEMPTION	opt_sched.h
 QUOTA
 SCHED_4BSD	opt_sched.h
 SCHED_STATS	opt_sched.h
 SCHED_ULE	opt_sched.h
 SLEEPQUEUE_PROFILING
 SLHCI_DEBUG	opt_slhci.h
 SPX_HACK
 STACK		opt_stack.h
 SUIDDIR
 MSGMNB		opt_sysvipc.h
 MSGMNI		opt_sysvipc.h
 MSGSEG		opt_sysvipc.h
 MSGSSZ		opt_sysvipc.h
 MSGTQL		opt_sysvipc.h
 SEMMNI		opt_sysvipc.h
 SEMMNS		opt_sysvipc.h
 SEMMNU		opt_sysvipc.h
 SEMMSL		opt_sysvipc.h
 SEMOPM		opt_sysvipc.h
 SEMUME		opt_sysvipc.h
 SHMALL		opt_sysvipc.h
 SHMMAX		opt_sysvipc.h
 SHMMAXPGS	opt_sysvipc.h
 SHMMIN		opt_sysvipc.h
 SHMMNI		opt_sysvipc.h
 SHMSEG		opt_sysvipc.h
 SYSVMSG		opt_sysvipc.h
 SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
 VERBOSE_SYSINIT
 
 # POSIX kernel options
 P1003_1B_MQUEUE			opt_posix.h
 P1003_1B_SEMAPHORES		opt_posix.h
 _KPOSIX_PRIORITY_SCHEDULING	opt_posix.h
 
 # Do we want the config file compiled into the kernel?
 INCLUDE_CONFIG_FILE	opt_config.h
 
 # Options for static filesystems.  These should only be used at config
 # time, since the corresponding lkms cannot work if there are any static
 # dependencies.  Unusability is enforced by hiding the defines for the
 # options in a never-included header.
 AUTOFS		opt_dontuse.h
 CD9660		opt_dontuse.h
 EXT2FS		opt_dontuse.h
 FDESCFS		opt_dontuse.h
 FFS		opt_dontuse.h
 FUSE		opt_dontuse.h
 MSDOSFS		opt_dontuse.h
 NANDFS		opt_dontuse.h
 NULLFS		opt_dontuse.h
 PROCFS		opt_dontuse.h
 PSEUDOFS	opt_dontuse.h
 SMBFS		opt_dontuse.h
 TMPFS		opt_dontuse.h
 UDF		opt_dontuse.h
 UNIONFS		opt_dontuse.h
 ZFS		opt_dontuse.h
 
 # Pseudofs debugging
 PSEUDOFS_TRACE	opt_pseudofs.h
 
 # In-kernel GSS-API
 KGSSAPI		opt_kgssapi.h
 KGSSAPI_DEBUG	opt_kgssapi.h
 
 # These static filesystems have one slightly bogus static dependency in
 # sys/i386/i386/autoconf.c.  If any of these filesystems are
 # statically compiled into the kernel, code for mounting them as root
 # filesystems will be enabled - but look below.
 # NFSCL - client
 # NFSD - server
 NFSCL		opt_nfs.h
 NFSD		opt_nfs.h
 
 # filesystems and libiconv bridge
 CD9660_ICONV	opt_dontuse.h
 MSDOSFS_ICONV	opt_dontuse.h
 UDF_ICONV	opt_dontuse.h
 
 # If you are following the conditions in the copyright,
 # you can enable soft-updates, which will speed up a lot of things
 # and make the system safer from crashes at the same time.
 # Otherwise a STUB module will be compiled in.
 SOFTUPDATES	opt_ffs.h
 
 # On small, embedded systems, it can be useful to turn off support for
 # snapshots.  It saves about 30-40k for a feature that would be lightly
 # used, if it is used at all.
 NO_FFS_SNAPSHOT	opt_ffs.h
 
 # Enabling this option turns on support for Access Control Lists in UFS,
 # which can be used to support high security configurations.  Depends on
 # UFS_EXTATTR.
 UFS_ACL		opt_ufs.h
 
 # Enabling this option turns on support for extended attributes in UFS-based
 # filesystems, which can be used to support high security configurations
 # as well as new filesystem features.
 UFS_EXTATTR	opt_ufs.h
 UFS_EXTATTR_AUTOSTART	opt_ufs.h
 
 # Enable fast hash lookups for large directories on UFS-based filesystems.
 UFS_DIRHASH	opt_ufs.h
 
 # Enable gjournal-based UFS journal.
 UFS_GJOURNAL	opt_ufs.h
 
 # The below sentence is not in English, and neither is this one.
 # We plan to remove the static dependences above, with a
 # <filesystem>_ROOT option to control if it usable as root.  This list
 # allows these options to be present in config files already (though
 # they won't make any difference yet).
 NFS_ROOT	opt_nfsroot.h
 
 # SMB/CIFS requester
 NETSMB		opt_netsmb.h
 
 # Options used only in subr_param.c.
 HZ		opt_param.h
 MAXFILES	opt_param.h
 NBUF		opt_param.h
 NSFBUFS		opt_param.h
 VM_BCACHE_SIZE_MAX	opt_param.h
 VM_SWZONE_SIZE_MAX	opt_param.h
 MAXUSERS
 DFLDSIZ		opt_param.h
 MAXDSIZ		opt_param.h
 MAXSSIZ		opt_param.h
 
 # Generic SCSI options.
 CAM_MAX_HIGHPOWER	opt_cam.h
 CAMDEBUG		opt_cam.h
 CAM_DEBUG_COMPILE	opt_cam.h
 CAM_DEBUG_DELAY		opt_cam.h
 CAM_DEBUG_BUS		opt_cam.h
 CAM_DEBUG_TARGET	opt_cam.h
 CAM_DEBUG_LUN		opt_cam.h
 CAM_DEBUG_FLAGS		opt_cam.h
 CAM_BOOT_DELAY		opt_cam.h
 CAM_IOSCHED_DYNAMIC	opt_cam.h
 SCSI_DELAY		opt_scsi.h
 SCSI_NO_SENSE_STRINGS	opt_scsi.h
 SCSI_NO_OP_STRINGS	opt_scsi.h
 
 # Options used only in cam/ata/ata_da.c
 ADA_TEST_FAILURE	opt_ada.h
 ATA_STATIC_ID		opt_ada.h
 
 # Options used only in cam/scsi/scsi_cd.c
 CHANGER_MIN_BUSY_SECONDS	opt_cd.h
 CHANGER_MAX_BUSY_SECONDS	opt_cd.h
 
 # Options used only in cam/scsi/scsi_sa.c.
 SA_IO_TIMEOUT		opt_sa.h
 SA_SPACE_TIMEOUT	opt_sa.h
 SA_REWIND_TIMEOUT	opt_sa.h
 SA_ERASE_TIMEOUT	opt_sa.h
 SA_1FM_AT_EOD		opt_sa.h
 
 # Options used only in cam/scsi/scsi_pt.c
 SCSI_PT_DEFAULT_TIMEOUT	opt_pt.h
 
 # Options used only in cam/scsi/scsi_ses.c
 SES_ENABLE_PASSTHROUGH	opt_ses.h
 
 # Options used in dev/sym/ (Symbios SCSI driver).
 SYM_SETUP_LP_PROBE_MAP	opt_sym.h	#-Low Priority Probe Map (bits)
 					# Allows the ncr to take precedence
 					# 1 (1<<0) -> 810a, 860
 					# 2 (1<<1) -> 825a, 875, 885, 895
 					# 4 (1<<2) -> 895a, 896, 1510d 
 SYM_SETUP_SCSI_DIFF	opt_sym.h	#-HVD support for 825a, 875, 885
 					# disabled:0 (default), enabled:1
 SYM_SETUP_PCI_PARITY	opt_sym.h	#-PCI parity checking
 					# disabled:0, enabled:1 (default)
 SYM_SETUP_MAX_LUN	opt_sym.h	#-Number of LUNs supported
 					# default:8, range:[1..64]
 
 # Options used only in dev/ncr/*
 SCSI_NCR_DEBUG		opt_ncr.h
 SCSI_NCR_MAX_SYNC	opt_ncr.h
 SCSI_NCR_MAX_WIDE	opt_ncr.h
 SCSI_NCR_MYADDR		opt_ncr.h
 
 # Options used only in dev/isp/*
 ISP_TARGET_MODE		opt_isp.h
 ISP_FW_CRASH_DUMP	opt_isp.h
 ISP_DEFAULT_ROLES	opt_isp.h
 ISP_INTERNAL_TARGET	opt_isp.h
 
 # Options used only in dev/iscsi
 ISCSI_INITIATOR_DEBUG	opt_iscsi_initiator.h
 
 # Net stuff.
 ACCEPT_FILTER_DATA
 ACCEPT_FILTER_DNS
 ACCEPT_FILTER_HTTP
 ALTQ			opt_global.h
 ALTQ_CBQ		opt_altq.h
 ALTQ_CDNR		opt_altq.h
 ALTQ_CODEL		opt_altq.h
 ALTQ_DEBUG		opt_altq.h
 ALTQ_HFSC		opt_altq.h
 ALTQ_FAIRQ		opt_altq.h
 ALTQ_NOPCC		opt_altq.h
 ALTQ_PRIQ		opt_altq.h
 ALTQ_RED		opt_altq.h
 ALTQ_RIO		opt_altq.h
 BOOTP			opt_bootp.h
 BOOTP_BLOCKSIZE		opt_bootp.h
 BOOTP_COMPAT		opt_bootp.h
 BOOTP_NFSROOT		opt_bootp.h
 BOOTP_NFSV3		opt_bootp.h
 BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DUMMYNET		opt_ipdn.h
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT
 IPFILTER		opt_ipfilter.h
 IPFILTER_DEFAULT_BLOCK	opt_ipfilter.h
 IPFILTER_LOG		opt_ipfilter.h
 IPFILTER_LOOKUP		opt_ipfilter.h
 IPFIREWALL		opt_ipfw.h
 IPFIREWALL_DEFAULT_TO_ACCEPT	opt_ipfw.h
 IPFIREWALL_NAT		opt_ipfw.h
 IPFIREWALL_NAT64	opt_ipfw.h
 IPFIREWALL_NAT64_DIRECT_OUTPUT	opt_ipfw.h
 IPFIREWALL_NPTV6	opt_ipfw.h
 IPFIREWALL_VERBOSE	opt_ipfw.h
 IPFIREWALL_VERBOSE_LIMIT	opt_ipfw.h
 IPSEC			opt_ipsec.h
 IPSEC_DEBUG		opt_ipsec.h
 IPSEC_NAT_T		opt_ipsec.h
 IPSTEALTH
 KRPC
 LIBALIAS
 LIBMBPOOL
 LIBMCHAIN
 MBUF_PROFILING
 MBUF_STRESS_TEST
 MROUTING		opt_mrouting.h
 NFSLOCKD
 PCBGROUP		opt_pcbgroup.h
 PF_DEFAULT_TO_DROP	opt_pf.h
 RADIX_MPATH		opt_mpath.h
 ROUTETABLES		opt_route.h
 RSS			opt_rss.h
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
 TCPPCAP		opt_global.h
 SIFTR
 TCP_HHOOK		opt_inet.h
 TCP_OFFLOAD		opt_inet.h # Enable code to dispatch TCP offloading
 TCP_RFC7413		opt_inet.h
 TCP_RFC7413_MAX_KEYS	opt_inet.h
 TCP_SIGNATURE		opt_inet.h
 VLAN_ARRAY		opt_vlan.h
 XBONEHACK
 FLOWTABLE		opt_route.h
 FLOWTABLE_HASH_ALL	opt_route.h
 
 #
 # SCTP
 #
 SCTP			opt_sctp.h
 SCTP_DEBUG		opt_sctp.h # Enable debug printfs
 SCTP_WITH_NO_CSUM	opt_sctp.h # Use this at your peril
 SCTP_LOCK_LOGGING	opt_sctp.h # Log to KTR lock activity
 SCTP_MBUF_LOGGING	opt_sctp.h # Log to KTR general mbuf alloc/free
 SCTP_MBCNT_LOGGING	opt_sctp.h # Log to KTR mbcnt activity
 SCTP_PACKET_LOGGING	opt_sctp.h # Log to a packet buffer last N packets
 SCTP_LTRACE_CHUNKS	opt_sctp.h # Log to KTR chunks processed
 SCTP_LTRACE_ERRORS	opt_sctp.h # Log to KTR error returns.
 SCTP_USE_PERCPU_STAT	opt_sctp.h # Use per cpu stats.
 SCTP_MCORE_INPUT	opt_sctp.h # Have multiple input threads for input mbufs
 SCTP_LOCAL_TRACE_BUF	opt_sctp.h # Use tracebuffer exported via sysctl
 SCTP_DETAILED_STR_STATS	opt_sctp.h # Use per PR-SCTP policy stream stats
 #
 #
 #
 
 # Netgraph(4). Use option NETGRAPH to enable the base netgraph code.
 # Each netgraph node type can either be compiled into the kernel
 # or loaded dynamically. To get the former, include the corresponding
 # option below. Each type has its own man page, e.g. ng_async(4).
 NETGRAPH
 NETGRAPH_DEBUG		opt_netgraph.h
 NETGRAPH_ASYNC		opt_netgraph.h
 NETGRAPH_ATMLLC		opt_netgraph.h
 NETGRAPH_ATM_ATMPIF	opt_netgraph.h
 NETGRAPH_BLUETOOTH	opt_netgraph.h
 NETGRAPH_BLUETOOTH_BT3C	opt_netgraph.h
 NETGRAPH_BLUETOOTH_H4	opt_netgraph.h
 NETGRAPH_BLUETOOTH_HCI	opt_netgraph.h
 NETGRAPH_BLUETOOTH_L2CAP	opt_netgraph.h
 NETGRAPH_BLUETOOTH_SOCKET	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBT	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBTBCMFW	opt_netgraph.h
 NETGRAPH_BPF		opt_netgraph.h
 NETGRAPH_BRIDGE		opt_netgraph.h
 NETGRAPH_CAR		opt_netgraph.h
 NETGRAPH_CISCO		opt_netgraph.h
 NETGRAPH_DEFLATE	opt_netgraph.h
 NETGRAPH_DEVICE		opt_netgraph.h
 NETGRAPH_ECHO		opt_netgraph.h
 NETGRAPH_EIFACE		opt_netgraph.h
 NETGRAPH_ETHER		opt_netgraph.h
 NETGRAPH_ETHER_ECHO	opt_netgraph.h
 NETGRAPH_FEC		opt_netgraph.h
 NETGRAPH_FRAME_RELAY	opt_netgraph.h
 NETGRAPH_GIF		opt_netgraph.h
 NETGRAPH_GIF_DEMUX	opt_netgraph.h
 NETGRAPH_HOLE		opt_netgraph.h
 NETGRAPH_IFACE		opt_netgraph.h
 NETGRAPH_IP_INPUT	opt_netgraph.h
 NETGRAPH_IPFW		opt_netgraph.h
 NETGRAPH_KSOCKET	opt_netgraph.h
 NETGRAPH_L2TP		opt_netgraph.h
 NETGRAPH_LMI		opt_netgraph.h
 # MPPC compression requires proprietary files (not included)
 NETGRAPH_MPPC_COMPRESSION	opt_netgraph.h
 NETGRAPH_MPPC_ENCRYPTION	opt_netgraph.h
 NETGRAPH_NAT		opt_netgraph.h
 NETGRAPH_NETFLOW	opt_netgraph.h
 NETGRAPH_ONE2MANY	opt_netgraph.h
 NETGRAPH_PATCH		opt_netgraph.h
 NETGRAPH_PIPE		opt_netgraph.h
 NETGRAPH_PPP		opt_netgraph.h
 NETGRAPH_PPPOE		opt_netgraph.h
 NETGRAPH_PPTPGRE	opt_netgraph.h
 NETGRAPH_PRED1		opt_netgraph.h
 NETGRAPH_RFC1490	opt_netgraph.h
 NETGRAPH_SOCKET		opt_netgraph.h
 NETGRAPH_SPLIT		opt_netgraph.h
 NETGRAPH_SPPP		opt_netgraph.h
 NETGRAPH_TAG		opt_netgraph.h
 NETGRAPH_TCPMSS		opt_netgraph.h
 NETGRAPH_TEE		opt_netgraph.h
 NETGRAPH_TTY		opt_netgraph.h
 NETGRAPH_UI		opt_netgraph.h
 NETGRAPH_VJC		opt_netgraph.h
 NETGRAPH_VLAN		opt_netgraph.h
 
 # NgATM options
 NGATM_ATM		opt_netgraph.h
 NGATM_ATMBASE		opt_netgraph.h
 NGATM_SSCOP		opt_netgraph.h
 NGATM_SSCFU		opt_netgraph.h
 NGATM_UNI		opt_netgraph.h
 NGATM_CCATM		opt_netgraph.h
 
 # DRM options
 DRM_DEBUG		opt_drm.h
 
 TI_SF_BUF_JUMBO		opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h
 
 # XXX Conflict: # of devices vs network protocol (Native ATM).
 # This makes "atm.h" unusable.
 NATM
 
 # DPT driver debug flags
 DPT_MEASURE_PERFORMANCE	opt_dpt.h
 DPT_RESET_HBA		opt_dpt.h
 
 # Misc debug flags.  Most of these should probably be replaced with
 # 'DEBUG', and then let people recompile just the interesting modules
 # with 'make CC="cc -DDEBUG"'.
 CLUSTERDEBUG		opt_debug_cluster.h
 DEBUG_1284		opt_ppb_1284.h
 VP0_DEBUG		opt_vpo.h
 LPT_DEBUG		opt_lpt.h
 PLIP_DEBUG		opt_plip.h
 LOCKF_DEBUG		opt_debug_lockf.h
 SI_DEBUG		opt_debug_si.h
 IFMEDIA_DEBUG		opt_ifmedia.h
 
 # Fb options
 FB_DEBUG		opt_fb.h
 FB_INSTALL_CDEV		opt_fb.h
 
 # ppbus related options
 PERIPH_1284		opt_ppb_1284.h
 DONTPROBE_1284		opt_ppb_1284.h
 
 # smbus related options
 ENABLE_ALART		opt_intpm.h
 
 # These cause changes all over the kernel
 BLKDEV_IOSIZE		opt_global.h
 BURN_BRIDGES		opt_global.h
 DEBUG			opt_global.h
 DEBUG_LOCKS		opt_global.h
 DEBUG_VFS_LOCKS		opt_global.h
 DFLTPHYS		opt_global.h
 DIAGNOSTIC		opt_global.h
 INVARIANT_SUPPORT	opt_global.h
 INVARIANTS		opt_global.h
 MAXCPU			opt_global.h
 MAXMEMDOM		opt_global.h
 MAXPHYS			opt_global.h
 MCLSHIFT		opt_global.h
 MUTEX_NOINLINE		opt_global.h
 LOCK_PROFILING		opt_global.h
 LOCK_PROFILING_FAST	opt_global.h
 MSIZE			opt_global.h
 REGRESSION		opt_global.h
 RWLOCK_NOINLINE		opt_global.h
 SX_NOINLINE		opt_global.h
 VFS_BIO_DEBUG		opt_global.h
 
 # These are VM related options
 VM_KMEM_SIZE		opt_vm.h
 VM_KMEM_SIZE_SCALE	opt_vm.h
 VM_KMEM_SIZE_MAX	opt_vm.h
 VM_NRESERVLEVEL		opt_vm.h
 VM_NUMA_ALLOC		opt_vm.h
 VM_LEVEL_0_ORDER	opt_vm.h
 NO_SWAPPING		opt_vm.h
 MALLOC_MAKE_FAILURES	opt_vm.h
 MALLOC_PROFILE		opt_vm.h
 MALLOC_DEBUG_MAXZONES	opt_vm.h
 
 # The MemGuard replacement allocator used for tamper-after-free detection
 DEBUG_MEMGUARD		opt_vm.h
 
 # The RedZone malloc(9) protection
 DEBUG_REDZONE		opt_vm.h
 
 # Standard SMP options
 EARLY_AP_STARTUP	opt_global.h
 SMP			opt_global.h
 
 # Size of the kernel message buffer
 MSGBUF_SIZE		opt_msgbuf.h
 
 # NFS options
 NFS_MINATTRTIMO		opt_nfs.h
 NFS_MAXATTRTIMO		opt_nfs.h
 NFS_MINDIRATTRTIMO	opt_nfs.h
 NFS_MAXDIRATTRTIMO	opt_nfs.h
 NFS_DEBUG		opt_nfs.h
 
 # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver
 OVERRIDE_CARD			opt_bktr.h
 OVERRIDE_TUNER			opt_bktr.h
 OVERRIDE_DBX			opt_bktr.h
 OVERRIDE_MSP			opt_bktr.h
 BROOKTREE_SYSTEM_DEFAULT	opt_bktr.h
 BROOKTREE_ALLOC_PAGES		opt_bktr.h
 BKTR_OVERRIDE_CARD		opt_bktr.h
 BKTR_OVERRIDE_TUNER		opt_bktr.h
 BKTR_OVERRIDE_DBX		opt_bktr.h
 BKTR_OVERRIDE_MSP		opt_bktr.h
 BKTR_SYSTEM_DEFAULT		opt_bktr.h
 BKTR_ALLOC_PAGES		opt_bktr.h
 BKTR_USE_PLL			opt_bktr.h	
 BKTR_GPIO_ACCESS		opt_bktr.h
 BKTR_NO_MSP_RESET		opt_bktr.h
 BKTR_430_FX_MODE		opt_bktr.h
 BKTR_SIS_VIA_MODE		opt_bktr.h
 BKTR_USE_FREEBSD_SMBUS		opt_bktr.h
 BKTR_NEW_MSP34XX_DRIVER		opt_bktr.h
 
 # Options for uart(4)
 UART_PPS_ON_CTS		opt_uart.h
 UART_POLL_FREQ		opt_uart.h
 UART_DEV_TOLERANCE_PCT	opt_uart.h
 
 # options for bus/device framework
 BUS_DEBUG		opt_bus.h
 
 # options for USB support
 USB_DEBUG		opt_usb.h
 USB_HOST_ALIGN		opt_usb.h
 USB_REQ_DEBUG		opt_usb.h
 USB_TEMPLATE		opt_usb.h
 USB_VERBOSE		opt_usb.h
 USB_DMA_SINGLE_ALLOC	opt_usb.h
 USB_EHCI_BIG_ENDIAN_DESC	opt_usb.h
 U3G_DEBUG		opt_u3g.h
 UKBD_DFLT_KEYMAP	opt_ukbd.h
 UPLCOM_INTR_INTERVAL	opt_uplcom.h
 UVSCOM_DEFAULT_OPKTSIZE	opt_uvscom.h
 UVSCOM_INTR_INTERVAL	opt_uvscom.h
 
 # options for the Realtek rtwn driver
 RTWN_DEBUG		opt_rtwn.h
 RTWN_WITHOUT_UCODE	opt_rtwn.h
 
 # Embedded system options
 INIT_PATH
 
 ROOTDEVNAME
 
 FDC_DEBUG		opt_fdc.h
 PCFCLOCK_VERBOSE	opt_pcfclock.h
 PCFCLOCK_MAX_RETRIES	opt_pcfclock.h
 
 KTR			opt_global.h
 KTR_ALQ			opt_ktr.h
 KTR_MASK		opt_ktr.h
 KTR_CPUMASK		opt_ktr.h
 KTR_COMPILE		opt_global.h
 KTR_BOOT_ENTRIES	opt_global.h
 KTR_ENTRIES		opt_global.h
 KTR_VERBOSE		opt_ktr.h
 WITNESS			opt_global.h
 WITNESS_KDB		opt_witness.h
 WITNESS_NO_VNODE	opt_witness.h
 WITNESS_SKIPSPIN	opt_witness.h
 WITNESS_COUNT		opt_witness.h
 OPENSOLARIS_WITNESS	opt_global.h
 
 # options for ACPI support
 ACPI_DEBUG		opt_acpi.h
 ACPI_MAX_TASKS		opt_acpi.h
 ACPI_MAX_THREADS	opt_acpi.h
 ACPI_DMAR		opt_acpi.h
 DEV_ACPI		opt_acpi.h
 
 # ISA support
 DEV_ISA			opt_isa.h
 ISAPNP			opt_isa.h
 
 # various 'device presence' options.
 DEV_BPF			opt_bpf.h
 DEV_CARP		opt_carp.h
 DEV_MCA			opt_mca.h
 DEV_NETMAP		opt_global.h
 DEV_PCI			opt_pci.h
 DEV_PF			opt_pf.h
 DEV_PFLOG		opt_pf.h
 DEV_PFSYNC		opt_pf.h
 DEV_RANDOM		opt_global.h
 DEV_SPLASH		opt_splash.h
 DEV_VLAN		opt_vlan.h
 
 # EISA support
 DEV_EISA		opt_eisa.h
 EISA_SLOTS		opt_eisa.h
 
 # ed driver
 ED_HPP			opt_ed.h
 ED_3C503		opt_ed.h
 ED_SIC			opt_ed.h
 
 # bce driver
 BCE_DEBUG		opt_bce.h
 BCE_NVRAM_WRITE_SUPPORT	opt_bce.h
 
 SOCKBUF_DEBUG		opt_global.h
 
 
 # options for ubsec driver
 UBSEC_DEBUG		opt_ubsec.h
 UBSEC_RNDTEST		opt_ubsec.h
 UBSEC_NO_RNG		opt_ubsec.h
 
 # options for hifn driver
 HIFN_DEBUG		opt_hifn.h
 HIFN_RNDTEST		opt_hifn.h
 
 # options for safenet driver
 SAFE_DEBUG		opt_safe.h
 SAFE_NO_RNG		opt_safe.h
 SAFE_RNDTEST		opt_safe.h
 
 # syscons/vt options
 MAXCONS			opt_syscons.h
 SC_ALT_MOUSE_IMAGE	opt_syscons.h
 SC_CUT_SPACES2TABS	opt_syscons.h
 SC_CUT_SEPCHARS		opt_syscons.h
 SC_DEBUG_LEVEL		opt_syscons.h
 SC_DFLT_FONT		opt_syscons.h
 SC_DISABLE_KDBKEY	opt_syscons.h
 SC_DISABLE_REBOOT	opt_syscons.h
 SC_HISTORY_SIZE		opt_syscons.h
 SC_KERNEL_CONS_ATTR	opt_syscons.h
 SC_KERNEL_CONS_REV_ATTR	opt_syscons.h
 SC_MOUSE_CHAR		opt_syscons.h
 SC_NO_CUTPASTE		opt_syscons.h
 SC_NO_FONT_LOADING	opt_syscons.h
 SC_NO_HISTORY		opt_syscons.h
 SC_NO_MODE_CHANGE	opt_syscons.h
 SC_NO_SUSPEND_VTYSWITCH	opt_syscons.h
 SC_NO_SYSMOUSE		opt_syscons.h
 SC_NORM_ATTR		opt_syscons.h
 SC_NORM_REV_ATTR	opt_syscons.h
 SC_PIXEL_MODE		opt_syscons.h
 SC_RENDER_DEBUG		opt_syscons.h
 SC_TWOBUTTON_MOUSE	opt_syscons.h
 VT_ALT_TO_ESC_HACK	opt_syscons.h
 VT_FB_DEFAULT_WIDTH	opt_syscons.h
 VT_FB_DEFAULT_HEIGHT	opt_syscons.h
 VT_MAXWINDOWS		opt_syscons.h
 VT_TWOBUTTON_MOUSE	opt_syscons.h
 DEV_SC			opt_syscons.h
 DEV_VT			opt_syscons.h
 
 # teken terminal emulator options
 TEKEN_CONS25		opt_teken.h
 TEKEN_UTF8		opt_teken.h
 TERMINAL_KERN_ATTR	opt_teken.h
 TERMINAL_NORM_ATTR	opt_teken.h
 
 # options for printf
 PRINTF_BUFR_SIZE	opt_printf.h
 
 # kbd options
 KBD_DISABLE_KEYMAP_LOAD	opt_kbd.h
 KBD_INSTALL_CDEV	opt_kbd.h
 KBD_MAXRETRY		opt_kbd.h
 KBD_MAXWAIT		opt_kbd.h
 KBD_RESETDELAY		opt_kbd.h
 KBDIO_DEBUG		opt_kbd.h
 
 KBDMUX_DFLT_KEYMAP	opt_kbdmux.h
 
 # options for the Atheros driver
 ATH_DEBUG		opt_ath.h
 ATH_TXBUF		opt_ath.h
 ATH_RXBUF		opt_ath.h
 ATH_DIAGAPI		opt_ath.h
 ATH_TX99_DIAG		opt_ath.h
 ATH_ENABLE_11N		opt_ath.h
 ATH_ENABLE_DFS		opt_ath.h
 ATH_EEPROM_FIRMWARE	opt_ath.h
 ATH_ENABLE_RADIOTAP_VENDOR_EXT	opt_ath.h
 ATH_DEBUG_ALQ		opt_ath.h
 ATH_KTR_INTR_DEBUG	opt_ath.h
 
 # options for the Atheros hal
 AH_SUPPORT_AR5416	opt_ah.h
 # XXX For now, this breaks non-AR9130 chipsets, so only use it
 # XXX when actually targeting AR9130.
 AH_SUPPORT_AR9130	opt_ah.h
 
 # This is required for AR933x SoC support
 AH_SUPPORT_AR9330	opt_ah.h
 AH_SUPPORT_AR9340	opt_ah.h
 AH_SUPPORT_QCA9530	opt_ah.h
 AH_SUPPORT_QCA9550	opt_ah.h
 
 AH_DEBUG		opt_ah.h
 AH_ASSERT		opt_ah.h
 AH_DEBUG_ALQ		opt_ah.h
 AH_REGOPS_FUNC		opt_ah.h
 AH_WRITE_REGDOMAIN	opt_ah.h
 AH_DEBUG_COUNTRY	opt_ah.h
 AH_WRITE_EEPROM		opt_ah.h
 AH_PRIVATE_DIAG		opt_ah.h
 AH_NEED_DESC_SWAP	opt_ah.h
 AH_USE_INIPDGAIN	opt_ah.h
 AH_MAXCHAN		opt_ah.h
 AH_RXCFG_SDMAMW_4BYTES	opt_ah.h
 AH_INTERRUPT_DEBUGGING	opt_ah.h
 # AR5416 and later interrupt mitigation
 # XXX do not use this for AR9130
 AH_AR5416_INTERRUPT_MITIGATION	opt_ah.h
 
 # options for the Broadcom BCM43xx driver (bwi)
 BWI_DEBUG		opt_bwi.h
 BWI_DEBUG_VERBOSE	opt_bwi.h
 
 # options for the Broadcom BCM43xx driver (bwn)
 BWN_DEBUG		opt_bwn.h
 BWN_GPL_PHY		opt_bwn.h
 
 # Options for the SIBA driver
 SIBA_DEBUG		opt_siba.h
 
 # options for the Marvell 8335 wireless driver
 MALO_DEBUG		opt_malo.h
 MALO_TXBUF		opt_malo.h
 MALO_RXBUF		opt_malo.h
 
 # options for the Marvell wireless driver
 MWL_DEBUG		opt_mwl.h
 MWL_TXBUF		opt_mwl.h
 MWL_RXBUF		opt_mwl.h
 MWL_DIAGAPI		opt_mwl.h
 MWL_AGGR_SIZE		opt_mwl.h
 MWL_TX_NODROP		opt_mwl.h
 
 # Options for the Intel 802.11ac wireless driver
 IWM_DEBUG		opt_iwm.h
 
 # Options for the Intel 802.11n wireless driver
 IWN_DEBUG		opt_iwn.h
 
 # Options for the Intel 3945ABG wireless driver
 WPI_DEBUG		opt_wpi.h
 
 # dcons options 
 DCONS_BUF_SIZE		opt_dcons.h
 DCONS_POLL_HZ		opt_dcons.h
 DCONS_FORCE_CONSOLE	opt_dcons.h
 DCONS_FORCE_GDB		opt_dcons.h
 
 # HWPMC options
 HWPMC_DEBUG		opt_global.h
 HWPMC_HOOKS
 HWPMC_MIPS_BACKTRACE 	opt_hwpmc_hooks.h
 
 # XBOX options for FreeBSD/i386, but some files are MI
 XBOX			opt_xbox.h
 
 # Interrupt filtering
 INTR_FILTER
 
 # 802.11 support layer
 IEEE80211_DEBUG		opt_wlan.h
 IEEE80211_DEBUG_REFCNT	opt_wlan.h
 IEEE80211_AMPDU_AGE	opt_wlan.h
 IEEE80211_SUPPORT_MESH	opt_wlan.h
 IEEE80211_SUPPORT_SUPERG	opt_wlan.h
 IEEE80211_SUPPORT_TDMA	opt_wlan.h
 IEEE80211_ALQ		opt_wlan.h
 IEEE80211_DFS_DEBUG	opt_wlan.h
 
 # 802.11 TDMA support
 TDMA_SLOTLEN_DEFAULT	opt_tdma.h
 TDMA_SLOTCNT_DEFAULT	opt_tdma.h
 TDMA_BINTVAL_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11B_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11G_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11A_DEFAULT	opt_tdma.h
 TDMA_TXRATE_TURBO_DEFAULT	opt_tdma.h
 TDMA_TXRATE_HALF_DEFAULT	opt_tdma.h
 TDMA_TXRATE_QUARTER_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NA_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NG_DEFAULT	opt_tdma.h
 
 # VideoMode
 PICKMODE_DEBUG			opt_videomode.h
 
 # Network stack virtualization options
 VIMAGE			opt_global.h
 VNET_DEBUG		opt_global.h
 
 # Common Flash Interface (CFI) options
 CFI_SUPPORT_STRATAFLASH	opt_cfi.h
 CFI_ARMEDANDDANGEROUS	opt_cfi.h
 CFI_HARDWAREBYTESWAP	opt_cfi.h
 
 # Sound options
 SND_DEBUG		opt_snd.h
 SND_DIAGNOSTIC		opt_snd.h
 SND_FEEDER_MULTIFORMAT	opt_snd.h
 SND_FEEDER_FULL_MULTIFORMAT	opt_snd.h
 SND_FEEDER_RATE_HP	opt_snd.h
 SND_PCM_64		opt_snd.h
 SND_OLDSTEREO		opt_snd.h
 
 X86BIOS
 
 # Flattened device tree options
 FDT		opt_platform.h
 FDT_DTB_STATIC	opt_platform.h
 
 # OFED Infiniband stack
 OFED		opt_ofed.h
 OFED_DEBUG_INIT	opt_ofed.h
 SDP		opt_ofed.h
 SDP_DEBUG	opt_ofed.h
 IPOIB		opt_ofed.h
 IPOIB_DEBUG	opt_ofed.h
 IPOIB_CM	opt_ofed.h
 
 # Resource Accounting
 RACCT		opt_global.h
 RACCT_DEFAULT_TO_DISABLED	opt_global.h
 
 # Resource Limits
 RCTL		opt_global.h
 
 # Random number generator(s)
 # Which CSPRNG hash we get.
 # If Yarrow is not chosen, Fortuna is selected.
 RANDOM_YARROW	opt_global.h
 # With this, no entropy processor is loaded, but the entropy
 # harvesting infrastructure is present. This means an entropy
 # processor may be loaded as a module.
 RANDOM_LOADABLE	opt_global.h
 # This turns on high-rate and potentially expensive harvesting in
 # the uma slab allocator.
 RANDOM_ENABLE_UMA	opt_global.h
 
 # Intel em(4) driver
 EM_MULTIQUEUE	opt_em.h
 
 # BHND(4) driver
 BHND_LOGLEVEL	opt_global.h
 
 # GPIO and child devices
 GPIO_SPI_DEBUG	opt_gpio.h
 
 # evdev protocol support
 EVDEV_SUPPORT	opt_evdev.h
 EVDEV_DEBUG	opt_evdev.h
 UINPUT_DEBUG	opt_evdev.h
Index: head/sys/dev/mps/mps_sas.c
===================================================================
--- head/sys/dev/mps/mps_sas.c	(revision 308154)
+++ head/sys/dev/mps/mps_sas.c	(revision 308155)
@@ -1,3712 +1,3721 @@
 /*-
  * Copyright (c) 2009 Yahoo! Inc.
  * Copyright (c) 2011-2015 LSI Corp.
  * Copyright (c) 2013-2015 Avago Technologies
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* Communications core for Avago Technologies (LSI) MPT2 */
 
 /* TODO Move headers to mpsvar */
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/selinfo.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/uio.h>
 #include <sys/sysctl.h>
 #include <sys/endian.h>
 #include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/taskqueue.h>
 #include <sys/sbuf.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/rman.h>
 
 #include <machine/stdarg.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #if __FreeBSD_version >= 900026
 #include <cam/scsi/smp_all.h>
 #endif
 
 #include <dev/mps/mpi/mpi2_type.h>
 #include <dev/mps/mpi/mpi2.h>
 #include <dev/mps/mpi/mpi2_ioc.h>
 #include <dev/mps/mpi/mpi2_sas.h>
 #include <dev/mps/mpi/mpi2_cnfg.h>
 #include <dev/mps/mpi/mpi2_init.h>
 #include <dev/mps/mpi/mpi2_tool.h>
 #include <dev/mps/mps_ioctl.h>
 #include <dev/mps/mpsvar.h>
 #include <dev/mps/mps_table.h>
 #include <dev/mps/mps_sas.h>
 
 #define MPSSAS_DISCOVERY_TIMEOUT	20
 #define MPSSAS_MAX_DISCOVERY_TIMEOUTS	10 /* 200 seconds */
 
 /*
  * static array to check SCSI OpCode for EEDP protection bits
  *
  * The table has 256 entries, one per possible one-byte opcode value
  * (presumably indexed by the first CDB byte -- confirm against the user of
  * this table).  A zero entry means no EEDP flag is associated with that
  * opcode.  Non-zero entries, by index:
  *
  *   PRO_R: 0x28, 0x88, 0xA8
  *   PRO_W: 0x2A, 0x2E, 0x41, 0x8A, 0x8E, 0x93, 0xAA, 0xAE
  *   PRO_V: 0x2F, 0x8F, 0xAF
  *
  * NOTE(review): PRO_V expands to the same INSERT_OP flag as PRO_W; confirm
  * that this is intended rather than a copy/paste leftover.
  */
 #define	PRO_R MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP
 #define	PRO_W MPI2_SCSIIO_EEDPFLAGS_INSERT_OP
 #define	PRO_V MPI2_SCSIIO_EEDPFLAGS_INSERT_OP
 static uint8_t op_code_prot[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V,
 	0, 0, 0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
 MALLOC_DEFINE(M_MPSSAS, "MPSSAS", "MPS SAS memory");
 
 static void mpssas_remove_device(struct mps_softc *, struct mps_command *);
 static void mpssas_remove_complete(struct mps_softc *, struct mps_command *);
 static void mpssas_action(struct cam_sim *sim, union ccb *ccb);
 static void mpssas_poll(struct cam_sim *sim);
 static int mpssas_send_abort(struct mps_softc *sc, struct mps_command *tm,
     struct mps_command *cm);
 static void mpssas_scsiio_timeout(void *data);
 static void mpssas_abort_complete(struct mps_softc *sc, struct mps_command *cm);
 static void mpssas_direct_drive_io(struct mpssas_softc *sassc,
     struct mps_command *cm, union ccb *ccb);
 static void mpssas_action_scsiio(struct mpssas_softc *, union ccb *);
 static void mpssas_scsiio_complete(struct mps_softc *, struct mps_command *);
 static void mpssas_action_resetdev(struct mpssas_softc *, union ccb *);
 #if __FreeBSD_version >= 900026
 static void mpssas_smpio_complete(struct mps_softc *sc, struct mps_command *cm);
 static void mpssas_send_smpcmd(struct mpssas_softc *sassc, union ccb *ccb,
 			       uint64_t sasaddr);
 static void mpssas_action_smpio(struct mpssas_softc *sassc, union ccb *ccb);
 #endif //FreeBSD_version >= 900026
 static void mpssas_resetdev_complete(struct mps_softc *, struct mps_command *);
 static void mpssas_async(void *callback_arg, uint32_t code,
 			 struct cam_path *path, void *arg);
 #if (__FreeBSD_version < 901503) || \
     ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006))
 static void mpssas_check_eedp(struct mps_softc *sc, struct cam_path *path,
 			      struct ccb_getdev *cgd);
 static void mpssas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb);
 #endif
 static int mpssas_send_portenable(struct mps_softc *sc);
 static void mpssas_portenable_complete(struct mps_softc *sc,
     struct mps_command *cm);
 
 /*
  * Scan the target table from index 'start' up to (but not including)
  * sassc->maxtargets and return the first entry whose 'handle' field equals
  * the requested handle.  Returns NULL when no such entry exists.
  */
 struct mpssas_target *
 mpssas_find_target_by_handle(struct mpssas_softc *sassc, int start, uint16_t handle)
 {
 	int idx = start;
 
 	while (idx < sassc->maxtargets) {
 		if (sassc->targets[idx].handle == handle)
 			return (&sassc->targets[idx]);
 		idx++;
 	}
 
 	return (NULL);
 }
 
 /*
  * The simq has to stay frozen during attach and diag reset so that commands
  * are not failed before discovery has found the device handles.  Discovery
  * reads config pages and may send commands of its own, so its actions can
  * outlive the end-of-discovery event.  For that reason every discovery
  * action takes a reference here instead of the simq being released as soon
  * as the event arrives.
  *
  * Takes one startup reference.  The first reference (0 -> 1) freezes the
  * simq (and, on new enough kernels, holds the boot).  Does nothing unless
  * MPSSAS_IN_STARTUP is set.
  */
 void
 mpssas_startup_increment(struct mpssas_softc *sassc)
 {
 	int first_ref;
 
 	MPS_FUNCTRACE(sassc->sc);
 
 	if ((sassc->flags & MPSSAS_IN_STARTUP) == 0)
 		return;
 
 	first_ref = (sassc->startup_refcount == 0);
 	sassc->startup_refcount++;
 	if (first_ref) {
 		/* just starting, freeze the simq */
 		mps_dprint(sassc->sc, MPS_INIT,
 		    "%s freezing simq\n", __func__);
 #if __FreeBSD_version >= 1000039
 		xpt_hold_boot();
 #endif
 		xpt_freeze_simq(sassc->sim, 1);
 	}
 	mps_dprint(sassc->sc, MPS_INIT, "%s refcount %u\n", __func__,
 	    sassc->startup_refcount);
 }
 
 /*
  * If the SIM queue is marked frozen (MPSSAS_QUEUE_FROZEN), clear the mark
  * and release the queue once.  Otherwise do nothing.
  */
 void
 mpssas_release_simq_reinit(struct mpssas_softc *sassc)
 {
 	if ((sassc->flags & MPSSAS_QUEUE_FROZEN) == 0)
 		return;
 
 	sassc->flags &= ~MPSSAS_QUEUE_FROZEN;
 	xpt_release_simq(sassc->sim, 1);
 	mps_dprint(sassc->sc, MPS_INFO, "Unfreezing SIM queue\n");
 }
 
 /*
  * Drops one startup reference taken by mpssas_startup_increment().  When the
  * count reaches zero the startup phase is over: MPSSAS_IN_STARTUP is
  * cleared, the simq is released, and either the boot hold is released (new
  * kernels) or a full bus rescan is scheduled (old kernels).  Does nothing
  * unless MPSSAS_IN_STARTUP is set.
  */
 void
 mpssas_startup_decrement(struct mpssas_softc *sassc)
 {
 	MPS_FUNCTRACE(sassc->sc);
 
 	if ((sassc->flags & MPSSAS_IN_STARTUP) == 0)
 		return;
 
 	sassc->startup_refcount--;
 	if (sassc->startup_refcount == 0) {
 		/*
 		 * All discovery-related actions are done: release the simq
 		 * and rescan for the latest topology.
 		 */
 		mps_dprint(sassc->sc, MPS_INIT,
 		    "%s releasing simq\n", __func__);
 		sassc->flags &= ~MPSSAS_IN_STARTUP;
 		xpt_release_simq(sassc->sim, 1);
 #if __FreeBSD_version >= 1000039
 		xpt_release_boot();
 #else
 		mpssas_rescan_target(sassc->sc, NULL);
 #endif
 	}
 	mps_dprint(sassc->sc, MPS_INIT, "%s refcount %u\n", __func__,
 	    sassc->startup_refcount);
 }
 
 /*
  * Allocate a command for a task management (TM) request.  TM commands come
  * from the high priority command pool; the result is whatever
  * mps_alloc_high_priority_command() returns (callers in this file treat
  * NULL as allocation failure).
  *
  * The firmware requires that no other commands are sent while task
  * management is in progress.  This function only allocates the command;
  * the matching queue release is done in mpssas_free_tm().
  */
 struct mps_command *
 mpssas_alloc_tm(struct mps_softc *sc)
 {
 
 	return (mps_alloc_high_priority_command(sc));
 }
 
 void
 mpssas_free_tm(struct mps_softc *sc, struct mps_command *tm)
 {
 	int target_id = 0xFFFFFFFF;
  
 	if (tm == NULL)
 		return;
 
 	/*
 	 * For TM's the devq is frozen for the device.  Unfreeze it here and
 	 * free the resources used for freezing the devq.  Must clear the
 	 * INRESET flag as well or scsi I/O will not work.
 	 */
 	if (tm->cm_targ != NULL) {
 		tm->cm_targ->flags &= ~MPSSAS_TARGET_INRESET;
 		target_id = tm->cm_targ->tid;
 	}
 	if (tm->cm_ccb) {
 		mps_dprint(sc, MPS_INFO, "Unfreezing devq for target ID %d\n",
 		    target_id);
 		xpt_release_devq(tm->cm_ccb->ccb_h.path, 1, TRUE);
 		xpt_free_path(tm->cm_ccb->ccb_h.path);
 		xpt_free_ccb(tm->cm_ccb);
 	}
 
 	mps_free_high_priority_command(sc, tm);
 }
 
 /*
  * Schedule a CAM rescan.  With targ == NULL the whole bus is rescanned
  * (XPT_SCAN_BUS); otherwise only the given target is rescanned
  * (XPT_SCAN_TGT), using the target's index in sassc->targets as its CAM
  * target id.  Allocation failures are logged and the rescan is skipped.
  */
 void
 mpssas_rescan_target(struct mps_softc *sc, struct mpssas_target *targ)
 {
 	struct mpssas_softc *sassc = sc->sassc;
 	union ccb *scan_ccb;
 	path_id_t bus_path;
 	target_id_t tgt;
 
 	MPS_FUNCTRACE(sc);
 	bus_path = cam_sim_path(sassc->sim);
 	if (targ != NULL)
 		tgt = targ - sassc->targets;
 	else
 		tgt = CAM_TARGET_WILDCARD;
 
 	/* The rescan request is carried by a freshly allocated CCB. */
 	scan_ccb = xpt_alloc_ccb_nowait();
 	if (scan_ccb == NULL) {
 		mps_dprint(sc, MPS_ERROR, "unable to alloc CCB for rescan\n");
 		return;
 	}
 
 	if (xpt_create_path(&scan_ccb->ccb_h.path, NULL, bus_path,
 	    tgt, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 		mps_dprint(sc, MPS_ERROR, "unable to create path for rescan\n");
 		xpt_free_ccb(scan_ccb);
 		return;
 	}
 
 	if (tgt != CAM_TARGET_WILDCARD)
 		scan_ccb->ccb_h.func_code = XPT_SCAN_TGT;
 	else
 		scan_ccb->ccb_h.func_code = XPT_SCAN_BUS;
 
 	mps_dprint(sc, MPS_TRACE, "%s targetid %u\n", __func__, tgt);
 	xpt_rescan(scan_ccb);
 }
 
 /*
  * Emit a printf-style debug message about a command.  The caller's text is
  * prefixed with the CAM path of the command's CCB (plus the SCSI command and
  * transfer length for XPT_SCSI_IO), or with a "(noperiph:...)" tag when the
  * command has no CCB, followed by the command's SMID.
  *
  * Nothing is printed when 'cm' is NULL or when none of the bits in 'level'
  * are enabled in the controller's mps_debug mask.  The message is assembled
  * in a fixed 192-byte stack buffer.
  */
 static void
 mpssas_log_command(struct mps_command *cm, u_int level, const char *fmt, ...)
 {
 	struct sbuf msg;
 	va_list args;
 	char msg_buf[192];
 	char path_buf[64];
 	union ccb *ccb;
 
 	/* Bail out early when there is nothing to log or logging is off. */
 	if (cm == NULL || (cm->cm_sc->mps_debug & level) == 0)
 		return;
 
 	sbuf_new(&msg, msg_buf, sizeof(msg_buf), 0);
 
 	va_start(args, fmt);
 
 	ccb = cm->cm_ccb;
 	if (ccb == NULL) {
 		sbuf_printf(&msg, "(noperiph:%s%d:%u:%u:%u): ",
 		    cam_sim_name(cm->cm_sc->sassc->sim),
 		    cam_sim_unit(cm->cm_sc->sassc->sim),
 		    cam_sim_bus(cm->cm_sc->sassc->sim),
 		    cm->cm_targ ? cm->cm_targ->tid : 0xFFFFFFFF,
 		    cm->cm_lun);
 	} else {
 		xpt_path_string(ccb->csio.ccb_h.path, path_buf,
 		    sizeof(path_buf));
 		sbuf_cat(&msg, path_buf);
 		if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			scsi_command_string(&ccb->csio, &msg);
 			sbuf_printf(&msg, "length %d ",
 			    ccb->csio.dxfer_len);
 		}
 	}
 
 	sbuf_printf(&msg, "SMID %u ", cm->cm_desc.Default.SMID);
 	sbuf_vprintf(&msg, fmt, args);
 	sbuf_finish(&msg);
 	mps_dprint_field(cm->cm_sc, level, "%s", sbuf_data(&msg));
 
 	va_end(args);
 }
 
 
 /*
  * Completion handler for the target reset sent by
  * mpssas_prepare_volume_remove().
  *
  * sc: controller softc.
  * tm: the completed task management command; tm->cm_complete_data carries
  *     the device handle that was reset and tm->cm_targ the target.
  *
  * If the reset succeeded, the target's handle and link information are
  * cleared.  The TM command is always freed before returning.
  */
 static void
 mpssas_remove_volume(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SCSI_TASK_MANAGE_REPLY *reply;
 	struct mpssas_target *targ;
 	uint16_t handle;
 	uint16_t ioc_status;
 
 	MPS_FUNCTRACE(sc);
 
 	reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 	handle = (uint16_t)(uintptr_t)tm->cm_complete_data;
 	targ = tm->cm_targ;
 
 	if (reply == NULL) {
 		/* XXX retry the remove after the diag reset completes? */
 		mps_dprint(sc, MPS_FAULT,
 		    "%s NULL reply resetting device 0x%04x\n", __func__,
 		    handle);
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	/*
 	 * Read everything we need out of the reply frame now; it must not
 	 * be touched again once mps_free_reply() has handed it back.
 	 */
 	ioc_status = le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK;
 	if (ioc_status != MPI2_IOCSTATUS_SUCCESS) {
 		mps_dprint(sc, MPS_ERROR,
 		   "IOCStatus = 0x%x while resetting device 0x%x\n",
 		   le16toh(reply->IOCStatus), handle);
 	}
 
 	/* Reply fields are little-endian; convert before printing. */
 	mps_dprint(sc, MPS_XINFO,
 	    "Reset aborted %u commands\n", le32toh(reply->TerminationCount));
 	mps_free_reply(sc, tm->cm_reply_data);
 	tm->cm_reply = NULL;	/* Ensures the reply won't get re-freed */
 
 	mps_dprint(sc, MPS_XINFO,
 	    "clearing target %u handle 0x%04x\n", targ->tid, handle);
 	
 	/*
 	 * Don't clear target if remove fails because things will get confusing.
 	 * Leave the devname and sasaddr intact so that we know to avoid reusing
 	 * this target id if possible, and so we can assign the same target id
 	 * to this device if it comes back in the future.
 	 */
 	if (ioc_status == MPI2_IOCSTATUS_SUCCESS) {
 		targ->handle = 0x0;
 		targ->encl_handle = 0x0;
 		targ->encl_slot = 0x0;
 		targ->exp_dev_handle = 0x0;
 		targ->phy_num = 0x0;
 		targ->linkrate = 0x0;
 		targ->devinfo = 0x0;
 		targ->flags = 0x0;
 	}
 
 	mpssas_free_tm(sc, tm);
 }
 
 
 /*
  * No Need to call "MPI2_SAS_OP_REMOVE_DEVICE" For Volume removal.
  * Otherwise Volume Delete is same as Bare Drive Removal.
  *
  * Looks up the target for 'handle', marks it MPSSAS_TARGET_INREMOVAL,
  * schedules a rescan of it and sends a target reset (with link reset) to
  * the firmware.  mpssas_remove_volume() runs when the reset completes.
  *
  * sassc:  SAS softc owning the target table.
  * handle: device handle of the volume being removed.
  *
  * Returns without sending anything if the handle is unknown or no task
  * management command can be allocated (both cases are logged).
  */
 void
 mpssas_prepare_volume_remove(struct mpssas_softc *sassc, uint16_t handle)
 {
 	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 	struct mps_softc *sc;
 	struct mps_command *cm;
 	struct mpssas_target *targ = NULL;
 
 	MPS_FUNCTRACE(sassc->sc);
 	sc = sassc->sc;
 
 #ifdef WD_SUPPORT
 	/*
 	 * If this is a WD controller, determine if the disk should be exposed
 	 * to the OS or not.  If disk should be exposed, return from this
 	 * function without doing anything.
 	 */
 	if (sc->WD_available && (sc->WD_hide_expose ==
 	    MPS_WD_EXPOSE_ALWAYS)) {
 		return;
 	}
 #endif //WD_SUPPORT
 
 	targ = mpssas_find_target_by_handle(sassc, 0, handle);
 	if (targ == NULL) {
 		/* FIXME: what is the action? */
 		/* We don't know about this device? */
 		mps_dprint(sc, MPS_ERROR,
 		   "%s %d : invalid handle 0x%x \n", __func__,__LINE__, handle);
 		return;
 	}
 
 	targ->flags |= MPSSAS_TARGET_INREMOVAL;
 
 	cm = mpssas_alloc_tm(sc);
 	if (cm == NULL) {
 		mps_dprint(sc, MPS_ERROR,
 		    "%s: command alloc failure\n", __func__);
 		return;
 	}
 
 	mpssas_rescan_target(sc, targ);
 
 	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req;
 	/*
 	 * Start from a zeroed frame so stale contents of a recycled command
 	 * cannot leak into fields we do not set, and store the handle in
 	 * little-endian order like mpssas_prepare_remove() does.
 	 */
 	memset(req, 0, sizeof(*req));
 	req->DevHandle = htole16(targ->handle);
 	req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
 	req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET;
 
 	/* SAS Hard Link Reset / SATA Link Reset */
 	req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET;
 
 	cm->cm_targ = targ;
 	cm->cm_data = NULL;
 	cm->cm_desc.HighPriority.RequestFlags =
 	    MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY;
 	cm->cm_complete = mpssas_remove_volume;
 	cm->cm_complete_data = (void *)(uintptr_t)handle;
 
 	mps_dprint(sc, MPS_INFO, "%s: Sending reset for target ID %d\n",
 	    __func__, targ->tid);
 	mpssas_prepare_for_tm(sc, cm, targ, CAM_LUN_WILDCARD);
 
 	mps_map_command(sc, cm);
 }
 
 /*
  * Start removal of the device identified by 'handle'.
  *
  * The MPT2 firmware debounces the link so that transient errors do not
  * cause false removals.  Once it concludes that the link is really gone and
  * the device has to go away, it expects the host to issue a target reset
  * followed by an op remove.  The reset also aborts every outstanding
  * request for the device, which the op remove needs in order to succeed.
  * Whether the host should check for the device coming back after the reset
  * is not clear.
  *
  * This routine sends the target reset; mpssas_remove_device() continues the
  * sequence when it completes.  An unknown handle or a failed command
  * allocation is logged and ends the attempt.
  */
 void
 mpssas_prepare_remove(struct mpssas_softc *sassc, uint16_t handle)
 {
 	struct mps_softc *sc;
 	struct mpssas_target *targ;
 	struct mps_command *tm;
 	MPI2_SCSI_TASK_MANAGE_REQUEST *tm_req;
 
 	MPS_FUNCTRACE(sassc->sc);
 
 	sc = sassc->sc;
 
 	targ = mpssas_find_target_by_handle(sassc, 0, handle);
 	if (targ == NULL) {
 		/* FIXME: what is the action? */
 		/* We don't know about this device? */
 		mps_dprint(sc, MPS_ERROR,
 		    "%s : invalid handle 0x%x \n", __func__, handle);
 		return;
 	}
 
 	targ->flags |= MPSSAS_TARGET_INREMOVAL;
 
 	tm = mpssas_alloc_tm(sc);
 	if (tm == NULL) {
 		mps_dprint(sc, MPS_ERROR,
 		    "%s: command alloc failure\n", __func__);
 		return;
 	}
 
 	mpssas_rescan_target(sc, targ);
 
 	/* Build the target reset request in a zeroed frame. */
 	tm_req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 	memset(tm_req, 0, sizeof(*tm_req));
 	tm_req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
 	tm_req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET;
 	tm_req->DevHandle = htole16(targ->handle);
 	/* SAS Hard Link Reset / SATA Link Reset */
 	tm_req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET;
 
 	/* Completion continues in mpssas_remove_device() with the handle. */
 	tm->cm_complete = mpssas_remove_device;
 	tm->cm_complete_data = (void *)(uintptr_t)handle;
 	tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY;
 	tm->cm_data = NULL;
 	tm->cm_targ = targ;
 
 	mps_dprint(sc, MPS_INFO, "%s: Sending reset for target ID %d\n",
 	    __func__, targ->tid);
 	mpssas_prepare_for_tm(sc, tm, targ, CAM_LUN_WILDCARD);
 
 	mps_map_command(sc, tm);
 }
 
 /*
  * Completion handler for the target reset sent by mpssas_prepare_remove().
  *
  * After logging the outcome of the reset, the same command is recycled
  * into a SAS IO unit control request (MPI2_SAS_OP_REMOVE_DEVICE) for the
  * handle and sent again, to be completed by mpssas_remove_complete().
  * Every command still queued on the target is then completed with
  * CAM_DEV_NOT_THERE.
  *
  * With a NULL reply the TM command is freed and nothing further is done.
  */
 static void
 mpssas_remove_device(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SCSI_TASK_MANAGE_REPLY *tm_reply;
 	MPI2_SAS_IOUNIT_CONTROL_REQUEST *ctl_req;
 	struct mpssas_target *targ;
 	struct mps_command *pending, *pending_next;
 	uint16_t handle;
 
 	MPS_FUNCTRACE(sc);
 
 	targ = tm->cm_targ;
 	handle = (uint16_t)(uintptr_t)tm->cm_complete_data;
 	tm_reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 
 	/*
 	 * This case should currently be unreachable: it needs a chain frame
 	 * allocation failure, and task management commands carry no S/G
 	 * list.
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_ERROR,
 		    "%s: cm_flags = %#x for remove of handle %#04x! "
 		    "This should not happen!\n", __func__, tm->cm_flags,
 		    handle);
 	}
 
 	if (tm_reply == NULL) {
 		/* XXX retry the remove after the diag reset completes? */
 		mps_dprint(sc, MPS_FAULT,
 		    "%s NULL reply resetting device 0x%04x\n", __func__,
 		    handle);
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	if ((le16toh(tm_reply->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
 	    MPI2_IOCSTATUS_SUCCESS) {
 		mps_dprint(sc, MPS_ERROR,
 		   "IOCStatus = 0x%x while resetting device 0x%x\n",
 		   le16toh(tm_reply->IOCStatus), handle);
 	}
 
 	mps_dprint(sc, MPS_XINFO, "Reset aborted %u commands\n",
 	    le32toh(tm_reply->TerminationCount));
 	mps_free_reply(sc, tm->cm_reply_data);
 	tm->cm_reply = NULL;	/* Ensures the reply won't get re-freed */
 
 	/* Recycle the command as the op-remove request. */
 	ctl_req = (MPI2_SAS_IOUNIT_CONTROL_REQUEST *)tm->cm_req;
 	memset(ctl_req, 0, sizeof(*ctl_req));
 	ctl_req->Function = MPI2_FUNCTION_SAS_IO_UNIT_CONTROL;
 	ctl_req->Operation = MPI2_SAS_OP_REMOVE_DEVICE;
 	ctl_req->DevHandle = htole16(handle);
 	tm->cm_data = NULL;
 	tm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 	tm->cm_complete = mpssas_remove_complete;
 	tm->cm_complete_data = (void *)(uintptr_t)handle;
 
 	mps_map_command(sc, tm);
 
 	mps_dprint(sc, MPS_XINFO, "clearing target %u handle 0x%04x\n",
 		   targ->tid, handle);
 
 	/* Fail back whatever is still queued on the departing target. */
 	TAILQ_FOREACH_SAFE(pending, &targ->commands, cm_link, pending_next) {
 		union ccb *ccb;
 
 		mps_dprint(sc, MPS_XINFO, "Completing missed command %p\n",
 		    pending);
 		ccb = pending->cm_complete_data;
 		mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		mpssas_scsiio_complete(sc, pending);
 	}
 }
 
 /*
  * Completion handler for the REMOVE_DEVICE request sent by
  * mpssas_remove_device.  On success the target's handle and topology
  * fields are zeroed and its LUN list is freed; the command is always
  * released before returning.
  */
 static void
 mpssas_remove_complete(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SAS_IOUNIT_CONTROL_REPLY *reply;
 	struct mpssas_target *targ;
 	struct mpssas_lun *lun;
 	uint16_t handle, iocstatus;
 
 	MPS_FUNCTRACE(sc);
 
 	handle = (uint16_t)(uintptr_t)tm->cm_complete_data;
 	reply = (MPI2_SAS_IOUNIT_CONTROL_REPLY *)tm->cm_reply;
 
 	/*
 	 * Error flags are only expected after a chain frame allocation
 	 * failure, which cannot happen for requests without S/G lists.
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_XINFO,
 			   "%s: cm_flags = %#x for remove of handle %#04x! "
 			   "This should not happen!\n", __func__, tm->cm_flags,
 			   handle);
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	/* A missing reply is most likely the result of a chip reset. */
 	if (reply == NULL) {
 		mps_dprint(sc, MPS_FAULT,
 		    "%s NULL reply removing device 0x%04x\n", __func__, handle);
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	iocstatus = le16toh(reply->IOCStatus);
 	mps_dprint(sc, MPS_XINFO,
 	    "%s on handle 0x%04x, IOCStatus= 0x%x\n", __func__, 
 	    handle, iocstatus);
 
 	/*
 	 * Only wipe the target when the firmware confirms the removal.
 	 * devname and sasaddr are deliberately left alone so the target id
 	 * is not handed to another device and can be given back to this one
 	 * if it reappears.
 	 */
 	if ((iocstatus & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) {
 		targ = tm->cm_targ;
 		targ->handle = 0x0;
 		targ->encl_handle = 0x0;
 		targ->encl_slot = 0x0;
 		targ->exp_dev_handle = 0x0;
 		targ->phy_num = 0x0;
 		targ->linkrate = 0x0;
 		targ->devinfo = 0x0;
 		targ->flags = 0x0;
 
 		/* Pop and free LUN records until the list is empty. */
 		while ((lun = SLIST_FIRST(&targ->luns)) != NULL) {
 			SLIST_REMOVE_HEAD(&targ->luns, lun_link);
 			free(lun, M_MPT2);
 		}
 	}
 
 	mpssas_free_tm(sc, tm);
 }
 
 /*
  * Build the mask of firmware events the SAS layer wants to hear about and
  * register mpssas_evt_handler for them.  The resulting handle is stored in
  * sc->sassc->mpssas_eh.  Always returns 0.
  */
 static int
 mpssas_register_events(struct mps_softc *sc)
 {
 	u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS];
 
 	/* Clear the whole mask, whatever its declared size is. */
 	bzero(events, sizeof(events));
 	setbit(events, MPI2_EVENT_SAS_DEVICE_STATUS_CHANGE);
 	setbit(events, MPI2_EVENT_SAS_DISCOVERY);
 	setbit(events, MPI2_EVENT_SAS_BROADCAST_PRIMITIVE);
 	setbit(events, MPI2_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE);
 	setbit(events, MPI2_EVENT_SAS_INIT_TABLE_OVERFLOW);
 	setbit(events, MPI2_EVENT_SAS_TOPOLOGY_CHANGE_LIST);
 	setbit(events, MPI2_EVENT_SAS_ENCL_DEVICE_STATUS_CHANGE);
 	setbit(events, MPI2_EVENT_IR_CONFIGURATION_CHANGE_LIST);
 	setbit(events, MPI2_EVENT_IR_VOLUME);
 	setbit(events, MPI2_EVENT_IR_PHYSICAL_DISK);
 	setbit(events, MPI2_EVENT_IR_OPERATION_STATUS);
 	setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
 
 	mps_register_events(sc, events, mpssas_evt_handler, NULL,
 	    &sc->sassc->mpssas_eh);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the SAS/CAM layer for a controller: the SAS
  * softc and target array, the CAM SIM queue and SIM, the firmware event
  * taskqueue, the CAM bus registration, the async (EEDP) callback and the
  * firmware event registration.
  *
  * Returns 0 on success or an errno value.  On any failure after the softc
  * has been published in sc->sassc, mps_detach_sas() is called to undo the
  * partial setup.
  */
 int
 mps_attach_sas(struct mps_softc *sc)
 {
 	struct mpssas_softc *sassc;
 	cam_status status;
 	int unit, error = 0;
 
 	MPS_FUNCTRACE(sc);
 
 	sassc = malloc(sizeof(struct mpssas_softc), M_MPT2, M_WAITOK|M_ZERO);
 	if(!sassc) {
 		device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n",
 		    __func__, __LINE__);
 		return (ENOMEM);
 	}
 
 	/*
 	 * XXX MaxTargets could change during a reinit.  Since we don't
 	 * resize the targets[] array during such an event, cache the value
 	 * of MaxTargets here so that we don't get into trouble later.  This
 	 * should move into the reinit logic.
 	 */
 	sassc->maxtargets = sc->facts->MaxTargets;
 	sassc->targets = malloc(sizeof(struct mpssas_target) *
 	    sassc->maxtargets, M_MPT2, M_WAITOK|M_ZERO);
 	if(!sassc->targets) {
 		device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n",
 		    __func__, __LINE__);
 		free(sassc, M_MPT2);
 		return (ENOMEM);
 	}
 	sc->sassc = sassc;
 	sassc->sc = sc;
 
 	if ((sassc->devq = cam_simq_alloc(sc->num_reqs)) == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate SIMQ\n");
 		error = ENOMEM;
 		goto out;
 	}
 
 	unit = device_get_unit(sc->mps_dev);
 	sassc->sim = cam_sim_alloc(mpssas_action, mpssas_poll, "mps", sassc,
 	    unit, &sc->mps_mtx, sc->num_reqs, sc->num_reqs, sassc->devq);
 	if (sassc->sim == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate SIM\n");
 		error = EINVAL;
 		goto out;
 	}
 
 	TAILQ_INIT(&sassc->ev_queue);
 
 	/* Initialize taskqueue for Event Handling */
 	TASK_INIT(&sassc->ev_task, 0, mpssas_firmware_event_work, sc);
 	sassc->ev_tq = taskqueue_create("mps_taskq", M_NOWAIT | M_ZERO,
 	    taskqueue_thread_enqueue, &sassc->ev_tq);
 	/*
 	 * The queue is requested with M_NOWAIT, so the allocation may fail.
 	 * taskqueue_start_threads() dereferences the queue pointer, so bail
 	 * out instead of handing it a NULL queue.
 	 */
 	if (sassc->ev_tq == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate taskqueue\n");
 		error = ENOMEM;
 		goto out;
 	}
 	taskqueue_start_threads(&sassc->ev_tq, 1, PRIBIO, "%s taskq", 
 	    device_get_nameunit(sc->mps_dev));
 
 	mps_lock(sc);
 
 	/*
 	 * XXX There should be a bus for every port on the adapter, but since
 	 * we're just going to fake the topology for now, we'll pretend that
 	 * everything is just a target on a single bus.
 	 */
 	if ((error = xpt_bus_register(sassc->sim, sc->mps_dev, 0)) != 0) {
 		mps_dprint(sc, MPS_ERROR, "Error %d registering SCSI bus\n",
 		    error);
 		mps_unlock(sc);
 		goto out;
 	}
 
 	/*
 	 * Assume that discovery events will start right away.
 	 *
 	 * Hold off boot until discovery is complete.
 	 */
 	sassc->flags |= MPSSAS_IN_STARTUP | MPSSAS_IN_DISCOVERY;
 	sc->sassc->startup_refcount = 0;
 	mpssas_startup_increment(sassc);
 
 	callout_init(&sassc->discovery_callout, 1 /*mpsafe*/);
 
 	/*
 	 * Register for async events so we can determine the EEDP
 	 * capabilities of devices.
 	 */
 	status = xpt_create_path(&sassc->path, /*periph*/NULL,
 	    cam_sim_path(sc->sassc->sim), CAM_TARGET_WILDCARD,
 	    CAM_LUN_WILDCARD);
 	if (status != CAM_REQ_CMP) {
 		mps_printf(sc, "Error %#x creating sim path\n", status);
 		sassc->path = NULL;
 	} else {
 		int event;
 
 #if (__FreeBSD_version >= 1000006) || \
     ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000))
 		event = AC_ADVINFO_CHANGED;
 #else
 		event = AC_FOUND_DEVICE;
 #endif
 		status = xpt_register_async(event, mpssas_async, sc,
 					    sassc->path);
 		if (status != CAM_REQ_CMP) {
 			mps_dprint(sc, MPS_ERROR,
 			    "Error %#x registering async handler for "
 			    "AC_ADVINFO_CHANGED events\n", status);
 			xpt_free_path(sassc->path);
 			sassc->path = NULL;
 		}
 	}
 	if (status != CAM_REQ_CMP) {
 		/*
 		 * EEDP use is the exception, not the rule.
 		 * Warn the user, but do not fail to attach.
 		 */
 		mps_printf(sc, "EEDP capabilities disabled.\n");
 	}
 
 	mps_unlock(sc);
 
 	mpssas_register_events(sc);
 out:
 	/* Tear down whatever was set up if anything above failed. */
 	if (error)
 		mps_detach_sas(sc);
 	return (error);
 }
 
 /*
  * Tear down the SAS/CAM layer created by mps_attach_sas().  Safe to call
  * when no SAS softc exists, and also used by mps_attach_sas() to unwind a
  * partially completed attach, so each resource is released only if it was
  * actually set up.  Always returns 0.
  */
 int
 mps_detach_sas(struct mps_softc *sc)
 {
 	struct mpssas_softc *sassc;
 	struct mpssas_lun *lun, *lun_tmp;
 	struct mpssas_target *targ;
 	int i;
 
 	MPS_FUNCTRACE(sc);
 
 	if (sc->sassc == NULL)
 		return (0);
 
 	sassc = sc->sassc;
 
 	/*
 	 * mps_attach_sas() jumps here from its early failure paths, before
 	 * events have been registered.  The softc is zero-allocated, so the
 	 * handle is still NULL then and there is nothing to deregister.
 	 */
 	if (sassc->mpssas_eh != NULL)
 		mps_deregister_events(sc, sassc->mpssas_eh);
 
 	/*
 	 * Drain and free the event handling taskqueue with the lock
 	 * unheld so that any parallel processing tasks drain properly
 	 * without deadlocking.
 	 */
 	if (sassc->ev_tq != NULL)
 		taskqueue_free(sassc->ev_tq);
 
 	/* Make sure CAM doesn't wedge if we had to bail out early. */
 	mps_lock(sc);
 
 	/* Deregister our async handler */
 	if (sassc->path != NULL) {
 		xpt_register_async(0, mpssas_async, sc, sassc->path);
 		xpt_free_path(sassc->path);
 		sassc->path = NULL;
 	}
 
 	/* Still in startup: release the SIM queue so CAM is not left waiting. */
 	if (sassc->flags & MPSSAS_IN_STARTUP)
 		xpt_release_simq(sassc->sim, 1);
 
 	if (sassc->sim != NULL) {
 		xpt_bus_deregister(cam_sim_path(sassc->sim));
 		cam_sim_free(sassc->sim, FALSE);
 	}
 
 	mps_unlock(sc);
 
 	if (sassc->devq != NULL)
 		cam_simq_free(sassc->devq);
 
 	/* Free the per-target LUN records before the target array itself. */
 	for(i=0; i< sassc->maxtargets ;i++) {
 		targ = &sassc->targets[i];
 		SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) {
 			free(lun, M_MPT2);
 		}
 	}
 	free(sassc->targets, M_MPT2);
 	free(sassc, M_MPT2);
 	sc->sassc = NULL;
 
 	return (0);
 }
 
 /*
  * Called when discovery ends: stop the discovery callout if the
  * timeout-pending flag says one is outstanding.
  */
 void
 mpssas_discovery_end(struct mpssas_softc *sassc)
 {
 
 	MPS_FUNCTRACE(sassc->sc);
 
 	/* Nothing to cancel unless a discovery timeout is pending. */
 	if ((sassc->flags & MPSSAS_DISCOVERY_TIMEOUT_PENDING) == 0)
 		return;
 
 	callout_stop(&sassc->discovery_callout);
 }
 
 /*
  * CAM SIM action routine (passed to cam_sim_alloc() in mps_attach_sas()).
  * Dispatches a CCB on its func_code.  The controller mutex must be held
  * (asserted below).
  *
  * Requests answered inline fall out of the switch and are completed by the
  * xpt_done() at the bottom.  XPT_RESET_DEV, XPT_SCSI_IO and XPT_SMP_IO are
  * handed to helper routines and return immediately without calling
  * xpt_done() here.
  */
 static void
 mpssas_action(struct cam_sim *sim, union ccb *ccb)
 {
 	struct mpssas_softc *sassc;
 
 	sassc = cam_sim_softc(sim);
 
 	MPS_FUNCTRACE(sassc->sc);
 	mps_dprint(sassc->sc, MPS_TRACE, "ccb func_code 0x%x\n",
 	    ccb->ccb_h.func_code);
 	mtx_assert(&sassc->sc->mps_mtx, MA_OWNED);
 
 	switch (ccb->ccb_h.func_code) {
 	case XPT_PATH_INQ:
 	{
 		/* Describe the adapter and its limits to CAM. */
 		struct ccb_pathinq *cpi = &ccb->cpi;
 		struct mps_softc *sc = sassc->sc;
 		uint8_t sges_per_frame;
 
 		cpi->version_num = 1;
 		cpi->hba_inquiry = PI_SDTR_ABLE|PI_TAG_ABLE|PI_WIDE_16;
 		cpi->target_sprt = 0;
 #if __FreeBSD_version >= 1000039
 		cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED | PIM_NOSCAN;
 #else
 		cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED;
 #endif
 		cpi->hba_eng_cnt = 0;
 		/* Both values come from the MaxTargets cached at attach. */
 		cpi->max_target = sassc->maxtargets - 1;
 		cpi->max_lun = 255;
 		cpi->initiator_id = sassc->maxtargets - 1;
 		/*
 		 * NOTE(review): strncpy() does not NUL-terminate when the
 		 * source is at least as long as the limit; confirm that
 		 * consumers treat these as fixed-width fields.
 		 */
 		strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strncpy(cpi->hba_vid, "Avago Tech (LSI)", HBA_IDLEN);
 		strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
 		cpi->unit_number = cam_sim_unit(sim);
 		cpi->bus_id = cam_sim_bus(sim);
 		cpi->base_transfer_speed = 150000;
 		cpi->transport = XPORT_SAS;
 		cpi->transport_version = 0;
 		cpi->protocol = PROTO_SCSI;
 		cpi->protocol_version = SCSI_REV_SPC;
 
 		/*
 		 * Max IO Size is Page Size * the following:
 		 * ((SGEs per frame - 1 for chain element) *
 		 * Max Chain Depth) + 1 for no chain needed in last frame
 		 *
 		 * If user suggests a Max IO size to use, use the smaller of the
 		 * user's value and the calculated value as long as the user's
 		 * value is larger than 0. The user's value is in pages.
 		 */
 		sges_per_frame = ((sc->facts->IOCRequestFrameSize * 4) /
 		    sizeof(MPI2_SGE_SIMPLE64)) - 1;
 		cpi->maxio = (sges_per_frame * sc->facts->MaxChainDepth) + 1;
 		cpi->maxio *= PAGE_SIZE;
 		if ((sc->max_io_pages > 0) && (sc->max_io_pages * PAGE_SIZE <
 		    cpi->maxio))
 			cpi->maxio = sc->max_io_pages * PAGE_SIZE;
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		break;
 	}
 	case XPT_GET_TRAN_SETTINGS:
 	{
 		/* Report transport/protocol settings for a single target. */
 		struct ccb_trans_settings	*cts;
 		struct ccb_trans_settings_sas	*sas;
 		struct ccb_trans_settings_scsi	*scsi;
 		struct mpssas_target *targ;
 
 		cts = &ccb->cts;
 		sas = &cts->xport_specific.sas;
 		scsi = &cts->proto_specific.scsi;
 
 		KASSERT(cts->ccb_h.target_id < sassc->maxtargets,
 		    ("Target %d out of bounds in XPT_GET_TRANS_SETTINGS\n",
 		    cts->ccb_h.target_id));
 		targ = &sassc->targets[cts->ccb_h.target_id];
 		/* A zero handle means no device is behind this target id. */
 		if (targ->handle == 0x0) {
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			break;
 		}
 
 		cts->protocol_version = SCSI_REV_SPC2;
 		cts->transport = XPORT_SAS;
 		cts->transport_version = 0;
 
 		/*
 		 * Translate the stored link rate code into a bitrate; an
 		 * unrecognized code leaves the speed marked as not valid.
 		 * (Presumably 1.5/3/6 Gb/s expressed in kb/s -- confirm.)
 		 */
 		sas->valid = CTS_SAS_VALID_SPEED;
 		switch (targ->linkrate) {
 		case 0x08:
 			sas->bitrate = 150000;
 			break;
 		case 0x09:
 			sas->bitrate = 300000;
 			break;
 		case 0x0a:
 			sas->bitrate = 600000;
 			break;
 		default:
 			sas->valid = 0;
 		}
 
 		cts->protocol = PROTO_SCSI;
 		scsi->valid = CTS_SCSI_VALID_TQ;
 		scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
 
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		break;
 	}
 	case XPT_CALC_GEOMETRY:
 		cam_calc_geometry(&ccb->ccg, /*extended*/1);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		break;
 	case XPT_RESET_DEV:
 		/* Handed off to the helper; no xpt_done() call here. */
 		mps_dprint(sassc->sc, MPS_XINFO, "mpssas_action XPT_RESET_DEV\n");
 		mpssas_action_resetdev(sassc, ccb);
 		return;
 	case XPT_RESET_BUS:
 	case XPT_ABORT:
 	case XPT_TERM_IO:
 		/* Not implemented: report success without doing anything. */
 		mps_dprint(sassc->sc, MPS_XINFO,
 		    "mpssas_action faking success for abort or reset\n");
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		break;
 	case XPT_SCSI_IO:
 		/* Handed off to the helper; no xpt_done() call here. */
 		mpssas_action_scsiio(sassc, ccb);
 		return;
 #if __FreeBSD_version >= 900026
 	case XPT_SMP_IO:
 		/* Handed off to the helper; no xpt_done() call here. */
 		mpssas_action_smpio(sassc, ccb);
 		return;
 #endif
 	default:
 		mpssas_set_ccbstatus(ccb, CAM_FUNC_NOTAVAIL);
 		break;
 	}
 	/* Complete every request that was answered inline above. */
 	xpt_done(ccb);
 
 }
 
 /*
  * Post the async event 'ac_code' to CAM for the given target/LUN on this
  * controller's SIM path.  A temporary path is created for the notification
  * and freed afterwards; if the path cannot be created the event is dropped
  * with an error message.
  */
 static void
 mpssas_announce_reset(struct mps_softc *sc, uint32_t ac_code,
     target_id_t target_id, lun_id_t lun_id)
 {
 	struct cam_path *path;
 	path_id_t path_id;
 	cam_status status;
 
 	mps_dprint(sc, MPS_XINFO, "%s code %x target %d lun %jx\n", __func__,
 	    ac_code, target_id, (uintmax_t)lun_id);
 
 	path_id = cam_sim_path(sc->sassc->sim);
 	status = xpt_create_path(&path, NULL, path_id, target_id, lun_id);
 	if (status != CAM_REQ_CMP) {
 		mps_dprint(sc, MPS_ERROR, "unable to create path for reset "
 			   "notification\n");
 		return;
 	}
 
 	xpt_async(ac_code, path, NULL);
 	xpt_free_path(path);
 }
 
 /*
  * Walk the command array from index 1 upward and force every command to
  * finish with a NULL reply: polled commands are flagged complete,
  * commands with a completion callback have it invoked, and sleepers
  * waiting on a command are woken.  The controller mutex must be held
  * (asserted below).  Called from mpssas_handle_reinit().
  */
 static void 
 mpssas_complete_all_commands(struct mps_softc *sc)
 {
 	struct mps_command *cm;
 	int i;
 	int completed;
 
 	MPS_FUNCTRACE(sc);
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	/* complete all commands with a NULL reply */
 	for (i = 1; i < sc->num_reqs; i++) {
 		cm = &sc->commands[i];
 		cm->cm_reply = NULL;
 		completed = 0;
 
 		/* Let pollers see the command as finished. */
 		if (cm->cm_flags & MPS_CM_FLAGS_POLLED)
 			cm->cm_flags |= MPS_CM_FLAGS_COMPLETE;
 
 		/* The callback sees cm_reply == NULL and must cope with it. */
 		if (cm->cm_complete != NULL) {
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completing cm %p state %x ccb %p for diag reset\n", 
 			    cm, cm->cm_state, cm->cm_ccb);
 
 			cm->cm_complete(sc, cm);
 			completed = 1;
 		}
 
 		if (cm->cm_flags & MPS_CM_FLAGS_WAKEUP) {
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "waking up cm %p state %x ccb %p for diag reset\n", 
 			    cm, cm->cm_state, cm->cm_ccb);
 			wakeup(cm);
 			completed = 1;
 		}
 
 		/*
 		 * NOTE(review): the active-I/O counter is decremented once
 		 * per slot visited, whether or not that slot held an active
 		 * command.  The warning fires when the counter is already 0;
 		 * no assignment is needed to "resync" it.
 		 */
 		if (cm->cm_sc->io_cmds_active != 0) {
 			cm->cm_sc->io_cmds_active--;
 		} else {
 			mps_dprint(cm->cm_sc, MPS_INFO, "Warning: "
 			    "io_cmds_active is out of sync - resynching to "
 			    "0\n");
 		}
 		
 		/* In use, but nobody was there to be notified about it. */
 		if ((completed == 0) && (cm->cm_state != MPS_CM_STATE_FREE)) {
 			/* this should never happen, but if it does, log */
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "cm %p state %x flags 0x%x ccb %p during diag "
 			    "reset\n", cm, cm->cm_state, cm->cm_flags,
 			    cm->cm_ccb);
 		}
 	}
 }
 
 void
 mpssas_handle_reinit(struct mps_softc *sc)
 {
 	int i;
 
 	/* Go back into startup mode and freeze the simq, so that CAM
 	 * doesn't send any commands until after we've rediscovered all
 	 * targets and found the proper device handles for them.
 	 *
 	 * After the reset, portenable will trigger discovery, and after all
 	 * discovery-related activities have finished, the simq will be
 	 * released.
 	 */
 	mps_dprint(sc, MPS_INIT, "%s startup\n", __func__);
 	sc->sassc->flags |= MPSSAS_IN_STARTUP;
 	sc->sassc->flags |= MPSSAS_IN_DISCOVERY;
 	mpssas_startup_increment(sc->sassc);
 
 	/* notify CAM of a bus reset */
 	mpssas_announce_reset(sc, AC_BUS_RESET, CAM_TARGET_WILDCARD, 
 	    CAM_LUN_WILDCARD);
 
 	/* complete and cleanup after all outstanding commands */
 	mpssas_complete_all_commands(sc);
 
 	mps_dprint(sc, MPS_INIT,
 	    "%s startup %u after command completion\n", __func__,
 	    sc->sassc->startup_refcount);
 
 	/* zero all the target handles, since they may change after the
 	 * reset, and we have to rediscover all the targets and use the new
 	 * handles.  
 	 */
 	for (i = 0; i < sc->sassc->maxtargets; i++) {
 		if (sc->sassc->targets[i].outstanding != 0)
 			mps_dprint(sc, MPS_INIT, "target %u outstanding %u\n", 
 			    i, sc->sassc->targets[i].outstanding);
 		sc->sassc->targets[i].handle = 0x0;
 		sc->sassc->targets[i].exp_dev_handle = 0x0;
 		sc->sassc->targets[i].outstanding = 0;
 		sc->sassc->targets[i].flags = MPSSAS_TARGET_INDIAGRESET;
 	}
 }
 
 /*
  * Callout handler armed when a task management request is sent.  If it
  * fires, the request got no answer in time: log it and reinitialize the
  * controller.  Runs with the controller mutex held (asserted).
  */
 static void
 mpssas_tm_timeout(void *data)
 {
 	struct mps_command *tm;
 	struct mps_softc *sc;
 
 	tm = (struct mps_command *)data;
 	sc = tm->cm_sc;
 
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	mpssas_log_command(tm, MPS_INFO|MPS_RECOVERY,
 	    "task mgmt %p timed out\n", tm);
 	mps_reinit(sc);
 }
 
 /*
  * Completion handler for a logical unit reset sent by mpssas_send_reset().
  *
  * If no commands remain queued for the LUN, recovery for it is finished:
  * CAM is notified, and the TM is either reused to abort the next timed-out
  * command on the target or released.  If commands remain, the reset is
  * treated as failed and escalated to a target reset.  A missing reply
  * outside of a diag reset triggers a controller reinit.
  *
  * Every path that releases the TM also clears targ->tm, so the target is
  * never left pointing at a freed command.
  */
 static void
 mpssas_logical_unit_reset_complete(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SCSI_TASK_MANAGE_REPLY *reply;
 	unsigned int cm_count = 0;
 	struct mps_command *cm;
 	struct mpssas_target *targ;
 
 	/* The answer arrived; disarm the TM timeout. */
 	callout_stop(&tm->cm_callout);
 
 	reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 	targ = tm->cm_targ;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and
 	 * task management commands don't have S/G lists.
 	 * XXXSL So should it be an assertion?
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for LUN reset! "
 			   "This should not happen!\n", __func__, tm->cm_flags);
 		/* Drop the target's reference before the TM is released. */
 		targ->tm = NULL;
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	if (reply == NULL) {
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "NULL reset reply for tm %p\n", tm);
 		if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) {
 			/* this completion was due to a reset, just cleanup */
 			targ->tm = NULL;
 			mpssas_free_tm(sc, tm);
 		}
 		else {
 			/* we should have gotten a reply. */
 			mps_reinit(sc);
 		}
 		return;
 	}
 
 	mpssas_log_command(tm, MPS_RECOVERY,
 	    "logical unit reset status 0x%x code 0x%x count %u\n",
 	    le16toh(reply->IOCStatus), le32toh(reply->ResponseCode),
 	    le32toh(reply->TerminationCount));
 		
 	/* See if there are any outstanding commands for this LUN.
 	 * This could be made more efficient by using a per-LU data
 	 * structure of some sort.
 	 */
 	TAILQ_FOREACH(cm, &targ->commands, cm_link) {
 		if (cm->cm_lun == tm->cm_lun)
 			cm_count++;
 	}
 
 	if (cm_count == 0) {
 		mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO,
 		    "logical unit %u finished recovery after reset\n",
 		    tm->cm_lun);
 
 		mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, 
 		    tm->cm_lun);
 
 		/* we've finished recovery for this logical unit.  check and
 		 * see if some other logical unit has a timedout command
 		 * that needs to be processed.
 		 */
 		cm = TAILQ_FIRST(&targ->timedout_commands);
 		if (cm) {
 			mpssas_send_abort(sc, tm, cm);
 		}
 		else {
 			targ->tm = NULL;
 			mpssas_free_tm(sc, tm);
 		}
 	}
 	else {
 		/* if we still have commands for this LUN, the reset
 		 * effectively failed, regardless of the status reported.
 		 * Escalate to a target reset.
 		 */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "logical unit reset complete for tm %p, but still have %u command(s)\n",
 		    tm, cm_count);
 		mpssas_send_reset(sc, tm,
 		    MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET);
 	}
 }
 
 /*
  * Completion handler for a target reset sent by mpssas_send_reset().
  *
  * If the target has no outstanding commands left, recovery is finished:
  * CAM is notified and the TM is released.  Otherwise the reset is treated
  * as failed and escalated to a controller reinit.  A missing reply outside
  * of a diag reset also triggers a reinit.
  *
  * Every path that releases the TM also clears targ->tm, so the target is
  * never left pointing at a freed command.
  */
 static void
 mpssas_target_reset_complete(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SCSI_TASK_MANAGE_REPLY *reply;
 	struct mpssas_target *targ;
 
 	/* The answer arrived; disarm the TM timeout. */
 	callout_stop(&tm->cm_callout);
 
 	reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 	targ = tm->cm_targ;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and
 	 * task management commands don't have S/G lists.
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_ERROR,"%s: cm_flags = %#x for target reset! "
 			   "This should not happen!\n", __func__, tm->cm_flags);
 		/* Drop the target's reference before the TM is released. */
 		targ->tm = NULL;
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	if (reply == NULL) {
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "NULL reset reply for tm %p\n", tm);
 		if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) {
 			/* this completion was due to a reset, just cleanup */
 			targ->tm = NULL;
 			mpssas_free_tm(sc, tm);
 		}
 		else {
 			/* we should have gotten a reply. */
 			mps_reinit(sc);
 		}
 		return;
 	}
 
 	mpssas_log_command(tm, MPS_RECOVERY,
 	    "target reset status 0x%x code 0x%x count %u\n",
 	    le16toh(reply->IOCStatus), le32toh(reply->ResponseCode),
 	    le32toh(reply->TerminationCount));
 
 	if (targ->outstanding == 0) {
 		/* we've finished recovery for this target and all
 		 * of its logical units.
 		 */
 		mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO,
 		    "recovery finished after target reset\n");
 
 		mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid,
 		    CAM_LUN_WILDCARD);
 
 		targ->tm = NULL;
 		mpssas_free_tm(sc, tm);
 	}
 	else {
 		/* after a target reset, if this target still has
 		 * outstanding commands, the reset effectively failed,
 		 * regardless of the status reported.  escalate.
 		 */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "target reset complete for tm %p, but still have %u command(s)\n", 
 		    tm, targ->outstanding);
 		mps_reinit(sc);
 	}
 }
 
 /* Multiplied by hz when arming the reset TM callout below. */
 #define MPS_RESET_TIMEOUT 30
 
 /*
  * Turn the task management command 'tm' into a logical unit reset or a
  * target reset (selected by 'type') for tm->cm_targ and submit it.
  *
  * Returns -1 when the target has no device handle or 'type' is not one of
  * the two supported reset types; otherwise returns whatever
  * mps_map_command() returned.
  */
 int
 mpssas_send_reset(struct mps_softc *sc, struct mps_command *tm, uint8_t type)
 {
 	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 	struct mpssas_target *target;
 	int err;
 
 	target = tm->cm_targ;
 	if (target->handle == 0) {
 		mps_dprint(sc, MPS_ERROR,"%s null devhandle for target_id %d\n",
 		    __func__, target->tid);
 		return -1;
 	}
 
 	/* Fields shared by both reset flavours. */
 	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 	req->DevHandle = htole16(target->handle);
 	req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
 	req->TaskType = type;
 
 	switch (type) {
 	case MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET:
 		/* XXX Need to handle invalid LUNs */
 		MPS_SET_LUN(req->LUN, tm->cm_lun);
 		target->logical_unit_resets++;
 		mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO,
 		    "sending logical unit reset\n");
 		tm->cm_complete = mpssas_logical_unit_reset_complete;
 		mpssas_prepare_for_tm(sc, tm, target, tm->cm_lun);
 		break;
 	case MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET:
 		/* Reset method: SAS hard link reset / SATA link reset. */
 		req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET;
 		target->target_resets++;
 		mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO,
 		    "sending target reset\n");
 		tm->cm_complete = mpssas_target_reset_complete;
 		mpssas_prepare_for_tm(sc, tm, target, CAM_LUN_WILDCARD);
 		break;
 	default:
 		mps_dprint(sc, MPS_ERROR, "unexpected reset type 0x%x\n", type);
 		return -1;
 	}
 
 	tm->cm_complete_data = (void *)tm;
 	tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY;
 	tm->cm_data = NULL;
 
 	/* Arm the timeout before the request is handed to the controller. */
 	callout_reset(&tm->cm_callout, MPS_RESET_TIMEOUT * hz,
 	    mpssas_tm_timeout, tm);
 
 	err = mps_map_command(sc, tm);
 	if (err != 0) {
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "error %d sending reset type %u\n",
 		    err, type);
 	}
 
 	return err;
 }
 
 
 /*
  * Completion handler for an ABORT_TASK request sent by mpssas_send_abort().
  *
  * Looks at the head of the target's timed-out list to decide what to do:
  *  - list empty: recovery is finished and the TM is released;
  *  - head is a different command than the one just aborted: the abort
  *    worked, so continue by aborting the next one;
  *  - head is still the aborted command: the abort failed, so escalate to
  *    a logical unit reset.
  * A missing reply outside of a diag reset triggers a controller reinit.
  *
  * Every path that releases the TM also clears targ->tm, so the target is
  * never left pointing at a freed command.
  */
 static void
 mpssas_abort_complete(struct mps_softc *sc, struct mps_command *tm)
 {
 	struct mps_command *cm;
 	MPI2_SCSI_TASK_MANAGE_REPLY *reply;
 	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 	struct mpssas_target *targ;
 
 	/* The answer arrived; disarm the TM timeout. */
 	callout_stop(&tm->cm_callout);
 
 	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 	reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 	targ = tm->cm_targ;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and
 	 * task management commands don't have S/G lists.
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "cm_flags = %#x for abort %p TaskMID %u!\n", 
 		    tm->cm_flags, tm, le16toh(req->TaskMID));
 		/* Drop the target's reference before the TM is released. */
 		targ->tm = NULL;
 		mpssas_free_tm(sc, tm);
 		return;
 	}
 
 	if (reply == NULL) {
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "NULL abort reply for tm %p TaskMID %u\n", 
 		    tm, le16toh(req->TaskMID));
 		if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) {
 			/* this completion was due to a reset, just cleanup */
 			targ->tm = NULL;
 			mpssas_free_tm(sc, tm);
 		}
 		else {
 			/* we should have gotten a reply. */
 			mps_reinit(sc);
 		}
 		return;
 	}
 
 	mpssas_log_command(tm, MPS_RECOVERY,
 	    "abort TaskMID %u status 0x%x code 0x%x count %u\n",
 	    le16toh(req->TaskMID),
 	    le16toh(reply->IOCStatus), le32toh(reply->ResponseCode),
 	    le32toh(reply->TerminationCount));
 
 	cm = TAILQ_FIRST(&tm->cm_targ->timedout_commands);
 	if (cm == NULL) {
 		/* if there are no more timedout commands, we're done with
 		 * error recovery for this target.
 		 */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "finished recovery after aborting TaskMID %u\n",
 		    le16toh(req->TaskMID));
 
 		targ->tm = NULL;
 		mpssas_free_tm(sc, tm);
 	}
 	else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) {
 		/* abort success, but we have more timedout commands to abort */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "continuing recovery after aborting TaskMID %u\n",
 		    le16toh(req->TaskMID));
 		
 		mpssas_send_abort(sc, tm, cm);
 	}
 	else {
 		/* we didn't get a command completion, so the abort
 		 * failed as far as we're concerned.  escalate.
 		 */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "abort failed for TaskMID %u tm %p\n",
 		    le16toh(req->TaskMID), tm);
 
 		mpssas_send_reset(sc, tm, 
 		    MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET);
 	}
 }
 
 /* Multiplied by hz when arming the abort TM callout below. */
 #define MPS_ABORT_TIMEOUT 5
 
 /*
  * Use the task management command 'tm' to send an ABORT_TASK for the
  * timed-out command 'cm'.  The TM inherits cm's target and LUN and is
  * completed by mpssas_abort_complete().
  *
  * Returns -1 when cm's target has no device handle; otherwise returns
  * whatever mps_map_command() returned.
  */
 static int
 mpssas_send_abort(struct mps_softc *sc, struct mps_command *tm, struct mps_command *cm)
 {
 	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 	struct mpssas_target *targ;
 	int err;
 
 	targ = cm->cm_targ;
 	if (targ->handle == 0) {
 		mps_dprint(sc, MPS_ERROR,"%s null devhandle for target_id %d\n",
 		    __func__, cm->cm_ccb->ccb_h.target_id);
 		return -1;
 	}
 
 	mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO,
 	    "Aborting command %p\n", cm);
 
 	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 	req->DevHandle = htole16(targ->handle);
 	req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
 	req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_ABORT_TASK;
 
 	/* XXX Need to handle invalid LUNs */
 	MPS_SET_LUN(req->LUN, cm->cm_ccb->ccb_h.target_lun);
 
 	/* Identify the victim by its SMID (stored little-endian). */
 	req->TaskMID = htole16(cm->cm_desc.Default.SMID);
 
 	tm->cm_data = NULL;
 	tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY;
 	tm->cm_complete = mpssas_abort_complete;
 	tm->cm_complete_data = (void *)tm;
 	tm->cm_targ = cm->cm_targ;
 	tm->cm_lun = cm->cm_lun;
 
 	/* Arm the timeout before the request is handed to the controller. */
 	callout_reset(&tm->cm_callout, MPS_ABORT_TIMEOUT * hz,
 	    mpssas_tm_timeout, tm);
 
 	targ->aborts++;
 
 	mps_dprint(sc, MPS_INFO, "Sending reset from %s for target ID %d\n",
 	    __func__, targ->tid);
 	mpssas_prepare_for_tm(sc, tm, targ, tm->cm_lun);
 
 	err = mps_map_command(sc, tm);
 	if (err)
 		/* Convert back to host order, as every other TaskMID log does. */
 		mpssas_log_command(tm, MPS_RECOVERY,
 		    "error %d sending abort for cm %p SMID %u\n",
 		    err, cm, le16toh(req->TaskMID));
 	return err;
 }
 
 /*
  * Callout handler for a SCSI I/O command that did not complete in time.
  * 'data' is the struct mps_command that timed out.  Runs with the
  * controller mutex held (asserted).
  *
  * The interrupt handler is run first in case the completion is merely
  * pending.  If the command really is still outstanding it is marked
  * timed out and queued on the target's timed-out list; unless the target
  * already has a recovery TM in flight, a new TM is allocated and recovery
  * starts by aborting this command.
  */
 static void
 mpssas_scsiio_timeout(void *data)
 {
 	struct mps_softc *sc;
 	struct mps_command *cm;
 	struct mpssas_target *targ;
 
 	cm = (struct mps_command *)data;
 	sc = cm->cm_sc;
 
 	MPS_FUNCTRACE(sc);
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	/* Log the command being checked (not the softc). */
 	mps_dprint(sc, MPS_XINFO, "Timeout checking cm %p\n", cm);
 
 	/*
 	 * Run the interrupt handler to make sure it's not pending.  This
 	 * isn't perfect because the command could have already completed
 	 * and been re-used, though this is unlikely.
 	 */
 	mps_intr_locked(sc);
 	if (cm->cm_state == MPS_CM_STATE_FREE) {
 		mpssas_log_command(cm, MPS_XINFO,
 		    "SCSI command %p almost timed out\n", cm);
 		return;
 	}
 
 	if (cm->cm_ccb == NULL) {
 		mps_dprint(sc, MPS_ERROR, "command timeout with NULL ccb\n");
 		return;
 	}
 
 	mpssas_log_command(cm, MPS_INFO, "command timeout cm %p ccb %p\n", 
 	    cm, cm->cm_ccb);
 
 	targ = cm->cm_targ;
 	targ->timeouts++;
 
 	/* XXX first, check the firmware state, to see if it's still
 	 * operational.  if not, do a diag reset.
 	 */
 	mpssas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT);
 	cm->cm_state = MPS_CM_STATE_TIMEDOUT;
 	TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery);
 
 	if (targ->tm != NULL) {
 		/* target already in recovery, just queue up another
 		 * timedout command to be processed later.
 		 */
 		mps_dprint(sc, MPS_RECOVERY,
 		    "queued timedout cm %p for processing by tm %p\n",
 		    cm, targ->tm);
 	}
 	else if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) {
 		mps_dprint(sc, MPS_RECOVERY, "timedout cm %p allocated tm %p\n",
 		    cm, targ->tm);
 
 		/* start recovery by aborting the first timedout command */
 		mpssas_send_abort(sc, targ->tm, cm);
 	}
 	else {
 		/* XXX queue this target up for recovery once a TM becomes
 		 * available.  The firmware only has a limited number of
 		 * HighPriority credits for the high priority requests used
 		 * for task management, and we ran out.
 		 * 
 		 * Isilon: don't worry about this for now, since we have
 		 * more credits than disks in an enclosure, and limit
 		 * ourselves to one TM per target for recovery.
 		 */
 		mps_dprint(sc, MPS_RECOVERY,
 		    "timedout cm %p failed to allocate a tm\n", cm);
 	}
 
 }
 
 static void
 mpssas_action_scsiio(struct mpssas_softc *sassc, union ccb *ccb)
 {
 	MPI2_SCSI_IO_REQUEST *req;
 	struct ccb_scsiio *csio;
 	struct mps_softc *sc;
 	struct mpssas_target *targ;
 	struct mpssas_lun *lun;
 	struct mps_command *cm;
 	uint8_t i, lba_byte, *ref_tag_addr;
 	uint16_t eedp_flags;
 	uint32_t mpi_control;
 
 	sc = sassc->sc;
 	MPS_FUNCTRACE(sc);
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	csio = &ccb->csio;
 	KASSERT(csio->ccb_h.target_id < sassc->maxtargets,
 	    ("Target %d out of bounds in XPT_SCSI_IO\n",
 	     csio->ccb_h.target_id));
 	targ = &sassc->targets[csio->ccb_h.target_id];
 	mps_dprint(sc, MPS_TRACE, "ccb %p target flag %x\n", ccb, targ->flags);
 	if (targ->handle == 0x0) {
 		mps_dprint(sc, MPS_ERROR, "%s NULL handle for target %u\n", 
 		    __func__, csio->ccb_h.target_id);
 		mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		xpt_done(ccb);
 		return;
 	}
 	if (targ->flags & MPS_TARGET_FLAGS_RAID_COMPONENT) {
 		mps_dprint(sc, MPS_ERROR, "%s Raid component no SCSI IO "
 		    "supported %u\n", __func__, csio->ccb_h.target_id);
 		mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		xpt_done(ccb);
 		return;
 	}
 	/*
 	 * Sometimes, it is possible to get a command that is not "In
 	 * Progress" and was actually aborted by the upper layer.  Check for
 	 * this here and complete the command without error.
 	 */
 	if (mpssas_get_ccbstatus(ccb) != CAM_REQ_INPROG) {
 		mps_dprint(sc, MPS_TRACE, "%s Command is not in progress for "
 		    "target %u\n", __func__, csio->ccb_h.target_id);
 		xpt_done(ccb);
 		return;
 	}
 	/*
 	 * If devinfo is 0 this will be a volume.  In that case don't tell CAM
 	 * that the volume has timed out.  We want volumes to be enumerated
 	 * until they are deleted/removed, not just failed.
 	 */
 	if (targ->flags & MPSSAS_TARGET_INREMOVAL) {
 		if (targ->devinfo == 0)
 			mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		else
 			mpssas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT);
 		xpt_done(ccb);
 		return;
 	}
 
 	if ((sc->mps_flags & MPS_FLAGS_SHUTDOWN) != 0) {
 		mps_dprint(sc, MPS_INFO, "%s shutting down\n", __func__);
 		mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		xpt_done(ccb);
 		return;
 	}
 
 	/*
 	 * If target has a reset in progress, freeze the devq and return.  The
 	 * devq will be released when the TM reset is finished.
 	 */
 	if (targ->flags & MPSSAS_TARGET_INRESET) {
 		ccb->ccb_h.status = CAM_BUSY | CAM_DEV_QFRZN;
 		mps_dprint(sc, MPS_INFO, "%s: Freezing devq for target ID %d\n",
 		    __func__, targ->tid);
 		xpt_freeze_devq(ccb->ccb_h.path, 1);
 		xpt_done(ccb);
 		return;
 	}
 
 	cm = mps_alloc_command(sc);
 	if (cm == NULL || (sc->mps_flags & MPS_FLAGS_DIAGRESET)) {
 		if (cm != NULL) {
 			mps_free_command(sc, cm);
 		}
 		if ((sassc->flags & MPSSAS_QUEUE_FROZEN) == 0) {
 			xpt_freeze_simq(sassc->sim, 1);
 			sassc->flags |= MPSSAS_QUEUE_FROZEN;
 		}
 		ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
 		ccb->ccb_h.status |= CAM_REQUEUE_REQ;
 		xpt_done(ccb);
 		return;
 	}
 
 	req = (MPI2_SCSI_IO_REQUEST *)cm->cm_req;
 	bzero(req, sizeof(*req));
 	req->DevHandle = htole16(targ->handle);
 	req->Function = MPI2_FUNCTION_SCSI_IO_REQUEST;
 	req->MsgFlags = 0;
 	req->SenseBufferLowAddress = htole32(cm->cm_sense_busaddr);
 	req->SenseBufferLength = MPS_SENSE_LEN;
 	req->SGLFlags = 0;
 	req->ChainOffset = 0;
 	req->SGLOffset0 = 24;	/* 32bit word offset to the SGL */
 	req->SGLOffset1= 0;
 	req->SGLOffset2= 0;
 	req->SGLOffset3= 0;
 	req->SkipCount = 0;
 	req->DataLength = htole32(csio->dxfer_len);
 	req->BidirectionalDataLength = 0;
 	req->IoFlags = htole16(csio->cdb_len);
 	req->EEDPFlags = 0;
 
 	/* Note: BiDirectional transfers are not supported */
 	switch (csio->ccb_h.flags & CAM_DIR_MASK) {
 	case CAM_DIR_IN:
 		mpi_control = MPI2_SCSIIO_CONTROL_READ;
 		cm->cm_flags |= MPS_CM_FLAGS_DATAIN;
 		break;
 	case CAM_DIR_OUT:
 		mpi_control = MPI2_SCSIIO_CONTROL_WRITE;
 		cm->cm_flags |= MPS_CM_FLAGS_DATAOUT;
 		break;
 	case CAM_DIR_NONE:
 	default:
 		mpi_control = MPI2_SCSIIO_CONTROL_NODATATRANSFER;
 		break;
 	}
  
 	if (csio->cdb_len == 32)
                 mpi_control |= 4 << MPI2_SCSIIO_CONTROL_ADDCDBLEN_SHIFT;
 	/*
 	 * It looks like the hardware doesn't require an explicit tag
 	 * number for each transaction.  SAM Task Management not supported
 	 * at the moment.
 	 */
 	switch (csio->tag_action) {
 	case MSG_HEAD_OF_Q_TAG:
 		mpi_control |= MPI2_SCSIIO_CONTROL_HEADOFQ;
 		break;
 	case MSG_ORDERED_Q_TAG:
 		mpi_control |= MPI2_SCSIIO_CONTROL_ORDEREDQ;
 		break;
 	case MSG_ACA_TASK:
 		mpi_control |= MPI2_SCSIIO_CONTROL_ACAQ;
 		break;
 	case CAM_TAG_ACTION_NONE:
 	case MSG_SIMPLE_Q_TAG:
 	default:
 		mpi_control |= MPI2_SCSIIO_CONTROL_SIMPLEQ;
 		break;
 	}
 	mpi_control |= sc->mapping_table[csio->ccb_h.target_id].TLR_bits;
 	req->Control = htole32(mpi_control);
 	if (MPS_SET_LUN(req->LUN, csio->ccb_h.target_lun) != 0) {
 		mps_free_command(sc, cm);
 		mpssas_set_ccbstatus(ccb, CAM_LUN_INVALID);
 		xpt_done(ccb);
 		return;
 	}
 
 	if (csio->ccb_h.flags & CAM_CDB_POINTER)
 		bcopy(csio->cdb_io.cdb_ptr, &req->CDB.CDB32[0], csio->cdb_len);
 	else
 		bcopy(csio->cdb_io.cdb_bytes, &req->CDB.CDB32[0],csio->cdb_len);
 	req->IoFlags = htole16(csio->cdb_len);
 
 	/*
 	 * Check if EEDP is supported and enabled.  If it is then check if the
 	 * SCSI opcode could be using EEDP.  If so, make sure the LUN exists and
 	 * is formatted for EEDP support.  If all of this is true, set CDB up
 	 * for EEDP transfer.
 	 */
 	eedp_flags = op_code_prot[req->CDB.CDB32[0]];
 	if (sc->eedp_enabled && eedp_flags) {
 		SLIST_FOREACH(lun, &targ->luns, lun_link) {
 			if (lun->lun_id == csio->ccb_h.target_lun) {
 				break;
 			}
 		}
 
 		if ((lun != NULL) && (lun->eedp_formatted)) {
 			req->EEDPBlockSize = htole16(lun->eedp_block_size);
 			eedp_flags |= (MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG |
 			    MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
 			    MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD);
 			req->EEDPFlags = htole16(eedp_flags);
 
 			/*
 			 * If CDB less than 32, fill in Primary Ref Tag with
 			 * low 4 bytes of LBA.  If CDB is 32, tag stuff is
 			 * already there.  Also, set protection bit.  FreeBSD
 			 * currently does not support CDBs bigger than 16, but
 			 * the code doesn't hurt, and will be here for the
 			 * future.
 			 */
 			if (csio->cdb_len != 32) {
 				lba_byte = (csio->cdb_len == 16) ? 6 : 2;
 				ref_tag_addr = (uint8_t *)&req->CDB.EEDP32.
 				    PrimaryReferenceTag;
 				for (i = 0; i < 4; i++) {
 					*ref_tag_addr =
 					    req->CDB.CDB32[lba_byte + i];
 					ref_tag_addr++;
 				}
 				req->CDB.EEDP32.PrimaryReferenceTag = 
 					htole32(req->CDB.EEDP32.PrimaryReferenceTag);
 				req->CDB.EEDP32.PrimaryApplicationTagMask =
 				    0xFFFF;
 				req->CDB.CDB32[1] = (req->CDB.CDB32[1] & 0x1F) |
 				    0x20;
 			} else {
 				eedp_flags |=
 				    MPI2_SCSIIO_EEDPFLAGS_INC_PRI_APPTAG;
 				req->EEDPFlags = htole16(eedp_flags);
 				req->CDB.CDB32[10] = (req->CDB.CDB32[10] &
 				    0x1F) | 0x20;
 			}
 		}
 	}
 
 	cm->cm_length = csio->dxfer_len;
 	if (cm->cm_length != 0) {
 		cm->cm_data = ccb;
 		cm->cm_flags |= MPS_CM_FLAGS_USE_CCB;
 	} else {
 		cm->cm_data = NULL;
 	}
 	cm->cm_sge = &req->SGL;
 	cm->cm_sglsize = (32 - 24) * 4;
 	cm->cm_desc.SCSIIO.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO;
 	cm->cm_desc.SCSIIO.DevHandle = htole16(targ->handle);
 	cm->cm_complete = mpssas_scsiio_complete;
 	cm->cm_complete_data = ccb;
 	cm->cm_targ = targ;
 	cm->cm_lun = csio->ccb_h.target_lun;
 	cm->cm_ccb = ccb;
 
 	/*
 	 * If HBA is a WD and the command is not for a retry, try to build a
 	 * direct I/O message. If failed, or the command is for a retry, send
 	 * the I/O to the IR volume itself.
 	 */
 	if (sc->WD_valid_config) {
 		if (ccb->ccb_h.sim_priv.entries[0].field == MPS_WD_RETRY) {
 			mpssas_direct_drive_io(sassc, cm, ccb);
 		} else {
 			mpssas_set_ccbstatus(ccb, CAM_REQ_INPROG);
 		}
 	}
 
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (csio->bio != NULL)
+		biotrack(csio->bio, __func__);
+#endif
 	callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0,
 	    mpssas_scsiio_timeout, cm, 0);
 
 	targ->issued++;
 	targ->outstanding++;
 	TAILQ_INSERT_TAIL(&targ->commands, cm, cm_link);
 	ccb->ccb_h.status |= CAM_SIM_QUEUED;
 
 	mpssas_log_command(cm, MPS_XINFO, "%s cm %p ccb %p outstanding %u\n",
 	    __func__, cm, ccb, targ->outstanding);
 
 	mps_map_command(sc, cm);
 	return;
 }
 
 /*
  * Log a textual description of a SCSI task management response code.
  *
  * sc            - adapter softc, passed through to mps_dprint().
  * response_code - response code to describe; codes without an entry in the
  *                 table below are reported as "unknown".
  *
  * The message is emitted through mps_dprint() with the MPS_XINFO level.
  */
 static void
 mps_response_code(struct mps_softc *sc, u8 response_code)
 {
 	/* Known response codes and the text logged for each of them. */
 	static const struct {
 		u8		code;
 		const char	*text;
 	} rsp_text[] = {
 		{ MPI2_SCSITASKMGMT_RSP_TM_COMPLETE,
 		    "task management request completed" },
 		{ MPI2_SCSITASKMGMT_RSP_INVALID_FRAME,
 		    "invalid frame" },
 		{ MPI2_SCSITASKMGMT_RSP_TM_NOT_SUPPORTED,
 		    "task management request not supported" },
 		{ MPI2_SCSITASKMGMT_RSP_TM_FAILED,
 		    "task management request failed" },
 		{ MPI2_SCSITASKMGMT_RSP_TM_SUCCEEDED,
 		    "task management request succeeded" },
 		{ MPI2_SCSITASKMGMT_RSP_TM_INVALID_LUN,
 		    "invalid lun" },
 		{ 0xA,
 		    "overlapped tag attempted" },
 		{ MPI2_SCSITASKMGMT_RSP_IO_QUEUED_ON_IOC,
 		    "task queued, however not sent to target" },
 	};
 	const char *text = "unknown";
 	unsigned int idx;
 
 	for (idx = 0; idx < sizeof(rsp_text) / sizeof(rsp_text[0]); idx++) {
 		if (rsp_text[idx].code == response_code) {
 			text = rsp_text[idx].text;
 			break;
 		}
 	}
 
 	mps_dprint(sc, MPS_XINFO, "response_code(0x%01x): %s\n",
 	    response_code, text);
 }
 /**
  * mps_sc_failed_io_info - translate a non-successful SCSI_IO request
  *
  * Decodes the IOCStatus, SCSIStatus and SCSIState fields of a SCSI IO reply
  * into text and logs them through mps_dprint() with the MPS_XINFO level.
  * When the reply carries valid autosense data and MPS_XINFO debugging is
  * enabled, the sense data of the CCB is printed as well; when it carries
  * valid response info, the response code is decoded via mps_response_code().
  *
  * sc        - adapter softc; sc->tmp_string is used as scratch space for the
  *             SCSI state description and is overwritten.
  * csio      - SCSI I/O CCB the reply belongs to (used for the sense print).
  * mpi_reply - reply frame to decode; multi-byte fields are little-endian.
  */
 static void
 mps_sc_failed_io_info(struct mps_softc *sc, struct ccb_scsiio *csio,
     Mpi2SCSIIOReply_t *mpi_reply)
 {
 	u32 response_info;
 	u16 ioc_status = le16toh(mpi_reply->IOCStatus) &
 	    MPI2_IOCSTATUS_MASK;
 	u8 scsi_state = mpi_reply->SCSIState;
 	u8 scsi_status = mpi_reply->SCSIStatus;
 	const char *desc_ioc_state = NULL;
 	const char *desc_scsi_status = NULL;
 	const char *desc_scsi_state;
 	char *state_buf = sc->tmp_string;
 	u32 log_info = le32toh(mpi_reply->IOCLogInfo);
 
 	/* Replies carrying this IOCLogInfo value are skipped without logging. */
 	if (log_info == 0x31170000)
 		return;
 
 	switch (ioc_status) {
 	case MPI2_IOCSTATUS_SUCCESS:
 		desc_ioc_state = "success";
 		break;
 	case MPI2_IOCSTATUS_INVALID_FUNCTION:
 		desc_ioc_state = "invalid function";
 		break;
 	case MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR:
 		desc_ioc_state = "scsi recovered error";
 		break;
 	case MPI2_IOCSTATUS_SCSI_INVALID_DEVHANDLE:
 		desc_ioc_state = "scsi invalid dev handle";
 		break;
 	case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE:
 		desc_ioc_state = "scsi device not there";
 		break;
 	case MPI2_IOCSTATUS_SCSI_DATA_OVERRUN:
 		desc_ioc_state = "scsi data overrun";
 		break;
 	case MPI2_IOCSTATUS_SCSI_DATA_UNDERRUN:
 		desc_ioc_state = "scsi data underrun";
 		break;
 	case MPI2_IOCSTATUS_SCSI_IO_DATA_ERROR:
 		desc_ioc_state = "scsi io data error";
 		break;
 	case MPI2_IOCSTATUS_SCSI_PROTOCOL_ERROR:
 		desc_ioc_state = "scsi protocol error";
 		break;
 	case MPI2_IOCSTATUS_SCSI_TASK_TERMINATED:
 		desc_ioc_state = "scsi task terminated";
 		break;
 	case MPI2_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
 		desc_ioc_state = "scsi residual mismatch";
 		break;
 	case MPI2_IOCSTATUS_SCSI_TASK_MGMT_FAILED:
 		desc_ioc_state = "scsi task mgmt failed";
 		break;
 	case MPI2_IOCSTATUS_SCSI_IOC_TERMINATED:
 		desc_ioc_state = "scsi ioc terminated";
 		break;
 	case MPI2_IOCSTATUS_SCSI_EXT_TERMINATED:
 		desc_ioc_state = "scsi ext terminated";
 		break;
 	case MPI2_IOCSTATUS_EEDP_GUARD_ERROR:
 		desc_ioc_state = "eedp guard error";
 		break;
 	case MPI2_IOCSTATUS_EEDP_REF_TAG_ERROR:
 		desc_ioc_state = "eedp ref tag error";
 		break;
 	case MPI2_IOCSTATUS_EEDP_APP_TAG_ERROR:
 		desc_ioc_state = "eedp app tag error";
 		break;
 	default:
 		desc_ioc_state = "unknown";
 		break;
 	}
 
 	switch (scsi_status) {
 	case MPI2_SCSI_STATUS_GOOD:
 		desc_scsi_status = "good";
 		break;
 	case MPI2_SCSI_STATUS_CHECK_CONDITION:
 		desc_scsi_status = "check condition";
 		break;
 	case MPI2_SCSI_STATUS_CONDITION_MET:
 		desc_scsi_status = "condition met";
 		break;
 	case MPI2_SCSI_STATUS_BUSY:
 		desc_scsi_status = "busy";
 		break;
 	case MPI2_SCSI_STATUS_INTERMEDIATE:
 		desc_scsi_status = "intermediate";
 		break;
 	case MPI2_SCSI_STATUS_INTERMEDIATE_CONDMET:
 		desc_scsi_status = "intermediate condmet";
 		break;
 	case MPI2_SCSI_STATUS_RESERVATION_CONFLICT:
 		desc_scsi_status = "reservation conflict";
 		break;
 	case MPI2_SCSI_STATUS_COMMAND_TERMINATED:
 		desc_scsi_status = "command terminated";
 		break;
 	case MPI2_SCSI_STATUS_TASK_SET_FULL:
 		desc_scsi_status = "task set full";
 		break;
 	case MPI2_SCSI_STATUS_ACA_ACTIVE:
 		desc_scsi_status = "aca active";
 		break;
 	case MPI2_SCSI_STATUS_TASK_ABORTED:
 		desc_scsi_status = "task aborted";
 		break;
 	default:
 		desc_scsi_status = "unknown";
 		break;
 	}
 
 	/*
 	 * Build the SCSI state description in the softc scratch buffer, one
 	 * fragment per flag that is set.
 	 *
 	 * NOTE(review): strcat() is unbounded; with every flag below set the
 	 * text is 74 characters plus the terminating NUL -- confirm that
 	 * sc->tmp_string is large enough.
 	 */
 	state_buf[0] = '\0';
 	if (scsi_state & MPI2_SCSI_STATE_RESPONSE_INFO_VALID)
 		strcat(state_buf, "response info ");
 	if (scsi_state & MPI2_SCSI_STATE_TERMINATED)
 		strcat(state_buf, "state terminated ");
 	if (scsi_state & MPI2_SCSI_STATE_NO_SCSI_STATUS)
 		strcat(state_buf, "no status ");
 	if (scsi_state & MPI2_SCSI_STATE_AUTOSENSE_FAILED)
 		strcat(state_buf, "autosense failed ");
 	if (scsi_state & MPI2_SCSI_STATE_AUTOSENSE_VALID)
 		strcat(state_buf, "autosense valid ");
 
 	/* With no state bits at all, log a single blank instead. */
 	desc_scsi_state = (scsi_state != 0) ? state_buf : " ";
 
 	mps_dprint(sc, MPS_XINFO, "\thandle(0x%04x), ioc_status(%s)(0x%04x)\n",
 	    le16toh(mpi_reply->DevHandle), desc_ioc_state, ioc_status);
 	/*
 	 * TODO: more detail about underflow data could be added here.
 	 */
 	mps_dprint(sc, MPS_XINFO, "\tscsi_status(%s)(0x%02x), "
 	    "scsi_state(%s)(0x%02x)\n", desc_scsi_status, scsi_status,
 	    desc_scsi_state, scsi_state);
 
 	if ((sc->mps_debug & MPS_XINFO) &&
 	    (scsi_state & MPI2_SCSI_STATE_AUTOSENSE_VALID)) {
 		mps_dprint(sc, MPS_XINFO, "-> Sense Buffer Data : Start :\n");
 		scsi_sense_print(csio);
 		mps_dprint(sc, MPS_XINFO, "-> Sense Buffer Data : End :\n");
 	}
 
 	if (scsi_state & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) {
 		/*
 		 * Take the low-order byte of the host-order value rather than
 		 * the first byte in memory, so the result does not depend on
 		 * host endianness.  mpssas_scsiio_complete() extracts the
 		 * reason code from the converted value with a mask as well
 		 * (presumably the same byte -- confirm against
 		 * MPI2_SCSI_RI_MASK_REASONCODE).
 		 */
 		response_info = le32toh(mpi_reply->ResponseInfo);
 		mps_response_code(sc, (u8)(response_info & 0xFF));
 	}
 }
 
 static void
 mpssas_scsiio_complete(struct mps_softc *sc, struct mps_command *cm)
 {
 	MPI2_SCSI_IO_REPLY *rep;
 	union ccb *ccb;
 	struct ccb_scsiio *csio;
 	struct mpssas_softc *sassc;
 	struct scsi_vpd_supported_page_list *vpd_list = NULL;
 	u8 *TLR_bits, TLR_on;
 	int dir = 0, i;
 	u16 alloc_len;
 	struct mpssas_target *target;
 	target_id_t target_id;
 
 	MPS_FUNCTRACE(sc);
 	mps_dprint(sc, MPS_TRACE,
 	    "cm %p SMID %u ccb %p reply %p outstanding %u\n", cm,
 	    cm->cm_desc.Default.SMID, cm->cm_ccb, cm->cm_reply,
 	    cm->cm_targ->outstanding);
 
 	callout_stop(&cm->cm_callout);
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	sassc = sc->sassc;
 	ccb = cm->cm_complete_data;
 	csio = &ccb->csio;
 	target_id = csio->ccb_h.target_id;
 	rep = (MPI2_SCSI_IO_REPLY *)cm->cm_reply;
 	/*
 	 * XXX KDM if the chain allocation fails, does it matter if we do
 	 * the sync and unload here?  It is simpler to do it in every case,
 	 * assuming it doesn't cause problems.
 	 */
 	if (cm->cm_data != NULL) {
 		if (cm->cm_flags & MPS_CM_FLAGS_DATAIN)
 			dir = BUS_DMASYNC_POSTREAD;
 		else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT)
 			dir = BUS_DMASYNC_POSTWRITE;
 		bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
 		bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
 	}
 
 	cm->cm_targ->completed++;
 	cm->cm_targ->outstanding--;
 	TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link);
 	ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED);
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (ccb->csio.bio != NULL)
+		biotrack(ccb->csio.bio, __func__);
+#endif
 
 	if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) {
 		TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
 		if (cm->cm_reply != NULL)
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completed timedout cm %p ccb %p during recovery "
 			    "ioc %x scsi %x state %x xfer %u\n",
 			    cm, cm->cm_ccb,
 			    le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState,
 			    le32toh(rep->TransferCount));
 		else
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completed timedout cm %p ccb %p during recovery\n",
 			    cm, cm->cm_ccb);
 	} else if (cm->cm_targ->tm != NULL) {
 		if (cm->cm_reply != NULL)
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completed cm %p ccb %p during recovery "
 			    "ioc %x scsi %x state %x xfer %u\n",
 			    cm, cm->cm_ccb,
 			    le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState,
 			    le32toh(rep->TransferCount));
 		else
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completed cm %p ccb %p during recovery\n",
 			    cm, cm->cm_ccb);
 	} else if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) {
 		mpssas_log_command(cm, MPS_RECOVERY,
 		    "reset completed cm %p ccb %p\n",
 		    cm, cm->cm_ccb);
 	}
 
 	if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		/*
 		 * We ran into an error after we tried to map the command,
 		 * so we're getting a callback without queueing the command
 		 * to the hardware.  So we set the status here, and it will
 		 * be retained below.  We'll go through the "fast path",
 		 * because there can be no reply when we haven't actually
 		 * gone out to the hardware.
 		 */
 		mpssas_set_ccbstatus(ccb, CAM_REQUEUE_REQ);
 
 		/*
 		 * Currently the only error included in the mask is
 		 * MPS_CM_FLAGS_CHAIN_FAILED, which means we're out of
 		 * chain frames.  We need to freeze the queue until we get
 		 * a command that completed without this error, which will
 		 * hopefully have some chain frames attached that we can
 		 * use.  If we wanted to get smarter about it, we would
 		 * only unfreeze the queue in this condition when we're
 		 * sure that we're getting some chain frames back.  That's
 		 * probably unnecessary.
 		 */
 		if ((sassc->flags & MPSSAS_QUEUE_FROZEN) == 0) {
 			xpt_freeze_simq(sassc->sim, 1);
 			sassc->flags |= MPSSAS_QUEUE_FROZEN;
 			mps_dprint(sc, MPS_XINFO, "Error sending command, "
 				   "freezing SIM queue\n");
 		}
 	}
 
 	/*
 	 * If this is a Start Stop Unit command and it was issued by the driver
 	 * during shutdown, decrement the refcount to account for all of the
 	 * commands that were sent.  All SSU commands should be completed before
 	 * shutdown completes, meaning SSU_refcount will be 0 after SSU_started
 	 * is TRUE.
 	 */
 	if (sc->SSU_started && (csio->cdb_io.cdb_bytes[0] == START_STOP_UNIT)) {
 		mps_dprint(sc, MPS_INFO, "Decrementing SSU count.\n");
 		sc->SSU_refcount--;
 	}
 
 	/* Take the fast path to completion */
 	if (cm->cm_reply == NULL) {
 		if (mpssas_get_ccbstatus(ccb) == CAM_REQ_INPROG) {
 			if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0)
 				mpssas_set_ccbstatus(ccb, CAM_SCSI_BUS_RESET);
 			else {
 				mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 				ccb->csio.scsi_status = SCSI_STATUS_OK;
 			}
 			if (sassc->flags & MPSSAS_QUEUE_FROZEN) {
 				ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
 				sassc->flags &= ~MPSSAS_QUEUE_FROZEN;
 				mps_dprint(sc, MPS_XINFO,
 				    "Unfreezing SIM queue\n");
 			}
 		} 
 
 		/*
 		 * There are two scenarios where the status won't be
 		 * CAM_REQ_CMP.  The first is if MPS_CM_FLAGS_ERROR_MASK is
 		 * set, the second is in the MPS_FLAGS_DIAGRESET above.
 		 */
 		if (mpssas_get_ccbstatus(ccb) != CAM_REQ_CMP) {
 			/*
 			 * Freeze the dev queue so that commands are
 			 * executed in the correct order after error
 			 * recovery.
 			 */
 			ccb->ccb_h.status |= CAM_DEV_QFRZN;
 			xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1);
 		}
 		mps_free_command(sc, cm);
 		xpt_done(ccb);
 		return;
 	}
 
 	mpssas_log_command(cm, MPS_XINFO,
 	    "ioc %x scsi %x state %x xfer %u\n",
 	    le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState,
 	    le32toh(rep->TransferCount));
 
 	/*
 	 * If this is a Direct Drive I/O, reissue the I/O to the original IR
 	 * Volume if an error occurred (normal I/O retry).  Use the original
 	 * CCB, but set a flag that this will be a retry so that it's sent to
 	 * the original volume.  Free the command but reuse the CCB.
 	 */
 	if (cm->cm_flags & MPS_CM_FLAGS_DD_IO) {
 		mps_free_command(sc, cm);
 		ccb->ccb_h.sim_priv.entries[0].field = MPS_WD_RETRY;
 		mpssas_action_scsiio(sassc, ccb);
 		return;
 	} else
 		ccb->ccb_h.sim_priv.entries[0].field = 0;
 
 	switch (le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) {
 	case MPI2_IOCSTATUS_SCSI_DATA_UNDERRUN:
 		csio->resid = cm->cm_length - le32toh(rep->TransferCount);
 		/* FALLTHROUGH */
 	case MPI2_IOCSTATUS_SUCCESS:
 	case MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR:
 
 		if ((le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) ==
 		    MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR)
 			mpssas_log_command(cm, MPS_XINFO, "recovered error\n");
 
 		/* Completion failed at the transport level. */
 		if (rep->SCSIState & (MPI2_SCSI_STATE_NO_SCSI_STATUS |
 		    MPI2_SCSI_STATE_TERMINATED)) {
 			mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 			break;
 		}
 
 		/* In a modern packetized environment, an autosense failure
 		 * implies that there's not much else that can be done to
 		 * recover the command.
 		 */
 		if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_FAILED) {
 			mpssas_set_ccbstatus(ccb, CAM_AUTOSENSE_FAIL);
 			break;
 		}
 
 		/*
 		 * CAM doesn't care about SAS Response Info data, but if this is
 		 * the state check if TLR should be done.  If not, clear the
 		 * TLR_bits for the target.
 		 */
 		if ((rep->SCSIState & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) &&
 		    ((le32toh(rep->ResponseInfo) &
 		    MPI2_SCSI_RI_MASK_REASONCODE) ==
 		    MPS_SCSI_RI_INVALID_FRAME)) {
 			sc->mapping_table[target_id].TLR_bits =
 			    (u8)MPI2_SCSIIO_CONTROL_NO_TLR;
 		}
 
 		/*
 		 * Intentionally override the normal SCSI status reporting
 		 * for these two cases.  These are likely to happen in a
 		 * multi-initiator environment, and we want to make sure that
 		 * CAM retries these commands rather than fail them.
 		 */
 		if ((rep->SCSIStatus == MPI2_SCSI_STATUS_COMMAND_TERMINATED) ||
 		    (rep->SCSIStatus == MPI2_SCSI_STATUS_TASK_ABORTED)) {
 			mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED);
 			break;
 		}
 
 		/* Handle normal status and sense */
 		csio->scsi_status = rep->SCSIStatus;
 		if (rep->SCSIStatus == MPI2_SCSI_STATUS_GOOD)
 			mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		else
 			mpssas_set_ccbstatus(ccb, CAM_SCSI_STATUS_ERROR);
 
 		if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_VALID) {
 			int sense_len, returned_sense_len;
 
 			returned_sense_len = min(le32toh(rep->SenseCount),
 			    sizeof(struct scsi_sense_data));
 			if (returned_sense_len < ccb->csio.sense_len)
 				ccb->csio.sense_resid = ccb->csio.sense_len -
 					returned_sense_len;
 			else
 				ccb->csio.sense_resid = 0;
 
 			sense_len = min(returned_sense_len,
 			    ccb->csio.sense_len - ccb->csio.sense_resid);
 			bzero(&ccb->csio.sense_data,
 			      sizeof(ccb->csio.sense_data));
 			bcopy(cm->cm_sense, &ccb->csio.sense_data, sense_len);
 			ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
 		}
 
 		/*
 		 * Check if this is an INQUIRY command.  If it's a VPD inquiry,
 		 * and it's page code 0 (Supported Page List), and there is
 		 * inquiry data, and this is for a sequential access device, and
 		 * the device is an SSP target, and TLR is supported by the
 		 * controller, turn the TLR_bits value ON if page 0x90 is
 		 * supported.
 		 */
 		if ((csio->cdb_io.cdb_bytes[0] == INQUIRY) &&
 		    (csio->cdb_io.cdb_bytes[1] & SI_EVPD) &&
 		    (csio->cdb_io.cdb_bytes[2] == SVPD_SUPPORTED_PAGE_LIST) &&
 		    ((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) &&
 		    (csio->data_ptr != NULL) &&
 		    ((csio->data_ptr[0] & 0x1f) == T_SEQUENTIAL) &&
 		    (sc->control_TLR) &&
 		    (sc->mapping_table[target_id].device_info &
 		    MPI2_SAS_DEVICE_INFO_SSP_TARGET)) {
 			vpd_list = (struct scsi_vpd_supported_page_list *)
 			    csio->data_ptr;
 			TLR_bits = &sc->mapping_table[target_id].TLR_bits;
 			*TLR_bits = (u8)MPI2_SCSIIO_CONTROL_NO_TLR;
 			TLR_on = (u8)MPI2_SCSIIO_CONTROL_TLR_ON;
 			alloc_len = ((u16)csio->cdb_io.cdb_bytes[3] << 8) +
 			    csio->cdb_io.cdb_bytes[4];
 			alloc_len -= csio->resid;
 			for (i = 0; i < MIN(vpd_list->length, alloc_len); i++) {
 				if (vpd_list->list[i] == 0x90) {
 					*TLR_bits = TLR_on;
 					break;
 				}
 			}
 		}
 
 		/*
 		 * If this is a SATA direct-access end device, mark it so that
 		 * a SCSI StartStopUnit command will be sent to it when the
 		 * driver is being shutdown.
 		 */
 		if ((csio->cdb_io.cdb_bytes[0] == INQUIRY) &&
 		    ((csio->data_ptr[0] & 0x1f) == T_DIRECT) &&
 		    (sc->mapping_table[target_id].device_info &
 		    MPI2_SAS_DEVICE_INFO_SATA_DEVICE) &&
 		    ((sc->mapping_table[target_id].device_info &
 		    MPI2_SAS_DEVICE_INFO_MASK_DEVICE_TYPE) ==
 		    MPI2_SAS_DEVICE_INFO_END_DEVICE)) {
 			target = &sassc->targets[target_id];
 			target->supports_SSU = TRUE;
 			mps_dprint(sc, MPS_XINFO, "Target %d supports SSU\n",
 			    target_id);
 		}
 		break;
 	case MPI2_IOCSTATUS_SCSI_INVALID_DEVHANDLE:
 	case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE:
 		/*
 		 * If devinfo is 0 this will be a volume.  In that case don't
 		 * tell CAM that the volume is not there.  We want volumes to
 		 * be enumerated until they are deleted/removed, not just
 		 * failed.
 		 */
 		if (cm->cm_targ->devinfo == 0)
 			mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		else
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		break;
 	case MPI2_IOCSTATUS_INVALID_SGL:
 		mps_print_scsiio_cmd(sc, cm);
 		mpssas_set_ccbstatus(ccb, CAM_UNREC_HBA_ERROR);
 		break;
 	case MPI2_IOCSTATUS_SCSI_TASK_TERMINATED:
 		/*
 		 * This is one of the responses that comes back when an I/O
 		 * has been aborted.  If it is because of a timeout that we
 		 * initiated, just set the status to CAM_CMD_TIMEOUT.
 		 * Otherwise set it to CAM_REQ_ABORTED.  The effect on the
 		 * command is the same (it gets retried, subject to the
 		 * retry counter), the only difference is what gets printed
 		 * on the console.
 		 */
 		if (cm->cm_state == MPS_CM_STATE_TIMEDOUT)
 			mpssas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT);
 		else
 			mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED);
 		break;
 	case MPI2_IOCSTATUS_SCSI_DATA_OVERRUN:
 		/* resid is ignored for this condition */
 		csio->resid = 0;
 		mpssas_set_ccbstatus(ccb, CAM_DATA_RUN_ERR);
 		break;
 	case MPI2_IOCSTATUS_SCSI_IOC_TERMINATED:
 	case MPI2_IOCSTATUS_SCSI_EXT_TERMINATED:
 		/*
 		 * These can sometimes be transient transport-related
 		 * errors, and sometimes persistent drive-related errors.
 		 * We used to retry these without decrementing the retry
 		 * count by returning CAM_REQUEUE_REQ.  Unfortunately, if
 		 * we hit a persistent drive problem that returns one of
 		 * these error codes, we would retry indefinitely.  So,
 		 * return CAM_REQ_CMP_ERROR so that we decrement the retry
 		 * count and avoid infinite retries.  We're taking the
 		 * potential risk of flagging false failures in the event
 		 * of a topology-related error (e.g. a SAS expander problem
 		 * causes a command addressed to a drive to fail), but
 		 * avoiding getting into an infinite retry loop.
 		 */
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		mpssas_log_command(cm, MPS_INFO,
 		    "terminated ioc %x scsi %x state %x xfer %u\n",
 		    le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState,
 		    le32toh(rep->TransferCount));
 		break;
 	case MPI2_IOCSTATUS_INVALID_FUNCTION:
 	case MPI2_IOCSTATUS_INTERNAL_ERROR:
 	case MPI2_IOCSTATUS_INVALID_VPID:
 	case MPI2_IOCSTATUS_INVALID_FIELD:
 	case MPI2_IOCSTATUS_INVALID_STATE:
 	case MPI2_IOCSTATUS_OP_STATE_NOT_SUPPORTED:
 	case MPI2_IOCSTATUS_SCSI_IO_DATA_ERROR:
 	case MPI2_IOCSTATUS_SCSI_PROTOCOL_ERROR:
 	case MPI2_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
 	case MPI2_IOCSTATUS_SCSI_TASK_MGMT_FAILED:
 	default:
 		mpssas_log_command(cm, MPS_XINFO,
 		    "completed ioc %x scsi %x state %x xfer %u\n",
 		    le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState,
 		    le32toh(rep->TransferCount));
 		csio->resid = cm->cm_length;
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		break;
 	}
 	
 	mps_sc_failed_io_info(sc,csio,rep);
 
 	if (sassc->flags & MPSSAS_QUEUE_FROZEN) {
 		ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
 		sassc->flags &= ~MPSSAS_QUEUE_FROZEN;
 		mps_dprint(sc, MPS_XINFO, "Command completed, "
 		    "unfreezing SIM queue\n");
 	}
 
 	if (mpssas_get_ccbstatus(ccb) != CAM_REQ_CMP) {
 		ccb->ccb_h.status |= CAM_DEV_QFRZN;
 		xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1);
 	}
 
 	mps_free_command(sc, cm);
 	xpt_done(ccb);
 }
 
 /* All Request reached here are Endian safe */
 static void
 mpssas_direct_drive_io(struct mpssas_softc *sassc, struct mps_command *cm,
     union ccb *ccb) {
 	pMpi2SCSIIORequest_t	pIO_req;
 	struct mps_softc	*sc = sassc->sc;
 	uint64_t		virtLBA;
 	uint32_t		physLBA, stripe_offset, stripe_unit;
 	uint32_t		io_size, column;
 	uint8_t			*ptrLBA, lba_idx, physLBA_byte, *CDB;
 
 	/*
 	 * If this is a valid SCSI command (Read6, Read10, Read16, Write6,
 	 * Write10, or Write16), build a direct I/O message.  Otherwise, the I/O
 	 * will be sent to the IR volume itself.  Since Read6 and Write6 are a
 	 * bit different than the 10/16 CDBs, handle them separately.
 	 */
 	pIO_req = (pMpi2SCSIIORequest_t)cm->cm_req;
 	CDB = pIO_req->CDB.CDB32;
 
 	/*
 	 * Handle 6 byte CDBs.
 	 */
 	if ((pIO_req->DevHandle == sc->DD_dev_handle) && ((CDB[0] == READ_6) ||
 	    (CDB[0] == WRITE_6))) {
 		/*
 		 * Get the transfer size in blocks.
 		 */
 		io_size = (cm->cm_length >> sc->DD_block_exponent);
 
 		/*
 		 * Get virtual LBA given in the CDB.
 		 */
 		virtLBA = ((uint64_t)(CDB[1] & 0x1F) << 16) |
 		    ((uint64_t)CDB[2] << 8) | (uint64_t)CDB[3];
 
 		/*
 		 * Check that LBA range for I/O does not exceed volume's
 		 * MaxLBA.
 		 */
 		if ((virtLBA + (uint64_t)io_size - 1) <=
 		    sc->DD_max_lba) {
 			/*
 			 * Check if the I/O crosses a stripe boundary.  If not,
 			 * translate the virtual LBA to a physical LBA and set
 			 * the DevHandle for the PhysDisk to be used.  If it
 			 * does cross a boundary, do normal I/O.  To get the
 			 * right DevHandle to use, get the map number for the
 			 * column, then use that map number to look up the
 			 * DevHandle of the PhysDisk.
 			 */
 			stripe_offset = (uint32_t)virtLBA &
 			    (sc->DD_stripe_size - 1);
 			if ((stripe_offset + io_size) <= sc->DD_stripe_size) {
 				physLBA = (uint32_t)virtLBA >>
 				    sc->DD_stripe_exponent;
 				stripe_unit = physLBA / sc->DD_num_phys_disks;
 				column = physLBA % sc->DD_num_phys_disks;
 				pIO_req->DevHandle =
 				    htole16(sc->DD_column_map[column].dev_handle);
 				/* ???? Is this endian safe*/
 				cm->cm_desc.SCSIIO.DevHandle =
 				    pIO_req->DevHandle;
 
 				physLBA = (stripe_unit <<
 				    sc->DD_stripe_exponent) + stripe_offset;
 				ptrLBA = &pIO_req->CDB.CDB32[1];
 				physLBA_byte = (uint8_t)(physLBA >> 16);
 				*ptrLBA = physLBA_byte;
 				ptrLBA = &pIO_req->CDB.CDB32[2];
 				physLBA_byte = (uint8_t)(physLBA >> 8);
 				*ptrLBA = physLBA_byte;
 				ptrLBA = &pIO_req->CDB.CDB32[3];
 				physLBA_byte = (uint8_t)physLBA;
 				*ptrLBA = physLBA_byte;
 
 				/*
 				 * Set flag that Direct Drive I/O is
 				 * being done.
 				 */
 				cm->cm_flags |= MPS_CM_FLAGS_DD_IO;
 			}
 		}
 		return;
 	}
 
 	/*
 	 * Handle 10, 12 or 16 byte CDBs.
 	 */
 	if ((pIO_req->DevHandle == sc->DD_dev_handle) && ((CDB[0] == READ_10) ||
 	    (CDB[0] == WRITE_10) || (CDB[0] == READ_16) ||
 	    (CDB[0] == WRITE_16) || (CDB[0] == READ_12) ||
 	    (CDB[0] == WRITE_12))) {
 		/*
 		 * For 16-byte CDB's, verify that the upper 4 bytes of the CDB
 		 * are 0.  If not, this is accessing beyond 2TB so handle it in
 		 * the else section.  10-byte and 12-byte CDB's are OK.
 		 * FreeBSD sends very rare 12 byte READ/WRITE, but driver is 
 		 * ready to accept 12byte CDB for Direct IOs.
 		 */
 		if ((CDB[0] == READ_10 || CDB[0] == WRITE_10) ||
 		    (CDB[0] == READ_12 || CDB[0] == WRITE_12) ||
 		    !(CDB[2] | CDB[3] | CDB[4] | CDB[5])) {
 			/*
 			 * Get the transfer size in blocks.
 			 */
 			io_size = (cm->cm_length >> sc->DD_block_exponent);
 
 			/*
 			 * Get virtual LBA.  Point to correct lower 4 bytes of
 			 * LBA in the CDB depending on command.
 			 */
 			lba_idx = ((CDB[0] == READ_12) || 
 				(CDB[0] == WRITE_12) ||
 				(CDB[0] == READ_10) ||
 				(CDB[0] == WRITE_10))? 2 : 6;
 			virtLBA = ((uint64_t)CDB[lba_idx] << 24) |
 			    ((uint64_t)CDB[lba_idx + 1] << 16) |
 			    ((uint64_t)CDB[lba_idx + 2] << 8) |
 			    (uint64_t)CDB[lba_idx + 3];
 
 			/*
 			 * Check that LBA range for I/O does not exceed volume's
 			 * MaxLBA.
 			 */
 			if ((virtLBA + (uint64_t)io_size - 1) <=
 			    sc->DD_max_lba) {
 				/*
 				 * Check if the I/O crosses a stripe boundary.
 				 * If not, translate the virtual LBA to a
 				 * physical LBA and set the DevHandle for the
 				 * PhysDisk to be used.  If it does cross a
 				 * boundary, do normal I/O.  To get the right
 				 * DevHandle to use, get the map number for the
 				 * column, then use that map number to look up
 				 * the DevHandle of the PhysDisk.
 				 */
 				stripe_offset = (uint32_t)virtLBA &
 				    (sc->DD_stripe_size - 1);
 				if ((stripe_offset + io_size) <=
 				    sc->DD_stripe_size) {
 					physLBA = (uint32_t)virtLBA >>
 					    sc->DD_stripe_exponent;
 					stripe_unit = physLBA /
 					    sc->DD_num_phys_disks;
 					column = physLBA %
 					    sc->DD_num_phys_disks;
 					pIO_req->DevHandle =
 					    htole16(sc->DD_column_map[column].
 					    dev_handle);
 					cm->cm_desc.SCSIIO.DevHandle =
 					    pIO_req->DevHandle;
 
 					physLBA = (stripe_unit <<
 					    sc->DD_stripe_exponent) +
 					    stripe_offset;
 					ptrLBA =
 					    &pIO_req->CDB.CDB32[lba_idx];
 					physLBA_byte = (uint8_t)(physLBA >> 24);
 					*ptrLBA = physLBA_byte;
 					ptrLBA =
 					    &pIO_req->CDB.CDB32[lba_idx + 1];
 					physLBA_byte = (uint8_t)(physLBA >> 16);
 					*ptrLBA = physLBA_byte;
 					ptrLBA =
 					    &pIO_req->CDB.CDB32[lba_idx + 2];
 					physLBA_byte = (uint8_t)(physLBA >> 8);
 					*ptrLBA = physLBA_byte;
 					ptrLBA =
 					    &pIO_req->CDB.CDB32[lba_idx + 3];
 					physLBA_byte = (uint8_t)physLBA;
 					*ptrLBA = physLBA_byte;
 
 					/*
 					 * Set flag that Direct Drive I/O is
 					 * being done.
 					 */
 					cm->cm_flags |= MPS_CM_FLAGS_DD_IO;
 				}
 			}
 		} else {
 			/*
 			 * 16-byte CDB and the upper 4 bytes of the CDB are not
 			 * 0.  Get the transfer size in blocks.
 			 */
 			io_size = (cm->cm_length >> sc->DD_block_exponent);
 
 			/*
 			 * Get virtual LBA.
 			 */
 			virtLBA = ((uint64_t)CDB[2] << 54) |
 			    ((uint64_t)CDB[3] << 48) |
 			    ((uint64_t)CDB[4] << 40) |
 			    ((uint64_t)CDB[5] << 32) |
 			    ((uint64_t)CDB[6] << 24) |
 			    ((uint64_t)CDB[7] << 16) |
 			    ((uint64_t)CDB[8] << 8) |
 			    (uint64_t)CDB[9]; 
 
 			/*
 			 * Check that LBA range for I/O does not exceed volume's
 			 * MaxLBA.
 			 */
 			if ((virtLBA + (uint64_t)io_size - 1) <=
 			    sc->DD_max_lba) {
 				/*
 				 * Check if the I/O crosses a stripe boundary.
 				 * If not, translate the virtual LBA to a
 				 * physical LBA and set the DevHandle for the
 				 * PhysDisk to be used.  If it does cross a
 				 * boundary, do normal I/O.  To get the right
 				 * DevHandle to use, get the map number for the
 				 * column, then use that map number to look up
 				 * the DevHandle of the PhysDisk.
 				 */
 				stripe_offset = (uint32_t)virtLBA &
 				    (sc->DD_stripe_size - 1);
 				if ((stripe_offset + io_size) <=
 				    sc->DD_stripe_size) {
 					physLBA = (uint32_t)(virtLBA >>
 					    sc->DD_stripe_exponent);
 					stripe_unit = physLBA /
 					    sc->DD_num_phys_disks;
 					column = physLBA %
 					    sc->DD_num_phys_disks;
 					pIO_req->DevHandle =
 					    htole16(sc->DD_column_map[column].
 					    dev_handle);
 					cm->cm_desc.SCSIIO.DevHandle =
 					    pIO_req->DevHandle;
 
 					physLBA = (stripe_unit <<
 					    sc->DD_stripe_exponent) +
 					    stripe_offset;
 
 					/*
 					 * Set upper 4 bytes of LBA to 0.  We
 					 * assume that the phys disks are less
 					 * than 2 TB's in size.  Then, set the
 					 * lower 4 bytes.
 					 */
 					pIO_req->CDB.CDB32[2] = 0;
 					pIO_req->CDB.CDB32[3] = 0;
 					pIO_req->CDB.CDB32[4] = 0;
 					pIO_req->CDB.CDB32[5] = 0;
 					ptrLBA = &pIO_req->CDB.CDB32[6];
 					physLBA_byte = (uint8_t)(physLBA >> 24);
 					*ptrLBA = physLBA_byte;
 					ptrLBA = &pIO_req->CDB.CDB32[7];
 					physLBA_byte = (uint8_t)(physLBA >> 16);
 					*ptrLBA = physLBA_byte;
 					ptrLBA = &pIO_req->CDB.CDB32[8];
 					physLBA_byte = (uint8_t)(physLBA >> 8);
 					*ptrLBA = physLBA_byte;
 					ptrLBA = &pIO_req->CDB.CDB32[9];
 					physLBA_byte = (uint8_t)physLBA;
 					*ptrLBA = physLBA_byte;
 
 					/*
 					 * Set flag that Direct Drive I/O is
 					 * being done.
 					 */
 					cm->cm_flags |= MPS_CM_FLAGS_DD_IO;
 				}
 			}
 		}
 	}
 }
 
 #if __FreeBSD_version >= 900026
 static void
 mpssas_smpio_complete(struct mps_softc *sc, struct mps_command *cm)
 {
 	MPI2_SMP_PASSTHROUGH_REPLY *rpl;
 	MPI2_SMP_PASSTHROUGH_REQUEST *req;
 	uint64_t sasaddr;
 	union ccb *ccb;
 
 	ccb = cm->cm_complete_data;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and SMP
 	 * commands require two S/G elements only.  That should be handled
 	 * in the standard request size.
 	 */
 	if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_ERROR,"%s: cm_flags = %#x on SMP request!\n",
 			   __func__, cm->cm_flags);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		goto bailout;
         }
 
 	rpl = (MPI2_SMP_PASSTHROUGH_REPLY *)cm->cm_reply;
 	if (rpl == NULL) {
 		mps_dprint(sc, MPS_ERROR, "%s: NULL cm_reply!\n", __func__);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		goto bailout;
 	}
 
 	req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req;
 	sasaddr = le32toh(req->SASAddress.Low);
 	sasaddr |= ((uint64_t)(le32toh(req->SASAddress.High))) << 32;
 
 	if ((le16toh(rpl->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
 	    MPI2_IOCSTATUS_SUCCESS ||
 	    rpl->SASStatus != MPI2_SASSTATUS_SUCCESS) {
 		mps_dprint(sc, MPS_XINFO, "%s: IOCStatus %04x SASStatus %02x\n",
 		    __func__, le16toh(rpl->IOCStatus), rpl->SASStatus);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		goto bailout;
 	}
 
 	mps_dprint(sc, MPS_XINFO, "%s: SMP request to SAS address "
 		   "%#jx completed successfully\n", __func__,
 		   (uintmax_t)sasaddr);
 
 	if (ccb->smpio.smp_response[2] == SMP_FR_ACCEPTED)
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 	else
 		mpssas_set_ccbstatus(ccb, CAM_SMP_STATUS_ERROR);
 
 bailout:
 	/*
 	 * We sync in both directions because we had DMAs in the S/G list
 	 * in both directions.
 	 */
 	bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
 			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
 	mps_free_command(sc, cm);
 	xpt_done(ccb);
 }
 
 /*
  * Build an SMP passthrough request for the SAS address 'sasaddr' from the
  * XPT_SMP_IO CCB and queue it to the controller.  On any failure the CCB is
  * completed here with an error status; on success it is completed later by
  * mpssas_smpio_complete().
  */
 static void
 mpssas_send_smpcmd(struct mpssas_softc *sassc, union ccb *ccb, uint64_t sasaddr)
 {
 	MPI2_SMP_PASSTHROUGH_REQUEST *req;
 	struct mps_softc *sc = sassc->sc;
 	struct mps_command *cm;
 	uint8_t *request, *response;
 	int error;
 
 	/*
 	 * Start from the plain virtual-address interpretation of the two
 	 * buffers; the S/G case below overrides these where needed.
 	 */
 	request = ccb->smpio.smp_request;
 	response = ccb->smpio.smp_response;
 
 	switch ((ccb->ccb_h.flags & CAM_DATA_MASK)) {
 	case CAM_DATA_VADDR:
 		break;
 	case CAM_DATA_SG:
 		/*
 		 * The chip takes exactly one buffer for the request and one
 		 * for the response, so longer S/G lists are rejected.
 		 */
 		if (ccb->smpio.smp_request_sglist_cnt > 1 ||
 		    ccb->smpio.smp_response_sglist_cnt > 1) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: multiple request or response "
 				   "buffer segments not supported for SMP\n",
 				   __func__);
 			mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
 			xpt_done(ccb);
 			return;
 		}
 
 		/*
 		 * The S/G indication covers both buffers at once, so each
 		 * buffer's own segment count decides whether its pointer is
 		 * really a one-entry S/G list or a plain buffer.
 		 */
 		if (ccb->smpio.smp_request_sglist_cnt != 0) {
 			request = (uint8_t *)(uintptr_t)((bus_dma_segment_t *)
 			    ccb->smpio.smp_request)[0].ds_addr;
 		}
 		if (ccb->smpio.smp_response_sglist_cnt != 0) {
 			response = (uint8_t *)(uintptr_t)((bus_dma_segment_t *)
 			    ccb->smpio.smp_response)[0].ds_addr;
 		}
 		break;
 	case CAM_DATA_PADDR:
 	case CAM_DATA_SG_PADDR:
 		/* XXX Physical addresses are not supported here yet. */
 		mps_dprint(sc, MPS_ERROR,
 			   "%s: physical addresses not supported\n", __func__);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
 		xpt_done(ccb);
 		return;
 	default:
 		mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
 		xpt_done(ccb);
 		return;
 	}
 
 	if ((cm = mps_alloc_command(sc)) == NULL) {
 		mps_dprint(sc, MPS_ERROR,
 		    "%s: cannot allocate command\n", __func__);
 		mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
 		xpt_done(ccb);
 		return;
 	}
 
 	/* Fill in the fixed part of the passthrough request frame. */
 	req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req;
 	bzero(req, sizeof(*req));
 	req->Function = MPI2_FUNCTION_SMP_PASSTHROUGH;
 	/* 0xff lets the chip pick any route to this SAS address. */
 	req->PhysicalPort = 0xff;
 	req->RequestDataLength = htole16(ccb->smpio.smp_request_len);
 	req->SGLFlags =
 	    MPI2_SGLFLAGS_SYSTEM_ADDRESS_SPACE | MPI2_SGLFLAGS_SGL_TYPE_MPI;
 
 	mps_dprint(sc, MPS_XINFO, "%s: sending SMP request to SAS "
 	    "address %#jx\n", __func__, (uintmax_t)sasaddr);
 
 	mpi_init_sge(cm, req, &req->SGL);
 
 	/*
 	 * Describe both buffers with a two-entry uio so that
 	 * mps_map_command() can map them with a single busdma call:
 	 * entry 0 is the request, entry 1 the response.
 	 */
 	cm->cm_iovec[0].iov_base = request;
 	cm->cm_iovec[0].iov_len = le16toh(req->RequestDataLength);
 	cm->cm_iovec[1].iov_base = response;
 	cm->cm_iovec[1].iov_len = ccb->smpio.smp_response_len;
 
 	cm->cm_uio.uio_iov = cm->cm_iovec;
 	cm->cm_uio.uio_iovcnt = 2;
 	cm->cm_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * busdma does not look at the direction; UIO_WRITE is only a
 	 * placeholder since the transfer really goes both ways.
 	 */
 	cm->cm_uio.uio_rw = UIO_WRITE;
 	cm->cm_uio.uio_resid = cm->cm_iovec[0].iov_len +
 	    cm->cm_iovec[1].iov_len;
 
 	/*
 	 * One segment for the request plus one for the response is all the
 	 * chip expects; going over that makes mps_data_cb() warn the user.
 	 */
 	cm->cm_max_segs = 2;
 
 	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 	cm->cm_complete = mpssas_smpio_complete;
 	cm->cm_complete_data = ccb;
 
 	/*
 	 * Flag the command as a uio-based SMP passthrough so the mapping
 	 * code (mps_data_cb()) applies its bidirectional special case.
 	 */
 	cm->cm_flags |= MPS_CM_FLAGS_USE_UIO | MPS_CM_FLAGS_SMP_PASS |
 	    MPS_CM_FLAGS_DATAIN | MPS_CM_FLAGS_DATAOUT;
 
 	/* The chip data format is little endian. */
 	req->SASAddress.High = htole32(sasaddr >> 32);
 	req->SASAddress.Low = htole32(sasaddr);
 
 	/*
 	 * XXX There is no timeout/abort mechanism for this request.  From
 	 * the manual, it looks like task management requests only work for
 	 * SCSI IO and SATA passthrough requests.  A way to retry after a
 	 * chip reset may be needed; hopefully the chip reports any lesser
 	 * error back to the driver.
 	 */
 	error = mps_map_command(sc, cm);
 	if (error == 0 || error == EINPROGRESS)
 		return;
 
 	/* Mapping failed outright: release the command and fail the CCB. */
 	mps_dprint(sc, MPS_ERROR,
 		   "%s: error %d returned from mps_map_command()\n",
 		   __func__, error);
 	mps_free_command(sc, cm);
 	mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
 	xpt_done(ccb);
 }
 
 /*
  * Handle an XPT_SMP_IO CCB: work out which SAS address the SMP request
  * should be sent to and pass it on to mpssas_send_smpcmd().  If no usable
  * address can be determined, the CCB is completed here with an error status.
  */
 static void
 mpssas_action_smpio(struct mpssas_softc *sassc, union ccb *ccb)
 {
 	struct mps_softc *sc;
 	struct mpssas_target *targ;
 	uint64_t sasaddr = 0;
 
 	sc = sassc->sc;
 
 	/*
 	 * Make sure the target exists.
 	 */
 	KASSERT(ccb->ccb_h.target_id < sassc->maxtargets,
 	    ("Target %d out of bounds in XPT_SMP_IO\n", ccb->ccb_h.target_id));
 	targ = &sassc->targets[ccb->ccb_h.target_id];
 	if (targ->handle == 0x0) {
 		mps_dprint(sc, MPS_ERROR,
 			   "%s: target %d does not exist!\n", __func__,
 			   ccb->ccb_h.target_id);
 		mpssas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT);
 		xpt_done(ccb);
 		return;
 	}
 
 	/*
 	 * If this device has an embedded SMP target, we'll talk to it
 	 * directly.  Otherwise, we need to figure out what the expander's
 	 * address is.
 	 */
 	if ((targ->devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) != 0)
 		sasaddr = targ->sasaddr;
 
 	/*
 	 * If we don't have a SAS address for the expander yet, try
 	 * grabbing it from the page 0x83 information cached in the
 	 * transport layer for this target.  LSI expanders report the
 	 * expander SAS address as the port-associated SAS address in
 	 * Inquiry VPD page 0x83.  Maxim expanders don't report it in page
 	 * 0x83.
 	 *
 	 * XXX KDM disable this for now, but leave it commented out so that
 	 * it is obvious that this is another possible way to get the SAS
 	 * address.
 	 *
 	 * The parent handle method below is a little more reliable, and
 	 * the other benefit is that it works for devices other than SES
 	 * devices.  So you can send a SMP request to a da(4) device and it
 	 * will get routed to the expander that device is attached to.
 	 * (Assuming the da(4) device doesn't contain an SMP target...)
 	 */
 #if 0
 	if (sasaddr == 0)
 		sasaddr = xpt_path_sas_addr(ccb->ccb_h.path);
 #endif
 
 	/*
 	 * If we still don't have a SAS address for the expander, look for
 	 * the parent device of this device, which is probably the expander.
 	 */
 	if (sasaddr == 0) {
 #ifdef OLD_MPS_PROBE
 		struct mpssas_target *parent_target;
 #endif
 
 		if (targ->parent_handle == 0x0) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: handle %d does not have a valid "
 				   "parent handle!\n", __func__, targ->handle);
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			goto bailout;
 		}
 #ifdef OLD_MPS_PROBE
 		/*
 		 * Old probe code: look the parent up in the target array
 		 * and take the SAS address from the parent's own entry.
 		 */
 		parent_target = mpssas_find_target_by_handle(sassc, 0,
 			targ->parent_handle);
 
 		if (parent_target == NULL) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: handle %d does not have a valid "
 				   "parent target!\n", __func__, targ->handle);
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			goto bailout;
 		}
 
 		if ((parent_target->devinfo &
 		     MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: handle %d parent %d does not "
 				   "have an SMP target!\n", __func__,
 				   targ->handle, parent_target->handle);
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			goto bailout;
 
 		}
 
 		sasaddr = parent_target->sasaddr;
 #else /* OLD_MPS_PROBE */
 		/*
 		 * Use the parent device info and parent SAS address that
 		 * are stored in this target's own entry.
 		 */
 		if ((targ->parent_devinfo &
 		     MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: handle %d parent %d does not "
 				   "have an SMP target!\n", __func__,
 				   targ->handle, targ->parent_handle);
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			goto bailout;
 
 		}
 		if (targ->parent_sasaddr == 0x0) {
 			mps_dprint(sc, MPS_ERROR,
 				   "%s: handle %d parent handle %d does "
 				   "not have a valid SAS address!\n",
 				   __func__, targ->handle, targ->parent_handle);
 			mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 			goto bailout;
 		}
 
 		sasaddr = targ->parent_sasaddr;
 #endif /* OLD_MPS_PROBE */
 
 	}
 
 	/* Last-chance check: without an address there is nowhere to send. */
 	if (sasaddr == 0) {
 		mps_dprint(sc, MPS_INFO,
 			   "%s: unable to find SAS address for handle %d\n",
 			   __func__, targ->handle);
 		mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE);
 		goto bailout;
 	}
 	mpssas_send_smpcmd(sassc, ccb, sasaddr);
 
 	return;
 
 bailout:
 	/* The error status has already been set on the CCB by the caller of goto. */
 	xpt_done(ccb);
 
 }
 #endif //__FreeBSD_version >= 900026
 
 static void
 mpssas_action_resetdev(struct mpssas_softc *sassc, union ccb *ccb)
 {
 	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 	struct mps_softc *sc;
 	struct mps_command *tm;
 	struct mpssas_target *targ;
 
 	MPS_FUNCTRACE(sassc->sc);
 	mtx_assert(&sassc->sc->mps_mtx, MA_OWNED);
 
 	KASSERT(ccb->ccb_h.target_id < sassc->maxtargets,
 	    ("Target %d out of bounds in XPT_RESET_DEV\n",
 	     ccb->ccb_h.target_id));
 	sc = sassc->sc;
 	tm = mps_alloc_command(sc);
 	if (tm == NULL) {
 		mps_dprint(sc, MPS_ERROR,
 		    "command alloc failure in mpssas_action_resetdev\n");
 		mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
 		xpt_done(ccb);
 		return;
 	}
 
 	targ = &sassc->targets[ccb->ccb_h.target_id];
 	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 	req->DevHandle = htole16(targ->handle);
 	req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
 	req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET;
 
 	/* SAS Hard Link Reset / SATA Link Reset */
 	req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET;
 
 	tm->cm_data = NULL;
 	tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY;
 	tm->cm_complete = mpssas_resetdev_complete;
 	tm->cm_complete_data = ccb;
 	tm->cm_targ = targ;
 	targ->flags |= MPSSAS_TARGET_INRESET;
 
 	mps_map_command(sc, tm);
 }
 
 static void
 mpssas_resetdev_complete(struct mps_softc *sc, struct mps_command *tm)
 {
 	MPI2_SCSI_TASK_MANAGE_REPLY *resp;
 	union ccb *ccb;
 
 	MPS_FUNCTRACE(sc);
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	resp = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
 	ccb = tm->cm_complete_data;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and
 	 * task management commands don't have S/G lists.
 	 */
 	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		MPI2_SCSI_TASK_MANAGE_REQUEST *req;
 
 		req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
 
 		mps_dprint(sc, MPS_ERROR,
 			   "%s: cm_flags = %#x for reset of handle %#04x! "
 			   "This should not happen!\n", __func__, tm->cm_flags,
 			   req->DevHandle);
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		goto bailout;
 	}
 
 	mps_dprint(sc, MPS_XINFO,
 	    "%s: IOCStatus = 0x%x ResponseCode = 0x%x\n", __func__,
 	    le16toh(resp->IOCStatus), le32toh(resp->ResponseCode));
 
 	if (le32toh(resp->ResponseCode) == MPI2_SCSITASKMGMT_RSP_TM_COMPLETE) {
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
 		mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid,
 		    CAM_LUN_WILDCARD);
 	}
 	else
 		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 
 bailout:
 
 	mpssas_free_tm(sc, tm);
 	xpt_done(ccb);
 }
 
 static void
 mpssas_poll(struct cam_sim *sim)
 {
 	struct mpssas_softc *sassc;
 
 	sassc = cam_sim_softc(sim);
 
 	if (sassc->sc->mps_debug & MPS_TRACE) {
 		/* frequent debug messages during a panic just slow
 		 * everything down too much.
 		 */
 		mps_printf(sassc->sc, "%s clearing MPS_TRACE\n", __func__);
 		sassc->sc->mps_debug &= ~MPS_TRACE;
 	}
 
 	mps_intr_locked(sassc->sc);
 }
 
 /*
  * CAM async event callback.
  *
  * callback_arg is the driver softc.  Depending on the FreeBSD version this
  * file is built for, either AC_ADVINFO_CHANGED or AC_FOUND_DEVICE is used to
  * learn whether a LUN is formatted for EEDP (protection information); the
  * result is cached in the target's LUN list.  All other event codes are
  * ignored.
  */
 static void
 mpssas_async(void *callback_arg, uint32_t code, struct cam_path *path,
 	     void *arg)
 {
 	struct mps_softc *sc;
 
 	sc = (struct mps_softc *)callback_arg;
 
 	switch (code) {
 #if (__FreeBSD_version >= 1000006) || \
     ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000))
 	case AC_ADVINFO_CHANGED: {
 		struct mpssas_target *target;
 		struct mpssas_softc *sassc;
 		struct scsi_read_capacity_data_long rcap_buf;
 		struct ccb_dev_advinfo cdai;
 		struct mpssas_lun *lun;
 		lun_id_t lunid;
 		int found_lun;
 		uintptr_t buftype;
 
 		/* For this event, arg carries the advinfo buffer type. */
 		buftype = (uintptr_t)arg;
 
 		found_lun = 0;
 		sassc = sc->sassc;
 
 		/*
 		 * We're only interested in read capacity data changes.
 		 */
 		if (buftype != CDAI_TYPE_RCAPLONG)
 			break;
 
 		/*
 		 * We should have a handle for this, but check to make sure.
 		 */
 		KASSERT(xpt_path_target_id(path) < sassc->maxtargets,
 		    ("Target %d out of bounds in mpssas_async\n",
 		    xpt_path_target_id(path)));
 		target = &sassc->targets[xpt_path_target_id(path)];
 		if (target->handle == 0)
 			break;
 
 		lunid = xpt_path_lun_id(path);
 
 		/* Reuse the LUN entry if this LUN is already in the list. */
 		SLIST_FOREACH(lun, &target->luns, lun_link) {
 			if (lun->lun_id == lunid) {
 				found_lun = 1;
 				break;
 			}
 		}
 
 		/* Otherwise allocate a zeroed entry and link it in. */
 		if (found_lun == 0) {
 			lun = malloc(sizeof(struct mpssas_lun), M_MPT2,
 				     M_NOWAIT | M_ZERO);
 			if (lun == NULL) {
 				mps_dprint(sc, MPS_ERROR, "Unable to alloc "
 					   "LUN for EEDP support.\n");
 				break;
 			}
 			lun->lun_id = lunid;
 			SLIST_INSERT_HEAD(&target->luns, lun, lun_link);
 		}
 
 		/*
 		 * Fetch the long read capacity data for this path through
 		 * an XPT_DEV_ADVINFO CCB built on the stack.
 		 */
 		bzero(&rcap_buf, sizeof(rcap_buf));
 		xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL);
 		cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
 		cdai.ccb_h.flags = CAM_DIR_IN;
 		cdai.buftype = CDAI_TYPE_RCAPLONG;
 #if (__FreeBSD_version >= 1100061) || \
     ((__FreeBSD_version >= 1001510) && (__FreeBSD_version < 1100000))
 		cdai.flags = CDAI_FLAG_NONE;
 #else
 		cdai.flags = 0;
 #endif
 		cdai.bufsiz = sizeof(rcap_buf);
 		cdai.buf = (uint8_t *)&rcap_buf;
 		xpt_action((union ccb *)&cdai);
 		/* Release the device queue if the request left it frozen. */
 		if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(cdai.ccb_h.path,
 					 0, 0, 0, FALSE);
 
 		/*
 		 * Record EEDP state: enabled only if the request succeeded
 		 * and the protection-enable bit is set in the returned data.
 		 */
 		if ((mpssas_get_ccbstatus((union ccb *)&cdai) == CAM_REQ_CMP)
 		 && (rcap_buf.prot & SRC16_PROT_EN)) {
 			lun->eedp_formatted = TRUE;
 			lun->eedp_block_size = scsi_4btoul(rcap_buf.length);
 		} else {
 			lun->eedp_formatted = FALSE;
 			lun->eedp_block_size = 0;
 		}
 		break;
 	}
 #else
 	case AC_FOUND_DEVICE: {
 		struct ccb_getdev *cgd;
 
 		/* For this event, arg is used as the getdev CCB. */
 		cgd = arg;
 		mpssas_check_eedp(sc, path, cgd);
 		break;
 	}
 #endif
 	default:
 		break;
 	}
 }
 
 #if (__FreeBSD_version < 901503) || \
     ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006))
 static void
 mpssas_check_eedp(struct mps_softc *sc, struct cam_path *path,
 		  struct ccb_getdev *cgd)
 {
 	struct mpssas_softc *sassc = sc->sassc;
 	struct ccb_scsiio *csio;
 	struct scsi_read_capacity_16 *scsi_cmd;
 	struct scsi_read_capacity_eedp *rcap_buf;
 	path_id_t pathid;
 	target_id_t targetid;
 	lun_id_t lunid;
 	union ccb *ccb;
 	struct cam_path *local_path;
 	struct mpssas_target *target;
 	struct mpssas_lun *lun;
 	uint8_t	found_lun;
 	char path_str[64];
 
 	sassc = sc->sassc;
 	pathid = cam_sim_path(sassc->sim);
 	targetid = xpt_path_target_id(path);
 	lunid = xpt_path_lun_id(path);
 
 	KASSERT(targetid < sassc->maxtargets,
 	    ("Target %d out of bounds in mpssas_check_eedp\n",
 	     targetid));
 	target = &sassc->targets[targetid];
 	if (target->handle == 0x0)
 		return;
 
 	/*
 	 * Determine if the device is EEDP capable.
 	 *
 	 * If this flag is set in the inquiry data, 
 	 * the device supports protection information,
 	 * and must support the 16 byte read
 	 * capacity command, otherwise continue without
 	 * sending read cap 16
 	 */
 	if ((cgd->inq_data.spc3_flags & SPC3_SID_PROTECT) == 0)
 		return;
 
 	/*
 	 * Issue a READ CAPACITY 16 command.  This info
 	 * is used to determine if the LUN is formatted
 	 * for EEDP support.
 	 */
 	ccb = xpt_alloc_ccb_nowait();
 	if (ccb == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Unable to alloc CCB "
 		    "for EEDP support.\n");
 		return;
 	}
 
 	if (xpt_create_path(&local_path, xpt_periph,
 	    pathid, targetid, lunid) != CAM_REQ_CMP) {
 		mps_dprint(sc, MPS_ERROR, "Unable to create "
 		    "path for EEDP support\n");
 		xpt_free_ccb(ccb);
 		return;
 	}
 
 	/*
 	 * If LUN is already in list, don't create a new
 	 * one.
 	 */
 	found_lun = FALSE;
 	SLIST_FOREACH(lun, &target->luns, lun_link) {
 		if (lun->lun_id == lunid) {
 			found_lun = TRUE;
 			break;
 		}
 	}
 	if (!found_lun) {
 		lun = malloc(sizeof(struct mpssas_lun), M_MPT2,
 		    M_NOWAIT | M_ZERO);
 		if (lun == NULL) {
 			mps_dprint(sc, MPS_ERROR,
 			    "Unable to alloc LUN for EEDP support.\n");
 			xpt_free_path(local_path);
 			xpt_free_ccb(ccb);
 			return;
 		}
 		lun->lun_id = lunid;
 		SLIST_INSERT_HEAD(&target->luns, lun,
 		    lun_link);
 	}
 
 	xpt_path_string(local_path, path_str, sizeof(path_str));
 
 	mps_dprint(sc, MPS_INFO, "Sending read cap: path %s handle %d\n",
 	    path_str, target->handle);
 
 	/*
 	 * Issue a READ CAPACITY 16 command for the LUN.
 	 * The mpssas_read_cap_done function will load
 	 * the read cap info into the LUN struct.
 	 */
 	rcap_buf = malloc(sizeof(struct scsi_read_capacity_eedp),
 	    M_MPT2, M_NOWAIT | M_ZERO);
 	if (rcap_buf == NULL) {
 		mps_dprint(sc, MPS_FAULT,
 		    "Unable to alloc read capacity buffer for EEDP support.\n");
 		xpt_free_path(ccb->ccb_h.path);
 		xpt_free_ccb(ccb);
 		return;
 	}
 	xpt_setup_ccb(&ccb->ccb_h, local_path, CAM_PRIORITY_XPT);
 	csio = &ccb->csio;
 	csio->ccb_h.func_code = XPT_SCSI_IO;
 	csio->ccb_h.flags = CAM_DIR_IN;
 	csio->ccb_h.retry_count = 4;	
 	csio->ccb_h.cbfcnp = mpssas_read_cap_done;
 	csio->ccb_h.timeout = 60000;
 	csio->data_ptr = (uint8_t *)rcap_buf;
 	csio->dxfer_len = sizeof(struct scsi_read_capacity_eedp);
 	csio->sense_len = MPS_SENSE_LEN;
 	csio->cdb_len = sizeof(*scsi_cmd);
 	csio->tag_action = MSG_SIMPLE_Q_TAG;
 
 	scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes;
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = 0x9E;
 	scsi_cmd->service_action = SRC16_SERVICE_ACTION;
 	((uint8_t *)scsi_cmd)[13] = sizeof(struct scsi_read_capacity_eedp);
 
 	ccb->ccb_h.ppriv_ptr1 = sassc;
 	xpt_action(ccb);
 }
 
 /*
  * Completion routine for the READ CAPACITY(16) sent by mpssas_check_eedp().
  * Records the EEDP state of the matching LUN, then frees the data buffer,
  * the path and the CCB.
  */
 static void
 mpssas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct scsi_read_capacity_eedp *rcap_buf;
 	struct mpssas_softc *sassc;
 	struct mpssas_target *target;
 	struct mpssas_lun *lun;
 
 	if (done_ccb == NULL)
 		return;
 
 	/*
 	 * This SCSI command was generated by the driver itself, so it never
 	 * goes back through cam_periph and the driver has to release the
 	 * devq on its own.  This is currently the only internally generated
 	 * SCSI command; any future ones need the same treatment.
 	 */
 	if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 		done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		xpt_release_devq(done_ccb->ccb_h.path,
 		    /*count*/ 1, /*run_queue*/TRUE);
 	}
 
 	rcap_buf = (struct scsi_read_capacity_eedp *)done_ccb->csio.data_ptr;
 	sassc = (struct mpssas_softc *)done_ccb->ccb_h.ppriv_ptr1;
 
 	KASSERT(done_ccb->ccb_h.target_id < sassc->maxtargets,
 	    ("Target %d out of bounds in mpssas_read_cap_done\n",
 	     done_ccb->ccb_h.target_id));
 	target = &sassc->targets[done_ccb->ccb_h.target_id];
 
 	/*
 	 * Look the LUN up in the target's LUN list; the iterator is left
 	 * NULL when the list holds no entry for it.
 	 */
 	SLIST_FOREACH(lun, &target->luns, lun_link) {
 		if (lun->lun_id == done_ccb->ccb_h.target_lun)
 			break;
 	}
 
 	if (lun != NULL) {
 		if ((mpssas_get_ccbstatus(done_ccb) != CAM_REQ_CMP) ||
 		    (done_ccb->csio.scsi_status != SCSI_STATUS_OK)) {
 			/*
 			 * The READ CAP 16 failed (common when the command
 			 * is not supported): mark the LUN as not EEDP
 			 * formatted, with a block size of 0.
 			 */
 			lun->eedp_formatted = FALSE;
 			lun->eedp_block_size = 0;
 		} else if (rcap_buf->protect & 0x01) {
 			mps_dprint(sassc->sc, MPS_INFO, "LUN %d for "
 			    "target ID %d is formatted for EEDP "
 			    "support.\n", done_ccb->ccb_h.target_lun,
 			    done_ccb->ccb_h.target_id);
 			lun->eedp_formatted = TRUE;
 			lun->eedp_block_size = scsi_4btoul(rcap_buf->length);
 		}
 	}
 
 	/* Finished with the buffer, the path and the CCB. */
 	free(rcap_buf, M_MPT2);
 	xpt_free_path(done_ccb->ccb_h.path);
 	xpt_free_ccb(done_ccb);
 }
 #endif /* (__FreeBSD_version < 901503) || \
           ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) */
 
 /*
  * Prepare a task management command for a reset of 'target'.
  *
  * Sets the INRESET flag for the target so that no I/O will be sent to it
  * until the reset has completed.  If an I/O request does happen, the devq
  * will be frozen.  A CCB is attached to the command because it holds the
  * path used to release the devq; the devq is released and the CCB is freed
  * when the TM completes.  If the CCB or its path cannot be allocated,
  * nothing is changed.
  */
 void
 mpssas_prepare_for_tm(struct mps_softc *sc, struct mps_command *tm,
     struct mpssas_target *target, lun_id_t lun_id)
 {
 	union ccb *ccb;
 	path_id_t path_id;
 
 	ccb = xpt_alloc_ccb_nowait();
 	if (ccb == NULL)
 		return;
 
 	path_id = cam_sim_path(sc->sassc->sim);
 	if (xpt_create_path(&ccb->ccb_h.path, xpt_periph, path_id,
 	    target->tid, lun_id) != CAM_REQ_CMP) {
 		/* No path, so the CCB is of no use; give it back. */
 		xpt_free_ccb(ccb);
 		return;
 	}
 
 	tm->cm_ccb = ccb;
 	tm->cm_targ = target;
 	target->flags |= MPSSAS_TARGET_INRESET;
 }
 
 /*
  * Kick off port enable.  Always returns 0; the result of sending the port
  * enable request is not reported to the caller.
  */
 int
 mpssas_startup(struct mps_softc *sc)
 {
 	/*
 	 * Raise wait_for_port_enable before the port enable message goes
 	 * out.  The flag helps to keep the simq frozen until all discovery
 	 * events are processed.
 	 */
 	sc->wait_for_port_enable = 1;
 	(void)mpssas_send_portenable(sc);
 
 	return (0);
 }
 
 /*
  * Allocate a command, fill it in as a port enable request and queue it.
  * Returns EBUSY when no command is available and 0 otherwise; completion is
  * handled by mpssas_portenable_complete().
  */
 static int
 mpssas_send_portenable(struct mps_softc *sc)
 {
 	struct mps_command *cm;
 	MPI2_PORT_ENABLE_REQUEST *request;
 
 	MPS_FUNCTRACE(sc);
 
 	cm = mps_alloc_command(sc);
 	if (cm == NULL)
 		return (EBUSY);
 
 	/* Request frame: just the function code, no flags, VP_ID 0. */
 	request = (MPI2_PORT_ENABLE_REQUEST *)cm->cm_req;
 	request->Function = MPI2_FUNCTION_PORT_ENABLE;
 	request->MsgFlags = 0;
 	request->VP_ID = 0;
 
 	/* Port enable carries no data, hence no buffer and no SGE. */
 	cm->cm_data = NULL;
 	cm->cm_sge = NULL;
 	cm->cm_complete = mpssas_portenable_complete;
 	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 
 	mps_map_command(sc, cm);
 	mps_dprint(sc, MPS_XINFO,
 	    "mps_send_portenable finished cm %p req %p complete %p\n",
 	    cm, cm->cm_req, cm->cm_complete);
 
 	return (0);
 }
 
 /*
  * Completion handler for the port enable request.
  *
  * Logs a failed or missing reply, frees the command, tears down the config
  * intrhook if one is still registered, fetches the WarpDrive configuration
  * and finally marks port enable as complete and wakes up any waiter.
  */
 static void
 mpssas_portenable_complete(struct mps_softc *sc, struct mps_command *cm)
 {
 	MPI2_PORT_ENABLE_REPLY *reply;
 	struct mpssas_softc *sassc;
 
 	MPS_FUNCTRACE(sc);
 	sassc = sc->sassc;
 
 	/*
 	 * Currently there should be no way we can hit this case.  It only
 	 * happens when we have a failure to allocate chain frames, and
 	 * port enable commands don't have S/G lists.
 	 */
 	if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
 		mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for port enable! "
 			   "This should not happen!\n", __func__, cm->cm_flags);
 	}
 
 	/*
 	 * IOCStatus is little endian in the reply frame: convert it to host
 	 * order first and only then apply the (host order) status mask.
 	 */
 	reply = (MPI2_PORT_ENABLE_REPLY *)cm->cm_reply;
 	if (reply == NULL)
 		mps_dprint(sc, MPS_FAULT, "Portenable NULL reply\n");
 	else if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
 	    MPI2_IOCSTATUS_SUCCESS)
 		mps_dprint(sc, MPS_FAULT, "Portenable failed\n");
 
 	mps_free_command(sc, cm);
 	if (sc->mps_ich.ich_arg != NULL) {
 		mps_dprint(sc, MPS_XINFO, "disestablish config intrhook\n");
 		config_intrhook_disestablish(&sc->mps_ich);
 		/* Clearing ich_arg keeps the hook from being removed twice. */
 		sc->mps_ich.ich_arg = NULL;
 	}
 
 	/*
 	 * Get WarpDrive info after discovery is complete but before the scan
 	 * starts.  At this point, all devices are ready to be exposed to the
 	 * OS.  If devices should be hidden instead, take them out of the
 	 * 'targets' array before the scan.  The devinfo for a disk will have
 	 * some info and a volume's will be 0.  Use that to remove disks.
 	 */
 	mps_wd_config_pages(sc);
 
 	/*
 	 * Done waiting for port enable to complete.  Decrement the refcount.
 	 * If refcount is 0, discovery is complete and a rescan of the bus can
 	 * take place.  Since the simq was explicitly frozen before port
 	 * enable, it must be explicitly released here to keep the
 	 * freeze/release count in sync.
 	 */
 	sc->wait_for_port_enable = 0;
 	sc->port_enable_complete = 1;
 	wakeup(&sc->port_enable_complete);
 	mpssas_startup_decrement(sassc);
 }
 
 /*
  * Check whether target ID 'id' appears in the comma-separated exclude_ids
  * list of the controller.
  *
  * Returns 1 if one of the list entries converts (via strtol() with base 0)
  * to 'id', and 0 otherwise.  Empty entries are skipped.
  *
  * The list is only read, never modified.  Splitting it in place with
  * strsep() would overwrite the separators with NUL bytes, so every call
  * after the first would see nothing but the first entry.
  */
 int
 mpssas_check_id(struct mpssas_softc *sassc, int id)
 {
 	struct mps_softc *sc = sassc->sc;
 	const char *entry;
 
 	entry = &sc->exclude_ids[0];
 	while (*entry != '\0') {
 		/* A separator here means an empty entry; step over it. */
 		if (*entry == ',') {
 			entry++;
 			continue;
 		}
 
 		/*
 		 * strtol() stops at the first character that is not part
 		 * of the number, so the following ',' ends the conversion.
 		 */
 		if (strtol(entry, NULL, 0) == (long)id)
 			return (1);
 
 		/* Advance to the separator (or the end of the list). */
 		while (*entry != '\0' && *entry != ',')
 			entry++;
 	}
 
 	return (0);
 }
 
 void
 mpssas_realloc_targets(struct mps_softc *sc, int maxtargets)
 {
 	struct mpssas_softc *sassc;
 	struct mpssas_lun *lun, *lun_tmp;
 	struct mpssas_target *targ;
 	int i;
 
 	sassc = sc->sassc;
 	/*
 	 * The number of targets is based on IOC Facts, so free all of
 	 * the allocated LUNs for each target and then the target buffer
 	 * itself.
 	 */
 	for (i=0; i< maxtargets; i++) {
 		targ = &sassc->targets[i];
 		SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) {
 			free(lun, M_MPT2);
 		}
 	}
 	free(sassc->targets, M_MPT2);
 
 	sassc->targets = malloc(sizeof(struct mpssas_target) * maxtargets,
 	    M_MPT2, M_WAITOK|M_ZERO);
 	if (!sassc->targets) {
 		panic("%s failed to alloc targets with error %d\n",
 		    __func__, ENOMEM);
 	}
 }
Index: head/sys/geom/geom_dev.c
===================================================================
--- head/sys/geom/geom_dev.c	(revision 308154)
+++ head/sys/geom/geom_dev.c	(revision 308155)
@@ -1,764 +1,765 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/ctype.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/sysctl.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <machine/stdarg.h>
 
 /*
  * Per-instance state of a DEV geom.  g_dev_taste() allocates it and
  * stores it in the consumer's 'private' pointer.
  */
 struct g_dev_softc {
 	struct mtx	 sc_mtx;	/* held while the fields below change */
 	struct cdev	*sc_dev;	/* device node; NULL after g_dev_callback() */
 	struct cdev	*sc_alias;	/* physpath alias node, or NULL */
 	int		 sc_open;	/* sum of r/w/e counts opened via the cdev */
 	int		 sc_active;	/* bios sent down and not yet completed */
 };
 
 static d_open_t		g_dev_open;
 static d_close_t	g_dev_close;
 static d_strategy_t	g_dev_strategy;
 static d_ioctl_t	g_dev_ioctl;
 
 /*
  * Character device switch used for every node created by g_dev_taste().
  * g_dev_getprovider() also compares a cdev's si_devsw against this table
  * to recognize GEOM device nodes.
  */
 static struct cdevsw g_dev_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	g_dev_open,
 	.d_close =	g_dev_close,
 	.d_read =	physread,
 	.d_write =	physwrite,
 	.d_ioctl =	g_dev_ioctl,
 	.d_strategy =	g_dev_strategy,
 	.d_name =	"g_dev",
 	.d_flags =	D_DISK | D_TRACKCLOSE,
 };
 
 static g_init_t g_dev_init;
 static g_fini_t g_dev_fini;
 static g_taste_t g_dev_taste;
 static g_orphan_t g_dev_orphan;
 static g_attrchanged_t g_dev_attrchanged;
 
 /*
  * The "DEV" GEOM class: the method table that ties the handlers in this
  * file (init/fini, taste, orphan, attrchanged) into GEOM.
  */
 static struct g_class g_dev_class	= {
 	.name = "DEV",
 	.version = G_VERSION,
 	.init = g_dev_init,
 	.fini = g_dev_fini,
 	.taste = g_dev_taste,
 	.orphan = g_dev_orphan,
 	.attrchanged = g_dev_attrchanged
 };
 
 /*
  * Upper bound, in sectors, on a single delete request issued while
  * servicing the DIOCGDELETE ioctl.  Larger requests are split into chunks
  * of this size; a value of 0 turns the splitting off.
  *
  * We target 262144 (8 x 32768) sectors by default as this significantly
  * increases the throughput on commonly used SSDs with a marginal
  * increase in non-interruptible request latency.
  */
 static uint64_t g_dev_del_max_sectors = 262144;
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW, 0, "GEOM_DEV stuff");
 SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW,
     &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single "
     "delete request sent to the provider. Larger requests are chunked "
     "so they can be interrupted. (0 = disable chunking)");
 
 static char *dumpdev = NULL;
 /*
  * Class init method: fetch the "dumpdev" kernel environment variable so
  * that init_dumpdev() can compare it against device names later on.
  */
 static void
 g_dev_init(struct g_class *mp)
 {
 
 	dumpdev = kern_getenv("dumpdev");
 }
 
 /*
  * Class fini method: release the saved "dumpdev" string and clear the
  * pointer.
  */
 static void
 g_dev_fini(struct g_class *mp)
 {
 
 	freeenv(dumpdev);
 	dumpdev = NULL;
 }
 
 static int
 g_dev_setdumpdev(struct cdev *dev, struct thread *td)
 {
 	struct g_kerneldump kd;
 	struct g_consumer *cp;
 	int error, len;
 
 	if (dev == NULL)
 		return (set_dumper(NULL, NULL, td));
 
 	cp = dev->si_drv2;
 	len = sizeof(kd);
 	kd.offset = 0;
 	kd.length = OFF_MAX;
 	error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd);
 	if (error == 0) {
 		error = set_dumper(&kd.di, devtoname(dev), td);
 		if (error == 0)
 			dev->si_flags |= SI_DUMPDEV;
 	}
 	return (error);
 }
 
 /*
  * If the "dumpdev" environment setting names 'dev', either directly or
  * with a leading "/dev/", configure 'dev' as the dump device.
  *
  * The consumer is opened for reading around the call to
  * g_dev_setdumpdev().  Once the dump device has been set successfully the
  * saved setting is released so that no later device is matched.
  *
  * Returns 0 when there is nothing to do or on success, otherwise the
  * error from g_access() or g_dev_setdumpdev().
  */
 static int
 init_dumpdev(struct cdev *dev)
 {
 	const char *prefix = "/dev/";
 	const char *name;
 	struct g_consumer *cp;
 	size_t prefixlen;
 	int error, matched;
 
 	if (dumpdev == NULL)
 		return (0);
 
 	name = devtoname(dev);
 	prefixlen = strlen(prefix);
 	matched = (strcmp(name, dumpdev) == 0);
 	if (!matched && strncmp(dumpdev, prefix, prefixlen) == 0)
 		matched = (strcmp(name, dumpdev + prefixlen) == 0);
 	if (!matched)
 		return (0);
 
 	cp = (struct g_consumer *)dev->si_drv2;
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 
 	error = g_dev_setdumpdev(dev, curthread);
 	if (error == 0) {
 		freeenv(dumpdev);
 		dumpdev = NULL;
 	}
 	(void)g_access(cp, -1, 0, 0);
 
 	return (error);
 }
 
 /*
  * GEOM event handler that tears down one DEV instance.  It is posted
  * through g_post_event() by g_dev_done() or g_dev_callback(), with the
  * consumer as 'arg'.
  *
  * Sends a GEOM/DEV/DESTROY devctl notification, drops any access counts
  * the consumer still holds, then destroys the consumer, the geom and the
  * softc.
  */
 static void
 g_dev_destroy(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_dev_softc *sc;
 	char buf[SPECNAMELEN + 6];	/* "cdev=" + name + NUL */
 
 	g_topology_assert();
 	cp = arg;
 	gp = cp->geom;
 	sc = cp->private;
 	g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name);
 	snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
 	devctl_notify_f("GEOM", "DEV", "DESTROY", buf, M_WAITOK);
 	/* Give back whatever access the consumer still has open. */
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	mtx_destroy(&sc->sc_mtx);
 	g_free(sc);
 }
 
 /*
  * Print the names of all geoms of the DEV class on one line, separated
  * by single spaces and terminated by a newline.
  */
 void
 g_dev_print(void)
 {
 	struct g_geom *gp;
 	int first;
 
 	first = 1;
 	LIST_FOREACH(gp, &g_dev_class.geom, geom) {
 		printf("%s%s", first ? "" : " ", gp->name);
 		first = 0;
 	}
 	printf("\n");
 }
 
 static void
 g_dev_set_physpath(struct g_consumer *cp)
 {
 	struct g_dev_softc *sc;
 	char *physpath;
 	int error, physpath_len;
 
 	if (g_access(cp, 1, 0, 0) != 0)
 		return;
 
 	sc = cp->private;
 	physpath_len = MAXPATHLEN;
 	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
 	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
 	g_access(cp, -1, 0, 0);
 	if (error == 0 && strlen(physpath) != 0) {
 		struct cdev *dev, *old_alias_dev;
 		struct cdev **alias_devp;
 
 		dev = sc->sc_dev;
 		old_alias_dev = sc->sc_alias;
 		alias_devp = (struct cdev **)&sc->sc_alias;
 		make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev,
 		    old_alias_dev, physpath);
 	} else if (sc->sc_alias) {
 		destroy_dev((struct cdev *)sc->sc_alias);
 		sc->sc_alias = NULL;
 	}
 	g_free(physpath);
 }
 
 /*
  * Announce a media change through devctl.  A DEVFS/CDEV and a GEOM/DEV
  * "MEDIACHANGE" notification is sent for the device node and, when an
  * alias node exists, for the alias as well.
  */
 static void
 g_dev_set_media(struct g_consumer *cp)
 {
 	struct g_dev_softc *sc;
 	struct cdev *node;
 	char msg[SPECNAMELEN + 6];
 	int pass;
 
 	sc = cp->private;
 	/* Pass 0 handles the device node, pass 1 the optional alias. */
 	for (pass = 0; pass < 2; pass++) {
 		node = (pass == 0) ? sc->sc_dev : sc->sc_alias;
 		if (pass == 1 && node == NULL)
 			break;
 		snprintf(msg, sizeof(msg), "cdev=%s", node->si_name);
 		devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", msg, M_WAITOK);
 		devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", msg, M_WAITOK);
 	}
 }
 
 /*
  * Attribute-change method.  Only "GEOM::media" and "GEOM::physpath" are
  * acted upon; any other attribute name is ignored.
  */
 static void
 g_dev_attrchanged(struct g_consumer *cp, const char *attr)
 {
 
 	if (strcmp(attr, "GEOM::media") == 0)
 		g_dev_set_media(cp);
 	else if (strcmp(attr, "GEOM::physpath") == 0)
 		g_dev_set_physpath(cp);
 }
 
 /*
  * Map a cdev to the GEOM provider behind it.  Returns NULL when 'dev' is
  * NULL or does not use g_dev_cdevsw.  The topology lock must be held.
  */
 struct g_provider *
 g_dev_getprovider(struct cdev *dev)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	if (dev == NULL || dev->si_devsw != &g_dev_cdevsw)
 		return (NULL);
 	cp = dev->si_drv2;
 	return (cp->provider);
 }
 
 /*
  * Taste method: create a DEV geom on top of provider 'pp' and a device
  * node named after it.
  *
  * Returns the new geom, or NULL when make_dev_p() fails; in that case the
  * consumer, geom and softc created so far are destroyed again.
  */
 static struct g_geom *
 g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error;
 	struct cdev *dev;
 	char buf[SPECNAMELEN + 6];	/* "cdev=" + name + NUL */
 
 	g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name);
 	g_topology_assert();
 	gp = g_new_geomf(mp, "%s", pp->name);
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF);
 	cp = g_new_consumer(gp);
 	cp->private = sc;
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	KASSERT(error == 0,
 	    ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error));
 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev,
 	    &g_dev_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0640, "%s", gp->name);
 	if (error != 0) {
 		/* No device node: undo everything in reverse order. */
 		printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n",
 		    __func__, gp->name, error);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_destroy_geom(gp);
 		mtx_destroy(&sc->sc_mtx);
 		g_free(sc);
 		return (NULL);
 	}
 	dev->si_flags |= SI_UNMAPPED;
 	sc->sc_dev = dev;
 
 	dev->si_iosize_max = MAXPHYS;
 	/* g_dev_open() returns ENXIO for as long as si_drv2 is NULL. */
 	dev->si_drv2 = cp;
 	/* A dump device setup failure is reported but is not fatal. */
 	error = init_dumpdev(dev);
 	if (error != 0)
 		printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n",
 		    __func__, gp->name, error);
 
 	/* Create the physical-path alias, if the provider reports one. */
 	g_dev_attrchanged(cp, "GEOM::physpath");
 	snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
 	devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK);
 
 	return (gp);
 }
 
 /*
  * d_open handler.  Converts the open flags into read/write(/exclusive)
  * access deltas, requests them from the provider via g_access() and, on
  * success, adds them to sc_open.
  *
  * Returns 0 on success, ENXIO when the consumer is not attached to the
  * cdev yet, EINVAL when no access bit was requested, or the error from
  * securelevel_ge() or g_access().
  */
 static int
 g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error, r, w, e;
 
 	cp = dev->si_drv2;
 	if (cp == NULL)
 		return (ENXIO);		/* g_dev_taste() not done yet */
 	g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)",
 	    cp->geom->name, flags, fmt, td);
 
 	r = flags & FREAD ? 1 : 0;
 	w = flags & FWRITE ? 1 : 0;
 #ifdef notyet
 	e = flags & O_EXCL ? 1 : 0;
 #else
 	e = 0;
 #endif
 
 	/*
 	 * This happens on attempt to open a device node with O_EXEC.
 	 */
 	if (r + w + e == 0)
 		return (EINVAL);
 
 	if (w) {
 		/*
 		 * When running in very secure mode, do not allow
 		 * opens for writing of any disks.
 		 */
 		error = securelevel_ge(td->td_ucred, 2);
 		if (error)
 			return (error);
 	}
 	g_topology_lock();
 	error = g_access(cp, r, w, e);
 	g_topology_unlock();
 	if (error == 0) {
 		sc = cp->private;
 		mtx_lock(&sc->sc_mtx);
 		/*
 		 * g_dev_close() sleeps on sc_active while sc_open is zero
 		 * and I/O is still in flight.  The device is being reopened,
 		 * so let such a sleeper re-evaluate its condition.
 		 */
 		if (sc->sc_open == 0 && sc->sc_active != 0)
 			wakeup(&sc->sc_active);
 		sc->sc_open += r + w + e;
 		mtx_unlock(&sc->sc_mtx);
 	}
 	return (error);
 }
 
 /*
  * d_close handler.  Converts the flags into negative access deltas,
  * subtracts them from sc_open and, when that reaches zero, waits for all
  * outstanding bios to complete before giving the access counts back to
  * the provider via g_access().
  *
  * Returns ENXIO when no consumer is attached to the cdev, EINVAL when the
  * flags carry no access bit, otherwise the result of g_access().
  */
 static int
 g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error, r, w, e;
 
 	cp = dev->si_drv2;
 	if (cp == NULL)
 		return (ENXIO);
 	g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)",
 	    cp->geom->name, flags, fmt, td);
 
 	r = flags & FREAD ? -1 : 0;
 	w = flags & FWRITE ? -1 : 0;
 #ifdef notyet
 	e = flags & O_EXCL ? -1 : 0;
 #else
 	e = 0;
 #endif
 
 	/*
 	 * The vgonel(9) - caused by eg. forced unmount of devfs - calls
 	 * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags,
 	 * which would result in zero deltas, which in turn would cause
 	 * panic in g_access(9).
 	 *
 	 * Note that we cannot zero the counters (ie. do "r = cp->acr"
 	 * etc) instead, because the consumer might be opened in another
 	 * devfs instance.
 	 */
 	if (r + w + e == 0)
 		return (EINVAL);
 
 	sc = cp->private;
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_open += r + w + e;
 	/*
 	 * Last close: wait until g_dev_done() has retired every in-flight
 	 * bio, or until g_dev_open() reopens the device.  Both call
 	 * wakeup() on sc_active.
 	 *
 	 * NOTE(review): the priority argument is 0 and "PRIBIO" is passed
 	 * as the wait message string -- confirm whether PRIBIO was meant to
 	 * be the sleep priority.
 	 */
 	while (sc->sc_open == 0 && sc->sc_active != 0)
 		msleep(&sc->sc_active, &sc->sc_mtx, 0, "PRIBIO", 0);
 	mtx_unlock(&sc->sc_mtx);
 	g_topology_lock();
 	error = g_access(cp, r, w, e);
 	g_topology_unlock();
 	return (error);
 }
 
 /*
  * d_ioctl handler for GEOM device nodes.
  *
  * The generic disk ioctls (DIOC*) are answered here, either directly from
  * the provider's fields or through GEOM attribute, flush, delete and zone
  * requests sent via the consumer.  Any other command is forwarded to the
  * ioctl method of the provider's geom, or rejected with ENOIOCTL when the
  * geom has none.
  *
  * Returns 0 on success or an errno value.
  *
  * XXX: Until we have unmessed the ioctl situation, there is a race against
  * XXX: a concurrent orphanization.  We cannot close it by holding topology
  * XXX: since that would prevent us from doing our job, and stalling events
  * XXX: will break (actually: stall) the BSD disklabel hacks.
  */
 static int
 g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t offset, length, chunk;
 	int i, error;
 
 	cp = dev->si_drv2;
 	pp = cp->provider;
 
 	error = 0;
 	KASSERT(cp->acr || cp->acw,
 	    ("Consumer with zero access count in g_dev_ioctl"));
 
 	/* Size of the ioctl argument; used as the attribute buffer length. */
 	i = IOCPARM_LEN(cmd);
 	switch (cmd) {
 	case DIOCGSECTORSIZE:
 		*(u_int *)data = cp->provider->sectorsize;
 		if (*(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGMEDIASIZE:
 		*(off_t *)data = cp->provider->mediasize;
 		if (*(off_t *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFWSECTORS:
 		error = g_io_getattr("GEOM::fwsectors", cp, &i, data);
 		if (error == 0 && *(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFWHEADS:
 		error = g_io_getattr("GEOM::fwheads", cp, &i, data);
 		if (error == 0 && *(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFRONTSTUFF:
 		error = g_io_getattr("GEOM::frontstuff", cp, &i, data);
 		break;
 	case DIOCSKERNELDUMP:
 		/* A zero argument resets the dump configuration. */
 		if (*(u_int *)data == 0)
 			error = g_dev_setdumpdev(NULL, td);
 		else
 			error = g_dev_setdumpdev(dev, td);
 		break;
 	case DIOCGFLUSH:
 		error = g_io_flush(cp);
 		break;
 	case DIOCGDELETE:
 		offset = ((off_t *)data)[0];
 		length = ((off_t *)data)[1];
 		/* Both values must be sector aligned; length positive. */
 		if ((offset % cp->provider->sectorsize) != 0 ||
 		    (length % cp->provider->sectorsize) != 0 || length <= 0) {
 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
 			    length);
 			error = EINVAL;
 			break;
 		}
 		while (length > 0) {
 			chunk = length;
 			if (g_dev_del_max_sectors != 0 && chunk >
 			    g_dev_del_max_sectors * cp->provider->sectorsize) {
 				chunk = g_dev_del_max_sectors *
 				    cp->provider->sectorsize;
 			}
 			error = g_delete_data(cp, offset, chunk);
 			length -= chunk;
 			offset += chunk;
 			if (error)
 				break;
 			/*
 			 * Since the request size can be large, the service
 			 * time can be large as well.  We make this ioctl
 			 * interruptible by checking for signals for each bio.
 			 */
 			if (SIGPENDING(td))
 				break;
 		}
 		break;
 	case DIOCGIDENT:
 		error = g_io_getattr("GEOM::ident", cp, &i, data);
 		break;
 	case DIOCGPROVIDERNAME:
 		if (pp == NULL)
 			return (ENOENT);
 		strlcpy(data, pp->name, i);
 		break;
 	case DIOCGSTRIPESIZE:
 		*(off_t *)data = cp->provider->stripesize;
 		break;
 	case DIOCGSTRIPEOFFSET:
 		*(off_t *)data = cp->provider->stripeoffset;
 		break;
 	case DIOCGPHYSPATH:
 		error = g_io_getattr("GEOM::physpath", cp, &i, data);
 		if (error == 0 && *(char *)data == '\0')
 			error = ENOENT;
 		break;
 	case DIOCGATTR: {
 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
 
 		if (arg->len > sizeof(arg->value)) {
 			error = EINVAL;
 			break;
 		}
 		error = g_io_getattr(arg->name, cp, &arg->len, &arg->value);
 		break;
 	}
 	case DIOCZONECMD: {
 		struct disk_zone_args *zone_args =(struct disk_zone_args *)data;
 		struct disk_zone_rep_entry *new_entries, *old_entries;
 		struct disk_zone_report *rep;
 		size_t alloc_size, max_entries;
 
 		old_entries = NULL;
 		new_entries = NULL;
 		rep = NULL;
 		alloc_size = 0;
 
 		if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) {
 
 			rep = &zone_args->zone_params.report;
 			/*
 			 * entries_allocated is supplied by the caller.  Cap
 			 * it so that the bounce buffer never exceeds MAXPHYS
 			 * bytes; without a cap an arbitrary count becomes an
 			 * arbitrarily large (or overflowed) M_WAITOK
 			 * allocation.
 			 */
 			max_entries = MAXPHYS /
 			    sizeof(struct disk_zone_rep_entry);
 			if (rep->entries_allocated > max_entries)
 				rep->entries_allocated = max_entries;
 			alloc_size = rep->entries_allocated *
 			    sizeof(struct disk_zone_rep_entry);
 			if (alloc_size != 0)
 				new_entries = g_malloc(alloc_size,
 				    M_WAITOK| M_ZERO);
 			/*
 			 * Run the command against the kernel buffer; the
 			 * caller's pointer is put back further down.
 			 */
 			old_entries = rep->entries;
 			rep->entries = new_entries;
 		}
 		error = g_io_zonecmd(zone_args, cp);
 		if ((zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
 		 && (alloc_size != 0)
 		 && (error == 0)) {
 			error = copyout(new_entries, old_entries, alloc_size);
 		}
 		/*
 		 * Always hand back the pointer the caller passed in, even a
 		 * NULL one, so that the argument never keeps referring to
 		 * the kernel buffer that is about to be freed.
 		 */
 		if (rep != NULL)
 			rep->entries = old_entries;
 
 		if (new_entries != NULL)
 			g_free(new_entries);
 		break;
 	}
 	default:
 		if (cp->provider->geom->ioctl != NULL) {
 			error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td);
 		} else {
 			error = ENOIOCTL;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Completion handler for the cloned bio that g_dev_strategy() sent down.
  *
  * Copies the result (error, completed byte count, zone data for
  * BIO_ZONE) from the clone 'bp2' into its parent, destroys the clone,
  * and accounts for the finished request in sc_active.  When the last
  * active request finishes it wakes a sleeping g_dev_close() if the
  * device is closed, and schedules g_dev_destroy() if the device node is
  * already gone.  Finally the parent bio is completed with biodone().
  */
 static void
 g_dev_done(struct bio *bp2)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	struct bio *bp;
 	int destroy;
 
 	cp = bp2->bio_from;
 	sc = cp->private;
 	bp = bp2->bio_parent;
 	bp->bio_error = bp2->bio_error;
 	bp->bio_completed = bp2->bio_completed;
 	bp->bio_resid = bp->bio_length - bp2->bio_completed;
 	if (bp2->bio_cmd == BIO_ZONE)
 		bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone));
 
 	if (bp2->bio_error != 0) {
 		g_trace(G_T_BIO, "g_dev_done(%p) had error %d",
 		    bp2, bp2->bio_error);
 		bp->bio_flags |= BIO_ERROR;
 	} else {
 		g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd",
 		    bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed);
 	}
 	g_destroy_bio(bp2);
 	destroy = 0;
 	mtx_lock(&sc->sc_mtx);
 	if ((--sc->sc_active) == 0) {
 		/* g_dev_close() may be waiting for the I/O to drain. */
 		if (sc->sc_open == 0)
 			wakeup(&sc->sc_active);
 		/* g_dev_callback() already ran and left teardown to us. */
 		if (sc->sc_dev == NULL)
 			destroy = 1;
 	}
 	mtx_unlock(&sc->sc_mtx);
 	if (destroy)
 		g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL);
 	biodone(bp);
 }
 
 /*
  * d_strategy handler: pass a bio arriving at the device node down to the
  * GEOM provider.
  *
  * The request is counted in sc_active, cloned, and the clone is issued
  * through the consumer with g_dev_done() as its completion routine.
  * With INVARIANTS, a request whose offset or byte count is not a
  * multiple of the provider's sector size is failed with EINVAL.
  */
 static void
 g_dev_strategy(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct bio *bp2;
 	struct cdev *dev;
 	struct g_dev_softc *sc;
 
 	KASSERT(bp->bio_cmd == BIO_READ ||
 	        bp->bio_cmd == BIO_WRITE ||
 	        bp->bio_cmd == BIO_DELETE ||
 		bp->bio_cmd == BIO_FLUSH ||
 		bp->bio_cmd == BIO_ZONE,
 		("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd));
 	dev = bp->bio_dev;
 	cp = dev->si_drv2;
 	sc = cp->private;
 	KASSERT(cp->acr || cp->acw,
 	    ("Consumer with zero access count in g_dev_strategy"));
+	biotrack(bp, __func__);
 #ifdef INVARIANTS
 	if ((bp->bio_offset % cp->provider->sectorsize) != 0 ||
 	    (bp->bio_bcount % cp->provider->sectorsize) != 0) {
 		bp->bio_resid = bp->bio_bcount;
 		biofinish(bp, NULL, EINVAL);
 		return;
 	}
 #endif
 	/* Count the request as in flight before it is sent down. */
 	mtx_lock(&sc->sc_mtx);
 	KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy"));
 	sc->sc_active++;
 	mtx_unlock(&sc->sc_mtx);
 
 	/* Retry the clone every hz / 10 ticks until it succeeds. */
 	for (;;) {
 		/*
 		 * XXX: This is not an ideal solution, but I believe it to
 		 * XXX: deadlock safely, all things considered.
 		 */
 		bp2 = g_clone_bio(bp);
 		if (bp2 != NULL)
 			break;
 		pause("gdstrat", hz / 10);
 	}
 	KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place"));
 	bp2->bio_done = g_dev_done;
 	g_trace(G_T_BIO,
 	    "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d",
 	    bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length,
 	    bp2->bio_data, bp2->bio_cmd);
 	g_io_request(bp2, cp);
 	KASSERT(cp->acr || cp->acw,
 	    ("g_dev_strategy raced with g_dev_close and lost"));
 
 }
 
 /*
  * g_dev_callback()
  *
  * Called by devfs when asynchronous device destruction is completed.
  * - Mark that we have no attached device any more.
  * - If there are no outstanding requests, schedule geom destruction.
  *   Otherwise destruction will be scheduled later by g_dev_done().
  */
 
 static void
 g_dev_callback(void *arg)
 {
 	struct g_consumer *cp = arg;
 	struct g_dev_softc *sc = cp->private;
 	int idle;
 
 	g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name);
 
 	/* Forget both device nodes and sample the I/O count atomically. */
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_dev = NULL;
 	sc->sc_alias = NULL;
 	idle = (sc->sc_active == 0);
 	mtx_unlock(&sc->sc_mtx);
 
 	/* With I/O still pending, g_dev_done() schedules the teardown. */
 	if (!idle)
 		return;
 	g_post_event(g_dev_destroy, cp, M_WAITOK, NULL);
 }
 
 /*
  * g_dev_orphan()
  *
  * Called from below when the provider orphaned us.
  * - Clear any dump settings.
  * - Request asynchronous device destruction to prevent any more requests
  *   from coming in.  The provider is already marked with an error, so
  *   anything which comes in the interim will be returned immediately.
  */
 
 static void
 g_dev_orphan(struct g_consumer *cp)
 {
 	struct g_dev_softc *sc;
 	struct cdev *node;
 
 	g_topology_assert();
 	sc = cp->private;
 	node = sc->sc_dev;
 	g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name);
 
 	/* A dump area configured on this device must not outlive it. */
 	if ((node->si_flags & SI_DUMPDEV) != 0)
 		(void)set_dumper(NULL, NULL, curthread);
 
 	/*
 	 * Schedule destruction of the device node so that no new requests
 	 * arrive; g_dev_callback() runs once that has completed.
 	 */
 	destroy_dev_sched_cb(node, g_dev_callback, cp);
 }
 
 DECLARE_GEOM_CLASS(g_dev_class, g_dev);
Index: head/sys/geom/geom_disk.c
===================================================================
--- head/sys/geom/geom_disk.c	(revision 308154)
+++ head/sys/geom/geom_disk.c	(revision 308155)
@@ -1,1010 +1,1012 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_geom.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/ctype.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/devicestat.h>
 #include <machine/md_var.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #include <dev/led/led.h>
 
 #include <machine/bus.h>
 
 /*
  * Per-disk GEOM state, reached through the provider's 'private' pointer.
  */
 struct g_disk_softc {
 	struct mtx		 done_mtx;	/* taken in g_disk_done() */
 	struct disk		*dp;		/* the disk; checked for NULL by users */
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	char			led[64];	/* name given to led_set(); "" = none */
 	uint32_t		state;		/* a G_STATE_* value */
 	struct mtx		 start_mtx;	/* taken in g_disk_start() */
 };
 
 static g_access_t g_disk_access;
 static g_start_t g_disk_start;
 static g_ioctl_t g_disk_ioctl;
 static g_dumpconf_t g_disk_dumpconf;
 static g_provgone_t g_disk_providergone;
 
 /*
  * The DISK GEOM class: the method table that ties the handlers in this
  * file (start, access, ioctl, providergone, dumpconf) into GEOM.
  */
 static struct g_class g_disk_class = {
 	.name = G_DISK_CLASS_NAME,
 	.version = G_VERSION,
 	.start = g_disk_start,
 	.access = g_disk_access,
 	.ioctl = g_disk_ioctl,
 	.providergone = g_disk_providergone,
 	.dumpconf = g_disk_dumpconf,
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0,
     "GEOM_DISK stuff");
 
 DECLARE_GEOM_CLASS(g_disk_class, g_disk);
 
 /*
  * Access method for disk providers.  'r', 'w' and 'e' are deltas to the
  * provider's read, write and exclusive counts.
  *
  * On the transition from no access to some access the driver's d_open
  * is called (if set) and the provider's sector size, stripe parameters
  * and media size are refreshed from the disk.  On the transition back to
  * no access d_close is called (if set), the state is reset to
  * G_STATE_ACTIVE and any LED is switched off.
  *
  * Returns 0 on success, ENXIO when the disk is gone and access would be
  * increased, or the error reported by d_open or d_close.
  */
 static int
 g_disk_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)",
 	    pp->name, r, w, e);
 	g_topology_assert();
 	sc = pp->private;
 	if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) {
 		/*
 		 * Allow decreasing access count even if disk is not
 		 * available anymore.
 		 */
 		if (r <= 0 && w <= 0 && e <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	/* From here on r, w and e are the resulting totals, not deltas. */
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 	error = 0;
 	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
 		/* First open. */
 		if (dp->d_open != NULL) {
 			error = dp->d_open(dp);
 			if (bootverbose && error != 0)
 				printf("Opened disk %s -> %d\n",
 				    pp->name, error);
 			if (error != 0)
 				return (error);
 		}
 		pp->sectorsize = dp->d_sectorsize;
 		/* Fall back to defaults for limits the driver left unset. */
 		if (dp->d_maxsize == 0) {
 			printf("WARNING: Disk drive %s%d has no d_maxsize\n",
 			    dp->d_name, dp->d_unit);
 			dp->d_maxsize = DFLTPHYS;
 		}
 		if (dp->d_delmaxsize == 0) {
 			if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) {
 				printf("WARNING: Disk drive %s%d has no "
 				    "d_delmaxsize\n", dp->d_name, dp->d_unit);
 			}
 			dp->d_delmaxsize = dp->d_maxsize;
 		}
 		pp->stripeoffset = dp->d_stripeoffset;
 		pp->stripesize = dp->d_stripesize;
 		dp->d_flags |= DISKFLAG_OPEN;
 		/*
 		 * Do not invoke resize event when initial size was zero.
 		 * Some disks report its size only after first opening.
 		 */
 		if (pp->mediasize == 0)
 			pp->mediasize = dp->d_mediasize;
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
 		/* Last close; a d_close error is logged and returned. */
 		if (dp->d_close != NULL) {
 			error = dp->d_close(dp);
 			if (error != 0)
 				printf("Closed disk %s -> %d\n",
 				    pp->name, error);
 		}
 		sc->state = G_STATE_ACTIVE;
 		if (sc->led[0] != 0)
 			led_set(sc->led, "0");
 		dp->d_flags &= ~DISKFLAG_OPEN;
 	}
 	return (error);
 }
 
 static void
 g_disk_kerneldump(struct bio *bp, struct disk *dp)
 {
 	struct g_kerneldump *gkd;
 	struct g_geom *gp;
 
 	gkd = (struct g_kerneldump*)bp->bio_data;
 	gp = bp->bio_to->geom;
 	g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)",
 		gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
 	if (dp->d_dump == NULL) {
 		g_io_deliver(bp, ENODEV);
 		return;
 	}
 	gkd->di.dumper = dp->d_dump;
 	gkd->di.priv = dp;
 	gkd->di.blocksize = dp->d_sectorsize;
 	gkd->di.maxiosize = dp->d_maxsize;
 	gkd->di.mediaoffset = gkd->offset;
 	if ((gkd->offset + gkd->length) > dp->d_mediasize)
 		gkd->length = dp->d_mediasize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 /*
  * Store the state value carried in bio_data into the softc and, when a
  * LED is configured, set the LED pattern that corresponds to the new
  * state.  The bio is always delivered with success.
  */
 static void
 g_disk_setstate(struct bio *bp, struct g_disk_softc *sc)
 {
 	const char *pattern;
 
 	memcpy(&sc->state, bp->bio_data, sizeof(sc->state));
 	if (sc->led[0] != 0) {
 		if (sc->state == G_STATE_FAILED)
 			pattern = "1";
 		else if (sc->state == G_STATE_REBUILD)
 			pattern = "f5";
 		else if (sc->state == G_STATE_RESYNC)
 			pattern = "f1";
 		else
 			pattern = "0";
 		led_set(sc->led, pattern);
 	}
 	g_io_deliver(bp, 0);
 }
 
 /*
  * Completion handler for a child bio issued by g_disk_start().
  *
  * Under done_mtx the child's result is folded into the parent (the first
  * non-zero error is kept, completed byte counts are summed) and the
  * devstat transaction is ended.  When every child has been accounted for,
  * the parent's resid is computed and the parent is delivered.  The child
  * bio is destroyed in all cases.
  */
 static void
 g_disk_done(struct bio *bp)
 {
 	struct bintime now;
 	struct bio *bp2;
 	struct g_disk_softc *sc;
 
 	/* See "notes" for why we need a mutex here */
 	/* XXX: will witness accept a mix of Giant/unGiant drivers here ? */
 	bp2 = bp->bio_parent;
 	sc = bp2->bio_to->private;
 	bp->bio_completed = bp->bio_length - bp->bio_resid;
 	binuptime(&now);
 	mtx_lock(&sc->done_mtx);
 	if (bp2->bio_error == 0)
 		bp2->bio_error = bp->bio_error;
 	bp2->bio_completed += bp->bio_completed;
 
 	switch (bp->bio_cmd) {
 	case BIO_ZONE:
 		/* Zone results travel in the bio itself; copy them up. */
 		bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone));
 		/*FALLTHROUGH*/
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now);
 		break;
 	default:
 		break;
 	}
 	bp2->bio_inbed++;
 	if (bp2->bio_children == bp2->bio_inbed) {
 		/* Last child: the mutex is dropped before delivery. */
 		mtx_unlock(&sc->done_mtx);
 		bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed;
 		g_io_deliver(bp2, bp2->bio_error);
 	} else
 		mtx_unlock(&sc->done_mtx);
 	g_destroy_bio(bp);
 }
 
 /*
  * Ioctl method: hand the request to the disk driver's d_ioctl routine.
  * Returns ENOIOCTL when the driver has none, otherwise the driver's
  * result.
  */
 static int
 g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td)
 {
 	struct g_disk_softc *sc = pp->private;
 	struct disk *dp = sc->dp;
 
 	if (dp->d_ioctl == NULL)
 		return (ENOIOCTL);
 	return (dp->d_ioctl(dp, cmd, data, fflag, td));
 }
 
 /*
  * Largest transfer size the disk accepts for this kind of request:
  * d_delmaxsize for BIO_DELETE, d_maxsize for everything else.
  */
 static off_t
 g_disk_maxsize(struct disk *dp, struct bio *bp)
 {
 	off_t limit;
 
 	if (bp->bio_cmd == BIO_DELETE)
 		limit = dp->d_delmaxsize;
 	else
 		limit = dp->d_maxsize;
 	return (limit);
 }
 
 static int
 g_disk_maxsegs(struct disk *dp, struct bio *bp)
 {
 	return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1);
 }
 
 /*
  * Move bio 'bp' forward by 'off' bytes: the offset grows, the length
  * shrinks, and the data reference is advanced according to how the bio
  * describes its data (BIO_VLIST segment list, BIO_UNMAPPED page array,
  * or a plain bio_data pointer).
  */
 static void
 g_disk_advance(struct disk *dp, struct bio *bp, off_t off)
 {
 
 	bp->bio_offset += off;
 	bp->bio_length -= off;
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *seg, *end;
 
 		seg = (bus_dma_segment_t *)bp->bio_data;
 		end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 		/* Work relative to the start of the first segment. */
 		off += bp->bio_ma_offset;
 		/* Skip whole segments that lie entirely before 'off'. */
 		while (off >= seg->ds_len) {
 			KASSERT((seg != end),
 			    ("vlist request runs off the end"));
 			off -= seg->ds_len;
 			seg++;
 		}
 		bp->bio_ma_offset = off;
 		bp->bio_ma_n = end - seg;
 		bp->bio_data = (void *)seg;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		/*
 		 * NOTE(review): the page array advances by off / PAGE_SIZE
 		 * only.  That looks exact only while the old bio_ma_offset
 		 * plus off % PAGE_SIZE stays below PAGE_SIZE -- confirm that
 		 * callers guarantee this.
 		 */
 		bp->bio_ma += off / PAGE_SIZE;
 		bp->bio_ma_offset += off;
 		bp->bio_ma_offset %= PAGE_SIZE;
 		bp->bio_ma_n -= off / PAGE_SIZE;
 	} else {
 		bp->bio_data += off;
 	}
 }
 
 /*
  * Charge one DMA segment against the remaining byte and page budgets.
  *
  * On entry *poffset is the starting offset inside 'seg', *plength the
  * number of bytes still to be covered and *ppages the number of pages
  * still allowed.  As many bytes of the segment are taken as the length
  * and the page budget permit.
  *
  * On return *poffset is 0, *plength has been reduced by the bytes taken
  * and *ppages by the pages those bytes touch.
  */
 static void
 g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset,
     off_t *plength, int *ppages)
 {
 	uintptr_t seg_page_base;
 	uintptr_t seg_page_end;
 	off_t offset;
 	off_t length;
 	int seg_pages;
 
 	offset = *poffset;
 	length = *plength;
 
 	/* Cannot take more than what is left of this segment. */
 	if (length > seg->ds_len - offset)
 		length = seg->ds_len - offset;
 
 	/* Count the pages spanned by [offset, offset + length). */
 	seg_page_base = trunc_page(seg->ds_addr + offset);
 	seg_page_end  = round_page(seg->ds_addr + offset + length);
 	seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT;
 
 	/* Over the page budget: cut the length at the last allowed page. */
 	if (seg_pages > *ppages) {
 		seg_pages = *ppages;
 		length = (seg_page_base + (seg_pages << PAGE_SHIFT)) -
 		    (seg->ds_addr + offset);
 	}
 
 	*poffset = 0;
 	*plength -= length;
 	*ppages -= seg_pages;
 }
 
 /*
  * Walk the segment list of a BIO_VLIST bio and work out how much of the
  * request fits into the page budget given by g_disk_maxsegs().
  *
  * Returns the number of bytes that do not fit (0 if everything fits).
  * If 'pendseg' is not NULL it receives the segment at which the walk
  * stopped.
  */
 static off_t
 g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg)
 {
 	bus_dma_segment_t *cur, *last;
 	off_t remaining, segoff;
 	int budget;
 
 	cur = (bus_dma_segment_t *)bp->bio_data;
 	last = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 	remaining = bp->bio_length;
 	segoff = bp->bio_ma_offset;
 	budget = g_disk_maxsegs(dp, bp);
 
 	for (; remaining != 0 && budget != 0; cur++) {
 		KASSERT((cur != last),
 		    ("vlist limit runs off the end"));
 		g_disk_seg_limit(cur, &segoff, &remaining, &budget);
 	}
 
 	if (pendseg != NULL)
 		*pendseg = cur;
 	return (remaining);
 }
 
 /*
  * Trim bio 'bp' so that it respects the disk's transfer limits.
  *
  * The length is capped at g_disk_maxsize().  A BIO_VLIST bio is further
  * cut down to what g_disk_vlist_limit() allows; for a BIO_UNMAPPED bio
  * the page count is recomputed from the offset and the final length.
  *
  * Returns true when the bio was shortened by the size cap or by the
  * segment limit, false otherwise.
  */
 static bool
 g_disk_limit(struct disk *dp, struct bio *bp)
 {
 	bus_dma_segment_t *first, *stop;
 	off_t cap, leftover;
 	bool clipped;
 
 	clipped = false;
 	cap = g_disk_maxsize(dp, bp);
 
 	/*
 	 * XXX: If we have a stripesize we should really use it here.
 	 *      Care should be taken in the delete case if this is done
 	 *      as deletes can be very sensitive to size given how they
 	 *      are processed.
 	 */
 	if (bp->bio_length > cap) {
 		bp->bio_length = cap;
 		clipped = true;
 	}
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		first = (bus_dma_segment_t*)bp->bio_data;
 		leftover = g_disk_vlist_limit(dp, bp, &stop);
 		if (leftover != 0) {
 			bp->bio_ma_n = stop - first;
 			bp->bio_length -= leftover;
 			clipped = true;
 		}
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n =
 		    howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE);
 	}
 
 	return (clipped);
 }
 
 /*
  * Start routine of the disk class: translate a GEOM bio into one or more
  * requests for the disk driver's d_strategy() method, or answer it directly.
  *
  * 'error' starts out as EJUSTRETURN, which is used here as a sentinel
  * meaning "do not deliver bp at the end of this function"; any other value
  * is handed to g_io_deliver() on the way out.
  */
 static void
 g_disk_start(struct bio *bp)
 {
 	struct bio *bp2, *bp3;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 	off_t off;
 
+	biotrack(bp, __func__);
+
 	sc = bp->bio_to->private;
 	/* No softc, no disk, or a disk being torn down: refuse the I/O. */
 	if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 	error = EJUSTRETURN;
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 		if (!(dp->d_flags & DISKFLAG_CANDELETE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/* fall-through */
 	case BIO_READ:
 	case BIO_WRITE:
 		KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 ||
 		    (bp->bio_flags & BIO_UNMAPPED) == 0,
 		    ("unmapped bio not supported by disk %s", dp->d_name));
 		/* 'off' accumulates the number of bytes dispatched so far. */
 		off = 0;
 		bp3 = NULL;
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		/*
 		 * Dispatch loop: each pass sends one clone to the driver.  When
 		 * g_disk_limit() had to shorten the clone, a further clone
 		 * (bp3) is prepared to carry the rest of the request.
 		 */
 		for (;;) {
 			if (g_disk_limit(dp, bp2)) {
 				off += bp2->bio_length;
 
 				/*
 				 * To avoid a race, we need to grab the next bio
 				 * before we schedule this one.  See "notes".
 				 */
 				bp3 = g_clone_bio(bp);
 				/*
 				 * Allocation failure is recorded on the parent;
 				 * the current clone is still dispatched and the
 				 * loop then ends because bp3 is NULL.
 				 */
 				if (bp3 == NULL)
 					bp->bio_error = ENOMEM;
 			}
 			bp2->bio_done = g_disk_done;
 			bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize;
 			bp2->bio_bcount = bp2->bio_length;
 			bp2->bio_disk = dp;
 			/* start_mtx serialises the devstat update. */
 			mtx_lock(&sc->start_mtx); 
 			devstat_start_transaction_bio(dp->d_devstat, bp2);
 			mtx_unlock(&sc->start_mtx); 
 			dp->d_strategy(bp2);
 
 			if (bp3 == NULL)
 				break;
 
 			bp2 = bp3;
 			bp3 = NULL;
 			/* presumably skips the 'off' bytes already issued */
 			g_disk_advance(dp, bp2, off);
 		}
 		break;
 	case BIO_GETATTR:
 		/* Give the driver a chance to override */
 		if (dp->d_getattr != NULL) {
 			if (bp->bio_disk == NULL)
 				bp->bio_disk = dp;
 			error = dp->d_getattr(bp);
 			/* -1 means "not handled": try the generic attributes. */
 			if (error != -1)
 				break;
 			error = EJUSTRETURN;
 		}
 		/*
 		 * Each g_handleattr_*() call that returns non-zero ends the
 		 * chain with 'error' still EJUSTRETURN, so the helper has
 		 * presumably completed bp itself -- TODO confirm.
 		 */
 		if (g_handleattr_int(bp, "GEOM::candelete",
 		    (dp->d_flags & DISKFLAG_CANDELETE) != 0))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwsectors",
 		    dp->d_fwsectors))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads))
 			break;
 		else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
 		    dp->d_hba_vendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
 		    dp->d_hba_device))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
 		    dp->d_hba_subvendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
 		    dp->d_hba_subdevice))
 			break;
 		/*
 		 * The next two branches also leave 'error' at EJUSTRETURN, so
 		 * the called helper is presumably responsible for completing bp.
 		 */
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
 		else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate",
 		    dp->d_rotation_rate))
 			break;
 		else 
 			error = ENOIOCTL;
 		break;
 	case BIO_FLUSH:
 		g_trace(G_T_BIO, "g_disk_flushcache(%s)",
 		    bp->bio_to->name);
 		if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/*FALLTHROUGH*/
 	case BIO_ZONE:
 		/* Shared with BIO_FLUSH, hence the explicit command re-check. */
 		if (bp->bio_cmd == BIO_ZONE) {
 			if (!(dp->d_flags & DISKFLAG_CANZONE)) {
 				error = EOPNOTSUPP;
 				break;
 			}
 			g_trace(G_T_BIO, "g_disk_zone(%s)",
 			    bp->bio_to->name);
 		}
 		/* Flush and zone requests go to the driver as a single clone. */
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		bp2->bio_done = g_disk_done;
 		bp2->bio_disk = dp;
 		mtx_lock(&sc->start_mtx);
 		devstat_start_transaction_bio(dp->d_devstat, bp2);
 		mtx_unlock(&sc->start_mtx);
 		dp->d_strategy(bp2);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	/* Anything other than the sentinel is the final status for bp. */
 	if (error != EJUSTRETURN)
 		g_io_deliver(bp, error);
 	return;
 }
 
 /*
  * Append this disk's configuration details to the sbuf 'sb'.
  *
  * With indent == NULL a short one-line form (" hd N sc N") is produced.
  * Otherwise, and only when a provider is given, the firmware geometry,
  * rotation rate, ident, optional lunid/lunname and description are emitted
  * as XML elements, each line prefixed with 'indent'.
  */
 static void
 g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
 {
 	struct bio *bp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char *buf;
 	int res = 0;
 
 	sc = gp->softc;
 	/* Nothing to report once the softc or the disk is gone. */
 	if (sc == NULL || (dp = sc->dp) == NULL)
 		return;
 	if (indent == NULL) {
 		sbuf_printf(sb, " hd %u", dp->d_fwheads);
 		sbuf_printf(sb, " sc %u", dp->d_fwsectors);
 		return;
 	}
 	if (pp != NULL) {
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n",
 		    indent, dp->d_fwheads);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n",
 		    indent, dp->d_fwsectors);
 
 		/*
 		 * "rotationrate" is a little complicated, because the value
 		 * returned by the drive might not be the RPM; 0 and 1 are
 		 * special cases, and there's also a valid range.
 		 */
 		sbuf_printf(sb, "%s<rotationrate>", indent);
 		if (dp->d_rotation_rate == 0)		/* Old drives don't */
 			sbuf_printf(sb, "unknown");	/* report RPM. */
 		else if (dp->d_rotation_rate == 1)	/* Since 0 is used */
 			sbuf_printf(sb, "0");		/* above, SSDs use 1. */
 		else if ((dp->d_rotation_rate >= 0x041) &&
 		    (dp->d_rotation_rate <= 0xfffe))
 			sbuf_printf(sb, "%u", dp->d_rotation_rate);
 		else
 			sbuf_printf(sb, "invalid");
 		sbuf_printf(sb, "</rotationrate>\n");
 		if (dp->d_getattr != NULL) {
 			/*
 			 * Ask the driver directly, reusing one scratch bio and
 			 * one buffer for the three attribute queries below.
 			 */
 			buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK);
 			bp = g_alloc_bio();
 			bp->bio_disk = dp;
 			bp->bio_attribute = "GEOM::ident";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			res = dp->d_getattr(bp);
 			sbuf_printf(sb, "%s<ident>", indent);
 			/* Fall back to the cached d_ident if the query failed. */
 			g_conf_printf_escaped(sb, "%s",
 			    res == 0 ? buf: dp->d_ident);
 			sbuf_printf(sb, "</ident>\n");
 			/* lunid and lunname are only printed when available. */
 			bp->bio_attribute = "GEOM::lunid";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunid>", indent);
 				g_conf_printf_escaped(sb, "%s", buf);
 				sbuf_printf(sb, "</lunid>\n");
 			}
 			bp->bio_attribute = "GEOM::lunname";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunname>", indent);
 				g_conf_printf_escaped(sb, "%s", buf);
 				sbuf_printf(sb, "</lunname>\n");
 			}
 			g_destroy_bio(bp);
 			g_free(buf);
 		} else {
 			/* No driver hook: report the ident stored in the disk. */
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_printf_escaped(sb, "%s", dp->d_ident);
 			sbuf_printf(sb, "</ident>\n");
 		}
 		sbuf_printf(sb, "%s<descr>", indent);
 		g_conf_printf_escaped(sb, "%s", dp->d_descr);
 		sbuf_printf(sb, "</descr>\n");
 	}
 }
 
 /*
  * Event handler posted by disk_resize(): propagate the disk's current media
  * size to its providers.  A provider whose (non-zero) sector size no longer
  * matches the disk is withered with ENXIO instead of being resized.
  */
 static void
 g_disk_resize(void *ptr, int flag)
 {
 	struct disk *disk;
 	struct g_geom *geom;
 	struct g_provider *prov;
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 
 	disk = ptr;
 	geom = disk->d_geom;
 
 	if (geom == NULL || disk->d_destroyed)
 		return;
 
 	LIST_FOREACH(prov, &geom->provider, provider) {
 		if (prov->sectorsize == 0 ||
 		    prov->sectorsize == disk->d_sectorsize) {
 			g_resize_provider(prov, disk->d_mediasize);
 			continue;
 		}
 		/* The sector size changed under the provider. */
 		g_wither_provider(prov, ENXIO);
 	}
 }
 
 /*
  * Event handler posted by disk_create(): build the geom, provider, softc
  * and per-disk sysctl node for a newly registered disk.
  *
  * d_init_level and d_goneflag are examined under the pool mutex keyed on
  * 'dp', so that a concurrent disk_gone() is noticed either before any GEOM
  * state exists or right after the provider has been published.
  */
 static void
 g_disk_create(void *arg, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char tmpstr[80];
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 	dp = arg;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_START;
 
 	/*
 	 * If the disk has already gone away, we can just stop here and
 	 * call the user's callback to tell him we've cleaned things up.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		if (dp->d_gone != NULL)
 			dp->d_gone(dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF);
 	mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF);
 	sc->dp = dp;
 	gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit);
 	gp->softc = sc;
 	pp = g_new_providerf(gp, "%s", gp->name);
 	/*
 	 * Drop the provider's own devstat entry and tag the disk's devstat
 	 * with the provider instead.
 	 */
 	devstat_remove_entry(pp->stat);
 	pp->stat = NULL;
 	dp->d_devstat->id = pp;
 	/* Mirror the disk's geometry onto the provider. */
 	pp->mediasize = dp->d_mediasize;
 	pp->sectorsize = dp->d_sectorsize;
 	pp->stripeoffset = dp->d_stripeoffset;
 	pp->stripesize = dp->d_stripesize;
 	if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 	if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0)
 		pp->flags |= G_PF_DIRECT_SEND;
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	if (bootverbose)
 		printf("GEOM: new disk %s\n", gp->name);
 	/* Per-disk sysctl node, named after the geom, with a "led" knob. */
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name);
 	sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name,
 		CTLFLAG_RD, 0, tmpstr);
 	if (sc->sysctl_tree != NULL) {
 		SYSCTL_ADD_STRING(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led",
 		    CTLFLAG_RWTUN, sc->led, sizeof(sc->led),
 		    "LED name");
 	}
 	pp->private = sc;
 	dp->d_geom = gp;
 	/* Clear the provider's error state now that setup is complete. */
 	g_error_provider(pp, 0);
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_DONE;
 
 	/*
 	 * If the disk has gone away at this stage, start the withering
 	 * process for it.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		g_wither_provider(pp, ENXIO);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 }
 
 /*
  * Called once every consumer has detached and the provider is about to be
  * freed.  GEOM will issue no further accesses, so the driver's d_gone
  * callback (if it supplied one) is told that it may release its resources.
  * The per-disk sysctl context, the LED state and the softc are then torn
  * down.
  */
 static void
 g_disk_providergone(struct g_provider *pp)
 {
 	struct g_disk_softc *softc;
 	struct disk *disk;
 
 	softc = (struct g_disk_softc *)pp->private;
 	disk = softc->dp;
 	if (disk != NULL) {
 		if (disk->d_gone != NULL)
 			disk->d_gone(disk);
 	}
 	if (softc->sysctl_tree != NULL) {
 		sysctl_ctx_free(&softc->sysctl_ctx);
 		softc->sysctl_tree = NULL;
 	}
 	/* Switch off the activity LED if one was configured. */
 	if (softc->led[0] != 0) {
 		led_set(softc->led, "0");
 		softc->led[0] = 0;
 	}
 	/* Detach the softc from provider and geom before freeing it. */
 	pp->private = NULL;
 	pp->geom->softc = NULL;
 	mtx_destroy(&softc->done_mtx);
 	mtx_destroy(&softc->start_mtx);
 	g_free(softc);
 }
 
 /*
  * Event handler posted by disk_destroy(): cut the link between the disk
  * and its geom, wither the geom if there is one, and free the disk
  * structure itself.
  */
 static void
 g_disk_destroy(void *ptr, int flag)
 {
 	struct g_disk_softc *softc;
 	struct g_geom *geom;
 	struct disk *disk;
 
 	g_topology_assert();
 	disk = ptr;
 	geom = disk->d_geom;
 	if (geom == NULL) {
 		/* No GEOM state was ever attached. */
 		g_free(disk);
 		return;
 	}
 
 	softc = geom->softc;
 	if (softc != NULL)
 		softc->dp = NULL;
 	disk->d_geom = NULL;
 	g_wither_geom(geom, ENXIO);
 
 	g_free(disk);
 }
 
 /*
  * Sanitise a disk ident in place: printable characters are kept, every
  * other byte is replaced by 'x' followed by its two-digit hex value.
  * The result is built in a DISK_IDENT_SIZE scratch buffer (conversion
  * stops once that buffer is full) and then copied back into 'ident',
  * which is zeroed first and holds at most 'size' bytes.
  */
 static void
 g_disk_ident_adjust(char *ident, size_t size)
 {
 	char piece[4], clean[DISK_IDENT_SIZE];
 	char *cur;
 
 	clean[0] = '\0';
 	cur = ident;
 	while (*cur != '\0') {
 		if (!isprint(*cur)) {
 			snprintf(piece, sizeof(piece), "x%02hhx",
 			    *(unsigned char *)cur);
 		} else {
 			piece[0] = *cur;
 			piece[1] = '\0';
 		}
 		/* Stop as soon as the scratch buffer would overflow. */
 		if (strlcat(clean, piece, sizeof(clean)) >= sizeof(clean))
 			break;
 		cur++;
 	}
 	bzero(ident, size);
 	strlcpy(ident, clean, size);
 }
 
 /*
  * Allocate a zero-filled struct disk.  The allocation uses M_WAITOK.
  */
 struct disk *
 disk_alloc(void)
 {
 	struct disk *dp;
 
 	dp = g_malloc(sizeof(*dp), M_WAITOK | M_ZERO);
 	return (dp);
 }
 
 /*
  * Register a disk with GEOM.
  *
  * 'version' must equal DISK_VERSION and the DISKFLAG_RESERVED bit must be
  * clear; otherwise a warning is printed and the disk is ignored.  A devstat
  * entry is created when the caller did not supply one, d_ident is sanitised
  * and the creation of the GEOM objects is deferred to g_disk_create() via
  * the event queue.
  */
 void
 disk_create(struct disk *dp, int version)
 {
 
 	if (version != DISK_VERSION) {
 		/*
 		 * Single format string: the former "%s%d %s" plus an argument
 		 * starting with a blank printed two spaces after the unit.
 		 */
 		printf("WARNING: Attempt to add disk %s%d"
 		    " using incompatible ABI version of disk(9)\n",
 		    dp->d_name, dp->d_unit);
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	if (dp->d_flags & DISKFLAG_RESERVED) {
 		printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
 	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
 	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));
 	KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long"));
 	/* Create the statistics entry unless the driver brought its own. */
 	if (dp->d_devstat == NULL)
 		dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit,
 		    dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED,
 		    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	dp->d_geom = NULL;
 
 	dp->d_init_level = DISK_INIT_NONE;
 
 	g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident));
 	/* 'dp' doubles as the event reference used by g_cancel_event(). */
 	g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL);
 }
 
 /*
  * Begin destruction of a disk: cancel events still queued with 'dp' as
  * their reference, mark the disk destroyed, drop its devstat entry and
  * post g_disk_destroy() to finish the job (including freeing 'dp') from
  * the event queue.
  */
 void
 disk_destroy(struct disk *dp)
 {
 
 	g_cancel_event(dp);
 	dp->d_destroyed = 1;
 	if (dp->d_devstat != NULL)
 		devstat_remove_entry(dp->d_devstat);
 	g_post_event(g_disk_destroy, dp, M_WAITOK, NULL);
 }
 
 void
 disk_gone(struct disk *dp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_goneflag = 1;
 
 	/*
 	 * If we're still in the process of creating this disk (the
 	 * g_disk_create() function is still queued, or is in
 	 * progress), the init level will not yet be DISK_INIT_DONE.
 	 *
 	 * If that is the case, g_disk_create() will see d_goneflag
 	 * and take care of cleaning things up.
 	 *
 	 * If the disk has already been created, we default to
 	 * withering the provider as usual below.
 	 *
 	 * If the caller has not set a d_gone() callback, he will
 	 * not be any worse off by returning here, because the geom
 	 * has not been fully setup in any case.
 	 */
 	if (dp->d_init_level < DISK_INIT_DONE) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_wither_provider(pp, ENXIO);
 		}
 	}
 }
 
 void
 disk_attr_changed(struct disk *dp, const char *attr, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	char devnamebuf[128];
 
 	gp = dp->d_geom;
 	if (gp != NULL)
 		LIST_FOREACH(pp, &gp->provider, provider)
 			(void)g_attr_changed(pp, attr, flag);
 	snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name,
 	    dp->d_unit);
 	devctl_notify("GEOM", "disk", attr, devnamebuf);
 }
 
 /*
  * Forward a media-change notification to the disk's provider, if the disk
  * currently has a geom with a provider.
  */
 void
 disk_media_changed(struct disk *dp, int flag)
 {
 	struct g_geom *geom;
 	struct g_provider *prov;
 
 	geom = dp->d_geom;
 	if (geom == NULL)
 		return;
 	prov = LIST_FIRST(&geom->provider);
 	if (prov == NULL)
 		return;
 	KASSERT(LIST_NEXT(prov, provider) == NULL,
 	    ("geom %p has more than one provider", geom));
 	g_media_changed(prov, flag);
 }
 
 /*
  * Forward a media-gone notification to the disk's provider, if the disk
  * currently has a geom with a provider.
  */
 void
 disk_media_gone(struct disk *dp, int flag)
 {
 	struct g_geom *geom;
 	struct g_provider *prov;
 
 	geom = dp->d_geom;
 	if (geom == NULL)
 		return;
 	prov = LIST_FIRST(&geom->provider);
 	if (prov == NULL)
 		return;
 	KASSERT(LIST_NEXT(prov, provider) == NULL,
 	    ("geom %p has more than one provider", geom));
 	g_media_gone(prov, flag);
 }
 
 int
 disk_resize(struct disk *dp, int flag)
 {
 
 	if (dp->d_destroyed || dp->d_geom == NULL)
 		return (0);
 
 	return (g_post_event(g_disk_resize, dp, flag, NULL));
 }
 
 /*
  * Event handler used by sysctl_disks(): write the names of all geoms of
  * the disk class into the sbuf passed as 'p', separated by single blanks,
  * and finish the sbuf.
  */
 static void
 g_kern_disks(void *p, int flag __unused)
 {
 	struct sbuf *out;
 	struct g_geom *cur;
 	int first;
 
 	out = p;
 	first = 1;
 	g_topology_assert();
 	LIST_FOREACH(cur, &g_disk_class.geom, geom) {
 		/* No separator in front of the first name. */
 		sbuf_printf(out, "%s%s", first ? "" : " ", cur->name);
 		first = 0;
 	}
 	sbuf_finish(out);
 }
 
 /*
  * Handler for the kern.disks sysctl: collect the disk names through
  * g_kern_disks() and copy the resulting string, including its terminating
  * NUL, out to the requester.
  */
 static int
 sysctl_disks(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *names;
 	int rv;
 
 	names = sbuf_new_auto();
 	g_waitfor_event(g_kern_disks, names, M_WAITOK, NULL);
 	rv = SYSCTL_OUT(req, sbuf_data(names), sbuf_len(names) + 1);
 	sbuf_delete(names);
 	return (rv);
 }
  
 /* kern.disks: read-only string sysctl served by sysctl_disks(). */
 SYSCTL_PROC(_kern, OID_AUTO, disks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_disks, "A", "names of available disks");
Index: head/sys/geom/geom_io.c
===================================================================
--- head/sys/geom/geom_io.c	(revision 308154)
+++ head/sys/geom/geom_io.c	(revision 308155)
@@ -1,1045 +1,1057 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 
 #include <sys/errno.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <sys/devicestat.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 static int	g_io_transient_map_bio(struct bio *bp);
 
 static struct g_bioq g_bio_run_down;
 static struct g_bioq g_bio_run_up;
 
 /*
  * Pace is a hint that we've had some trouble recently allocating
  * bios, so we should back off trying to send I/O down the stack
  * a bit to let the problem resolve. When pacing, we also turn
  * off direct dispatch to also reduce memory pressure from I/Os
  * there, at the expense of some added latency while the memory
  * pressures exist. See g_io_schedule_down() for more details
  * and limitations.
  */
 static volatile u_int pace;
 
 static uma_zone_t	biozone;
 
 /*
  * The head of the list of classifiers used in g_io_request.
  * Use g_register_classifier() and g_unregister_classifier()
  * to add/remove entries to the list.
  * Classifiers are invoked in registration order.
  */
 static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook)
     g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
 
 #include <machine/atomic.h>
 
 /* Acquire the mutex protecting the bio queue 'bq'. */
 static void
 g_bioq_lock(struct g_bioq *bq)
 {
 
 	mtx_lock(&bq->bio_queue_lock);
 }
 
 /* Release the mutex protecting the bio queue 'bq'. */
 static void
 g_bioq_unlock(struct g_bioq *bq)
 {
 
 	mtx_unlock(&bq->bio_queue_lock);
 }
 
 #if 0
 /* Currently compiled out: destroy the mutex of the bio queue 'bq'. */
 static void
 g_bioq_destroy(struct g_bioq *bq)
 {
 
 	mtx_destroy(&bq->bio_queue_lock);
 }
 #endif
 
 /* Initialise an empty bio queue together with its mutex. */
 static void
 g_bioq_init(struct g_bioq *bq)
 {
 
 	TAILQ_INIT(&bq->bio_queue);
 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
 }
 
 /*
  * Dequeue and return the bio at the head of 'bq', or NULL when the queue
  * is empty.  The BIO_ONQUEUE flag is cleared and the queue length is
  * decremented for the returned bio.  No locking is done here.
  */
 static struct bio *
 g_bioq_first(struct g_bioq *bq)
 {
 	struct bio *head;
 
 	head = TAILQ_FIRST(&bq->bio_queue);
 	if (head == NULL)
 		return (NULL);
 
 	KASSERT((head->bio_flags & BIO_ONQUEUE),
 	    ("Bio not on queue bp=%p target %p", head, bq));
 	head->bio_flags &= ~BIO_ONQUEUE;
 	TAILQ_REMOVE(&bq->bio_queue, head, bio_queue);
 	bq->bio_queue_length--;
 	return (head);
 }
 
 /*
  * Allocate a zeroed bio from the bio zone with M_NOWAIT.  The result of
  * uma_zalloc() is returned unchecked, so callers must be prepared for NULL
  * (compare g_clone_bio(), which checks the same kind of allocation).
  */
 struct bio *
 g_new_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 #ifdef KTR
 	/* With KTR_GEOM tracing active, log the allocation and its stack. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return (bp);
 }
 
 /*
  * Allocate a zeroed bio from the bio zone with M_WAITOK (the counterpart
  * of g_new_bio(), which uses M_NOWAIT).
  */
 struct bio *
 g_alloc_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 #ifdef KTR
 	/* With KTR_GEOM tracing active, log the allocation and its stack. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return (bp);
 }
 
 /* Return a bio to the bio zone it was allocated from. */
 void
 g_destroy_bio(struct bio *bp)
 {
 #ifdef KTR
 	/* With KTR_GEOM tracing active, log the release and its stack. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	uma_zfree(biozone, bp);
 }
 
 /*
  * Create a child of 'bp' describing the same request.  The allocation uses
  * M_NOWAIT, so NULL is returned when no bio is available; on success the
  * parent's bio_children count is incremented.
  */
 struct bio *
 g_clone_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 	if (bp2 != NULL) {
 		bp2->bio_parent = bp;
 		bp2->bio_cmd = bp->bio_cmd;
 		/*
 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
 		 *  ordering restrictions, so this flag needs to be cloned.
 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
 		 *  indicate which way the buffer is passed.
 		 *  Other bio flags are not suitable for cloning.
 		 */
 		bp2->bio_flags = bp->bio_flags &
 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
 		bp2->bio_length = bp->bio_length;
 		bp2->bio_offset = bp->bio_offset;
 		bp2->bio_data = bp->bio_data;
 		bp2->bio_ma = bp->bio_ma;
 		bp2->bio_ma_n = bp->bio_ma_n;
 		bp2->bio_ma_offset = bp->bio_ma_offset;
 		bp2->bio_attribute = bp->bio_attribute;
 		/* Zone requests carry their arguments inside the bio. */
 		if (bp->bio_cmd == BIO_ZONE)
 			bcopy(&bp->bio_zone, &bp2->bio_zone,
 			    sizeof(bp->bio_zone));
 		/* Inherit classification info from the parent */
 		bp2->bio_classifier1 = bp->bio_classifier1;
 		bp2->bio_classifier2 = bp->bio_classifier2;
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+		bp2->bio_track_bp = bp->bio_track_bp;
+#endif
 		bp->bio_children++;
 	}
 #ifdef KTR
 	/* With KTR_GEOM tracing active, log parent, child and the stack. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return(bp2);
 }
 
 /*
  * Like g_clone_bio(), but the allocation uses M_WAITOK and the result is
  * used without a NULL check.  Only the BIO_UNMAPPED and BIO_VLIST flags
  * are inherited; classification info is not copied.  The parent's
  * bio_children count is incremented.
  */
 struct bio *
 g_duplicate_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
 	bp2->bio_parent = bp;
 	bp2->bio_cmd = bp->bio_cmd;
 	bp2->bio_length = bp->bio_length;
 	bp2->bio_offset = bp->bio_offset;
 	bp2->bio_data = bp->bio_data;
 	bp2->bio_ma = bp->bio_ma;
 	bp2->bio_ma_n = bp->bio_ma_n;
 	bp2->bio_ma_offset = bp->bio_ma_offset;
 	bp2->bio_attribute = bp->bio_attribute;
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 	/* Keep buf tracking attached to the copy, as g_clone_bio() does. */
 	bp2->bio_track_bp = bp->bio_track_bp;
 #endif
 	bp->bio_children++;
 #ifdef KTR
 	/* With KTR_GEOM tracing active, log parent, copy and the stack. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3, 0);
 	}
 #endif
 	return(bp2);
 }
 
 /* Zero every field of 'bp' so that the bio can be reused. */
 void
 g_reset_bio(struct bio *bp)
 {
 
 	bzero(bp, sizeof(*bp));
 }
 
 /*
  * Set up the GEOM I/O layer: initialise the "down" and "up" bio queues and
  * create the UMA zone from which all bios are allocated.
  */
 void
 g_io_init(void)
 {
 
 	g_bioq_init(&g_bio_run_down);
 	g_bioq_init(&g_bio_run_up);
 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
 	    NULL, NULL,
 	    NULL, NULL,
 	    0, 0);
 }
 
 /*
  * Synchronously query attribute 'attr' from the provider attached to 'cp'.
  * On entry *len is the size of the buffer at 'ptr'; on return it holds the
  * number of bytes completed.  Returns the result of biowait().
  */
 int
 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
 {
 	struct bio *query;
 	int rv;
 
 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
 
 	query = g_alloc_bio();
 	query->bio_cmd = BIO_GETATTR;
 	query->bio_attribute = attr;
 	query->bio_data = ptr;
 	query->bio_length = *len;
 	query->bio_done = NULL;
 
 	g_io_request(query, cp);
 	rv = biowait(query, "ggetattr");
 	*len = query->bio_completed;
 
 	g_destroy_bio(query);
 	return (rv);
 }
 
 /*
  * Synchronously issue a BIO_ZONE request described by 'zone_args' through
  * consumer 'cp'.  The arguments are copied into the bio before the request
  * and copied back from it afterwards.  Returns the result of biowait().
  */
 int
 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
 {
 	struct bio *zbp;
 	int rv;
 
 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
 	zbp = g_alloc_bio();
 	zbp->bio_cmd = BIO_ZONE;
 	zbp->bio_done = NULL;
 	/*
 	 * XXX KDM need to handle report zone data.
 	 */
 	bcopy(zone_args, &zbp->bio_zone, sizeof(*zone_args));
 
 	/* Only a zone report has a data length; all other commands use 0. */
 	zbp->bio_length = 0;
 	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) {
 		zbp->bio_length =
 		    zone_args->zone_params.report.entries_allocated *
 		    sizeof(struct disk_zone_rep_entry);
 	}
 
 	g_io_request(zbp, cp);
 	rv = biowait(zbp, "gzone");
 	bcopy(&zbp->bio_zone, zone_args, sizeof(*zone_args));
 	g_destroy_bio(zbp);
 	return (rv);
 }
 
 /*
  * Synchronously send an ordered, zero-length BIO_FLUSH request through
  * consumer 'cp'.  The bio's offset is set to the provider's media size.
  * Returns the result of biowait().
  */
 int
 g_io_flush(struct g_consumer *cp)
 {
 	struct bio *fbp;
 	int rv;
 
 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
 
 	fbp = g_alloc_bio();
 	fbp->bio_cmd = BIO_FLUSH;
 	fbp->bio_flags |= BIO_ORDERED;
 	fbp->bio_done = NULL;
 	fbp->bio_data = NULL;
 	fbp->bio_attribute = NULL;
 	fbp->bio_length = 0;
 	fbp->bio_offset = cp->provider->mediasize;
 
 	g_io_request(fbp, cp);
 	rv = biowait(fbp, "gflush");
 
 	g_destroy_bio(fbp);
 	return (rv);
 }
 
 /*
  * Validate a bio before it is handed to the provider's start method.
  *
  * Returns EJUSTRETURN when the bio should go on to the provider.  Any other
  * value is a final status for the bio: an errno, or 0 for a zero-length
  * transfer.  g_io_request() delivers every return value >= 0 immediately.
  *
  * Side effects: a request reaching past the end of the media is truncated,
  * and an unmapped READ/WRITE aimed at a provider that does not accept
  * unmapped bios is passed to g_io_transient_map_bio().
  */
 static int
 g_io_check(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t excess;
 	int error;
 
+	biotrack(bp, __func__);
+
 	cp = bp->bio_from;
 	pp = bp->bio_to;
 
 	/* Fail if access counters don't allow the operation */
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_GETATTR:
 		if (cp->acr == 0)
 			return (EPERM);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		if (cp->acw == 0)
 			return (EPERM);
 		break;
 	case BIO_ZONE:
 		/* Zone queries need read access, other zone commands write. */
 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
 			if (cp->acr == 0)
 				return (EPERM);
 		} else if (cp->acw == 0)
 			return (EPERM);
 		break;
 	default:
 		return (EPERM);
 	}
 	/* if provider is marked for error, don't disturb. */
 	if (pp->error)
 		return (pp->error);
 	if (cp->flags & G_CF_ORPHAN)
 		return (ENXIO);
 
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/* Zero sectorsize or mediasize is probably a lack of media. */
 		if (pp->sectorsize == 0 || pp->mediasize == 0)
 			return (ENXIO);
 		/* Reject I/O not on sector boundary */
 		if (bp->bio_offset % pp->sectorsize)
 			return (EINVAL);
 		/* Reject I/O not integral sector long */
 		if (bp->bio_length % pp->sectorsize)
 			return (EINVAL);
 		/* Reject requests before or past the end of media. */
 		if (bp->bio_offset < 0)
 			return (EIO);
 		if (bp->bio_offset > pp->mediasize)
 			return (EIO);
 
 		/* Truncate requests to the end of providers media. */
 		excess = bp->bio_offset + bp->bio_length;
 		if (excess > bp->bio_to->mediasize) {
 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
 			    round_page(bp->bio_ma_offset +
 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
 			    ("excess bio %p too short", bp));
 			/* From here on 'excess' is the number of bytes cut. */
 			excess -= bp->bio_to->mediasize;
 			bp->bio_length -= excess;
 			/* Keep the page count in step with the new length. */
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
 				    bp->bio_length) / PAGE_SIZE;
 			}
 			if (excess > 0)
 				CTR3(KTR_GEOM, "g_down truncated bio "
 				    "%p provider %s by %d", bp,
 				    bp->bio_to->name, excess);
 		}
 
 		/* Deliver zero length transfers right here. */
 		if (bp->bio_length == 0) {
 			CTR2(KTR_GEOM, "g_down terminated 0-length "
 			    "bp %p provider %s", bp, bp->bio_to->name);
 			return (0);
 		}
 
 		/*
 		 * A non-negative result from g_io_transient_map_bio() is
 		 * returned as the final status; a negative one falls through
 		 * to EJUSTRETURN below.
 		 */
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
 			if ((error = g_io_transient_map_bio(bp)) >= 0)
 				return (error);
 		}
 		break;
 	default:
 		break;
 	}
 	return (EJUSTRETURN);
 }
 
 /*
  * bio classification support.
  *
  * g_register_classifier() and g_unregister_classifier()
  * are used to add/remove a classifier from the list.
  * The list is protected using the g_bio_run_down lock,
  * because the classifiers are called in this path.
  *
  * g_io_request() passes bio's that are not already classified
  * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
  * Classifiers can store their result in the two fields
  * bio_classifier1 and bio_classifier2.
  * A classifier that updates one of the fields should
  * return a non-zero value.
  * If no classifier updates the field, g_run_classifiers() sets
  * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
  */
 
 /*
  * Append 'hook' to the list of bio classifiers.  The list is modified
  * under the g_bio_run_down queue lock.  Always returns 0.
  */
 int
 g_register_classifier(struct g_classifier_hook *hook)
 {
 
 	g_bioq_lock(&g_bio_run_down);
 	TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
 	g_bioq_unlock(&g_bio_run_down);
 
 	return (0);
 }
 
 /*
  * Remove 'hook' from the list of bio classifiers if it is registered;
  * otherwise do nothing.  The list is searched and modified under the
  * g_bio_run_down queue lock.
  */
 void
 g_unregister_classifier(struct g_classifier_hook *hook)
 {
 	struct g_classifier_hook *cur;
 
 	g_bioq_lock(&g_bio_run_down);
 	TAILQ_FOREACH(cur, &g_classifier_tailq, link) {
 		if (cur != hook)
 			continue;
 		TAILQ_REMOVE(&g_classifier_tailq, hook, link);
 		break;
 	}
 	g_bioq_unlock(&g_bio_run_down);
 }
 
 /*
  * Run every registered classifier on 'bp', in list order.  If none of them
  * returns non-zero, the bio is marked BIO_NOTCLASSIFIED so that it is not
  * offered to the classifiers again.  g_io_request() calls this with the
  * g_bio_run_down queue lock held.
  */
 static void
 g_run_classifiers(struct bio *bp)
 {
 	struct g_classifier_hook *hook;
 	int classified = 0;
 
+	biotrack(bp, __func__);
+
 	/* Every hook is called; the results are OR-ed together. */
 	TAILQ_FOREACH(hook, &g_classifier_tailq, link)
 		classified |= hook->func(hook->arg, bp);
 
 	if (!classified)
 		bp->bio_classifier1 = BIO_NOTCLASSIFIED;
 }
 
 /*
  * Submit 'bp' to the provider attached to consumer 'cp'.
  *
  * The bio is stamped with its source and destination, classified if
  * classifiers are registered, and accounted in the start counters.  It is
  * then either dispatched directly in the calling thread (checked with
  * g_io_check() and passed to the provider geom's start method) or appended
  * to the g_bio_run_down queue, waking g_wait_down if the queue was empty.
  *
  * NOTE(review): biotrack() is handed bp before the KASSERT(bp != NULL)
  * below; if biotrack() dereferences bp the assertion can never fire --
  * confirm against the biotrack() definition.
  */
 void
 g_io_request(struct bio *bp, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, error, first;
 	uint8_t cmd;
 
+	biotrack(bp, __func__);
+
 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
 #ifdef DIAGNOSTIC
 	KASSERT(bp->bio_driver1 == NULL,
 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_driver2 == NULL,
 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_pflags == 0,
 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
 	/*
 	 * Remember consumer's private fields, so we can detect if they were
 	 * modified by the provider.
 	 */
 	bp->_bio_caller1 = bp->bio_caller1;
 	bp->_bio_caller2 = bp->bio_caller2;
 	bp->_bio_cflags = bp->bio_cflags;
 #endif
 
 	/* Sanity checks on the data pointer and alignment, per command. */
 	cmd = bp->bio_cmd;
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
 		KASSERT(bp->bio_data != NULL,
 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
 	}
 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
 		KASSERT(bp->bio_data == NULL,
 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
 		    bp->bio_cmd));
 	}
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
 		    ("wrong offset %jd for sectorsize %u",
 		    bp->bio_offset, cp->provider->sectorsize));
 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
 		    ("wrong length %jd for sectorsize %u",
 		    bp->bio_length, cp->provider->sectorsize));
 	}
 
 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
 
 	bp->bio_from = cp;
 	bp->bio_to = pp;
 	bp->bio_error = 0;
 	bp->bio_completed = 0;
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 	/*
 	 * Timestamp the request: binuptime() when statistics are being
 	 * collected, getbinuptime() otherwise.
 	 */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&bp->bio_t0);
 	else
 		getbinuptime(&bp->bio_t0);
 
 #ifdef GET_STACK_USAGE
 	/*
 	 * Direct dispatch needs both ends to allow it, a non-GEOM calling
 	 * thread, no pacing in effect, and -- for an unmapped bio going to a
 	 * provider that does not accept unmapped bios -- a thread that is
 	 * allowed to sleep.
 	 */
 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
 	    !g_is_geom_thread(curthread) &&
 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
 	    pace == 0;
 	if (direct) {
 		/* Block direct execution if less than half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	/* Classify the bio once, under the lock that guards the hook list. */
 	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
 		g_bioq_lock(&g_bio_run_down);
 		g_run_classifiers(bp);
 		g_bioq_unlock(&g_bio_run_down);
 	}
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	mtxp = mtx_pool_find(mtxpool_sleep, pp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_start_transaction(pp->stat, &bp->bio_t0);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_start_transaction(cp->stat, &bp->bio_t0);
 	pp->nstart++;
 	cp->nstart++;
 	mtx_unlock(mtxp);
 
 	if (direct) {
 		/*
 		 * A non-negative result from g_io_check() is the final
 		 * status of the bio; a negative one means "go ahead".
 		 */
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
 			    "provider %s returned %d", bp, bp->bio_to->name,
 			    error);
 			g_io_deliver(bp, error);
 			return;
 		}
 		bp->bio_to->geom->start(bp);
 	} else {
 		g_bioq_lock(&g_bio_run_down);
 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
 		bp->bio_flags |= BIO_ONQUEUE;
 		g_bio_run_down.bio_queue_length++;
 		g_bioq_unlock(&g_bio_run_down);
 		/* Pass it on down. */
 		if (first)
 			wakeup(&g_wait_down);
 	}
 }
 
 /*
  * Return a finished (or failed) bio towards the consumer that issued it.
  *
  * bp:    the request being completed; bp->bio_to must be set.
  * error: completion status, stored in bp->bio_error unless it is ENOMEM.
  *
  * A bio without bio_from is finished on the spot through its bio_done
  * callback.  Otherwise bio_bcount/bio_resid are filled in, the statistics
  * and the nend counters are updated, and the bio is either completed
  * inline with biodone() (direct dispatch) or appended to g_bio_run_up.
  * On ENOMEM the bio is reset and resubmitted through g_io_request() and
  * "pace" is set to 1.
  */
 void
 g_io_deliver(struct bio *bp, int error)
 {
 	struct bintime now;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, first;
 
 	/*
 	 * NOTE(review): bp is handed to biotrack() before the NULL assertion
 	 * below; if biotrack() dereferences bp, the KASSERT cannot catch a
 	 * NULL bp first -- confirm the intended order.
 	 */
+	biotrack(bp, __func__);
+
 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
 	pp = bp->bio_to;
 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
 	cp = bp->bio_from;
 	if (cp == NULL) {
 		/* No originating consumer: finish through bio_done right away. */
 		bp->bio_error = error;
 		bp->bio_done(bp);
 		return;
 	}
 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
 #ifdef DIAGNOSTIC
 	/*
 	 * Some classes - GJournal in particular - can modify bio's
 	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
 	 * flag means it's an expected behaviour for that particular geom.
 	 */
 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
 		    ("bio_caller1 used by the provider %s", pp->name));
 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
 		    ("bio_caller2 used by the provider %s", pp->name));
 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
 		    ("bio_cflags used by the provider %s", pp->name));
 	}
 #endif
 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
 	KASSERT(bp->bio_completed <= bp->bio_length,
 	    ("bio_completed can't be greater than bio_length"));
 
 	g_trace(G_T_BIO,
 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 
 	/*
 	 * XXX: next two doesn't belong here
 	 */
 	bp->bio_bcount = bp->bio_length;
 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
 
 #ifdef GET_STACK_USAGE
 	/*
 	 * Direct completion requires the provider to allow direct send, the
 	 * consumer to allow direct receive, and a caller that is not one of
 	 * the GEOM threads.
 	 */
 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
 		 !g_is_geom_thread(curthread);
 	if (direct) {
 		/* Block direct execution if less than half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&now);
 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
 	cp->nend++;
 	pp->nend++;
 	mtx_unlock(mtxp);
 
 	if (error != ENOMEM) {
 		/* Normal completion: record the status and pass the bio up. */
 		bp->bio_error = error;
 		if (direct) {
 			biodone(bp);
 		} else {
 			g_bioq_lock(&g_bio_run_up);
 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
 			bp->bio_flags |= BIO_ONQUEUE;
 			g_bio_run_up.bio_queue_length++;
 			g_bioq_unlock(&g_bio_run_up);
 			/* A wakeup is only issued when the queue was empty. */
 			if (first)
 				wakeup(&g_wait_up);
 		}
 		return;
 	}
 
 	/* ENOMEM: wipe the per-attempt state and send the same bio down again. */
 	if (bootverbose)
 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
 	bp->bio_children = 0;
 	bp->bio_inbed = 0;
 	bp->bio_driver1 = NULL;
 	bp->bio_driver2 = NULL;
 	bp->bio_pflags = 0;
 	g_io_request(bp, cp);
 	pace = 1;
 	return;
 }
 
 /*
  * Counters and one tunable describing transient mappings, exported
  * as children of the _kern_geom sysctl node.
  */
 SYSCTL_DECL(_kern_geom);
 
 /* Read-only: incremented once per g_io_transient_map_bio() call. */
 static long transient_maps;
 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
     &transient_maps, 0,
     "Total count of the transient mapping requests");
 /* Read-write: retry limit; the mapping code treats 0 as "no limit". */
 u_int transient_map_retries = 10;
 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
     &transient_map_retries, 0,
     "Max count of retries used before giving up on creating transient map");
 /* Read-only: mappings abandoned after the retry limit was reached. */
 int transient_map_hard_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
     &transient_map_hard_failures, 0,
     "Failures to establish the transient mapping due to retry attempts "
     "exhausted");
 /* Read-only: failed allocation attempts that were followed by a retry. */
 int transient_map_soft_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
     &transient_map_soft_failures, 0,
     "Count of retried failures to establish the transient mapping");
 /* Read-only: incremented when a mapping is established. */
 int inflight_transient_maps;
 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
     &inflight_transient_maps, 0,
     "Current count of the active transient maps");
 
 /*
  * Give an unmapped bio a temporary kernel mapping.
  *
  * Address space for the pages in bp->bio_ma is taken from transient_arena
  * without sleeping in the allocator.  When the allocation fails the
  * function pauses for hz / 10 ticks and tries again, up to
  * transient_map_retries times (without limit when that tunable is 0).
  *
  * Returns EJUSTRETURN once the bio is mapped (bio_data set,
  * BIO_TRANSIENT_MAPPING set, BIO_UNMAPPED cleared), or EDEADLK when the
  * retries are exhausted.
  */
 static int
 g_io_transient_map_bio(struct bio *bp)
 {
 	vm_offset_t addr;
 	long size;
 	u_int retried;
 
 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
 
 	/* Whole pages needed to cover the data, including the leading offset. */
 	size = round_page(bp->bio_ma_offset + bp->bio_length);
 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
 	addr = 0;
 	retried = 0;
 	atomic_add_long(&transient_maps, 1);
 retry:
 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
 		if (transient_map_retries != 0 &&
 		    retried >= transient_map_retries) {
 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
 			    bp, bp->bio_to->name);
 			atomic_add_int(&transient_map_hard_failures, 1);
 			return (EDEADLK/* XXXKIB */);
 		} else {
 			/*
 			 * Naive attempt to quiesce the I/O to get more
 			 * in-flight requests completed and defragment
 			 * the transient_arena.
 			 */
 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
 			    bp, bp->bio_to->name, retried);
 			pause("g_d_tra", hz / 10);
 			retried++;
 			atomic_add_int(&transient_map_soft_failures, 1);
 			goto retry;
 		}
 	}
 	atomic_add_int(&inflight_transient_maps, 1);
 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
 	/* The data starts bio_ma_offset bytes into the first mapped page. */
 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
 	bp->bio_flags &= ~BIO_UNMAPPED;
 	return (EJUSTRETURN);
 }
 
 /*
  * Body of the "down" thread; the loop never exits.
  *
  * Each iteration takes the first bio off g_bio_run_down, or sleeps on
  * g_wait_down when the queue is empty.  A bio for which g_io_check()
  * returns a non-negative value is completed with that value through
  * g_io_deliver(); otherwise it is handed to the start method of the
  * destination geom with sleeping disallowed.
  *
  * tp: unused.
  */
 void
 g_io_schedule_down(struct thread *tp __unused)
 {
 	struct bio *bp;
 	int error;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_down);
 		bp = g_bioq_first(&g_bio_run_down);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_down going to sleep");
 			/* PDROP: the queue lock is not held when msleep returns. */
 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		CTR0(KTR_GEOM, "g_down has work to do");
 		g_bioq_unlock(&g_bio_run_down);
+		biotrack(bp, __func__);
 		if (pace != 0) {
 			/*
 			 * There has been at least one memory allocation
 			 * failure since the last I/O completed. Pause 1ms to
 			 * give the system a chance to free up memory. We only
 			 * do this once because a large number of allocations
 			 * can fail in the direct dispatch case and there's no
 			 * relationship between the number of these failures and
 			 * the length of the outage. If there's still an outage,
 			 * we'll pause again and again until it's
 			 * resolved. Older versions paused longer and once per
 			 * allocation failure. This was OK for a single threaded
 			 * g_down, but with direct dispatch would lead to max of
 			 * 10 IOPs for minutes at a time when transient memory
 			 * issues prevented allocation for a batch of requests
 			 * from the upper layers.
 			 *
 			 * XXX This pacing is really lame. It needs to be solved
 			 * by other methods. This is OK only because the worst
 			 * case scenario is so rare. In the worst case scenario
 			 * all memory is tied up waiting for I/O to complete
 			 * which can never happen since we can't allocate bios
 			 * for that I/O.
 			 */
 			CTR0(KTR_GEOM, "g_down pacing self");
 			/*
 			 * NOTE(review): min(hz/1000, 1) can never exceed one
 			 * tick, so for hz > 1000 the pause is shorter than the
 			 * 1ms described above; max() may have been intended --
 			 * confirm.
 			 */
 			pause("g_down", min(hz/1000, 1));
 			pace = 0;
 		}
 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
 		    bp->bio_to->name);
 		error = g_io_check(bp);
 		if (error >= 0) {
 			/* The check produced a final status; do not start the bio. */
 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
 			    "%s returned %d", bp, bp->bio_to->name, error);
 			g_io_deliver(bp, error);
 			continue;
 		}
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
 		    bp->bio_length);
 		bp->bio_to->geom->start(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 /*
  * Body of the "up" thread; the loop never exits.
  *
  * Each iteration takes the first bio off g_bio_run_up, or sleeps on
  * g_wait_up when the queue is empty, and finishes it with biodone()
  * while sleeping is disallowed.
  *
  * tp: unused.
  */
 void
 g_io_schedule_up(struct thread *tp __unused)
 {
 	struct bio *bp;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_up);
 		bp = g_bioq_first(&g_bio_run_up);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_up going to sleep");
 			/* PDROP: the queue lock is not held when msleep returns. */
 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		g_bioq_unlock(&g_bio_run_up);
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
 		    "%jd len %ld", bp, bp->bio_to->name,
 		    bp->bio_offset, bp->bio_length);
 		biodone(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 /*
  * Synchronous read helper.
  *
  * Issues a BIO_READ of "length" bytes at "offset" through consumer "cp"
  * into a freshly g_malloc()ed buffer and waits for it with biowait().
  * The biowait() status is stored through "error" when that pointer is
  * not NULL.
  *
  * Returns the buffer when the status is zero; otherwise the buffer is
  * released and NULL is returned.
  */
 void *
 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
 {
 	struct bio *req;
 	void *buf;
 	int status;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	req = g_alloc_bio();
 	buf = g_malloc(length, M_WAITOK);
 	req->bio_cmd = BIO_READ;
 	req->bio_done = NULL;
 	req->bio_offset = offset;
 	req->bio_length = length;
 	req->bio_data = buf;
 
 	g_io_request(req, cp);
 	status = biowait(req, "gread");
 	g_destroy_bio(req);
 
 	if (error != NULL)
 		*error = status;
 	if (status == 0)
 		return (buf);
 
 	/* Failed read: the caller gets no buffer. */
 	g_free(buf);
 	return (NULL);
 }
 
 /*
  * Synchronous write helper.
  *
  * Issues a BIO_WRITE of "length" bytes from "ptr" at "offset" through
  * consumer "cp" and waits for it with biowait().
  *
  * Returns the biowait() status.
  */
 int
 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
 {
 	struct bio *req;
 	int status;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_WRITE;
 	req->bio_done = NULL;
 	req->bio_data = ptr;
 	req->bio_length = length;
 	req->bio_offset = offset;
 
 	g_io_request(req, cp);
 	status = biowait(req, "gwrite");
 	g_destroy_bio(req);
 
 	return (status);
 }
 
 /*
  * Synchronous delete helper.
  *
  * Issues a BIO_DELETE covering "length" bytes at "offset" through
  * consumer "cp", with no data buffer, and waits for it with biowait().
  *
  * Returns the biowait() status.
  */
 int
 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
 {
 	struct bio *req;
 	int status;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_DELETE;
 	req->bio_done = NULL;
 	req->bio_data = NULL;
 	req->bio_length = length;
 	req->bio_offset = offset;
 
 	g_io_request(req, cp);
 	status = biowait(req, "gdelete");
 	g_destroy_bio(req);
 
 	return (status);
 }
 
 /*
  * Print a short description of a bio with printf(), no trailing newline.
  *
  * The output is "<provider>[<command>...]", where the provider name is
  * "[unknown]" when bp->bio_to is NULL.  GETATTR includes the attribute
  * name, ZONE includes the zone sub-command, READ/WRITE/DELETE include
  * offset and length; any other command is reported as UNKNOWN.
  *
  * All command names are string literals, so they are held in pointers
  * to const.
  */
 void
 g_print_bio(struct bio *bp)
 {
 	const char *pname, *cmd, *subcmd;
 
 	if (bp->bio_to != NULL)
 		pname = bp->bio_to->name;
 	else
 		pname = "[unknown]";
 
 	switch (bp->bio_cmd) {
 	case BIO_GETATTR:
 		cmd = "GETATTR";
 		printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
 		return;
 	case BIO_FLUSH:
 		cmd = "FLUSH";
 		printf("%s[%s]", pname, cmd);
 		return;
 	case BIO_ZONE:
 		cmd = "ZONE";
 		switch (bp->bio_zone.zone_cmd) {
 		case DISK_ZONE_OPEN:
 			subcmd = "OPEN";
 			break;
 		case DISK_ZONE_CLOSE:
 			subcmd = "CLOSE";
 			break;
 		case DISK_ZONE_FINISH:
 			subcmd = "FINISH";
 			break;
 		case DISK_ZONE_RWP:
 			subcmd = "RWP";
 			break;
 		case DISK_ZONE_REPORT_ZONES:
 			subcmd = "REPORT ZONES";
 			break;
 		case DISK_ZONE_GET_PARAMS:
 			subcmd = "GET PARAMS";
 			break;
 		default:
 			subcmd = "UNKNOWN";
 			break;
 		}
 		printf("%s[%s,%s]", pname, cmd, subcmd);
 		return;
 	case BIO_READ:
 		cmd = "READ";
 		break;
 	case BIO_WRITE:
 		cmd = "WRITE";
 		break;
 	case BIO_DELETE:
 		cmd = "DELETE";
 		break;
 	default:
 		cmd = "UNKNOWN";
 		printf("%s[%s()]", pname, cmd);
 		return;
 	}
 	/* Only the data-transfer commands (READ/WRITE/DELETE) reach this point. */
 	printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 }
Index: head/sys/geom/geom_subr.c
===================================================================
--- head/sys/geom/geom_subr.c	(revision 308154)
+++ head/sys/geom/geom_subr.c	(revision 308155)
@@ -1,1541 +1,1545 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devicestat.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/errno.h>
 #include <sys/sbuf.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <machine/stdarg.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifdef KDB
 #include <sys/kdb.h>
 #endif
 
 /* List of all loaded GEOM classes. */
 struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes);
 /* Every geom, kept in rank order (see redo_rank()). */
 static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms);
 /* g_wait_up / g_wait_down are the sleep channels of the up and down threads. */
 char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim;
 
 /*
  * Argument block handed to the class-load, retaste and provider-resize
  * event handlers in this file.
  */
 struct g_hh00 {
 	struct g_class		*mp;	/* class to load or retaste */
 	struct g_provider	*pp;	/* provider being resized */
 	off_t			size;	/* new mediasize for the resize */
 	int			error;	/* result read back by a waiting caller */
 	int			post;	/* non-zero: the handler frees this block */
 };
 
 /*
  * Event handler that registers a new class and offers it a chance to
  * taste all preexisting providers.
  *
  * arg:  a struct g_hh00 naming the class in ->mp.  When ->post is set the
  *       block belongs to this handler and is freed here; otherwise the
  *       caller is waiting for the event and reads ->error afterwards
  *       (EEXIST when the class, or one with the same name, is already
  *       registered).
  * flag: event flag; nothing is done for EV_CANCEL.
  *
  * The argument block is consumed before any early return, so a posted
  * block is released even when the event is cancelled or the system is
  * shutting down.
  */
 static void
 g_load_class(void *arg, int flag)
 {
 	struct g_hh00 *hh;
 	struct g_class *mp2, *mp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	hh = arg;
 	mp = hh->mp;
 	hh->error = 0;
 	if (hh->post) {
 		g_free(hh);
 		hh = NULL;
 	}
 
 	if (flag == EV_CANCEL)	/* XXX: can't happen ? */
 		return;
 	if (g_shutdown)
 		return;
 
 	g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name);
 	KASSERT(mp->name != NULL && *mp->name != '\0',
 	    ("GEOM class has no name"));
 	/* Refuse a class that is already listed, by identity or by name. */
 	LIST_FOREACH(mp2, &g_classes, class) {
 		if (mp2 == mp) {
 			printf("The GEOM class %s is already loaded.\n",
 			    mp2->name);
 			if (hh != NULL)
 				hh->error = EEXIST;
 			return;
 		} else if (strcmp(mp2->name, mp->name) == 0) {
 			printf("A GEOM class %s is already loaded.\n",
 			    mp2->name);
 			if (hh != NULL)
 				hh->error = EEXIST;
 			return;
 		}
 	}
 
 	LIST_INIT(&mp->geom);
 	LIST_INSERT_HEAD(&g_classes, mp, class);
 	if (mp->init != NULL)
 		mp->init(mp);
 	if (mp->taste == NULL)
 		return;
 	/* Offer every provider of every other class to the new class. */
 	LIST_FOREACH(mp2, &g_classes, class) {
 		if (mp == mp2)
 			continue;
 		LIST_FOREACH(gp, &mp2->geom, geom) {
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				mp->taste(mp, pp, 0);
 				g_topology_assert();
 			}
 		}
 	}
 }
 
 /*
  * Remove a class from the system.
  *
  * Takes the topology lock for the duration of the call.  Returns EBUSY
  * when any provider or consumer of the class has a non-zero access count,
  * EOPNOTSUPP when geoms exist but the class has no destroy_geom method,
  * the error of a failing destroy_geom call, or 0 after the class has been
  * unlinked from g_classes.
  */
 static int
 g_unload_class(struct g_class *mp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_lock();
 	g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name);
 retry:
 	G_VALID_CLASS(mp);
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		/* We refuse to unload if anything is open */
 		LIST_FOREACH(pp, &gp->provider, provider)
 			if (pp->acr || pp->acw || pp->ace) {
 				g_topology_unlock();
 				return (EBUSY);
 			}
 		LIST_FOREACH(cp, &gp->consumer, consumer)
 			if (cp->acr || cp->acw || cp->ace) {
 				g_topology_unlock();
 				return (EBUSY);
 			}
 		/* If the geom is withering, wait for it to finish. */
 		if (gp->flags & G_GEOM_WITHER) {
 			g_topology_sleep(mp, 1);
 			goto retry;
 		}
 	}
 
 	/*
 	 * We allow unloading if we have no geoms, or a class
 	 * method we can use to get rid of them.
 	 */
 	if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) {
 		g_topology_unlock();
 		return (EOPNOTSUPP);
 	}
 
 	/* Bar new entries */
 	mp->taste = NULL;
 	mp->config = NULL;
 
 	/*
 	 * NOTE(review): destroy_geom() is called while walking the list with
 	 * plain LIST_FOREACH; this is only safe if the method does not unlink
 	 * and free gp immediately (e.g. it only withers it) -- confirm for
 	 * all classes.
 	 */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		error = mp->destroy_geom(NULL, mp, gp);
 		if (error != 0) {
 			g_topology_unlock();
 			return (error);
 		}
 	}
 	/* Wait for withering to finish. */
 	for (;;) {
 		gp = LIST_FIRST(&mp->geom);
 		if (gp == NULL)
 			break;
 		KASSERT(gp->flags & G_GEOM_WITHER,
 		   ("Non-withering geom in class %s", mp->name));
 		g_topology_sleep(mp, 1);
 	}
 	G_VALID_CLASS(mp);
 	if (mp->fini != NULL)
 		mp->fini(mp);
 	LIST_REMOVE(mp, class);
 	g_topology_unlock();
 
 	return (0);
 }
 
 /*
  * Module event handler for GEOM classes.
  *
  * mod:  unused here.
  * type: MOD_LOAD or MOD_UNLOAD; any other value yields EOPNOTSUPP.
  * data: the struct g_class being loaded or unloaded.
  *
  * Returns EINVAL when the class was built against a different G_VERSION,
  * otherwise the result of the load or unload.
  */
 int
 g_modevent(module_t mod, int type, void *data)
 {
 	struct g_hh00 *hh;
 	int error;
 	static int g_ignition;	/* set once g_init() has been called */
 	struct g_class *mp;
 
 	mp = data;
 	if (mp->version != G_VERSION) {
 		printf("GEOM class %s has Wrong version %x\n",
 		    mp->name, mp->version);
 		return (EINVAL);
 	}
 	if (!g_ignition) {
 		g_ignition++;
 		g_init();
 	}
 	error = EOPNOTSUPP;
 	switch (type) {
 	case MOD_LOAD:
 		g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name);
 		hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
 		hh->mp = mp;
 		/*
 		 * Once the system is not cold, MOD_LOAD calls will be
 		 * from the userland and the g_event thread will be able
 		 * to acknowledge their completion.
 		 */
 		if (cold) {
 			/* Fire and forget: g_load_class() frees hh. */
 			hh->post = 1;
 			error = g_post_event(g_load_class, hh, M_WAITOK, NULL);
 		} else {
 			error = g_waitfor_event(g_load_class, hh, M_WAITOK,
 			    NULL);
 			if (error == 0)
 				error = hh->error;
 			g_free(hh);
 		}
 		break;
 	case MOD_UNLOAD:
 		g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name);
 		error = g_unload_class(mp);
 		if (error == 0) {
 			KASSERT(LIST_EMPTY(&mp->geom),
 			    ("Unloaded class (%s) still has geom", mp->name));
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Event handler that lets a class taste every unopened provider again.
  *
  * arg:  a struct g_hh00 naming the class in ->mp.  When ->post is set the
  *       block belongs to this handler and is freed here; otherwise the
  *       caller is waiting for the event and frees it itself.
  * flag: event flag; nothing is done for EV_CANCEL.
  *
  * The argument block is consumed before any early return, so a posted
  * block is released even when the event is cancelled, the system is
  * shutting down or tasting is disabled.
  */
 static void
 g_retaste_event(void *arg, int flag)
 {
 	struct g_class *mp, *mp2;
 	struct g_geom *gp;
 	struct g_hh00 *hh;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	hh = arg;
 	mp = hh->mp;
 	hh->error = 0;
 	if (hh->post) {
 		g_free(hh);
 		hh = NULL;
 	}
 
 	if (flag == EV_CANCEL)  /* XXX: can't happen ? */
 		return;
 	if (g_shutdown || g_notaste)
 		return;
 
 	g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name);
 
 	LIST_FOREACH(mp2, &g_classes, class) {
 		LIST_FOREACH(gp, &mp2->geom, geom) {
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				/* Providers that are open are left alone. */
 				if (pp->acr || pp->acw || pp->ace)
 					continue;
 				/*
 				 * Look for a live consumer of this class on
 				 * the provider; if there is one, orphan it and
 				 * wither its geom before tasting again.
 				 */
 				LIST_FOREACH(cp, &pp->consumers, consumers) {
 					if (cp->geom->class == mp &&
 					    (cp->flags & G_CF_ORPHAN) == 0)
 						break;
 				}
 				if (cp != NULL) {
 					cp->flags |= G_CF_ORPHAN;
 					g_wither_geom(cp->geom, ENXIO);
 				}
 				mp->taste(mp, pp, 0);
 				g_topology_assert();
 			}
 		}
 	}
 }
 
 int
 g_retaste(struct g_class *mp)
 {
 	struct g_hh00 *hh;
 	int error;
 
 	if (mp->taste == NULL)
 		return (EINVAL);
 
 	hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
 	hh->mp = mp;
 
 	if (cold) {
 		hh->post = 1;
 		error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL);
 	} else {
 		error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL);
 		if (error == 0)
 			error = hh->error;
 		g_free(hh);
 	}
 
 	return (error);
 }
 
 /*
  * Create a geom in class "mp" whose name is built printf-style from
  * "fmt" and the following arguments.
  *
  * The new geom gets rank 1, is put at the head of both the class's geom
  * list and the global "geoms" queue, and inherits its method pointers
  * from the class.  Returns the new geom.
  */
 struct g_geom *
 g_new_geomf(struct g_class *mp, const char *fmt, ...)
 {
 	struct g_geom *gp;
 	va_list ap;
 	struct sbuf *sb;
 
 	g_topology_assert();
 	G_VALID_CLASS(mp);
 	/* Format the name into an sbuf so its length is known. */
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO);
 	gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
 	gp->class = mp;
 	gp->rank = 1;
 	LIST_INIT(&gp->consumer);
 	LIST_INIT(&gp->provider);
 	LIST_INSERT_HEAD(&mp->geom, gp, geom);
 	TAILQ_INSERT_HEAD(&geoms, gp, geoms);
 	strcpy(gp->name, sbuf_data(sb));
 	sbuf_delete(sb);
 	/* Fill in defaults from class */
 	gp->start = mp->start;
 	gp->spoiled = mp->spoiled;
 	gp->attrchanged = mp->attrchanged;
 	gp->providergone = mp->providergone;
 	gp->dumpconf = mp->dumpconf;
 	gp->access = mp->access;
 	gp->orphan = mp->orphan;
 	gp->ioctl = mp->ioctl;
 	gp->resize = mp->resize;
 	return (gp);
 }
 
 /*
  * Free a geom that has no consumers and no providers left.
  *
  * Pending events referring to the geom are cancelled, the geom is
  * unlinked from its class list and from the global "geoms" queue, and
  * its name and the structure itself are freed.
  */
 void
 g_destroy_geom(struct g_geom *gp)
 {
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 	g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name);
 	KASSERT(LIST_EMPTY(&gp->consumer),
 	    ("g_destroy_geom(%s) with consumer(s) [%p]",
 	    gp->name, LIST_FIRST(&gp->consumer)));
 	KASSERT(LIST_EMPTY(&gp->provider),
 	    ("g_destroy_geom(%s) with provider(s) [%p]",
 	    gp->name, LIST_FIRST(&gp->provider)));
 	g_cancel_event(gp);
 	LIST_REMOVE(gp, geom);
 	TAILQ_REMOVE(&geoms, gp, geoms);
 	g_free(gp->name);
 	g_free(gp);
 }
 
 /*
  * This function is called (repeatedly) until the geom has withered away.
  */
 void
 g_wither_geom(struct g_geom *gp, int error)
 {
 	struct g_provider *pp;
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 	g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name);
 	if (!(gp->flags & G_GEOM_WITHER)) {
 		gp->flags |= G_GEOM_WITHER;
 		LIST_FOREACH(pp, &gp->provider, provider)
 			if (!(pp->flags & G_PF_ORPHAN))
 				g_orphan_provider(pp, error);
 	}
 	g_do_wither();
 }
 
 /*
  * Mark a single provider as withering and, unless it has already been
  * orphaned, orphan it with "error".
  */
 void
 g_wither_provider(struct g_provider *pp, int error)
 {
 
 	pp->flags |= G_PF_WITHER;
 	if ((pp->flags & G_PF_ORPHAN) != 0)
 		return;
 	g_orphan_provider(pp, error);
 }
 
 /*
  * This function is called (repeatedly) until the has withered away.
  */
 void
 g_wither_geom_close(struct g_geom *gp, int error)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 	g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name);
 	LIST_FOREACH(cp, &gp->consumer, consumer)
 		if (cp->acr || cp->acw || cp->ace)
 			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_wither_geom(gp, error);
 }
 
 /*
  * This function is called (repeatedly) until we can't wash away more
  * withered bits at present.
  *
  * One pass over every geom of every class:
  *  - withering providers without consumers are destroyed;
  *  - for a withering geom, every provider without consumers is
  *    destroyed, every consumer with zero access counts is detached (if
  *    attached) and destroyed, and the geom itself is destroyed once it
  *    has neither providers nor consumers.
  *
  * Takes no arguments; the definition uses a (void) prototype.
  */
 void
 g_wither_washer(void)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_provider *pp, *pp2;
 	struct g_consumer *cp, *cp2;
 
 	g_topology_assert();
 	LIST_FOREACH(mp, &g_classes, class) {
 		/* The _SAFE iterators are needed: entries are removed below. */
 		LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 			LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
 				if (!(pp->flags & G_PF_WITHER))
 					continue;
 				if (LIST_EMPTY(&pp->consumers))
 					g_destroy_provider(pp);
 			}
 			if (!(gp->flags & G_GEOM_WITHER))
 				continue;
 			LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
 				if (LIST_EMPTY(&pp->consumers))
 					g_destroy_provider(pp);
 			}
 			LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) {
 				if (cp->acr || cp->acw || cp->ace)
 					continue;
 				if (cp->provider != NULL)
 					g_detach(cp);
 				g_destroy_consumer(cp);
 			}
 			if (LIST_EMPTY(&gp->provider) &&
 			    LIST_EMPTY(&gp->consumer))
 				g_destroy_geom(gp);
 		}
 	}
 }
 
 /*
  * Create a consumer on geom "gp".
  *
  * The geom must not be withering and must have an orphan method.  The
  * consumer gets its own devstat entry and is put at the head of the
  * geom's consumer list.  Returns the new, unattached consumer.
  */
 struct g_consumer *
 g_new_consumer(struct g_geom *gp)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 	KASSERT(!(gp->flags & G_GEOM_WITHER),
 	    ("g_new_consumer on WITHERing geom(%s) (class %s)",
 	    gp->name, gp->class->name));
 	KASSERT(gp->orphan != NULL,
 	    ("g_new_consumer on geom(%s) (class %s) without orphan",
 	    gp->name, gp->class->name));
 
 	cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO);
 	cp->geom = gp;
 	cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED,
 	    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	LIST_INSERT_HEAD(&gp->consumer, cp, consumer);
 	return(cp);
 }
 
 /*
  * Free a consumer that is detached and holds no access counts.
  *
  * Pending events referring to the consumer are cancelled, it is unlinked
  * from its geom, its devstat entry is removed and the structure is
  * freed.  If the owning geom is withering, g_do_wither() is called.
  */
 void
 g_destroy_consumer(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 	G_VALID_CONSUMER(cp);
 	g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp);
 	KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached"));
 	KASSERT (cp->acr == 0, ("g_destroy_consumer with acr"));
 	KASSERT (cp->acw == 0, ("g_destroy_consumer with acw"));
 	KASSERT (cp->ace == 0, ("g_destroy_consumer with ace"));
 	g_cancel_event(cp);
 	/* Remember the geom: cp is gone by the time the flag is checked. */
 	gp = cp->geom;
 	LIST_REMOVE(cp, consumer);
 	devstat_remove_entry(cp->stat);
 	g_free(cp);
 	if (gp->flags & G_GEOM_WITHER)
 		g_do_wither();
 }
 
 /*
  * Event handler run for a newly created provider.
  *
  * arg:  the provider.
  * flag: event flag; nothing is done for EV_CANCEL.
  *
  * Consumers already attached to the provider that are not orphaned and
  * whose geom has an attrchanged method are told that "GEOM::media"
  * changed.  Then, unless tasting is disabled, the provider is offered to
  * every class that has a taste method and no live consumer on it yet.
  */
 static void
 g_new_provider_event(void *arg, int flag)
 {
 	struct g_class *mp;
 	struct g_provider *pp;
 	struct g_consumer *cp, *next_cp;
 
 	g_topology_assert();
 	if (flag == EV_CANCEL)
 		return;
 	if (g_shutdown)
 		return;
 	pp = arg;
 	G_VALID_PROVIDER(pp);
 	KASSERT(!(pp->flags & G_PF_WITHER),
 	    ("g_new_provider_event but withered"));
 	LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) {
 		if ((cp->flags & G_CF_ORPHAN) == 0 &&
 		    cp->geom->attrchanged != NULL)
 			cp->geom->attrchanged(cp, "GEOM::media");
 	}
 	if (g_notaste)
 		return;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp->taste == NULL)
 			continue;
 		/* Skip classes that already have a live consumer attached. */
 		LIST_FOREACH(cp, &pp->consumers, consumers)
 			if (cp->geom->class == mp &&
 			    (cp->flags & G_CF_ORPHAN) == 0)
 				break;
 		if (cp != NULL)
 			continue;
 		mp->taste(mp, pp, 0);
 		g_topology_assert();
 	}
 }
 
 
 /*
  * Create a provider on geom "gp" whose name is built printf-style from
  * "fmt" and the following arguments.
  *
  * The geom must have access and start methods and must not be withering.
  * The provider starts out with error ENXIO, gets its own devstat entry,
  * is put at the head of the geom's provider list, and a
  * g_new_provider_event is posted for it.  Returns the new provider.
  */
 struct g_provider *
 g_new_providerf(struct g_geom *gp, const char *fmt, ...)
 {
 	struct g_provider *pp;
 	struct sbuf *sb;
 	va_list ap;
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 	KASSERT(gp->access != NULL,
 	    ("new provider on geom(%s) without ->access (class %s)",
 	    gp->name, gp->class->name));
 	KASSERT(gp->start != NULL,
 	    ("new provider on geom(%s) without ->start (class %s)",
 	    gp->name, gp->class->name));
 	KASSERT(!(gp->flags & G_GEOM_WITHER),
 	    ("new provider on WITHERing geom(%s) (class %s)",
 	    gp->name, gp->class->name));
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	/* One allocation: the name is stored right behind the structure. */
 	pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
 	pp->name = (char *)(pp + 1);
 	strcpy(pp->name, sbuf_data(sb));
 	sbuf_delete(sb);
 	LIST_INIT(&pp->consumers);
 	pp->error = ENXIO;
 	pp->geom = gp;
 	pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED,
 	    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	LIST_INSERT_HEAD(&gp->provider, pp, provider);
 	g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL);
 	return (pp);
 }
 
 /*
  * Store "error" in the provider's error field.  No validity check is
  * made on pp because the topology lock may not be held here.
  */
 void
 g_error_provider(struct g_provider *pp, int error)
 {
 
 	/* G_VALID_PROVIDER(pp);  We may not have g_topology */
 	pp->error = error;
 }
 
 /*
  * Event handler that applies a new mediasize to a provider.
  *
  * arg:  a struct g_hh00 carrying the provider (->pp) and the new size
  *       (->size).  The block always belongs to this handler, so it is
  *       unpacked and freed before anything else, including the shutdown
  *       early return.
  * flag: unused.
  *
  * When the provider shrinks, consumers whose geom has no resize method
  * are orphaned first.  After the size is stored, the resize method of
  * every attached, non-withering geom that has one is called.  Finally the
  * provider is offered to every class that has a taste method and no live
  * consumer on it.
  */
 static void
 g_resize_provider_event(void *arg, int flag)
 {
 	struct g_hh00 *hh;
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp, *cp2;
 	off_t size;
 
 	g_topology_assert();
 
 	hh = arg;
 	pp = hh->pp;
 	size = hh->size;
 	g_free(hh);
 
 	if (g_shutdown)
 		return;
 
 	G_VALID_PROVIDER(pp);
 	KASSERT(!(pp->flags & G_PF_WITHER),
 	    ("g_resize_provider_event but withered"));
 	g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp);
 
 	/* Shrinking: consumers that cannot handle a resize are orphaned. */
 	LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
 		gp = cp->geom;
 		if (gp->resize == NULL && size < pp->mediasize) {
 			cp->flags |= G_CF_ORPHAN;
 			cp->geom->orphan(cp);
 		}
 	}
 
 	pp->mediasize = size;
 	
 	LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
 		gp = cp->geom;
 		if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL)
 			gp->resize(cp);
 	}
 
 	/*
 	 * After resizing, the previously invalid GEOM class metadata
 	 * might become valid.  This means we should retaste.
 	 */
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp->taste == NULL)
 			continue;
 		LIST_FOREACH(cp, &pp->consumers, consumers)
 			if (cp->geom->class == mp &&
 			    (cp->flags & G_CF_ORPHAN) == 0)
 				break;
 		if (cp != NULL)
 			continue;
 		mp->taste(mp, pp, 0);
 		g_topology_assert();
 	}
 }
 
 /*
  * Request a change of a provider's mediasize.
  *
  * Nothing happens for a withering provider or when "size" equals the
  * current mediasize.  Otherwise a g_resize_provider_event carrying the
  * provider and the new size is posted; the size itself is changed by
  * that event, not here.
  */
 void
 g_resize_provider(struct g_provider *pp, off_t size)
 {
 	struct g_hh00 *req;
 
 	G_VALID_PROVIDER(pp);
 	if ((pp->flags & G_PF_WITHER) != 0 || size == pp->mediasize)
 		return;
 
 	req = g_malloc(sizeof *req, M_WAITOK | M_ZERO);
 	req->size = size;
 	req->pp = pp;
 	g_post_event(g_resize_provider_event, req, M_WAITOK, NULL);
 }
 
 /* Fallback definition of the device directory prefix if none is in scope. */
 #ifndef	_PATH_DEV
 #define	_PATH_DEV	"/dev/"
 #endif
 
 struct g_provider *
 g_provider_by_name(char const *arg)
 {
 	struct g_class *cp;
 	struct g_geom *gp;
 	struct g_provider *pp, *wpp;
 
 	if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
 		arg += sizeof(_PATH_DEV) - 1;
 
 	wpp = NULL;
 	LIST_FOREACH(cp, &g_classes, class) {
 		LIST_FOREACH(gp, &cp->geom, geom) {
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				if (strcmp(arg, pp->name) != 0)
 					continue;
 				if ((gp->flags & G_GEOM_WITHER) == 0 &&
 				    (pp->flags & G_PF_WITHER) == 0)
 					return (pp);
 				else
 					wpp = pp;
 			}
 		}
 	}
 
 	return (wpp);
 }
 
 /*
  * Free a provider that has no consumers attached and no access counts.
  *
  * Pending events referring to the provider are cancelled, it is unlinked
  * from its geom, its devstat entry is removed, the geom's providergone
  * callback (if any) is invoked, and the structure is freed.  If the
  * owning geom is withering, g_do_wither() is called.
  */
 void
 g_destroy_provider(struct g_provider *pp)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 	G_VALID_PROVIDER(pp);
 	KASSERT(LIST_EMPTY(&pp->consumers),
 	    ("g_destroy_provider but attached"));
 	KASSERT (pp->acr == 0, ("g_destroy_provider with acr"));
 	KASSERT (pp->acw == 0, ("g_destroy_provider with acw"));
 	KASSERT (pp->ace == 0, ("g_destroy_provider with ace"));
 	g_cancel_event(pp);
 	LIST_REMOVE(pp, provider);
 	/* Remember the geom: pp is gone by the time the flag is checked. */
 	gp = pp->geom;
 	devstat_remove_entry(pp->stat);
 	/*
 	 * If a callback was provided, send notification that the provider
 	 * is now gone.
 	 */
 	if (gp->providergone != NULL)
 		gp->providergone(pp);
 
 	g_free(pp);
 	if ((gp->flags & G_GEOM_WITHER))
 		g_do_wither();
 }
 
 /*
  * We keep the "geoms" list sorted by topological order (== increasing
  * numerical rank) at all times.
  * When an attach is done, the attaching geom's rank is invalidated
  * and it is moved to the tail of the list.
  * All geoms later in the sequence have their ranks reevaluated in
  * sequence.  If we cannot assign rank to a geom because its
  * prerequisites do not have rank, we move that element to the tail
  * of the sequence with invalid rank as well.
  * At some point we encounter our original geom and if we still fail
  * to assign it a rank, there must be a loop and we fail back to
  * g_attach() which detaches again and calls redo_rank again
  * to fix up the damage.
  * It would be much simpler code wise to do it recursively, but we
  * can't risk that on the kernel stack.
  *
  * Returns 0 on success and ELOOP when the original geom cannot be
  * ranked.  A rank of 0 means "no valid rank".
  */
 
 static int
 redo_rank(struct g_geom *gp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp1, *gp2;
 	int n, m;
 
 	g_topology_assert();
 	G_VALID_GEOM(gp);
 
 	/* Invalidate this geoms rank and move it to the tail */
 	gp1 = TAILQ_NEXT(gp, geoms);
 	if (gp1 != NULL) {
 		gp->rank = 0;
 		TAILQ_REMOVE(&geoms, gp, geoms);
 		TAILQ_INSERT_TAIL(&geoms, gp, geoms);
 	} else {
 		/* Already last: re-ranking starts (and ends) with gp itself. */
 		gp1 = gp;
 	}
 
 	/* re-rank the rest of the sequence */
 	for (; gp1 != NULL; gp1 = gp2) {
 		gp1->rank = 0;
 		/*
 		 * m becomes one more than the highest rank among the geoms
 		 * this one consumes from, or 0 if any of them is unranked.
 		 */
 		m = 1;
 		LIST_FOREACH(cp, &gp1->consumer, consumer) {
 			if (cp->provider == NULL)
 				continue;
 			n = cp->provider->geom->rank;
 			if (n == 0) {
 				m = 0;
 				break;
 			} else if (n >= m)
 				m = n + 1;
 		}
 		gp1->rank = m;
 		/* Fetch the successor before gp1 is possibly moved to the tail. */
 		gp2 = TAILQ_NEXT(gp1, geoms);
 
 		/* got a rank, moving on */
 		if (m != 0)
 			continue;
 
 		/* no rank to original geom means loop */
 		if (gp == gp1) 
 			return (ELOOP);
 
 		/* no rank, put it at the end move on */
 		TAILQ_REMOVE(&geoms, gp1, geoms);
 		TAILQ_INSERT_TAIL(&geoms, gp1, geoms);
 	}
 	return (0);
 }
 
 /*
  * Attach consumer cp to provider pp and re-rank the consumer's geom.
  * If ranking fails the attachment is undone, the geom is re-ranked once
  * more and the ranking error is returned.  Returns 0 on success.
  */
 int
 g_attach(struct g_consumer *cp, struct g_provider *pp)
 {
 	int rc;
 
 	g_topology_assert();
 	G_VALID_CONSUMER(cp);
 	G_VALID_PROVIDER(pp);
 	g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp);
 	KASSERT(cp->provider == NULL, ("attach but attached"));
 
 	cp->provider = pp;
 	LIST_INSERT_HEAD(&pp->consumers, cp, consumers);
 
 	rc = redo_rank(cp->geom);
 	if (rc == 0)
 		return (0);
 
 	/* Ranking failed: back the attachment out and repair the ranks. */
 	LIST_REMOVE(cp, consumers);
 	cp->provider = NULL;
 	redo_rank(cp->geom);
 	return (rc);
 }
 
 /*
  * Detach consumer cp from its provider.  The consumer must be attached,
  * hold no access counts and have no active requests.  Wither processing
  * is kicked when the consumer's geom, the provider's geom or the provider
  * itself is withering; the consumer's geom is then re-ranked.
  */
 void
 g_detach(struct g_consumer *cp)
 {
 	struct g_provider *prov;
 	int withering;
 
 	g_topology_assert();
 	G_VALID_CONSUMER(cp);
 	g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp);
 	KASSERT(cp->provider != NULL, ("detach but not attached"));
 	KASSERT(cp->acr == 0, ("detach but nonzero acr"));
 	KASSERT(cp->acw == 0, ("detach but nonzero acw"));
 	KASSERT(cp->ace == 0, ("detach but nonzero ace"));
 	KASSERT(cp->nstart == cp->nend,
 	    ("detach with active requests"));
 
 	prov = cp->provider;
 	LIST_REMOVE(cp, consumers);
 	cp->provider = NULL;
 
 	withering = (cp->geom->flags & G_GEOM_WITHER) != 0 ||
 	    (prov->geom->flags & G_GEOM_WITHER) != 0 ||
 	    (prov->flags & G_PF_WITHER) != 0;
 	if (withering)
 		g_do_wither();
 	redo_rank(cp->geom);
 }
 
 /*
  * g_access()
  *
  * Access-check with delta values.  The question asked is "can consumer
  * "cp" change the access counters by the relative amounts dc[rwe]?"
  */
 
 int
 g_access(struct g_consumer *cp, int dcr, int dcw, int dce)
 {
 	struct g_provider *pp;
 	int pr,pw,pe;
 	int error;
 
 	g_topology_assert();
 	G_VALID_CONSUMER(cp);
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("access but not attached"));
 	G_VALID_PROVIDER(pp);
 
 	g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)",
 	    cp, pp->name, dcr, dcw, dce);
 
 	KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr"));
 	KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw"));
 	KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace"));
 	KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request"));
 	KASSERT(pp->geom->access != NULL, ("NULL geom->access"));
 
 	/*
 	 * If our class cares about being spoiled, and we have been, we
 	 * are probably just ahead of the event telling us that.  Fail
 	 * now rather than having to unravel this later.
 	 */
 	if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) &&
 	    (dcr > 0 || dcw > 0 || dce > 0))
 		return (ENXIO);
 
 	/*
 	 * Figure out what counts the provider would have had, if this
 	 * consumer had (r0w0e0) at this time.
 	 */
 	pr = pp->acr - cp->acr;
 	pw = pp->acw - cp->acw;
 	pe = pp->ace - cp->ace;
 
 	g_trace(G_T_ACCESS,
     "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)",
 	    dcr, dcw, dce,
 	    cp->acr, cp->acw, cp->ace,
 	    pp->acr, pp->acw, pp->ace,
 	    pp, pp->name);
 
 	/* If foot-shooting is enabled, any open on rank#1 is OK */
 	if ((g_debugflags & 16) && pp->geom->rank == 1)
 		;
 	/* If we try exclusive but already write: fail */
 	else if (dce > 0 && pw > 0)
 		return (EPERM);
 	/* If we try write but already exclusive: fail */
 	else if (dcw > 0 && pe > 0)
 		return (EPERM);
 	/* If we try to open more but provider is error'ed: fail */
 	else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) {
 		printf("%s(%d): provider %s has error\n",
 		       __func__, __LINE__, pp->name);
 		return (pp->error);
 	}
 
 	/* Ok then... */
 
 	error = pp->geom->access(pp, dcr, dcw, dce);
 	KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0,
 	    ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed "
 	    "closing ->access()", pp->geom->class->name, pp->name, dcr, dcw,
 	    dce, error));
 	if (!error) {
 		/*
 		 * If we open first write, spoil any partner consumers.
 		 * If we close last write and provider is not errored,
 		 * trigger re-taste.
 		 */
 		if (pp->acw == 0 && dcw != 0)
 			g_spoil(pp, cp);
 		else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 &&
 		    !(pp->geom->flags & G_GEOM_WITHER))
 			g_post_event(g_new_provider_event, pp, M_WAITOK, 
 			    pp, NULL);
 
 		pp->acr += dcr;
 		pp->acw += dcw;
 		pp->ace += dce;
 		cp->acr += dcr;
 		cp->acw += dcw;
 		cp->ace += dce;
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)
 			KASSERT(pp->sectorsize > 0,
 			    ("Provider %s lacks sectorsize", pp->name));
 		if ((cp->geom->flags & G_GEOM_WITHER) &&
 		    cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
 			g_do_wither();
 	}
 	return (error);
 }
 
 /*
  * Answer a getattr request for "attribute" with an int value.
  * Thin wrapper that hands the value and its size to g_handleattr().
  */
 int
 g_handleattr_int(struct bio *bp, const char *attribute, int val)
 {
 	return (g_handleattr(bp, attribute, &val, sizeof(val)));
 }
 
 /*
  * Answer a getattr request for "attribute" with a uint16_t value.
  * Thin wrapper that hands the value and its size to g_handleattr().
  */
 int
 g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
 {
 	return (g_handleattr(bp, attribute, &val, sizeof(val)));
 }
 
 /*
  * Answer a getattr request for "attribute" with an off_t value.
  * Thin wrapper that hands the value and its size to g_handleattr().
  */
 int
 g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
 {
 	return (g_handleattr(bp, attribute, &val, sizeof(val)));
 }
 
 /*
  * Answer a getattr request for "attribute" with a NUL-terminated string.
  * A length of 0 is passed, which selects the string-copy path of
  * g_handleattr().
  */
 int
 g_handleattr_str(struct bio *bp, const char *attribute, const char *str)
 {
 	return (g_handleattr(bp, attribute, str, 0));
 }
 
 /*
  * If bp asks for "attribute", fill in the answer and complete the bio.
  *
  * len == 0 means val is a string: the bio's data area is zeroed and the
  * string copied in, failing with EFAULT if it does not fit.  Otherwise
  * val is a fixed-size object that must match bio_length exactly, else
  * the bio fails with EFAULT.
  *
  * Returns 0 when the attribute name does not match (bio untouched), and
  * 1 when the bio has been delivered, successfully or not.
  */
 int
 g_handleattr(struct bio *bp, const char *attribute, const void *val, int len)
 {
 	int error;
 
 	if (strcmp(bp->bio_attribute, attribute) != 0)
 		return (0);
 
 	error = 0;
 	if (len == 0) {
 		bzero(bp->bio_data, bp->bio_length);
 		if (strlcpy(bp->bio_data, val, bp->bio_length) >=
 		    bp->bio_length) {
 			printf("%s: %s bio_length %jd len %zu -> EFAULT\n",
 			    __func__, bp->bio_to->name,
 			    (intmax_t)bp->bio_length, strlen(val));
 			error = EFAULT;
 		}
 	} else if (bp->bio_length != len) {
 		printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__,
 		    bp->bio_to->name, (intmax_t)bp->bio_length, len);
 		error = EFAULT;
 	} else {
 		bcopy(val, bp->bio_data, len);
 	}
 
 	if (error == 0)
 		bp->bio_completed = bp->bio_length;
 	g_io_deliver(bp, error);
 	return (1);
 }
 
 /*
  * Standard access method: validates the provider and accepts any
  * requested change of the read/write/exclusive counts.
  */
 int
 g_std_access(struct g_provider *pp,
 	int dr __unused, int dw __unused, int de __unused)
 {
 
 	g_topology_assert();
 	G_VALID_PROVIDER(pp);
 	return (0);
 }
 
 /*
  * Standard completion routine for a child bio.  The child's error (the
  * first one seen wins) and completed byte count are folded into its
  * parent, the child is destroyed, and the parent is delivered once all
  * of its children have come back.
  */
 void
 g_std_done(struct bio *bp)
 {
 	struct bio *parent;
 
 	parent = bp->bio_parent;
 	if (parent->bio_error == 0)
 		parent->bio_error = bp->bio_error;
 	parent->bio_completed += bp->bio_completed;
 	g_destroy_bio(bp);
 
 	parent->bio_inbed++;
 	if (parent->bio_inbed == parent->bio_children)
 		g_io_deliver(parent, parent->bio_error);
 }
 
 /* XXX: maybe this is only g_slice_spoiled */
 
 void
 g_std_spoiled(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 	G_VALID_CONSUMER(cp);
 	g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp);
 	cp->flags |= G_CF_ORPHAN;
 	g_detach(cp);
 	gp = cp->geom;
 	LIST_FOREACH(pp, &gp->provider, provider)
 		g_orphan_provider(pp, ENXIO);
 	g_destroy_consumer(cp);
 	if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer))
 		g_destroy_geom(gp);
 	else
 		gp->flags |= G_GEOM_WITHER;
 }
 
 /*
  * Spoiling happens when a provider is opened for writing, but consumers
  * which are configured by in-band data are attached (slicers for instance).
  * Since the write might potentially change the in-band data, such consumers
  * need to re-evaluate their existence after the writing session closes.
  * We do this by (offering to) tear them down when the open for write happens
  * in return for a re-taste when it closes again.
  * Together with the fact that such consumers grab an 'e' bit whenever they
  * are open, regardless of mode, this ends up DTRT.
  */
 
 /*
  * Event handler: deliver the "spoiled" notification to every consumer of
  * provider arg that carries G_CF_SPOILED.  The flag is cleared first; the
  * geom's spoiled method is called only when one is set.  Nothing is done
  * for a cancelled event.
  */
 static void
 g_spoil_event(void *arg, int flag)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp, *next;
 
 	g_topology_assert();
 	if (flag == EV_CANCEL)
 		return;
 	pp = arg;
 	G_VALID_PROVIDER(pp);
 	g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp,
 	    pp->geom->class->name, pp->geom->name, pp->name);
 
 	/* The successor is fetched up front, before spoiled() is called. */
 	for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = next) {
 		next = LIST_NEXT(cp, consumers);
 		if ((cp->flags & G_CF_SPOILED) != 0) {
 			cp->flags &= ~G_CF_SPOILED;
 			if (cp->geom->spoiled != NULL) {
 				cp->geom->spoiled(cp);
 				g_topology_assert();
 			}
 		}
 	}
 }
 
 /*
  * Mark every consumer of pp other than cp as spoiled and queue the event
  * that will deliver the notifications.
  */
 void
 g_spoil(struct g_provider *pp, struct g_consumer *cp)
 {
 	struct g_consumer *other;
 
 	g_topology_assert();
 	G_VALID_PROVIDER(pp);
 	G_VALID_CONSUMER(cp);
 
 	LIST_FOREACH(other, &pp->consumers, consumers) {
 		if (other != cp) {
 			/*
 			 * Only the exclusive count is asserted on; the
 			 * matching checks on acr and acw are disabled.
 			 */
 			KASSERT(other->ace == 0,
 			    ("spoiling cp->ace = %d", other->ace));
 			other->flags |= G_CF_SPOILED;
 		}
 	}
 	g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL);
 }
 
 static void
 g_media_changed_event(void *arg, int flag)
 {
 	struct g_provider *pp;
 	int retaste;
 
 	g_topology_assert();
 	if (flag == EV_CANCEL)
 		return;
 	pp = arg;
 	G_VALID_PROVIDER(pp);
 
 	/*
 	 * If provider was not open for writing, queue retaste after spoiling.
 	 * If it was, retaste will happen automatically on close.
 	 */
 	retaste = (pp->acw == 0 && pp->error == 0 &&
 	    !(pp->geom->flags & G_GEOM_WITHER));
 	g_spoil_event(arg, flag);
 	if (retaste)
 		g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL);
 }
 
 int
 g_media_changed(struct g_provider *pp, int flag)
 {
 	struct g_consumer *cp;
 
 	LIST_FOREACH(cp, &pp->consumers, consumers)
 		cp->flags |= G_CF_SPOILED;
 	return (g_post_event(g_media_changed_event, pp, flag, pp, NULL));
 }
 
 int
 g_media_gone(struct g_provider *pp, int flag)
 {
 	struct g_consumer *cp;
 
 	LIST_FOREACH(cp, &pp->consumers, consumers)
 		cp->flags |= G_CF_SPOILED;
 	return (g_post_event(g_spoil_event, pp, flag, pp, NULL));
 }
 
 int
 g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len)
 {
 	int error, i;
 
 	i = len;
 	error = g_io_getattr(attr, cp, &i, var);
 	if (error)
 		return (error);
 	if (i != len)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Return the length of a leading "ada<digits>" or "ad<digits>" device
  * prefix in name, or 0 when name does not start with one.  A name that
  * starts with "ada" is only tried as "ada"; it is not retried as "ad".
  */
 static int
 g_get_device_prefix_len(const char *name)
 {
 	int pos;
 
 	if (strncmp(name, "ada", 3) == 0)
 		pos = 3;
 	else if (strncmp(name, "ad", 2) == 0)
 		pos = 2;
 	else
 		return (0);
 
 	/* At least one unit digit is required after the driver name. */
 	if (!(name[pos] >= '0' && name[pos] <= '9'))
 		return (0);
 	while (name[pos] >= '0' && name[pos] <= '9')
 		pos++;
 	return (pos);
 }
 
 /*
  * Compare two provider names.  Returns 1 when they are identical, or
  * when both start with an "ad"/"ada" device prefix and the remainders
  * after those prefixes are identical; returns 0 otherwise.
  */
 int
 g_compare_names(const char *namea, const char *nameb)
 {
 	int skipa, skipb;
 
 	if (strcmp(namea, nameb) == 0)
 		return (1);
 
 	skipa = g_get_device_prefix_len(namea);
 	if (skipa == 0)
 		return (0);
 	skipb = g_get_device_prefix_len(nameb);
 	if (skipb == 0)
 		return (0);
 
 	return (strcmp(namea + skipa, nameb + skipb) == 0 ? 1 : 0);
 }
 
 #if defined(DIAGNOSTIC) || defined(DDB)
 /*
  * This function walks the mesh and returns a non-zero integer if it
  * finds the argument pointer is an object. The return value indicates
  * which type of object it is believed to be. If topology is not locked,
  * this function is potentially dangerous, but we don't assert that the
  * topology lock is held when called from debugger.
  */
 /*
  * Search the mesh for ptr.  Returns 1 for a class, 2 for a geom, 3 for a
  * consumer, 4 for a provider and 0 when ptr matches none of them.
  */
 int
 g_valid_obj(void const *ptr)
 {
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 
 	/* With KDB, the topology assertion is skipped inside the debugger. */
 #ifdef KDB
 	if (kdb_active == 0)
 #endif
 		g_topology_assert();
 
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == ptr)
 			return (1);
 		LIST_FOREACH(gp, &mp->geom, geom) {
 			if (gp == ptr)
 				return (2);
 			LIST_FOREACH(cp, &gp->consumer, consumer) {
 				if (cp == ptr)
 					return (3);
 			}
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				if (pp == ptr)
 					return (4);
 			}
 		}
 	}
 	return (0);
 }
 #endif
 
 #ifdef DDB
 
 /*
  * Print "indent" spaces followed by the formatted message.  Relies on a
  * variable named "indent" being in scope at the point of use.
  */
 #define	gprintf(...)	do {						\
 	db_printf("%*s", indent, "");					\
 	db_printf(__VA_ARGS__);						\
 } while (0)
 /* Same as gprintf(), followed by a newline. */
 #define	gprintln(...)	do {						\
 	gprintf(__VA_ARGS__);						\
 	db_printf("\n");						\
 } while (0)
 
 /*
  * Append the name "sflag" to the string being built when "flag" is set
  * in (obj)->flags, separating entries with commas.  Relies on "str",
  * "size" and "comma" being in scope at the point of use.
  */
 #define	ADDFLAG(obj, flag, sflag)	do {				\
 	if ((obj)->flags & (flag)) {					\
 		if (comma)						\
 			strlcat(str, ",", size);			\
 		strlcat(str, (sflag), size);				\
 		comma = 1;						\
 	}								\
 } while (0)
 
 /*
  * Render the flags of provider pp into str (size bytes) as a
  * comma-separated list of flag names, or "NONE" when no flag is set.
  * Returns str.
  */
 static char *
 provider_flags_to_string(struct g_provider *pp, char *str, size_t size)
 {
 	int comma;
 
 	comma = 0;
 	bzero(str, size);
 	if (pp->flags != 0) {
 		ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER");
 		ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN");
 	} else {
 		strlcpy(str, "NONE", size);
 	}
 	return (str);
 }
 
 /*
  * Render the flags of geom gp into str (size bytes) as a comma-separated
  * list of flag names, or "NONE" when no flag is set.  Returns str.
  */
 static char *
 geom_flags_to_string(struct g_geom *gp, char *str, size_t size)
 {
 	int comma;
 
 	comma = 0;
 	bzero(str, size);
 	if (gp->flags != 0)
 		ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER");
 	else
 		strlcpy(str, "NONE", size);
 	return (str);
 }
 /*
  * Print one consumer.  With indent == 0 a detailed multi-line record is
  * printed; otherwise a single indented summary line.
  */
 static void
 db_show_geom_consumer(int indent, struct g_consumer *cp)
 {
 
 	if (indent != 0) {
 		gprintf("consumer: %p (%s), access=r%dw%de%d", cp,
 		    cp->provider != NULL ? cp->provider->name : "none",
 		    cp->acr, cp->acw, cp->ace);
 		if (cp->flags)
 			db_printf(", flags=0x%04x", cp->flags);
 		db_printf("\n");
 	} else {
 		gprintln("consumer: %p", cp);
 		gprintln("  class:    %s (%p)", cp->geom->class->name,
 		    cp->geom->class);
 		gprintln("  geom:     %s (%p)", cp->geom->name, cp->geom);
 		if (cp->provider != NULL) {
 			gprintln("  provider: %s (%p)", cp->provider->name,
 			    cp->provider);
 		} else {
 			gprintln("  provider: none");
 		}
 		gprintln("  access:   r%dw%de%d", cp->acr, cp->acw, cp->ace);
 		gprintln("  flags:    0x%04x", cp->flags);
 		gprintln("  nstart:   %u", cp->nstart);
 		gprintln("  nend:     %u", cp->nend);
 	}
 }
 
 /*
  * Print one provider followed by its consumers.  With indent == 0 a
  * detailed multi-line record is printed; otherwise a single indented
  * summary line.  Consumers are printed two columns deeper, stopping
  * early when the pager is quit.
  */
 static void
 db_show_geom_provider(int indent, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	char flags[64];
 
 	if (indent != 0) {
 		gprintf("provider: %s (%p), access=r%dw%de%d",
 		    pp->name, pp, pp->acr, pp->acw, pp->ace);
 		if (pp->flags != 0) {
 			db_printf(", flags=%s (0x%04x)",
 			    provider_flags_to_string(pp, flags, sizeof(flags)),
 			    pp->flags);
 		}
 		db_printf("\n");
 	} else {
 		gprintln("provider: %s (%p)", pp->name, pp);
 		gprintln("  class:        %s (%p)", pp->geom->class->name,
 		    pp->geom->class);
 		gprintln("  geom:         %s (%p)", pp->geom->name, pp->geom);
 		gprintln("  mediasize:    %jd", (intmax_t)pp->mediasize);
 		gprintln("  sectorsize:   %u", pp->sectorsize);
 		gprintln("  stripesize:   %u", pp->stripesize);
 		gprintln("  stripeoffset: %u", pp->stripeoffset);
 		gprintln("  access:       r%dw%de%d", pp->acr, pp->acw,
 		    pp->ace);
 		gprintln("  flags:        %s (0x%04x)",
 		    provider_flags_to_string(pp, flags, sizeof(flags)),
 		    pp->flags);
 		gprintln("  error:        %d", pp->error);
 		gprintln("  nstart:       %u", pp->nstart);
 		gprintln("  nend:         %u", pp->nend);
 		if (LIST_EMPTY(&pp->consumers))
 			gprintln("  consumers:    none");
 	}
 
 	/* Walking an empty list is a no-op, so no emptiness guard is used. */
 	LIST_FOREACH(cp, &pp->consumers, consumers) {
 		db_show_geom_consumer(indent + 2, cp);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * Print one geom followed by its providers and then its consumers.
  * With indent == 0 a detailed multi-line record is printed; otherwise a
  * single indented summary line.  Each of the two lists stops early when
  * the pager is quit.
  */
 static void
 db_show_geom_geom(int indent, struct g_geom *gp)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	char flags[64];
 
 	if (indent != 0) {
 		gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank);
 		if (gp->flags != 0) {
 			db_printf(", flags=%s (0x%04x)",
 			    geom_flags_to_string(gp, flags, sizeof(flags)),
 			    gp->flags);
 		}
 		db_printf("\n");
 	} else {
 		gprintln("geom: %s (%p)", gp->name, gp);
 		gprintln("  class:     %s (%p)", gp->class->name, gp->class);
 		gprintln("  flags:     %s (0x%04x)",
 		    geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags);
 		gprintln("  rank:      %d", gp->rank);
 		if (LIST_EMPTY(&gp->provider))
 			gprintln("  providers: none");
 		if (LIST_EMPTY(&gp->consumer))
 			gprintln("  consumers: none");
 	}
 
 	/* Walking an empty list is a no-op, so no emptiness guards are used. */
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		db_show_geom_provider(indent + 2, pp);
 		if (db_pager_quit)
 			break;
 	}
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		db_show_geom_consumer(indent + 2, cp);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * Print a class header line followed by each of its geoms at indent 2,
  * stopping early when the pager is quit.
  */
 static void
 db_show_geom_class(struct g_class *mp)
 {
 	struct g_geom *member;
 
 	db_printf("class: %s (%p)\n", mp->name, mp);
 	LIST_FOREACH(member, &mp->geom, geom) {
 		db_show_geom_geom(2, member);
 		if (db_pager_quit)
 			return;
 	}
 }
 
 /*
  * Print the GEOM topology or the given object.
  */
 DB_SHOW_COMMAND(geom, db_show_geom)
 {
 	struct g_class *mp;
 
 	if (have_addr) {
 		/* Dispatch on the kind of object g_valid_obj() reports. */
 		switch (g_valid_obj((void *)addr)) {
 		case 1:
 			db_show_geom_class((struct g_class *)addr);
 			break;
 		case 2:
 			db_show_geom_geom(0, (struct g_geom *)addr);
 			break;
 		case 3:
 			db_show_geom_consumer(0, (struct g_consumer *)addr);
 			break;
 		case 4:
 			db_show_geom_provider(0, (struct g_provider *)addr);
 			break;
 		default:
 			db_printf("Not a GEOM object.\n");
 			break;
 		}
 	} else {
 		/* No address given, print the entire topology. */
 		LIST_FOREACH(mp, &g_classes, class) {
 			db_show_geom_class(mp);
 			db_printf("\n");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 
 /*
  * Print the symbolic name of bp's command on a line of its own, or
  * "UNKNOWN" for a command value that is not listed here.
  */
 static void
 db_print_bio_cmd(struct bio *bp)
 {
 
 	db_printf("  cmd: ");
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		db_printf("BIO_READ");
 		break;
 	case BIO_WRITE:
 		db_printf("BIO_WRITE");
 		break;
 	case BIO_DELETE:
 		db_printf("BIO_DELETE");
 		break;
 	case BIO_GETATTR:
 		db_printf("BIO_GETATTR");
 		break;
 	case BIO_FLUSH:
 		db_printf("BIO_FLUSH");
 		break;
 	case BIO_CMD0:
 		db_printf("BIO_CMD0");
 		break;
 	case BIO_CMD1:
 		db_printf("BIO_CMD1");
 		break;
 	case BIO_CMD2:
 		db_printf("BIO_CMD2");
 		break;
 	case BIO_ZONE:
 		db_printf("BIO_ZONE");
 		break;
 	default:
 		db_printf("UNKNOWN");
 		break;
 	}
 	db_printf("\n");
 }
 
 /*
  * Print the BIO_ERROR, BIO_DONE and BIO_ONQUEUE flags set in bp as a
  * ", "-separated list on a line of its own.
  */
 static void
 db_print_bio_flags(struct bio *bp)
 {
 	const char *sep;
 
 	/* sep is empty until the first flag name has been printed. */
 	sep = "";
 	db_printf("  flags: ");
 	if ((bp->bio_flags & BIO_ERROR) != 0) {
 		db_printf("BIO_ERROR");
 		sep = ", ";
 	}
 	if ((bp->bio_flags & BIO_DONE) != 0) {
 		db_printf("%sBIO_DONE", sep);
 		sep = ", ";
 	}
 	if ((bp->bio_flags & BIO_ONQUEUE) != 0)
 		db_printf("%sBIO_ONQUEUE", sep);
 	db_printf("\n");
 }
 
 /*
  * Print useful information in a BIO
  */
 DB_SHOW_COMMAND(bio, db_show_bio)
 {
 	struct bio *bp;
 
 	/* Nothing is printed unless an address was supplied. */
 	if (have_addr) {
 		/* The address is used as a struct bio without validation. */
 		bp = (struct bio *)addr;
 		db_printf("BIO %p\n", bp);
 		db_print_bio_cmd(bp);
 		db_print_bio_flags(bp);
 		db_printf("  cflags: 0x%hx\n", bp->bio_cflags);
 		db_printf("  pflags: 0x%hx\n", bp->bio_pflags);
 		db_printf("  offset: %jd\n", (intmax_t)bp->bio_offset);
 		db_printf("  length: %jd\n", (intmax_t)bp->bio_length);
 		db_printf("  bcount: %ld\n", bp->bio_bcount);
 		db_printf("  resid: %ld\n", bp->bio_resid);
 		db_printf("  completed: %jd\n", (intmax_t)bp->bio_completed);
 		db_printf("  children: %u\n", bp->bio_children);
 		db_printf("  inbed: %u\n", bp->bio_inbed);
 		db_printf("  error: %d\n", bp->bio_error);
 		db_printf("  parent: %p\n", bp->bio_parent);
 		db_printf("  driver1: %p\n", bp->bio_driver1);
 		db_printf("  driver2: %p\n", bp->bio_driver2);
 		db_printf("  caller1: %p\n", bp->bio_caller1);
 		db_printf("  caller2: %p\n", bp->bio_caller2);
 		db_printf("  bio_from: %p\n", bp->bio_from);
 		db_printf("  bio_to: %p\n", bp->bio_to);
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+		db_printf("  bio_track_bp: %p\n", bp->bio_track_bp);
+#endif
 	}
 }
 
 #undef	gprintf
 #undef	gprintln
 #undef	ADDFLAG
 
 #endif	/* DDB */
Index: head/sys/geom/geom_vfs.c
===================================================================
--- head/sys/geom/geom_vfs.c	(revision 308154)
+++ head/sys/geom/geom_vfs.c	(revision 308155)
@@ -1,284 +1,288 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 /*
  * subroutines for use by filesystems.
  *
  * XXX: should maybe live somewhere else ?
  */
 #include <sys/buf.h>
 
 /*
  * Per-geom state, allocated in g_vfs_open() and freed in g_vfs_close().
  */
 struct g_vfs_softc {
 	struct mtx	 sc_mtx;	/* held around sc_active/sc_orphaned updates */
 	struct bufobj	*sc_bo;		/* bufobj of the opened vnode */
 	int		 sc_active;	/* ++ per request issued, -- per completion */
 	int		 sc_orphaned;	/* set to 1 by g_vfs_orphan() */
 };
 
 /*
  * Buffer operations installed on the bufobj by g_vfs_open().  Only the
  * strategy routine is GEOM specific; it is g_vfs_strategy().
  */
 static struct buf_ops __g_vfs_bufops = {
 	.bop_name =	"GEOM_VFS",
 	.bop_write =	bufwrite,
 	.bop_strategy =	g_vfs_strategy,	
 	.bop_sync =	bufsync,	
 	.bop_bdflush =	bufbdflush
 };
 
 /* Non-static pointer to the operations table above. */
 struct buf_ops *g_vfs_bufops = &__g_vfs_bufops;
 
 static g_orphan_t g_vfs_orphan;
 
 /*
  * GEOM class used for the geoms created by g_vfs_open().  Of the class
  * methods, only orphan is set here.
  */
 static struct g_class g_vfs_class = {
 	.name =		"VFS",
 	.version =	G_VERSION,
 	.orphan =	g_vfs_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_vfs_class, g_vfs);
 
 /*
  * Release the consumer passed as arg: drop any access counts it still
  * holds and detach it.  If the geom's softc is already gone, the geom
  * is withered as well.  Called directly from g_vfs_orphan() and posted
  * as an event from g_vfs_done().
  */
 static void
 g_vfs_destroy(void *arg, int flags __unused)
 {
 	struct g_consumer *consumer;
 
 	g_topology_assert();
 	consumer = arg;
 	if (consumer->acr > 0 || consumer->acw > 0 || consumer->ace > 0) {
 		g_access(consumer, -consumer->acr, -consumer->acw,
 		    -consumer->ace);
 	}
 	g_detach(consumer);
 	if (consumer->geom->softc == NULL)
 		g_wither_geom(consumer->geom, ENXIO);
 }
 
 /*
  * Completion handler for bios issued by g_vfs_strategy().  Updates the
  * mount point I/O statistics, copies the result of the bio back into the
  * originating struct buf, drops the active-request count (scheduling
  * consumer teardown when the last request of an orphaned consumer
  * finishes) and completes the buf.
  */
 static void
 g_vfs_done(struct bio *bip)
 {
 	struct g_consumer *cp;
 	struct g_vfs_softc *sc;
 	struct buf *bp;
 	int destroy;
 	struct mount *mp;
 	struct vnode *vp;
 	struct cdev *cdevp;
 
 	/*
 	 * Collect statistics on synchronous and asynchronous read
 	 * and write counts for disks that have associated filesystems.
 	 */
 	/* bio_caller2 was set to the originating buf by g_vfs_strategy(). */
 	bp = bip->bio_caller2;
 	vp = bp->b_vp;
 	if (vp != NULL) {
 		/*
 		 * If not a disk vnode, use its associated mount point
 		 * otherwise use the mountpoint associated with the disk.
 		 */
 		VI_LOCK(vp);
 		if (vp->v_type != VCHR ||
 		    (cdevp = vp->v_rdev) == NULL ||
 		    cdevp->si_devsw == NULL ||
 		    (cdevp->si_devsw->d_flags & D_DISK) == 0)
 			mp = vp->v_mount;
 		else
 			mp = cdevp->si_mountpt;
 		if (mp != NULL) {
 			/*
 			 * A buf whose lock holder is LK_KERNPROC is counted
 			 * as asynchronous I/O, anything else as synchronous.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC)
 					mp->mnt_stat.f_asyncreads++;
 				else
 					mp->mnt_stat.f_syncreads++;
 			} else if (bp->b_iocmd == BIO_WRITE) {
 				if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC)
 					mp->mnt_stat.f_asyncwrites++;
 				else
 					mp->mnt_stat.f_syncwrites++;
 			}
 		}
 		VI_UNLOCK(vp);
 	}
 
 	cp = bip->bio_from;
 	sc = cp->geom->softc;
 	if (bip->bio_error) {
 		printf("g_vfs_done():");
 		g_print_bio(bip);
 		printf("error = %d\n", bip->bio_error);
 	}
 	/* Copy the outcome into the buf before the bio is destroyed. */
 	bp->b_error = bip->bio_error;
 	bp->b_ioflags = bip->bio_flags;
 	if (bip->bio_error)
 		bp->b_ioflags |= BIO_ERROR;
 	bp->b_resid = bp->b_bcount - bip->bio_completed;
 	g_destroy_bio(bip);
 
 	/*
 	 * Drop our reference on the consumer.  If this was the last active
 	 * request and the consumer has been orphaned, schedule its teardown.
 	 */
 	mtx_lock(&sc->sc_mtx);
 	destroy = ((--sc->sc_active) == 0 && sc->sc_orphaned);
 	mtx_unlock(&sc->sc_mtx);
 	if (destroy)
 		g_post_event(g_vfs_destroy, cp, M_WAITOK, NULL);
 
 	bufdone(bp);
 }
 
 /*
  * Strategy routine of the GEOM_VFS buffer operations: turn the buf bp
  * into a bio and send it down through the consumer stored in
  * bo->bo_private.  The request is completed by g_vfs_done().
  */
 void
 g_vfs_strategy(struct bufobj *bo, struct buf *bp)
 {
 	struct g_vfs_softc *sc;
 	struct g_consumer *cp;
 	struct bio *bip;
 
 	cp = bo->bo_private;
 	sc = cp->geom->softc;
 
 	/*
 	 * If the provider has orphaned us, just return ENXIO.
 	 */
 	mtx_lock(&sc->sc_mtx);
 	if (sc->sc_orphaned) {
 		mtx_unlock(&sc->sc_mtx);
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 	/* Count the request as active; g_vfs_done() drops the count. */
 	sc->sc_active++;
 	mtx_unlock(&sc->sc_mtx);
 
 	bip = g_alloc_bio();
 	bip->bio_cmd = bp->b_iocmd;
 	bip->bio_offset = bp->b_iooffset;
 	bip->bio_length = bp->b_bcount;
 	bdata2bio(bp, bip);
 	/* A barrier buf becomes an ordered bio; the buf flag is consumed. */
 	if ((bp->b_flags & B_BARRIER) != 0) {
 		bip->bio_flags |= BIO_ORDERED;
 		bp->b_flags &= ~B_BARRIER;
 	}
 	bip->bio_done = g_vfs_done;
 	/* g_vfs_done() finds the buf again through bio_caller2. */
 	bip->bio_caller2 = bp;
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	buf_track(bp, __func__);
+	bip->bio_track_bp = bp;
+#endif
 	g_io_request(bip, cp);
 }
 
 /*
  * Orphan method of the VFS class.  Marks the softc as orphaned so that
  * g_vfs_strategy() refuses new requests, and releases the consumer at
  * once when no request is active; otherwise the last g_vfs_done() does
  * it.  Nothing is done when the geom has no softc any more.
  */
 static void
 g_vfs_orphan(struct g_consumer *cp)
 {
 	struct g_vfs_softc *sc;
 	struct g_geom *gp;
 	int idle;
 
 	g_topology_assert();
 
 	gp = cp->geom;
 	g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name);
 	sc = gp->softc;
 	if (sc != NULL) {
 		mtx_lock(&sc->sc_mtx);
 		sc->sc_orphaned = 1;
 		idle = (sc->sc_active == 0);
 		mtx_unlock(&sc->sc_mtx);
 		if (idle)
 			g_vfs_destroy(cp, 0);
 	}
 
 	/*
 	 * Do not destroy the geom.  Filesystem will do that during unmount.
 	 */
 }
 
 int
 g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	struct g_vfs_softc *sc;
 	struct bufobj *bo;
 	int error;
 
 	g_topology_assert();
 
 	*cpp = NULL;
 	bo = &vp->v_bufobj;
 	if (bo->bo_private != vp)
 		return (EBUSY);
 
 	pp = g_dev_getprovider(vp->v_rdev);
 	if (pp == NULL)
 		return (ENOENT);
 	gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name);
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->sc_mtx, "g_vfs", NULL, MTX_DEF);
 	sc->sc_bo = bo;
 	gp->softc = sc;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_access(cp, 1, wr, wr);
 	if (error) {
 		g_wither_geom(gp, ENXIO);
 		return (error);
 	}
 	vnode_create_vobject(vp, pp->mediasize, curthread);
 	*cpp = cp;
 	cp->private = vp;
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	bo->bo_ops = g_vfs_bufops;
 	bo->bo_private = cp;
 	bo->bo_bsize = pp->sectorsize;
 
 	return (error);
 }
 
 /*
  * Undo g_vfs_open(): invalidate the bufobj's buffers (V_SAVE), hand
  * bo_private back to the value saved in cp->private, and release the
  * softc.  The geom is withered here unless the consumer was orphaned
  * and is still attached.
  */
 void
 g_vfs_close(struct g_consumer *cp)
 {
 	struct g_vfs_softc *softc;
 	struct g_geom *owner;
 
 	g_topology_assert();
 
 	owner = cp->geom;
 	softc = owner->softc;
 	bufobj_invalbuf(softc->sc_bo, V_SAVE, 0, 0);
 	softc->sc_bo->bo_private = cp->private;
 	owner->softc = NULL;
 	mtx_destroy(&softc->sc_mtx);
 	if (softc->sc_orphaned == 0 || cp->provider == NULL)
 		g_wither_geom_close(owner, ENXIO);
 	g_free(softc);
 }
Index: head/sys/geom/part/g_part.c
===================================================================
--- head/sys/geom/part/g_part.c	(revision 308154)
+++ head/sys/geom/part/g_part.c	(revision 308155)
@@ -1,2340 +1,2342 @@
 /*-
  * Copyright (c) 2002, 2005-2009 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/uuid.h>
 #include <geom/geom.h>
 #include <geom/geom_ctl.h>
 #include <geom/geom_int.h>
 #include <geom/part/g_part.h>
 
 #include "g_part_if.h"
 
 #ifndef _PATH_DEV
 #define _PATH_DEV "/dev/"
 #endif
 
 /* Method table of the null scheme: it implements no kobj methods. */
 static kobj_method_t g_part_null_methods[] = {
 	{ 0, 0 }
 };
 
 /*
  * The "(none)" placeholder scheme.  It is skipped when schemes are looked
  * up by name or probed, and is installed as a geom's scheme after its
  * partition table has been destroyed.
  */
 static struct g_part_scheme g_part_null_scheme = {
 	"(none)",
 	g_part_null_methods,
 	sizeof(struct g_part_table),
 };
 
 /* List of all partitioning schemes known to this class. */
 TAILQ_HEAD(, g_part_scheme) g_part_schemes =
     TAILQ_HEAD_INITIALIZER(g_part_schemes);
 
 /*
  * Table associating each partition type alias with its textual name
  * (lexeme).  It holds G_PART_ALIAS_COUNT slots, is listed in alphabetical
  * order of lexeme and is searched linearly by g_part_alias_name().
  */
 struct g_part_alias_list {
 	const char *lexeme;		/* Textual name of the alias. */
 	enum g_part_alias alias;	/* Alias the name stands for. */
 } g_part_alias_list[G_PART_ALIAS_COUNT] = {
 	{ "apple-boot", G_PART_ALIAS_APPLE_BOOT },
 	{ "apple-core-storage", G_PART_ALIAS_APPLE_CORE_STORAGE },
 	{ "apple-hfs", G_PART_ALIAS_APPLE_HFS },
 	{ "apple-label", G_PART_ALIAS_APPLE_LABEL },
 	{ "apple-raid", G_PART_ALIAS_APPLE_RAID },
 	{ "apple-raid-offline", G_PART_ALIAS_APPLE_RAID_OFFLINE },
 	{ "apple-tv-recovery", G_PART_ALIAS_APPLE_TV_RECOVERY },
 	{ "apple-ufs", G_PART_ALIAS_APPLE_UFS },
 	{ "bios-boot", G_PART_ALIAS_BIOS_BOOT },
 	{ "chromeos-firmware", G_PART_ALIAS_CHROMEOS_FIRMWARE },
 	{ "chromeos-kernel", G_PART_ALIAS_CHROMEOS_KERNEL },
 	{ "chromeos-reserved", G_PART_ALIAS_CHROMEOS_RESERVED },
 	{ "chromeos-root", G_PART_ALIAS_CHROMEOS_ROOT },
 	{ "dragonfly-ccd", G_PART_ALIAS_DFBSD_CCD },
 	{ "dragonfly-hammer", G_PART_ALIAS_DFBSD_HAMMER },
 	{ "dragonfly-hammer2", G_PART_ALIAS_DFBSD_HAMMER2 },
 	{ "dragonfly-label32", G_PART_ALIAS_DFBSD },
 	{ "dragonfly-label64", G_PART_ALIAS_DFBSD64 },
 	{ "dragonfly-legacy", G_PART_ALIAS_DFBSD_LEGACY },
 	{ "dragonfly-swap", G_PART_ALIAS_DFBSD_SWAP },
 	{ "dragonfly-ufs", G_PART_ALIAS_DFBSD_UFS },
 	{ "dragonfly-vinum", G_PART_ALIAS_DFBSD_VINUM },
 	{ "ebr", G_PART_ALIAS_EBR },
 	{ "efi", G_PART_ALIAS_EFI },
 	{ "fat16", G_PART_ALIAS_MS_FAT16 },
 	{ "fat32", G_PART_ALIAS_MS_FAT32 },
 	{ "freebsd", G_PART_ALIAS_FREEBSD },
 	{ "freebsd-boot", G_PART_ALIAS_FREEBSD_BOOT },
 	{ "freebsd-nandfs", G_PART_ALIAS_FREEBSD_NANDFS },
 	{ "freebsd-swap", G_PART_ALIAS_FREEBSD_SWAP },
 	{ "freebsd-ufs", G_PART_ALIAS_FREEBSD_UFS },
 	{ "freebsd-vinum", G_PART_ALIAS_FREEBSD_VINUM },
 	{ "freebsd-zfs", G_PART_ALIAS_FREEBSD_ZFS },
 	{ "linux-data", G_PART_ALIAS_LINUX_DATA },
 	{ "linux-lvm", G_PART_ALIAS_LINUX_LVM },
 	{ "linux-raid", G_PART_ALIAS_LINUX_RAID },
 	{ "linux-swap", G_PART_ALIAS_LINUX_SWAP },
 	{ "mbr", G_PART_ALIAS_MBR },
 	{ "ms-basic-data", G_PART_ALIAS_MS_BASIC_DATA },
 	{ "ms-ldm-data", G_PART_ALIAS_MS_LDM_DATA },
 	{ "ms-ldm-metadata", G_PART_ALIAS_MS_LDM_METADATA },
 	{ "ms-recovery", G_PART_ALIAS_MS_RECOVERY },
 	{ "ms-reserved", G_PART_ALIAS_MS_RESERVED },
 	{ "ms-spaces", G_PART_ALIAS_MS_SPACES },
 	{ "netbsd-ccd", G_PART_ALIAS_NETBSD_CCD },
 	{ "netbsd-cgd", G_PART_ALIAS_NETBSD_CGD },
 	{ "netbsd-ffs", G_PART_ALIAS_NETBSD_FFS },
 	{ "netbsd-lfs", G_PART_ALIAS_NETBSD_LFS },
 	{ "netbsd-raid", G_PART_ALIAS_NETBSD_RAID },
 	{ "netbsd-swap", G_PART_ALIAS_NETBSD_SWAP },
 	{ "ntfs", G_PART_ALIAS_MS_NTFS },
 	{ "openbsd-data", G_PART_ALIAS_OPENBSD_DATA },
 	{ "prep-boot", G_PART_ALIAS_PREP_BOOT },
 	{ "vmware-reserved", G_PART_ALIAS_VMRESERVED },
 	{ "vmware-vmfs", G_PART_ALIAS_VMFS },
 	{ "vmware-vmkdiag", G_PART_ALIAS_VMKDIAG },
 	{ "vmware-vsanhdr", G_PART_ALIAS_VMVSANHDR },
 };
 
 /*
  * kern.geom.part.check_integrity (read/write tunable, default 1).
  * When non-zero, g_part_check_integrity() returns EINVAL for a table that
  * fails its checks; when zero, the table is only flagged as corrupt.
  */
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, part, CTLFLAG_RW, 0,
     "GEOM_PART stuff");
 static u_int check_integrity = 1;
 SYSCTL_UINT(_kern_geom_part, OID_AUTO, check_integrity,
     CTLFLAG_RWTUN, &check_integrity, 1,
     "Enable integrity checking");
 
 /*
  * The GEOM partitioning class.
  *
  * Forward declarations of the class-level and geom-level methods that are
  * wired into g_part_class below, followed by the class definition and its
  * registration with GEOM as module "g_part".
  */
 static g_ctl_req_t g_part_ctlreq;
 static g_ctl_destroy_geom_t g_part_destroy_geom;
 static g_fini_t g_part_fini;
 static g_init_t g_part_init;
 static g_taste_t g_part_taste;
 
 static g_access_t g_part_access;
 static g_dumpconf_t g_part_dumpconf;
 static g_orphan_t g_part_orphan;
 static g_spoiled_t g_part_spoiled;
 static g_start_t g_part_start;
 static g_resize_t g_part_resize;
 static g_ioctl_t g_part_ioctl;
 
 /* Class descriptor for "PART"; every partitioned geom belongs to it. */
 static struct g_class g_part_class = {
 	.name = "PART",
 	.version = G_VERSION,
 	/* Class methods. */
 	.ctlreq = g_part_ctlreq,
 	.destroy_geom = g_part_destroy_geom,
 	.fini = g_part_fini,
 	.init = g_part_init,
 	.taste = g_part_taste,
 	/* Geom methods. */
 	.access = g_part_access,
 	.dumpconf = g_part_dumpconf,
 	.orphan = g_part_orphan,
 	.spoiled = g_part_spoiled,
 	.start = g_part_start,
 	.resize = g_part_resize,
 	.ioctl = g_part_ioctl,
 };
 
 DECLARE_GEOM_CLASS(g_part_class, g_part);
 MODULE_VERSION(g_part, 0);
 
 /*
  * Support functions.
  */
 
 static void g_part_wither(struct g_geom *, int);
 
 const char *
 g_part_alias_name(enum g_part_alias alias)
 {
 	int i;
 
 	for (i = 0; i < G_PART_ALIAS_COUNT; i++) {
 		if (g_part_alias_list[i].alias != alias)
 			continue;
 		return (g_part_alias_list[i].lexeme);
 	}
 
 	return (NULL);
 }
 
 /*
  * For a disk of the given number of blocks and a fixed number of sectors
  * per track, pick the number of heads that yields the largest addressable
  * CHS capacity with at most 1023 cylinders.
  *
  * On return *bestchs holds the best capacity found (cylinders * heads *
  * sectors) and *bestheads the matching number of heads.  Both are 0 when
  * no candidate fits, which includes the degenerate case sectors == 0.
  * On equal capacity, a previous pick of 1 head is replaced by the new
  * candidate.
  */
 void
 g_part_geometry_heads(off_t blocks, u_int sectors, off_t *bestchs,
     u_int *bestheads)
 {
 	static const u_int candidate_heads[] = {
 		1, 2, 16, 32, 64, 128, 255, 0
 	};
 	off_t chs, cylinders;
 	u_int heads;
 	int idx;
 
 	*bestchs = 0;
 	*bestheads = 0;
 	/* A geometry without sectors is meaningless; avoid dividing by 0. */
 	if (sectors == 0)
 		return;
 	for (idx = 0; candidate_heads[idx] != 0; idx++) {
 		heads = candidate_heads[idx];
 		cylinders = blocks / heads / sectors;
 		/*
 		 * Candidates are tried in increasing order, so once the
 		 * cylinder count drops below heads or sectors no later
 		 * candidate is considered either.
 		 */
 		if (cylinders < heads || cylinders < sectors)
 			break;
 		/* Too many cylinders: try a larger number of heads. */
 		if (cylinders > 1023)
 			continue;
 		chs = cylinders * heads * sectors;
 		if (chs > *bestchs || (chs == *bestchs && *bestheads == 1)) {
 			*bestchs = chs;
 			*bestheads = heads;
 		}
 	}
 }
 
 /*
  * Fill in the geometry fields (gpt_heads, gpt_sectors, gpt_fixgeom) of a
  * partition table for a disk of the given number of blocks.
  *
  * When the consumer reports non-zero "GEOM::fwsectors" and "GEOM::fwheads"
  * attributes those values are used and gpt_fixgeom is set to 1.  Otherwise
  * a geometry is synthesized from a list of candidate sector counts and
  * gpt_fixgeom is set to 0.
  */
 static void
 g_part_geometry(struct g_part_table *table, struct g_consumer *cp,
     off_t blocks)
 {
 	static u_int candidate_sectors[] = { 1, 9, 17, 33, 63, 0 };
 	off_t chs, bestchs;
 	u_int heads, sectors;
 	int idx;
 
 	if (g_getattr("GEOM::fwsectors", cp, &sectors) != 0 || sectors == 0 ||
 	    g_getattr("GEOM::fwheads", cp, &heads) != 0 || heads == 0) {
 		table->gpt_fixgeom = 0;
 		table->gpt_heads = 0;
 		table->gpt_sectors = 0;
 		bestchs = 0;
 		/* Keep the candidate giving the largest CHS capacity. */
 		for (idx = 0; candidate_sectors[idx] != 0; idx++) {
 			sectors = candidate_sectors[idx];
 			g_part_geometry_heads(blocks, sectors, &chs, &heads);
 			/* No usable head count for this sector count. */
 			if (chs == 0)
 				continue;
 			/*
 			 * Prefer a geometry with sectors > 1, but only if
 			 * it doesn't bump down the number of heads to 1.
 			 */
 			if (chs > bestchs || (chs == bestchs && heads > 1 &&
 			    table->gpt_sectors == 1)) {
 				bestchs = chs;
 				table->gpt_heads = heads;
 				table->gpt_sectors = sectors;
 			}
 		}
 		/*
 		 * If we didn't find a geometry at all, then the disk is
 		 * too big. This means we can use the maximum number of
 		 * heads and sectors.
 		 */
 		if (bestchs == 0) {
 			table->gpt_heads = 255;
 			table->gpt_sectors = 63;
 		}
 	} else {
 		table->gpt_fixgeom = 1;
 		table->gpt_heads = heads;
 		table->gpt_sectors = sectors;
 	}
 }
 
 /*
  * Diagnostic printf used only by g_part_check_integrity(): prints with a
  * "GEOM_PART: " prefix, and only when bootverbose is set.
  */
 #define	DPRINTF(...)	if (bootverbose) {	\
 	printf("GEOM_PART: " __VA_ARGS__);	\
 }
 
 /*
  * Sanity-check a partition table against the provider behind cp.
  *
  * Verified: the table's LBA range is ordered and lies within the media;
  * every live (not deleted, not internal) entry is ordered and lies within
  * the table's range; no two live entries overlap.  Misalignment with the
  * provider's stripe is reported but is not counted as a failure.
  *
  * Returns 0 when all checks pass.  When any check fails, returns EINVAL if
  * the check_integrity sysctl is non-zero; otherwise marks the table with
  * gpt_corrupt and returns 0.
  */
 static int
 g_part_check_integrity(struct g_part_table *table, struct g_consumer *cp)
 {
 	struct g_part_entry *e1, *e2;
 	struct g_provider *pp;
 	off_t offset;
 	int failed;
 
 	failed = 0;
 	pp = cp->provider;
 	/* Table-wide limits. */
 	if (table->gpt_last < table->gpt_first) {
 		DPRINTF("last LBA is below first LBA: %jd < %jd\n",
 		    (intmax_t)table->gpt_last, (intmax_t)table->gpt_first);
 		failed++;
 	}
 	if (table->gpt_last > pp->mediasize / pp->sectorsize - 1) {
 		DPRINTF("last LBA extends beyond mediasize: "
 		    "%jd > %jd\n", (intmax_t)table->gpt_last,
 		    (intmax_t)pp->mediasize / pp->sectorsize - 1);
 		failed++;
 	}
 	LIST_FOREACH(e1, &table->gpt_entry, gpe_entry) {
 		if (e1->gpe_deleted || e1->gpe_internal)
 			continue;
 		/* Per-entry limits against the table's LBA range. */
 		if (e1->gpe_start < table->gpt_first) {
 			DPRINTF("partition %d has start offset below first "
 			    "LBA: %jd < %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_start,
 			    (intmax_t)table->gpt_first);
 			failed++;
 		}
 		if (e1->gpe_start > table->gpt_last) {
 			DPRINTF("partition %d has start offset beyond last "
 			    "LBA: %jd > %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_start,
 			    (intmax_t)table->gpt_last);
 			failed++;
 		}
 		if (e1->gpe_end < e1->gpe_start) {
 			DPRINTF("partition %d has end offset below start "
 			    "offset: %jd < %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_end,
 			    (intmax_t)e1->gpe_start);
 			failed++;
 		}
 		if (e1->gpe_end > table->gpt_last) {
 			DPRINTF("partition %d has end offset beyond last "
 			    "LBA: %jd > %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_end,
 			    (intmax_t)table->gpt_last);
 			failed++;
 		}
 		/*
 		 * Stripe alignment of the partition's byte offset (the
 		 * larger of the start LBA in bytes and gpe_offset).
 		 */
 		if (pp->stripesize > 0) {
 			offset = e1->gpe_start * pp->sectorsize;
 			if (e1->gpe_offset > offset)
 				offset = e1->gpe_offset;
 			if ((offset + pp->stripeoffset) % pp->stripesize) {
 				DPRINTF("partition %d on (%s, %s) is not "
 				    "aligned on %u bytes\n", e1->gpe_index,
 				    pp->name, table->gpt_scheme->name,
 				    pp->stripesize);
 				/* Don't treat this as a critical failure */
 			}
 		}
 		/* Compare e1 against every live entry that follows it. */
 		e2 = e1;
 		while ((e2 = LIST_NEXT(e2, gpe_entry)) != NULL) {
 			if (e2->gpe_deleted || e2->gpe_internal)
 				continue;
 			/* e1 starts inside e2. */
 			if (e1->gpe_start >= e2->gpe_start &&
 			    e1->gpe_start <= e2->gpe_end) {
 				DPRINTF("partition %d has start offset inside "
 				    "partition %d: start[%d] %jd >= start[%d] "
 				    "%jd <= end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e1->gpe_index, (intmax_t)e1->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_end);
 				failed++;
 			}
 			/* e1 ends inside e2. */
 			if (e1->gpe_end >= e2->gpe_start &&
 			    e1->gpe_end <= e2->gpe_end) {
 				DPRINTF("partition %d has end offset inside "
 				    "partition %d: start[%d] %jd >= end[%d] "
 				    "%jd <= end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e1->gpe_index, (intmax_t)e1->gpe_end,
 				    e2->gpe_index, (intmax_t)e2->gpe_end);
 				failed++;
 			}
 			/* e1 completely encloses e2. */
 			if (e1->gpe_start < e2->gpe_start &&
 			    e1->gpe_end > e2->gpe_end) {
 				DPRINTF("partition %d contains partition %d: "
 				    "start[%d] %jd > start[%d] %jd, end[%d] "
 				    "%jd < end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e1->gpe_index, (intmax_t)e1->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_end,
 				    e1->gpe_index, (intmax_t)e1->gpe_end);
 				failed++;
 			}
 		}
 	}
 	if (failed != 0) {
 		/* This summary is printed regardless of bootverbose. */
 		printf("GEOM_PART: integrity check failed (%s, %s)\n",
 		    pp->name, table->gpt_scheme->name);
 		if (check_integrity != 0)
 			return (EINVAL);
 		table->gpt_corrupt = 1;
 	}
 	return (0);
 }
 #undef	DPRINTF
 
 /*
  * Return the table entry with the given index, creating it if necessary.
  *
  * The entry list is kept sorted by index.  An existing entry is reused and
  * has its gpe_offset reset to 0; otherwise a zeroed entry of the scheme's
  * entry size is allocated and linked in at its sorted position.  In both
  * cases the entry's start and end are set from the arguments.
  */
 struct g_part_entry *
 g_part_new_entry(struct g_part_table *table, int index, quad_t start,
     quad_t end)
 {
 	struct g_part_entry *cur, *entry, *prev;
 
 	entry = NULL;
 	prev = NULL;
 	LIST_FOREACH(cur, &table->gpt_entry, gpe_entry) {
 		if (cur->gpe_index == index) {
 			/* Exact match: reuse it. */
 			entry = cur;
 			break;
 		}
 		/* Passed the slot where the index would have been. */
 		if (cur->gpe_index > index)
 			break;
 		prev = cur;
 	}
 
 	if (entry != NULL) {
 		entry->gpe_offset = 0;
 	} else {
 		entry = g_malloc(table->gpt_scheme->gps_entrysz,
 		    M_WAITOK | M_ZERO);
 		entry->gpe_index = index;
 		if (prev != NULL)
 			LIST_INSERT_AFTER(prev, entry, gpe_entry);
 		else
 			LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry);
 	}
 
 	entry->gpe_start = start;
 	entry->gpe_end = end;
 	return (entry);
 }
 
 /*
  * Create (if it does not exist yet) and (re)configure the provider that
  * represents a partition entry.
  *
  * The provider's size, sector size and stripe parameters are derived from
  * the entry and from the provider underneath the geom's first consumer.
  */
 static void
 g_part_new_provider(struct g_geom *gp, struct g_part_table *table,
     struct g_part_entry *entry)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct sbuf *sb;
 	off_t offset;
 
 	cp = LIST_FIRST(&gp->consumer);
 	pp = cp->provider;
 
 	/* gpe_offset is never allowed to lie before the start LBA. */
 	offset = entry->gpe_start * pp->sectorsize;
 	if (entry->gpe_offset < offset)
 		entry->gpe_offset = offset;
 
 	if (entry->gpe_pp == NULL) {
 		/* The scheme decides what the partition is called. */
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_finish(sb);
 		entry->gpe_pp = g_new_providerf(gp, "%s", sbuf_data(sb));
 		sbuf_delete(sb);
 		entry->gpe_pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 		entry->gpe_pp->private = entry;		/* Close the circle. */
 	}
 	entry->gpe_pp->index = entry->gpe_index - 1;	/* index is 1-based. */
 	entry->gpe_pp->mediasize = (entry->gpe_end - entry->gpe_start + 1) *
 	    pp->sectorsize;
 	/* Exclude the bytes skipped between the start LBA and gpe_offset. */
 	entry->gpe_pp->mediasize -= entry->gpe_offset - offset;
 	entry->gpe_pp->sectorsize = pp->sectorsize;
 	entry->gpe_pp->stripesize = pp->stripesize;
 	entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset;
 	if (pp->stripesize > 0)
 		entry->gpe_pp->stripeoffset %= pp->stripesize;
 	/* Inherit unmapped I/O support from the underlying provider. */
 	entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED;
 	g_error_provider(entry->gpe_pp, 0);
 }
 
 /*
  * Look up a geom of the PART class by name, ignoring geoms that are being
  * withered.  Returns the geom, or NULL when there is no match.
  */
 static struct g_geom*
 g_part_find_geom(const char *name)
 {
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &g_part_class.geom, geom) {
 		if ((gp->flags & G_GEOM_WITHER) != 0)
 			continue;
 		if (strcmp(name, gp->name) == 0)
 			return (gp);
 	}
 	return (NULL);
 }
 
 /*
  * Fetch the request parameter called name and resolve it to a PART geom.
  * A leading _PATH_DEV prefix on the value is ignored.
  *
  * Returns 0 and stores the geom in *v on success, ENOATTR when the
  * parameter is absent, or EINVAL (with the error recorded in the request)
  * when no such geom exists.
  */
 static int
 g_part_parm_geom(struct gctl_req *req, const char *name, struct g_geom **v)
 {
 	const size_t pfxlen = sizeof(_PATH_DEV) - 1;
 	struct g_geom *found;
 	const char *gname;
 
 	gname = gctl_get_asciiparam(req, name);
 	if (gname == NULL)
 		return (ENOATTR);
 	if (strncmp(gname, _PATH_DEV, pfxlen) == 0)
 		gname = gname + pfxlen;
 	found = g_part_find_geom(gname);
 	if (found != NULL) {
 		*v = found;
 		return (0);
 	}
 	gctl_error(req, "%d %s '%s'", EINVAL, name, gname);
 	return (EINVAL);
 }
 
 /*
  * Fetch the request parameter called name and resolve it to a provider.
  * A leading _PATH_DEV prefix on the value is ignored.
  *
  * Returns 0 and stores the provider in *v on success, ENOATTR when the
  * parameter is absent, or EINVAL (with the error recorded in the request)
  * when no provider has that name.
  */
 static int
 g_part_parm_provider(struct gctl_req *req, const char *name,
     struct g_provider **v)
 {
 	const size_t pfxlen = sizeof(_PATH_DEV) - 1;
 	struct g_provider *found;
 	const char *pname;
 
 	pname = gctl_get_asciiparam(req, name);
 	if (pname == NULL)
 		return (ENOATTR);
 	if (strncmp(pname, _PATH_DEV, pfxlen) == 0)
 		pname = pname + pfxlen;
 	found = g_provider_by_name(pname);
 	if (found != NULL) {
 		*v = found;
 		return (0);
 	}
 	gctl_error(req, "%d %s '%s'", EINVAL, name, pname);
 	return (EINVAL);
 }
 
 /*
  * Fetch the request parameter called name and parse it as a non-negative
  * integer (base auto-detected by strtoq()).
  *
  * Returns 0 and stores the value in *v on success, ENOATTR when the
  * parameter is absent, or EINVAL (with the error recorded in the request)
  * when the text has trailing characters or the value is negative.
  */
 static int
 g_part_parm_quad(struct gctl_req *req, const char *name, quad_t *v)
 {
 	const char *text;
 	char *endp;
 	quad_t value;
 
 	text = gctl_get_asciiparam(req, name);
 	if (text == NULL)
 		return (ENOATTR);
 	value = strtoq(text, &endp, 0);
 	if (value >= 0 && *endp == '\0') {
 		*v = value;
 		return (0);
 	}
 	gctl_error(req, "%d %s '%s'", EINVAL, name, text);
 	return (EINVAL);
 }
 
 /*
  * Fetch the request parameter called name and resolve it, ignoring case,
  * to a registered partitioning scheme.  The null scheme never matches.
  *
  * Returns 0 and stores the scheme in *v on success, ENOATTR when the
  * parameter is absent, or EINVAL (with the error recorded in the request)
  * when no scheme has that name.
  */
 static int
 g_part_parm_scheme(struct gctl_req *req, const char *name,
     struct g_part_scheme **v)
 {
 	struct g_part_scheme *cand;
 	const char *wanted;
 
 	wanted = gctl_get_asciiparam(req, name);
 	if (wanted == NULL)
 		return (ENOATTR);
 	TAILQ_FOREACH(cand, &g_part_schemes, scheme_list) {
 		if (cand != &g_part_null_scheme &&
 		    strcasecmp(cand->name, wanted) == 0) {
 			*v = cand;
 			return (0);
 		}
 	}
 	gctl_error(req, "%d %s '%s'", EINVAL, name, wanted);
 	return (EINVAL);
 }
 
 /*
  * Fetch the request parameter called name as a string.
  *
  * Returns 0 and stores the string in *v on success, ENOATTR when the
  * parameter is absent, or EINVAL (with the error recorded in the request)
  * when the string is empty and the parameter is not "label".
  */
 static int
 g_part_parm_str(struct gctl_req *req, const char *name, const char **v)
 {
 	const char *str;
 
 	str = gctl_get_asciiparam(req, name);
 	if (str == NULL)
 		return (ENOATTR);
 	/* Only a label is allowed to be the empty string. */
 	if (str[0] == '\0' && strcmp(name, "label") != 0) {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, str);
 		return (EINVAL);
 	}
 	*v = str;
 	return (0);
 }
 
 /*
  * Fetch the request parameter called name, which must be an intmax_t in
  * the range [0, INT_MAX], and store it in *v.
  *
  * Returns 0 on success, ENOATTR when the parameter is absent, or EINVAL
  * (with the error recorded in the request) when the parameter has the
  * wrong size or is out of range.
  */
 static int
 g_part_parm_intmax(struct gctl_req *req, const char *name, u_int *v)
 {
 	const intmax_t *p;
 	int size;
 
 	p = gctl_get_param(req, name, &size);
 	if (p == NULL)
 		return (ENOATTR);
 	/*
 	 * Validate the size before looking at the value: dereferencing a
 	 * parameter of a different size could read past what was supplied.
 	 */
 	if (size != sizeof(*p)) {
 		gctl_error(req, "%d %s", EINVAL, name);
 		return (EINVAL);
 	}
 	if (*p < 0 || *p > INT_MAX) {
 		gctl_error(req, "%d %s '%jd'", EINVAL, name, *p);
 		return (EINVAL);
 	}
 	*v = (u_int)*p;
 	return (0);
 }
 
 /*
  * Fetch the request parameter called name, which must be a uint32_t no
  * larger than INT_MAX, and store it in *v.
  *
  * Returns 0 on success, ENOATTR when the parameter is absent, or EINVAL
  * (with the error recorded in the request) when the parameter has the
  * wrong size or is out of range.
  */
 static int
 g_part_parm_uint32(struct gctl_req *req, const char *name, u_int *v)
 {
 	const uint32_t *p;
 	int size;
 
 	p = gctl_get_param(req, name, &size);
 	if (p == NULL)
 		return (ENOATTR);
 	/*
 	 * Validate the size before looking at the value: dereferencing a
 	 * parameter of a different size could read past what was supplied.
 	 */
 	if (size != sizeof(*p)) {
 		gctl_error(req, "%d %s", EINVAL, name);
 		return (EINVAL);
 	}
 	if (*p > INT_MAX) {
 		gctl_error(req, "%d %s '%u'", EINVAL, name, (unsigned int)*p);
 		return (EINVAL);
 	}
 	*v = (u_int)*p;
 	return (0);
 }
 
 /*
  * Fetch the request parameter called name as an opaque blob of boot code.
  *
  * Returns 0 and stores the data pointer in *v and its size in *s on
  * success, or ENOATTR when the parameter is absent.  The contents are not
  * inspected here.
  */
 static int
 g_part_parm_bootcode(struct gctl_req *req, const char *name, const void **v,
     unsigned int *s)
 {
 	const void *code;
 	int codelen;
 
 	code = gctl_get_param(req, name, &codelen);
 	if (code != NULL) {
 		*v = code;
 		*s = codelen;
 		return (0);
 	}
 	return (ENOATTR);
 }
 
 /*
  * Determine which partitioning scheme best matches the media behind cp
  * and leave a table object for that scheme in gp->softc.
  *
  * Probe results are priorities: 0 is a perfect match, negative values are
  * weaker matches (closer to 0 is better) and positive values are errors.
  * If gp already has a table, its scheme is probed first and serves as the
  * initial best candidate.
  *
  * Returns 0 when a scheme was selected, ENXIO otherwise.
  */
 static int
 g_part_probe(struct g_geom *gp, struct g_consumer *cp, int depth)
 {
 	struct g_part_scheme *iter, *scheme;
 	struct g_part_table *table;
 	int pri, probe;
 
 	table = gp->softc;
 	scheme = (table != NULL) ? table->gpt_scheme : NULL;
 	pri = (scheme != NULL) ? G_PART_PROBE(table, cp) : INT_MIN;
 	/* The current scheme matches perfectly; nothing can beat it. */
 	if (pri == 0)
 		goto done;
 	if (pri > 0) {	/* error */
 		scheme = NULL;
 		pri = INT_MIN;
 	}
 
 	TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) {
 		if (iter == &g_part_null_scheme)
 			continue;
 		/* Probing needs a table object of the candidate scheme. */
 		table = (void *)kobj_create((kobj_class_t)iter, M_GEOM,
 		    M_WAITOK);
 		table->gpt_gp = gp;
 		table->gpt_scheme = iter;
 		table->gpt_depth = depth;
 		probe = G_PART_PROBE(table, cp);
 		if (probe <= 0 && probe > pri) {
 			/* Better match: it replaces the geom's table. */
 			pri = probe;
 			scheme = iter;
 			if (gp->softc != NULL)
 				kobj_delete((kobj_t)gp->softc, M_GEOM);
 			gp->softc = table;
 			if (pri == 0)
 				goto done;
 		} else
 			kobj_delete((kobj_t)table, M_GEOM);
 	}
 
 done:
 	return ((scheme == NULL) ? ENXIO : 0);
 }
 
 /*
  * Control request functions.
  */
 
 /*
  * Handle the "add" verb: add a partition to the table of gpp->gpp_geom.
  *
  * The requested range [gpp_start, gpp_start + gpp_size - 1] must lie
  * within the table's LBA range and must not overlap any live, non-internal
  * entry.  When gpp_index is 0 the lowest free index (starting at 1) is
  * used; otherwise the requested index must be free.
  *
  * Returns 0 on success.  On failure the error is recorded in the request
  * and returned: EINVAL for a bad start, size or index, ENOSPC for an
  * overlap or a full table, EEXIST when the requested index is in use, or
  * the error returned by the scheme's add method.
  */
 static int
 g_part_ctl_add(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *delent, *last, *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	quad_t end;
 	unsigned int index;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	pp = LIST_FIRST(&gp->consumer)->provider;
 	table = gp->softc;
 	end = gpp->gpp_start + gpp->gpp_size - 1;
 
 	if (gpp->gpp_start < table->gpt_first ||
 	    gpp->gpp_start > table->gpt_last) {
 		gctl_error(req, "%d start '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_start);
 		return (EINVAL);
 	}
 	if (end < gpp->gpp_start || end > table->gpt_last) {
 		gctl_error(req, "%d size '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_size);
 		return (EINVAL);
 	}
 	if (gpp->gpp_index > table->gpt_entries) {
 		gctl_error(req, "%d index '%d'", EINVAL, gpp->gpp_index);
 		return (EINVAL);
 	}
 
 	/*
 	 * Walk the entries to pick the index, to find the entry after
 	 * which a new one must be inserted (last), to find a deleted entry
 	 * with the chosen index that can be recycled (delent), and to
 	 * check for overlaps.
 	 */
 	delent = last = NULL;
 	index = (gpp->gpp_index > 0) ? gpp->gpp_index : 1;
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted) {
 			if (entry->gpe_index == index)
 				delent = entry;
 			continue;
 		}
 		/* The candidate index is taken; move on to the next one. */
 		if (entry->gpe_index == index)
 			index = entry->gpe_index + 1;
 		if (entry->gpe_index < index)
 			last = entry;
 		if (entry->gpe_internal)
 			continue;
 		if (gpp->gpp_start >= entry->gpe_start &&
 		    gpp->gpp_start <= entry->gpe_end) {
 			gctl_error(req, "%d start '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_start);
 			return (ENOSPC);
 		}
 		if (end >= entry->gpe_start && end <= entry->gpe_end) {
 			gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end);
 			return (ENOSPC);
 		}
 		if (gpp->gpp_start < entry->gpe_start && end > entry->gpe_end) {
 			gctl_error(req, "%d size '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_size);
 			return (ENOSPC);
 		}
 	}
 	/* An explicitly requested index was bumped, so it is in use. */
 	if (gpp->gpp_index > 0 && index != gpp->gpp_index) {
 		gctl_error(req, "%d index '%d'", EEXIST, gpp->gpp_index);
 		return (EEXIST);
 	}
 	if (index > table->gpt_entries) {
 		gctl_error(req, "%d index '%d'", ENOSPC, index);
 		return (ENOSPC);
 	}
 
 	entry = (delent == NULL) ? g_malloc(table->gpt_scheme->gps_entrysz,
 	    M_WAITOK | M_ZERO) : delent;
 	entry->gpe_index = index;
 	entry->gpe_start = gpp->gpp_start;
 	entry->gpe_end = end;
 	error = G_PART_ADD(table, entry, gpp);
 	if (error) {
 		gctl_error(req, "%d", error);
 		/* A recycled entry stays on the list; only free new ones. */
 		if (delent == NULL)
 			g_free(entry);
 		return (error);
 	}
 	if (delent == NULL) {
 		if (last == NULL)
 			LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry);
 		else
 			LIST_INSERT_AFTER(last, entry, gpe_entry);
 		entry->gpe_created = 1;
 	} else {
 		entry->gpe_deleted = 0;
 		entry->gpe_modified = 1;
 	}
 	g_part_new_provider(gp, table, entry);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		if (pp->stripesize > 0 && entry->gpe_pp->stripeoffset != 0)
 			sbuf_printf(sb, " added, but partition is not "
 			    "aligned on %u bytes\n", pp->stripesize);
 		else
 			sbuf_cat(sb, " added\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 /*
  * Handle the "bootcode" verb: hand the supplied boot code to the scheme
  * of gpp->gpp_geom.
  *
  * Returns 0 on success.  On failure the error is recorded in the request
  * and returned: ENODEV when the scheme has no room for boot code, EFBIG
  * when the supplied code is larger than the scheme allows, or the error
  * returned by the scheme's bootcode method.
  */
 static int
 g_part_ctl_bootcode(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_part_table *table;
 	struct sbuf *msg;
 	int error, maxsz;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	maxsz = table->gpt_scheme->gps_bootcodesz;
 	if (maxsz == 0)
 		error = ENODEV;
 	else if (gpp->gpp_codesize > maxsz)
 		error = EFBIG;
 	else
 		error = G_PART_BOOTCODE(table, gpp);
 
 	if (error) {
 		gctl_error(req, "%d", error);
 		return (error);
 	}
 
 	/* Report back to the caller only when asked to. */
 	if ((gpp->gpp_parms & G_PART_PARM_OUTPUT) != 0) {
 		msg = sbuf_new_auto();
 		sbuf_printf(msg, "bootcode written to %s\n", gp->name);
 		sbuf_finish(msg);
 		gctl_set_param(req, "output", sbuf_data(msg),
 		    sbuf_len(msg) + 1);
 		sbuf_delete(msg);
 	}
 	return (0);
 }
 
 /*
  * Handle the "commit" verb: write the pending changes of gpp->gpp_geom to
  * the media.
  *
  * The table must be opened (gpt_opened), otherwise EPERM is returned.
  * Sectors flagged in the gpt_smhead/gpt_smtail bitmaps are overwritten
  * with zeroes first.  A geom carrying the null scheme is then withered;
  * otherwise the scheme writes the table, deleted entries are dropped and
  * the created/modified/opened state is cleared.  On both success paths
  * the access counts are released with g_access(cp, -1, -1, -1).
  *
  * The topology lock is dropped while I/O is in progress and is held again
  * on return.  Returns 0 on success, or an error that is also recorded in
  * the request.
  */
 static int
 g_part_ctl_commit(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *table;
 	char *buf;
 	int error, i;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	if (!table->gpt_opened) {
 		gctl_error(req, "%d", EPERM);
 		return (EPERM);
 	}
 
 	g_topology_unlock();
 
 	cp = LIST_FIRST(&gp->consumer);
 	if ((table->gpt_smhead | table->gpt_smtail) != 0) {
 		pp = cp->provider;
 		buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
 		/* Bit i of gpt_smhead: zero sector i from the start. */
 		while (table->gpt_smhead != 0) {
 			i = ffs(table->gpt_smhead) - 1;
 			error = g_write_data(cp, i * pp->sectorsize, buf,
 			    pp->sectorsize);
 			if (error) {
 				g_free(buf);
 				goto fail;
 			}
 			table->gpt_smhead &= ~(1 << i);
 		}
 		/* Bit i of gpt_smtail: zero sector i counted from the end. */
 		while (table->gpt_smtail != 0) {
 			i = ffs(table->gpt_smtail) - 1;
 			error = g_write_data(cp, pp->mediasize - (i + 1) *
 			    pp->sectorsize, buf, pp->sectorsize);
 			if (error) {
 				g_free(buf);
 				goto fail;
 			}
 			table->gpt_smtail &= ~(1 << i);
 		}
 		g_free(buf);
 	}
 
 	/* Null scheme: there is no table to write; get rid of the geom. */
 	if (table->gpt_scheme == &g_part_null_scheme) {
 		g_topology_lock();
 		g_access(cp, -1, -1, -1);
 		g_part_wither(gp, ENXIO);
 		return (0);
 	}
 
 	error = G_PART_WRITE(table, cp);
 	if (error)
 		goto fail;
 
 	/* The table is on disk: surviving entries are clean again. */
 	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 		if (!entry->gpe_deleted) {
 			entry->gpe_created = 0;
 			entry->gpe_modified = 0;
 			continue;
 		}
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	}
 	table->gpt_created = 0;
 	table->gpt_opened = 0;
 
 	g_topology_lock();
 	g_access(cp, -1, -1, -1);
 	return (0);
 
 fail:
 	/* The table is left opened; the access counts are not released. */
 	g_topology_lock();
 	gctl_error(req, "%d", error);
 	return (error);
 }
 
 /*
  * Handle the "create" verb: create a new, empty partition table of scheme
  * gpp->gpp_scheme on provider gpp->gpp_provider.
  *
  * If a PART geom already exists for the provider it must carry the null
  * scheme (otherwise EEXIST); that geom and its consumer are reused and the
  * null table is replaced.  Otherwise a new geom and consumer are created
  * and opened with g_access(cp, 1, 1, 1).
  *
  * On success gpp->gpp_geom is filled in so that an automatic commit can
  * follow, and 0 is returned.  On failure the error is recorded in the
  * request and returned, and the previous state (no geom, or the null
  * table) is restored.
  */
 static int
 g_part_ctl_create(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_scheme *scheme;
 	struct g_part_table *null, *table;
 	struct sbuf *sb;
 	int attr, error;
 
 	pp = gpp->gpp_provider;
 	scheme = gpp->gpp_scheme;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
 	g_topology_assert();
 
 	/* Check that there isn't already a g_part geom on the provider. */
 	gp = g_part_find_geom(pp->name);
 	if (gp != NULL) {
 		null = gp->softc;
 		if (null->gpt_scheme != &g_part_null_scheme) {
 			gctl_error(req, "%d geom '%s'", EEXIST, pp->name);
 			return (EEXIST);
 		}
 	} else
 		null = NULL;
 
 	/* A requested entry count must be within the scheme's limits. */
 	if ((gpp->gpp_parms & G_PART_PARM_ENTRIES) &&
 	    (gpp->gpp_entries < scheme->gps_minent ||
 	     gpp->gpp_entries > scheme->gps_maxent)) {
 		gctl_error(req, "%d entries '%d'", EINVAL, gpp->gpp_entries);
 		return (EINVAL);
 	}
 
 	if (null == NULL)
 		gp = g_new_geomf(&g_part_class, "%s", pp->name);
 	gp->softc = kobj_create((kobj_class_t)gpp->gpp_scheme, M_GEOM,
 	    M_WAITOK);
 	table = gp->softc;
 	table->gpt_gp = gp;
 	table->gpt_scheme = gpp->gpp_scheme;
 	table->gpt_entries = (gpp->gpp_parms & G_PART_PARM_ENTRIES) ?
 	    gpp->gpp_entries : scheme->gps_minent;
 	LIST_INIT(&table->gpt_entry);
 	if (null == NULL) {
 		cp = g_new_consumer(gp);
 		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 		error = g_attach(cp, pp);
 		if (error == 0)
 			error = g_access(cp, 1, 1, 1);
 		if (error != 0) {
 			g_part_wither(gp, error);
 			gctl_error(req, "%d geom '%s'", error, pp->name);
 			return (error);
 		}
 		table->gpt_opened = 1;
 	} else {
 		/* Carry the pending state over from the null table. */
 		cp = LIST_FIRST(&gp->consumer);
 		table->gpt_opened = null->gpt_opened;
 		table->gpt_smhead = null->gpt_smhead;
 		table->gpt_smtail = null->gpt_smtail;
 	}
 
 	g_topology_unlock();
 
 	/* Make sure the provider has media. */
 	if (pp->mediasize == 0 || pp->sectorsize == 0) {
 		error = ENODEV;
 		goto fail;
 	}
 
 	/* Make sure we can nest and if so, determine our depth. */
 	error = g_getattr("PART::isleaf", cp, &attr);
 	if (!error && attr) {
 		error = ENODEV;
 		goto fail;
 	}
 	error = g_getattr("PART::depth", cp, &attr);
 	table->gpt_depth = (!error) ? attr + 1 : 0;
 
 	/*
 	 * Synthesize a disk geometry. Some partitioning schemes
 	 * depend on it and since some file systems need it even
 	 * when the partition scheme doesn't, we do it here in
 	 * scheme-independent code.
 	 */
 	g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 
 	error = G_PART_CREATE(table, gpp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 
 	table->gpt_created = 1;
 	/* The new table has taken over; the null table can go. */
 	if (null != NULL)
 		kobj_delete((kobj_t)null, M_GEOM);
 
 	/*
 	 * Support automatic commit by filling in the gpp_geom
 	 * parameter.
 	 */
 	gpp->gpp_parms |= G_PART_PARM_GEOM;
 	gpp->gpp_geom = gp;
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s created\n", gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 
 fail:
 	g_topology_lock();
 	if (null == NULL) {
 		/* We created the geom above: close it and tear it down. */
 		g_access(cp, -1, -1, -1);
 		g_part_wither(gp, error);
 	} else {
 		/* Put the null table back in place of the failed one. */
 		kobj_delete((kobj_t)gp->softc, M_GEOM);
 		gp->softc = null;
 	}
 	gctl_error(req, "%d provider", error);
 	return (error);
 }
 
 /*
  * Handle the "delete" verb: delete the partition with index
  * gpp->gpp_index from the table of gpp->gpp_geom.
  *
  * Returns 0 on success, ENOENT when there is no live, non-internal entry
  * with that index, or EBUSY when the partition's provider is open.  Errors
  * are also recorded in the request.
  */
 static int
 g_part_ctl_delete(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	pp = entry->gpe_pp;
 	if (pp != NULL) {
 		/* Refuse to delete a partition that is in use. */
 		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) {
 			gctl_error(req, "%d", EBUSY);
 			return (EBUSY);
 		}
 
 		/* Break the link between entry and provider both ways. */
 		pp->private = NULL;
 		entry->gpe_pp = NULL;
 	}
 
 	if (pp != NULL)
 		g_wither_provider(pp, ENXIO);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_cat(sb, " deleted\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 
 	/*
 	 * An entry that was created but never committed can simply be
 	 * dropped (only after the feedback above has used it); any other
 	 * entry is kept and flagged as deleted.
 	 */
 	if (entry->gpe_created) {
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	} else {
 		entry->gpe_modified = 0;
 		entry->gpe_deleted = 1;
 	}
 	return (0);
 }
 
 /*
  * Handle the "destroy" verb: destroy the partition table of
  * gpp->gpp_geom and replace it with a table of the null scheme.
  *
  * Without gpp_force the table must not contain any live, non-internal
  * entry; with gpp_force such entries are tolerated as long as their
  * providers are not open.  EBUSY is returned otherwise.  A scheme error
  * from its destroy method is returned as is.  Errors are also recorded in
  * the request.
  */
 static int
 g_part_ctl_destroy(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *null, *table;
 	struct sbuf *sb;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	/* Check for busy providers. */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (gpp->gpp_force) {
 			pp = entry->gpe_pp;
 			if (pp == NULL)
 				continue;
 			if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 				continue;
 		}
 		/* Live entry without force, or an open provider. */
 		gctl_error(req, "%d", EBUSY);
 		return (EBUSY);
 	}
 
 	if (gpp->gpp_force) {
 		/* Destroy all providers. */
 		LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 			pp = entry->gpe_pp;
 			if (pp != NULL) {
 				pp->private = NULL;
 				g_wither_provider(pp, ENXIO);
 			}
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 	}
 
 	error = G_PART_DESTROY(table, gpp);
 	if (error) {
 		gctl_error(req, "%d", error);
 		return (error);
 	}
 
 	/* Install a null-scheme table in place of the destroyed one. */
 	gp->softc = kobj_create((kobj_class_t)&g_part_null_scheme, M_GEOM,
 	    M_WAITOK);
 	null = gp->softc;
 	null->gpt_gp = gp;
 	null->gpt_scheme = &g_part_null_scheme;
 	LIST_INIT(&null->gpt_entry);
 
 	cp = LIST_FIRST(&gp->consumer);
 	pp = cp->provider;
 	null->gpt_last = pp->mediasize / pp->sectorsize - 1;
 
 	/* Carry the pending state over from the old table. */
 	null->gpt_depth = table->gpt_depth;
 	null->gpt_opened = table->gpt_opened;
 	null->gpt_smhead = table->gpt_smhead;
 	null->gpt_smtail = table->gpt_smtail;
 
 	/* Release whatever entries the old table still holds. */
 	while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	}
 	kobj_delete((kobj_t)table, M_GEOM);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s destroyed\n", gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 /*
  * Handle the "modify" verb: let the scheme change the partition with
  * index gpp->gpp_index in the table of gpp->gpp_geom.
  *
  * Returns 0 on success, ENOENT when there is no live, non-internal entry
  * with that index, or the error returned by the scheme's modify method.
  * Errors are also recorded in the request.
  */
 static int
 g_part_ctl_modify(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *msg;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	/* Find the live, non-internal entry with the requested index. */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (!entry->gpe_deleted && !entry->gpe_internal &&
 		    entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	error = G_PART_MODIFY(table, entry, gpp);
 	if (error != 0) {
 		gctl_error(req, "%d", error);
 		return (error);
 	}
 
 	/* An entry that is still uncommitted stays marked as created. */
 	if (entry->gpe_created == 0)
 		entry->gpe_modified = 1;
 
 	/* Report back to the caller only when asked to. */
 	if ((gpp->gpp_parms & G_PART_PARM_OUTPUT) != 0) {
 		msg = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, msg, gp->name);
 		sbuf_cat(msg, " modified\n");
 		sbuf_finish(msg);
 		gctl_set_param(req, "output", sbuf_data(msg),
 		    sbuf_len(msg) + 1);
 		sbuf_delete(msg);
 	}
 	return (0);
 }
 
 /*
  * Handle the 'move' verb.  The verb is accepted by the dispatcher but has
  * no implementation here: ENOSYS is posted on the request and returned.
  */
 static int
 g_part_ctl_move(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	const int rc = ENOSYS;
 
 	gctl_error(req, "%d verb 'move'", rc);
 	return (rc);
 }
 
 /*
  * Handle the 'recover' verb.  When the table is flagged corrupt, ask the
  * scheme to recover it and re-run the integrity check; otherwise there is
  * nothing to do.  Either outcome is reported through "output" on request.
  *
  * Returns 0 on success or the error from G_PART_RECOVER /
  * g_part_check_integrity(), which is also posted with gctl_error().
  */
 static int
 g_part_ctl_recover(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_part_table *tbl;
 	struct g_geom *geom;
 	struct sbuf *msg;
 	int rc, repaired;
 
 	geom = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, geom->name));
 	g_topology_assert();
 	tbl = geom->softc;
 	rc = 0;
 	repaired = 0;
 
 	if (tbl->gpt_corrupt) {
 		rc = G_PART_RECOVER(tbl);
 		if (rc == 0) {
 			/* Recovery claims success; verify the result. */
 			rc = g_part_check_integrity(tbl,
 			    LIST_FIRST(&geom->consumer));
 		}
 		if (rc != 0) {
 			gctl_error(req, "%d recovering '%s' failed",
 			    rc, geom->name);
 			return (rc);
 		}
 		repaired = 1;
 	}
 
 	/* Provide feedback if so requested. */
 	if ((gpp->gpp_parms & G_PART_PARM_OUTPUT) != 0) {
 		msg = sbuf_new_auto();
 		if (repaired != 0) {
 			sbuf_printf(msg, "%s recovered\n", geom->name);
 		} else {
 			sbuf_printf(msg, "%s recovering is not needed\n",
 			    geom->name);
 		}
 		sbuf_finish(msg);
 		gctl_set_param(req, "output", sbuf_data(msg),
 		    sbuf_len(msg) + 1);
 		sbuf_delete(msg);
 	}
 	return (0);
 }
 
 /*
  * Handle the 'resize' verb: change the size of the entry with index
  * gpp->gpp_index to gpp->gpp_size sectors, keeping its start.
  *
  * The new end must stay within the table (gpt_last) and must not run
  * into or over any other live entry.  Shrinking a partition whose
  * provider is open is refused with EBUSY unless bit 16 of g_debugflags
  * is set.  On success the provider's mediasize is updated.
  *
  * Returns 0 on success; ENOENT, EINVAL, ENOSPC, EBUSY or the scheme's
  * error otherwise.  Errors are also posted with gctl_error().
  */
 static int
 g_part_ctl_resize(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *pe, *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	quad_t end;
 	int error;
 	off_t mediasize;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 	table = gp->softc;
 
 	/* check gpp_index */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	/*
 	 * check gpp_size
 	 *
 	 * Compare the size against the room between gpe_start and gpt_last
 	 * before forming the new end, so that a very large request cannot
 	 * overflow the signed addition and slip past the bound check.
 	 */
 	if (gpp->gpp_size < 1 ||
 	    gpp->gpp_size > table->gpt_last - entry->gpe_start + 1) {
 		gctl_error(req, "%d size '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_size);
 		return (EINVAL);
 	}
 	end = entry->gpe_start + gpp->gpp_size - 1;
 
 	LIST_FOREACH(pe, &table->gpt_entry, gpe_entry) {
 		if (pe->gpe_deleted || pe->gpe_internal || pe == entry)
 			continue;
 		/* The new end lands inside another partition. */
 		if (end >= pe->gpe_start && end <= pe->gpe_end) {
 			gctl_error(req, "%d end '%jd'", ENOSPC,
 			    (intmax_t)end);
 			return (ENOSPC);
 		}
 		/* The new extent would swallow another partition whole. */
 		if (entry->gpe_start < pe->gpe_start && end > pe->gpe_end) {
 			gctl_error(req, "%d size '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_size);
 			return (ENOSPC);
 		}
 	}
 
 	pp = entry->gpe_pp;
 	if ((g_debugflags & 16) == 0 &&
 	    (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)) {
 		if (entry->gpe_end - entry->gpe_start + 1 > gpp->gpp_size) {
 			/* Deny shrinking of an opened partition. */
 			gctl_error(req, "%d", EBUSY);
 			return (EBUSY);
 		}
 	}
 
 	error = G_PART_RESIZE(table, entry, gpp);
 	if (error) {
 		gctl_error(req, "%d%s", error, error != EBUSY ? "":
 		    " resizing will lead to unexpected shrinking"
 		    " due to alignment");
 		return (error);
 	}
 
 	if (!entry->gpe_created)
 		entry->gpe_modified = 1;
 
 	/* update mediasize of changed provider */
 	mediasize = (entry->gpe_end - entry->gpe_start + 1) *
 		pp->sectorsize;
 	g_resize_provider(pp, mediasize);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_cat(sb, " resized\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_setunset(struct gctl_req *req, struct g_part_parms *gpp,
     unsigned int set)
 {
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	if (gpp->gpp_parms & G_PART_PARM_INDEX) {
 		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 			if (entry->gpe_deleted || entry->gpe_internal)
 				continue;
 			if (entry->gpe_index == gpp->gpp_index)
 				break;
 		}
 		if (entry == NULL) {
 			gctl_error(req, "%d index '%d'", ENOENT,
 			    gpp->gpp_index);
 			return (ENOENT);
 		}
 	} else
 		entry = NULL;
 
 	error = G_PART_SETUNSET(table, entry, gpp->gpp_attrib, set);
 	if (error) {
 		gctl_error(req, "%d attrib '%s'", error, gpp->gpp_attrib);
 		return (error);
 	}
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s %sset on ", gpp->gpp_attrib,
 		    (set) ? "" : "un");
 		if (entry)
 			G_PART_FULLNAME(table, entry, sb, gp->name);
 		else
 			sbuf_cat(sb, gp->name);
 		sbuf_cat(sb, "\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 /*
  * Handle the 'undo' verb: throw away the in-core changes made since the
  * table was opened, re-read the table through the consumer and recreate
  * the providers.
  *
  * Returns EPERM when the table is not open, 0 on success (including the
  * case where re-probing fails and the geom is withered), or the error
  * from the reprobe check, G_PART_READ or g_part_check_integrity().
  * Errors other than the wither case are posted with gctl_error().
  *
  * Called with the topology lock held; the lock is dropped while the
  * table is re-probed/re-read and is held again on every return path.
  */
 static int
 g_part_ctl_undo(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct g_geom *gp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *table;
 	int error, reprobe;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	/* Nothing can be undone on a table that is not open. */
 	if (!table->gpt_opened) {
 		gctl_error(req, "%d", EPERM);
 		return (EPERM);
 	}
 
 	cp = LIST_FIRST(&gp->consumer);
 	/*
 	 * Clear the modified flag on every entry.  Entries flagged as
 	 * created lose their provider and are marked deleted; every entry
 	 * marked deleted is then unlinked and freed.
 	 */
 	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 		entry->gpe_modified = 0;
 		if (entry->gpe_created) {
 			pp = entry->gpe_pp;
 			if (pp != NULL) {
 				pp->private = NULL;
 				entry->gpe_pp = NULL;
 				g_wither_provider(pp, ENXIO);
 			}
 			entry->gpe_deleted = 1;
 		}
 		if (entry->gpe_deleted) {
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 	}
 
 	g_topology_unlock();
 
 	/*
 	 * A table using the null scheme, or one flagged as created, is
 	 * probed again from scratch instead of merely being re-read.
 	 */
 	reprobe = (table->gpt_scheme == &g_part_null_scheme ||
 	    table->gpt_created) ? 1 : 0;
 
 	if (reprobe) {
 		/* Any remaining non-internal entry blocks the reprobe. */
 		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 			if (entry->gpe_internal)
 				continue;
 			error = EBUSY;
 			goto fail;
 		}
 		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 		error = g_part_probe(gp, cp, table->gpt_depth);
 		if (error) {
 			/*
 			 * Probing failed: release the access held for the
 			 * open table and wither the geom.  This is reported
 			 * as success to the caller.
 			 */
 			g_topology_lock();
 			g_access(cp, -1, -1, -1);
 			g_part_wither(gp, error);
 			return (0);
 		}
 		/* Pick up the softc again after the probe. */
 		table = gp->softc;
 
 		/*
 		 * Synthesize a disk geometry. Some partitioning schemes
 		 * depend on it and since some file systems need it even
 		 * when the partition scheme doesn't, we do it here in
 		 * scheme-independent code.
 		 */
 		pp = cp->provider;
 		g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 	}
 
 	error = G_PART_READ(table, cp);
 	if (error)
 		goto fail;
 	error = g_part_check_integrity(table, cp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 	/* Create providers for every non-internal entry. */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (!entry->gpe_internal)
 			g_part_new_provider(gp, table, entry);
 	}
 
 	/* The table is back in its unopened state; drop our access. */
 	table->gpt_opened = 0;
 	g_access(cp, -1, -1, -1);
 	return (0);
 
 fail:
 	/* Re-take the topology lock before reporting the error. */
 	g_topology_lock();
 	gctl_error(req, "%d", error);
 	return (error);
 }
 
 static void
 g_part_wither(struct g_geom *gp, int error)
 {
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 
 	table = gp->softc;
 	if (table != NULL) {
 		G_PART_DESTROY(table, NULL);
 		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 		if (gp->softc != NULL) {
 			kobj_delete((kobj_t)gp->softc, M_GEOM);
 			gp->softc = NULL;
 		}
 	}
 	g_wither_geom(gp, error);
 }
 
 /*
  * Class methods.
  */
 
 static void
 g_part_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	struct g_part_parms gpp;
 	struct g_part_table *table;
 	struct gctl_req_arg *ap;
 	enum g_part_ctl ctlreq;
 	unsigned int i, mparms, oparms, parm;
 	int auto_commit, close_on_error;
 	int error, modifies;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb));
 	g_topology_assert();
 
 	ctlreq = G_PART_CTL_NONE;
 	modifies = 1;
 	mparms = 0;
 	oparms = G_PART_PARM_FLAGS | G_PART_PARM_OUTPUT | G_PART_PARM_VERSION;
 	switch (*verb) {
 	case 'a':
 		if (!strcmp(verb, "add")) {
 			ctlreq = G_PART_CTL_ADD;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_SIZE |
 			    G_PART_PARM_START | G_PART_PARM_TYPE;
 			oparms |= G_PART_PARM_INDEX | G_PART_PARM_LABEL;
 		}
 		break;
 	case 'b':
 		if (!strcmp(verb, "bootcode")) {
 			ctlreq = G_PART_CTL_BOOTCODE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_BOOTCODE;
 		}
 		break;
 	case 'c':
 		if (!strcmp(verb, "commit")) {
 			ctlreq = G_PART_CTL_COMMIT;
 			mparms |= G_PART_PARM_GEOM;
 			modifies = 0;
 		} else if (!strcmp(verb, "create")) {
 			ctlreq = G_PART_CTL_CREATE;
 			mparms |= G_PART_PARM_PROVIDER | G_PART_PARM_SCHEME;
 			oparms |= G_PART_PARM_ENTRIES;
 		}
 		break;
 	case 'd':
 		if (!strcmp(verb, "delete")) {
 			ctlreq = G_PART_CTL_DELETE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 		} else if (!strcmp(verb, "destroy")) {
 			ctlreq = G_PART_CTL_DESTROY;
 			mparms |= G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_FORCE;
 		}
 		break;
 	case 'm':
 		if (!strcmp(verb, "modify")) {
 			ctlreq = G_PART_CTL_MODIFY;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 			oparms |= G_PART_PARM_LABEL | G_PART_PARM_TYPE;
 		} else if (!strcmp(verb, "move")) {
 			ctlreq = G_PART_CTL_MOVE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 		}
 		break;
 	case 'r':
 		if (!strcmp(verb, "recover")) {
 			ctlreq = G_PART_CTL_RECOVER;
 			mparms |= G_PART_PARM_GEOM;
 		} else if (!strcmp(verb, "resize")) {
 			ctlreq = G_PART_CTL_RESIZE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX |
 			    G_PART_PARM_SIZE;
 		}
 		break;
 	case 's':
 		if (!strcmp(verb, "set")) {
 			ctlreq = G_PART_CTL_SET;
 			mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_INDEX;
 		}
 		break;
 	case 'u':
 		if (!strcmp(verb, "undo")) {
 			ctlreq = G_PART_CTL_UNDO;
 			mparms |= G_PART_PARM_GEOM;
 			modifies = 0;
 		} else if (!strcmp(verb, "unset")) {
 			ctlreq = G_PART_CTL_UNSET;
 			mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_INDEX;
 		}
 		break;
 	}
 	if (ctlreq == G_PART_CTL_NONE) {
 		gctl_error(req, "%d verb '%s'", EINVAL, verb);
 		return;
 	}
 
 	bzero(&gpp, sizeof(gpp));
 	for (i = 0; i < req->narg; i++) {
 		ap = &req->arg[i];
 		parm = 0;
 		switch (ap->name[0]) {
 		case 'a':
 			if (!strcmp(ap->name, "arg0")) {
 				parm = mparms &
 				    (G_PART_PARM_GEOM | G_PART_PARM_PROVIDER);
 			}
 			if (!strcmp(ap->name, "attrib"))
 				parm = G_PART_PARM_ATTRIB;
 			break;
 		case 'b':
 			if (!strcmp(ap->name, "bootcode"))
 				parm = G_PART_PARM_BOOTCODE;
 			break;
 		case 'c':
 			if (!strcmp(ap->name, "class"))
 				continue;
 			break;
 		case 'e':
 			if (!strcmp(ap->name, "entries"))
 				parm = G_PART_PARM_ENTRIES;
 			break;
 		case 'f':
 			if (!strcmp(ap->name, "flags"))
 				parm = G_PART_PARM_FLAGS;
 			else if (!strcmp(ap->name, "force"))
 				parm = G_PART_PARM_FORCE;
 			break;
 		case 'i':
 			if (!strcmp(ap->name, "index"))
 				parm = G_PART_PARM_INDEX;
 			break;
 		case 'l':
 			if (!strcmp(ap->name, "label"))
 				parm = G_PART_PARM_LABEL;
 			break;
 		case 'o':
 			if (!strcmp(ap->name, "output"))
 				parm = G_PART_PARM_OUTPUT;
 			break;
 		case 's':
 			if (!strcmp(ap->name, "scheme"))
 				parm = G_PART_PARM_SCHEME;
 			else if (!strcmp(ap->name, "size"))
 				parm = G_PART_PARM_SIZE;
 			else if (!strcmp(ap->name, "start"))
 				parm = G_PART_PARM_START;
 			break;
 		case 't':
 			if (!strcmp(ap->name, "type"))
 				parm = G_PART_PARM_TYPE;
 			break;
 		case 'v':
 			if (!strcmp(ap->name, "verb"))
 				continue;
 			else if (!strcmp(ap->name, "version"))
 				parm = G_PART_PARM_VERSION;
 			break;
 		}
 		if ((parm & (mparms | oparms)) == 0) {
 			gctl_error(req, "%d param '%s'", EINVAL, ap->name);
 			return;
 		}
 		switch (parm) {
 		case G_PART_PARM_ATTRIB:
 			error = g_part_parm_str(req, ap->name,
 			    &gpp.gpp_attrib);
 			break;
 		case G_PART_PARM_BOOTCODE:
 			error = g_part_parm_bootcode(req, ap->name,
 			    &gpp.gpp_codeptr, &gpp.gpp_codesize);
 			break;
 		case G_PART_PARM_ENTRIES:
 			error = g_part_parm_intmax(req, ap->name,
 			    &gpp.gpp_entries);
 			break;
 		case G_PART_PARM_FLAGS:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_flags);
 			break;
 		case G_PART_PARM_FORCE:
 			error = g_part_parm_uint32(req, ap->name,
 			    &gpp.gpp_force);
 			break;
 		case G_PART_PARM_GEOM:
 			error = g_part_parm_geom(req, ap->name, &gpp.gpp_geom);
 			break;
 		case G_PART_PARM_INDEX:
 			error = g_part_parm_intmax(req, ap->name,
 			    &gpp.gpp_index);
 			break;
 		case G_PART_PARM_LABEL:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_label);
 			break;
 		case G_PART_PARM_OUTPUT:
 			error = 0;	/* Write-only parameter */
 			break;
 		case G_PART_PARM_PROVIDER:
 			error = g_part_parm_provider(req, ap->name,
 			    &gpp.gpp_provider);
 			break;
 		case G_PART_PARM_SCHEME:
 			error = g_part_parm_scheme(req, ap->name,
 			    &gpp.gpp_scheme);
 			break;
 		case G_PART_PARM_SIZE:
 			error = g_part_parm_quad(req, ap->name, &gpp.gpp_size);
 			break;
 		case G_PART_PARM_START:
 			error = g_part_parm_quad(req, ap->name,
 			    &gpp.gpp_start);
 			break;
 		case G_PART_PARM_TYPE:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_type);
 			break;
 		case G_PART_PARM_VERSION:
 			error = g_part_parm_uint32(req, ap->name,
 			    &gpp.gpp_version);
 			break;
 		default:
 			error = EDOOFUS;
 			gctl_error(req, "%d %s", error, ap->name);
 			break;
 		}
 		if (error != 0) {
 			if (error == ENOATTR) {
 				gctl_error(req, "%d param '%s'", error,
 				    ap->name);
 			}
 			return;
 		}
 		gpp.gpp_parms |= parm;
 	}
 	if ((gpp.gpp_parms & mparms) != mparms) {
 		parm = mparms - (gpp.gpp_parms & mparms);
 		gctl_error(req, "%d param '%x'", ENOATTR, parm);
 		return;
 	}
 
 	/* Obtain permissions if possible/necessary. */
 	close_on_error = 0;
 	table = NULL;
 	if (modifies && (gpp.gpp_parms & G_PART_PARM_GEOM)) {
 		table = gpp.gpp_geom->softc;
 		if (table != NULL && table->gpt_corrupt &&
 		    ctlreq != G_PART_CTL_DESTROY &&
 		    ctlreq != G_PART_CTL_RECOVER) {
 			gctl_error(req, "%d table '%s' is corrupt",
 			    EPERM, gpp.gpp_geom->name);
 			return;
 		}
 		if (table != NULL && !table->gpt_opened) {
 			error = g_access(LIST_FIRST(&gpp.gpp_geom->consumer),
 			    1, 1, 1);
 			if (error) {
 				gctl_error(req, "%d geom '%s'", error,
 				    gpp.gpp_geom->name);
 				return;
 			}
 			table->gpt_opened = 1;
 			close_on_error = 1;
 		}
 	}
 
 	/* Allow the scheme to check or modify the parameters. */
 	if (table != NULL) {
 		error = G_PART_PRECHECK(table, ctlreq, &gpp);
 		if (error) {
 			gctl_error(req, "%d pre-check failed", error);
 			goto out;
 		}
 	} else
 		error = EDOOFUS;	/* Prevent bogus uninit. warning. */
 
 	switch (ctlreq) {
 	case G_PART_CTL_NONE:
 		panic("%s", __func__);
 	case G_PART_CTL_ADD:
 		error = g_part_ctl_add(req, &gpp);
 		break;
 	case G_PART_CTL_BOOTCODE:
 		error = g_part_ctl_bootcode(req, &gpp);
 		break;
 	case G_PART_CTL_COMMIT:
 		error = g_part_ctl_commit(req, &gpp);
 		break;
 	case G_PART_CTL_CREATE:
 		error = g_part_ctl_create(req, &gpp);
 		break;
 	case G_PART_CTL_DELETE:
 		error = g_part_ctl_delete(req, &gpp);
 		break;
 	case G_PART_CTL_DESTROY:
 		error = g_part_ctl_destroy(req, &gpp);
 		break;
 	case G_PART_CTL_MODIFY:
 		error = g_part_ctl_modify(req, &gpp);
 		break;
 	case G_PART_CTL_MOVE:
 		error = g_part_ctl_move(req, &gpp);
 		break;
 	case G_PART_CTL_RECOVER:
 		error = g_part_ctl_recover(req, &gpp);
 		break;
 	case G_PART_CTL_RESIZE:
 		error = g_part_ctl_resize(req, &gpp);
 		break;
 	case G_PART_CTL_SET:
 		error = g_part_ctl_setunset(req, &gpp, 1);
 		break;
 	case G_PART_CTL_UNDO:
 		error = g_part_ctl_undo(req, &gpp);
 		break;
 	case G_PART_CTL_UNSET:
 		error = g_part_ctl_setunset(req, &gpp, 0);
 		break;
 	}
 
 	/* Implement automatic commit. */
 	if (!error) {
 		auto_commit = (modifies &&
 		    (gpp.gpp_parms & G_PART_PARM_FLAGS) &&
 		    strchr(gpp.gpp_flags, 'C') != NULL) ? 1 : 0;
 		if (auto_commit) {
 			KASSERT(gpp.gpp_parms & G_PART_PARM_GEOM, ("%s",
 			    __func__));
 			error = g_part_ctl_commit(req, &gpp);
 		}
 	}
 
  out:
 	if (error && close_on_error) {
 		g_access(LIST_FIRST(&gpp.gpp_geom->consumer), -1, -1, -1);
 		table->gpt_opened = 0;
 	}
 }
 
 /*
  * Class destroy_geom method: wither the geom (and its table, if any)
  * with EINVAL.  `req' is not used.  Always returns 0.  Must be called
  * with the topology lock held.
  */
 static int
 g_part_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp)
 {
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, gp->name));
 	g_topology_assert();
 
 	g_part_wither(gp, EINVAL);
 	return (0);
 }
 
 /*
  * Class taste method: try to recognise a partition table on provider
  * `pp'.  A geom and consumer are created and attached, the provider is
  * probed for a known scheme, the table is read and checked, and one
  * provider is created per non-internal entry.
  *
  * Returns the new geom on success, or NULL when the provider is open for
  * writing, cannot be attached/opened, has no media, reports itself as a
  * leaf, or probing/reading/checking fails.  On failure the geom and
  * consumer created here are destroyed again.
  *
  * Called with the topology lock held; the lock is dropped during probing
  * and reading and is held again on return.
  */
 static struct g_geom *
 g_part_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct root_hold_token *rht;
 	int attr, depth;
 	int error;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name));
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	/*
 	 * Create a GEOM with consumer and hook it up to the provider.
 	 * With that we become part of the topology. Obtain read access
 	 * to the provider.
 	 */
 	gp = g_new_geomf(mp, "%s", pp->name);
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error == 0)
 		error = g_access(cp, 1, 0, 0);
 	if (error != 0) {
 		/* Detach only if the attach itself had succeeded. */
 		if (cp->provider)
 			g_detach(cp);
 		g_destroy_consumer(cp);
 		g_destroy_geom(gp);
 		return (NULL);
 	}
 
 	/* Hold the root mount while we probe with the lock dropped. */
 	rht = root_mount_hold(mp->name);
 	g_topology_unlock();
 
 	/*
 	 * Short-circuit the whole probing galore when there's no
 	 * media present.
 	 */
 	if (pp->mediasize == 0 || pp->sectorsize == 0) {
 		error = ENODEV;
 		goto fail;
 	}
 
 	/* Make sure we can nest and if so, determine our depth. */
 	error = g_getattr("PART::isleaf", cp, &attr);
 	if (!error && attr) {
 		error = ENODEV;
 		goto fail;
 	}
 	/* Depth is one more than the provider reports, or 0 if it has none. */
 	error = g_getattr("PART::depth", cp, &attr);
 	depth = (!error) ? attr + 1 : 0;
 
 	error = g_part_probe(gp, cp, depth);
 	if (error)
 		goto fail;
 
 	table = gp->softc;
 
 	/*
 	 * Synthesize a disk geometry. Some partitioning schemes
 	 * depend on it and since some file systems need it even
 	 * when the partition scheme doesn't, we do it here in
 	 * scheme-independent code.
 	 */
 	g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 
 	error = G_PART_READ(table, cp);
 	if (error)
 		goto fail;
 	error = g_part_check_integrity(table, cp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 	/* Create providers for every non-internal entry. */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (!entry->gpe_internal)
 			g_part_new_provider(gp, table, entry);
 	}
 
 	/* Tasting is done: release the root hold and our read access. */
 	root_mount_rel(rht);
 	g_access(cp, -1, 0, 0);
 	return (gp);
 
  fail:
 	/* Undo everything set up above, in reverse order. */
 	g_topology_lock();
 	root_mount_rel(rht);
 	g_access(cp, -1, 0, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	return (NULL);
 }
 
 /*
  * Geom methods.
  */
 
 /*
  * Geom access method: propagate an access-count change on one of our
  * providers to the geom's first consumer.  Read and write deltas are
  * passed through; the exclusive delta handed down is dw + de, so every
  * writer also counts as an exclusive opener below us.
  *
  * Returns whatever g_access() returns.
  */
 static int
 g_part_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	int excl;
 
 	G_PART_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr,
 	    dw, de));
 
 	/* We always gain write-exclusive access. */
 	excl = dw + de;
 	return (g_access(LIST_FIRST(&pp->geom->consumer), dr, dw, excl));
 }
 
 /*
  * Geom dumpconf method: append configuration text for this geom to `sb'.
  * What is written depends on which arguments are set:
  *
  *  - indent == NULL: a one-line " i <index> o <offset> ty <type>"
  *    summary for provider `pp' (cp must be NULL);
  *  - cp != NULL: nothing (consumer configuration);
  *  - pp != NULL: <start>, <end>, <index>, <type>, <offset> and <length>
  *    elements for the provider's entry;
  *  - neither: <scheme>, <entries>, <first>, <last>, <fwsectors>,
  *    <fwheads>, <state> and <modified> elements for the geom.
  *
  * Except in the consumer case, the scheme's G_PART_DUMPCONF method is
  * invoked afterwards to add its own output.  Providers without a private
  * entry produce no output.
  */
 static void
 g_part_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	char buf[64];
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 
 	KASSERT(sb != NULL && gp != NULL, ("%s", __func__));
 	table = gp->softc;
 
 	if (indent == NULL) {
 		KASSERT(cp == NULL && pp != NULL, ("%s", __func__));
 		entry = pp->private;
 		if (entry == NULL)
 			return;
 		sbuf_printf(sb, " i %u o %ju ty %s", entry->gpe_index,
 		    (uintmax_t)entry->gpe_offset,
 		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
 		/*
 		 * libdisk compatibility quirk - the scheme dumps the
 		 * slicer name and partition type in a way that is
 		 * compatible with libdisk. When libdisk is not used
 		 * anymore, this should go away.
 		 */
 		G_PART_DUMPCONF(table, entry, sb, indent);
 	} else if (cp != NULL) {	/* Consumer configuration. */
 		KASSERT(pp == NULL, ("%s", __func__));
 		/* none */
 	} else if (pp != NULL) {	/* Provider configuration. */
 		entry = pp->private;
 		if (entry == NULL)
 			return;
 		sbuf_printf(sb, "%s<start>%ju</start>\n", indent,
 		    (uintmax_t)entry->gpe_start);
 		sbuf_printf(sb, "%s<end>%ju</end>\n", indent,
 		    (uintmax_t)entry->gpe_end);
 		sbuf_printf(sb, "%s<index>%u</index>\n", indent,
 		    entry->gpe_index);
 		sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
 		sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent,
 		    (uintmax_t)entry->gpe_offset);
 		sbuf_printf(sb, "%s<length>%ju</length>\n", indent,
 		    (uintmax_t)pp->mediasize);
 		G_PART_DUMPCONF(table, entry, sb, indent);
 	} else {			/* Geom configuration. */
 		sbuf_printf(sb, "%s<scheme>%s</scheme>\n", indent,
 		    table->gpt_scheme->name);
 		sbuf_printf(sb, "%s<entries>%u</entries>\n", indent,
 		    table->gpt_entries);
 		sbuf_printf(sb, "%s<first>%ju</first>\n", indent,
 		    (uintmax_t)table->gpt_first);
 		sbuf_printf(sb, "%s<last>%ju</last>\n", indent,
 		    (uintmax_t)table->gpt_last);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n", indent,
 		    table->gpt_sectors);
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n", indent,
 		    table->gpt_heads);
 		sbuf_printf(sb, "%s<state>%s</state>\n", indent,
 		    table->gpt_corrupt ? "CORRUPT": "OK");
 		/* <modified> reflects whether the table is currently open. */
 		sbuf_printf(sb, "%s<modified>%s</modified>\n", indent,
 		    table->gpt_opened ? "true": "false");
 		G_PART_DUMPCONF(table, NULL, sb, indent);
 	}
 }
 
 /*
  * Geom ioctl method: hand an ioctl issued on provider `pp' to the
  * partitioning scheme of the geom's table via G_PART_IOCTL and return
  * the scheme's result unchanged.
  *
  * NOTE(review): the comment previously placed here described a bio
  * "start routine" running in the DOWN I/O path (no sleeping, no topology
  * lock, must call g_io_deliver()).  Nothing in this function handles a
  * bio, so that text looked misplaced; confirm whether any of those
  * constraints are meant to apply to ioctl handling.
  */
 static int
 g_part_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td)
 {
 	struct g_part_table *tbl;
 	int rc;
 
 	tbl = pp->geom->softc;
 	rc = G_PART_IOCTL(tbl, pp, cmd, data, fflag, td);
 	return (rc);
 }
 
 static void
 g_part_resize(struct g_consumer *cp)
 {
 	struct g_part_table *table;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
 	g_topology_assert();
 
 	table = cp->geom->softc;
 	if (table->gpt_opened == 0) {
 		if (g_access(cp, 1, 1, 1) != 0)
 			return;
 		table->gpt_opened = 1;
 	}
 	if (G_PART_RESIZE(table, NULL, NULL) == 0)
 		printf("GEOM_PART: %s was automatically resized.\n"
 		    "  Use `gpart commit %s` to save changes or "
 		    "`gpart undo %s` to revert them.\n", cp->geom->name,
 		    cp->geom->name, cp->geom->name);
 	if (g_part_check_integrity(table, cp) != 0) {
 		g_access(cp, -1, -1, -1);
 		table->gpt_opened = 0;
 		g_part_wither(table->gpt_gp, ENXIO);
 	}
 }
 
 static void
 g_part_orphan(struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct g_part_table *table;
 
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("%s", __func__));
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
 	g_topology_assert();
 
 	KASSERT(pp->error != 0, ("%s", __func__));
 	table = cp->geom->softc;
 	if (table != NULL && table->gpt_opened)
 		g_access(cp, -1, -1, -1);
 	g_part_wither(cp->geom, pp->error);
 }
 
 /*
  * Geom spoiled method: flag consumer `cp' with G_CF_ORPHAN and wither
  * its geom with ENXIO.  Must be called with the topology lock held.
  */
 static void
 g_part_spoiled(struct g_consumer *cp)
 {
 	struct g_geom *geom;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
 	g_topology_assert();
 
 	geom = cp->geom;
 	cp->flags = cp->flags | G_CF_ORPHAN;
 	g_part_wither(geom, ENXIO);
 }
 
 /*
  * Geom start method: handle bio `bp' addressed to one of our providers.
  *
  *  - BIO_DELETE/BIO_READ/BIO_WRITE: reject requests starting at or past
  *    the provider's mediasize with EIO, otherwise clone the bio, clip it
  *    to the provider, shift it by the entry's offset and send it down.
  *  - BIO_FLUSH: cloned and passed down unchanged.
  *  - BIO_GETATTR: answer the attributes known here; for
  *    "GEOM::kerneldump" validate and translate the dump window, then
  *    pass the request down.
  *  - anything else: EOPNOTSUPP.
  *
  * A provider without a private entry gets ENXIO; a failed clone ENOMEM.
  */
 static void
 g_part_start(struct bio *bp)
 {
 	struct bio *bp2;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct g_kerneldump *gkd;
 	struct g_provider *pp;
 	char buf[64];
 
+	biotrack(bp, __func__);
+
 	pp = bp->bio_to;
 	gp = pp->geom;
 	table = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 
 	G_PART_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd,
 	    pp->name));
 
 	/* The provider is no longer backed by an entry. */
 	entry = pp->private;
 	if (entry == NULL) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 	case BIO_READ:
 	case BIO_WRITE:
 		if (bp->bio_offset >= pp->mediasize) {
 			g_io_deliver(bp, EIO);
 			return;
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		/* Clip the request so it ends at the provider's end. */
 		if (bp2->bio_offset + bp2->bio_length > pp->mediasize)
 			bp2->bio_length = pp->mediasize - bp2->bio_offset;
 		bp2->bio_done = g_std_done;
 		/* Translate from partition-relative to consumer offset. */
 		bp2->bio_offset += entry->gpe_offset;
 		g_io_request(bp2, cp);
 		return;
 	case BIO_FLUSH:
 		break;
 	case BIO_GETATTR:
 		/* Each helper returns non-zero once it has answered the bio. */
 		if (g_handleattr_int(bp, "GEOM::fwheads", table->gpt_heads))
 			return;
 		if (g_handleattr_int(bp, "GEOM::fwsectors", table->gpt_sectors))
 			return;
 		if (g_handleattr_int(bp, "PART::isleaf", table->gpt_isleaf))
 			return;
 		if (g_handleattr_int(bp, "PART::depth", table->gpt_depth))
 			return;
 		if (g_handleattr_str(bp, "PART::scheme",
 		    table->gpt_scheme->name))
 			return;
 		if (g_handleattr_str(bp, "PART::type",
 		    G_PART_TYPE(table, entry, buf, sizeof(buf))))
 			return;
 		if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) {
 			/*
 			 * Check that the partition is suitable for kernel
 			 * dumps. Typically only swap partitions should be
 			 * used. If the request comes from the nested scheme
 			 * we allow dumping there as well.
 			 */
 			if ((bp->bio_from == NULL ||
 			    bp->bio_from->geom->class != &g_part_class) &&
 			    G_PART_DUMPTO(table, entry) == 0) {
 				g_io_deliver(bp, ENODEV);
 				printf("GEOM_PART: Partition '%s' not suitable"
 				    " for kernel dumps (wrong type?)\n",
 				    pp->name);
 				return;
 			}
 			gkd = (struct g_kerneldump *)bp->bio_data;
 			if (gkd->offset >= pp->mediasize) {
 				g_io_deliver(bp, EIO);
 				return;
 			}
 			/* Clip and translate the dump window like regular I/O. */
 			if (gkd->offset + gkd->length > pp->mediasize)
 				gkd->length = pp->mediasize - gkd->offset;
 			gkd->offset += entry->gpe_offset;
 		}
 		break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	/* Pass the (possibly adjusted) request on to the consumer. */
 	bp2 = g_clone_bio(bp);
 	if (bp2 == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	bp2->bio_done = g_std_done;
 	g_io_request(bp2, cp);
 }
 
 /*
  * Class init method: put the built-in null scheme at the head of the
  * list of registered schemes.
  */
 static void
 g_part_init(struct g_class *mp)
 {
 
 	TAILQ_INSERT_HEAD(&g_part_schemes, &g_part_null_scheme, scheme_list);
 }
 
 /*
  * Class fini method: take the built-in null scheme off the list of
  * registered schemes again.
  */
 static void
 g_part_fini(struct g_class *mp)
 {
 
 	TAILQ_REMOVE(&g_part_schemes, &g_part_null_scheme, scheme_list);
 }
 
 /*
  * Event handler used when a scheme module is unloaded.  `arg' points to
  * a uintptr_t that carries the scheme pointer in and the result out.
  *
  * Every geom of this class that uses the scheme is withered with ENOSYS
  * if none of its providers or consumers is open.  If any such geom is
  * still in use the result is EBUSY and the scheme stays registered;
  * otherwise the scheme is removed from the list and the result is 0.
  * A cancelled event (EV_CANCEL) leaves *arg untouched.
  */
 static void
 g_part_unload_event(void *arg, int flag)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_scheme *scheme;
 	struct g_part_table *table;
 	uintptr_t *xchg;
 	int acc, error;
 
 	if (flag == EV_CANCEL)
 		return;
 
 	xchg = arg;
 	error = 0;
 	scheme = (void *)(*xchg);
 
 	g_topology_assert();
 
 	LIST_FOREACH(gp, &g_part_class.geom, geom) {
 		table = gp->softc;
 		/*
 		 * g_part_wither() clears softc before withering a geom,
 		 * and such a geom can presumably still be on the class
 		 * list; skip it rather than dereference a NULL table.
 		 */
 		if (table == NULL || table->gpt_scheme != scheme)
 			continue;
 
 		/* Sum up all access counts held on or by this geom. */
 		acc = 0;
 		LIST_FOREACH(pp, &gp->provider, provider)
 			acc += pp->acr + pp->acw + pp->ace;
 		LIST_FOREACH(cp, &gp->consumer, consumer)
 			acc += cp->acr + cp->acw + cp->ace;
 
 		if (!acc)
 			g_part_wither(gp, ENOSYS);
 		else
 			error = EBUSY;
 	}
 
 	if (!error)
 		TAILQ_REMOVE(&g_part_schemes, scheme, scheme_list);
 
 	*xchg = error;
 }
 
 /*
  * Module event handler shared by the partitioning scheme modules.
  *
  * MOD_LOAD registers `scheme' at the tail of the scheme list and asks
  * the class to retaste; a scheme that is already on the list is reported
  * on the console and left alone.  MOD_UNLOAD runs g_part_unload_event()
  * and returns either the error from scheduling it or the result it
  * stored.  Any other event type yields EOPNOTSUPP.
  */
 int
 g_part_modevent(module_t mod, int type, struct g_part_scheme *scheme)
 {
 	struct g_part_scheme *cur;
 	uintptr_t xchg;
 	int rc;
 
 	rc = 0;
 	switch (type) {
 	case MOD_LOAD:
 		/* Is this scheme on the list already? */
 		TAILQ_FOREACH(cur, &g_part_schemes, scheme_list) {
 			if (cur == scheme)
 				break;
 		}
 		if (cur != NULL) {
 			printf("GEOM_PART: scheme %s is already "
 			    "registered!\n", scheme->name);
 			break;
 		}
 		TAILQ_INSERT_TAIL(&g_part_schemes, scheme, scheme_list);
 		g_retaste(&g_part_class);
 		break;
 	case MOD_UNLOAD:
 		/* The scheme pointer goes in, the event's result comes out. */
 		xchg = (uintptr_t)scheme;
 		rc = g_waitfor_event(g_part_unload_event, &xchg, M_WAITOK,
 		    NULL);
 		if (rc == 0)
 			rc = xchg;
 		break;
 	default:
 		rc = EOPNOTSUPP;
 		break;
 	}
 
 	return (rc);
 }
Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c	(revision 308154)
+++ head/sys/kern/vfs_bio.c	(revision 308155)
@@ -1,4902 +1,4933 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
 #include "opt_compat.h"
 #include "opt_swap.h"
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 /*
  * Buffer operations vector that maps each bop_* hook to the buf*()
  * routine of the matching name.
  */
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
 struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_invalidate(struct buf *bp);
 static void vfs_vmio_truncate(struct buf *bp, int npages);
 static void vfs_vmio_extend(struct buf *bp, int npages, int size);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int buf_flush(struct vnode *vp, int);
 static int buf_recycle(bool);
 static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int);
 static void buf_release(void *, void **, int);
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 #endif
 
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 static long bufspace;
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
 #else
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Physical memory used for buffers");
 #endif
 static long bufkvaspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
 static int bufdefragcnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer acquisition");
 static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static int numbufallocfails;
 SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
     "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static long notbufdflushes;
 SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign rbreqlock;
 
 /*
  * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
  */
 static struct rwlock_padalign nblock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request/wakeup point for the bufspace daemon.
  */
 static int bufspace_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuffers.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad.  it would be better to split the buffer
  * for input in the case of buffers partially already in memory,
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
  * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static volatile int needsbuffer;
 
 /*
  * Synchronization for bwillwrite() waiters.
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.  The values below are stored in
  * b_qindex; QUEUE_CLEAN is the index of the first clean queue.
  */
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
 
 /* Maximum number of clean buffer queues. */
 #define	CLEAN_QUEUES	16
 
 /* Configured number of clean queues. */
 static int clean_queues;
 
 /* Maximum number of buffer queues. */
 #define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
  * Lock for each bufqueue
  */
 static struct mtx_padalign bqlocks[BUFFER_QUEUES];
 
 /*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	mtx_lock(&rbreqlock);
 	if (arg1 == &hirunningspace) {
 		if (value < lorunningspace)
 			error = EINVAL;
 		else
 			hirunningspace = value;
 	} else {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		if (value > hirunningspace)
 			error = EINVAL;
 		else
 			lorunningspace = value;
 	}
 	mtx_unlock(&rbreqlock);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 /*
  * Sysctl handler that reports the long at arg1.  When the caller's buffer
  * (req->oldlen) is smaller than a long and the value fits in an int, the
  * value is reported as an int instead.
  */
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
 
 	/* No narrowing is needed: hand the long out directly. */
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, arg1, arg2, req));
 
 	lvalue = *(long *)arg1;
 	if (lvalue <= INT_MAX) {
 		ivalue = lvalue;
 		return (sysctl_handle_int(oidp, &ivalue, 0, req));
 	}
 	/* On overflow, still write out a long to trigger ENOMEM. */
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
 static int
 bqcleanq(void)
 {
 	static int nextq;
 
 	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
 }
 
 /*
  * Return non-zero when qindex lies in
  * [QUEUE_CLEAN, QUEUE_CLEAN + CLEAN_QUEUES), i.e. within the maximum
  * range of clean queue indices.
  */
 static int
 bqisclean(int qindex)
 {
 
 	if (qindex < QUEUE_CLEAN)
 		return (0);
 	return (qindex < QUEUE_CLEAN + CLEAN_QUEUES);
 }
 
 /*
  *	bqlock:
  *
  *	Map a queue index to its entry in bqlocks[], returned as a plain
  *	struct mtx pointer.  No range check is made on qindex.
  */
 static inline struct mtx *
 bqlock(int qindex)
 {
 	struct mtx_padalign *slot;
 
 	slot = &bqlocks[qindex];
 	return ((struct mtx *)slot);
 }
 
 /*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.  Does nothing beyond taking and
  *	dropping bdirtylock when bdirtywait is not set.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait) {
 		/* Clear the flag, then wake the channel, all under the lock. */
 		bdirtywait = 0;
 		wakeup(&bdirtywait);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
 bdirtysub(void)
 {
 
 	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
 	    (lodirtybuffers + hidirtybuffers) / 2)
 		bdirtywakeup();
 }
 
 /*
  *	bdirtyadd:
  *
  *	Increment the numdirtybuffers count by one and wakeup the buf 
  *	daemon if needed.
  */
 static void
 bdirtyadd(void)
 {
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
 	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
 	    (lodirtybuffers + hidirtybuffers) / 2)
 		bd_wakeup();
 }
 
 /*
  *	bufspace_wakeup:
  *
  *	Called when buffer space is potentially available for recovery.
  *	getnewbuf() will block on this flag when it is unable to free
  *	sufficient buffer space.  Buffer space becomes recoverable when
  *	bp's get placed back in the queues.
  */
 static void
 bufspace_wakeup(void)
 {
 
 	/*
 	 * If someone is waiting for bufspace, wake them up.
 	 *
 	 * Since needsbuffer is set prior to doing an additional queue
 	 * scan it is safe to check for the flag prior to acquiring the
 	 * lock.  The thread that is preparing to scan again before
 	 * blocking would discover the buf we released.
 	 */
 	if (needsbuffer) {
 		rw_rlock(&nblock);
 		/* Only the caller that flips the flag 1 -> 0 posts the wakeup. */
 		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
 			wakeup(__DEVOLATILE(void *, &needsbuffer));
 		rw_runlock(&nblock);
 	}
 }
 
 /*
  *	bufspace_daemonwakeup:
  *
  *	Wakeup the daemon responsible for freeing clean bufs.
  *	bufspace_request is both the sleep channel and the "wakeup already
  *	posted" flag: the wakeup is issued only when it is still zero.
  */
 static void
 bufspace_daemonwakeup(void)
 {
 	rw_rlock(&nblock);
 	if (bufspace_request == 0) {
 		bufspace_request = 1;
 		wakeup(&bufspace_request);
 	}
 	rw_runlock(&nblock);
 }
 
 /*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bufspace, -diff);
 		bufspace_wakeup();
 	} else {
 		space = atomic_fetchadd_long(&bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bufspacethresh && space + diff >= bufspacethresh)
 			bufspace_daemonwakeup();
 	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	bufspace_reserve:
  *
  *	Reserve bufspace before calling allocbuf().  Metadata is checked
  *	against maxbufspace, data against hibufspace.  Returns ENOSPC when
  *	adding size would exceed the limit, 0 once the reservation has been
  *	added to bufspace.
  */
 static int
 bufspace_reserve(int size, bool metadata)
 {
 	long limit;
 	long space;
 
 	limit = metadata ? maxbufspace : hibufspace;
 
 	/* Retry until the compare-and-set of bufspace succeeds. */
 	for (;;) {
 		space = bufspace;
 		if (space + size > limit)
 			return (ENOSPC);
 		if (atomic_cmpset_long(&bufspace, space, space + size) != 0)
 			break;
 	}
 
 	/* Wake up the daemon on the transition. */
 	if (space < bufspacethresh && space + size >= bufspacethresh)
 		bufspace_daemonwakeup();
 
 	return (0);
 }
 
 /*
  *	bufspace_release:
  *
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  *	Subtracts size from bufspace and then notifies any bufspace waiters
  *	through bufspace_wakeup().
  */
 static void
 bufspace_release(int size)
 {
 	atomic_subtract_long(&bufspace, size);
 	bufspace_wakeup();
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
  *	supplied.  needsbuffer must be set in a safe fashion prior to
  *	polling for space.  The operation must be re-tried on return.
  *
  *	Returns immediately when GB_NOWAIT_BD is set in gbflags.  slpflag
  *	and slptimeo are passed through to rw_sleep(); a sleep error ends
  *	the wait.
  */
 static void
 bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
 
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	rw_wlock(&nblock);
 	while (needsbuffer != 0) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			rw_wunlock(&nblock);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			/*
 			 * Mask applied after the flush: it clears
 			 * TDP_BUFNEED and clears TDP_NORUNNINGBUF unless
 			 * the thread already had it set on entry.
 			 */
 			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 			    (td->td_pflags & TDP_NORUNNINGBUF);
 
 			/*
 			 * Play bufdaemon.  The getnewbuf() function
 			 * may be called while the thread owns lock
 			 * for another dirty buffer for the same
 			 * vnode, which makes it impossible to use
 			 * VOP_FSYNC() there, due to the buffer lock
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			rw_wlock(&nblock);
 			/* A non-zero flush result re-checks without sleeping. */
 			if (fl != 0)
 				continue;
 			if (needsbuffer == 0)
 				break;
 		}
 		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
 	rw_wunlock(&nblock);
 }
 
 
 /*
  *	bufspace_daemon:
  *
  *	buffer space management daemon.  Tries to maintain some marginal
  *	amount of free buffer space so that requesting processes neither
  *	block nor work to reclaim buffers.  Never returns.
  */
 static void
 bufspace_daemon(void)
 {
 	for (;;) {
 		kproc_suspend_check(bufspacedaemonproc);
 
 		/*
 		 * Free buffers from the clean queue until we meet our
 		 * targets.
 		 *
 		 * Theory of operation:  The buffer cache is most efficient
 		 * when some free buffer headers and space are always
 		 * available to getnewbuf().  This daemon attempts to prevent
 		 * the excessive blocking and synchronization associated
 		 * with shortfall.  It goes through three phases according
 		 * to demand:
 		 *
 		 * 1)	The daemon wakes up voluntarily once per-second
 		 *	during idle periods when the counters are below
 		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
 		 *
 		 * 2)	The daemon wakes up as we cross the thresholds
 		 *	ahead of any potential blocking.  This may bounce
 		 *	slightly according to the rate of consumption and
 		 *	release.
 		 *
 		 * 3)	The daemon and consumers are starved for working
 		 *	clean buffers.  This is the 'bufspace' sleep below
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
 		while (bufspace > lobufspace ||
 		    numfreebuffers < hifreebuffers) {
 			if (buf_recycle(false) != 0) {
 				/*
 				 * Set needsbuffer before the second attempt
 				 * so a release in between is noticed either
 				 * by the retry or by the flag check below.
 				 */
 				atomic_set_int(&needsbuffer, 1);
 				if (buf_recycle(false) != 0) {
 					rw_wlock(&nblock);
 					/* PDROP: the sleep releases nblock. */
 					if (needsbuffer)
 						rw_sleep(__DEVOLATILE(void *,
 						    &needsbuffer), &nblock,
 						    PRIBIO|PDROP, "bufspace",
 						    hz/10);
 					else
 						rw_wunlock(&nblock);
 				}
 			}
 			maybe_yield();
 		}
 
 		/*
 		 * Re-check our limits under the exclusive nblock.
 		 */
 		rw_wlock(&nblock);
 		if (bufspace < bufspacethresh &&
 		    numfreebuffers > lofreebuffers) {
 			/* Re-arm the request flag before sleeping on it. */
 			bufspace_request = 0;
 			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
 			    "-", hz);
 		} else
 			rw_wunlock(&nblock);
 	}
 }
 
 /*
  * Kernel process descriptor for the bufspace daemon: process name, entry
  * point, and where to store the resulting proc pointer.  The SYSINIT
  * below passes it to kproc_start().
  */
 static struct kproc_desc bufspace_kp = {
 	"bufspacedaemon",
 	bufspace_daemon,
 	&bufspacedaemonproc
 };
 SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
     &bufspace_kp);
 
 /*
  *	bufmallocadjust:
  *
  *	Account in bufmallocspace for resizing a malloc managed (B_MALLOC)
  *	buffer to bufsize and record the new size in b_bufsize.  No wakeup
  *	is issued here.
  */
 static void
 bufmallocadjust(struct buf *bp, int bufsize)
 {
 	int delta;
 
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	delta = bufsize - bp->b_bufsize;
 	if (delta >= 0)
 		atomic_add_long(&bufmallocspace, delta);
 	else
 		atomic_subtract_long(&bufmallocspace, -delta);
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up processes that are waiting on asynchronous writes to fall
  *	below lorunningspace.  The runningbufreq flag is cleared and the
  *	wakeup posted under rbreqlock, and only when the flag was set.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq) {
 		runningbufreq = 0;
 		wakeup(&runningbufreq);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Subtract the buffer's b_runningbufspace from runningbufspace, zero
  *	it in the buffer, and wake waiters when the total crosses
  *	lorunningspace.  Does nothing if the buffer holds no running space.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long before, released;
 
 	released = bp->b_runningbufspace;
 	if (released == 0)
 		return;
 	before = atomic_fetchadd_long(&runningbufspace, -released);
 	KASSERT(before >= released, ("runningbufspace underflow %ld %ld",
 	    before, released));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Take the lock and issue the wakeup only when this release moved
 	 * the total from at or above lorunningspace to at or below it.
 	 */
 	if (before >= lorunningspace && before - released <= lorunningspace)
 		runningwakeup();
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
  *	running.  This routine is used in async-write situations to
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
  *	This does NOT turn an async write into a sync write.  It waits
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  *
  *	The caller sleeps for as long as runningbufspace exceeds
  *	hirunningspace.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
 		/* Mark a waiter so that runningwakeup() posts a wakeup. */
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  If the buffer has B_CACHE set
  *	and vm_page_is_valid() reports that the size bytes of page m
  *	starting at the in-page offset of foff + off are not valid, B_CACHE
  *	is cleared.  The page's object must be locked.
  */
 static __inline void
 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
     vm_offset_t size, vm_page_t m)
 {
 	int base;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	/* Nothing to clear when the buffer is not marked cached. */
 	if ((bp->b_flags & B_CACHE) == 0)
 		return;
 	base = (foff + off) & PAGE_MASK;
 	if (vm_page_is_valid(m, base, size) == 0)
 		bp->b_flags &= ~B_CACHE;
 }
 
 /*
  * Wake up the buffer daemon if necessary.  The wakeup is posted, under
  * bdlock, only when bd_request is still zero; bd_request is then set so
  * later callers do not post it again.
  */
 static __inline void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request == 0) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
 	mtx_unlock(&bdlock);
 }
 
 /*
  * bd_speedup - speedup the buffer cache flushing code
  *
  * Sets both bd_speedupreq and bd_request under bdlock and wakes the
  * bd_request channel if either of them was still clear on entry.
  */
 void
 bd_speedup(void)
 {
 	int needwake;
 
 	mtx_lock(&bdlock);
 	/* Sample the old flag state before both flags are forced on. */
 	needwake = (bd_speedupreq == 0 || bd_request == 0);
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (needwake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 /* Lower bound applied to nswbuf; may be predefined to override it. */
 #ifndef NSWBUF_MIN
 #define	NSWBUF_MIN	16
 #endif
 
 /*
  * Denominator of the fraction of the buffer map that is given to the
  * transient bio map: 1/5 on i386, 1/10 elsewhere.
  */
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more than once.  We CANNOT write to the memory area
  * being reserved at this time.
  *
  * v is the start of the area to reserve from and physmem_est is the
  * estimated amount of physical memory in pages.  Sets nbuf (when it is
  * zero on entry), bio_transient_maxcnt (when it is zero on entry and
  * unmapped buffers are allowed), nswbuf, swbuf and buf, and returns the
  * address just past the reserved swbuf and buf header arrays.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		/* nbuf was auto-sized here, so it may be trimmed silently. */
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		/* Warn only when the administrator chose nbuf explicitly. */
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		/* Clamp so that the int-sized count cannot overflow. */
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less than NSWBUF_MIN; the default is capped at 256 but
 	 * the kern.nswbuf tunable may override it.
 	 */
 	nswbuf = min(nbuf / 4, 256);
 	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 
 	/*
 	 * Reserve space for the buffer cache buffers: nswbuf swap buffer
 	 * headers first, followed by nbuf regular buffer headers.
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /* Initialize the buffer subsystem.  Called before use of any buffers. */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
 	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
 	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
 	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
 		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 #ifdef INVARIANTS
 		bq_len[QUEUE_EMPTY]++;
 #endif
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by metadata.  hibufspace is the nominal maximum
 	 * used by most other requests.  The differential is required to 
 	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system. XXX This is less true with vmem.  We could use
 	 * PAGE_SIZE.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
 	lobufspace = (hibufspace / 20) * 19; /* 95% */
 	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
 	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently into
 	 * the kernel space.  Even though this is accounted for in the buffer
 	 * allocation, we don't want the malloced region to grow uncontrolled.
 	 * The malloc scheme improves memory utilization significantly on
 	 * average (small) directories.
 	 */
 	maxbufmallocspace = hibufspace / 20;
 
 	/*
 	 * Reduce the chance of a deadlock occurring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
 	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
 	 * buffer space assuming BKVASIZE'd buffers.
 	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 	/*
 	 * lofreebuffers should be sufficient to avoid stalling waiting on
 	 * buf headers under heavy utilization.  The bufs in per-cpu caches
 	 * are counted as free but will be unavailable to threads executing
 	 * on other cpus.
 	 *
 	 * hifreebuffers is the free target for the bufspace daemon.  This
 	 * should be set appropriately to limit work per-iteration.
 	 */
 	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
 	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 
 	/* Setup the kva and free list allocators. */
 	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
 	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
 	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
 	/*
 	 * Size the clean queue according to the amount of buffer space.
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
 	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
 
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that a buffer which is supposed to be mapped no longer points at
  * the unmapped_buf placeholder: neither b_kvabase nor b_data may equal
  * unmapped_buf, and b_data must lie outside the range
  * [unmapped_buf, unmapped_buf + MAXPHYS).
  */
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT(unmapped_buf != bp->b_kvabase,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(unmapped_buf != bp->b_data,
 	    ("mapped buf: b_data was not updated %p", bp));
 	KASSERT(!(bp->b_data >= unmapped_buf &&
 	    bp->b_data < unmapped_buf + MAXPHYS),
 	    ("b_data + b_offset unmapped %p", bp));
 }
 
 /*
  * Assert that an unmapped buffer still has b_data pointing at the
  * unmapped_buf placeholder.
  */
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT(unmapped_buf == bp->b_data,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 /*
  * Return 1 if the buffer counts as busy, 0 otherwise.  A buffer is busy
  * when it is not B_INVAL and is locked, or when it is a delayed write
  * (B_DELWRI) that has not been marked B_INVAL.
  */
 static int
 isbufbusy(struct buf *bp)
 {
 
 	/* Valid buffer that is currently locked. */
 	if ((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp))
 		return (1);
 
 	/* Delayed write that has not been invalidated. */
 	if ((bp->b_flags & B_DELWRI) != 0 && (bp->b_flags & B_INVAL) == 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  *
  * Syncs the filesystems, then keeps re-syncing while busy buffers (see
  * isbufbusy()) remain.  Afterwards the remaining busy buffers are counted:
  * if there are none the filesystems are unmounted (unless panicstr is
  * set), otherwise the number of abandoned buffers is reported.  Swap is
  * turned off in either case.
  *
  * show_busybufs > 0 prints one line per remaining busy buffer;
  * show_busybufs > 1 additionally prints the buffer's vnode.
  */
 void
 bufshutdown(int show_busybufs)
 {
 	static int first_buf_printf = 1;
 	struct buf *bp;
 	int iter, nbusy, pbusy;
 #ifndef PREEMPTION
 	int subiter;
 #endif
 
 	/*
 	 * Sync filesystems for shutdown
 	 */
 	wdog_kern_pat(WD_LASTVAL);
 	sys_sync(curthread, NULL);
 
 	/*
 	 * With soft updates, some buffers that are
 	 * written will be remarked as dirty until other
 	 * buffers are written.
 	 */
 	for (iter = pbusy = 0; iter < 20; iter++) {
 		/* Walk the whole buf[] array backwards counting busy bufs. */
 		nbusy = 0;
 		for (bp = &buf[nbuf]; --bp >= buf; )
 			if (isbufbusy(bp))
 				nbusy++;
 		if (nbusy == 0) {
 			if (first_buf_printf)
 				printf("All buffers synced.");
 			break;
 		}
 		if (first_buf_printf) {
 			printf("Syncing disks, buffers remaining... ");
 			first_buf_printf = 0;
 		}
 		printf("%d ", nbusy);
 		/*
 		 * Progress was made since the previous pass: restart the
 		 * retry counter so the loop only gives up after a run of
 		 * passes without improvement.
 		 */
 		if (nbusy < pbusy)
 			iter = 0;
 		pbusy = nbusy;
 
 		wdog_kern_pat(WD_LASTVAL);
 		sys_sync(curthread, NULL);
 
 #ifdef PREEMPTION
 		/*
 		 * Drop Giant and spin for a while to allow
 		 * interrupt threads to run.  The delay scales with iter.
 		 */
 		DROP_GIANT();
 		DELAY(50000 * iter);
 		PICKUP_GIANT();
 #else
 		/*
 		 * Drop Giant and context switch several times to
 		 * allow interrupt threads to run.  The number of
 		 * switches scales with iter.
 		 */
 		DROP_GIANT();
 		for (subiter = 0; subiter < 50 * iter; subiter++) {
 			thread_lock(curthread);
 			mi_switch(SW_VOL, NULL);
 			thread_unlock(curthread);
 			DELAY(1000);
 		}
 		PICKUP_GIANT();
 #endif
 	}
 	printf("\n");
 	/*
 	 * Count only busy local buffers to prevent forcing
 	 * a fsck if we're just a client of a wedged NFS server
 	 */
 	nbusy = 0;
 	for (bp = &buf[nbuf]; --bp >= buf; ) {
 		if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
 			if (bp->b_dev == NULL) {
 				TAILQ_REMOVE(&mountlist,
 				    bp->b_vp->v_mount, mnt_list);
 				continue;
 			}
 #endif
 			nbusy++;
 			if (show_busybufs > 0) {
 				printf(
 	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
 				    nbusy, bp, bp->b_vp, bp->b_flags,
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 				BUF_LOCKPRINTINFO(bp);
 				if (show_busybufs > 1)
 					vn_printf(bp->b_vp,
 					    "vnode content: ");
 			}
 		}
 	}
 	if (nbusy) {
 		/*
 		 * Failed to sync all blocks. Indicate this and don't
 		 * unmount filesystems (thus forcing an fsck on reboot).
 		 */
 		printf("Giving up on %d buffers\n", nbusy);
 		DELAY(5000000);	/* 5 seconds */
 	} else {
 		/* Only announce completion if a sync message was printed. */
 		if (!first_buf_printf)
 			printf("Final sync complete\n");
 		/*
 		 * Unmount filesystems
 		 */
 		if (panicstr == NULL)
 			vfs_unmountall();
 	}
 	swapoff_all();
 	DELAY(100000);		/* wait for console output to finish */
 }
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the appropriate free list.
  *
  *	qindex names the target queue.  QUEUE_CLEAN is treated as a request
  *	for "some clean queue" and is resolved to a concrete queue below.
  *	The buffer must be exclusively locked unless it is going onto
  *	QUEUE_EMPTY.  Panics if the buffer is still on a queue once any
  *	delayed removal has been processed.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct mtx *olock, *nlock;
 
 	if (qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 
 	/*
 	 * Stick to the same clean queue for the lifetime of the buf to
 	 * limit locking below.  Otherwise pick one sequentially.
 	 */
 	if (qindex == QUEUE_CLEAN) {
 		if (bqisclean(bp->b_qindex))
 			qindex = bp->b_qindex;
 		else
 			qindex = bqcleanq();
 	}
 
 	/*
 	 * Handle delayed bremfree() processing.  The buffer is removed
 	 * from its old queue under that queue's lock; the old lock is
 	 * swapped for the new queue's lock only when they differ, so
 	 * nlock is held on exit from this block either way.
 	 */
 	nlock = bqlock(qindex);
 	if (bp->b_flags & B_REMFREE) {
 		olock = bqlock(bp->b_qindex);
 		mtx_lock(olock);
 		bremfreel(bp);
 		if (olock != nlock) {
 			mtx_unlock(olock);
 			mtx_lock(nlock);
 		}
 	} else
 		mtx_lock(nlock);
 
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("binsfree: free buffer onto another queue???");
 
 	/* B_AGE buffers go to the head of the queue, all others to the tail. */
 	bp->b_qindex = qindex;
 	if (bp->b_flags & B_AGE)
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 	else
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	bq_len[bp->b_qindex]++;
 #endif
 	mtx_unlock(nlock);
 }
 
 /*
  * buf_free:
  *
  *	Hand a buffer with no valid contents back to the buf zone.  Any
  *	pending delayed queue removal is completed, credentials and
  *	dependencies are dropped, the kva is freed and the buffer is
  *	unlocked before it is returned to UMA.  Panics if a background
  *	write is still in progress on the buffer.
  */
 static void
 buf_free(struct buf *bp)
 {
 
 	if ((bp->b_flags & B_REMFREE) != 0)
 		bremfreef(bp);
 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0)
 		panic("losing buffer 1");
 
 	/* Release the read and write credentials, if any are held. */
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 
 	/* Account for the newly free buffer and notify waiters. */
 	atomic_add_int(&numfreebuffers, 1);
 	bufspace_wakeup();
 }
 
 /*
  * buf_import:
  *
  *	UMA import callback: move up to cnt bufs from the head of the
  *	QUEUE_EMPTY list into store[] and return how many were taken.
  *	The system still expects a static array of bufs and much of the
  *	synchronization around bufs assumes type stable storage, so UMA
  *	serves only as a per-cpu cache of bufs that otherwise live on a
  *	global list.
  */
 static int
 buf_import(void *arg, void **store, int cnt, int flags)
 {
 	struct buf *bp;
 	int taken;
 
 	taken = 0;
 	mtx_lock(&bqlocks[QUEUE_EMPTY]);
 	while (taken < cnt &&
 	    (bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) != NULL) {
 		bremfreel(bp);
 		store[taken] = bp;
 		taken++;
 	}
 	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
 
 	return (taken);
 }
 
 /*
  * buf_release:
  *
  *	UMA release callback: put each of the cnt bufs in store[] back on
  *	the QUEUE_EMPTY buffer queue.
  */
 static void
 buf_release(void *arg, void **store, int cnt)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < cnt) {
 		binsfree(store[idx], QUEUE_EMPTY);
 		idx++;
 	}
 }
 
 /*
  * buf_alloc:
  *
  *	Allocate an empty buffer header from the buf zone without
  *	sleeping.  Returns the exclusively locked, reinitialized buffer,
  *	or NULL if the zone had nothing to give (in which case the
  *	bufspace daemon is woken and the failure is counted).
  */
 static struct buf *
 buf_alloc(void)
 {
 	struct buf *bp;
 
 	bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
 		bufspace_daemonwakeup();
 		atomic_add_int(&numbufallocfails, 1);
 		return (NULL);
 	}
 
 	/*
 	 * Wake the bufspace daemon when the free count, as sampled by the
 	 * decrement, equals lofreebuffers.
 	 */
 	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
 		bufspace_daemonwakeup();
 
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
 
 	/* A buf coming off the free cache must carry no leftover state. */
 	KASSERT(bp->b_vp == NULL,
 	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
 	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
 	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 	KASSERT(bp->b_npages == 0,
 	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
 	/* Flag words. */
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	bp->b_vflags = 0;
 
 	/* Identity and I/O state. */
 	bp->b_vp = NULL;
 	bp->b_lblkno = 0;
 	bp->b_blkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyend = 0;
 	bp->b_dirtyoff = 0;
 	bp->b_bufobj = NULL;
 
 	/* No kva yet: point both addresses at the unmapped placeholder. */
 	bp->b_kvabase = unmapped_buf;
 	bp->b_data = unmapped_buf;
 
 	/* Filesystem-private slots and dependency list. */
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 	LIST_INIT(&bp->b_dep);
 
 	return (bp);
 }
 
 /*
  *	buf_qrecycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  *
  *	Returns 0 once one buffer has been invalidated and released, or
  *	ENOBUFS if the scan reached the end of the queue without finding
  *	a usable buffer.
  */
 static int
 buf_qrecycle(int qindex, bool kva)
 {
 	struct buf *bp, *nbp;
 
 	if (kva)
 		atomic_add_int(&bufdefragcnt, 1);
 	nbp = NULL;
 	mtx_lock(&bqlocks[qindex]);
 	nbp = TAILQ_FIRST(&bufqueues[qindex]);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * release the bqlock).
 		 */
 		nbp = TAILQ_NEXT(bp, b_freelist);
 
 		/*
 		 * If we are defragging then we need a buffer with
 		 * some kva to reclaim.
 		 */
 		if (kva && bp->b_kvasize == 0)
 			continue;
 
 		/* Skip buffers whose lock cannot be taken without waiting. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == qindex,
 		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 		bremfreel(bp);
 		mtx_unlock(&bqlocks[qindex]);
 
 		/*
 		 * Requeue the background write buffer with error and
 		 * restart the scan.  The queue lock is retaken and the
 		 * scan begins again from the head of the queue.
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
 			mtx_lock(&bqlocks[qindex]);
 			nbp = TAILQ_FIRST(&bufqueues[qindex]);
 			continue;
 		}
 		/* Invalidate and release the victim; the queue lock is not held. */
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
 	mtx_unlock(&bqlocks[qindex]);
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_recycle:
  *
  *	Walk the clean queues round-robin, starting at the queue chosen by
  *	bqcleanq(), until buf_qrecycle() succeeds on one of them or every
  *	queue has been tried.  Returns 0 on success, ENOBUFS otherwise.
  */
 static int
 buf_recycle(bool kva)
 {
 	int cur, start;
 
 	start = bqcleanq();
 	cur = start;
 	for (;;) {
 		if (buf_qrecycle(cur, kva) == 0)
 			return (0);
 
 		/* Advance, wrapping past the last active clean queue. */
 		cur++;
 		if (cur == QUEUE_CLEAN + clean_queues)
 			cur = QUEUE_CLEAN;
 		if (cur == start)
 			break;
 	}
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_scan:
  *
  *	Scan the clean queues looking for a buffer to recycle.  On failure
  *	needsbuffer has been set, so the caller may optionally
  *	bufspace_wait() in a race-free fashion.  Returns 0 on success or
  *	the error from buf_recycle().
  */
 static int
 buf_scan(bool defrag)
 {
 	int error;
 
 	error = buf_recycle(defrag);
 	if (error != 0) {
 		/*
 		 * To avoid heavy synchronization and wakeup races, set
 		 * needsbuffer and poll once more before failing.  This
 		 * ensures that no frees can be missed between an
 		 * unsuccessful poll and going to sleep in a synchronized
 		 * fashion.
 		 */
 		atomic_set_int(&needsbuffer, 1);
 		bufspace_daemonwakeup();
 		error = buf_recycle(defrag);
 		if (error != 0)
 			return (error);
 	}
 
 	atomic_add_int(&getnewbufrestarts, 1);
 	return (0);
 }
 
 /*
  *	bremfree:
  *
  *	Flag the buffer (B_REMFREE) for delayed removal from its free
  *	list.  The buffer must be exclusively locked, must be on a queue,
  *	and must not already be flagged.
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & B_REMFREE),
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(QUEUE_NONE != bp->b_qindex,
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
 }
 
 /*
  *	bremfreef:
  *
  *	Remove the buffer from its free list right away, taking and
  *	dropping the lock of the queue it is on.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	struct mtx *lk;
 
 	lk = bqlock(bp->b_qindex);
 
 	mtx_lock(lk);
 	bremfreel(bp);
 	mtx_unlock(lk);
 }
 
 /*
  *	bremfreel:
  *
  *	Unlink a buffer from the free list it is on.  The caller must hold
  *	the lock for that queue.  On return the buffer is on QUEUE_NONE
  *	and B_REMFREE is clear.
  */
 static void
 bremfreel(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 
 	qindex = bp->b_qindex;
 	KASSERT(qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
 	/* Buffers on the empty queue are not required to be locked. */
 	if (qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 	mtx_assert(bqlock(qindex), MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	KASSERT(bq_len[qindex] >= 1, ("queue %d underflow",
 	    qindex));
 	bq_len[qindex]--;
 #endif
 	bp->b_flags &= ~B_REMFREE;
 	bp->b_qindex = QUEUE_NONE;
 }
 
 /*
  *	bufkva_free:
  *
  *	Give the buffer's kva allocation, if it has one, back to
  *	buffer_arena and reset the buffer to the unmapped placeholder.
  *	Does nothing for a buffer with b_kvasize == 0.
  */
 static void
 bufkva_free(struct buf *bp)
 {
 
 #ifdef INVARIANTS
 	if (bp->b_kvasize == 0) {
 		KASSERT(bp->b_kvabase == unmapped_buf &&
 		    bp->b_data == unmapped_buf,
 		    ("Leaked KVA space on %p", bp));
 	} else if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 	} else {
 		BUF_CHECK_UNMAPPED(bp);
 	}
 #endif
 	/* No kva owned: nothing to release. */
 	if (bp->b_kvasize == 0)
 		return;
 
 	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
 
 	/* Update the kva accounting and statistics. */
 	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
 	atomic_add_int(&buffreekvacnt, 1);
 
 	bp->b_kvabase = unmapped_buf;
 	bp->b_data = unmapped_buf;
 	bp->b_kvasize = 0;
 }
 
 /*
  *	bufkva_alloc:
  *
  *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
  */
 static int
 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 	int error;
 
 	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
 	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
 
 	bufkva_free(bp);
 
 	addr = 0;
 	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
 	if (error != 0) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		return (error);
 	}
 	bp->b_kvabase = (caddr_t)addr;
 	bp->b_kvasize = maxsize;
 	atomic_add_long(&bufkvaspace, bp->b_kvasize);
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 	}
 	return (0);
 }
 
 /*
  *	bufkva_reclaim:
  *
  *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
  *	callback that fires to avoid returning failure.  It makes at most
  *	five buf_scan() passes and stops at the first one that fails.
  */
 static void
 bufkva_reclaim(vmem_t *vmem, int flags)
 {
 	int pass;
 
 	for (pass = 0; pass < 5 && buf_scan(true) == 0; pass++)
 		continue;
 }
 
 
 /*
  * Start asynchronous reads for up to cnt read-ahead blocks described by
  * the parallel arrays rablkno[] and rabsize[].  Blocks already in memory
  * are skipped, and a buffer that comes back with B_CACHE set is simply
  * released.  BIO_ERROR and B_INVAL must be cleared before the I/O is
  * initiated.
  */
 void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
     int cnt, struct ucred * cred)
 {
 	struct buf *rabp;
 	int idx;
 
 	for (idx = 0; idx < cnt; idx++) {
 		if (inmem(vp, rablkno[idx]))
 			continue;
 		rabp = getblk(vp, rablkno[idx], rabsize[idx], 0, 0, 0);
 
 		/* Already valid: no I/O needed. */
 		if ((rabp->b_flags & B_CACHE) != 0) {
 			brelse(rabp);
 			continue;
 		}
 
 		/* Charge the read to the current thread unless it is idle. */
 		if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(curproc);
 				racct_add_buf(curproc, rabp, 0);
 				PROC_UNLOCK(curproc);
 			}
 #endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 		}
 
 		rabp->b_flags |= B_ASYNC;
 		rabp->b_flags &= ~B_INVAL;
 		rabp->b_ioflags &= ~BIO_ERROR;
 		rabp->b_iocmd = BIO_READ;
 		if (rabp->b_rcred == NOCRED && cred != NOCRED)
 			rabp->b_rcred = crhold(cred);
 		vfs_busy_pages(rabp, 0);
 		BUF_KERNPROC(rabp);
 		rabp->b_iooffset = dbtob(rabp->b_blkno);
 		bstrategy(rabp);
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Fetch a buffer holding the requested block, consulting the cache first
  * and issuing a read when the buffer is not B_CACHE.  BIO_ERROR and
  * B_INVAL must be cleared before the I/O is initiated; see getblk().
  * Asynchronous read-ahead is started for the blocks in rablkno[].
  *
  * Returns 0 on success, EBUSY if getblk() returned no buffer, or the
  * bufwait() error.  *bpp is always NULL when an error is returned.
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	int error, started;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
 	 */
 	bp = getblk(vp, blkno, size, 0, 0, flags);
 	*bpp = bp;
 	if (bp == NULL)
 		return (EBUSY);
 
 	/* Not valid in the cache: start a read of the primary block. */
 	started = 0;
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(curproc);
 				racct_add_buf(curproc, bp, 0);
 				PROC_UNLOCK(curproc);
 			}
 #endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		started = 1;
 	}
 
 	breada(vp, rablkno, rabsize, cnt, cred);
 
 	/* Nothing was issued for the primary block, so nothing to wait on. */
 	if (!started)
 		return (0);
 
 	error = bufwait(bp);
 	if (error != 0) {
 		brelse(bp);
 		*bpp = NULL;
 	}
 	return (error);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  *
  * Returns ENXIO (after releasing the buffer) if the bufobj is marked
  * BO_DEAD, 0 for a B_INVAL buffer or an asynchronous write, and the
  * bufwait() result for a synchronous write.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	/* Dead bufobj: invalidate and drop the buffer instead of writing. */
 	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
 		bp->b_flags |= B_INVAL | B_RELBUF;
 		bp->b_flags &= ~B_CACHE;
 		brelse(bp);
 		return (ENXIO);
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		barrierwrites++;
 
 	/* Snapshot the flags; B_ASYNC is tested from this copy below. */
 	oldflags = bp->b_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	/* Record whether the vnode, if any, has VV_MD set. */
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes.  space receives the value
 	 * returned by atomic_fetchadd_long() on runningbufspace.
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 	/* Charge the write to the current thread unless it is idle. */
 	if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, bp, 1);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		curthread->td_ru.ru_oublock++;
 	}
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
+	buf_track(bp, __func__);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		/* Synchronous write: wait, release, and report the result. */
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 /*
  * Flush dirty buffers of a bufobj when it holds too many of them.
  *
  * If the dirty count exceeds dirtybufthresh by more than 10, a
  * non-waiting VOP_FSYNC() is issued on bp's vnode.  If it merely exceeds
  * dirtybufthresh, the dirty list is searched for one other buffer that
  * can be locked without waiting and has no dependencies, and that buffer
  * is written asynchronously.  Panics if the search finds bp itself.
  */
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			/* Skip background-write and unlockable buffers. */
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				/* Retake the bo lock before resuming the scan. */
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			/* The bo lock was already dropped above. */
 			break;
 		}
 		/*
 		 * nbp is NULL only when the list was exhausted, in which
 		 * case the bo lock is still held and must be dropped here.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  *
  * The buffer must be held, must have a bufobj, and must not carry
  * B_BARRIER.  It is released (brelse() or bqrelse()) before returning.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 *
 	 * TDP_INBDFLUSH is set around the flush so that a nested call
 	 * takes the else branch and only bumps recursiveflushes.
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
+	buf_track(bp, __func__);
+
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty
 	 * pages.
 	 *
 	 * Mark the buffer pages as clean.  We need to do this here to
 	 * satisfy the vnode_pager and the pageout daemon, so that it
 	 * thinks that the pages have been "cleaned".  Note that since
 	 * the pages are in a delayed write buffer -- the VFS layer
 	 * "will" see that the pages get written out on the next sync,
 	 * or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Convert a buffer into a delayed write request: B_RELBUF is cleared,
  *	the I/O command becomes BIO_WRITE and, if it was not already set,
  *	B_DELWRI is set.  In that last case the buffer is reassigned to
  *	itself so that the dirty/clean lists are updated, and the dirty
  *	buffer count is adjusted.
  *
  *	bdirty() is kinda like bdwrite(), but unlike bwrite/bdwrite it
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, the numfreebuffers count is not
  *	updated.  The buffer must be on QUEUE_NONE (or be flagged
  *	B_REMFREE) and must be held.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_REMFREE) != 0 ||
 	    bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	bp->b_flags &= ~B_RELBUF;
 	bp->b_iocmd = BIO_WRITE;
 
 	/* Already a delayed write: the lists and counters are up to date. */
 	if ((bp->b_flags & B_DELWRI) != 0)
 		return;
 
 	/* XXX B_DONE is deliberately not set together with B_DELWRI. */
 	bp->b_flags |= B_DELWRI;
 	reassignbuf(bp);
 	bdirtyadd();
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI on a buffer, reassigning it and adjusting the dirty
  *	buffer count if the flag was set, and always clear B_DEFERRED.
  *
  *	Since the buffer is not on a queue, the numfreebuffers count is not
  *	updated.  The buffer must be on QUEUE_NONE (or be flagged
  *	B_REMFREE) and must be held.
  */
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_REMFREE) != 0 ||
 	    bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	if ((bp->b_flags & B_DELWRI) != 0) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub();
 	}
 
 	/* The buffer is being written now, so it is no longer deferred. */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write: mark the buffer B_ASYNC and start output
  *	without waiting for it to complete.  The buffer is released when
  *	the output completes.
  *
  *	Handling B_INVAL buffers is left to bwrite() (or the VOP routine
  *	behind it).
  */
 void
 bawrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_ASYNC;
 
 	/* The result of an asynchronous write is intentionally ignored. */
 	(void)bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Asynchronous barrier write: mark the buffer B_ASYNC and B_BARRIER
  *	and start output without waiting for it to complete.  A write
  *	barrier is placed after this write, so this buffer and all buffers
  *	written before it are committed to the disk before any buffers
  *	written afterwards.  The buffer is released when the output
  *	completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER | B_ASYNC;
 
 	/* The result of an asynchronous write is intentionally ignored. */
 	(void)bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	Called prior to the locking of any vnodes when we are expecting to
  *	write.  Blocks while numdirtybuffers is at or above hidirtybuffers
  *	so the buffer cache is not starved by too many dirty buffers.
  *	Blocking before any vnode is locked tries to avoid a locked vnode
  *	keeping the system daemons from flushing related buffers.
  */
 void
 bwillwrite(void)
 {
 
 	/* Unlocked fast path: below the limit, nothing to wait for. */
 	if (numdirtybuffers < hidirtybuffers)
 		return;
 
 	mtx_lock(&bdirtylock);
 	while (numdirtybuffers >= hidirtybuffers) {
 		bdirtywait = 1;
 		msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 		    "flswai", 0);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  * Return 1 if the number of dirty buffers has reached hidirtybuffers,
  * 0 otherwise.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	if (numdirtybuffers >= hidirtybuffers)
 		return (1);
 	return (0);
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  */
 void
 brelse(struct buf *bp)
 {
 	int qindex;
 
 	/*
 	 * Many functions erroneously call brelse with a NULL bp under rare
 	 * error conditions. Simply return when called with a NULL bp.
 	 */
 	if (bp == NULL)
 		return;
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
 	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	/* B_MANAGED buffers are released through bqrelse() instead. */
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	/*
 	 * BV_BKGRDERR is set and BV_BKGRDINPROG is clear: drop the error
 	 * flag under the bufobj lock and mark the buffer dirty again.
 	 */
 	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		bdirty(bp);
 	}
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
 		 * pages from being scrapped.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed read I/O or we were asked to free or not
 		 * cache the buffer.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub();
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		/*
 		 * Non-VMIO buffers are resized to zero bytes and detached
 		 * from their vnode right away.
 		 */
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 *
 	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
 	 * if B_DELWRI is set.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
 	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
 	    !(bp->b_vp->v_mount != NULL &&
 	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
 		vfs_vmio_invalidate(bp);
 		allocbuf(bp, 0);
 	}
 
 	/*
 	 * Buffers that are B_INVAL or B_RELBUF, or that are B_NOREUSE
 	 * without B_DELWRI, are resized to zero bytes and lose their
 	 * vnode association; B_NOREUSE is cleared in the process.
 	 */
 	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
 		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 			
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
+	buf_track(bp, __func__);
+
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		buf_free(bp);
 		return;
 	}
 	/* buffers with junk contents */
 	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	/* Place the buffer on the queue selected above. */
 	binsfree(bp, qindex);
 
 	/*
 	 * Drop B_ASYNC, B_NOCACHE, B_AGE, B_RELBUF and B_DIRECT now that
 	 * the buffer has been queued.
 	 */
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 	/* Only a release onto the clean queue triggers bufspace_wakeup(). */
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	/*
 	 * QUEUE_NONE is what the B_MANAGED path below leaves in place, so
 	 * that path never calls bufspace_wakeup() at "out".
 	 */
 	qindex = QUEUE_NONE;
 	if (BUF_LOCKRECURSED(bp)) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
 	/*
 	 * Managed buffers are not inserted on any queue here; at most a
 	 * pending B_REMFREE is resolved through bremfreef().
 	 */
 	if (bp->b_flags & B_MANAGED) {
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		goto out;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
 	    BV_BKGRDERR)) == BV_BKGRDERR) {
 		/* BV_BKGRDERR is cleared under the bufobj lock. */
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		qindex = QUEUE_DIRTY;
 	} else {
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
 		/* Clean B_NOREUSE buffers take the full brelse() path. */
 		if ((bp->b_flags & B_NOREUSE) != 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	binsfree(bp, qindex);
 
 out:
+	buf_track(bp, __func__);
 	/* unlock */
 	BUF_UNLOCK(bp);
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
  * restore bogus pages.
  */
 static void
 vfs_vmio_iodone(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	vm_object_t obj;
 	struct vnode *vp;
 	int bogus, i, iosize;
 
 	obj = bp->b_bufobj->bo_object;
 	KASSERT(obj->paging_in_progress >= bp->b_npages,
 	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
 	    obj->paging_in_progress, bp->b_npages));
 
 	vp = bp->b_vp;
 	KASSERT(vp->v_holdcnt > 0,
 	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
 	KASSERT(vp->v_object != NULL,
 	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
 
 	bogus = 0;
 	/* Bytes remaining to account for: requested count minus residual. */
 	iosize = bp->b_bcount - bp->b_resid;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		int resid;
 
 		/*
 		 * resid is the distance from foff to the next page boundary,
 		 * capped at what is left of iosize.
 		 */
 		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 		if (resid > iosize)
 			resid = iosize;
 
 		/*
 		 * cleanup bogus pages, restoring the originals
 		 */
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			bogus = 1;
 			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 			if (m == NULL)
 				panic("biodone: page disappeared!");
 			bp->b_pages[i] = m;
 		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we
 			 * only need to do this here in the read case.
 			 */
 			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
 			    resid)) == 0, ("vfs_vmio_iodone: page %p "
 			    "has unexpected dirty bits", m));
 			vfs_page_set_valid(bp, foff, m);
 		}
 		KASSERT(OFF_TO_IDX(foff) == m->pindex,
 		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
 		    (intmax_t)foff, (uintmax_t)m->pindex));
 
 		vm_page_sunbusy(m);
 		/* Advance foff to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		iosize -= resid;
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 	/*
 	 * At least one bogus_page entry in b_pages[] was swapped for the
 	 * real page; for a mapped buffer, re-enter the page array at the
 	 * buffer's KVA.
 	 */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  * Unwire a page held by a buf and place it on the appropriate vm queue.
  */
 static void
 vfs_vmio_unwire(struct buf *bp, vm_page_t m)
 {
 	bool freed;
 
 	vm_page_lock(m);
 
 	/* Nothing more to do unless vm_page_unwire() reports true. */
 	if (!vm_page_unwire(m, PQ_NONE)) {
 		vm_page_unlock(m);
 		return;
 	}
 
 	/*
 	 * First see whether the page can simply be freed: a page with no
 	 * valid bits is freed unless it is busied, and a B_DIRECT buffer's
 	 * page is offered to vm_page_try_to_free().
 	 */
 	freed = false;
 	if (m->valid == 0) {
 		if (!vm_page_busied(m)) {
 			vm_page_free(m);
 			freed = true;
 		}
 	} else if ((bp->b_flags & B_DIRECT) != 0)
 		freed = vm_page_try_to_free(m);
 
 	/*
 	 * A surviving page is deactivated.  B_NOREUSE buffers use the
 	 * "noreuse" variant to tell the VM the page is unlikely to be
 	 * needed again; everything else keeps normal LRU ordering.
 	 */
 	if (!freed) {
 		if ((bp->b_flags & B_NOREUSE) != 0)
 			vm_page_deactivate_noreuse(m);
 		else
 			vm_page_deactivate(m);
 	}
 	vm_page_unlock(m);
 }
 
 /*
  * Perform page invalidation when a buffer is released.  The fully invalid
  * pages will be reclaimed later in vfs_vmio_truncate().
  */
 static void
 vfs_vmio_invalidate(struct buf *bp)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int i, resid, poffset, presid;
 
 	/* A mapped buffer has its pages removed from the KVA first. */
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/*
 	 * Get the base offset and length of the buffer.  Note that
 	 * in the VMIO case if the buffer block size is not
 	 * page-aligned then b_data pointer may not be page-aligned.
 	 * But our b_pages[] array *IS* page aligned.
 	 *
 	 * block sizes less than DEV_BSIZE (usually 512) are not
 	 * supported due to the page granularity bits (m->valid,
 	 * m->dirty, etc...).
 	 *
 	 * See man buf(9) for more information
 	 */
 	obj = bp->b_bufobj->bo_object;
 	resid = bp->b_bufsize;
 	poffset = bp->b_offset & PAGE_MASK;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page)
 			panic("vfs_vmio_invalidate: Unexpected bogus page.");
 		bp->b_pages[i] = NULL;
 
 		/*
 		 * presid is the part of the buffer that lies in this page:
 		 * from poffset to the end of the page, or whatever remains
 		 * of resid if that is smaller.
 		 */
 		presid = resid > (PAGE_SIZE - poffset) ?
 		    (PAGE_SIZE - poffset) : resid;
 		KASSERT(presid >= 0, ("brelse: extra page"));
 		/*
 		 * Wait for any exclusive busy holder.  The object lock is
 		 * dropped around the sleep and retaken before rechecking.
 		 */
 		while (vm_page_xbusied(m)) {
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(obj);
 			vm_page_busy_sleep(m, "mbncsh", true);
 			VM_OBJECT_WLOCK(obj);
 		}
 		/* Pages with wired mappings are left valid. */
 		if (pmap_page_wired_mappings(m) == 0)
 			vm_page_set_invalid(m, poffset, presid);
 		vfs_vmio_unwire(bp, m);
 		resid -= presid;
 		/* Only the first page can start at a non-zero offset. */
 		poffset = 0;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = 0;
 }
 
 /*
  * Page-granular truncation of an existing VMIO buffer.
  */
 static void
 vfs_vmio_truncate(struct buf *bp, int desiredpages)
 {
 	vm_object_t obj;
 	vm_page_t pg;
 	int idx;
 
 	/* Nothing to trim when the page count already matches. */
 	if (bp->b_npages == desiredpages)
 		return;
 
 	/* Unmap the tail of a mapped buffer before releasing its pages. */
 	if (!buf_mapped(bp))
 		BUF_CHECK_UNMAPPED(bp);
 	else {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
 		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
 	}
 
 	/* The object lock is taken only when the bufobj has an object. */
 	obj = bp->b_bufobj->bo_object;
 	if (obj != NULL)
 		VM_OBJECT_WLOCK(obj);
 
 	/* Detach and unwire every page from desiredpages onward. */
 	idx = desiredpages;
 	while (idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		KASSERT(pg != bogus_page, ("allocbuf: bogus page found"));
 		bp->b_pages[idx] = NULL;
 		vfs_vmio_unwire(bp, pg);
 		idx++;
 	}
 
 	if (obj != NULL)
 		VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = desiredpages;
 }
 
 /*
  * Byte granular extension of VMIO buffers.
  */
 static void
 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
 {
 	/*
 	 * We are growing the buffer, possibly in a
 	 * byte-granular fashion.
 	 */
 	vm_object_t obj;
 	vm_offset_t toff;
 	vm_offset_t tinc;
 	vm_page_t m;
 
 	/*
 	 * Step 1, bring in the VM pages from the object, allocating
 	 * them if necessary.  We must clear B_CACHE if these pages
 	 * are not valid for the range covered by the buffer.
 	 */
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	while (bp->b_npages < desiredpages) {
 		/*
 		 * We must allocate system pages since blocking
 		 * here could interfere with paging I/O, no
 		 * matter which process we are.
 		 *
 		 * Only exclusive busy can be tested here.
 		 * Blocking on shared busy might lead to
 		 * deadlocks once allocbuf() is called after
 		 * pages are vfs_busy_pages().
 		 */
 		m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
 		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
 		    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
 		/* A page with no valid bits means the data is not cached. */
 		if (m->valid == 0)
 			bp->b_flags &= ~B_CACHE;
 		bp->b_pages[bp->b_npages] = m;
 		++bp->b_npages;
 	}
 
 	/*
 	 * Step 2.  We've loaded the pages into the buffer,
 	 * we have to figure out if we can still have B_CACHE
 	 * set.  Note that B_CACHE is set according to the
 	 * byte-granular range ( bcount and size ), not the
 	 * aligned range ( newbsize ).
 	 *
 	 * The VM test is against m->valid, which is DEV_BSIZE
 	 * aligned.  Needless to say, the validity of the data
 	 * needs to also be DEV_BSIZE aligned.  Note that this
 	 * fails with NFS if the server or some other client
 	 * extends the file's EOF.  If our buffer is resized,
 	 * B_CACHE may remain set! XXX
 	 */
 	/*
 	 * Walk the newly covered range [b_bcount, size).  The first step
 	 * runs to the next page boundary, later steps are whole pages,
 	 * and every step is clipped to the bytes remaining.  The loop
 	 * stops early once B_CACHE is no longer set.
 	 */
 	toff = bp->b_bcount;
 	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 	while ((bp->b_flags & B_CACHE) && toff < size) {
 		vm_pindex_t pi;
 
 		if (tinc > (size - toff))
 			tinc = size - toff;
 		/* Index into b_pages[] of the page holding byte toff. */
 		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
 		m = bp->b_pages[pi];
 		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
 		toff += tinc;
 		tinc = PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 	/*
 	 * Step 3, fixup the KVA pmap.
 	 */
 	if (buf_mapped(bp))
 		bpmap_qenter(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 }
 
 /*
  * Check to see if a block at a particular lbn is available for a clustered
  * write.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *bpa;
 	int match;
 
 	/* A block that is not in core cannot join the cluster. */
 	bpa = gbincore(&vp->v_bufobj, lblkno);
 	if (bpa == NULL)
 		return (0);
 
 	/* Never wait for a busy buffer; treat it as a non-match. */
 	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/*
 	 * The candidate matches only if it is a valid, clusterable delayed
 	 * write of the expected size, it has been mapped (b_blkno differs
 	 * from b_lblkno), and it sits at the expected place on disk.
 	 */
 	match = (bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 	    (B_DELWRI | B_CLUSTEROK) &&
 	    bpa->b_bufsize == size &&
 	    bpa->b_blkno != bpa->b_lblkno &&
 	    bpa->b_blkno == blkno;
 
 	BUF_UNLOCK(bpa);
 	return (match);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
 *	This is much better than the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	/* An unmapped bp asks cluster_wbuild() for GB_UNMAPPED as well. */
 	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather than at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		/*
 		 * Probe forward: i ends as 1 (for bp itself) plus the
 		 * number of consecutive following blocks that passed
 		 * vfs_bio_clcheck().
 		 */
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		/*
 		 * Probe backward, bounded so that i + j never exceeds maxcl
 		 * and lblkno - j never goes below zero.
 		 */
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		/* j stopped one past the last accepted preceding block. */
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			/* bp is unlocked before cluster_wbuild() is called. */
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	/* Sample the size before bwrite(); its return value is ignored. */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  *	getnewbuf_kva:
  *
  *	Allocate KVA for an empty buf header according to gbflags.
  */
 static int
 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
 {
 
 	/* GB_UNMAPPED without GB_KVAALLOC needs no KVA at all. */
 	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED)
 		return (0);
 
 	/*
 	 * KVA is handed out in BKVASIZE chunks to keep fragmentation
 	 * sane, so round the request up.  XXX with vmem we can do page size.
 	 */
 	maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 	/* The buffer already owns KVA of exactly the right size. */
 	if (maxsize == bp->b_kvasize)
 		return (0);
 
 	if (bufkva_alloc(bp, maxsize, gbflags))
 		return (ENOSPC);
 	return (0);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	The caller is responsible for releasing the reserved bufspace after
  *	allocbuf() is called.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
 {
 	struct buf *bp;
 	bool metadata, reserved;
 
 	bp = NULL;
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	/*
 	 * The request counts as metadata when there is no vnode, the vnode
 	 * is flagged VV_MD or VV_SYSTEM, or it is a character device.
 	 */
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = true;
 	else
 		metadata = false;
 	atomic_add_int(&getnewbufcalls, 1);
 	reserved = false;
 	/*
 	 * Each "continue" below jumps to the loop condition, so a failed
 	 * step runs buf_scan(false) and retries only if that returns 0.
 	 * The space reservation is made once and kept across retries.
 	 */
 	do {
 		if (reserved == false &&
 		    bufspace_reserve(maxsize, metadata) != 0)
 			continue;
 		reserved = true;
 		if ((bp = buf_alloc()) == NULL)
 			continue;
 		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
 			return (bp);
 		/* KVA allocation failed: leave the loop without rescanning. */
 		break;
 	} while(buf_scan(false) == 0);
 
 	/*
 	 * Failure path: give back the reserved space, release any buffer
 	 * header obtained as invalid, wait in bufspace_wait(), and return
 	 * NULL to the caller.
 	 */
 	if (reserved)
 		atomic_subtract_long(&bufspace, maxsize);
 	if (bp != NULL) {
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 	}
 	bufspace_wait(vp, gbflags, slpflag, slptimeo);
 
 	return (NULL);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 /*
  * Descriptor for the "bufdaemon" kernel process: it names buf_daemon() and
  * &bufdaemonproc, and is handed to kproc_start() by the SYSINIT below.
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
 buf_flush(struct vnode *vp, int target)
 {
 	int flushed;
 
 	/* First pass: only buffers without rollback dependencies. */
 	flushed = flushbufqueues(vp, target, 0);
 	if (flushed != 0)
 		return (flushed);
 
 	/*
 	 * Nothing could be flushed without touching buffers that carry
 	 * rollback dependencies, so write those as well in the hope of
 	 * eventually making progress.  For a vnode-specific flush the
 	 * target is halved first (when it is larger than 2).  The result
 	 * of this second pass is deliberately not reported.
 	 */
 	if (vp != NULL && target > 2)
 		target /= 2;
 	flushbufqueues(vp, target, 1);
 	return (0);
 }
 
 /*
  * Main loop of the bufdaemon kernel process.  Every pass flushes dirty
  * buffers through buf_flush() until numdirtybuffers drops to the pass's
  * target or a flush makes no progress, then sleeps on bd_request: up to
  * hz ticks once the low water mark is met, hz / 10 ticks otherwise.
  * The loop has no exit, so this function never returns.
  */
 static void
 buf_daemon(void)
 {
 	int lodirty;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
 	    SHUTDOWN_PRI_LAST);
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		/* bdlock is held here; it is dropped for the flush itself. */
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
 		/*
 		 * The target is normally lodirtybuffers; a pending speedup
 		 * request replaces it with half the current dirty count
 		 * for this pass only.
 		 */
 		lodirty = lodirtybuffers;
 		if (bd_speedupreq) {
 			lodirty = numdirtybuffers / 2;
 			bd_speedupreq = 0;
 		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.
 		 */
 		while (numdirtybuffers > lodirty) {
 			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
 				break;
 			kern_yield(PRI_USER);
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second and
 		 * then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush a buffer in the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of write them, which NFS is 
  *	particularly sensitive to.
  */
 /*
  * Counter bumped by flushbufqueues() for each buffer it writes while the
  * buffer still has dependencies; published read-write under the _vfs
  * sysctl node.
  */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 /*
  * Scan the dirty queue and write or release up to "target" buffers.
  * When lvp is non-NULL only buffers belonging to that vnode are touched.
  * When flushdeps is zero, buffers for which buf_countdeps() reports
  * dependencies are skipped.  Returns the number of buffers flushed.
  */
 static int
 flushbufqueues(struct vnode *lvp, int target, int flushdeps)
 {
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int queue;
 	int error;
 	bool unlock;
 
 	flushed = 0;
 	queue = QUEUE_DIRTY;
 	bp = NULL;
 	/*
 	 * The sentinel is a dummy buf that marks this scan's position in
 	 * the queue, so the queue lock can be dropped between steps.
 	 */
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	while (flushed != target) {
 		maybe_yield();
 		mtx_lock(&bqlocks[queue]);
 		/*
 		 * Take the entry after the sentinel and move the sentinel
 		 * past it; an empty tail ends the scan.
 		 */
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 			    b_freelist);
 		} else {
 			mtx_unlock(&bqlocks[queue]);
 			break;
 		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
 		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
 		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
 		    bp->b_vp != lvp)) {
 			mtx_unlock(&bqlocks[queue]);
 			continue;
 		}
 		/* Try-lock the buffer while the queue lock is still held. */
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		mtx_unlock(&bqlocks[queue]);
 		if (error != 0)
 			continue;
 
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/* Invalid buffers are released, not written; still counted. */
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/*
 		 * With no caller-supplied vnode, try-lock vp and remember
 		 * to unlock it afterwards.  Otherwise vp is asserted to be
 		 * locked already and only an upgrade to exclusive is
 		 * attempted when it is not held exclusively.
 		 */
 		if (lvp == NULL) {
 			unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		} else {
 			ASSERT_VOP_LOCKED(vp, "getbuf");
 			unlock = false;
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 		}
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			/*
 			 * The buf daemon goes through vfs_bio_awrite();
 			 * any other caller issues a plain bwrite() and is
 			 * counted in notbufdflushes.
 			 */
 			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
 			} else {
 				bremfree(bp);
 				bwrite(bp);
 				notbufdflushes++;
 			}
 			vn_finished_write(mp);
 			if (unlock)
 				VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 
 			/*
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
 			if (curproc == bufdaemonproc &&
 			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		/* Could not get the vnode lock: back out and move on. */
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	/* Remove and free the sentinel before reporting the count. */
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *found;
 
 	/* The lookup runs under the bufobj read lock. */
 	BO_RLOCK(bo);
 	found = gbincore(bo, blkno);
 	BO_RUNLOCK(bo);
 
 	return (found);
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 
 static int
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t toff, tinc, size;
 	vm_page_t m;
 	vm_ooffset_t off;
 	int resident;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	/* A buffer already present in the bufobj settles the question. */
 	if (incore(&vp->v_bufobj, blkno))
 		return (1);
 	if (vp->v_mount == NULL)
 		return (0);
 	obj = vp->v_object;
 	if (obj == NULL)
 		return (0);
 
 	/* Probe in steps of at most one page or one filesystem block. */
 	size = PAGE_SIZE;
 	if (size > vp->v_mount->mnt_stat.f_iosize)
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	/* Assume resident until a missing or invalid range is found. */
 	resident = 1;
 	VM_OBJECT_RLOCK(obj);
 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 		if (m == NULL) {
 			resident = 0;
 			break;
 		}
 		/* Never let a step run past the end of the current page. */
 		tinc = size;
 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 		if (vm_page_is_valid(m,
 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) {
 			resident = 0;
 			break;
 		}
 	}
 	VM_OBJECT_RUNLOCK(obj);
 
 	return (resident);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
  *
  * Note that while we only really need to clean through to b_bcount, we
  * just go ahead and clean through to b_bufsize.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff, noff, eoff;
 	vm_page_t m;
 	int i;
 
 	/* Only VMIO buffers with backing storage have pages to clean. */
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	vfs_setdirty_locked_object(bp);
 	for (i = 0; i < bp->b_npages; i++) {
 		/* noff is the start of the page following foff. */
 		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		/*
 		 * NOTE(review): eoff is computed but only consumed by the
 		 * commented-out call below; it has no effect as written.
 		 */
 		eoff = noff;
 		if (eoff > bp->b_offset + bp->b_bufsize)
 			eoff = bp->b_offset + bp->b_bufsize;
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 		foff = noff;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  * Widen bp's dirty range (b_dirtyoff .. b_dirtyend) to cover any of its
  * pages that are marked dirty.  The caller must hold the write lock on
  * the buffer's VM object, which is asserted below.
  */
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_object_t object;
 	int i;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
 
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * Calculate the encompassing dirty range, boffset and eoffset,
 		 * (eoffset - boffset) bytes.
 		 */
 
 		/* First dirty page, scanning forward. */
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty)
 				break;
 		}
 		/* Page index converted to a byte offset within the buffer. */
 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/* Last dirty page, scanning backward. */
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		/* End of that page, again as a buffer-relative offset. */
 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/*
 		 * Fit it to the buffer.
 		 */
 
 		if (eoffset > bp->b_bcount)
 			eoffset = bp->b_bcount;
 
 		/*
 		 * If we have a good dirty range, merge with the existing
 		 * dirty range.
 		 */
 
 		if (boffset < eoffset) {
 			if (bp->b_dirtyoff > boffset)
 				bp->b_dirtyoff = boffset;
 			if (bp->b_dirtyend < eoffset)
 				bp->b_dirtyend = eoffset;
 		}
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.
  * If an unmapped buffer is provided but a mapped buffer is requested, take
  * also care to properly setup mappings between pages and KVA.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	/*
 	 * need_mapping: the buffer is unmapped but the caller did not ask
 	 * for GB_UNMAPPED.  need_kva: the buffer has neither a mapping nor
 	 * reserved KVA and the caller asked for GB_KVAALLOC.
 	 */
 	need_mapping = bp->b_data == unmapped_buf &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = bp->b_kvabase == unmapped_buf &&
 	    bp->b_data == unmapped_buf &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && bp->b_kvabase != unmapped_buf) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 	offset = blkno * bsize;
 	/* Room for the data plus its offset into the first page. */
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 	/* Retry the KVA allocation, waiting for buffer space in between. */
 	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
 		if ((gbflags & GB_NOWAIT_BD) != 0) {
 			/*
 			 * XXXKIB: defragmentation cannot
 			 * succeed, not sure what else to do.
 			 */
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		atomic_add_int(&mappingrestarts, 1);
 		bufspace_wait(bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
 		/* b_offset is handled by bpmap_qenter. */
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 		bpmap_qenter(bp);
 	}
 }
 
 /*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *	
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > MAXBCACHEBUF)
 		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
 		    MAXBCACHEBUF);
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if (flags & GB_LOCK_NOWAIT)
 			lockflags |= LK_NOWAIT;
 
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due 
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
 		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
 			return NULL;
 
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
 			 *
 			 * There's an issue on low memory, 4BSD+non-preempt
 			 * systems (eg MIPS routers with 32MB RAM) where buffer
 			 * exhaustion occurs without sleeping for buffer
 			 * reclaimation.  This just sticks in a loop and
 			 * constantly attempts to allocate a buffer, which
 			 * hits exhaustion and tries to wakeup bufdaemon.
 			 * This never happens because we never yield.
 			 *
 			 * The real solution is to identify and fix these cases
 			 * so we aren't effectively busy-waiting in a loop
 			 * until the reclaimation path has cycles to run.
 			 */
 			kern_yield(PRI_USER);
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			bufspace_release(maxsize);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bufspace_release(maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
 end:
+	buf_track(bp, __func__);
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
 }
 
 /*
  * Allocate a buffer that is not associated with any vnode.  The request is
  * rounded up to a multiple of BKVAMASK + 1 for getnewbuf(), the buffer is
  * then sized to 'size' and handed back marked B_INVAL.
  *
  * Allocation is retried until it succeeds, except that NULL is returned
  * when GB_NOWAIT_BD is set in 'flags' and the current thread has
  * TDP_BUFNEED set.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *newbp;
 	int kvasize;
 
 	kvasize = (size + BKVAMASK) & ~BKVAMASK;
 	for (;;) {
 		newbp = getnewbuf(NULL, 0, 0, kvasize, flags);
 		if (newbp != NULL)
 			break;
 		if ((flags & GB_NOWAIT_BD) &&
 		    (curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(newbp, size);
 	bufspace_release(kvasize);
 	/* b_dep cleared by getnewbuf() */
 	newbp->b_flags |= B_INVAL;
 	BUF_ASSERT_HELD(newbp);
 	return (newbp);
 }
 
 /*
  * Shrink the backing store of a non-VMIO buffer to 'newbsize' bytes.
  *
  * Page-backed buffers give back the pages beyond the new size.  Malloc-
  * backed buffers are never partially shrunk; their memory is only freed
  * when the new size is zero.
  */
 static void
 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
 {
 
 	if ((bp->b_flags & B_MALLOC) == 0) {
 		vm_hold_free_pages(bp, newbsize);
 		bufspace_adjust(bp, newbsize);
 		return;
 	}
 
 	/* Malloced buffer: nothing to do unless it is being emptied. */
 	if (newbsize != 0)
 		return;
 	bufmallocadjust(bp, 0);
 	free(bp->b_data, M_BIOBUF);
 	bp->b_data = bp->b_kvabase;
 	bp->b_flags &= ~B_MALLOC;
 }
 
 /*
  * Extend the backing for a non-VMIO buffer to 'newbsize' bytes.
  *
  * An empty buffer asking for at most half a page is served from malloc(9)
  * while bufmallocspace is below maxbufmallocspace.  Every other request is
  * served with wired pages; a buffer that was malloc-backed is migrated to
  * pages, preserving its old contents.
  */
 static void
 vfs_nonvmio_extend(struct buf *bp, int newbsize)
 {
 	caddr_t origbuf;
 	int origbufsize;
 
 	/*
 	 * We only use malloced memory on the first allocation.
 	 * and revert to page-allocated memory when the buffer
 	 * grows.
 	 *
 	 * There is a potential smp race here that could lead
 	 * to bufmallocspace slightly passing the max.  It
 	 * is probably extremely rare and not worth worrying
 	 * over.
 	 */
 	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
 	    bufmallocspace < maxbufmallocspace) {
 		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
 		bp->b_flags |= B_MALLOC;
 		bufmallocadjust(bp, newbsize);
 		return;
 	}
 
 	/*
 	 * If the buffer is growing on its other-than-first
 	 * allocation then we revert to the page-allocation
 	 * scheme.
 	 */
 	origbuf = NULL;
 	origbufsize = 0;
 	if (bp->b_flags & B_MALLOC) {
 		/*
 		 * Remember the malloced region, point b_data back at the
 		 * buffer's KVA and drop the malloc accounting before the
 		 * pages are loaded.
 		 */
 		origbuf = bp->b_data;
 		origbufsize = bp->b_bufsize;
 		bp->b_data = bp->b_kvabase;
 		bufmallocadjust(bp, 0);
 		bp->b_flags &= ~B_MALLOC;
 		newbsize = round_page(newbsize);
 	}
 	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
 	    (vm_offset_t) bp->b_data + newbsize);
 	if (origbuf != NULL) {
 		/* Carry the old contents over, then free the malloced copy. */
 		bcopy(origbuf, bp->b_data, origbufsize);
 		free(origbuf, M_BIOBUF);
 	}
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!!
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  *
  * The buffer must be held by the caller.  The function panics if the
  * buffer has KVA that is smaller than 'size', or if a VMIO buffer is
  * malloc-backed.  It always returns 1.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize;
 
 	BUF_ASSERT_HELD(bp);
 
 	/* Nothing to do when the requested size is already in effect. */
 	if (bp->b_bcount == size)
 		return (1);
 
 	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	/* Backing store is sized in DEV_BSIZE units. */
 	newbsize = roundup2(size, DEV_BSIZE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		/* Page-backed (non-malloc) buffers are sized in whole pages. */
 		if ((bp->b_flags & B_MALLOC) == 0)
 			newbsize = round_page(newbsize);
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		if (newbsize < bp->b_bufsize)
 			vfs_nonvmio_truncate(bp, newbsize);
 		else if (newbsize > bp->b_bufsize)
 			vfs_nonvmio_extend(bp, newbsize);
 	} else {
 		int desiredpages;
 
 		/* Pages needed, counting the offset into the first page. */
 		desiredpages = (size == 0) ? 0 :
 		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize)
 			vfs_vmio_truncate(bp, desiredpages);
 		/* XXX This looks as if it should be newbsize > b_bufsize */
 		else if (size > bp->b_bcount)
 			vfs_vmio_extend(bp, desiredpages, size);
 		bufspace_adjust(bp, newbsize);
 	}
 	bp->b_bcount = size;		/* requested buffer size. */
 	return (1);
 }
 
 extern int inflight_transient_maps;
 
 /*
  * Mark a bio as finished.
  *
  * If the bio carries a transient mapping, the mapping is removed, its KVA
  * is returned to transient_arena and the bio is flagged BIO_UNMAPPED.
  * BIO_DONE is then set; if the bio has a bio_done callback it is invoked,
  * otherwise any thread sleeping on the bio is woken under the pool mutex
  * that biowait() also uses.
  */
 void
 biodone(struct bio *bp)
 {
 	struct mtx *mtxp;
 	void (*done)(struct bio *);
 	vm_offset_t start, end;
 
+	biotrack(bp, __func__);
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 		/* Page-aligned range that covers the mapped data. */
 		start = trunc_page((vm_offset_t)bp->bio_data);
 		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
 		bp->bio_data = unmapped_buf;
 		pmap_qremove(start, OFF_TO_IDX(end - start));
 		vmem_free(transient_arena, start, end - start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 	/* Read the callback once, before BIO_DONE becomes visible. */
 	done = bp->bio_done;
 	if (done == NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, bp);
 		mtx_lock(mtxp);
 		bp->bio_flags |= BIO_DONE;
 		wakeup(bp);
 		mtx_unlock(mtxp);
 	} else {
 		bp->bio_flags |= BIO_DONE;
 		done(bp);
 	}
 }
 
 /*
  * Sleep until BIO_DONE is set on the bio, using 'wchan' as the wait
  * message.
  *
  * Returns bio_error when it is non-zero, otherwise EIO if BIO_ERROR is set
  * in bio_flags, otherwise 0.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	for (;;) {
 		if ((bp->bio_flags & BIO_DONE) != 0)
 			break;
 		msleep(bp, lock, PRIBIO, wchan, 0);
 	}
 	mtx_unlock(lock);
 
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	return ((bp->bio_flags & BIO_ERROR) != 0 ? EIO : 0);
 }
 
 /*
  * Finish a bio: record 'error' on it when non-zero, close the devstat
  * transaction when 'stat' is supplied, and then call biodone().
  */
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error != 0) {
 		bp->bio_error = error;
 		bp->bio_flags |= BIO_ERROR;
 	}
 
 	if (stat != NULL) {
 		devstat_end_transaction_bio(stat, bp);
 	}
 
 	biodone(bp);
 }
 
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+void
+biotrack_buf(struct bio *bp, const char *location)
+{
+	/* Record 'location' against the buf referenced by bio_track_bp. */
+	buf_track(bp->bio_track_bp, location);
+}
+#endif
+
 /*
  *	bufwait:
  *
  *	Block until I/O on the buffer has completed and report its status.
  *	The wait message is "biord" for reads and "biowr" otherwise.  The
  *	buffer is left locked and B_DONE on return.
  *
  *	Returns EINTR (clearing B_EINTR) if the buffer was flagged
  *	B_EINTR, the buffer's error (or EIO when b_error is 0) if BIO_ERROR
  *	is set, and 0 otherwise.
  */
 int
 bufwait(struct buf *bp)
 {
 	const char *wmesg;
 
 	wmesg = (bp->b_iocmd == BIO_READ) ? "biord" : "biowr";
 	bwait(bp, PRIBIO, wmesg);
 
 	if ((bp->b_flags & B_EINTR) != 0) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		return (0);
 	if (bp->b_error != 0)
 		return (bp->b_error);
 	return (EIO);
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	bufdone (via bufdone_finish) is also responsible for setting
  *	B_CACHE in a B_VMIO bp.  In a non-VMIO bp, B_CACHE will be set on
  *	the next getblk() assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
  *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the completion routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	struct bufobj *dropobj;
 	void    (*biodone)(struct buf *);
 
+	buf_track(bp, __func__);
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	dropobj = NULL;
 
 	/* NOTE(review): the assertion text still uses the name "biodone". */
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 	BUF_ASSERT_HELD(bp);
 
 	runningbufwakeup(bp);
 	/*
 	 * For writes, remember the bufobj now so that its output count can
 	 * be dropped after completion without touching bp again.
 	 */
 	if (bp->b_iocmd == BIO_WRITE)
 		dropobj = bp->b_bufobj;
 	/* call optional completion function if requested */
 	if (bp->b_iodone != NULL) {
 		/* The callback is one-shot: clear it before invoking it. */
 		biodone = bp->b_iodone;
 		bp->b_iodone = NULL;
 		(*biodone) (bp);
 		if (dropobj)
 			bufobj_wdrop(dropobj);
 		return;
 	}
 
 	bufdone_finish(bp);
 
 	if (dropobj)
 		bufobj_wdrop(dropobj);
 }
 
 /*
  * Default completion processing for a buffer: run dependency completion
  * if the buffer has dependencies, do the VMIO bookkeeping, and then either
  * release the buffer (asynchronous I/O) or mark it done and wake waiters
  * (synchronous I/O).
  */
 void
 bufdone_finish(struct buf *bp)
 {
 	BUF_ASSERT_HELD(bp);
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if ((bp->b_flags & B_VMIO) != 0) {
 		/*
 		 * A normal read that finished without error makes the
 		 * buffer cacheable.  Writes get B_CACHE in the b*write()
 		 * routines instead.
 		 */
 		if (bp->b_iocmd == BIO_READ &&
 		    (bp->b_flags & (B_INVAL|B_NOCACHE)) == 0 &&
 		    (bp->b_ioflags & BIO_ERROR) == 0)
 			bp->b_flags |= B_CACHE;
 		vfs_vmio_iodone(bp);
 	}
 
 	/*
 	 * Synchronous completion: somebody is (or will be) waiting on the
 	 * buffer, so it always needs the wakeup done by bdone().
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0) {
 		bdone(bp);
 		return;
 	}
 
 	/*
 	 * Asynchronous completion: release the buffer now.  The release
 	 * routines take care of any wakeup that is needed.
 	 */
 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_ioflags & BIO_ERROR) != 0)
 		brelse(bp);
 	else
 		bqrelse(bp);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistent.
  *
  * Every page of a VMIO buffer is share-unbusied and the object's
  * paging-in-progress count is dropped by b_npages.  Non-VMIO buffers only
  * get the runningbufwakeup() call.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			/*
 			 * The slot holds the bogus_page placeholder: look the
 			 * real page up in the object, put it back into the
 			 * buffer and, for a mapped buffer, re-enter the
 			 * page array into the buffer's KVA.
 			 */
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			if (buf_mapped(bp)) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Mark as valid the part of page 'm' that starts at file offset 'off'
  *	and runs to the end of that page or to the end of the buffer
  *	(b_offset + b_bcount), whichever comes first.  Nothing is done if
  *	that range is empty.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, eoff;
 
 	/*
 	 * eoff is the next page boundary after off, clipped to the end of
 	 * the buffer.  The end of the buffer here is b_bcount (our file
 	 * EOF), not the allocation size of the buffer.
 	 */
 	bufend = bp->b_offset + bp->b_bcount;
 	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 	if (eoff > bufend)
 		eoff = bufend;
 
 	if (eoff <= off)
 		return;
 
 	/* Typically this covers the entire page. */
 	vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Mark valid and clean the part of page 'm' that starts at file
  *	offset 'off' and runs to the end of that page or to the end of the
  *	buffer (b_offset + b_bcount), whichever comes first.  Nothing is
  *	done if that range is empty.
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, eoff;
 
 	/*
 	 * The range [off, eoff) never crosses a page boundary and never
 	 * extends past the end of the buffer, which here means b_bcount
 	 * (our file EOF) rather than the buffer's allocation size.
 	 */
 	bufend = bp->b_offset + bp->b_bcount;
 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	if (eoff > bufend)
 		eoff = bufend;
 
 	if (eoff <= off)
 		return;
 
 	/* Typically this covers the entire page. */
 	vm_page_set_validclean(m, (vm_offset_t)(off & PAGE_MASK),
 	    (vm_offset_t)(eoff - off));
 }
 
 /*
  * Ensure that all buffer pages are not exclusive busied.  If any page is
  * exclusive busy, drain it.
  *
  * The caller must hold the object write lock; it is dropped and retaken
  * around each sleep.
  */
 void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
 	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (vm_page_xbusied(m)) {
 			/*
 			 * Share-busy the pages already checked (those before
 			 * index i) before the object lock is dropped below.
 			 */
 			for (; last_busied < i; last_busied++)
 				vm_page_sbusy(bp->b_pages[last_busied]);
 			while (vm_page_xbusied(m)) {
 				vm_page_lock(m);
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				vm_page_busy_sleep(m, "vbpage", true);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 			}
 		}
 	}
 	/* Undo the temporary shared busy taken above. */
 	for (i = 0; i < last_busied; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  *
  * With clear_modify non-zero, each page is write-protected and its
  * in-buffer range marked valid and clean.  With clear_modify zero, fully
  * valid pages of a buffer without B_CACHE are replaced by bogus_page.
  * Non-VMIO buffers are left untouched.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	int i, bogus;
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_WLOCK(obj);
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	bogus = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		/* B_CLUSTER buffers skip the pip count and shared busy. */
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in
 		 * partially instantiated buffers.  Partially
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus++;
 		}
 		/* Advance foff to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	/* Re-enter the mapping so that it reflects the substituted pages. */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Mark 'size' bytes starting 'base' bytes into the buffer as valid in
  *	the buffer's pages.  'base' is relative to b_offset, which itself
  *	need not be page aligned.  The walk stops at the last page of the
  *	buffer.  Non-VMIO buffers are ignored.
  */
 void
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	vm_page_t pg;
 	int chunk, idx;
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
 	/*
 	 * Rebase onto the start of the first page; the first chunk is
 	 * whatever remains of the page that 'base' falls into.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	idx = base / PAGE_SIZE;
 	while (size > 0 && idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		if (chunk > size)
 			chunk = size;
 		vm_page_set_valid_range(pg, base & PAGE_MASK, chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first is taken from its start. */
 		chunk = PAGE_SIZE;
 		++idx;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  */
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask, sa, ea, slide;
 
 	/* Anything that is not a pure (non-malloc) VMIO buffer. */
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	/*
 	 * Fast path: a single page, a buffer smaller than a page, and a
 	 * page-aligned offset.  The mask has one bit per DEV_BSIZE chunk
 	 * of the buffer.
 	 */
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		/* Already fully valid: nothing to clear. */
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		/* Fully invalid: zero the whole range in one call. */
 		if ((bp->b_pages[0]->valid & mask) == 0) {
 			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 		/* Partially valid: fall through to the general loop. */
 	}
 	/*
 	 * General case.  For each page, [sa, ea) is the byte range of the
 	 * page to consider; sa is non-zero only for the first page.
 	 */
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		/* j is the index of the first DEV_BSIZE chunk in the range. */
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			/* Mixed: zero only the chunks that are not valid. */
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 /*
  * Zero 'size' bytes of the buffer starting at byte offset 'base'.
  *
  * A mapped buffer is cleared through its b_data mapping.  An unmapped
  * buffer is cleared page by page through the pmap layer, stopping at the
  * last page of the buffer.
  */
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	vm_page_t pg;
 	int chunk, idx;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 		return;
 	}
 
 	BUF_CHECK_UNMAPPED(bp);
 	/* The first chunk is whatever remains of the page 'base' is in. */
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 	idx = base / PAGE_SIZE;
 	while (size > 0 && idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		if (chunk > size)
 			chunk = size;
 		pmap_zero_page_area(pg, base & PAGE_MASK, chunk);
 		base += chunk;
 		size -= chunk;
 		chunk = PAGE_SIZE;
 		++idx;
 	}
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  *
  * vm_hold_load_pages wires one freshly allocated page into each page of
  * the KVA range [from, to) after both bounds have been rounded up to a
  * page boundary, records the pages in b_pages and updates b_npages.
  * Allocation failures are retried after VM_WAIT.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	int index;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	/* Slot in b_pages[] that corresponds to the address 'from'. */
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 tryagain:
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.
 		 */
 		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
 		if (p == NULL) {
 			VM_WAIT;
 			goto tryagain;
 		}
 		pmap_qenter(pg, &p, 1);
 		bp->b_pages[index] = p;
 	}
 	/* index is now one past the last slot filled. */
 	bp->b_npages = index;
 }
 
 /*
  * Return pages associated with this buf to the vm system.
  *
  * Pages beyond 'newbsize' bytes (rounded up to a page) are unmapped from
  * the buffer's KVA, unwired and freed; b_npages is lowered to match.
  */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t from;
 	vm_page_t p;
 	int index, newnpages;
 
 	BUF_CHECK_MAPPED(bp);
 
 	/* First address past the part of the buffer that is kept. */
 	from = round_page((vm_offset_t)bp->b_data + newbsize);
 	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (bp->b_npages > newnpages)
 		pmap_qremove(from, bp->b_npages - newnpages);
 	for (index = newnpages; index < bp->b_npages; index++) {
 		p = bp->b_pages[index];
 		bp->b_pages[index] = NULL;
 		/* A shared-busy page is only reported; it is still freed. */
 		if (vm_page_sbusied(p))
 			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
 			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
 		p->wire_count--;
 		vm_page_free(p);
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 	}
 	bp->b_npages = newnpages;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * The user pages behind b_data are held for b_bufsize bytes (b_bufsize is
  * used rather than b_bcount because a driver might modify b_bcount).  If
  * 'mapbuf' is non-zero, or unmapped buffers are not allowed, the pages
  * are entered at the buffer's KVA and b_data is pointed there; otherwise
  * b_data is set to unmapped_buf.
  *
  * Returns 0 on success and -1 if b_bufsize is negative or the pages could
  * not be held.  Even if the caller determines that the address space
  * should be valid, a race or a smaller file mapped into a larger space may
  * make this fail, so all callers MUST check the return value.
  *
  * This function only works with pager buffers.
  */
 int
 vmapbuf(struct buf *bp, int mapbuf)
 {
 	vm_prot_t prot;
 	int held;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 
 	/*
 	 * A device read stores into the user's memory, so the pages must
 	 * be writable.  Less backwards than it looks.
 	 */
 	if (bp->b_iocmd == BIO_READ)
 		prot = VM_PROT_READ | VM_PROT_WRITE;
 	else
 		prot = VM_PROT_READ;
 
 	held = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
 	    btoc(MAXPHYS));
 	if (held < 0)
 		return (-1);
 
 	bp->b_npages = held;
 	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
 	if (!mapbuf && unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		return (0);
 	}
 	pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, held);
 	bp->b_data = bp->b_kvabase + bp->b_offset;
 	return (0);
 }
 
 /*
  * Undo vmapbuf(): remove the kernel mapping of the buffer's pages if the
  * buffer is mapped, drop the hold on the pages, and leave b_data pointing
  * at unmapped_buf.
  *
  * This function only works with pager buffers.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int count;
 
 	count = bp->b_npages;
 	if (buf_mapped(bp)) {
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), count);
 	}
 	vm_page_unhold_pages(bp->b_pages, count);
 
 	bp->b_data = unmapped_buf;
 }
 
 /*
  * Set B_DONE on the buffer and wake up any thread sleeping on it.  Both
  * steps happen under the pool mutex that bwait() sleeps on.
  */
 void
 bdone(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep at priority 'pri' with wait message 'wchan' until B_DONE is set
  * on the buffer.
  */
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	for (;;) {
 		if ((bp->b_flags & B_DONE) != 0)
 			break;
 		msleep(bp, lock, pri, wchan, 0);
 	}
 	mtx_unlock(lock);
 }
 
 /*
  * Sync a buffer object by calling VOP_FSYNC() on the vnode obtained from
  * it, on behalf of the current thread.  Returns VOP_FSYNC()'s result.
  */
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 	struct vnode *vp;
 
 	vp = bo2vnode(bo);
 	return (VOP_FSYNC(vp, waitfor, curthread));
 }
 
 /*
  * Hand a buffer to its vnode's strategy routine.  The buffer's vnode must
  * be the one behind 'bo' and must not be a character or block device;
  * VOP_STRATEGY() is asserted to succeed.
  */
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	struct vnode *vp;
 	int rv = 0;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 
 	rv = VOP_STRATEGY(vp, bp);
 	KASSERT(rv == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  * Count one more write in progress on the buffer object.  This is the
  * variant for callers that already hold the bufobj lock for writing,
  * which is asserted here.
  */
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	/* The message names this function, not its sibling bufobj_wref(). */
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 /*
  * Count one more write in progress on the buffer object, taking and
  * releasing the bufobj lock around the update.
  */
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 
 	BO_LOCK(bo);
 	bo->bo_numoutput += 1;
 	BO_UNLOCK(bo);
 }
 
 /*
  * Drop one write-in-progress count from the buffer object.  When the
  * count reaches zero and a waiter has set BO_WWAIT, the flag is cleared
  * and the waiter is woken up.
  */
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	bo->bo_numoutput--;
 	if (bo->bo_numoutput == 0 && (bo->bo_flag & BO_WWAIT) != 0) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Describe the data of a struct buf in a struct bio.
  *
  * For a mapped buffer bio_data is pointed at b_data and bio_ma is
  * cleared.  For an unmapped buffer the bio gets the buffer's page array
  * (bio_ma, bio_ma_n), the offset into the first page, bio_data set to
  * unmapped_buf and the BIO_UNMAPPED flag.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	if (buf_mapped(bp)) {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 		return;
 	}
 
 	KASSERT(unmapped_buf_allowed, ("unmapped"));
 	bip->bio_ma = bp->b_pages;
 	bip->bio_ma_n = bp->b_npages;
 	bip->bio_data = unmapped_buf;
 	bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 	bip->bio_flags |= BIO_UNMAPPED;
 	/* The page array must cover exactly the bio's byte range. */
 	KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 	    PAGE_SIZE == bp->b_npages,
 	    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 	    (long long)bip->bio_length, bip->bio_ma_n));
 }
 
 /*
  * Knob exported as the vfs.buf_pager_relbuf sysctl (CTLFLAG_RWTUN).  Being
  * a static int without an initializer it starts out as 0.  Per its
  * description, a non-zero value makes the buffer pager release buffers
  * after reading.
  */
 static int buf_pager_relbuf;
 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
     &buf_pager_relbuf, 0,
     "Make buffer pager release buffers after reading");
 
 /*
  * The buffer pager.  It uses buffer reads to validate pages.
  *
  * In contrast to the generic local pager from vm/vnode_pager.c, this
  * pager correctly and easily handles volumes where the underlying
  * device block size is greater than the machine page size.  The
  * buffer cache transparently extends the requested page run to be
  * aligned at the block boundary, and does the necessary bogus page
  * replacements in the addends to avoid obliterating already valid
  * pages.
  *
  * The only non-trivial issue is that the exclusive busy state for
  * pages, which is assumed by the vm_pager_getpages() interface, is
  * incompatible with the VMIO buffer cache's desire to share-busy the
  * pages.  This function performs a trivial downgrade of the pages'
  * state before reading buffers, and a less trivial upgrade from the
  * shared-busy to excl-busy state after the read.
  *
  * Arguments:
  *	vp		vnode whose v_object owns the pages
  *	ma, count	the run of pages to validate; entries of ma[] are
  *			replaced by the result of vm_page_grab() on exit
  *	rbehind, rahead	optional outputs (may be NULL), filled with page
  *			counts derived from the block size of the first
  *			page; rahead is clipped to the object size
  *	get_lblkno	maps a byte offset in the vnode to a logical block
  *	get_blksize	returns the size of a logical block
  *
  * Returns VM_PAGER_BAD when the last requested page starts at or past
  * the object size, VM_PAGER_ERROR when a buffer read failed, and
  * VM_PAGER_OK otherwise.
  */
 int
 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
     int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
     vbg_get_blksize_t get_blksize)
 {
 	vm_page_t m;
 	vm_object_t object;
 	struct buf *bp;
 	daddr_t lbn, lbnp;
 	vm_ooffset_t la, lb, poff, poffe;
 	long bsize;
 	int bo_bs, error, i;
 	bool redo, lpart;
 
 	object = vp->v_object;
 
 	/*
 	 * The read loop below may skip every page (all already valid, or
 	 * all covered by a previously read block) without ever calling
 	 * bread_gb(), so error must have a defined value before it is
 	 * tested at the end of the function.
 	 */
 	error = 0;
 
 	la = IDX_TO_OFF(ma[count - 1]->pindex);
 	if (la >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 	/* lpart: the last page extends past the end of the object. */
 	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
 	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
 	if (rbehind != NULL) {
 		lb = IDX_TO_OFF(ma[0]->pindex);
 		*rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
 	}
 	if (rahead != NULL) {
 		*rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
 		if (la + IDX_TO_OFF(*rahead) >= object->un_pager.vnp.vnp_size) {
 			*rahead = OFF_TO_IDX(roundup2(object->un_pager.
 			    vnp.vnp_size, PAGE_SIZE) - la);
 		}
 	}
 	VM_OBJECT_WLOCK(object);
 again:
 	for (i = 0; i < count; i++)
 		vm_page_busy_downgrade(ma[i]);
 	VM_OBJECT_WUNLOCK(object);
 
 	/* lbnp remembers the last block read so it is not read twice. */
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 
 		/*
 		 * Pages are shared busy and the object lock is not
 		 * owned, which together allow for the pages'
 		 * invalidation.  The racy test for validity avoids
 		 * useless creation of the buffer for the most typical
 		 * case when invalidation is not used in redo or for
 		 * parallel read.  The shared->excl upgrade loop at
 		 * the end of the function catches the race in a
 		 * reliable way (protected by the object lock).
 		 */
 		if (m->valid == VM_PAGE_BITS_ALL)
 			continue;
 
 		poff = IDX_TO_OFF(m->pindex);
 		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
 		for (; poff < poffe; poff += bsize) {
 			lbn = get_lblkno(vp, poff);
 			if (lbn == lbnp)
 				goto next_page;
 			lbnp = lbn;
 
 			bsize = get_blksize(vp, lbn);
 			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
 			    &bp);
 			if (error != 0)
 				goto end_pages;
 			if (LIST_EMPTY(&bp->b_dep)) {
 				/*
 				 * Invalidation clears m->valid, but
 				 * may leave B_CACHE flag if the
 				 * buffer existed at the invalidation
 				 * time.  In this case, recycle the
 				 * buffer to do real read on next
 				 * bread() after redo.
 				 *
 				 * Otherwise B_RELBUF is not strictly
 				 * necessary, enable to reduce buf
 				 * cache pressure.
 				 */
 				if (buf_pager_relbuf ||
 				    m->valid != VM_PAGE_BITS_ALL)
 					bp->b_flags |= B_RELBUF;
 
 				bp->b_flags &= ~B_NOCACHE;
 				brelse(bp);
 			} else {
 				bqrelse(bp);
 			}
 		}
 		KASSERT(1 /* racy, enable for debugging */ ||
 		    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
 		    ("buf %d %p invalid", i, m));
 		if (i == count - 1 && lpart) {
 			VM_OBJECT_WLOCK(object);
 			if (m->valid != 0 &&
 			    m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, TRUE);
 			VM_OBJECT_WUNLOCK(object);
 		}
 next_page:;
 	}
 end_pages:
 
 	VM_OBJECT_WLOCK(object);
 	redo = false;
 	for (i = 0; i < count; i++) {
 		vm_page_sunbusy(ma[i]);
 		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
 
 		/*
 		 * Since the pages were only sbusy while neither the
 		 * buffer nor the object lock was held by us, or
 		 * reallocated while vm_page_grab() slept for busy
 		 * relinquish, they could have been invalidated.
 		 * Recheck the valid bits and re-read as needed.
 		 *
 		 * Note that the last page is made fully valid in the
 		 * read loop, and partial validity for the page at
 		 * index count - 1 could mean that the page was
 		 * invalidated or removed, so we must restart for
 		 * safety as well.
 		 */
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			redo = true;
 	}
 	/* Retry only after a clean pass; a read error ends the loop. */
 	if (redo && error == 0)
 		goto again;
 	VM_OBJECT_WUNLOCK(object);
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB command to show buffer data.
  *
  * "show buffer <addr>" treats addr as a struct buf pointer and prints
  * its flag words, sizes, block numbers, KVA range, the list of backing
  * pages and, when compiled with buffer tracking, the recorded tracking
  * strings, followed by the buffer lock state.
  */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
+#ifdef FULL_BUF_TRACKING
+	uint32_t i, j;
+#endif
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
 	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
 	db_printf("b_kvabase = %p, b_kvasize = %d\n",
 	    bp->b_kvabase, bp->b_kvasize);
 	if (bp->b_npages) {
 		/*
 		 * NOTE(review): under FULL_BUF_TRACKING this 'i' shadows
 		 * the function-scope uint32_t 'i' declared above.
 		 */
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			if (m != NULL)
 				db_printf("(%p, 0x%lx, 0x%lx)", m->object,
 				    (u_long)m->pindex,
 				    (u_long)VM_PAGE_TO_PHYS(m));
 			else
 				db_printf("( ??? )");
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
+#if defined(FULL_BUF_TRACKING)
+	db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
+
 	/*
 	 * Print BUF_TRACKING_SIZE entries, stepping backwards from the
 	 * slot selected by b_io_tcnt.  The unsigned (i - j) may wrap;
 	 * presumably BUF_TRACKING_ENTRY() reduces it to a valid index --
 	 * TODO confirm against its definition.
 	 */
+	i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
+	for (j = 1; j <= BUF_TRACKING_SIZE; j++)
+		db_printf(" %2u: %s\n", j,
+		    bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
+#elif defined(BUF_TRACKING)
+	db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
+#endif
 	db_printf(" ");
 	BUF_LOCKPRINTINFO(bp);
 }
 
 /*
  * "show lockedbufs": scan the buf[] array and dump every buffer for
  * which BUF_ISLOCKED() is true, using the "show buffer" handler.
  */
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int i;
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (!BUF_ISLOCKED(bp))
 			continue;
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *vp;
 	struct buf *bp;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	vp = (struct vnode *)addr;
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (bp->b_qindex == QUEUE_EMPTY)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: head/sys/kern/vfs_cluster.c
===================================================================
--- head/sys/kern/vfs_cluster.c	(revision 308154)
+++ head/sys/kern/vfs_cluster.c	(revision 308155)
@@ -1,1063 +1,1064 @@
 /*-
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  * Modifications/enhancements:
  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_debug_cluster.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <sys/sysctl.h>
 
 #if defined(CLUSTERDEBUG)
 static int	rcluster= 0;
 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
     "Debug VFS clustering code");
 #endif
 
 static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
 
 static struct cluster_save *cluster_collectbufs(struct vnode *vp,
 	    struct buf *last_bp, int gbflags);
 static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
 	    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
 	    struct buf *fbp);
 static void cluster_callback(struct buf *);
 
 /*
  * Write-behind policy consulted by cluster_wbuild_wb(); the meaning of
  * each value is spelled out in the sysctl description.
  */
 static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
     "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
 
 /* Upper bound applied to the read-ahead window (maxra) in cluster_read(). */
 static int read_max = 64;
 SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
     "Cluster read-ahead max block count");
 
 /*
  * On a cache miss cluster_read() raises the requested length to at
  * least read_min blocks.
  */
 static int read_min = 1;
 SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
     "Cluster read min block count");
 
 /* Page expended to mark partially backed buffers */
 extern vm_page_t	bogus_page;
 
 /*
  * Read data to a buf, including read-ahead if we find this to be beneficial.
  * cluster_read replaces bread.
  *
  * Arguments:
  *	vp		vnode to read from
  *	filesize	size of the file in bytes; bounds the read-ahead
  *	lblkno		logical block requested
  *	size		block size used for every getblk() call here
  *	cred		not referenced in this function body
  *	totread		bytes the caller wants; raised to read_min blocks
  *			and clipped to filesize on a cache miss
  *	seqcount	sequential-access hint; seeds the read-ahead window
  *	gbflags		getblk() flags; GB_UNMAPPED is stripped when
  *			unmapped buffers are not allowed
  *	bpp		receives the buffer for lblkno, or NULL when the
  *			synchronous read failed
  *
  * Returns EBUSY when the requested buffer cannot be obtained, otherwise
  * 0 or an error.  NOTE(review): when the requested block was already
  * cached (reqbp == NULL), a VOP_BMAP() failure in the read-ahead loop is
  * what ends up in the return value while *bpp still holds the buffer --
  * confirm callers expect that.
  */
 int
 cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
     struct ucred *cred, long totread, int seqcount, int gbflags,
     struct buf **bpp)
 {
 	struct buf *bp, *rbp, *reqbp;
 	struct bufobj *bo;
 	daddr_t blkno, origblkno;
 	int maxra, racluster;
 	int error, ncontig;
 	int i;
 
 	error = 0;
 	bo = &vp->v_bufobj;
 	if (!unmapped_buf_allowed)
 		gbflags &= ~GB_UNMAPPED;
 
 	/*
 	 * Try to limit the amount of read-ahead by a few
 	 * ad-hoc parameters.  This needs work!!!
 	 */
 	/* racluster: blocks of 'size' bytes per I/O of mnt_iosize_max. */
 	racluster = vp->v_mount->mnt_iosize_max / size;
 	maxra = seqcount;
 	maxra = min(read_max, maxra);
 	maxra = min(nbuf/8, maxra);
 	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
 		maxra = (filesize / size) - lblkno;
 
 	/*
 	 * get the requested block
 	 */
 	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
 	if (bp == NULL)
 		return (EBUSY);
 	origblkno = lblkno;
 
 	/*
 	 * if it is in the cache, then check to see if the reads have been
 	 * sequential.  If they have, then try some read-ahead, otherwise
 	 * back-off on prospective read-aheads.
 	 */
 	if (bp->b_flags & B_CACHE) {
 		if (!seqcount) {
 			return 0;
 		} else if ((bp->b_flags & B_RAM) == 0) {
 			return 0;
 		} else {
 			bp->b_flags &= ~B_RAM;
 			BO_RLOCK(bo);
 			for (i = 1; i < maxra; i++) {
 				/*
 				 * Stop if the buffer does not exist or it
 				 * is invalid (about to go away?)
 				 */
 				rbp = gbincore(&vp->v_bufobj, lblkno+i);
 				if (rbp == NULL || (rbp->b_flags & B_INVAL))
 					break;
 
 				/*
 				 * Set another read-ahead mark so we know 
 				 * to check again. (If we can lock the
 				 * buffer without waiting)
 				 */
 				if ((((i % racluster) == (racluster - 1)) ||
 				    (i == (maxra - 1))) 
 				    && (0 == BUF_LOCK(rbp, 
 					LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
 					rbp->b_flags |= B_RAM;
 					BUF_UNLOCK(rbp);
 				}			
 			}
 			BO_RUNLOCK(bo);
 			/* The whole window is already in core. */
 			if (i >= maxra) {
 				return 0;
 			}
 			lblkno += i;
 		}
 		/* Nothing to read synchronously or wait for below. */
 		reqbp = bp = NULL;
 	/*
 	 * If it isn't in the cache, then get a chunk from
 	 * disk if sequential, otherwise just get the block.
 	 */
 	} else {
 		off_t firstread = bp->b_offset;
 		int nblks;
 		long minread;
 
 		KASSERT(bp->b_offset != NOOFFSET,
 		    ("cluster_read: no buffer offset"));
 
 		ncontig = 0;
 
 		/*
 		 * Adjust totread if needed
 		 */
 		minread = read_min * size;
 		if (minread > totread)
 			totread = minread;
 
 		/*
 		 * Compute the total number of blocks that we should read
 		 * synchronously.
 		 */
 		if (firstread + totread > filesize)
 			totread = filesize - firstread;
 		nblks = howmany(totread, size);
 		if (nblks > racluster)
 			nblks = racluster;
 
 		/*
 		 * Now compute the number of contiguous blocks.
 		 */
 		if (nblks > 1) {
 	    		error = VOP_BMAP(vp, lblkno, NULL,
 				&blkno, &ncontig, NULL);
 			/*
 			 * If this failed to map just do the original block.
 			 */
 			if (error || blkno == -1)
 				ncontig = 0;
 		}
 
 		/*
 		 * If we have contiguous data available do a cluster
 		 * otherwise just read the requested block.
 		 */
 		if (ncontig) {
 			/* Account for our first block. */
 			ncontig = min(ncontig + 1, nblks);
 			if (ncontig < nblks)
 				nblks = ncontig;
 			bp = cluster_rbuild(vp, filesize, lblkno,
 			    blkno, size, nblks, gbflags, bp);
 			lblkno += (bp->b_bufsize / size);
 		} else {
 			bp->b_flags |= B_RAM;
 			bp->b_iocmd = BIO_READ;
 			lblkno += 1;
 		}
 	}
 
 	/*
 	 * handle the synchronous read so that it is available ASAP.
 	 */
 	if (bp) {
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vfs_busy_pages(bp, 0);
 		}
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
 			BUF_KERNPROC(bp);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, bp, 0);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
 
 	/*
 	 * If we have been doing sequential I/O, then do some read-ahead.
 	 */
 	while (lblkno < (origblkno + maxra)) {
 		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
 		if (error)
 			break;
 
 		if (blkno == -1)
 			break;
 
 		/*
 		 * We could throttle ncontig here by maxra but we might as
 		 * well read the data if it is contiguous.  We're throttled
 		 * by racluster anyway.
 		 */
 		if (ncontig) {
 			ncontig = min(ncontig + 1, racluster);
 			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
 			    size, ncontig, gbflags, NULL);
 			lblkno += (rbp->b_bufsize / size);
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
 				continue;
 			}
 		} else {
 			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
 			lblkno += 1;
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
 				continue;
 			}
 			rbp->b_flags |= B_ASYNC | B_RAM;
 			rbp->b_iocmd = BIO_READ;
 			rbp->b_blkno = blkno;
 		}
 		/* Already valid: undo B_ASYNC and release without I/O. */
 		if (rbp->b_flags & B_CACHE) {
 			rbp->b_flags &= ~B_ASYNC;
 			bqrelse(rbp);
 			continue;
 		}
 		if ((rbp->b_flags & B_CLUSTER) == 0) {
 			vfs_busy_pages(rbp, 0);
 		}
 		rbp->b_flags &= ~B_INVAL;
 		rbp->b_ioflags &= ~BIO_ERROR;
 		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
 			BUF_KERNPROC(rbp);
 		rbp->b_iooffset = dbtob(rbp->b_blkno);
 		bstrategy(rbp);
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, rbp, 0);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
 
 	if (reqbp) {
 		/*
 		 * Like bread, always brelse() the buffer when
 		 * returning an error.
 		 */
 		error = bufwait(reqbp);
 		if (error != 0) {
 			brelse(reqbp);
 			*bpp = NULL;
 		}
 	}
 	return (error);
 }
 
 /*
  * If blocks are contiguous on disk, use this to provide clustered
  * read ahead.  We will read as many blocks as possible sequentially
  * and then parcel them up into logical blocks in the buffer hash table.
  *
  * Arguments:
  *	vp		vnode being read
  *	filesize	file size in bytes; 'run' is trimmed so the cluster
  *			does not extend past it
  *	lbn, blkno	logical and disk block number of the first block
  *	size		block size; asserted equal to the mount's f_iosize
  *	run		number of blocks the caller would like clustered
  *	gbflags		getblk() flags; only GB_UNMAPPED is passed on for
  *			the second and later blocks
  *	fbp		buffer for lbn already held by the caller, or NULL
  *			to have it obtained here
  *
  * Returns either the single buffer for lbn (when it is already cached,
  * is not eligible for clustering, run <= 1, or no pbuf is available),
  * or a pbuf flagged B_CLUSTER whose cluster_head lists the component
  * buffers and whose b_iodone is cluster_callback().  The caller starts
  * the I/O.
  */
 static struct buf *
 cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
     daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
 {
 	struct buf *bp, *tbp;
 	daddr_t bn;
 	off_t off;
 	long tinc, tsize;
 	int i, inc, j, k, toff;
 
 	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
 	    ("cluster_rbuild: size %ld != f_iosize %jd\n",
 	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
 
 	/*
 	 * avoid a division
 	 */
 	while ((u_quad_t) size * (lbn + run) > filesize) {
 		--run;
 	}
 
 	if (fbp) {
 		tbp = fbp;
 		tbp->b_iocmd = BIO_READ; 
 	} else {
 		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
 		if (tbp->b_flags & B_CACHE)
 			return tbp;
 		tbp->b_flags |= B_ASYNC | B_RAM;
 		tbp->b_iocmd = BIO_READ;
 	}
 	tbp->b_blkno = blkno;
 	if( (tbp->b_flags & B_MALLOC) ||
 		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
 		return tbp;
 
 	/* Without a pbuf fall back to reading just the first block. */
 	bp = trypbuf(&cluster_pbuf_freecnt);
 	if (bp == NULL)
 		return tbp;
 
 	/*
 	 * We are synthesizing a buffer out of vm_page_t's, but
 	 * if the block size is not page aligned then the starting
 	 * address may not be either.  Inherit the b_data offset
 	 * from the original buffer.
 	 */
 	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 	} else {
 		bp->b_data = (char *)((vm_offset_t)bp->b_data |
 		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
 	}
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = cluster_callback;
 	bp->b_blkno = blkno;
 	bp->b_lblkno = lbn;
 	bp->b_offset = tbp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
 	pbgetvp(vp, bp);
 
 	TAILQ_INIT(&bp->b_cluster.cluster_head);
 
 	bp->b_bcount = 0;
 	bp->b_bufsize = 0;
 	bp->b_npages = 0;
 
 	/* inc: disk blocks per logical block; bn tracks the expected blkno. */
 	inc = btodb(size);
 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
 		if (i == 0) {
 			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 			vfs_drain_busy_pages(tbp);
 			vm_object_pip_add(tbp->b_bufobj->bo_object,
 			    tbp->b_npages);
 			for (k = 0; k < tbp->b_npages; k++)
 				vm_page_sbusy(tbp->b_pages[k]);
 			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 		} else {
 			/* Stop before the cluster exceeds the max I/O size. */
 			if ((bp->b_npages * PAGE_SIZE) +
 			    round_page(size) > vp->v_mount->mnt_iosize_max) {
 				break;
 			}
 
 			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
 			    (gbflags & GB_UNMAPPED));
 
 			/* Don't wait around for locked bufs. */
 			if (tbp == NULL)
 				break;
 
 			/*
 			 * Stop scanning if the buffer is fully valid
 			 * (marked B_CACHE), or locked (may be doing a
 			 * background write), or if the buffer is not
 			 * VMIO backed.  The clustering code can only deal
 			 * with VMIO-backed buffers.  The bo lock is not
 			 * required for the BKGRDINPROG check since it
 			 * can not be set without the buf lock.
 			 */
 			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
 			    (tbp->b_flags & B_CACHE) ||
 			    (tbp->b_flags & B_VMIO) == 0) {
 				bqrelse(tbp);
 				break;
 			}
 
 			/*
 			 * The buffer must be completely invalid in order to
 			 * take part in the cluster.  If it is partially valid
 			 * then we stop.
 			 */
 			off = tbp->b_offset;
 			tsize = size;
 			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 			for (j = 0; tsize > 0; j++) {
 				toff = off & PAGE_MASK;
 				tinc = tsize;
 				if (toff + tinc > PAGE_SIZE)
 					tinc = PAGE_SIZE - toff;
 				VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
 				if ((tbp->b_pages[j]->valid &
 				    vm_page_bits(toff, tinc)) != 0)
 					break;
 				if (vm_page_xbusied(tbp->b_pages[j]))
 					break;
 				vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
 				vm_page_sbusy(tbp->b_pages[j]);
 				off += tinc;
 				tsize -= tinc;
 			}
 			if (tsize > 0) {
 				/*
 				 * Undo the pip count and shared busy taken
 				 * on the first j pages, drop the buffer and
 				 * end the cluster here.  Also entered by
 				 * goto below with the object lock re-taken.
 				 */
 clean_sbusy:
 				vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
 				for (k = 0; k < j; k++)
 					vm_page_sunbusy(tbp->b_pages[k]);
 				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 				bqrelse(tbp);
 				break;
 			}
 			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 
 			/*
 			 * Set a read-ahead mark as appropriate
 			 */
 			if ((fbp && (i == 1)) || (i == (run - 1)))
 				tbp->b_flags |= B_RAM;
 
 			/*
 			 * Set the buffer up for an async read (XXX should
 			 * we do this only if we do not wind up brelse()ing?).
 			 * Set the block number if it isn't set, otherwise
 			 * if it is make sure it matches the block number we
 			 * expect.
 			 */
 			tbp->b_flags |= B_ASYNC;
 			tbp->b_iocmd = BIO_READ;
 			if (tbp->b_blkno == tbp->b_lblkno) {
 				tbp->b_blkno = bn;
 			} else if (tbp->b_blkno != bn) {
 				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 				goto clean_sbusy;
 			}
 		}
 		/*
 		 * XXX fbp from caller may not be B_ASYNC, but we are going
 		 * to biodone() it in cluster_callback() anyway
 		 */
 		BUF_KERNPROC(tbp);
 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
 			tbp, b_cluster.cluster_entry);
 		VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 		for (j = 0; j < tbp->b_npages; j += 1) {
 			vm_page_t m;
 			m = tbp->b_pages[j];
 			/* Skip a page equal to the last one already added. */
 			if ((bp->b_npages == 0) ||
 			    (bp->b_pages[bp->b_npages-1] != m)) {
 				bp->b_pages[bp->b_npages] = m;
 				bp->b_npages++;
 			}
 			if (m->valid == VM_PAGE_BITS_ALL)
 				tbp->b_pages[j] = bogus_page;
 		}
 		VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 		/*
 		 * Don't inherit tbp->b_bufsize as it may be larger due to
 		 * a non-page-aligned size.  Instead just aggregate using
 		 * 'size'.
 		 */
 		if (tbp->b_bcount != size)
 			printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
 		if (tbp->b_bufsize != size)
 			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
 		bp->b_bcount += size;
 		bp->b_bufsize += size;
 	}
 
 	/*
 	 * Fully valid pages in the cluster are already good and do not need
 	 * to be re-read from disk.  Replace the page with bogus_page
 	 */
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (j = 0; j < bp->b_npages; j++) {
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
 		if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
 			bp->b_pages[j] = bogus_page;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	if (bp->b_bufsize > bp->b_kvasize)
 		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
 		    bp->b_bufsize, bp->b_kvasize);
 
 	/* Mapped cluster buffers get their pages entered into the pbuf KVA. */
 	if (buf_mapped(bp)) {
 		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
 		    (vm_page_t *)bp->b_pages, bp->b_npages);
 	}
 	return (bp);
 }
 
 /*
  * Cleanup after a clustered read or write.
  * This is complicated by the fact that any of the buffers might have
  * extra memory (if there were no empty buffer headers at allocbuf time)
  * that we will need to shift around.
  *
  * Installed as the b_iodone handler of a cluster pbuf.  'bp' is that
  * pbuf: its I/O status is copied to every component buffer on its
  * cluster_head list, each component is completed with bufdone(), and
  * the pbuf is then detached from the vnode and returned to the pool.
  */
 static void
 cluster_callback(struct buf *bp)
 {
 	struct buf *nbp, *tbp;
 	int error = 0;
 
 	/*
 	 * Must propagate errors to all the components.
 	 */
 	if (bp->b_ioflags & BIO_ERROR)
 		error = bp->b_error;
 
 	if (buf_mapped(bp)) {
 		pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
 		    bp->b_npages);
 	}
 	/*
 	 * Move memory from the large cluster buffer into the component
 	 * buffers and mark IO as done on these.
 	 */
 	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
 		tbp; tbp = nbp) {
 		/* Fetch the successor first; tbp is handed to bufdone(). */
 		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
 		if (error) {
 			tbp->b_ioflags |= BIO_ERROR;
 			tbp->b_error = error;
 		} else {
 			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
 			tbp->b_flags &= ~B_INVAL;
 			tbp->b_ioflags &= ~BIO_ERROR;
 			/*
 			 * XXX the bdwrite()/bqrelse() issued during
 			 * cluster building clears B_RELBUF (see bqrelse()
 			 * comment).  If direct I/O was specified, we have
 			 * to restore it here to allow the buffer and VM
 			 * to be freed.
 			 */
 			if (tbp->b_flags & B_DIRECT)
 				tbp->b_flags |= B_RELBUF;
 		}
 		bufdone(tbp);
 	}
 	pbrelvp(bp);
 	relpbuf(bp, &cluster_pbuf_freecnt);
 }
 
 /*
  *	cluster_wbuild_wb:
  *
  *	Write-behind front end for cluster_wbuild(), steered by the
  *	write_behind tunable:
  *
  *		0 (or any other value)	do nothing and return 0
  *		1			write [start_lbn, start_lbn + len)
  *		2			back off: write the range that
  *					starts 'len' blocks earlier, unless
  *					start_lbn < len, in which case do
  *					nothing and return 0
  *
  *	Returns whatever cluster_wbuild() returned, or 0 when it was not
  *	called.
  */
 
 static __inline int
 cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
     int gbflags)
 {
 	int mode;
 
 	/* Read the tunable once, as a switch on it would. */
 	mode = write_behind;
 	if (mode == 2) {
 		if (start_lbn < len)
 			return(0);
 		start_lbn -= len;
 	} else if (mode != 1) {
 		return(0);
 	}
 	return(cluster_wbuild(vp, size, start_lbn, len, gbflags));
 }
 
 /*
  * Do clustered write for FFS.
  *
  * Four cases:
  *	1. Write is not sequential (write asynchronously)
  *	Write is sequential:
  *	2.	beginning of cluster - begin cluster
  *	3.	middle of a cluster - add to cluster
  *	4.	end of a cluster - asynchronously write cluster
  *
  * Arguments:
  *	vp		vnode being written; its v_cstart, v_clen, v_lastw
  *			and v_lasta fields hold the cluster state between
  *			calls
  *	bp		buffer to write; always handed to bawrite() or
  *			bdwrite() before returning
  *	filesize	file size in bytes, used to detect a write at EOF
  *	seqcount	sequential-write hint gating the cluster pushes
  *	gbflags		getblk() flags passed on; GB_UNMAPPED is stripped
  *			when unmapped buffers are not allowed
  */
 void
 cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
     int gbflags)
 {
 	daddr_t lbn;
 	int maxclen, cursize;
 	int lblocksize;
 	int async;
 
 	if (!unmapped_buf_allowed)
 		gbflags &= ~GB_UNMAPPED;
 
 	if (vp->v_type == VREG) {
 		async = DOINGASYNC(vp);
 		lblocksize = vp->v_mount->mnt_stat.f_iosize;
 	} else {
 		async = 0;
 		lblocksize = bp->b_bufsize;
 	}
 	lbn = bp->b_lblkno;
 	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
 
 	/* Initialize vnode to beginning of file. */
 	if (lbn == 0)
 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 
 	/*
 	 * No cluster in progress, or this block does not directly follow
 	 * the previous write both logically and on disk.
 	 */
 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
 		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
 		if (vp->v_clen != 0) {
 			/*
 			 * Next block is not sequential.
 			 *
 			 * If we are not writing at end of file, the process
 			 * seeked to another point in the file since its last
 			 * write, or we have reached our maximum cluster size,
 			 * then push the previous cluster. Otherwise try
 			 * reallocating to make it sequential.
 			 *
 			 * Change to algorithm: only push previous cluster if
 			 * it was sequential from the point of view of the
 			 * seqcount heuristic, otherwise leave the buffer 
 			 * intact so we can potentially optimize the I/O
 			 * later on in the buf_daemon or update daemon
 			 * flush.
 			 */
 			cursize = vp->v_lastw - vp->v_cstart + 1;
 			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
 				if (!async && seqcount > 0) {
 					cluster_wbuild_wb(vp, lblocksize,
 					    vp->v_cstart, cursize, gbflags);
 				}
 			} else {
 				struct buf **bpp, **endbp;
 				struct cluster_save *buflist;
 
 				buflist = cluster_collectbufs(vp, bp, gbflags);
 				endbp = &buflist->bs_children
 				    [buflist->bs_nchildren - 1];
 				if (VOP_REALLOCBLKS(vp, buflist)) {
 					/*
 					 * Failed, push the previous cluster
 					 * if *really* writing sequentially
 					 * in the logical file (seqcount > 1),
 					 * otherwise delay it in the hopes that
 					 * the low level disk driver can
 					 * optimize the write ordering.
 					 */
 					/*
 					 * The last child (endbp) is left
 					 * out of this release loop.
 					 */
 					for (bpp = buflist->bs_children;
 					     bpp < endbp; bpp++)
 						brelse(*bpp);
 					free(buflist, M_SEGMENT);
 					if (seqcount > 1) {
 						cluster_wbuild_wb(vp, 
 						    lblocksize, vp->v_cstart, 
 						    cursize, gbflags);
 					}
 				} else {
 					/*
 					 * Succeeded, keep building cluster.
 					 */
 					for (bpp = buflist->bs_children;
 					     bpp <= endbp; bpp++)
 						bdwrite(*bpp);
 					free(buflist, M_SEGMENT);
 					vp->v_lastw = lbn;
 					vp->v_lasta = bp->b_blkno;
 					return;
 				}
 			}
 		}
 		/*
 		 * Consider beginning a cluster. If at end of file, make
 		 * cluster as large as possible, otherwise find size of
 		 * existing cluster.
 		 */
 		if ((vp->v_type == VREG) &&
 			((u_quad_t) bp->b_offset + lblocksize) != filesize &&
 		    (bp->b_blkno == bp->b_lblkno) &&
 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
 		     bp->b_blkno == -1)) {
 			/* Mapping failed: write the block alone, no cluster. */
 			bawrite(bp);
 			vp->v_clen = 0;
 			vp->v_lasta = bp->b_blkno;
 			vp->v_cstart = lbn + 1;
 			vp->v_lastw = lbn;
 			return;
 		}
 		vp->v_clen = maxclen;
 		if (!async && maxclen == 0) {	/* I/O not contiguous */
 			vp->v_cstart = lbn + 1;
 			bawrite(bp);
 		} else {	/* Wait for rest of cluster */
 			vp->v_cstart = lbn;
 			bdwrite(bp);
 		}
 	} else if (lbn == vp->v_cstart + vp->v_clen) {
 		/*
 		 * At end of cluster, write it out if seqcount tells us we
 		 * are operating sequentially, otherwise let the buf or
 		 * update daemon handle it.
 		 */
 		bdwrite(bp);
 		if (seqcount > 1) {
 			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
 			    vp->v_clen + 1, gbflags);
 		}
 		vp->v_clen = 0;
 		vp->v_cstart = lbn + 1;
 	} else if (vm_page_count_severe()) {
 		/*
 		 * We are low on memory, get it going NOW
 		 */
 		bawrite(bp);
 	} else {
 		/*
 		 * In the middle of a cluster, so just delay the I/O for now.
 		 */
 		bdwrite(bp);
 	}
 	/* Remember this write for the sequentiality test on the next call. */
 	vp->v_lastw = lbn;
 	vp->v_lasta = bp->b_blkno;
 }
 
 
 /*
  * Write out up to `len' consecutive logical blocks of `vp', starting at
  * `start_lbn', gathering adjacent delayed-write buffers of `size' bytes
  * into clusters where possible.
  *
  * Blocks that are not in core, have a background write in progress, cannot
  * be locked without sleeping, or are not valid delayed-write buffers are
  * skipped.  A buffer that may be written but not clustered is issued on
  * its own with bawrite().  Otherwise a pbuf is set up as the cluster
  * buffer, the following buffers are attached to it for as long as they
  * match, and the pbuf itself is issued with bawrite().
  *
  * `gbflags' is consulted only for GB_UNMAPPED, which is masked off when
  * unmapped buffers are not allowed.
  *
  * Returns the sum of b_bufsize over every buffer handed to bawrite().
  *
  * This is an awful lot like cluster_rbuild...wish they could be combined.
  */
 int
 cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
     int gbflags)
 {
 	struct buf *bp, *tbp;
 	struct bufobj *bo;
 	int i, j;
 	int totalwritten = 0;
 	/* b_blkno distance expected between two adjacent `size' buffers. */
 	int dbsize = btodb(size);
 
 	if (!unmapped_buf_allowed)
 		gbflags &= ~GB_UNMAPPED;
 
 	bo = &vp->v_bufobj;
 	while (len > 0) {
 		/*
 		 * If the buffer is not delayed-write (i.e. dirty), or it
 		 * is delayed-write but either locked or inval, it cannot
 		 * partake in the clustered write.
 		 */
 		BO_LOCK(bo);
 		if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
 		    (tbp->b_vflags & BV_BKGRDINPROG)) {
 			BO_UNLOCK(bo);
 			++start_lbn;
 			--len;
 			continue;
 		}
 		/*
 		 * Try-lock only (LK_NOWAIT).  The bufobj lock is passed as
 		 * the interlock and is not released explicitly on either
 		 * outcome below, so it is presumably dropped by the lock
 		 * call itself -- confirm against lockmgr(9).
 		 */
 		if (BUF_LOCK(tbp,
 		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
 			++start_lbn;
 			--len;
 			continue;
 		}
 		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
 			BUF_UNLOCK(tbp);
 			++start_lbn;
 			--len;
 			continue;
 		}
 		bremfree(tbp);
 		tbp->b_flags &= ~B_DONE;
 
 		/*
 		 * Extra memory in the buffer, punt on this buffer.
 		 * XXX we could handle this in most cases, but we would
 		 * have to push the extra memory down to after our max
 		 * possible cluster size and then potentially pull it back
 		 * up if the cluster was terminated prematurely--too much
 		 * hassle.
 		 *
 		 * The same direct write is used when the buffer lacks
 		 * B_CLUSTEROK or B_VMIO, has B_MALLOC set, is not exactly
 		 * `size' bytes, is the only block left (len == 1), or no
 		 * pbuf could be obtained for the cluster.  VV_MD vnodes ask
 		 * for the pbuf with trypbuf() instead of getpbuf().
 		 */
 		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 
 		     (B_CLUSTEROK | B_VMIO)) ||
 		  (tbp->b_bcount != tbp->b_bufsize) ||
 		  (tbp->b_bcount != size) ||
 		  (len == 1) ||
 		  ((bp = (vp->v_vflag & VV_MD) != 0 ?
 		  trypbuf(&cluster_pbuf_freecnt) :
 		  getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
 			totalwritten += tbp->b_bufsize;
 			bawrite(tbp);
 			++start_lbn;
 			--len;
 			continue;
 		}
 
 		/*
 		 * We got a pbuf to make the cluster in.
 		 * so initialise it.
 		 */
 		TAILQ_INIT(&bp->b_cluster.cluster_head);
 		bp->b_bcount = 0;
 		bp->b_bufsize = 0;
 		bp->b_npages = 0;
 		if (tbp->b_wcred != NOCRED)
 			bp->b_wcred = crhold(tbp->b_wcred);
 
 		/* The cluster starts where its first member buffer starts. */
 		bp->b_blkno = tbp->b_blkno;
 		bp->b_lblkno = tbp->b_lblkno;
 		bp->b_offset = tbp->b_offset;
 
 		/*
 		 * We are synthesizing a buffer out of vm_page_t's, but
 		 * if the block size is not page aligned then the starting
 		 * address may not be either.  Inherit the b_data offset
 		 * from the original buffer.
 		 */
 		if ((gbflags & GB_UNMAPPED) == 0 ||
 		    (tbp->b_flags & B_VMIO) == 0) {
 			bp->b_data = (char *)((vm_offset_t)bp->b_data |
 			    ((vm_offset_t)tbp->b_data & PAGE_MASK));
 		} else {
 			bp->b_data = unmapped_buf;
 		}
 		/* B_VMIO and B_NEEDCOMMIT are inherited from the first buffer. */
 		bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
 		    B_NEEDCOMMIT));
 		bp->b_iodone = cluster_callback;
 		pbgetvp(vp, bp);
 		/*
 		 * From this location in the file, scan forward to see
 		 * if there are buffers with adjacent data that need to
 		 * be written as well.
 		 *
 		 * Leaving the loop through a break or the goto below skips
 		 * the loop increment, so start_lbn still names the block
 		 * that stopped the scan and the outer loop examines that
 		 * block again.
 		 */
 		for (i = 0; i < len; ++i, ++start_lbn) {
 			if (i != 0) { /* If not the first buffer */
 				/*
 				 * If the adjacent data is not even in core it
 				 * can't need to be written.
 				 */
 				BO_LOCK(bo);
 				if ((tbp = gbincore(bo, start_lbn)) == NULL ||
 				    (tbp->b_vflags & BV_BKGRDINPROG)) {
 					BO_UNLOCK(bo);
 					break;
 				}
 
 				/*
 				 * If it IS in core, but has different
 				 * characteristics, or is locked (which
 				 * means it could be undergoing a background
 				 * I/O or be in a weird state), then don't
 				 * cluster with it.
 				 */
 				if (BUF_LOCK(tbp,
 				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
 				    BO_LOCKPTR(bo)))
 					break;
 
 				/*
 				 * Must be a valid, clusterable delayed write
 				 * whose B_VMIO/B_NEEDCOMMIT bits and write
 				 * credential match those of the cluster.
 				 */
 				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
 				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
 				    != (B_DELWRI | B_CLUSTEROK |
 				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
 				    tbp->b_wcred != bp->b_wcred) {
 					BUF_UNLOCK(tbp);
 					break;
 				}
 
 				/*
 				 * Check that the combined cluster
 				 * would make sense with regard to pages
 				 * and would not be too large
 				 */
 				if ((tbp->b_bcount != size) ||
 				  ((bp->b_blkno + (dbsize * i)) !=
 				    tbp->b_blkno) ||
 				  ((tbp->b_npages + bp->b_npages) >
 				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
 					BUF_UNLOCK(tbp);
 					break;
 				}
 
 				/*
 				 * Ok, it's passed all the tests,
 				 * so remove it from the free list
 				 * and mark it busy. We will use it.
 				 */
 				bremfree(tbp);
 				tbp->b_flags &= ~B_DONE;
 			} /* end of code for non-first buffers only */
 			/*
 			 * If the IO is via the VM then we do some
 			 * special VM hackery (yuck).  Since the buffer's
 			 * block size may not be page-aligned it is possible
 			 * for a page to be shared between two buffers.  We
 			 * have to get rid of the duplication when building
 			 * the cluster.
 			 */
 			if (tbp->b_flags & B_VMIO) {
 				vm_page_t m;
 
 				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 				if (i == 0) {
 					vfs_drain_busy_pages(tbp);
 				} else { /* if not first buffer */
 					/*
 					 * An exclusively busied page ends
 					 * the cluster here; this buffer is
 					 * released with bqrelse() and left
 					 * out.
 					 */
 					for (j = 0; j < tbp->b_npages; j += 1) {
 						m = tbp->b_pages[j];
 						if (vm_page_xbusied(m)) {
 							VM_OBJECT_WUNLOCK(
 							    tbp->b_object);
 							bqrelse(tbp);
 							goto finishcluster;
 						}
 					}
 				}
 				/*
 				 * Every page is shared-busied and counted
 				 * as paging-in-progress, but a page equal
 				 * to the last one already recorded in the
 				 * cluster is not appended a second time.
 				 */
 				for (j = 0; j < tbp->b_npages; j += 1) {
 					m = tbp->b_pages[j];
 					vm_page_sbusy(m);
 					vm_object_pip_add(m->object, 1);
 					if ((bp->b_npages == 0) ||
 					  (bp->b_pages[bp->b_npages - 1] != m)) {
 						bp->b_pages[bp->b_npages] = m;
 						bp->b_npages++;
 					}
 				}
 				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 			}
 			bp->b_bcount += size;
 			bp->b_bufsize += size;
 			/*
 			 * If any of the clustered buffers have their
 			 * B_BARRIER flag set, transfer that request to
 			 * the cluster.
 			 */
 			bp->b_flags |= (tbp->b_flags & B_BARRIER);
 			tbp->b_flags &= ~(B_DONE | B_BARRIER);
 			tbp->b_flags |= B_ASYNC;
 			tbp->b_ioflags &= ~BIO_ERROR;
 			tbp->b_iocmd = BIO_WRITE;
 			bundirty(tbp);
 			reassignbuf(tbp);		/* put on clean list */
 			bufobj_wref(tbp->b_bufobj);
 			BUF_KERNPROC(tbp);
+			buf_track(tbp, __func__);
 			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
 				tbp, b_cluster.cluster_entry);
 		}
 	finishcluster:
 		/*
 		 * i is at least 1 here: the i == 0 pass of the loop above
 		 * has no break or goto, so the first buffer is always part
 		 * of the cluster.
 		 */
 		if (buf_mapped(bp)) {
 			pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
 			    (vm_page_t *)bp->b_pages, bp->b_npages);
 		}
 		if (bp->b_bufsize > bp->b_kvasize)
 			panic(
 			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
 			    bp->b_bufsize, bp->b_kvasize);
 		totalwritten += bp->b_bufsize;
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bufsize;
 		bawrite(bp);
 
 		len -= i;	/* i buffers were placed in this cluster */
 	}
 	return totalwritten;
 }
 
 /*
  * Collect together all the buffers in a cluster, plus one additional
  * buffer (last_bp).
  *
  * The cluster spans logical blocks vp->v_cstart through vp->v_lastw.  Each
  * of them is obtained with bread_gb(), using last_bp->b_bcount as the size
  * and passing `gbflags' through, and last_bp itself is stored as the final
  * child.  Any buffer whose b_blkno still equals its b_lblkno is passed
  * through VOP_BMAP().
  *
  * Returns a cluster_save allocated from M_SEGMENT whose bs_children array
  * lives in the same allocation, directly after the header; bs_nchildren
  * counts last_bp as well.  Releasing the list is not done here.
  *
  * NOTE(review): the bread_gb() return value is discarded and bp is used
  * unconditionally afterwards; confirm that a failed read still yields a
  * usable buffer at this point.
  */
 static struct cluster_save *
 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
 {
 	struct cluster_save *buflist;
 	struct buf *bp;
 	daddr_t lbn;
 	int i, len;
 
 	len = vp->v_lastw - vp->v_cstart + 1;
 	/* Header plus len + 1 child pointers: the cluster and last_bp. */
 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
 	    M_SEGMENT, M_WAITOK);
 	buflist->bs_nchildren = 0;
 	buflist->bs_children = (struct buf **) (buflist + 1);
 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
 		(void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
 		    gbflags, &bp);
 		buflist->bs_children[i] = bp;
 		/* Block number looks untranslated; ask the filesystem. */
 		if (bp->b_blkno == bp->b_lblkno)
 			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
 				NULL, NULL);
 	}
 	/* i == len here, so last_bp fills the extra slot at the end. */
 	buflist->bs_children[i] = bp = last_bp;
 	if (bp->b_blkno == bp->b_lblkno)
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	buflist->bs_nchildren = i + 1;
 	return (buflist);
 }
Index: head/sys/sys/bio.h
===================================================================
--- head/sys/sys/bio.h	(revision 308154)
+++ head/sys/sys/bio.h	(revision 308155)
@@ -1,160 +1,180 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_BIO_H_
 #define	_SYS_BIO_H_
 
 #include <sys/queue.h>
 #include <sys/disk_zone.h>
 
 /* bio_cmd */
 #define BIO_READ	0x01	/* Read I/O data */
 #define BIO_WRITE	0x02	/* Write I/O data */
 #define BIO_DELETE	0x03	/* TRIM or free blocks, i.e. mark as unused */
 #define BIO_GETATTR	0x04	/* Get GEOM attributes of object */
 #define BIO_FLUSH	0x05	/* Commit outstanding I/O now */
 #define BIO_CMD0	0x06	/* Available for local hacks */
 #define BIO_CMD1	0x07	/* Available for local hacks */
 #define BIO_CMD2	0x08	/* Available for local hacks */
 #define BIO_ZONE	0x09	/* Zone command */
 
 /* bio_flags */
 #define BIO_ERROR	0x01	/* An error occurred processing this bio. */
 #define BIO_DONE	0x02	/* This bio is finished. */
 #define BIO_ONQUEUE	0x04	/* This bio is in a queue & not yet taken. */
 /*
  * This bio must be executed after all previous bios in the queue have been
  * executed, and before any successive bios can be executed.
  */
 #define BIO_ORDERED	0x08
 #define	BIO_UNMAPPED	0x10
 #define	BIO_TRANSIENT_MAPPING	0x20
 #define	BIO_VLIST	0x40
 
 #ifdef _KERNEL
 struct disk;
 struct bio;
 struct vm_map;
 
 /* Empty classifier tag, to prevent further classification. */
 #define	BIO_NOTCLASSIFIED		(void *)(~0UL)
 
 typedef void bio_task_t(void *);
 
 /*
  * The bio structure describes an I/O operation in the kernel.
  *
  * The _bio_* members exist only in DIAGNOSTIC kernels, and bio_track_bp
  * only when BUF_TRACKING or FULL_BUF_TRACKING is defined, so the size and
  * layout of this structure depend on the kernel configuration.
  */
 struct bio {
 	uint16_t bio_cmd;		/* I/O operation. */
 	uint16_t bio_flags;		/* General flags. */
 	uint16_t bio_cflags;		/* Private use by the consumer. */
 	uint16_t bio_pflags;		/* Private use by the provider. */
 	struct cdev *bio_dev;		/* Device to do I/O on. */
 	struct disk *bio_disk;		/* Valid below geom_disk.c only */
 	off_t	bio_offset;		/* Offset into file. */
 	long	bio_bcount;		/* Valid bytes in buffer. */
 	caddr_t	bio_data;		/* Memory, superblocks, indirect etc. */
 	struct vm_page **bio_ma;	/* Or unmapped: page array (presumably used with BIO_UNMAPPED; confirm). */
 	int	bio_ma_offset;		/* Offset in the first page of bio_ma. */
 	int	bio_ma_n;		/* Number of pages in bio_ma. */
 	int	bio_error;		/* Errno for BIO_ERROR. */
 	long	bio_resid;		/* Remaining I/O in bytes. */
 	void	(*bio_done)(struct bio *);
 	void	*bio_driver1;		/* Private use by the provider. */
 	void	*bio_driver2;		/* Private use by the provider. */
 	void	*bio_caller1;		/* Private use by the consumer. */
 	void	*bio_caller2;		/* Private use by the consumer. */
 	TAILQ_ENTRY(bio) bio_queue;	/* Disksort queue. */
 	const char *bio_attribute;	/* Attribute for BIO_[GS]ETATTR */
 	struct  disk_zone_args bio_zone;/* Used for BIO_ZONE */
 	struct g_consumer *bio_from;	/* GEOM linkage */
 	struct g_provider *bio_to;	/* GEOM linkage */
 	off_t	bio_length;		/* Like bio_bcount */
 	off_t	bio_completed;		/* Inverse of bio_resid */
 	u_int	bio_children;		/* Number of spawned bios */
 	u_int	bio_inbed;		/* Children safely home by now */
 	struct bio *bio_parent;		/* Pointer to parent */
 	struct bintime bio_t0;		/* Time request started */
 
 	bio_task_t *bio_task;		/* Task_queue handler */
 	void	*bio_task_arg;		/* Argument to above */
 
 	void	*bio_classifier1;	/* Classifier tag. */
 	void	*bio_classifier2;	/* Classifier tag. */
 
 #ifdef DIAGNOSTIC
 	void	*_bio_caller1;
 	void	*_bio_caller2;
 	uint8_t	_bio_cflags;
 #endif /* DIAGNOSTIC */
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	struct buf *bio_track_bp;	/* Parent buf for tracking; biotrack() does nothing while NULL */
+#endif	/* BUF_TRACKING || FULL_BUF_TRACKING */
 
 	/* XXX: these go away when bio chaining is introduced */
 	daddr_t bio_pblkno;               /* physical block number */
 };
 
 struct uio;
 struct devstat;
 
 struct bio_queue_head {
 	TAILQ_HEAD(bio_queue, bio) queue;
 	off_t last_offset;
 	struct	bio *insert_point;
 };
 
 extern struct vm_map *bio_transient_map;
 extern int bio_transient_maxcnt;
 
 void biodone(struct bio *bp);
 void biofinish(struct bio *bp, struct devstat *stat, int error);
 int biowait(struct bio *bp, const char *wchan);
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+void biotrack_buf(struct bio *bp, const char *location);
+	/* biotrack(): pass `location' on to biotrack_buf() for tracked bios. */
+static __inline void
+biotrack(struct bio *bp, const char *location)
+{
+	/* Bios without a parent buf (bio_track_bp == NULL) are not tracked. */
+	if (bp->bio_track_bp != NULL)
+		biotrack_buf(bp, location);
+}
+#else	/* !BUF_TRACKING && !FULL_BUF_TRACKING: biotrack() is an empty stub */
+static __inline void
+biotrack(struct bio *bp __unused, const char *location __unused)
+{
+}
+#endif	/* BUF_TRACKING || FULL_BUF_TRACKING */
 
 void bioq_disksort(struct bio_queue_head *ap, struct bio *bp);
 struct bio *bioq_first(struct bio_queue_head *head);
 struct bio *bioq_takefirst(struct bio_queue_head *head);
 void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error);
 void bioq_init(struct bio_queue_head *head);
 void bioq_insert_head(struct bio_queue_head *head, struct bio *bp);
 void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp);
 void bioq_remove(struct bio_queue_head *head, struct bio *bp);
 
 int	physio(struct cdev *dev, struct uio *uio, int ioflag);
 #define physread physio
 #define physwrite physio
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BIO_H_ */
Index: head/sys/sys/buf.h
===================================================================
--- head/sys/sys/buf.h	(revision 308154)
+++ head/sys/sys/buf.h	(revision 308155)
@@ -1,549 +1,569 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_BUF_H_
 #define	_SYS_BUF_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 
 struct bio;
 struct buf;
 struct bufobj;
 struct mount;
 struct vnode;
 struct uio;
 
 /*
  * To avoid including <ufs/ffs/softdep.h> 
  */   
 LIST_HEAD(workhead, worklist);
 /*
  * These are currently used only by the soft dependency code, hence
  * are stored once in a global variable. If other subsystems wanted
  * to use these hooks, a pointer to a set of bio_ops could be added
  * to each buffer.
  *
  * A NULL member means that no hook is installed; the buf_*() inline
  * wrappers in this header test for that before calling through.
  */
 extern struct bio_ops {
 	void	(*io_start)(struct buf *);	/* Called by buf_start(). */
 	void	(*io_complete)(struct buf *);	/* Called by buf_complete(). */
 	void	(*io_deallocate)(struct buf *);	/* Called by buf_deallocate(). */
 	int	(*io_countdeps)(struct buf *, int);	/* Called by buf_countdeps(). */
 } bioops;
 
 struct vm_object;
 struct vm_page;
 
 typedef unsigned char b_xflags_t;
 
 /*
  * The buffer header describes an I/O operation in the kernel.
  *
  * NOTES:
  *	b_bufsize, b_bcount.  b_bufsize is the allocation size of the
  *	buffer, either DEV_BSIZE or PAGE_SIZE aligned.  b_bcount is the
  *	originally requested buffer size and can serve as a bounds check
  *	against EOF.  For most, but not all uses, b_bcount == b_bufsize.
  *
  *	b_dirtyoff, b_dirtyend.  Buffers support piecemeal, unaligned
  *	ranges of dirty data that need to be written to backing store.
  *	The range is typically clipped at b_bcount ( not b_bufsize ).
  *
  *	b_resid.  Number of bytes remaining in I/O.  After an I/O operation
  *	completes, b_resid is usually 0 indicating 100% success.
  *
  *	b_io_tracking, b_io_tcnt.  Present only when FULL_BUF_TRACKING or
  *	BUF_TRACKING is defined; written by buf_track().
  *
  *	All fields are protected by the buffer lock except those marked:
  *		V - Protected by owning bufobj lock
  *		Q - Protected by the buf queue lock
  *		D - Protected by a dependency implementation specific lock
  */
 struct buf {
 	struct bufobj	*b_bufobj;
 	long		b_bcount;
 	void		*b_caller1;
 	caddr_t		b_data;
 	int		b_error;
 	uint16_t	b_iocmd;	/* BIO_* bio_cmd from bio.h */
 	uint16_t	b_ioflags;	/* BIO_* bio_flags from bio.h */
 	off_t		b_iooffset;
 	long		b_resid;
 	void	(*b_iodone)(struct buf *);
 	daddr_t b_blkno;		/* Underlying physical block number. */
 	off_t	b_offset;		/* Offset into file. */
 	TAILQ_ENTRY(buf) b_bobufs;	/* (V) Buffer's associated vnode. */
 	uint32_t	b_vflags;	/* (V) BV_* flags */
 	unsigned short b_qindex;	/* (Q) buffer queue index */
 	uint32_t	b_flags;	/* B_* flags. */
 	b_xflags_t b_xflags;		/* extra flags */
 	struct lock b_lock;		/* Buffer lock */
 	long	b_bufsize;		/* Allocated buffer size. */
 	int	b_runningbufspace;	/* when I/O is running, pipelining */
 	int	b_kvasize;		/* size of kva for buffer */
 	int	b_dirtyoff;		/* Offset in buffer of dirty region. */
 	int	b_dirtyend;		/* Offset of end of dirty region. */
 	caddr_t	b_kvabase;		/* base kva for buffer */
 	daddr_t b_lblkno;		/* Logical block number. */
 	struct	vnode *b_vp;		/* Device vnode. */
 	struct	ucred *b_rcred;		/* Read credentials reference. */
 	struct	ucred *b_wcred;		/* Write credentials reference. */
 	union {
 		TAILQ_ENTRY(buf) b_freelist; /* (Q) */
 		struct {
 			void	(*b_pgiodone)(void *, vm_page_t *, int, int);
 			int	b_pgbefore;
 			int	b_pgafter;
 		};
 	};
 	union	cluster_info {
 		TAILQ_HEAD(cluster_list_head, buf) cluster_head;
 		TAILQ_ENTRY(buf) cluster_entry;
 	} b_cluster;
 	struct	vm_page *b_pages[btoc(MAXPHYS)];
 	int		b_npages;
 	struct	workhead b_dep;		/* (D) List of filesystem dependencies. */
 	void	*b_fsprivate1;
 	void	*b_fsprivate2;
 	void	*b_fsprivate3;
+	/* I/O location tracking, filled in by buf_track(). */
+#if defined(FULL_BUF_TRACKING)
+#define BUF_TRACKING_SIZE	32	/* Power of two, as the mask below needs. */
+#define BUF_TRACKING_ENTRY(x)	((x) & (BUF_TRACKING_SIZE - 1))
+	const char	*b_io_tracking[BUF_TRACKING_SIZE];	/* Ring of location strings. */
+	uint32_t	b_io_tcnt;	/* Bumped by each buf_track(); masked to index the ring. */
+#elif defined(BUF_TRACKING)
+	const char	*b_io_tracking;	/* Most recent location only. */
+#endif
 };
 
 #define b_object	b_bufobj->bo_object
 
 /*
  * These flags are kept in b_flags.
  *
  * Notes:
  *
  *	B_ASYNC		VOP calls on bp's are usually async whether or not
  *			B_ASYNC is set, but some subsystems, such as NFS, like 
  *			to know what is best for the caller so they can
  *			optimize the I/O.
  *
  *	B_PAGING	Indicates that bp is being used by the paging system or
  *			some paging system and that the bp is not linked into
  *			the b_vp's clean/dirty linked lists or ref counts.
  *			Buffer vp reassignments are illegal in this case.
  *
  *	B_CACHE		This may only be set if the buffer is entirely valid.
  *			The situation where B_DELWRI is set and B_CACHE is
  *			clear MUST be committed to disk by getblk() so 
  *			B_DELWRI can also be cleared.  See the comments for
  *			getblk() in kern/vfs_bio.c.  If B_CACHE is clear,
  *			the caller is expected to clear BIO_ERROR and B_INVAL,
  *			set BIO_READ, and initiate an I/O.
  *
  *			The 'entire buffer' is defined to be the range from
  *			0 through b_bcount.
  *
  *	B_MALLOC	Request that the buffer be allocated from the malloc
  *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
  *
  *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
  *			by filesystems that allow clustering when the buffer
  *			is fully dirty and indicates that it may be clustered
  *			with other adjacent dirty buffers.  Note the clustering
  *			may not be used with the stage 1 data write under NFS
  *			but may be used for the commit rpc portion.
  *
  *	B_VMIO		Indicates that the buffer is tied into a VM object.
  *			The buffer's data is always PAGE_SIZE aligned even
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
  *			always at least DEV_BSIZE aligned, though ).
  *
  *	B_DIRECT	Hint that we should attempt to completely free
  *			the pages underlying the buffer.  B_DIRECT is
  *			sticky until the buffer is released and typically
  *			only has an effect when B_RELBUF is also set.
  *
  */
 
 #define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
 #define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress. */
 #define	B_ASYNC		0x00000004	/* Start I/O, do not wait. */
 #define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
 #define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
 #define	B_CACHE		0x00000020	/* Bread found us in the cache. */
 #define	B_VALIDSUSPWRT	0x00000040	/* Valid write during suspension. */
 #define	B_DELWRI	0x00000080	/* Delay I/O until buffer reused. */
 #define	B_00000100	0x00000100	/* Available flag. */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
 #define	B_NOREUSE	0x00000800	/* Contents not reused once released. */
 #define	B_00001000	0x00001000	/* Available flag. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceding first. */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
 #define	B_MALLOC	0x00010000	/* malloced b_data */
 #define	B_CLUSTEROK	0x00020000	/* May be clustered with adjacent bufs. */
 #define	B_00040000	0x00040000	/* Available flag. */
 #define	B_00080000	0x00080000	/* Available flag. */
 #define	B_00100000	0x00100000	/* Available flag. */
 #define	B_00200000	0x00200000	/* Available flag. */
 #define	B_RELBUF	0x00400000	/* Release VMIO buffer. */
 #define	B_FS_FLAG1	0x00800000	/* Available flag for FS use. */
 #define	B_NOCOPY	0x01000000	/* Don't copy-on-write this buf. */
 #define	B_INFREECNT	0x02000000	/* buf is counted in numfreebufs */
 #define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
 #define B_MANAGED	0x08000000	/* Managed by FS. */
 #define B_RAM		0x10000000	/* Read ahead mark (flag) */
 #define B_VMIO		0x20000000	/* VMIO flag */
 #define B_CLUSTER	0x40000000	/* Set by cluster_wbuild() on its cluster pbuf. */
 #define B_REMFREE	0x80000000	/* Delayed bremfree */
 
 #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
 	"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \
 	"\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
 	"\15b12\14noreuse\13eintr\12done\11b8\10delwri" \
 	"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
 
 /*
  * These flags are kept in b_xflags.
  */
 #define	BX_VNDIRTY	0x00000001	/* On vnode dirty list */
 #define	BX_VNCLEAN	0x00000002	/* On vnode clean list */
 #define	BX_BKGRDWRITE	0x00000010	/* Do writes in background */
 #define BX_BKGRDMARKER	0x00000020	/* Mark buffer for splay tree */
 #define	BX_ALTDATA	0x00000040	/* Holds extended data */
 
 #define	PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty"
 
 #define	NOOFFSET	(-1LL)		/* No buffer offset calculated yet */
 
 /*
  * These flags are kept in b_vflags.
  */
 #define	BV_SCANNED	0x00000001	/* VOP_FSYNC funcs mark written bufs */
 #define	BV_BKGRDINPROG	0x00000002	/* Background write in progress */
 #define	BV_BKGRDWAIT	0x00000004	/* Background write waiting */
 #define	BV_BKGRDERR	0x00000008	/* Error from background write */
 
 #define	PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned"
 
 #ifdef _KERNEL
 /*
  * Buffer locking
  */
 extern const char *buf_wmesg;		/* Default buffer lock message */
 #define BUF_WMESG "bufwait"
 #include <sys/proc.h>			/* XXX for curthread */
 #include <sys/mutex.h>
 
 /*
  * Initialize a lock.
  */
 #define BUF_LOCKINIT(bp)						\
 	lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0)
 /*
  *
  * Get a lock, sleeping non-interruptibly until it becomes available.
  */
 #define	BUF_LOCK(bp, locktype, interlock)				\
 	_lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock),	\
 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,		\
 	    LOCK_FILE, LOCK_LINE)
 
 /*
  * Get a lock, sleeping with the specified interruptibility and timeout.
  */
 #define	BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo)	\
 	_lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK,	\
 	    (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo),	\
 	    LOCK_FILE, LOCK_LINE)
 
 /*
  * Release a lock. Only the acquiring process may free the lock unless
  * it has been handed off to biodone.
  */
 #define	BUF_UNLOCK(bp) do {						\
 	KASSERT(((bp)->b_flags & B_REMFREE) == 0,			\
 	    ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp)));	\
 									\
 	(void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL,		\
 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,		\
 	    LOCK_FILE, LOCK_LINE);					\
 } while (0)
 
 /*
  * Check if a buffer lock is recursed.
  */
 #define	BUF_LOCKRECURSED(bp)						\
 	lockmgr_recursed(&(bp)->b_lock)
 
 /*
  * Check if a buffer lock is currently held.
  */
 #define	BUF_ISLOCKED(bp)						\
 	lockstatus(&(bp)->b_lock)
 /*
  * Free a buffer lock.
  */
 #define BUF_LOCKFREE(bp) 						\
 	lockdestroy(&(bp)->b_lock)
 
 /*
  * Print information on a buffer lock.
  */
 #define BUF_LOCKPRINTINFO(bp) 						\
 	lockmgr_printinfo(&(bp)->b_lock)
 
 /*
  * Buffer lock assertions.
  */
 #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT)
 #define	BUF_ASSERT_LOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_SLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_XLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_UNLOCKED(bp)						\
 	_lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE)
 #define	BUF_ASSERT_HELD(bp)
 #define	BUF_ASSERT_UNHELD(bp)
 #else
 #define	BUF_ASSERT_LOCKED(bp)
 #define	BUF_ASSERT_SLOCKED(bp)
 #define	BUF_ASSERT_XLOCKED(bp)
 #define	BUF_ASSERT_UNLOCKED(bp)
 #define	BUF_ASSERT_HELD(bp)
 #define	BUF_ASSERT_UNHELD(bp)
 #endif
 
 #ifdef _SYS_PROC_H_	/* Avoid #include <sys/proc.h> pollution */
 /*
  * When initiating asynchronous I/O, change ownership of the lock to the
  * kernel. Once done, the lock may legally be released by biodone. The
  * original owning process can no longer acquire it recursively, but must
  * wait until the I/O is completed and the lock has been freed by biodone.
  */
 #define	BUF_KERNPROC(bp)						\
 	_lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE)
 #endif
 
 #endif /* _KERNEL */
 
 struct buf_queue_head {
 	TAILQ_HEAD(buf_queue, buf) queue;
 	daddr_t last_pblkno;
 	struct	buf *insert_point;
 	struct	buf *switch_point;
 };
 
 /*
  * This structure describes a clustered I/O. 
  *
  * cluster_collectbufs() allocates it together with its bs_children array
  * in a single block, with the array placed directly after the structure.
  */
 struct cluster_save {
 	long	bs_bcount;		/* Saved b_bcount. */
 	long	bs_bufsize;		/* Saved b_bufsize. */
 	int	bs_nchildren;		/* Number of associated buffers. */
 	struct buf **bs_children;	/* List of associated buffers. */
 };
 
 #ifdef _KERNEL
 
 /*
  * Write bp through its bufobj with BO_WRITE() and return the result.
  * The assertions check that the bufobj, its ops vector and the bop_write
  * method are all present before the call is made.
  */
 static __inline int
 bwrite(struct buf *bp)
 {
 
 	KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL,
 	    ("bwrite: no bop_write bp=%p", bp));
 	return (BO_WRITE(bp->b_bufobj, bp));
 }
 
 /*
  * Hand bp to its bufobj with BO_STRATEGY().  The assertions check that
  * the bufobj, its ops vector and the bop_strategy method are all present
  * before the call is made.
  */
 static __inline void
 bstrategy(struct buf *bp)
 {
 
 	KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops != NULL,
 	    ("bstrategy: no bo_ops bp=%p", bp));
 	KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL,
 	    ("bstrategy: no bop_strategy bp=%p", bp));
 	BO_STRATEGY(bp->b_bufobj, bp);
 }
 
 /* Call the global bioops.io_start hook on bp, if one is installed. */
 static __inline void
 buf_start(struct buf *bp)
 {
 	if (bioops.io_start)
 		(*bioops.io_start)(bp);
 }
 
 /* Call the global bioops.io_complete hook on bp, if one is installed. */
 static __inline void
 buf_complete(struct buf *bp)
 {
 	if (bioops.io_complete)
 		(*bioops.io_complete)(bp);
 }
 
 /* Call the global bioops.io_deallocate hook on bp, if one is installed. */
 static __inline void
 buf_deallocate(struct buf *bp)
 {
 	if (bioops.io_deallocate)
 		(*bioops.io_deallocate)(bp);
 }
 
 /*
  * Return whatever the global bioops.io_countdeps hook reports for (bp, i),
  * or 0 when no such hook is installed.
  */
 static __inline int
 buf_countdeps(struct buf *bp, int i)
 {
 	if (bioops.io_countdeps)
 		return ((*bioops.io_countdeps)(bp, i));
 	else
 		return (0);
+}
+
+static __inline void
+buf_track(struct buf *bp, const char *location)
+{
+	/* Stores the `location' pointer itself (no copy); empty when tracking is off. */
+#if defined(FULL_BUF_TRACKING)	/* ring: slot chosen by the masked counter */
+	bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location;
+#elif defined(BUF_TRACKING)	/* single slot: the last caller wins */
+	bp->b_io_tracking = location;
+#endif
 }
 
 #endif /* _KERNEL */
 
 /*
  * Zero out the buffer's data area.
  *
  * Clears b_bcount bytes at b_data and resets b_resid to 0.  The argument
  * is expanded three times, so it must be free of side effects.
  *
  * NOTE(review): the expansion is a bare brace block rather than a
  * do { } while (0) statement, so "clrbuf(bp);" followed by an else will
  * not parse as intended.
  */
 #define	clrbuf(bp) {							\
 	bzero((bp)->b_data, (u_int)(bp)->b_bcount);			\
 	(bp)->b_resid = 0;						\
 }
 
 /*
  * Flags for getblk's last parameter.
  */
 #define	GB_LOCK_NOWAIT	0x0001		/* Fail if we block on a buf lock. */
 #define	GB_NOCREAT	0x0002		/* Don't create a buf if not found. */
 #define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon. */
 #define	GB_UNMAPPED	0x0008		/* Do not mmap buffer pages. */
 #define	GB_KVAALLOC	0x0010		/* But allocate KVA. */
 
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
 extern long	maxswzone;		/* Max KVA for swap structures */
 extern long	maxbcache;		/* Max KVA for buffer cache */
 extern long	runningbufspace;
 extern long	hibufspace;
 extern int	dirtybufthresh;
 extern int	bdwriteskip;
 extern int	dirtybufferflushes;
 extern int	altbufferflushes;
 extern int	nswbuf;			/* Number of swap I/O buffer headers. */
 extern int	cluster_pbuf_freecnt;	/* Number of pbufs for clusters */
 extern int	vnode_pbuf_freecnt;	/* Number of pbufs for vnode pager */
 extern int	vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager,
 					     asynchronous reads */
 extern caddr_t	unmapped_buf;	/* Data address for unmapped buffers. */
 
 /*
  * Return non-zero when bp has a mapped data address, i.e. when b_data is
  * anything other than the unmapped_buf marker address.
  */
 static inline int
 buf_mapped(struct buf *bp)
 {
 
 	return (bp->b_data != unmapped_buf);
 }
 
 void	runningbufwakeup(struct buf *);
 void	waitrunningbufspace(void);
 caddr_t	kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
 void	bufinit(void);
 void	bufshutdown(int);
 void	bdata2bio(struct buf *bp, struct bio *bip);
 void	bwillwrite(void);
 int	buf_dirty_count_severe(void);
 void	bremfree(struct buf *);
 void	bremfreef(struct buf *);	/* XXX Force bremfree, only for nfs. */
 #define bread(vp, blkno, size, cred, bpp) \
 	    breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp)
 #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \
 	    breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \
 		gbflags, bpp)
 #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \
 	    breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp)
 int	breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int,
 	    struct ucred *, int, struct buf **);
 void	breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
 void	bdwrite(struct buf *);
 void	bawrite(struct buf *);
 void	babarrierwrite(struct buf *);
 int	bbarrierwrite(struct buf *);
 void	bdirty(struct buf *);
 void	bundirty(struct buf *);
 void	bufstrategy(struct bufobj *, struct buf *);
 void	brelse(struct buf *);
 void	bqrelse(struct buf *);
 int	vfs_bio_awrite(struct buf *);
 void	vfs_drain_busy_pages(struct buf *bp);
 struct buf *     getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
 struct buf *geteblk(int, int);
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
 void	bufdone_finish(struct buf *);
 void	bd_speedup(void);
 
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, int, struct buf **);
 int	cluster_wbuild(struct vnode *, long, daddr_t, int, int);
 void	cluster_write(struct vnode *, struct buf *, u_quad_t, int, int);
 void	vfs_bio_bzero_buf(struct buf *bp, int base, int size);
 void	vfs_bio_set_valid(struct buf *, int base, int size);
 void	vfs_bio_clrbuf(struct buf *);
 void	vfs_busy_pages(struct buf *, int clear_modify);
 void	vfs_unbusy_pages(struct buf *);
 int	vmapbuf(struct buf *, int);
 void	vunmapbuf(struct buf *);
 void	relpbuf(struct buf *, int *);
 void	brelvp(struct buf *);
 void	bgetvp(struct vnode *, struct buf *);
 void	pbgetbo(struct bufobj *bo, struct buf *bp);
 void	pbgetvp(struct vnode *, struct buf *);
 void	pbrelbo(struct buf *);
 void	pbrelvp(struct buf *);
 int	allocbuf(struct buf *bp, int size);
 void	reassignbuf(struct buf *);
 struct	buf *trypbuf(int *);
 void	bwait(struct buf *, u_char, const char *);
 void	bdone(struct buf *);
 
 typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t);
 typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t);
 int	vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count,
 	    int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
 	    vbg_get_blksize_t get_blksize);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF_H_ */
Index: head/sys/vm/vm_pager.c
===================================================================
--- head/sys/vm/vm_pager.c	(revision 308154)
+++ head/sys/vm/vm_pager.c	(revision 308155)
@@ -1,560 +1,562 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pager.c	8.6 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Paging space routine stubs.  Emulates a matchmaker-like interface
  *	for builtin pagers.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/ucred.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
 /*
  * Free-count quota handed to getpbuf()/relpbuf() by the cluster code.
  * vm_pager_bufferinit() replaces this initial value with nswbuf / 2.
  */
 int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */
 
 /*
  * Base of the array of nswbuf physical buffer headers; initpbuf() uses
  * (bp - swbuf) as the index of a header's KVA window.
  */
 struct buf *swbuf;
 
 static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static void dead_pager_dealloc(vm_object_t);
 
 /*
  * getpages method of the dead pager: no page can be supplied, so every
  * request is answered with VM_PAGER_FAIL and the arguments are ignored.
  */
 static int
 dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	int status;
 
 	status = VM_PAGER_FAIL;
 	return (status);
 }
 
 /*
  * alloc method of the dead pager: never creates an object, always
  * returns NULL regardless of the arguments.
  */
 static vm_object_t
 dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t off, struct ucred *cred)
 {
 	vm_object_t none;
 
 	none = NULL;
 	return (none);
 }
 
 /*
  * putpages method of the dead pager.  Nothing can be written back, so
  * each of the count entries of rtvals is set to VM_PAGER_AGAIN; object,
  * m and flags are not examined.
  */
 static void
 dead_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags,
     int *rtvals)
 {
 	int i;
 
 	for (i = 0; i < count; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
 }
 
 static int
 dead_pager_haspage(object, pindex, prev, next)
 	vm_object_t object;
 	vm_pindex_t pindex;
 	int *prev;
 	int *next;
 {
 	if (prev)
 		*prev = 0;
 	if (next)
 		*next = 0;
 	return FALSE;
 }
 
 /*
  * dealloc method of the dead pager: there is nothing to release, so
  * this is intentionally empty.
  */
 static void
 dead_pager_dealloc(vm_object_t object)
 {
 
 }
 
 /*
  * Operations vector of the dead pager, installed in pagertab[] in the
  * slot annotated OBJT_DEAD.  Only the five methods named here are
  * provided; members that are not named (for instance pgo_init, which
  * vm_pager_init() compares against NULL) stay zero-initialized.
  */
 static struct pagerops deadpagerops = {
 	.pgo_alloc = 	dead_pager_alloc,
 	.pgo_dealloc =	dead_pager_dealloc,
 	.pgo_getpages =	dead_pager_getpages,
 	.pgo_putpages =	dead_pager_putpages,
 	.pgo_haspage =	dead_pager_haspage,
 };
 
 /*
  * Table of pager operation vectors.  The code in this file indexes it
  * with an object type (pagertab[type], pagertab[object->type]), so each
  * entry has to sit at the position of the OBJT_* value named in its
  * trailing comment.
  */
 struct pagerops *pagertab[] = {
 	&defaultpagerops,	/* OBJT_DEFAULT */
 	&swappagerops,		/* OBJT_SWAP */
 	&vnodepagerops,		/* OBJT_VNODE */
 	&devicepagerops,	/* OBJT_DEVICE */
 	&physpagerops,		/* OBJT_PHYS */
 	&deadpagerops,		/* OBJT_DEAD */
 	&sgpagerops,		/* OBJT_SG */
 	&mgtdevicepagerops,	/* OBJT_MGTDEVICE */
 };
 
 /*
  * Kernel address space for mapping pages.
  * Used by pagers where KVAs are needed for IO.
  *
  * XXX needs to be large enough to support the number of pending async
  * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
  * (MAXPHYS == 64k) if you want to get the most efficiency.
  */
 /* Held by getpbuf(), trypbuf() and relpbuf() around the state below. */
 struct mtx_padalign pbuf_mtx;
 /* Free list of physical buffer headers, linked through b_freelist. */
 static TAILQ_HEAD(swqueue, buf) bswlist;
 /* Set by getpbuf() before sleeping on an empty list; relpbuf() clears it. */
 static int bswneeded;
 vm_offset_t swapbkva;		/* swap buffers kva */
 
 /*
  * Initialize the pager layer: empty the free list of physical buffer
  * headers and run the optional pgo_init hook of every pager listed in
  * pagertab[].
  */
 void
 vm_pager_init(void)
 {
 	struct pagerops **pgops;
 
 	TAILQ_INIT(&bswlist);
 	/*
 	 * Initialize known pagers
 	 */
 	for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++) {
 		if ((*pgops)->pgo_init != NULL)
 			(*(*pgops)->pgo_init)();
 	}
 }
 
 /*
  * Set up the pool of physical buffer headers: initialize the mutex that
  * guards the pool, put each of the nswbuf headers starting at swbuf on
  * the free list, and seed the per-subsystem free counters.
  */
 void
 vm_pager_bufferinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF);
 	bp = swbuf;
 	/*
 	 * Now set up swap and physical I/O buffer headers.
 	 */
 	for (i = 0; i < nswbuf; i++, bp++) {
 		TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
 		BUF_LOCKINIT(bp);
 		LIST_INIT(&bp->b_dep);
 		bp->b_rcred = bp->b_wcred = NOCRED;
 		bp->b_xflags = 0;
 	}
 
 	/* Each consumer may hold roughly half of the headers at once. */
 	cluster_pbuf_freecnt = nswbuf / 2;
 	vnode_pbuf_freecnt = nswbuf / 2 + 1;
 	vnode_async_pbuf_freecnt = nswbuf / 2;
 }
 
 /*
  * Allocate an instance of a pager of the given type.
  * Size, protection and offset parameters are passed in for pagers that
  * need to perform page-level validation (e.g. the device pager).
  */
 vm_object_t
 vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
     vm_prot_t prot, vm_ooffset_t off, struct ucred *cred)
 {
 	struct pagerops *pager;
 
 	/* A type without a pager entry yields no object. */
 	pager = pagertab[type];
 	if (pager == NULL)
 		return (NULL);
 	return ((*pager->pgo_alloc)(handle, size, prot, off, cred));
 }
 
 /*
  *	The object must be locked.
  */
 void
 vm_pager_deallocate(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/* Hand the object to the dealloc method of its pager type. */
 	(*pagertab[object->type]->pgo_dealloc)(object);
 }
 
 /*
  * Sanity-check the arguments of a page-in request.  Compiles to an empty
  * function unless INVARIANTS is defined.
  */
 static void
 vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count)
 {
 #ifdef INVARIANTS
 	vm_page_t pg;
 	int idx;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(count > 0, ("%s: 0 count", __func__));
 	/*
 	 * Each page has to be busied, unmapped, not fully valid, clean
 	 * and owned by the object the request is made against.
 	 */
 	for (idx = 0; idx < count; idx++) {
 		pg = m[idx];
 		vm_page_assert_xbusied(pg);
 		KASSERT(!pmap_page_is_mapped(pg),
 		    ("%s: page %p is mapped", __func__, pg));
 		KASSERT(pg->valid != VM_PAGE_BITS_ALL,
 		    ("%s: request for a valid page %p", __func__, pg));
 		KASSERT(pg->dirty == 0,
 		    ("%s: page %p is dirty", __func__, pg));
 		KASSERT(pg->object == object,
 		    ("%s: wrong object %p/%p", __func__, object, pg->object));
 	}
 #endif
 }
 
 /*
  * Page in the pages for the object using its associated pager.
  * The requested page must be fully valid on successful return.
  */
 int
 vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
     int *rahead)
 {
 #ifdef INVARIANTS
 	/* Index of the first page; only the KASSERT in the loop uses it. */
 	vm_pindex_t pindex = m[0]->pindex;
 #endif
 	int r;
 
 	vm_pager_assert_in(object, m, count);
 
 	/* Dispatch to the getpages method of the object's pager type. */
 	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind,
 	    rahead);
 	/* Any status other than VM_PAGER_OK is returned unchanged. */
 	if (r != VM_PAGER_OK)
 		return (r);
 
 	for (int i = 0; i < count; i++) {
 		/*
 		 * If pager has replaced a page, assert that it had
 		 * updated the array.
 		 *
 		 * pindex is advanced inside the assertion expression itself.
 		 * NOTE(review): this relies on KASSERT discarding its
 		 * arguments when INVARIANTS is off, since pindex is not
 		 * declared in that configuration -- confirm.
 		 */
 		KASSERT(m[i] == vm_page_lookup(object, pindex++),
 		    ("%s: mismatch page %p pindex %ju", __func__,
 		    m[i], (uintmax_t )pindex - 1));
 		/*
 		 * Zero out partially filled data.
 		 */
 		if (m[i]->valid != VM_PAGE_BITS_ALL)
 			vm_page_zero_invalid(m[i], TRUE);
 	}
 	return (VM_PAGER_OK);
 }
 
 /*
  * Variant of vm_pager_get_pages() that forwards the request, together
  * with the iodone callback and its argument, to the pager's
  * pgo_getpages_async method and returns that method's result.
  */
 int
 vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
     int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
 {
 	struct pagerops *pager;
 
 	vm_pager_assert_in(object, m, count);
 
 	pager = pagertab[object->type];
 	return ((*pager->pgo_getpages_async)(object, m, count, rbehind,
 	    rahead, iodone, arg));
 }
 
 /*
  * vm_pager_put_pages() - inline, see vm/vm_pager.h
  * vm_pager_has_page() - inline, see vm/vm_pager.h
  */
 
 /*
  * Search the specified pager object list for an object with the
  * specified handle.  If an object with the specified handle is found,
  * increase its reference count and return it.  Otherwise, return NULL.
  *
  * The pager object list must be locked.
  */
 vm_object_t
 vm_pager_object_lookup(struct pagerlst *pg_list, void *handle)
 {
 	vm_object_t obj;
 	int live;
 
 	TAILQ_FOREACH(obj, pg_list, pager_object_list) {
 		if (obj->handle != handle)
 			continue;
 		/* Matching handle: take a reference unless it is dying. */
 		VM_OBJECT_WLOCK(obj);
 		live = (obj->flags & OBJ_DEAD) == 0;
 		if (live)
 			vm_object_reference_locked(obj);
 		VM_OBJECT_WUNLOCK(obj);
 		if (live)
 			break;
 	}
 	/* obj is NULL here when the walk ran off the end of the list. */
 	return (obj);
 }
 
 /*
  * initialize a physical buffer
  */
 
 /*
  * XXX This probably belongs in vfs_bio.c
  */
 static void
 initpbuf(struct buf *bp)
 {
 	/* The header must not be attached to a bufobj or vnode any more. */
 	KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj"));
 	KASSERT(bp->b_vp == NULL, ("initpbuf with vp"));
 	bp->b_rcred = NOCRED;
 	bp->b_wcred = NOCRED;
 	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
 	/*
 	 * Each header owns a MAXPHYS-sized window of KVA: the window for
 	 * the header at index (bp - swbuf) starts that many MAXPHYS-sized
 	 * steps past swapbkva.
 	 */
 	bp->b_kvabase = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva;
 	bp->b_data = bp->b_kvabase;
 	bp->b_kvasize = MAXPHYS;
 	/* Clear flags, completion callback and error state. */
 	bp->b_flags = 0;
 	bp->b_xflags = 0;
 	bp->b_ioflags = 0;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;
 	/* Take the buffer lock in exclusive mode; relpbuf() drops it. */
 	BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
+	buf_track(bp, __func__);
 }
 
 /*
  * allocate a physical buffer
  *
  *	There are a limited number (nswbuf) of physical buffers.  We need
  *	to make sure that no single subsystem is able to hog all of them,
  *	so each subsystem implements a counter which is typically initialized
  *	to 1/2 nswbuf.  getpbuf() decrements this counter in allocation and
  *	increments it on release, and blocks if the counter hits zero.  A
  *	subsystem may initialize the counter to -1 to disable the feature,
  *	but it must still be sure to match up all uses of getpbuf() with 
  *	relpbuf() using the same variable.
  *
  *	NOTE: pfreecnt can be NULL, but this 'feature' will be removed
  *	relatively soon when the rest of the subsystems get smart about it. XXX
  */
 struct buf *
 getpbuf(int *pfreecnt)
 {
 	struct buf *pb;
 
 	mtx_lock(&pbuf_mtx);
 	for (;;) {
 		/* Wait out the caller's quota first, if it supplied one. */
 		while (pfreecnt != NULL && *pfreecnt == 0)
 			msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0);
 
 		/* Take the first header of the pool, if there is one. */
 		pb = TAILQ_FIRST(&bswlist);
 		if (pb != NULL)
 			break;
 
 		/*
 		 * Pool is empty: ask relpbuf() for a wakeup, then start
 		 * over because another thread may take the header first.
 		 */
 		bswneeded = 1;
 		msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0);
 	}
 	TAILQ_REMOVE(&bswlist, pb, b_freelist);
 	if (pfreecnt != NULL)
 		--*pfreecnt;
 	mtx_unlock(&pbuf_mtx);
 
 	initpbuf(pb);
 	return (pb);
 }
 
 /*
  * allocate a physical buffer, if one is available.
  *
  *	Note that there is no NULL hack here - all subsystems using this
  *	call understand how to use pfreecnt.
  */
 struct buf *
 trypbuf(int *pfreecnt)
 {
 	struct buf *pb;
 
 	mtx_lock(&pbuf_mtx);
 	/* Only look at the pool when the caller's quota is not used up. */
 	pb = NULL;
 	if (*pfreecnt != 0)
 		pb = TAILQ_FIRST(&bswlist);
 	if (pb == NULL) {
 		mtx_unlock(&pbuf_mtx);
 		return (NULL);
 	}
 	TAILQ_REMOVE(&bswlist, pb, b_freelist);
 	--*pfreecnt;
 	mtx_unlock(&pbuf_mtx);
 
 	initpbuf(pb);
 	return (pb);
 }
 
 /*
  * release a physical buffer
  *
  *	NOTE: pfreecnt can be NULL, but this 'feature' will be removed
  *	relatively soon when the rest of the subsystems get smart about it. XXX
  */
 void
 relpbuf(struct buf *bp, int *pfreecnt)
 {
 
 	/* Drop any credential references still attached to the buffer. */
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 
 	/* The caller must already have detached the vnode and bufobj. */
 	KASSERT(bp->b_vp == NULL, ("relpbuf with vp"));
 	KASSERT(bp->b_bufobj == NULL, ("relpbuf with bufobj"));
 
+	buf_track(bp, __func__);
 	BUF_UNLOCK(bp);
 
 	/* Return the header to the head of the free list. */
 	mtx_lock(&pbuf_mtx);
 	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
 
 	/* Wake threads that getpbuf() put to sleep on an empty list. */
 	if (bswneeded) {
 		bswneeded = 0;
 		wakeup(&bswneeded);
 	}
 	/*
 	 * Give the slot back to the caller's quota; a wakeup is issued
 	 * only when the counter becomes 1 through this increment.
 	 */
 	if (pfreecnt) {
 		if (++*pfreecnt == 1)
 			wakeup(pfreecnt);
 	}
 	mtx_unlock(&pbuf_mtx);
 }
 
 /*
  * Associate a p-buffer with a vnode.
  *
  * Also sets B_PAGING flag to indicate that vnode is not fully associated
  * with the buffer.  i.e. the bp has not been linked into the vnode or
  * ref-counted.
  */
 void
 pbgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	/* The buffer must not already belong to a vnode or bufobj. */
 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
 	KASSERT(bp->b_bufobj == NULL, ("pbgetvp: not free (bufobj)"));
 
 	bp->b_flags |= B_PAGING;
 	bp->b_bufobj = &vp->v_bufobj;
 	bp->b_vp = vp;
 }
 
 /*
  * Associate a p-buffer with a bufobj.
  *
  * Also sets the B_PAGING flag to indicate that the bufobj is not fully
  * associated with the buffer, i.e. the bp has not been linked into the
  * bufobj or ref-counted.
  */
 void
 pbgetbo(struct bufobj *bo, struct buf *bp)
 {
 
 	/* The buffer must not already belong to a vnode or bufobj. */
 	KASSERT(bp->b_vp == NULL, ("pbgetbo: not free (vnode)"));
 	KASSERT(bp->b_bufobj == NULL, ("pbgetbo: not free (bufobj)"));
 
 	bp->b_bufobj = bo;
 	bp->b_flags |= B_PAGING;
 }
 
 /*
  * Disassociate a p-buffer from a vnode.
  */
 void
 pbrelvp(struct buf *bp)
 {
 
 	/* Only a buffer set up by pbgetvp() may be released here. */
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 	KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj"));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
 	    ("pbrelvp: pager buf on vnode list."));
 
 	bp->b_flags &= ~B_PAGING;
 	bp->b_bufobj = NULL;
 	bp->b_vp = NULL;
 }
 
 /*
  * Disassociate a p-buffer from a bufobj.
  */
 void
 pbrelbo(struct buf *bp)
 {
 
 	/* Only a buffer set up by pbgetbo() may be released here. */
 	KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode"));
 	KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj"));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
 	    ("pbrelbo: pager buf on vnode list."));
 
 	bp->b_flags &= ~B_PAGING;
 	bp->b_bufobj = NULL;
 }