diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c
index 99335e94bfca..cd49776e4f8b 100644
--- a/sys/cam/ata/ata_da.c
+++ b/sys/cam/ata/ata_da.c
@@ -1,3704 +1,3704 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ada.h"
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/endian.h>
#include <sys/cons.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/ata/ata_all.h>
#ifdef _KERNEL
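/* Largest LBA addressable by a 28-bit command (2^28 - 1). */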
#define ATA_MAX_28BIT_LBA 268435455UL
extern int iosched_debug;
typedef enum {
ADA_STATE_RAHEAD,
ADA_STATE_WCACHE,
ADA_STATE_LOGDIR,
ADA_STATE_IDDIR,
ADA_STATE_SUP_CAP,
ADA_STATE_ZONE,
ADA_STATE_NORMAL
} ada_state;
typedef enum {
ADA_FLAG_CAN_48BIT = 0x00000002,
ADA_FLAG_CAN_FLUSHCACHE = 0x00000004,
ADA_FLAG_CAN_NCQ = 0x00000008,
ADA_FLAG_CAN_DMA = 0x00000010,
ADA_FLAG_NEED_OTAG = 0x00000020,
ADA_FLAG_WAS_OTAG = 0x00000040,
ADA_FLAG_CAN_TRIM = 0x00000080,
ADA_FLAG_OPEN = 0x00000100,
ADA_FLAG_SCTX_INIT = 0x00000200,
ADA_FLAG_CAN_CFA = 0x00000400,
ADA_FLAG_CAN_POWERMGT = 0x00000800,
ADA_FLAG_CAN_DMA48 = 0x00001000,
ADA_FLAG_CAN_LOG = 0x00002000,
ADA_FLAG_CAN_IDLOG = 0x00004000,
ADA_FLAG_CAN_SUPCAP = 0x00008000,
ADA_FLAG_CAN_ZONE = 0x00010000,
ADA_FLAG_CAN_WCACHE = 0x00020000,
ADA_FLAG_CAN_RAHEAD = 0x00040000,
ADA_FLAG_PROBED = 0x00080000,
ADA_FLAG_ANNOUNCED = 0x00100000,
ADA_FLAG_DIRTY = 0x00200000,
ADA_FLAG_CAN_NCQ_TRIM = 0x00400000, /* CAN_TRIM also set */
ADA_FLAG_PIM_ATA_EXT = 0x00800000,
ADA_FLAG_UNMAPPEDIO = 0x01000000,
ADA_FLAG_ROTATING = 0x02000000
} ada_flags;
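/*
* printf(9) %b format string for the flags above: the leading \020 selects
* hexadecimal output, and each entry is a 1-based bit number followed by
* the name printed when that bit is set.
*/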
#define ADA_FLAG_STRING \
"\020" \
"\002CAN_48BIT" \
"\003CAN_FLUSHCACHE" \
"\004CAN_NCQ" \
"\005CAN_DMA" \
"\006NEED_OTAG" \
"\007WAS_OTAG" \
"\010CAN_TRIM" \
"\011OPEN" \
"\012SCTX_INIT" \
"\013CAN_CFA" \
"\014CAN_POWERMGT" \
"\015CAN_DMA48" \
"\016CAN_LOG" \
"\017CAN_IDLOG" \
"\020CAN_SUPCAP" \
"\021CAN_ZONE" \
"\022CAN_WCACHE" \
"\023CAN_RAHEAD" \
"\024PROBED" \
"\025ANNOUNCED" \
"\026DIRTY" \
"\027CAN_NCQ_TRIM" \
"\030PIM_ATA_EXT" \
"\031UNMAPPEDIO" \
"\032ROTATING"
typedef enum {
ADA_Q_NONE = 0x00,
ADA_Q_4K = 0x01,
ADA_Q_NCQ_TRIM_BROKEN = 0x02,
ADA_Q_LOG_BROKEN = 0x04,
ADA_Q_SMR_DM = 0x08,
ADA_Q_NO_TRIM = 0x10,
ADA_Q_128KB = 0x20
} ada_quirks;
#define ADA_Q_BIT_STRING \
"\020" \
"\0014K" \
"\002NCQ_TRIM_BROKEN" \
"\003LOG_BROKEN" \
"\004SMR_DM" \
"\005NO_TRIM" \
"\006128KB"
typedef enum {
ADA_CCB_RAHEAD = 0x01,
ADA_CCB_WCACHE = 0x02,
ADA_CCB_BUFFER_IO = 0x03,
ADA_CCB_DUMP = 0x05,
ADA_CCB_TRIM = 0x06,
ADA_CCB_LOGDIR = 0x07,
ADA_CCB_IDDIR = 0x08,
ADA_CCB_SUP_CAP = 0x09,
ADA_CCB_ZONE = 0x0a,
ADA_CCB_TYPE_MASK = 0x0F,
} ada_ccb_state;
typedef enum {
ADA_ZONE_NONE = 0x00,
ADA_ZONE_DRIVE_MANAGED = 0x01,
ADA_ZONE_HOST_AWARE = 0x02,
ADA_ZONE_HOST_MANAGED = 0x03
} ada_zone_mode;
typedef enum {
ADA_ZONE_FLAG_RZ_SUP = 0x0001,
ADA_ZONE_FLAG_OPEN_SUP = 0x0002,
ADA_ZONE_FLAG_CLOSE_SUP = 0x0004,
ADA_ZONE_FLAG_FINISH_SUP = 0x0008,
ADA_ZONE_FLAG_RWP_SUP = 0x0010,
ADA_ZONE_FLAG_SUP_MASK = (ADA_ZONE_FLAG_RZ_SUP |
ADA_ZONE_FLAG_OPEN_SUP |
ADA_ZONE_FLAG_CLOSE_SUP |
ADA_ZONE_FLAG_FINISH_SUP |
ADA_ZONE_FLAG_RWP_SUP),
ADA_ZONE_FLAG_URSWRZ = 0x0020,
ADA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
ADA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
ADA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
ADA_ZONE_FLAG_SET_MASK = (ADA_ZONE_FLAG_OPT_SEQ_SET |
ADA_ZONE_FLAG_OPT_NONSEQ_SET |
ADA_ZONE_FLAG_MAX_SEQ_SET)
} ada_zone_flags;
static struct ada_zone_desc {
ada_zone_flags value;
const char *desc;
} ada_zone_desc_table[] = {
{ADA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{ADA_ZONE_FLAG_OPEN_SUP, "Open" },
{ADA_ZONE_FLAG_CLOSE_SUP, "Close" },
{ADA_ZONE_FLAG_FINISH_SUP, "Finish" },
{ADA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
typedef enum {
ADA_DELETE_NONE,
ADA_DELETE_DISABLE,
ADA_DELETE_CFA_ERASE,
ADA_DELETE_DSM_TRIM,
ADA_DELETE_NCQ_DSM_TRIM,
ADA_DELETE_MIN = ADA_DELETE_CFA_ERASE,
ADA_DELETE_MAX = ADA_DELETE_NCQ_DSM_TRIM,
} ada_delete_methods;
static const char *ada_delete_method_names[] =
{ "NONE", "DISABLE", "CFA_ERASE", "DSM_TRIM", "NCQ_DSM_TRIM" };
#if 0
static const char *ada_delete_method_desc[] =
{ "NONE", "DISABLED", "CFA Erase", "DSM Trim", "DSM Trim via NCQ" };
#endif
struct disk_params {
u_int8_t heads;
u_int8_t secs_per_track;
u_int32_t cylinders;
u_int32_t secsize; /* Number of bytes/logical sector */
u_int64_t sectors; /* Total number sectors */
};
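/*
* A single DSM TRIM command carries up to TRIM_MAX_BLOCKS payload blocks,
* each holding ATA_DSM_BLK_RANGES (LBA, count) range entries of
* ATA_DSM_RANGE_SIZE bytes.
*/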
#define TRIM_MAX_BLOCKS 8
#define TRIM_MAX_RANGES (TRIM_MAX_BLOCKS * ATA_DSM_BLK_RANGES)
struct trim_request {
uint8_t data[TRIM_MAX_RANGES * ATA_DSM_RANGE_SIZE];
TAILQ_HEAD(, bio) bps;
};
struct ada_softc {
struct cam_iosched_softc *cam_iosched;
int outstanding_cmds; /* Number of active commands */
int refcount; /* Active xpt_action() calls */
ada_state state;
ada_flags flags;
ada_zone_mode zone_mode;
ada_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
ada_quirks quirks;
ada_delete_methods delete_method;
int trim_max_ranges;
int read_ahead;
int write_cache;
#ifdef CAM_TEST_FAILURE
int force_read_error;
int force_write_error;
int periodic_read_error;
int periodic_read_count;
#endif
struct ccb_pathinq cpi;
struct disk_params params;
struct disk *disk;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
struct trim_request trim_req;
uint64_t trim_count;
uint64_t trim_ranges;
uint64_t trim_lbas;
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int timeouts;
u_int errors;
u_int invalidations;
#endif
#define ADA_ANNOUNCETMP_SZ 80
char announce_temp[ADA_ANNOUNCETMP_SZ];
#define ADA_ANNOUNCE_SZ 400
char announce_buffer[ADA_ANNOUNCE_SZ];
};
struct ada_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
ada_quirks quirks;
};
static struct ada_quirk_entry ada_quirk_table[] =
{
{
/* Sandisk X400 */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SanDisk?SD8SB8U1T00*", "X4162000*" },
/*quirks*/ADA_Q_128KB
},
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Hitachi H??????????E3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD155UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD204UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DL*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750420AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750422AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???LT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green/Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????AZEX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????FZEX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PVT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PVT*", "*" },
/*quirks*/ADA_Q_4K
},
/* SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair CSSD-F*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force 3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force G*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "M4-CT???M4SSD2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M500 SSDs MU07 firmware
* NCQ Trim works
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "MU07" },
/*quirks*/0
},
{
/*
* Crucial M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial M550 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial MX100 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/ADA_Q_4K
},
{
/*
* FCCT M500 SSDs
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FCCT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2CW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2CT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2MH*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2M*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* KingDian S200 60GB P0921B
* Trimming crashes the SSD
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KingDian S200 *", "*" },
/*quirks*/ADA_Q_NO_TRIM
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SE100S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SH103S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MARVELL SD88SA02*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Micron M500 SSDs firmware MU07
* NCQ Trim works?
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "MU07" },
/*quirks*/0
},
{
/*
* Micron M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron M5[15]0 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron 5100 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "DENRSTE251M45*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ?VERTEX2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX4*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 750 SSDs
* 4k optimised, NCQ TRIM seems to work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 750*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 840 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 845 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 845*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 850 SSDs
* 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised, NCQ believed to be working
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7KM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* 4k optimised, NCQ believed to be broken since these appear
* to be built with the same controllers as the 840/850.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Same as for SAMSUNG MZ7*, but enable the quirks for SSDs
* whose model starts with MZ7* too
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung PM851 Series SSDs Dell OEM
* device model "SAMSUNG SSD PM851 mSATA 256GB"
* 4k optimised, NCQ TRIM broken
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD PM851*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FTM??CT25H*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SG9XCS2D*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD200HJ KF100-06
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD200*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD501LJ CR100-10
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD501*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST8000AS000[23]*", "*" },
/*quirks*/ADA_Q_SMR_DM
},
{
/* WD Green SSD */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WDS?????G0*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/* Default */
{
T_ANY, SIP_MEDIA_REMOVABLE|SIP_MEDIA_FIXED,
/*vendor*/"*", /*product*/"*", /*revision*/"*"
},
/*quirks*/0
},
};
static disk_strategy_t adastrategy;
static dumper_t adadump;
static periph_init_t adainit;
static void adadiskgonecb(struct disk *dp);
static periph_oninv_t adaoninvalidate;
static periph_dtor_t adacleanup;
static void adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static int adabitsysctl(SYSCTL_HANDLER_ARGS);
static int adaflagssysctl(SYSCTL_HANDLER_ARGS);
static int adazonesupsysctl(SYSCTL_HANDLER_ARGS);
static void adasysctlinit(void *context, int pending);
static int adagetattr(struct bio *bp);
static void adasetflags(struct ada_softc *softc,
struct ccb_getdev *cgd);
static void adasetgeom(struct ada_softc *softc,
struct ccb_getdev *cgd);
static periph_ctor_t adaregister;
static void ada_dsmtrim(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static void ada_cfaerase(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static int ada_zone_bio_to_ata(int disk_zone_cmd);
static int ada_zone_cmd(struct cam_periph *periph, union ccb *ccb,
struct bio *bp, int *queue_ccb);
static periph_start_t adastart;
static void adaprobedone(struct cam_periph *periph, union ccb *ccb);
static void adazonedone(struct cam_periph *periph, union ccb *ccb);
static void adadone(struct cam_periph *periph,
union ccb *done_ccb);
static int adaerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static callout_func_t adasendorderedtag;
static void adashutdown(void *arg, int howto);
static void adasuspend(void *arg);
static void adaresume(void *arg);
#ifndef ADA_DEFAULT_TIMEOUT
#define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */
#endif
#ifndef ADA_DEFAULT_RETRY
#define ADA_DEFAULT_RETRY 4
#endif
#ifndef ADA_DEFAULT_SEND_ORDERED
#define ADA_DEFAULT_SEND_ORDERED 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SHUTDOWN
#define ADA_DEFAULT_SPINDOWN_SHUTDOWN 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SUSPEND
#define ADA_DEFAULT_SPINDOWN_SUSPEND 1
#endif
#ifndef ADA_DEFAULT_READ_AHEAD
#define ADA_DEFAULT_READ_AHEAD 1
#endif
#ifndef ADA_DEFAULT_WRITE_CACHE
#define ADA_DEFAULT_WRITE_CACHE 1
#endif
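/*
* Per-device read_ahead/write_cache settings start at -1 ("unset"); a
* negative value falls back to the global ada_read_ahead/ada_write_cache
* tunables.
*/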
#define ADA_RA (softc->read_ahead >= 0 ? \
softc->read_ahead : ada_read_ahead)
#define ADA_WC (softc->write_cache >= 0 ? \
softc->write_cache : ada_write_cache)
/*
* Most platforms map the firmware geometry to the actual geometry, but
* some don't. If not overridden, default to a no-op.
*/
#ifndef ata_disk_firmware_geom_adjust
#define ata_disk_firmware_geom_adjust(disk)
#endif
static int ada_retry_count = ADA_DEFAULT_RETRY;
static int ada_default_timeout = ADA_DEFAULT_TIMEOUT;
static int ada_send_ordered = ADA_DEFAULT_SEND_ORDERED;
static int ada_spindown_shutdown = ADA_DEFAULT_SPINDOWN_SHUTDOWN;
static int ada_spindown_suspend = ADA_DEFAULT_SPINDOWN_SUSPEND;
static int ada_read_ahead = ADA_DEFAULT_READ_AHEAD;
static int ada_write_cache = ADA_DEFAULT_WRITE_CACHE;
static int ada_enable_biospeedup = 1;
static SYSCTL_NODE(_kern_cam, OID_AUTO, ada, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&ada_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&ada_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&ada_send_ordered, 0, "Send Ordered Tags");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_shutdown, CTLFLAG_RWTUN,
&ada_spindown_shutdown, 0, "Spin down upon shutdown");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_suspend, CTLFLAG_RWTUN,
&ada_spindown_suspend, 0, "Spin down upon suspend");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, read_ahead, CTLFLAG_RWTUN,
&ada_read_ahead, 0, "Enable disk read-ahead");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, write_cache, CTLFLAG_RWTUN,
&ada_write_cache, 0, "Enable disk write cache");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, enable_biospeedup, CTLFLAG_RDTUN,
&ada_enable_biospeedup, 0, "Enable BIO_SPEEDUP processing");
/*
* ADA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* still passes the "don't send an ordered tag" test, so it takes
* us two intervals to determine that a tag must be sent.
*/
#ifndef ADA_ORDEREDTAG_INTERVAL
#define ADA_ORDEREDTAG_INTERVAL 4
#endif
static struct periph_driver adadriver =
{
adainit, "ada",
TAILQ_HEAD_INITIALIZER(adadriver.units), /* generation */ 0
};
static int adadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
PERIPHDRIVER_DECLARE(ada, adadriver);
static MALLOC_DEFINE(M_ATADA, "ata_da", "ata_da buffers");
static int
adaopen(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != 0) {
return(ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaopen\n"));
softc = (struct ada_softc *)periph->softc;
softc->flags |= ADA_FLAG_OPEN;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
adaclose(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaclose\n"));
/* We only sync the cache if the drive is capable of it. */
if ((softc->flags & ADA_FLAG_DIRTY) != 0 &&
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) != 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0 &&
cam_periph_hold(periph, PRIBIO) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
1,
NULL,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/0, softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
softc->flags &= ~ADA_FLAG_DIRTY;
xpt_release_ccb(ccb);
cam_periph_unhold(periph);
}
softc->flags &= ~ADA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "adaclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
adaschedule(struct cam_periph *periph)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
if (softc->state != ADA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a bio and will include
* only one physical transfer.
*/
static void
adastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastrategy(%p)\n", bp));
/*
* If the device has been made invalid, error out
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
adaschedule(periph);
cam_periph_unlock(periph);
return;
}
static int
adadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct ada_softc *softc;
u_int secsize;
struct ccb_ataio ataio;
struct disk *dp;
uint64_t lba;
uint16_t count;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
secsize = softc->params.secsize;
lba = offset / secsize;
count = length / secsize;
if ((periph->flags & CAM_PERIPH_INVALID) != 0)
return (ENXIO);
memset(&ataio, 0, sizeof(ataio));
if (length > 0) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
NULL,
CAM_DIR_OUT,
0,
(u_int8_t *) virtual,
length,
ada_default_timeout*1000);
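/*
* Use the 48-bit write opcode when the range cannot be expressed
* by a 28-bit command.
*/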
if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count >= 256)) {
ata_48bit_cmd(&ataio, ATA_WRITE_DMA48,
0, lba, count);
} else {
ata_28bit_cmd(&ataio, ATA_WRITE_DMA,
0, lba, count);
}
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
return (error);
}
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
/*
* Tell the drive to flush its internal cache. If we
* can't flush in 5s we have big problems. No need to
* wait the full default timeout to detect problems.
*/
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
NULL,
CAM_DIR_NONE,
0,
NULL,
0,
5*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
return (error);
}
static void
adainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, adaasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("ada: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (ada_send_ordered) {
/* Register our event handlers */
if ((EVENTHANDLER_REGISTER(power_suspend, adasuspend,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(power_resume, adaresume,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, adashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("adainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
adadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
adaoninvalidate(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, adaasync, periph, periph->path);
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
disk_gone(softc->disk);
}
static void
adacleanup(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
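/*
* Pick the best available BIO_DELETE method: NCQ DSM TRIM, then plain
* DSM TRIM, then CFA erase (28-bit CFA devices only), else none.
*/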
static void
adasetdeletemethod(struct ada_softc *softc)
{
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM;
else if (softc->flags & ADA_FLAG_CAN_TRIM)
softc->delete_method = ADA_DELETE_DSM_TRIM;
else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))
softc->delete_method = ADA_DELETE_CFA_ERASE;
else
softc->delete_method = ADA_DELETE_NONE;
}
static void
adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct ccb_getdev cgd;
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_ATA)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(adaregister, adaoninvalidate,
adacleanup, adastart,
"ada", CAM_PERIPH_BIO,
path, adaasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("adaasync: Unable to attach to new device "
"due to status 0x%x\n", status);
break;
}
case AC_GETDEV_CHANGED:
{
softc = (struct ada_softc *)periph->softc;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
/*
* Update our information based on the new Identify data.
*/
adasetflags(softc, &cgd);
adasetgeom(softc, &cgd);
disk_resize(softc->disk, M_NOWAIT);
cam_periph_async(periph, code, path, arg);
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct ada_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_SENT_BDR:
case AC_BUS_RESET:
{
softc = (struct ada_softc *)periph->softc;
cam_periph_async(periph, code, path, arg);
if (softc->state != ADA_STATE_NORMAL)
break;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD)
softc->state = ADA_STATE_RAHEAD;
else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE)
softc->state = ADA_STATE_WCACHE;
else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE))
softc->state = ADA_STATE_LOGDIR;
else
break;
if (cam_periph_acquire(periph) != 0)
softc->state = ADA_STATE_NORMAL;
else
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static int
adazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct ada_softc *softc;
int error;
softc = (struct ada_softc *)arg1;
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case ADA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case ADA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case ADA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
adazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct ada_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct ada_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(ada_zone_desc_table) /
sizeof(ada_zone_desc_table[0]); i++) {
if (softc->zone_flags & ada_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, ada_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
static void
adasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct ada_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
/* periph was held for us when this task was enqueued */
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_release(periph);
return;
}
softc = (struct ada_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM ADA unit %d",periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= ADA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_ada), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("adasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method",
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
softc, 0, adadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_count", CTLFLAG_RD, &softc->trim_count,
"Total number of dsm commands sent");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_ranges", CTLFLAG_RD, &softc->trim_ranges,
"Total number of ranges in dsm commands");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_lbas", CTLFLAG_RD, &softc->trim_lbas,
"Total lbas in the dsm commands sent");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "read_ahead", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->read_ahead, 0, "Enable disk read ahead.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->write_cache, 0, "Enable disk write cache.");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
softc, 0, adazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
softc, 0, adazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
softc, 0, adaflagssysctl, "A",
"Flags for drive");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "unmapped_io", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->flags, (u_int)ADA_FLAG_UNMAPPEDIO, adabitsysctl, "I",
"Unmapped I/O support *DEPRECATED* gone in FreeBSD 14");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "rotating", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->flags, (u_int)ADA_FLAG_ROTATING, adabitsysctl, "I",
"Rotating media *DEPRECATED* gone in FreeBSD 14");
#ifdef CAM_TEST_FAILURE
/*
* Add a 'door bell' sysctl which allows one to set it from userland
* and cause something bad to happen. For the moment, we only allow
* whacking the next read or write.
*/
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_read_error, 0,
"Force a read error for the next N reads.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_write_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_write_error, 0,
"Force a write error for the next N writes.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "periodic_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->periodic_read_error, 0,
"Force a read error every N reads (don't set too low).");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "invalidate", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
periph, 0, cam_periph_invalidate_sysctl, "I",
"Write 1 to invalidate the drive immediately");
#endif
#ifdef CAM_IO_STATS
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->timeouts, 0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->errors, 0,
"Transport errors reported by the SIM.");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->invalidations, 0,
"Device pack invalidations.");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
adagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
if (g_handleattr_int(bp, "GEOM::canspeedup", ada_enable_biospeedup))
return (EJUSTRETURN);
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
static int
adadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct ada_softc *softc;
int i, error, value, methods;
softc = (struct ada_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > ADA_DELETE_MAX)
p = "UNKNOWN";
else
p = ada_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
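/*
* Build a bitmask of the delete methods this drive supports and accept
* the new setting only if it names one of them (or DISABLE).
*/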
methods = 1 << ADA_DELETE_DISABLE;
if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT))
methods |= 1 << ADA_DELETE_CFA_ERASE;
if (softc->flags & ADA_FLAG_CAN_TRIM)
methods |= 1 << ADA_DELETE_DSM_TRIM;
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
methods |= 1 << ADA_DELETE_NCQ_DSM_TRIM;
for (i = 0; i <= ADA_DELETE_MAX; i++) {
if (!(methods & (1 << i)) ||
strcmp(buf, ada_delete_method_names[i]) != 0)
continue;
softc->delete_method = i;
return (0);
}
return (EINVAL);
}
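/*
* Report whether the flag bit passed in arg2 is set in the flags word
* passed in arg1, as a read-only 0/1 value; writes return EPERM.
*/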
static int
adabitsysctl(SYSCTL_HANDLER_ARGS)
{
u_int *flags = arg1;
u_int test = arg2;
int tmpout, error;
tmpout = !!(*flags & test);
error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
if (error || !req->newptr)
return (error);
return (EPERM);
}
static int
adaflagssysctl(SYSCTL_HANDLER_ARGS)
{
struct sbuf sbuf;
struct ada_softc *softc = arg1;
int error;
sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
if (softc->flags != 0)
sbuf_printf(&sbuf, "0x%b", (unsigned)softc->flags, ADA_FLAG_STRING);
else
sbuf_printf(&sbuf, "0");
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
static void
adasetflags(struct ada_softc *softc, struct ccb_getdev *cgd)
{
if ((cgd->ident_data.capabilities1 & ATA_SUPPORT_DMA) &&
(cgd->inq_flags & SID_DMA))
softc->flags |= ADA_FLAG_CAN_DMA;
else
softc->flags &= ~ADA_FLAG_CAN_DMA;
if (cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) {
softc->flags |= ADA_FLAG_CAN_48BIT;
if (cgd->inq_flags & SID_DMA48)
softc->flags |= ADA_FLAG_CAN_DMA48;
else
softc->flags &= ~ADA_FLAG_CAN_DMA48;
} else
softc->flags &= ~(ADA_FLAG_CAN_48BIT | ADA_FLAG_CAN_DMA48);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_FLUSHCACHE)
softc->flags |= ADA_FLAG_CAN_FLUSHCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_FLUSHCACHE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_POWERMGT)
softc->flags |= ADA_FLAG_CAN_POWERMGT;
else
softc->flags &= ~ADA_FLAG_CAN_POWERMGT;
if ((cgd->ident_data.satacapabilities & ATA_SUPPORT_NCQ) &&
(cgd->inq_flags & SID_DMA) && (cgd->inq_flags & SID_CmdQue))
softc->flags |= ADA_FLAG_CAN_NCQ;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ;
if ((cgd->ident_data.support_dsm & ATA_SUPPORT_DSM_TRIM) &&
(cgd->inq_flags & SID_DMA) &&
(softc->quirks & ADA_Q_NO_TRIM) == 0) {
softc->flags |= ADA_FLAG_CAN_TRIM;
softc->trim_max_ranges = TRIM_MAX_RANGES;
if (cgd->ident_data.max_dsm_blocks != 0) {
softc->trim_max_ranges =
min(cgd->ident_data.max_dsm_blocks *
ATA_DSM_BLK_RANGES, softc->trim_max_ranges);
}
/*
* If we can do RCVSND_FPDMA_QUEUED commands, we may be able
* to do NCQ trims, if we support trims at all. We also need
* support from the SIM to do things properly. Perhaps we
* should also check whether log 13 dword 0 bit 0 and dword 1 bit 0 are
* set...
*/
if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
(softc->flags & ADA_FLAG_PIM_ATA_EXT) != 0 &&
(cgd->ident_data.satacapabilities2 &
ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
(softc->flags & ADA_FLAG_CAN_TRIM) != 0)
softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
} else
softc->flags &= ~(ADA_FLAG_CAN_TRIM | ADA_FLAG_CAN_NCQ_TRIM);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_CFA)
softc->flags |= ADA_FLAG_CAN_CFA;
else
softc->flags &= ~ADA_FLAG_CAN_CFA;
/*
* Now that we've set the appropriate flags, setup the delete
* method.
*/
adasetdeletemethod(softc);
if ((cgd->ident_data.support.extension & ATA_SUPPORT_GENLOG)
&& ((softc->quirks & ADA_Q_LOG_BROKEN) == 0))
softc->flags |= ADA_FLAG_CAN_LOG;
else
softc->flags &= ~ADA_FLAG_CAN_LOG;
if ((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode = ADA_ZONE_HOST_AWARE;
else if (((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
|| (softc->quirks & ADA_Q_SMR_DM))
softc->zone_mode = ADA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = ADA_ZONE_NONE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_LOOKAHEAD)
softc->flags |= ADA_FLAG_CAN_RAHEAD;
else
softc->flags &= ~ADA_FLAG_CAN_RAHEAD;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_WRITECACHE)
softc->flags |= ADA_FLAG_CAN_WCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_WCACHE;
}
static cam_status
adaregister(struct cam_periph *periph, void *arg)
{
struct ada_softc *softc;
struct ccb_getdev *cgd;
struct disk_params *dp;
struct sbuf sb;
char *announce_buf;
caddr_t match;
int quirks;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("adaregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct ada_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
announce_buf = softc->announce_temp;
bzero(announce_buf, ADA_ANNOUNCETMP_SZ);
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
periph->softc = softc;
xpt_path_inq(&softc->cpi, periph->path);
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->ident_data,
(caddr_t)ada_quirk_table,
nitems(ada_quirk_table),
sizeof(*ada_quirk_table), ata_identify_match);
if (match != NULL)
softc->quirks = ((struct ada_quirk_entry *)match)->quirks;
else
softc->quirks = ADA_Q_NONE;
TASK_INIT(&softc->sysctl_task, 0, adasysctlinit, periph);
/*
* Register this media as a disk
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.quirks", periph->unit_number);
quirks = softc->quirks;
TUNABLE_INT_FETCH(announce_buf, &quirks);
softc->quirks = quirks;
softc->read_ahead = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.read_ahead", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->read_ahead);
softc->write_cache = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.write_cache", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
/*
* Set support flags based on the Identify data and quirks.
*/
adasetflags(softc, cgd);
if (softc->cpi.hba_misc & PIM_ATA_EXT)
softc->flags |= ADA_FLAG_PIM_ATA_EXT;
/* Disable queue sorting for non-rotational media by default. */
if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) {
softc->flags &= ~ADA_FLAG_ROTATING;
} else {
softc->flags |= ADA_FLAG_ROTATING;
}
cam_iosched_set_sort_queue(softc->cam_iosched,
(softc->flags & ADA_FLAG_ROTATING) ? -1 : 0);
softc->disk = disk_alloc();
adasetgeom(softc, cgd);
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, softc->params.secsize,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT |
XPORT_DEVSTAT_TYPE(softc->cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = adaopen;
softc->disk->d_close = adaclose;
softc->disk->d_strategy = adastrategy;
softc->disk->d_getattr = adagetattr;
softc->disk->d_dump = adadump;
softc->disk->d_gone = adadiskgonecb;
softc->disk->d_name = "ada";
softc->disk->d_drv1 = periph;
softc->disk->d_unit = periph->unit_number;
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* adadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
dp = &softc->params;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
sbuf_new(&sb, softc->announce_buffer, ADA_ANNOUNCE_SZ, SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, ADA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
if (cam_periph_acquire(periph) == 0)
taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
/*
* Add async callbacks for bus reset and
* bus device reset calls. I don't bother
* checking if this fails as, in most cases,
* the system will function just fine without
* them and the only alternative would be to
* not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_GETDEV_CHANGED | AC_ADVINFO_CHANGED,
adaasync, periph, periph->path);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
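/*
* With the default 30 second timeout and ADA_ORDEREDTAG_INTERVAL of 4,
* adasendorderedtag() fires roughly every 7.5 seconds.
*/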
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD) {
softc->state = ADA_STATE_RAHEAD;
} else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE) {
softc->state = ADA_STATE_WCACHE;
} else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
softc->state = ADA_STATE_LOGDIR;
} else {
/*
* Nothing to probe, so we can just transition to the
* normal state.
*/
adaprobedone(periph, NULL);
return(CAM_REQ_CMP);
}
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
static int
ada_dsmtrim_req_create(struct ada_softc *softc, struct bio *bp, struct trim_request *req)
{
uint64_t lastlba = (uint64_t)-1, lbas = 0;
int c, lastcount = 0, off, ranges = 0;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
do {
uint64_t lba = bp->bio_pblkno;
int count = bp->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = min(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * ATA_DSM_RANGE_SIZE;
req->data[off + 6] = lastcount & 0xff;
req->data[off + 7] =
(lastcount >> 8) & 0xff;
count -= c;
lba += c;
lbas += c;
}
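/*
* Encode each (LBA, count) pair as an 8-byte DSM range entry: bytes 0-5
* hold the 48-bit starting LBA (little-endian), bytes 6-7 the 16-bit
* sector count, capped at ATA_DSM_RANGE_MAX per entry.
*/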
while (count > 0) {
c = min(count, ATA_DSM_RANGE_MAX);
off = ranges * ATA_DSM_RANGE_SIZE;
req->data[off + 0] = lba & 0xff;
req->data[off + 1] = (lba >> 8) & 0xff;
req->data[off + 2] = (lba >> 16) & 0xff;
req->data[off + 3] = (lba >> 24) & 0xff;
req->data[off + 4] = (lba >> 32) & 0xff;
req->data[off + 5] = (lba >> 40) & 0xff;
req->data[off + 6] = c & 0xff;
req->data[off + 7] = (c >> 8) & 0xff;
lba += c;
lbas += c;
count -= c;
lastcount = c;
ranges++;
/*
* It's the caller's responsibility to ensure the
* request will fit, so we don't need to check for
* overrun here.
*/
}
lastlba = lba;
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
bp = cam_iosched_next_trim(softc->cam_iosched);
if (bp == NULL)
break;
if (bp->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp);
break;
}
} while (1);
softc->trim_count++;
softc->trim_ranges += ranges;
softc->trim_lbas += lbas;
return (ranges);
}
static void
ada_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_48bit_cmd(ataio, ATA_DATA_SET_MANAGEMENT,
ATA_DSM_TRIM, 0, howmany(ranges, ATA_DSM_BLK_RANGES));
}
static void
ada_ncq_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_ncq_cmd(ataio,
ATA_SEND_FPDMA_QUEUED,
0,
howmany(ranges, ATA_DSM_BLK_RANGES));
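/*
* SEND FPDMA QUEUED carries the DSM subcommand in the expanded sector
* count field; bit 0 of the AUX field selects the TRIM function.
*/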
ataio->cmd.sector_count_exp = ATA_SFPDMA_DSM;
ataio->ata_flags |= ATA_FLAG_AUX;
ataio->aux = 1;
}
static void
ada_cfaerase(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
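/* In 28-bit ATA commands a sector count of 0 means 256 sectors. */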
if (count >= 256)
count = 0;
ata_28bit_cmd(ataio, ATA_CFA_ERASE, 0, lba, count);
}
static int
ada_zone_bio_to_ata(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ATA_ZM_OPEN_ZONE;
case DISK_ZONE_CLOSE:
return ATA_ZM_CLOSE_ZONE;
case DISK_ZONE_FINISH:
return ATA_ZM_FINISH_ZONE;
case DISK_ZONE_RWP:
return ATA_ZM_RWP;
}
return -1;
}
static int
ada_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct ada_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = ada_zone_bio_to_ata(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to ATA\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
ata_zac_mgmt_out(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*sector_count*/ 0,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*timeout*/ ada_default_timeout * 1000);
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
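/*
* Size the response buffer for the requested number of zone entries,
* clamped to the maximum transfer size the controller supports.
*/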
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_ATADA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
ata_zac_mgmt_in(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*timeout*/ ada_default_timeout * 1000);
/*
* For BIO_ZONE, this isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case ADA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case ADA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case ADA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & ADA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
static void
adastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
struct ccb_ataio *ataio = &start_ccb->ataio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastart\n"));
switch (softc->state) {
case ADA_STATE_NORMAL:
{
struct bio *bp;
u_int8_t tag_code;
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
xpt_release_ccb(start_ccb);
break;
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
softc->flags &= ~ADA_FLAG_NEED_OTAG;
softc->flags |= ADA_FLAG_WAS_OTAG;
tag_code = 0;
} else {
tag_code = 1;
}
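/*
 * A zero tag_code forces the read/write path below to issue a plain
 * (non-NCQ) command even on NCQ-capable drives; since a non-queued
 * command cannot be mixed with outstanding NCQ commands, this
 * effectively acts as an ordering barrier for BIO_ORDERED requests and
 * for the periodic ordered tag requested by adasendorderedtag().
 */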
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
void *data_ptr;
int rw_op;
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= ADA_FLAG_DIRTY;
rw_op = CAM_DIR_OUT;
} else {
rw_op = CAM_DIR_IN;
}
data_ptr = bp->bio_data;
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= CAM_DATA_BIO;
data_ptr = bp;
}
#ifdef CAM_TEST_FAILURE
int fail = 0;
/*
* Support the failure ioctls. If the command is a
* read, and there are pending forced read errors, or
* if a write and pending write errors, then fail this
* operation with EIO. This is useful for testing
* purposes. Also, support having every Nth read fail.
*
* This is a rather blunt tool.
*/
if (bp->bio_cmd == BIO_READ) {
if (softc->force_read_error) {
softc->force_read_error--;
fail = 1;
}
if (softc->periodic_read_error > 0) {
if (++softc->periodic_read_count >=
softc->periodic_read_error) {
softc->periodic_read_count = 0;
fail = 1;
}
}
} else {
if (softc->force_write_error) {
softc->force_write_error--;
fail = 1;
}
}
if (fail) {
biofinish(bp, NULL, EIO);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
#endif
KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
round_page(bp->bio_bcount + bp->bio_ma_offset) /
PAGE_SIZE == bp->bio_ma_n,
("Short bio %p", bp));
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
rw_op,
0,
data_ptr,
bp->bio_bcount,
ada_default_timeout*1000);
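/*
 * Pick the cheapest command form that can express this request: NCQ
 * when the drive supports it and no ordering barrier is needed;
 * otherwise a 48-bit command when the LBA range does not fit in 28 bits
 * or the transfer exceeds 256 sectors; otherwise a 28-bit command,
 * where a sector count of 256 is encoded as zero.
 */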
if ((softc->flags & ADA_FLAG_CAN_NCQ) && tag_code) {
if (bp->bio_cmd == BIO_READ) {
ata_ncq_cmd(ataio, ATA_READ_FPDMA_QUEUED,
lba, count);
} else {
ata_ncq_cmd(ataio, ATA_WRITE_FPDMA_QUEUED,
lba, count);
}
} else if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count > 256)) {
if (softc->flags & ADA_FLAG_CAN_DMA48) {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_DMA48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_DMA48,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_MUL48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_MUL48,
0, lba, count);
}
}
} else {
if (count == 256)
count = 0;
if (softc->flags & ADA_FLAG_CAN_DMA) {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_DMA,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_DMA,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_MUL,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_MUL,
0, lba, count);
}
}
}
break;
}
case BIO_DELETE:
switch (softc->delete_method) {
case ADA_DELETE_NCQ_DSM_TRIM:
ada_ncq_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_DSM_TRIM:
ada_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_CFA_ERASE:
ada_cfaerase(softc, bp, ataio);
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
goto out;
case BIO_FLUSH:
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(ataio, ATA_FLUSHCACHE, 0, 0, 0);
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = ada_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
return;
}
start_ccb->ccb_h.ccb_state = ADA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
out:
start_ccb->ccb_h.ccb_bp = bp;
softc->outstanding_cmds++;
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
/* May have more work to do, so ensure we stay scheduled */
adaschedule(periph);
break;
}
case ADA_STATE_RAHEAD:
case ADA_STATE_WCACHE:
{
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->state == ADA_STATE_RAHEAD) {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_RA ?
ATA_SF_ENAB_RCACHE : ATA_SF_DIS_RCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_RAHEAD;
} else {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_WC ?
ATA_SF_ENAB_WCACHE : ATA_SF_DIS_WCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_WCACHE;
}
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
xpt_action(start_ccb);
break;
}
case ADA_STATE_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
if ((softc->flags & ADA_FLAG_CAN_LOG) == 0) {
adaprobedone(periph, start_ccb);
break;
}
log_dir = malloc(sizeof(*log_dir), M_ATADA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
softc->state = ADA_STATE_NORMAL;
xpt_release_ccb(start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/sizeof(*log_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_LOGDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_IDDIR:
{
struct ata_identify_log_pages *id_dir;
id_dir = malloc(sizeof(*id_dir), M_ATADA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_IDDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_SUP_CAP:
{
struct ata_identify_log_sup_cap *sup_cap;
sup_cap = malloc(sizeof(*sup_cap), M_ATADA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_SUP_CAP;
xpt_action(start_ccb);
break;
}
case ADA_STATE_ZONE:
{
struct ata_zoned_info_log *ata_zone;
ata_zone = malloc(sizeof(*ata_zone), M_ATADA, M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_ZONE;
xpt_action(start_ccb);
break;
}
}
}
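/*
 * adastart()/adadone() walk a short probe ladder after attach: RAHEAD
 * and WCACHE apply the read-ahead and write-cache settings, then
 * LOGDIR, IDDIR, SUP_CAP and ZONE read the GP log directory, the
 * Identify Device Data log directory, the Supported Capabilities page
 * and the Zoned Device Information page to discover ZAC capabilities.
 * Every path eventually ends up here, in adaprobedone(), which returns
 * the periph to ADA_STATE_NORMAL and drops the hold or reference taken
 * for probing.
 */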
static void
adaprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
if (ccb != NULL)
xpt_release_ccb(ccb);
softc->state = ADA_STATE_NORMAL;
softc->flags |= ADA_FLAG_PROBED;
adaschedule(periph);
if ((softc->flags & ADA_FLAG_ANNOUNCED) == 0) {
softc->flags |= ADA_FLAG_ANNOUNCED;
cam_periph_unhold(periph);
} else {
cam_periph_release_locked(periph);
}
}
static void
adazonedone(struct cam_periph *periph, union ccb *ccb)
{
struct bio *bp;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->ataio.dxfer_len - ccb->ataio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
hdr = (struct scsi_report_zones_hdr *)ccb->ataio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
hdr_len = le32dec(hdr->length);
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = le64dec(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry = &rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
* definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length = le64dec(desc->zone_length);
entry->zone_start_lba = le64dec(desc->zone_start_lba);
entry->write_pointer_lba =
le64dec(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
/*
* Note that this residual is accurate from the user's
* standpoint, but the amount transferred isn't accurate
* from the standpoint of what actually came back from the
* drive.
*/
bp->bio_resid = bp->bio_bcount - (num_to_fill * sizeof(*entry));
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->ataio.data_ptr, M_ATADA);
}
static void
adadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct ada_softc *softc;
struct ccb_ataio *ataio;
struct cam_path *path;
uint32_t priority;
int state;
softc = (struct ada_softc *)periph->softc;
ataio = &done_ccb->ataio;
path = done_ccb->ccb_h.path;
priority = done_ccb->ccb_h.pinfo.priority;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("adadone\n"));
state = ataio->ccb_h.ccb_state & ADA_CCB_TYPE_MASK;
switch (state) {
case ADA_CCB_BUFFER_IO:
case ADA_CCB_TRIM:
{
struct bio *bp;
int error;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = adaerror(done_ccb, 0, 0);
if (error == ERESTART) {
/* A retry was scheduled, so just return. */
cam_periph_unlock(periph);
return;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
/*
* If we get an error on an NCQ DSM TRIM, fall back
* to a non-NCQ DSM TRIM forever. Please note that if
* CAN_NCQ_TRIM is set, CAN_TRIM is necessarily set too.
* However, for this one trim, we treat it as advisory
* and return success up the stack.
*/
if (state == ADA_CCB_TRIM &&
error != 0 &&
(softc->flags & ADA_FLAG_CAN_NCQ_TRIM) != 0) {
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
error = 0;
adasetdeletemethod(softc);
}
} else {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
error = 0;
}
bp->bio_error = error;
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
} else {
if (bp->bio_cmd == BIO_ZONE)
adazonedone(periph, done_ccb);
else if (state == ADA_CCB_TRIM)
bp->bio_resid = 0;
else
bp->bio_resid = ataio->resid;
if ((bp->bio_resid > 0)
&& (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
}
softc->outstanding_cmds--;
if (softc->outstanding_cmds == 0)
softc->flags |= ADA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive. Release the periph refcount taken in adastart()
* for each CCB.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
KASSERT(softc->refcount >= 1, ("adadone softc %p refcount %d", softc, softc->refcount));
softc->refcount--;
if (state == ADA_CCB_TRIM) {
TAILQ_HEAD(, bio) queue;
struct bio *bp1;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue);
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* trim_running set to 0 before the call above to allow
* other I/O to progress when many BIO_DELETE requests
* are pushed down. We set trim_running to 0 and call
* adaschedule again so that we don't stall if there are
* no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
adaschedule(periph);
cam_periph_unlock(periph);
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = error;
if (error != 0) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
adaschedule(periph);
cam_periph_unlock(periph);
biodone(bp);
}
return;
}
case ADA_CCB_RAHEAD:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the reference on the peripheral.
* The peripheral will only go away once the last reference
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_WCACHE;
xpt_schedule(periph, priority);
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
}
case ADA_CCB_WCACHE:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_LOGDIR;
xpt_schedule(periph, priority);
} else {
adaprobedone(periph, done_ccb);
}
return;
}
case ADA_CCB_LOGDIR:
{
int error;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_logdir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= ADA_FLAG_CAN_IDLOG;
} else {
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(ADA_FLAG_CAN_LOG |
ADA_FLAG_CAN_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_IDLOG)) {
softc->state = ADA_STATE_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_IDDIR: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(ADA_FLAG_CAN_SUPCAP |
ADA_FLAG_CAN_ZONE);
softc->valid_iddir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_iddir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages,entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) ==
ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries &&
i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |=
ADA_FLAG_CAN_SUPCAP;
else if (softc->ata_iddir.entries[i] ==
ATA_IDL_ZDI)
softc->flags |=
ADA_FLAG_CAN_ZONE;
if ((softc->flags &
ADA_FLAG_CAN_SUPCAP)
&& (softc->flags &
ADA_FLAG_CAN_ZONE))
break;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory lists
* a non-zero number of pages present for
* this log.
*/
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_SUPCAP)) {
softc->state = ADA_STATE_SUP_CAP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_SUP_CAP: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)
ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size =
__offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode =
ADA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
ADA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |=
ADA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~ADA_FLAG_CAN_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~ADA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_ZONE)) {
softc->state = ADA_STATE_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_ZONE: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
ADA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~ADA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar = le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~ADA_FLAG_CAN_ZONE;
softc->flags &= ~ADA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_DUMP:
/* No-op. We're polling */
return;
default:
break;
}
xpt_release_ccb(done_ccb);
}
static int
adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
#ifdef CAM_IO_STATS
struct ada_softc *softc;
struct cam_periph *periph;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct ada_softc *)periph->softc;
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
case CAM_ATA_STATUS_ERROR:
softc->errors++;
break;
default:
break;
}
#endif
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
adasetgeom(struct ada_softc *softc, struct ccb_getdev *cgd)
{
struct disk_params *dp = &softc->params;
u_int64_t lbasize48;
u_int32_t lbasize;
u_int maxio, d_flags;
dp->secsize = ata_logical_sector_size(&cgd->ident_data);
if ((cgd->ident_data.atavalid & ATA_FLAG_54_58) &&
cgd->ident_data.current_heads != 0 &&
cgd->ident_data.current_sectors != 0) {
dp->heads = cgd->ident_data.current_heads;
dp->secs_per_track = cgd->ident_data.current_sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = (u_int32_t)cgd->ident_data.current_size_1 |
((u_int32_t)cgd->ident_data.current_size_2 << 16);
} else {
dp->heads = cgd->ident_data.heads;
dp->secs_per_track = cgd->ident_data.sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = cgd->ident_data.cylinders *
(u_int32_t)(dp->heads * dp->secs_per_track);
}
lbasize = (u_int32_t)cgd->ident_data.lba_size_1 |
((u_int32_t)cgd->ident_data.lba_size_2 << 16);
/* use the 28bit LBA size if valid or bigger than the CHS mapping */
if (cgd->ident_data.cylinders == 16383 || dp->sectors < lbasize)
dp->sectors = lbasize;
/* use the 48bit LBA size if valid */
lbasize48 = ((u_int64_t)cgd->ident_data.lba_size48_1) |
((u_int64_t)cgd->ident_data.lba_size48_2 << 16) |
((u_int64_t)cgd->ident_data.lba_size48_3 << 32) |
((u_int64_t)cgd->ident_data.lba_size48_4 << 48);
if ((cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) &&
lbasize48 > ATA_MAX_28BIT_LBA)
dp->sectors = lbasize48;
maxio = softc->cpi.maxio; /* Honor max I/O size of SIM */
if (maxio == 0)
maxio = DFLTPHYS; /* traditional default */
- else if (maxio > MAXPHYS)
- maxio = MAXPHYS; /* for safety */
+ else if (maxio > maxphys)
+ maxio = maxphys; /* for safety */
if (softc->flags & ADA_FLAG_CAN_48BIT)
maxio = min(maxio, 65536 * softc->params.secsize);
else /* 28bit ATA command limit */
maxio = min(maxio, 256 * softc->params.secsize);
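/*
 * For example, with 512-byte sectors the 48-bit limit above allows
 * 65536 * 512 = 32 MiB per command, while the 28-bit limit allows
 * 256 * 512 = 128 KiB.
 */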
if (softc->quirks & ADA_Q_128KB)
maxio = min(maxio, 128 * 1024);
softc->disk->d_maxsize = maxio;
d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE)
d_flags |= DISKFLAG_CANFLUSHCACHE;
if (softc->flags & ADA_FLAG_CAN_TRIM) {
d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = softc->params.secsize *
ATA_DSM_RANGE_MAX * softc->trim_max_ranges;
} else if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT)) {
d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = 256 * softc->params.secsize;
} else
softc->disk->d_delmaxsize = maxio;
if ((softc->cpi.hba_misc & PIM_UNMAPPED) != 0) {
d_flags |= DISKFLAG_UNMAPPED_BIO;
softc->flags |= ADA_FLAG_UNMAPPEDIO;
}
softc->disk->d_flags = d_flags;
strlcpy(softc->disk->d_descr, cgd->ident_data.model,
MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model)));
strlcpy(softc->disk->d_ident, cgd->ident_data.serial,
MIN(sizeof(softc->disk->d_ident), sizeof(cgd->ident_data.serial)));
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = (off_t)softc->params.sectors *
softc->params.secsize;
if (ata_physical_sector_size(&cgd->ident_data) !=
softc->params.secsize) {
softc->disk->d_stripesize =
ata_physical_sector_size(&cgd->ident_data);
softc->disk->d_stripeoffset = (softc->disk->d_stripesize -
ata_logical_sector_offset(&cgd->ident_data)) %
softc->disk->d_stripesize;
} else if (softc->quirks & ADA_Q_4K) {
softc->disk->d_stripesize = 4096;
softc->disk->d_stripeoffset = 0;
}
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
ata_disk_firmware_geom_adjust(softc->disk);
softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
snprintf(softc->disk->d_attachment, sizeof(softc->disk->d_attachment),
"%s%d", softc->cpi.dev_name, softc->cpi.unit_number);
}
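/*
 * Periodic callout: if commands were outstanding for an entire interval
 * without an ordered tag having been sent, set ADA_FLAG_NEED_OTAG so
 * that adastart() issues the next command non-queued, draining the NCQ
 * queue and bounding how long requests can be reordered.
 */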
static void
adasendorderedtag(void *arg)
{
struct ada_softc *softc = arg;
if (ada_send_ordered) {
if (softc->outstanding_cmds > 0) {
if ((softc->flags & ADA_FLAG_WAS_OTAG) == 0)
softc->flags |= ADA_FLAG_NEED_OTAG;
softc->flags &= ~ADA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
}
/*
* Step through all ADA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
adaflush(void)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
softc = (struct ada_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & ADA_FLAG_OPEN)) {
adadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & ADA_FLAG_OPEN) == 0) ||
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) == 0) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
0,
NULL,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
static void
adaspindown(uint8_t cmd, int flags)
{
struct cam_periph *periph;
struct ada_softc *softc;
struct ccb_ataio local_ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
/* If we panicked with the lock held, do not recurse here. */
if (cam_periph_owned(periph))
continue;
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "spin-down\n");
memset(&local_ccb, 0, sizeof(local_ccb));
xpt_setup_ccb(&local_ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
local_ccb.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&local_ccb,
0,
NULL,
CAM_DIR_NONE | flags,
0,
NULL,
0,
ada_default_timeout*1000);
ata_28bit_cmd(&local_ccb, cmd, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&local_ccb, adaerror,
/*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Spin-down disk failed\n");
cam_periph_unlock(periph);
}
}
static void
adashutdown(void *arg, int howto)
{
int how;
adaflush();
/*
* STANDBY IMMEDIATE saves any volatile data to the drive. It also spins
* down hard drives. IDLE IMMEDIATE also saves the volatile data without
* a spindown. We send the former when we expect to lose power soon. For
* a warm boot, we send the latter to avoid a thundering herd of spinups
* just after the kernel loads while probing. We have to do something to
* flush the data because the BIOS in many systems resets the HBA
* causing a COMINIT/COMRESET negotiation, which some drives interpret
* as license to toss the volatile data, and which others count as an
* unclean shutdown when in the Active PM state in their SMART attributes.
*
* adaspindown will ensure that we don't send this to a drive that
* doesn't support it.
*/
if (ada_spindown_shutdown != 0) {
how = (howto & (RB_HALT | RB_POWEROFF | RB_POWERCYCLE)) ?
ATA_STANDBY_IMMEDIATE : ATA_IDLE_IMMEDIATE;
adaspindown(how, 0);
}
}
static void
adasuspend(void *arg)
{
adaflush();
/*
* SLEEP also flushes any volatile data, like STANDBY IMMEDIATE,
* so we don't need to send it as well.
*/
if (ada_spindown_suspend != 0)
adaspindown(ATA_SLEEP, CAM_DEV_QFREEZE);
}
static void
adaresume(void *arg)
{
struct cam_periph *periph;
struct ada_softc *softc;
if (ada_spindown_suspend == 0)
return;
CAM_PERIPH_FOREACH(periph, &adadriver) {
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "resume\n");
/*
* Drop freeze taken due to CAM_DEV_QFREEZE flag set on
* sleep request.
*/
cam_release_devq(periph->path,
/*relsim_flags*/0,
/*openings*/0,
/*timeout*/0,
/*getcount_only*/0);
cam_periph_unlock(periph);
}
}
#endif /* _KERNEL */
diff --git a/sys/cam/cam_compat.c b/sys/cam/cam_compat.c
index 07aba7d357db..4c89072fa389 100644
--- a/sys/cam/cam_compat.c
+++ b/sys/cam/cam_compat.c
@@ -1,422 +1,422 @@
/*-
* CAM ioctl compatibility shims
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2013 Scott Long
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_compat.h>
#include <cam/cam_periph.h>
#include <cam/scsi/scsi_pass.h>
#include "opt_cam.h"
static int cam_compat_handle_0x17(struct cdev *dev, u_long cmd, caddr_t addr,
int flag, struct thread *td, d_ioctl_t *cbfnp);
static int cam_compat_handle_0x18(struct cdev *dev, u_long cmd, caddr_t addr,
int flag, struct thread *td, d_ioctl_t *cbfnp);
static int cam_compat_translate_dev_match_0x18(union ccb *ccb);
int
cam_compat_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
struct thread *td, d_ioctl_t *cbfnp)
{
int error;
switch (cmd) {
case CAMIOCOMMAND_0x16:
{
struct ccb_hdr_0x17 *hdr17;
hdr17 = (struct ccb_hdr_0x17 *)addr;
if (hdr17->flags & CAM_SG_LIST_PHYS_0x16) {
hdr17->flags &= ~CAM_SG_LIST_PHYS_0x16;
hdr17->flags |= CAM_DATA_SG_PADDR;
}
if (hdr17->flags & CAM_DATA_PHYS_0x16) {
hdr17->flags &= ~CAM_DATA_PHYS_0x16;
hdr17->flags |= CAM_DATA_PADDR;
}
if (hdr17->flags & CAM_SCATTER_VALID_0x16) {
hdr17->flags &= ~CAM_SCATTER_VALID_0x16;
hdr17->flags |= CAM_DATA_SG;
}
cmd = CAMIOCOMMAND;
error = cam_compat_handle_0x17(dev, cmd, addr, flag, td, cbfnp);
break;
}
case CAMGETPASSTHRU_0x16:
cmd = CAMGETPASSTHRU;
error = cam_compat_handle_0x17(dev, cmd, addr, flag, td, cbfnp);
break;
case CAMIOCOMMAND_0x17:
cmd = CAMIOCOMMAND;
error = cam_compat_handle_0x17(dev, cmd, addr, flag, td, cbfnp);
break;
case CAMGETPASSTHRU_0x17:
cmd = CAMGETPASSTHRU;
error = cam_compat_handle_0x17(dev, cmd, addr, flag, td, cbfnp);
break;
case CAMIOCOMMAND_0x18:
cmd = CAMIOCOMMAND;
error = cam_compat_handle_0x18(dev, cmd, addr, flag, td, cbfnp);
break;
case CAMGETPASSTHRU_0x18:
cmd = CAMGETPASSTHRU;
error = cam_compat_handle_0x18(dev, cmd, addr, flag, td, cbfnp);
break;
default:
error = ENOTTY;
}
return (error);
}
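/*
 * Handle an ioctl issued against the 0x17-era ABI: copy the old header
 * field by field into a freshly allocated native CCB, run the native
 * ioctl handler, then copy the results back, with special handling for
 * XPT_SET_TRAN_SETTINGS on the way in and for XPT_PATH_INQ,
 * XPT_GET_TRAN_SETTINGS and XPT_DEV_MATCH on the way out, whose payload
 * layouts changed between ABI versions.
 */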
static int
cam_compat_handle_0x17(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
struct thread *td, d_ioctl_t *cbfnp)
{
union ccb *ccb;
struct ccb_hdr *hdr;
struct ccb_hdr_0x17 *hdr17;
uint8_t *ccbb, *ccbb17;
u_int error;
hdr17 = (struct ccb_hdr_0x17 *)addr;
ccb = xpt_alloc_ccb();
hdr = &ccb->ccb_h;
hdr->pinfo = hdr17->pinfo;
hdr->xpt_links = hdr17->xpt_links;
hdr->sim_links = hdr17->sim_links;
hdr->periph_links = hdr17->periph_links;
hdr->retry_count = hdr17->retry_count;
hdr->cbfcnp = hdr17->cbfcnp;
hdr->func_code = hdr17->func_code;
hdr->status = hdr17->status;
hdr->path = hdr17->path;
hdr->path_id = hdr17->path_id;
hdr->target_id = hdr17->target_id;
hdr->target_lun = hdr17->target_lun;
hdr->flags = hdr17->flags;
hdr->xflags = 0;
hdr->periph_priv = hdr17->periph_priv;
hdr->sim_priv = hdr17->sim_priv;
hdr->timeout = hdr17->timeout;
hdr->softtimeout.tv_sec = 0;
hdr->softtimeout.tv_usec = 0;
ccbb = (uint8_t *)&hdr[1];
ccbb17 = (uint8_t *)&hdr17[1];
if (ccb->ccb_h.func_code == XPT_SET_TRAN_SETTINGS) {
struct ccb_trans_settings *cts;
struct ccb_trans_settings_0x17 *cts17;
cts = &ccb->cts;
cts17 = (struct ccb_trans_settings_0x17 *)hdr17;
cts->type = cts17->type;
cts->protocol = cts17->protocol;
cts->protocol_version = cts17->protocol_version;
cts->transport = cts17->transport;
cts->transport_version = cts17->transport_version;
bcopy(&cts17->proto_specific, &cts->proto_specific,
sizeof(cts17->proto_specific));
bcopy(&cts17->xport_specific, &cts->xport_specific,
sizeof(cts17->xport_specific));
} else {
bcopy(ccbb17, ccbb, CAM_0X17_DATA_LEN);
}
error = (cbfnp)(dev, cmd, (caddr_t)ccb, flag, td);
hdr17->pinfo = hdr->pinfo;
hdr17->xpt_links = hdr->xpt_links;
hdr17->sim_links = hdr->sim_links;
hdr17->periph_links = hdr->periph_links;
hdr17->retry_count = hdr->retry_count;
hdr17->cbfcnp = hdr->cbfcnp;
hdr17->func_code = hdr->func_code;
hdr17->status = hdr->status;
hdr17->path = hdr->path;
hdr17->path_id = hdr->path_id;
hdr17->target_id = hdr->target_id;
hdr17->target_lun = hdr->target_lun;
hdr17->flags = hdr->flags;
hdr17->periph_priv = hdr->periph_priv;
hdr17->sim_priv = hdr->sim_priv;
hdr17->timeout = hdr->timeout;
if (ccb->ccb_h.func_code == XPT_PATH_INQ) {
struct ccb_pathinq *cpi;
struct ccb_pathinq_0x17 *cpi17;
/* The PATH_INQ only needs special handling on the way out */
cpi = &ccb->cpi;
cpi17 = (struct ccb_pathinq_0x17 *)hdr17;
cpi17->version_num = cpi->version_num;
cpi17->hba_inquiry = cpi->hba_inquiry;
cpi17->target_sprt = (u_int8_t)cpi->target_sprt;
cpi17->hba_misc = (u_int8_t)cpi->hba_misc;
cpi17->hba_eng_cnt = cpi->hba_eng_cnt;
bcopy(&cpi->vuhba_flags[0], &cpi17->vuhba_flags[0], VUHBALEN);
cpi17->max_target = cpi->max_target;
cpi17->max_lun = cpi->max_lun;
cpi17->async_flags = cpi->async_flags;
cpi17->hpath_id = cpi->hpath_id;
cpi17->initiator_id = cpi->initiator_id;
bcopy(&cpi->sim_vid[0], &cpi17->sim_vid[0], SIM_IDLEN);
bcopy(&cpi->hba_vid[0], &cpi17->hba_vid[0], HBA_IDLEN);
bcopy(&cpi->dev_name[0], &cpi17->dev_name[0], DEV_IDLEN);
cpi17->unit_number = cpi->unit_number;
cpi17->bus_id = cpi->bus_id;
cpi17->base_transfer_speed = cpi->base_transfer_speed;
cpi17->protocol = cpi->protocol;
cpi17->protocol_version = cpi->protocol_version;
cpi17->transport = cpi->transport;
cpi17->transport_version = cpi->transport_version;
bcopy(&cpi->xport_specific, &cpi17->xport_specific,
PATHINQ_SETTINGS_SIZE);
cpi17->maxio = cpi->maxio;
cpi17->hba_vendor = cpi->hba_vendor;
cpi17->hba_device = cpi->hba_device;
cpi17->hba_subvendor = cpi->hba_subvendor;
cpi17->hba_subdevice = cpi->hba_subdevice;
} else if (ccb->ccb_h.func_code == XPT_GET_TRAN_SETTINGS) {
struct ccb_trans_settings *cts;
struct ccb_trans_settings_0x17 *cts17;
cts = &ccb->cts;
cts17 = (struct ccb_trans_settings_0x17 *)hdr17;
cts17->type = cts->type;
cts17->protocol = cts->protocol;
cts17->protocol_version = cts->protocol_version;
cts17->transport = cts->transport;
cts17->transport_version = cts->transport_version;
bcopy(&cts->proto_specific, &cts17->proto_specific,
sizeof(cts17->proto_specific));
bcopy(&cts->xport_specific, &cts17->xport_specific,
sizeof(cts17->xport_specific));
} else if (ccb->ccb_h.func_code == XPT_DEV_MATCH) {
/* Copy the rest of the header over */
bcopy(ccbb, ccbb17, CAM_0X17_DATA_LEN);
cam_compat_translate_dev_match_0x18(ccb);
} else {
bcopy(ccbb, ccbb17, CAM_0X17_DATA_LEN);
}
xpt_free_ccb(ccb);
return (error);
}
static int
cam_compat_handle_0x18(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
struct thread *td, d_ioctl_t *cbfnp)
{
union ccb *ccb;
struct ccb_hdr *hdr;
struct ccb_hdr_0x18 *hdr18;
uint8_t *ccbb, *ccbb18;
u_int error;
hdr18 = (struct ccb_hdr_0x18 *)addr;
ccb = xpt_alloc_ccb();
hdr = &ccb->ccb_h;
hdr->pinfo = hdr18->pinfo;
hdr->xpt_links = hdr18->xpt_links;
hdr->sim_links = hdr18->sim_links;
hdr->periph_links = hdr18->periph_links;
hdr->retry_count = hdr18->retry_count;
hdr->cbfcnp = hdr18->cbfcnp;
hdr->func_code = hdr18->func_code;
hdr->status = hdr18->status;
hdr->path = hdr18->path;
hdr->path_id = hdr18->path_id;
hdr->target_id = hdr18->target_id;
hdr->target_lun = hdr18->target_lun;
if (hdr18->xflags & CAM_EXTLUN_VALID_0x18)
hdr->target_lun = hdr18->ext_lun;
hdr->flags = hdr18->flags;
hdr->xflags = hdr18->xflags;
hdr->periph_priv = hdr18->periph_priv;
hdr->sim_priv = hdr18->sim_priv;
hdr->timeout = hdr18->timeout;
hdr->softtimeout.tv_sec = 0;
hdr->softtimeout.tv_usec = 0;
ccbb = (uint8_t *)&hdr[1];
ccbb18 = (uint8_t *)&hdr18[1];
if (ccb->ccb_h.func_code == XPT_SET_TRAN_SETTINGS) {
struct ccb_trans_settings *cts;
struct ccb_trans_settings_0x18 *cts18;
cts = &ccb->cts;
cts18 = (struct ccb_trans_settings_0x18 *)hdr18;
cts->type = cts18->type;
cts->protocol = cts18->protocol;
cts->protocol_version = cts18->protocol_version;
cts->transport = cts18->transport;
cts->transport_version = cts18->transport_version;
bcopy(&cts18->proto_specific, &cts->proto_specific,
sizeof(cts18->proto_specific));
bcopy(&cts18->xport_specific, &cts->xport_specific,
sizeof(cts18->xport_specific));
} else {
bcopy(ccbb18, ccbb, CAM_0X18_DATA_LEN);
}
error = (cbfnp)(dev, cmd, (caddr_t)ccb, flag, td);
hdr18->pinfo = hdr->pinfo;
hdr18->xpt_links = hdr->xpt_links;
hdr18->sim_links = hdr->sim_links;
hdr18->periph_links = hdr->periph_links;
hdr18->retry_count = hdr->retry_count;
hdr18->cbfcnp = hdr->cbfcnp;
hdr18->func_code = hdr->func_code;
hdr18->status = hdr->status;
hdr18->path = hdr->path;
hdr18->path_id = hdr->path_id;
hdr18->target_id = hdr->target_id;
hdr18->target_lun = hdr->target_lun;
hdr18->ext_lun = hdr->target_lun;
hdr18->flags = hdr->flags;
hdr18->xflags = hdr->xflags | CAM_EXTLUN_VALID_0x18;
hdr18->periph_priv = hdr->periph_priv;
hdr18->sim_priv = hdr->sim_priv;
hdr18->timeout = hdr->timeout;
if (ccb->ccb_h.func_code == XPT_GET_TRAN_SETTINGS) {
struct ccb_trans_settings *cts;
struct ccb_trans_settings_0x18 *cts18;
cts = &ccb->cts;
cts18 = (struct ccb_trans_settings_0x18 *)hdr18;
cts18->type = cts->type;
cts18->protocol = cts->protocol;
cts18->protocol_version = cts->protocol_version;
cts18->transport = cts->transport;
cts18->transport_version = cts->transport_version;
bcopy(&cts->proto_specific, &cts18->proto_specific,
sizeof(cts18->proto_specific));
bcopy(&cts->xport_specific, &cts18->xport_specific,
sizeof(cts18->xport_specific));
} else if (ccb->ccb_h.func_code == XPT_DEV_MATCH) {
bcopy(ccbb, ccbb18, CAM_0X18_DATA_LEN);
cam_compat_translate_dev_match_0x18(ccb);
} else {
bcopy(ccbb, ccbb18, CAM_0X18_DATA_LEN);
}
xpt_free_ccb(ccb);
return (error);
}
static int
cam_compat_translate_dev_match_0x18(union ccb *ccb)
{
struct dev_match_result *dm;
struct dev_match_result_0x18 *dm18;
struct cam_periph_map_info mapinfo;
int i;
/* Remap the CCB into kernel address space */
bzero(&mapinfo, sizeof(mapinfo));
- cam_periph_mapmem(ccb, &mapinfo, MAXPHYS);
+ cam_periph_mapmem(ccb, &mapinfo, maxphys);
dm = ccb->cdm.matches;
/* Translate in-place: old fields are smaller */
dm18 = (struct dev_match_result_0x18 *)(dm);
for (i = 0; i < ccb->cdm.num_matches; i++) {
dm18[i].type = dm[i].type;
switch (dm[i].type) {
case DEV_MATCH_PERIPH:
memcpy(&dm18[i].result.periph_result.periph_name,
&dm[i].result.periph_result.periph_name,
DEV_IDLEN);
dm18[i].result.periph_result.unit_number =
dm[i].result.periph_result.unit_number;
dm18[i].result.periph_result.path_id =
dm[i].result.periph_result.path_id;
dm18[i].result.periph_result.target_id =
dm[i].result.periph_result.target_id;
dm18[i].result.periph_result.target_lun =
dm[i].result.periph_result.target_lun;
break;
case DEV_MATCH_DEVICE:
dm18[i].result.device_result.path_id =
dm[i].result.device_result.path_id;
dm18[i].result.device_result.target_id =
dm[i].result.device_result.target_id;
dm18[i].result.device_result.target_lun =
dm[i].result.device_result.target_lun;
dm18[i].result.device_result.protocol =
dm[i].result.device_result.protocol;
memcpy(&dm18[i].result.device_result.inq_data,
&dm[i].result.device_result.inq_data,
sizeof(struct scsi_inquiry_data));
memcpy(&dm18[i].result.device_result.ident_data,
&dm[i].result.device_result.ident_data,
sizeof(struct ata_params));
dm18[i].result.device_result.flags =
dm[i].result.device_result.flags;
break;
case DEV_MATCH_BUS:
memcpy(&dm18[i].result.bus_result,
&dm[i].result.bus_result,
sizeof(struct bus_match_result));
break;
}
}
cam_periph_unmapmem(ccb, &mapinfo);
return (0);
}
diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c
index cc4a6d41ae76..aaa9bc3eada7 100644
--- a/sys/cam/cam_periph.c
+++ b/sys/cam/cam_periph.c
@@ -1,2167 +1,2167 @@
/*-
* Common functions for CAM "type" (peripheral) drivers.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, 1998 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/devctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/devicestat.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_queue.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_pass.h>
static u_int camperiphnextunit(struct periph_driver *p_drv,
u_int newunit, int wired,
path_id_t pathid, target_id_t target,
lun_id_t lun);
static u_int camperiphunit(struct periph_driver *p_drv,
path_id_t pathid, target_id_t target,
lun_id_t lun);
static void camperiphdone(struct cam_periph *periph,
union ccb *done_ccb);
static void camperiphfree(struct cam_periph *periph);
static int camperiphscsistatuserror(union ccb *ccb,
union ccb **orig_ccb,
cam_flags camflags,
u_int32_t sense_flags,
int *openings,
u_int32_t *relsim_flags,
u_int32_t *timeout,
u_int32_t *action,
const char **action_string);
static int camperiphscsisenseerror(union ccb *ccb,
union ccb **orig_ccb,
cam_flags camflags,
u_int32_t sense_flags,
int *openings,
u_int32_t *relsim_flags,
u_int32_t *timeout,
u_int32_t *action,
const char **action_string);
static void cam_periph_devctl_notify(union ccb *ccb);
static int nperiph_drivers;
static int initialized = 0;
struct periph_driver **periph_drivers;
static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers");
static int periph_selto_delay = 1000;
TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay);
static int periph_noresrc_delay = 500;
TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay);
static int periph_busy_delay = 500;
TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay);
static u_int periph_mapmem_thresh = 65536;
SYSCTL_UINT(_kern_cam, OID_AUTO, mapmem_thresh, CTLFLAG_RWTUN,
&periph_mapmem_thresh, 0, "Threshold for user-space buffer mapping");
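/*
 * Register a peripheral driver.  The drivers array is grown without
 * holding the bus lock across the allocation: a new array sized for the
 * current count plus this entry is allocated, the count is then
 * re-checked under xpt_lock_buses(), and if another registration won
 * the race the new array is freed and the whole operation retried.
 */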
void
periphdriver_register(void *data)
{
struct periph_driver *drv = (struct periph_driver *)data;
struct periph_driver **newdrivers, **old;
int ndrivers;
again:
ndrivers = nperiph_drivers + 2;
newdrivers = malloc(sizeof(*newdrivers) * ndrivers, M_CAMPERIPH,
M_WAITOK);
xpt_lock_buses();
if (ndrivers != nperiph_drivers + 2) {
/*
* Lost race against itself; go around.
*/
xpt_unlock_buses();
free(newdrivers, M_CAMPERIPH);
goto again;
}
if (periph_drivers)
bcopy(periph_drivers, newdrivers,
sizeof(*newdrivers) * nperiph_drivers);
newdrivers[nperiph_drivers] = drv;
newdrivers[nperiph_drivers + 1] = NULL;
old = periph_drivers;
periph_drivers = newdrivers;
nperiph_drivers++;
xpt_unlock_buses();
if (old)
free(old, M_CAMPERIPH);
/* If driver marked as early or it is late now, initialize it. */
if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
initialized > 1)
(*drv->init)();
}
int
periphdriver_unregister(void *data)
{
struct periph_driver *drv = (struct periph_driver *)data;
int error, n;
/* If driver marked as early or it is late now, deinitialize it. */
if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
initialized > 1) {
if (drv->deinit == NULL) {
printf("CAM periph driver '%s' doesn't have deinit.\n",
drv->driver_name);
return (EOPNOTSUPP);
}
error = drv->deinit();
if (error != 0)
return (error);
}
xpt_lock_buses();
for (n = 0; n < nperiph_drivers && periph_drivers[n] != drv; n++)
;
KASSERT(n < nperiph_drivers,
("Periph driver '%s' was not registered", drv->driver_name));
for (; n + 1 < nperiph_drivers; n++)
periph_drivers[n] = periph_drivers[n + 1];
periph_drivers[n + 1] = NULL;
nperiph_drivers--;
xpt_unlock_buses();
return (0);
}
void
periphdriver_init(int level)
{
int i, early;
initialized = max(initialized, level);
for (i = 0; periph_drivers[i] != NULL; i++) {
early = (periph_drivers[i]->flags & CAM_PERIPH_DRV_EARLY) ? 1 : 2;
if (early == initialized)
(*periph_drivers[i]->init)();
}
}
cam_status
cam_periph_alloc(periph_ctor_t *periph_ctor,
periph_oninv_t *periph_oninvalidate,
periph_dtor_t *periph_dtor, periph_start_t *periph_start,
char *name, cam_periph_type type, struct cam_path *path,
ac_callback_t *ac_callback, ac_code code, void *arg)
{
struct periph_driver **p_drv;
struct cam_sim *sim;
struct cam_periph *periph;
struct cam_periph *cur_periph;
path_id_t path_id;
target_id_t target_id;
lun_id_t lun_id;
cam_status status;
u_int init_level;
init_level = 0;
/*
* Handle Hot-Plug scenarios. If there is already a peripheral
* of our type assigned to this path, we are likely waiting for
* final close on an old, invalidated, peripheral. If this is
* the case, queue up a deferred call to the peripheral's async
* handler. If it looks like a mistaken re-allocation, complain.
*/
if ((periph = cam_periph_find(path, name)) != NULL) {
if ((periph->flags & CAM_PERIPH_INVALID) != 0
&& (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) {
periph->flags |= CAM_PERIPH_NEW_DEV_FOUND;
periph->deferred_callback = ac_callback;
periph->deferred_ac = code;
return (CAM_REQ_INPROG);
} else {
printf("cam_periph_alloc: attempt to re-allocate "
"valid device %s%d rejected flags %#x "
"refcount %d\n", periph->periph_name,
periph->unit_number, periph->flags,
periph->refcount);
}
return (CAM_REQ_INVALID);
}
periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH,
M_NOWAIT|M_ZERO);
if (periph == NULL)
return (CAM_RESRC_UNAVAIL);
init_level++;
sim = xpt_path_sim(path);
path_id = xpt_path_path_id(path);
target_id = xpt_path_target_id(path);
lun_id = xpt_path_lun_id(path);
periph->periph_start = periph_start;
periph->periph_dtor = periph_dtor;
periph->periph_oninval = periph_oninvalidate;
periph->type = type;
periph->periph_name = name;
periph->scheduled_priority = CAM_PRIORITY_NONE;
periph->immediate_priority = CAM_PRIORITY_NONE;
periph->refcount = 1; /* Dropped by invalidation. */
periph->sim = sim;
SLIST_INIT(&periph->ccb_list);
status = xpt_create_path(&path, periph, path_id, target_id, lun_id);
if (status != CAM_REQ_CMP)
goto failure;
periph->path = path;
xpt_lock_buses();
for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
if (strcmp((*p_drv)->driver_name, name) == 0)
break;
}
if (*p_drv == NULL) {
printf("cam_periph_alloc: invalid periph name '%s'\n", name);
xpt_unlock_buses();
xpt_free_path(periph->path);
free(periph, M_CAMPERIPH);
return (CAM_REQ_INVALID);
}
periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id);
cur_periph = TAILQ_FIRST(&(*p_drv)->units);
while (cur_periph != NULL
&& cur_periph->unit_number < periph->unit_number)
cur_periph = TAILQ_NEXT(cur_periph, unit_links);
if (cur_periph != NULL) {
KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list"));
TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links);
} else {
TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links);
(*p_drv)->generation++;
}
xpt_unlock_buses();
init_level++;
status = xpt_add_periph(periph);
if (status != CAM_REQ_CMP)
goto failure;
init_level++;
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n"));
status = periph_ctor(periph, arg);
if (status == CAM_REQ_CMP)
init_level++;
failure:
switch (init_level) {
case 4:
/* Initialized successfully */
break;
case 3:
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
xpt_remove_periph(periph);
/* FALLTHROUGH */
case 2:
xpt_lock_buses();
TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
xpt_unlock_buses();
xpt_free_path(periph->path);
/* FALLTHROUGH */
case 1:
free(periph, M_CAMPERIPH);
/* FALLTHROUGH */
case 0:
/* No cleanup to perform. */
break;
default:
panic("%s: Unknown init level", __func__);
}
return(status);
}
/*
* Find a peripheral structure with the specified path, target, lun,
* and (optionally) peripheral driver name. If the name is NULL, this
* function will return the first peripheral found that matches the
* specified path.
*/
struct cam_periph *
cam_periph_find(struct cam_path *path, char *name)
{
struct periph_driver **p_drv;
struct cam_periph *periph;
xpt_lock_buses();
for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0))
continue;
TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
if (xpt_path_comp(periph->path, path) == 0) {
xpt_unlock_buses();
cam_periph_assert(periph, MA_OWNED);
return(periph);
}
}
if (name != NULL) {
xpt_unlock_buses();
return(NULL);
}
}
xpt_unlock_buses();
return(NULL);
}
/*
* Find peripheral driver instances attached to the specified path.
*/
int
cam_periph_list(struct cam_path *path, struct sbuf *sb)
{
struct sbuf local_sb;
struct periph_driver **p_drv;
struct cam_periph *periph;
int count;
int sbuf_alloc_len;
sbuf_alloc_len = 16;
retry:
sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN);
count = 0;
xpt_lock_buses();
for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
if (xpt_path_comp(periph->path, path) != 0)
continue;
if (sbuf_len(&local_sb) != 0)
sbuf_cat(&local_sb, ",");
sbuf_printf(&local_sb, "%s%d", periph->periph_name,
periph->unit_number);
if (sbuf_error(&local_sb) == ENOMEM) {
sbuf_alloc_len *= 2;
xpt_unlock_buses();
sbuf_delete(&local_sb);
goto retry;
}
count++;
}
}
xpt_unlock_buses();
sbuf_finish(&local_sb);
if (sbuf_len(sb) != 0)
sbuf_cat(sb, ",");
sbuf_cat(sb, sbuf_data(&local_sb));
sbuf_delete(&local_sb);
return (count);
}
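/*
 * Reference counting: cam_periph_acquire() takes a new reference and fails
 * with ENOENT once the peripheral has been invalidated, while
 * cam_periph_doacquire() may only be used by callers that already hold a
 * reference. Dropping the last reference frees the peripheral via
 * camperiphfree().
 */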
int
cam_periph_acquire(struct cam_periph *periph)
{
int status;
if (periph == NULL)
return (EINVAL);
status = ENOENT;
xpt_lock_buses();
if ((periph->flags & CAM_PERIPH_INVALID) == 0) {
periph->refcount++;
status = 0;
}
xpt_unlock_buses();
return (status);
}
void
cam_periph_doacquire(struct cam_periph *periph)
{
xpt_lock_buses();
KASSERT(periph->refcount >= 1,
("cam_periph_doacquire() with refcount == %d", periph->refcount));
periph->refcount++;
xpt_unlock_buses();
}
void
cam_periph_release_locked_buses(struct cam_periph *periph)
{
cam_periph_assert(periph, MA_OWNED);
KASSERT(periph->refcount >= 1, ("periph->refcount >= 1"));
if (--periph->refcount == 0)
camperiphfree(periph);
}
void
cam_periph_release_locked(struct cam_periph *periph)
{
if (periph == NULL)
return;
xpt_lock_buses();
cam_periph_release_locked_buses(periph);
xpt_unlock_buses();
}
void
cam_periph_release(struct cam_periph *periph)
{
struct mtx *mtx;
if (periph == NULL)
return;
cam_periph_assert(periph, MA_NOTOWNED);
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
cam_periph_release_locked(periph);
mtx_unlock(mtx);
}
/*
* hold/unhold act as mutual exclusion for sections of the code that
* need to sleep and want to make sure that other sections that
* will interfere are held off. This only protects exclusive sections
* from each other.
*/
int
cam_periph_hold(struct cam_periph *periph, int priority)
{
int error;
/*
* Increment the reference count on the peripheral
* while we wait for our lock attempt to succeed
* to ensure the peripheral doesn't disappear out
* from under us while we sleep.
*/
if (cam_periph_acquire(periph) != 0)
return (ENXIO);
cam_periph_assert(periph, MA_OWNED);
while ((periph->flags & CAM_PERIPH_LOCKED) != 0) {
periph->flags |= CAM_PERIPH_LOCK_WANTED;
if ((error = cam_periph_sleep(periph, periph, priority,
"caplck", 0)) != 0) {
cam_periph_release_locked(periph);
return (error);
}
if (periph->flags & CAM_PERIPH_INVALID) {
cam_periph_release_locked(periph);
return (ENXIO);
}
}
periph->flags |= CAM_PERIPH_LOCKED;
return (0);
}
void
cam_periph_unhold(struct cam_periph *periph)
{
cam_periph_assert(periph, MA_OWNED);
periph->flags &= ~CAM_PERIPH_LOCKED;
if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) {
periph->flags &= ~CAM_PERIPH_LOCK_WANTED;
wakeup(periph);
}
cam_periph_release_locked(periph);
}
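/*
 * Hypothetical usage sketch (added for illustration; not part of the original
 * file): a peripheral method that must sleep while keeping other exclusive
 * sections out would typically pair the calls like this:
 *
 *	cam_periph_lock(periph);
 *	if ((error = cam_periph_hold(periph, PRIBIO)) != 0) {
 *		cam_periph_unlock(periph);
 *		return (error);
 *	}
 *	... exclusive, possibly sleeping work ...
 *	cam_periph_unhold(periph);
 *	cam_periph_unlock(periph);
 */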
/*
* Look for the next unit number that is not currently in use for this
* peripheral type starting at "newunit". Also exclude unit numbers that
* are reserved for future "hardwiring" unless we already know that this
* is a potential wired device. Only assume that the device is "wired" the
* first time through the loop since after that we'll be looking at unit
* numbers that did not match a wiring entry.
*/
static u_int
camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired,
path_id_t pathid, target_id_t target, lun_id_t lun)
{
struct cam_periph *periph;
char *periph_name;
int i, val, dunit, r;
const char *dname, *strval;
periph_name = p_drv->driver_name;
for (;;newunit++) {
for (periph = TAILQ_FIRST(&p_drv->units);
periph != NULL && periph->unit_number != newunit;
periph = TAILQ_NEXT(periph, unit_links))
;
if (periph != NULL && periph->unit_number == newunit) {
if (wired != 0) {
xpt_print(periph->path, "Duplicate Wired "
"Device entry!\n");
xpt_print(periph->path, "Second device (%s "
"device at scbus%d target %d lun %d) will "
"not be wired\n", periph_name, pathid,
target, lun);
wired = 0;
}
continue;
}
if (wired)
break;
/*
* Don't match entries like "da 4" as a wired down
* device, but do match entries like "da 4 target 5"
* or even "da 4 scbus 1".
*/
i = 0;
dname = periph_name;
for (;;) {
r = resource_find_dev(&i, dname, &dunit, NULL, NULL);
if (r != 0)
break;
/* if no "target" and no specific scbus, skip */
if (resource_int_value(dname, dunit, "target", &val) &&
(resource_string_value(dname, dunit, "at",&strval)||
strcmp(strval, "scbus") == 0))
continue;
if (newunit == dunit)
break;
}
if (r != 0)
break;
}
return (newunit);
}
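/*
 * Select a unit number for a new instance of this driver. Wiring is driven
 * by hints read through resource_*_value(); for illustration only (the exact
 * names depend on the driver being wired), entries of the form
 *
 *	hint.da.4.at="scbus1"
 *	hint.da.4.target="5"
 *	hint.da.4.lun="0"
 *
 * would reserve unit 4 of the "da" driver for the matching path. If no hint
 * matches, the next free unit found by camperiphnextunit() is used.
 */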
static u_int
camperiphunit(struct periph_driver *p_drv, path_id_t pathid,
target_id_t target, lun_id_t lun)
{
u_int unit;
int wired, i, val, dunit;
const char *dname, *strval;
char pathbuf[32], *periph_name;
periph_name = p_drv->driver_name;
snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid);
unit = 0;
i = 0;
dname = periph_name;
for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0;
wired = 0) {
if (resource_string_value(dname, dunit, "at", &strval) == 0) {
if (strcmp(strval, pathbuf) != 0)
continue;
wired++;
}
if (resource_int_value(dname, dunit, "target", &val) == 0) {
if (val != target)
continue;
wired++;
}
if (resource_int_value(dname, dunit, "lun", &val) == 0) {
if (val != lun)
continue;
wired++;
}
if (wired != 0) {
unit = dunit;
break;
}
}
/*
* Either start from 0 looking for the next unit or from
* the unit number given in the resource config. This way,
* if we have wildcard matches, we don't return the same
* unit number twice.
*/
unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun);
return (unit);
}
void
cam_periph_invalidate(struct cam_periph *periph)
{
cam_periph_assert(periph, MA_OWNED);
/*
* We only tear down the device the first time a peripheral is
* invalidated.
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0)
return;
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n"));
if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting) {
struct sbuf sb;
char buffer[160];
sbuf_new(&sb, buffer, 160, SBUF_FIXEDLEN);
xpt_denounce_periph_sbuf(periph, &sb);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
}
periph->flags |= CAM_PERIPH_INVALID;
periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND;
if (periph->periph_oninval != NULL)
periph->periph_oninval(periph);
cam_periph_release_locked(periph);
}
static void
camperiphfree(struct cam_periph *periph)
{
struct periph_driver **p_drv;
struct periph_driver *drv;
cam_periph_assert(periph, MA_OWNED);
KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating",
periph->periph_name, periph->unit_number));
for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0)
break;
}
if (*p_drv == NULL) {
printf("camperiphfree: attempt to free non-existant periph\n");
return;
}
/*
* Cache a pointer to the periph_driver structure. If a
* periph_driver is added or removed from the array (see
* periphdriver_register()) while we drop the topology lock
* below, p_drv may change. This doesn't protect against this
* particular periph_driver going away. That will require full
* reference counting in the periph_driver infrastructure.
*/
drv = *p_drv;
/*
* We need to set this flag before dropping the topology lock, to
* let anyone who is traversing the list know that this peripheral is
* about to be freed, and that there will be no more reference count
* checks.
*/
periph->flags |= CAM_PERIPH_FREE;
/*
* The peripheral destructor semantics dictate calling with only the
* SIM mutex held. Since it might sleep, it should not be called
* with the topology lock held.
*/
xpt_unlock_buses();
/*
* We need to call the peripheral destructor prior to removing the
* peripheral from the list. Otherwise, we risk running into a
* scenario where the peripheral unit number may get reused
* (because it has been removed from the list), but some resources
* used by the peripheral are still hanging around. In particular,
* the devfs nodes used by some peripherals like the pass(4) driver
* aren't fully cleaned up until the destructor is run. If the
* unit number is reused before the devfs instance is fully gone,
* devfs will panic.
*/
if (periph->periph_dtor != NULL)
periph->periph_dtor(periph);
/*
* The peripheral list is protected by the topology lock. We have to
* remove the periph from the drv list before we call deferred_ac. The
* AC_FOUND_DEVICE callback won't create a new periph if it's still there.
*/
xpt_lock_buses();
TAILQ_REMOVE(&drv->units, periph, unit_links);
drv->generation++;
xpt_remove_periph(periph);
xpt_unlock_buses();
if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
xpt_print(periph->path, "Periph destroyed\n");
else
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
union ccb ccb;
void *arg;
switch (periph->deferred_ac) {
case AC_FOUND_DEVICE:
ccb.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
xpt_action(&ccb);
arg = &ccb;
break;
case AC_PATH_REGISTERED:
xpt_path_inq(&ccb.cpi, periph->path);
arg = &ccb;
break;
default:
arg = NULL;
break;
}
periph->deferred_callback(NULL, periph->deferred_ac,
periph->path, arg);
}
xpt_free_path(periph->path);
free(periph, M_CAMPERIPH);
xpt_lock_buses();
}
/*
* Map user virtual pointers into kernel virtual address space, so we can
* access the memory. This is now a generic function that centralizes most
* of the sanity checks on the data flags, if any.
- * This also only works for up to MAXPHYS memory. Since we use
+ * This also only works for up to maxphys memory. Since we use
* buffers to map stuff in and out, we're limited to the buffer size.
*/
int
cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo,
u_int maxmap)
{
int numbufs, i;
u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
u_int32_t lengths[CAM_PERIPH_MAXMAPS];
u_int32_t dirs[CAM_PERIPH_MAXMAPS];
bool misaligned[CAM_PERIPH_MAXMAPS];
bzero(mapinfo, sizeof(*mapinfo));
if (maxmap == 0)
maxmap = DFLTPHYS; /* traditional default */
- else if (maxmap > MAXPHYS)
- maxmap = MAXPHYS; /* for safety */
+ else if (maxmap > maxphys)
+ maxmap = maxphys; /* for safety */
switch(ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
if (ccb->cdm.match_buf_len == 0) {
printf("cam_periph_mapmem: invalid match buffer "
"length 0\n");
return(EINVAL);
}
if (ccb->cdm.pattern_buf_len > 0) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
lengths[0] = ccb->cdm.pattern_buf_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
lengths[1] = ccb->cdm.match_buf_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
lengths[0] = ccb->cdm.match_buf_len;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
}
/*
* This request will not go to the hardware, no reason
- * to be so strict. vmapbuf() is able to map up to MAXPHYS.
+ * to be so strict. vmapbuf() is able to map up to maxphys.
*/
- maxmap = MAXPHYS;
+ maxmap = maxphys;
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
data_ptrs[0] = &ccb->csio.data_ptr;
lengths[0] = ccb->csio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_ATA_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
data_ptrs[0] = &ccb->ataio.data_ptr;
lengths[0] = ccb->ataio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_MMC_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/* Two mappings: one for cmd->data and one for cmd->data->data */
data_ptrs[0] = (unsigned char **)&ccb->mmcio.cmd.data;
lengths[0] = sizeof(struct mmc_data *);
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
data_ptrs[1] = (unsigned char **)&ccb->mmcio.cmd.data->data;
lengths[1] = ccb->mmcio.cmd.data->len;
dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 2;
break;
case XPT_SMP_IO:
data_ptrs[0] = &ccb->smpio.smp_request;
lengths[0] = ccb->smpio.smp_request_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = &ccb->smpio.smp_response;
lengths[1] = ccb->smpio.smp_response_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
break;
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return (0);
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
data_ptrs[0] = &ccb->nvmeio.data_ptr;
lengths[0] = ccb->nvmeio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_DEV_ADVINFO:
if (ccb->cdai.bufsiz == 0)
return (0);
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
lengths[0] = ccb->cdai.bufsiz;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
/*
* This request will not go to the hardware, no reason
- * to be so strict. vmapbuf() is able to map up to MAXPHYS.
+ * to be so strict. vmapbuf() is able to map up to maxphys.
*/
- maxmap = MAXPHYS;
+ maxmap = maxphys;
break;
default:
return(EINVAL);
break; /* NOTREACHED */
}
/*
* Check the transfer length and permissions first, so we don't
* have to unmap any previously mapped buffers.
*/
for (i = 0; i < numbufs; i++) {
if (lengths[i] > maxmap) {
printf("cam_periph_mapmem: attempt to map %lu bytes, "
"which is greater than %lu\n",
(u_long)(lengths[i]), (u_long)maxmap);
return (E2BIG);
}
/*
* The userland data pointer passed in may not be page
* aligned. vmapbuf() truncates the address to a page
* boundary, so if the address isn't page aligned, we'll
* need enough space for the given transfer length, plus
* whatever extra space is necessary to make it to the page
* boundary.
*/
misaligned[i] = (lengths[i] +
- (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK) > MAXPHYS);
+ (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK) > maxphys);
}
/*
* This keeps the kernel stack of current thread from getting
* swapped. In low-memory situations where the kernel stack might
* otherwise get swapped out, this holds it and allows the thread
* to make progress and release the kernel mapped pages sooner.
*
* XXX KDM should I use P_NOSWAP instead?
*/
PHOLD(curproc);
for (i = 0; i < numbufs; i++) {
/* Save the user's data address. */
mapinfo->orig[i] = *data_ptrs[i];
/*
* For small buffers use malloc+copyin/copyout instead of
* mapping to KVA to avoid expensive TLB shootdowns. For
* small allocations malloc is backed by UMA, and so is much
* cheaper on SMP systems.
*/
if ((lengths[i] <= periph_mapmem_thresh || misaligned[i]) &&
ccb->ccb_h.func_code != XPT_MMC_IO) {
*data_ptrs[i] = malloc(lengths[i], M_CAMPERIPH,
M_WAITOK);
if (dirs[i] != CAM_DIR_IN) {
if (copyin(mapinfo->orig[i], *data_ptrs[i],
lengths[i]) != 0) {
free(*data_ptrs[i], M_CAMPERIPH);
*data_ptrs[i] = mapinfo->orig[i];
goto fail;
}
} else
bzero(*data_ptrs[i], lengths[i]);
continue;
}
/*
* Get the buffer.
*/
mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK);
/* set the direction */
mapinfo->bp[i]->b_iocmd = (dirs[i] == CAM_DIR_OUT) ?
BIO_WRITE : BIO_READ;
/* Map the buffer into kernel memory. */
if (vmapbuf(mapinfo->bp[i], *data_ptrs[i], lengths[i], 1) < 0) {
uma_zfree(pbuf_zone, mapinfo->bp[i]);
goto fail;
}
/* set our pointer to the new mapped area */
*data_ptrs[i] = mapinfo->bp[i]->b_data;
}
/*
* Now that we've gotten this far, change ownership to the kernel
* of the buffers so that we don't run afoul of returning to user
* space with locks (on the buffer) held.
*/
for (i = 0; i < numbufs; i++) {
if (mapinfo->bp[i])
BUF_KERNPROC(mapinfo->bp[i]);
}
mapinfo->num_bufs_used = numbufs;
return(0);
fail:
for (i--; i >= 0; i--) {
if (mapinfo->bp[i]) {
vunmapbuf(mapinfo->bp[i]);
uma_zfree(pbuf_zone, mapinfo->bp[i]);
} else
free(*data_ptrs[i], M_CAMPERIPH);
*data_ptrs[i] = mapinfo->orig[i];
}
PRELE(curproc);
return(EACCES);
}
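/*
 * Usage sketch (illustrative only, not taken from this file): ioctl handlers
 * that accept a CCB carrying user-space data pointers typically bracket the
 * request with a map/unmap pair:
 *
 *	struct cam_periph_map_info mapinfo;
 *
 *	error = cam_periph_mapmem(ccb, &mapinfo, maxio);
 *	if (error == 0) {
 *		error = cam_periph_runccb(ccb, errfunc, 0, 0, NULL);
 *		cam_periph_unmapmem(ccb, &mapinfo);
 *	}
 *
 * Here "maxio" and "errfunc" are placeholders for a SIM's maximum I/O size
 * and a driver-specific error routine.
 */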
/*
* Unmap memory segments mapped into kernel virtual address space by
* cam_periph_mapmem().
*/
void
cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
{
int numbufs, i;
u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
u_int32_t lengths[CAM_PERIPH_MAXMAPS];
u_int32_t dirs[CAM_PERIPH_MAXMAPS];
if (mapinfo->num_bufs_used <= 0) {
/* nothing to free and the process wasn't held. */
return;
}
switch (ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
if (ccb->cdm.pattern_buf_len > 0) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
lengths[0] = ccb->cdm.pattern_buf_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
lengths[1] = ccb->cdm.match_buf_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
lengths[0] = ccb->cdm.match_buf_len;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
}
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
data_ptrs[0] = &ccb->csio.data_ptr;
lengths[0] = ccb->csio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_ATA_IO:
data_ptrs[0] = &ccb->ataio.data_ptr;
lengths[0] = ccb->ataio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_MMC_IO:
data_ptrs[0] = (u_int8_t **)&ccb->mmcio.cmd.data;
lengths[0] = sizeof(struct mmc_data *);
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
data_ptrs[1] = (u_int8_t **)&ccb->mmcio.cmd.data->data;
lengths[1] = ccb->mmcio.cmd.data->len;
dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 2;
break;
case XPT_SMP_IO:
data_ptrs[0] = &ccb->smpio.smp_request;
lengths[0] = ccb->smpio.smp_request_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = &ccb->smpio.smp_response;
lengths[1] = ccb->smpio.smp_response_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
break;
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
data_ptrs[0] = &ccb->nvmeio.data_ptr;
lengths[0] = ccb->nvmeio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
break;
case XPT_DEV_ADVINFO:
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
lengths[0] = ccb->cdai.bufsiz;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
break;
default:
/* allow ourselves to be swapped once again */
PRELE(curproc);
return;
break; /* NOTREACHED */
}
for (i = 0; i < numbufs; i++) {
if (mapinfo->bp[i]) {
/* unmap the buffer */
vunmapbuf(mapinfo->bp[i]);
/* release the buffer */
uma_zfree(pbuf_zone, mapinfo->bp[i]);
} else {
if (dirs[i] != CAM_DIR_OUT) {
copyout(*data_ptrs[i], mapinfo->orig[i],
lengths[i]);
}
free(*data_ptrs[i], M_CAMPERIPH);
}
/* Set the user's pointer back to the original value */
*data_ptrs[i] = mapinfo->orig[i];
}
/* allow ourselves to be swapped once again */
PRELE(curproc);
}
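/*
 * Common ioctl support for peripheral drivers. CAMGETPASSTHRU walks the
 * device list for this path until a "pass" peripheral is found and copies the
 * resulting CCB back to the caller; all other commands return ENOTTY.
 */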
int
cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr,
int (*error_routine)(union ccb *ccb,
cam_flags camflags,
u_int32_t sense_flags))
{
union ccb *ccb;
int error;
int found;
error = found = 0;
switch(cmd){
case CAMGETPASSTHRU:
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
xpt_setup_ccb(&ccb->ccb_h,
ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
ccb->ccb_h.func_code = XPT_GDEVLIST;
/*
* Basically, the point of this is that we go through
* getting the list of devices, until we find a passthrough
* device. In the current version of the CAM code, the
* only way to determine what type of device we're dealing
* with is by its name.
*/
while (found == 0) {
ccb->cgdl.index = 0;
ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS;
while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) {
/* we want the next device in the list */
xpt_action(ccb);
if (strncmp(ccb->cgdl.periph_name,
"pass", 4) == 0){
found = 1;
break;
}
}
if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) &&
(found == 0)) {
ccb->cgdl.periph_name[0] = '\0';
ccb->cgdl.unit_number = 0;
break;
}
}
/* copy the result back out */
bcopy(ccb, addr, sizeof(union ccb));
/* and release the ccb */
xpt_release_ccb(ccb);
break;
default:
error = ENOTTY;
break;
}
return(error);
}
static void
cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb)
{
panic("%s: already done with ccb %p", __func__, done_ccb);
}
static void
cam_periph_done(struct cam_periph *periph, union ccb *done_ccb)
{
/* Caller will release the CCB */
xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED);
done_ccb->ccb_h.cbfcnp = cam_periph_done_panic;
wakeup(&done_ccb->ccb_h.cbfcnp);
}
static void
cam_periph_ccbwait(union ccb *ccb)
{
if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) {
while (ccb->ccb_h.cbfcnp != cam_periph_done_panic)
xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp,
PRIBIO, "cbwait", 0);
}
KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX &&
(ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG,
("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, "
"status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code,
ccb->ccb_h.status, ccb->ccb_h.pinfo.index));
}
/*
* Dispatch a CCB and wait for it to complete. If the CCB has set a
* callback function (ccb->ccb_h.cbfcnp), it will be overwritten and lost.
*/
int
cam_periph_runccb(union ccb *ccb,
int (*error_routine)(union ccb *ccb,
cam_flags camflags,
u_int32_t sense_flags),
cam_flags camflags, u_int32_t sense_flags,
struct devstat *ds)
{
struct bintime *starttime;
struct bintime ltime;
int error;
bool must_poll;
uint32_t timeout = 1;
starttime = NULL;
xpt_path_assert(ccb->ccb_h.path, MA_OWNED);
KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0,
("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb,
ccb->ccb_h.func_code, ccb->ccb_h.flags));
/*
* If the user has supplied a stats structure, and if we understand
* this particular type of ccb, record the transaction start.
*/
if (ds != NULL &&
(ccb->ccb_h.func_code == XPT_SCSI_IO ||
ccb->ccb_h.func_code == XPT_ATA_IO ||
ccb->ccb_h.func_code == XPT_NVME_IO)) {
starttime = &ltime;
binuptime(starttime);
devstat_start_transaction(ds, starttime);
}
/*
* We must poll the I/O while we're dumping. The scheduler is normally
* stopped for dumping, except when we call doadump from ddb. While the
* scheduler is running in this case, we still need to poll the I/O to
* avoid sleeping while waiting for the ccb to complete.
*
* A panic triggered dump stops the scheduler, any callback from the
* shutdown_post_sync event will run with the scheduler stopped, but
* before we're officially dumping. To avoid hanging in adashutdown
* initiated commands (or other similar situations), we have to test for
* either SCHEDULER_STOPPED() here as well.
*
* To avoid locking problems, dumping/polling callers must call
* without a periph lock held.
*/
must_poll = dumping || SCHEDULER_STOPPED();
ccb->ccb_h.cbfcnp = cam_periph_done;
/*
* If we're polling, then we need to ensure that we have ample resources
* in the periph. cam_periph_error can reschedule the ccb by calling
* xpt_action and returning ERESTART, so we have to effect the polling
* in the do loop below.
*/
if (must_poll) {
timeout = xpt_poll_setup(ccb);
}
if (timeout == 0) {
ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
error = EBUSY;
} else {
xpt_action(ccb);
do {
if (must_poll) {
xpt_pollwait(ccb, timeout);
timeout = ccb->ccb_h.timeout * 10;
} else {
cam_periph_ccbwait(ccb);
}
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
error = 0;
else if (error_routine != NULL) {
ccb->ccb_h.cbfcnp = cam_periph_done;
error = (*error_routine)(ccb, camflags, sense_flags);
} else
error = 0;
} while (error == ERESTART);
}
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(ccb->ccb_h.path,
/* relsim_flags */0,
/* openings */0,
/* timeout */0,
/* getcount_only */ FALSE);
ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
}
if (ds != NULL) {
uint32_t bytes;
devstat_tag_type tag;
bool valid = true;
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
bytes = ccb->csio.dxfer_len - ccb->csio.resid;
tag = (devstat_tag_type)(ccb->csio.tag_action & 0x3);
} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
bytes = ccb->ataio.dxfer_len - ccb->ataio.resid;
tag = (devstat_tag_type)0;
} else if (ccb->ccb_h.func_code == XPT_NVME_IO) {
bytes = ccb->nvmeio.dxfer_len; /* NB: resid not possible */
tag = (devstat_tag_type)0;
} else {
valid = false;
}
if (valid)
devstat_end_transaction(ds, bytes, tag,
((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ?
DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime);
}
return(error);
}
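/*
 * Illustrative example (not taken from this file): a peripheral issuing a
 * synchronous command would normally allocate a CCB, fill it in, and let
 * cam_periph_runccb() wait for completion and drive error recovery. The
 * completion callback may be left NULL since cam_periph_runccb() installs
 * its own:
 *
 *	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 *	scsi_test_unit_ready(&ccb->csio, 4, NULL,
 *	    MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, 5000);
 *	error = cam_periph_runccb(ccb, errfunc, 0, SF_RETRY_UA, NULL);
 *	xpt_release_ccb(ccb);
 *
 * "errfunc" stands in for a driver-specific error routine of the kind that
 * ultimately calls cam_periph_error().
 */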
void
cam_freeze_devq(struct cam_path *path)
{
struct ccb_hdr ccb_h;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n"));
xpt_setup_ccb(&ccb_h, path, /*priority*/1);
ccb_h.func_code = XPT_NOOP;
ccb_h.flags = CAM_DEV_QFREEZE;
xpt_action((union ccb *)&ccb_h);
}
u_int32_t
cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
u_int32_t openings, u_int32_t arg,
int getcount_only)
{
struct ccb_relsim crs;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n",
relsim_flags, openings, arg, getcount_only));
xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
crs.ccb_h.func_code = XPT_REL_SIMQ;
crs.ccb_h.flags = getcount_only ? CAM_DEV_QFREEZE : 0;
crs.release_flags = relsim_flags;
crs.openings = openings;
crs.release_timeout = arg;
xpt_action((union ccb *)&crs);
return (crs.qfrozen_cnt);
}
#define saved_ccb_ptr ppriv_ptr0
static void
camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
{
union ccb *saved_ccb;
cam_status status;
struct scsi_start_stop_unit *scsi_cmd;
int error = 0, error_code, sense_key, asc, ascq;
scsi_cmd = (struct scsi_start_stop_unit *)
&done_ccb->csio.cdb_io.cdb_bytes;
status = done_ccb->ccb_h.status;
if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq)) {
/*
* If the error is "invalid field in CDB",
* and the load/eject flag is set, turn the
* flag off and try again. This is just in
* case the drive in question barfs on the
* load eject flag. The CAM code should set
* the load/eject flag by default for
* removable media.
*/
if ((scsi_cmd->opcode == START_STOP_UNIT) &&
((scsi_cmd->how & SSS_LOEJ) != 0) &&
(asc == 0x24) && (ascq == 0x00)) {
scsi_cmd->how &= ~SSS_LOEJ;
if (status & CAM_DEV_QFRZN) {
cam_release_devq(done_ccb->ccb_h.path,
0, 0, 0, 0);
done_ccb->ccb_h.status &=
~CAM_DEV_QFRZN;
}
xpt_action(done_ccb);
goto out;
}
}
error = cam_periph_error(done_ccb, 0,
SF_RETRY_UA | SF_NO_PRINT);
if (error == ERESTART)
goto out;
if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) {
cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
}
} else {
/*
* If we have successfully taken a device from the not
* ready to ready state, re-scan the device and re-get
* the inquiry information. Many devices (mostly disks)
* don't properly report their inquiry information unless
* they are spun up.
*/
if (scsi_cmd->opcode == START_STOP_UNIT)
xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL);
}
/* If we tried long wait and still failed, remember that. */
if ((periph->flags & CAM_PERIPH_RECOVERY_WAIT) &&
(done_ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY)) {
periph->flags &= ~CAM_PERIPH_RECOVERY_WAIT;
if (error != 0 && done_ccb->ccb_h.retry_count == 0)
periph->flags |= CAM_PERIPH_RECOVERY_WAIT_FAILED;
}
/*
* After recovery action(s) have completed, return to the original CCB.
* If the recovery CCB has failed, despite its own possible retries and
* recovery, assume we are back in the state we were in originally, but
* with no recovery hopes left. In that case, after the final attempt
* below, we cancel any further retries, which also blocks any new
* recovery attempts for this CCB, and the result will be the final one
* returned to the CCB owner.
*/
saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr;
bcopy(saved_ccb, done_ccb, sizeof(*done_ccb));
xpt_free_ccb(saved_ccb);
if (done_ccb->ccb_h.cbfcnp != camperiphdone)
periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG;
if (error != 0)
done_ccb->ccb_h.retry_count = 0;
xpt_action(done_ccb);
out:
/* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */
cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
}
/*
* Generic Async Event handler. Peripheral drivers usually
* filter out the events that require personal attention,
* and leave the rest to this function.
*/
void
cam_periph_async(struct cam_periph *periph, u_int32_t code,
struct cam_path *path, void *arg)
{
switch (code) {
case AC_LOST_DEVICE:
cam_periph_invalidate(periph);
break;
default:
break;
}
}
void
cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle)
{
struct ccb_getdevstats cgds;
xpt_setup_ccb(&cgds.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgds.ccb_h.func_code = XPT_GDEV_STATS;
xpt_action((union ccb *)&cgds);
cam_periph_freeze_after_event(periph, &cgds.last_reset, bus_settle);
}
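/*
 * Freeze the device queue for whatever remains of "duration_ms", measured
 * from "event_time". For example, if a bus reset happened 300 ms ago and the
 * settle time is 1000 ms, the queue is released after roughly 700 ms; if the
 * event is older than the settle time, nothing is done.
 */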
void
cam_periph_freeze_after_event(struct cam_periph *periph,
struct timeval* event_time, u_int duration_ms)
{
struct timeval delta;
struct timeval duration_tv;
if (!timevalisset(event_time))
return;
microtime(&delta);
timevalsub(&delta, event_time);
duration_tv.tv_sec = duration_ms / 1000;
duration_tv.tv_usec = (duration_ms % 1000) * 1000;
if (timevalcmp(&delta, &duration_tv, <)) {
timevalsub(&duration_tv, &delta);
duration_ms = duration_tv.tv_sec * 1000;
duration_ms += duration_tv.tv_usec / 1000;
cam_freeze_devq(periph->path);
cam_release_devq(periph->path,
RELSIM_RELEASE_AFTER_TIMEOUT,
/*reduction*/0,
/*timeout*/duration_ms,
/*getcount_only*/0);
}
}
static int
camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb,
cam_flags camflags, u_int32_t sense_flags,
int *openings, u_int32_t *relsim_flags,
u_int32_t *timeout, u_int32_t *action, const char **action_string)
{
struct cam_periph *periph;
int error;
switch (ccb->csio.scsi_status) {
case SCSI_STATUS_OK:
case SCSI_STATUS_COND_MET:
case SCSI_STATUS_INTERMED:
case SCSI_STATUS_INTERMED_COND_MET:
error = 0;
break;
case SCSI_STATUS_CMD_TERMINATED:
case SCSI_STATUS_CHECK_COND:
error = camperiphscsisenseerror(ccb, orig_ccb,
camflags,
sense_flags,
openings,
relsim_flags,
timeout,
action,
action_string);
break;
case SCSI_STATUS_QUEUE_FULL:
{
/* no decrement */
struct ccb_getdevstats cgds;
/*
* First off, find out what the current
* transaction counts are.
*/
xpt_setup_ccb(&cgds.ccb_h,
ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgds.ccb_h.func_code = XPT_GDEV_STATS;
xpt_action((union ccb *)&cgds);
/*
* If we were the only transaction active, treat
* the QUEUE FULL as if it were a BUSY condition.
*/
if (cgds.dev_active != 0) {
int total_openings;
/*
* Reduce the number of openings to
* be 1 less than the amount it took
* to get a queue full bounded by the
* minimum allowed tag count for this
* device.
*/
total_openings = cgds.dev_active + cgds.dev_openings;
*openings = cgds.dev_active;
if (*openings < cgds.mintags)
*openings = cgds.mintags;
if (*openings < total_openings)
*relsim_flags = RELSIM_ADJUST_OPENINGS;
else {
/*
* Some devices report queue full for
* temporary resource shortages. For
* this reason, we allow a minimum
* tag count to be entered via a
* quirk entry to prevent the queue
* count on these devices from falling
* to a pessimistically low value. We
* still wait for the next successful
* completion, however, before queueing
* more transactions to the device.
*/
*relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
}
*timeout = 0;
error = ERESTART;
*action &= ~SSQ_PRINT_SENSE;
break;
}
/* FALLTHROUGH */
}
case SCSI_STATUS_BUSY:
/*
* Restart the queue after either another
* command completes or a 1 second timeout.
*/
periph = xpt_path_periph(ccb->ccb_h.path);
if (periph->flags & CAM_PERIPH_INVALID) {
error = EIO;
*action_string = "Periph was invalidated";
} else if ((sense_flags & SF_RETRY_BUSY) != 0 ||
ccb->ccb_h.retry_count > 0) {
if ((sense_flags & SF_RETRY_BUSY) == 0)
ccb->ccb_h.retry_count--;
error = ERESTART;
*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
| RELSIM_RELEASE_AFTER_CMDCMPLT;
*timeout = 1000;
} else {
error = EIO;
*action_string = "Retries exhausted";
}
break;
case SCSI_STATUS_RESERV_CONFLICT:
default:
error = EIO;
break;
}
return (error);
}
static int
camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
cam_flags camflags, u_int32_t sense_flags,
int *openings, u_int32_t *relsim_flags,
u_int32_t *timeout, u_int32_t *action, const char **action_string)
{
struct cam_periph *periph;
union ccb *orig_ccb = ccb;
int error, recoveryccb;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL)
biotrack(ccb->csio.bio, __func__);
#endif
periph = xpt_path_periph(ccb->ccb_h.path);
recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone);
if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) {
/*
* If error recovery is already in progress, don't attempt
* to process this error, but requeue it unconditionally
* and attempt to process it once error recovery has
* completed. This failed command is probably related to
* the error that caused the currently active error recovery
* action so our current recovery efforts should also
* address this command. Be aware that the error recovery
* code assumes that only one recovery action is in progress
* on a particular peripheral instance at any given time
* (e.g. only one saved CCB for error recovery) so it is
* imperative that we don't violate this assumption.
*/
error = ERESTART;
*action &= ~SSQ_PRINT_SENSE;
} else {
scsi_sense_action err_action;
struct ccb_getdev cgd;
/*
* Grab the inquiry data for this device.
*/
xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
sense_flags);
error = err_action & SS_ERRMASK;
/*
* Do not autostart sequential access devices
* to avoid unexpected tape loading.
*/
if ((err_action & SS_MASK) == SS_START &&
SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) {
*action_string = "Will not autostart a "
"sequential access device";
goto sense_error_done;
}
/*
* Avoid recovery recursion if recovery action is the same.
*/
if ((err_action & SS_MASK) >= SS_START && recoveryccb) {
if (((err_action & SS_MASK) == SS_START &&
ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) ||
((err_action & SS_MASK) == SS_TUR &&
(ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) {
err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
*timeout = 500;
}
}
/*
* If the recovery action will consume a retry,
* make sure we actually have retries available.
*/
if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
if (ccb->ccb_h.retry_count > 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0)
ccb->ccb_h.retry_count--;
else {
*action_string = "Retries exhausted";
goto sense_error_done;
}
}
if ((err_action & SS_MASK) >= SS_START) {
/*
* Do common portions of commands that
* use recovery CCBs.
*/
orig_ccb = xpt_alloc_ccb_nowait();
if (orig_ccb == NULL) {
*action_string = "Can't allocate recovery CCB";
goto sense_error_done;
}
/*
* Clear freeze flag for original request here, as
* this freeze will be dropped as part of ERESTART.
*/
ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
bcopy(ccb, orig_ccb, sizeof(*orig_ccb));
}
switch (err_action & SS_MASK) {
case SS_NOP:
*action_string = "No recovery action needed";
error = 0;
break;
case SS_RETRY:
*action_string = "Retrying command (per sense data)";
error = ERESTART;
break;
case SS_FAIL:
*action_string = "Unretryable error";
break;
case SS_START:
{
int le;
/*
* Send a start unit command to the device, and
* then retry the command.
*/
*action_string = "Attempting to start unit";
periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
/*
* Check for removable media and set
* load/eject flag appropriately.
*/
if (SID_IS_REMOVABLE(&cgd.inq_data))
le = TRUE;
else
le = FALSE;
scsi_start_stop(&ccb->csio,
/*retries*/1,
camperiphdone,
MSG_SIMPLE_Q_TAG,
/*start*/TRUE,
/*load/eject*/le,
/*immediate*/FALSE,
SSD_FULL_SIZE,
/*timeout*/50000);
break;
}
case SS_TUR:
{
/*
* Send a Test Unit Ready to the device.
* If the 'many' flag is set, we send 120
* test unit ready commands, one every half
* second. Otherwise, we just send one TUR.
* We only want to do this if the retry
* count has not been exhausted.
*/
int retries;
if ((err_action & SSQ_MANY) != 0 && (periph->flags &
CAM_PERIPH_RECOVERY_WAIT_FAILED) == 0) {
periph->flags |= CAM_PERIPH_RECOVERY_WAIT;
*action_string = "Polling device for readiness";
retries = 120;
} else {
*action_string = "Testing device for readiness";
retries = 1;
}
periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
scsi_test_unit_ready(&ccb->csio,
retries,
camperiphdone,
MSG_SIMPLE_Q_TAG,
SSD_FULL_SIZE,
/*timeout*/5000);
/*
* Accomplish our 500ms delay by deferring
* the release of our device queue appropriately.
*/
*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
*timeout = 500;
break;
}
default:
panic("Unhandled error action %x", err_action);
}
if ((err_action & SS_MASK) >= SS_START) {
/*
* Drop the priority, so that the recovery
* CCB is the first to execute. Freeze the queue
* after this command is sent so that we can
* restore the old csio and have it queued in
* the proper order before we release normal
* transactions to the device.
*/
ccb->ccb_h.pinfo.priority--;
ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
ccb->ccb_h.saved_ccb_ptr = orig_ccb;
error = ERESTART;
*orig = orig_ccb;
}
sense_error_done:
*action = err_action;
}
return (error);
}
/*
* Generic error handler. Peripheral drivers usually filter
* out the errors that they handle in a unique manner, then
* call this function.
*/
int
cam_periph_error(union ccb *ccb, cam_flags camflags,
u_int32_t sense_flags)
{
struct cam_path *newpath;
union ccb *orig_ccb, *scan_ccb;
struct cam_periph *periph;
const char *action_string;
cam_status status;
int frozen, error, openings, devctl_err;
u_int32_t action, relsim_flags, timeout;
action = SSQ_PRINT_SENSE;
periph = xpt_path_periph(ccb->ccb_h.path);
action_string = NULL;
status = ccb->ccb_h.status;
frozen = (status & CAM_DEV_QFRZN) != 0;
status &= CAM_STATUS_MASK;
devctl_err = openings = relsim_flags = timeout = 0;
orig_ccb = ccb;
/* Filter the errors that should be reported via devctl */
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
case CAM_SCSI_STATUS_ERROR:
case CAM_ATA_STATUS_ERROR:
case CAM_SMP_STATUS_ERROR:
devctl_err++;
break;
default:
break;
}
switch (status) {
case CAM_REQ_CMP:
error = 0;
action &= ~SSQ_PRINT_SENSE;
break;
case CAM_SCSI_STATUS_ERROR:
error = camperiphscsistatuserror(ccb, &orig_ccb,
camflags, sense_flags, &openings, &relsim_flags,
&timeout, &action, &action_string);
break;
case CAM_AUTOSENSE_FAIL:
error = EIO; /* we have to kill the command */
break;
case CAM_UA_ABORT:
case CAM_UA_TERMIO:
case CAM_MSG_REJECT_REC:
/* XXX Don't know that these are correct */
error = EIO;
break;
case CAM_SEL_TIMEOUT:
if ((camflags & CAM_RETRY_SELTO) != 0) {
if (ccb->ccb_h.retry_count > 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0) {
ccb->ccb_h.retry_count--;
error = ERESTART;
/*
* Wait a bit to give the device
* time to recover before we try again.
*/
relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
timeout = periph_selto_delay;
break;
}
action_string = "Retries exhausted";
}
/* FALLTHROUGH */
case CAM_DEV_NOT_THERE:
error = ENXIO;
action = SSQ_LOST;
break;
case CAM_REQ_INVALID:
case CAM_PATH_INVALID:
case CAM_NO_HBA:
case CAM_PROVIDE_FAIL:
case CAM_REQ_TOO_BIG:
case CAM_LUN_INVALID:
case CAM_TID_INVALID:
case CAM_FUNC_NOTAVAIL:
error = EINVAL;
break;
case CAM_SCSI_BUS_RESET:
case CAM_BDR_SENT:
/*
* Commands that repeatedly timeout and cause these
* kinds of error recovery actions, should return
* CAM_CMD_TIMEOUT, which allows us to safely assume
* that this command was an innocent bystander to
* these events and should be unconditionally
* retried.
*/
case CAM_REQUEUE_REQ:
/* Unconditional requeue if device is still there */
if (periph->flags & CAM_PERIPH_INVALID) {
action_string = "Periph was invalidated";
error = EIO;
} else if (sense_flags & SF_NO_RETRY) {
error = EIO;
action_string = "Retry was blocked";
} else {
error = ERESTART;
action &= ~SSQ_PRINT_SENSE;
}
break;
case CAM_RESRC_UNAVAIL:
/* Wait a bit for the resource shortage to abate. */
timeout = periph_noresrc_delay;
/* FALLTHROUGH */
case CAM_BUSY:
if (timeout == 0) {
/* Wait a bit for the busy condition to abate. */
timeout = periph_busy_delay;
}
relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
/* FALLTHROUGH */
case CAM_ATA_STATUS_ERROR:
case CAM_REQ_CMP_ERR:
case CAM_CMD_TIMEOUT:
case CAM_UNEXP_BUSFREE:
case CAM_UNCOR_PARITY:
case CAM_DATA_RUN_ERR:
default:
if (periph->flags & CAM_PERIPH_INVALID) {
error = EIO;
action_string = "Periph was invalidated";
} else if (ccb->ccb_h.retry_count == 0) {
error = EIO;
action_string = "Retries exhausted";
} else if (sense_flags & SF_NO_RETRY) {
error = EIO;
action_string = "Retry was blocked";
} else {
ccb->ccb_h.retry_count--;
error = ERESTART;
}
break;
}
if ((sense_flags & SF_PRINT_ALWAYS) ||
CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO))
action |= SSQ_PRINT_SENSE;
else if (sense_flags & SF_NO_PRINT)
action &= ~SSQ_PRINT_SENSE;
if ((action & SSQ_PRINT_SENSE) != 0)
cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL);
if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) {
if (error != ERESTART) {
if (action_string == NULL)
action_string = "Unretryable error";
xpt_print(ccb->ccb_h.path, "Error %d, %s\n",
error, action_string);
} else if (action_string != NULL)
xpt_print(ccb->ccb_h.path, "%s\n", action_string);
else {
xpt_print(ccb->ccb_h.path,
"Retrying command, %d more tries remain\n",
ccb->ccb_h.retry_count);
}
}
if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0))
cam_periph_devctl_notify(orig_ccb);
if ((action & SSQ_LOST) != 0) {
lun_id_t lun_id;
/*
* For a selection timeout, we consider all of the LUNs on
* the target to be gone. If the status is CAM_DEV_NOT_THERE,
* then we only get rid of the device(s) specified by the
* path in the original CCB.
*/
if (status == CAM_SEL_TIMEOUT)
lun_id = CAM_LUN_WILDCARD;
else
lun_id = xpt_path_lun_id(ccb->ccb_h.path);
/* Should we do more if we can't create the path?? */
if (xpt_create_path(&newpath, periph,
xpt_path_path_id(ccb->ccb_h.path),
xpt_path_target_id(ccb->ccb_h.path),
lun_id) == CAM_REQ_CMP) {
/*
* Let peripheral drivers know that this
* device has gone away.
*/
xpt_async(AC_LOST_DEVICE, newpath, NULL);
xpt_free_path(newpath);
}
}
/* Broadcast UNIT ATTENTIONs to all periphs. */
if ((action & SSQ_UA) != 0)
xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb);
/* Rescan target on "Reported LUNs data has changed" */
if ((action & SSQ_RESCAN) != 0) {
if (xpt_create_path(&newpath, NULL,
xpt_path_path_id(ccb->ccb_h.path),
xpt_path_target_id(ccb->ccb_h.path),
CAM_LUN_WILDCARD) == CAM_REQ_CMP) {
scan_ccb = xpt_alloc_ccb_nowait();
if (scan_ccb != NULL) {
scan_ccb->ccb_h.path = newpath;
scan_ccb->ccb_h.func_code = XPT_SCAN_TGT;
scan_ccb->crcn.flags = 0;
xpt_rescan(scan_ccb);
} else {
xpt_print(newpath,
"Can't allocate CCB to rescan target\n");
xpt_free_path(newpath);
}
}
}
/* Attempt a retry */
if (error == ERESTART || error == 0) {
if (frozen != 0)
ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
if (error == ERESTART)
xpt_action(ccb);
if (frozen != 0)
cam_release_devq(ccb->ccb_h.path,
relsim_flags,
openings,
timeout,
/*getcount_only*/0);
}
return (error);
}
#define CAM_PERIPH_DEVD_MSG_SIZE 256
static void
cam_periph_devctl_notify(union ccb *ccb)
{
struct cam_periph *periph;
struct ccb_getdev *cgd;
struct sbuf sb;
int serr, sk, asc, ascq;
char *sbmsg, *type;
sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
if (sbmsg == NULL)
return;
sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
periph = xpt_path_periph(ccb->ccb_h.path);
sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
periph->unit_number);
sbuf_printf(&sb, "serial=\"");
if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgd->ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)cgd);
if (cgd->ccb_h.status == CAM_REQ_CMP)
sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
xpt_free_ccb((union ccb *)cgd);
}
sbuf_printf(&sb, "\" ");
sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
type = "timeout";
break;
case CAM_SCSI_STATUS_ERROR:
sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
serr, sk, asc, ascq);
type = "error";
break;
case CAM_ATA_STATUS_ERROR:
sbuf_printf(&sb, "RES=\"");
ata_res_sbuf(&ccb->ataio.res, &sb);
sbuf_printf(&sb, "\" ");
type = "error";
break;
default:
type = "error";
break;
}
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
sbuf_printf(&sb, "CDB=\"");
scsi_cdb_sbuf(scsiio_cdb_ptr(&ccb->csio), &sb);
sbuf_printf(&sb, "\" ");
} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
sbuf_printf(&sb, "ACB=\"");
ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
sbuf_printf(&sb, "\" ");
}
if (sbuf_finish(&sb) == 0)
devctl_notify("CAM", "periph", type, sbuf_data(&sb));
sbuf_delete(&sb);
free(sbmsg, M_CAMPERIPH);
}
/*
* Sysctl to force an invalidation of the drive right now. Can be
* called with CTLFLAG_MPSAFE since we take periph lock.
*/
int
cam_periph_invalidate_sysctl(SYSCTL_HANDLER_ARGS)
{
struct cam_periph *periph;
int error, value;
periph = arg1;
value = 0;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL || value != 1)
return (error);
cam_periph_lock(periph);
cam_periph_invalidate(periph);
cam_periph_unlock(periph);
return (0);
}
diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c
index 12d001ef729c..d71b8ef81240 100644
--- a/sys/cam/cam_xpt.c
+++ b/sys/cam/cam_xpt.c
@@ -1,5586 +1,5586 @@
/*-
* Implementation of the Common Access Method Transport (XPT) layer.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, 1998, 1999 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_printf.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_iosched.h>
#include <cam/cam_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_xpt_internal.h>
#include <cam/cam_debug.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_pass.h>
#include <machine/stdarg.h> /* for xpt_print below */
#include "opt_cam.h"
/* Wild guess based on not wanting to grow the stack too much */
#define XPT_PRINT_MAXLEN 512
#ifdef PRINTF_BUFR_SIZE
#define XPT_PRINT_LEN PRINTF_BUFR_SIZE
#else
#define XPT_PRINT_LEN 128
#endif
_Static_assert(XPT_PRINT_LEN <= XPT_PRINT_MAXLEN, "XPT_PRINT_LEN is too large");
/*
* This is the maximum number of high powered commands (e.g. start unit)
* that can be outstanding at a particular time.
*/
#ifndef CAM_MAX_HIGHPOWER
#define CAM_MAX_HIGHPOWER 4
#endif
/* Datastructures internal to the xpt layer */
MALLOC_DEFINE(M_CAMXPT, "CAM XPT", "CAM XPT buffers");
MALLOC_DEFINE(M_CAMDEV, "CAM DEV", "CAM devices");
MALLOC_DEFINE(M_CAMCCB, "CAM CCB", "CAM CCBs");
MALLOC_DEFINE(M_CAMPATH, "CAM path", "CAM paths");
struct xpt_softc {
uint32_t xpt_generation;
/* number of high powered commands that can go through right now */
struct mtx xpt_highpower_lock;
STAILQ_HEAD(highpowerlist, cam_ed) highpowerq;
int num_highpower;
/* queue for handling async rescan requests. */
TAILQ_HEAD(, ccb_hdr) ccb_scanq;
int buses_to_config;
int buses_config_done;
int announce_nosbuf;
/*
* Registered buses
*
* N.B., "busses" is an archaic spelling of "buses". In new code
* "buses" is preferred.
*/
TAILQ_HEAD(,cam_eb) xpt_busses;
u_int bus_generation;
int boot_delay;
struct callout boot_callout;
struct task boot_task;
struct root_hold_token xpt_rootmount;
struct mtx xpt_topo_lock;
struct taskqueue *xpt_taskq;
};
typedef enum {
DM_RET_COPY = 0x01,
DM_RET_FLAG_MASK = 0x0f,
DM_RET_NONE = 0x00,
DM_RET_STOP = 0x10,
DM_RET_DESCEND = 0x20,
DM_RET_ERROR = 0x30,
DM_RET_ACTION_MASK = 0xf0
} dev_match_ret;
typedef enum {
XPT_DEPTH_BUS,
XPT_DEPTH_TARGET,
XPT_DEPTH_DEVICE,
XPT_DEPTH_PERIPH
} xpt_traverse_depth;
struct xpt_traverse_config {
xpt_traverse_depth depth;
void *tr_func;
void *tr_arg;
};
typedef int xpt_busfunc_t (struct cam_eb *bus, void *arg);
typedef int xpt_targetfunc_t (struct cam_et *target, void *arg);
typedef int xpt_devicefunc_t (struct cam_ed *device, void *arg);
typedef int xpt_periphfunc_t (struct cam_periph *periph, void *arg);
typedef int xpt_pdrvfunc_t (struct periph_driver **pdrv, void *arg);
/* Transport layer configuration information */
static struct xpt_softc xsoftc;
MTX_SYSINIT(xpt_topo_init, &xsoftc.xpt_topo_lock, "XPT topology lock", MTX_DEF);
SYSCTL_INT(_kern_cam, OID_AUTO, boot_delay, CTLFLAG_RDTUN,
&xsoftc.boot_delay, 0, "Bus registration wait time");
SYSCTL_UINT(_kern_cam, OID_AUTO, xpt_generation, CTLFLAG_RD,
&xsoftc.xpt_generation, 0, "CAM peripheral generation count");
SYSCTL_INT(_kern_cam, OID_AUTO, announce_nosbuf, CTLFLAG_RWTUN,
&xsoftc.announce_nosbuf, 0, "Don't use sbuf for announcements");
struct cam_doneq {
struct mtx_padalign cam_doneq_mtx;
STAILQ_HEAD(, ccb_hdr) cam_doneq;
int cam_doneq_sleep;
};
static struct cam_doneq cam_doneqs[MAXCPU];
static u_int __read_mostly cam_num_doneqs;
static struct proc *cam_proc;
SYSCTL_INT(_kern_cam, OID_AUTO, num_doneqs, CTLFLAG_RDTUN,
&cam_num_doneqs, 0, "Number of completion queues/threads");
struct cam_periph *xpt_periph;
static periph_init_t xpt_periph_init;
static struct periph_driver xpt_driver =
{
xpt_periph_init, "xpt",
TAILQ_HEAD_INITIALIZER(xpt_driver.units), /* generation */ 0,
CAM_PERIPH_DRV_EARLY
};
PERIPHDRIVER_DECLARE(xpt, xpt_driver);
static d_open_t xptopen;
static d_close_t xptclose;
static d_ioctl_t xptioctl;
static d_ioctl_t xptdoioctl;
static struct cdevsw xpt_cdevsw = {
.d_version = D_VERSION,
.d_flags = 0,
.d_open = xptopen,
.d_close = xptclose,
.d_ioctl = xptioctl,
.d_name = "xpt",
};
/* Storage for debugging datastructures */
struct cam_path *cam_dpath;
u_int32_t __read_mostly cam_dflags = CAM_DEBUG_FLAGS;
SYSCTL_UINT(_kern_cam, OID_AUTO, dflags, CTLFLAG_RWTUN,
&cam_dflags, 0, "Enabled debug flags");
u_int32_t cam_debug_delay = CAM_DEBUG_DELAY;
SYSCTL_UINT(_kern_cam, OID_AUTO, debug_delay, CTLFLAG_RWTUN,
&cam_debug_delay, 0, "Delay in us after each debug message");
/* Our boot-time initialization hook */
static int cam_module_event_handler(module_t, int /*modeventtype_t*/, void *);
static moduledata_t cam_moduledata = {
"cam",
cam_module_event_handler,
NULL
};
static int xpt_init(void *);
DECLARE_MODULE(cam, cam_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
MODULE_VERSION(cam, 1);
static void xpt_async_bcast(struct async_list *async_head,
u_int32_t async_code,
struct cam_path *path,
void *async_arg);
static path_id_t xptnextfreepathid(void);
static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus);
static union ccb *xpt_get_ccb(struct cam_periph *periph);
static union ccb *xpt_get_ccb_nowait(struct cam_periph *periph);
static void xpt_run_allocq(struct cam_periph *periph, int sleep);
static void xpt_run_allocq_task(void *context, int pending);
static void xpt_run_devq(struct cam_devq *devq);
static callout_func_t xpt_release_devq_timeout;
static void xpt_acquire_bus(struct cam_eb *bus);
static void xpt_release_bus(struct cam_eb *bus);
static uint32_t xpt_freeze_devq_device(struct cam_ed *dev, u_int count);
static int xpt_release_devq_device(struct cam_ed *dev, u_int count,
int run_queue);
static struct cam_et*
xpt_alloc_target(struct cam_eb *bus, target_id_t target_id);
static void xpt_acquire_target(struct cam_et *target);
static void xpt_release_target(struct cam_et *target);
static struct cam_eb*
xpt_find_bus(path_id_t path_id);
static struct cam_et*
xpt_find_target(struct cam_eb *bus, target_id_t target_id);
static struct cam_ed*
xpt_find_device(struct cam_et *target, lun_id_t lun_id);
static void xpt_config(void *arg);
static void xpt_hold_boot_locked(void);
static int xpt_schedule_dev(struct camq *queue, cam_pinfo *dev_pinfo,
u_int32_t new_priority);
static xpt_devicefunc_t xptpassannouncefunc;
static void xptaction(struct cam_sim *sim, union ccb *work_ccb);
static void xptpoll(struct cam_sim *sim);
static void camisr_runqueue(void);
static void xpt_done_process(struct ccb_hdr *ccb_h);
static void xpt_done_td(void *);
static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns,
u_int num_patterns, struct cam_eb *bus);
static dev_match_ret xptdevicematch(struct dev_match_pattern *patterns,
u_int num_patterns,
struct cam_ed *device);
static dev_match_ret xptperiphmatch(struct dev_match_pattern *patterns,
u_int num_patterns,
struct cam_periph *periph);
static xpt_busfunc_t xptedtbusfunc;
static xpt_targetfunc_t xptedttargetfunc;
static xpt_devicefunc_t xptedtdevicefunc;
static xpt_periphfunc_t xptedtperiphfunc;
static xpt_pdrvfunc_t xptplistpdrvfunc;
static xpt_periphfunc_t xptplistperiphfunc;
static int xptedtmatch(struct ccb_dev_match *cdm);
static int xptperiphlistmatch(struct ccb_dev_match *cdm);
static int xptbustraverse(struct cam_eb *start_bus,
xpt_busfunc_t *tr_func, void *arg);
static int xpttargettraverse(struct cam_eb *bus,
struct cam_et *start_target,
xpt_targetfunc_t *tr_func, void *arg);
static int xptdevicetraverse(struct cam_et *target,
struct cam_ed *start_device,
xpt_devicefunc_t *tr_func, void *arg);
static int xptperiphtraverse(struct cam_ed *device,
struct cam_periph *start_periph,
xpt_periphfunc_t *tr_func, void *arg);
static int xptpdrvtraverse(struct periph_driver **start_pdrv,
xpt_pdrvfunc_t *tr_func, void *arg);
static int xptpdperiphtraverse(struct periph_driver **pdrv,
struct cam_periph *start_periph,
xpt_periphfunc_t *tr_func,
void *arg);
static xpt_busfunc_t xptdefbusfunc;
static xpt_targetfunc_t xptdeftargetfunc;
static xpt_devicefunc_t xptdefdevicefunc;
static xpt_periphfunc_t xptdefperiphfunc;
static void xpt_finishconfig_task(void *context, int pending);
static void xpt_dev_async_default(u_int32_t async_code,
struct cam_eb *bus,
struct cam_et *target,
struct cam_ed *device,
void *async_arg);
static struct cam_ed * xpt_alloc_device_default(struct cam_eb *bus,
struct cam_et *target,
lun_id_t lun_id);
static xpt_devicefunc_t xptsetasyncfunc;
static xpt_busfunc_t xptsetasyncbusfunc;
static cam_status xptregister(struct cam_periph *periph,
void *arg);
static __inline int
xpt_schedule_devq(struct cam_devq *devq, struct cam_ed *dev)
{
int retval;
mtx_assert(&devq->send_mtx, MA_OWNED);
if ((dev->ccbq.queue.entries > 0) &&
(dev->ccbq.dev_openings > 0) &&
(dev->ccbq.queue.qfrozen_cnt == 0)) {
/*
* The priority of a device waiting for controller
* resources is that of the highest priority CCB
* enqueued.
*/
retval =
xpt_schedule_dev(&devq->send_queue,
&dev->devq_entry,
CAMQ_GET_PRIO(&dev->ccbq.queue));
} else {
retval = 0;
}
return (retval);
}
static __inline int
device_is_queued(struct cam_ed *device)
{
return (device->devq_entry.index != CAM_UNQUEUED_INDEX);
}
static void
xpt_periph_init(void)
{
make_dev(&xpt_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "xpt0");
}
static int
xptopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
/*
* Only allow read-write access.
*/
if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0))
return(EPERM);
/*
* We don't allow nonblocking access.
*/
if ((flags & O_NONBLOCK) != 0) {
printf("%s: can't do nonblocking access\n", devtoname(dev));
return(ENODEV);
}
return(0);
}
static int
xptclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
return(0);
}
/*
* Don't automatically grab the xpt softc lock here even though this is going
* through the xpt device. The xpt device is really just a back door for
* accessing other devices and SIMs, so the right thing to do is to grab
* the appropriate SIM lock once the bus/SIM is located.
*/
static int
xptioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
int error;
if ((error = xptdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) {
error = cam_compat_ioctl(dev, cmd, addr, flag, td, xptdoioctl);
}
return (error);
}
static int
xptdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
int error;
error = 0;
switch(cmd) {
/*
* For the transport layer CAMIOCOMMAND ioctl, we only want to accept
* CCB types that don't make sense to send through a passthrough
* driver. XPT_PATH_INQ is an exception to this, as stated in the
* CAM spec.
*/
case CAMIOCOMMAND: {
union ccb *ccb;
union ccb *inccb;
struct cam_eb *bus;
inccb = (union ccb *)addr;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (inccb->ccb_h.func_code == XPT_SCSI_IO)
inccb->csio.bio = NULL;
#endif
if (inccb->ccb_h.flags & CAM_UNLOCKED)
return (EINVAL);
bus = xpt_find_bus(inccb->ccb_h.path_id);
if (bus == NULL)
return (EINVAL);
switch (inccb->ccb_h.func_code) {
case XPT_SCAN_BUS:
case XPT_RESET_BUS:
if (inccb->ccb_h.target_id != CAM_TARGET_WILDCARD ||
inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
xpt_release_bus(bus);
return (EINVAL);
}
break;
case XPT_SCAN_TGT:
if (inccb->ccb_h.target_id == CAM_TARGET_WILDCARD ||
inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) {
xpt_release_bus(bus);
return (EINVAL);
}
break;
default:
break;
}
switch(inccb->ccb_h.func_code) {
case XPT_SCAN_BUS:
case XPT_RESET_BUS:
case XPT_PATH_INQ:
case XPT_ENG_INQ:
case XPT_SCAN_LUN:
case XPT_SCAN_TGT:
ccb = xpt_alloc_ccb();
/*
* Create a path using the bus, target, and lun the
* user passed in.
*/
if (xpt_create_path(&ccb->ccb_h.path, NULL,
inccb->ccb_h.path_id,
inccb->ccb_h.target_id,
inccb->ccb_h.target_lun) !=
CAM_REQ_CMP){
error = EINVAL;
xpt_free_ccb(ccb);
break;
}
/* Ensure all of our fields are correct */
xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path,
inccb->ccb_h.pinfo.priority);
xpt_merge_ccb(ccb, inccb);
xpt_path_lock(ccb->ccb_h.path);
cam_periph_runccb(ccb, NULL, 0, 0, NULL);
xpt_path_unlock(ccb->ccb_h.path);
bcopy(ccb, inccb, sizeof(union ccb));
xpt_free_path(ccb->ccb_h.path);
xpt_free_ccb(ccb);
break;
case XPT_DEBUG: {
union ccb ccb;
/*
* This is an immediate CCB, so it's okay to
* allocate it on the stack.
*/
/*
* Create a path using the bus, target, and lun the
* user passed in.
*/
if (xpt_create_path(&ccb.ccb_h.path, NULL,
inccb->ccb_h.path_id,
inccb->ccb_h.target_id,
inccb->ccb_h.target_lun) !=
CAM_REQ_CMP){
error = EINVAL;
break;
}
/* Ensure all of our fields are correct */
xpt_setup_ccb(&ccb.ccb_h, ccb.ccb_h.path,
inccb->ccb_h.pinfo.priority);
xpt_merge_ccb(&ccb, inccb);
xpt_action(&ccb);
bcopy(&ccb, inccb, sizeof(union ccb));
xpt_free_path(ccb.ccb_h.path);
break;
}
case XPT_DEV_MATCH: {
struct cam_periph_map_info mapinfo;
struct cam_path *old_path;
/*
* We can't deal with physical addresses for this
* type of transaction.
*/
if ((inccb->ccb_h.flags & CAM_DATA_MASK) !=
CAM_DATA_VADDR) {
error = EINVAL;
break;
}
/*
* Save this in case the caller had it set to
* something in particular.
*/
old_path = inccb->ccb_h.path;
/*
* The matching code itself doesn't need a path; we only
* supply one because the debugging statements in
* xpt_action() assume that the CCB has a valid path.
*/
inccb->ccb_h.path = xpt_periph->path;
bzero(&mapinfo, sizeof(mapinfo));
/*
* Map the pattern and match buffers into kernel
* virtual address space.
*/
- error = cam_periph_mapmem(inccb, &mapinfo, MAXPHYS);
+ error = cam_periph_mapmem(inccb, &mapinfo, maxphys);
if (error) {
inccb->ccb_h.path = old_path;
break;
}
/*
* This is an immediate CCB, we can send it on directly.
*/
xpt_action(inccb);
/*
* Map the buffers back into user space.
*/
cam_periph_unmapmem(inccb, &mapinfo);
inccb->ccb_h.path = old_path;
error = 0;
break;
}
default:
error = ENOTSUP;
break;
}
xpt_release_bus(bus);
break;
}
/*
* This is the getpassthru ioctl. It takes an XPT_GDEVLIST ccb as input,
* with the peripheral driver name and unit number filled in. The other
* fields don't really matter as input. The passthrough driver name
* ("pass") and unit number are passed back in the ccb. The current
* device generation number, the index into the device peripheral
* driver list, and the status are also passed back. Note that
* since we do everything in one pass, unlike the XPT_GDEVLIST ccb,
* we never return a status of CAM_GDEVLIST_LIST_CHANGED. It is
* (or rather should be) impossible for the device peripheral driver
* list to change since we look at the whole thing in one pass, and
* we do it with lock protection. A userland usage sketch follows
* this comment.
*
*/
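/*
 * A minimal userland sketch of this ioctl (illustrative only: the
 * peripheral name "da", unit 0, and the error handling are assumptions,
 * and production code would normally go through libcam rather than
 * rolling its own; headers such as <fcntl.h> and <sys/ioctl.h> are
 * omitted for brevity):
 *
 *	union ccb ccb;
 *	int fd;
 *
 *	bzero(&ccb, sizeof(ccb));
 *	ccb.ccb_h.func_code = XPT_GDEVLIST;
 *	strlcpy(ccb.cgdl.periph_name, "da", sizeof(ccb.cgdl.periph_name));
 *	ccb.cgdl.unit_number = 0;
 *	fd = open("/dev/xpt0", O_RDWR);
 *	if (fd >= 0 && ioctl(fd, CAMGETPASSTHRU, &ccb) == 0)
 *		printf("da0 is reachable via %s%d\n",
 *		    ccb.cgdl.periph_name, ccb.cgdl.unit_number);
 */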
case CAMGETPASSTHRU: {
union ccb *ccb;
struct cam_periph *periph;
struct periph_driver **p_drv;
char *name;
u_int unit;
int base_periph_found;
ccb = (union ccb *)addr;
unit = ccb->cgdl.unit_number;
name = ccb->cgdl.periph_name;
base_periph_found = 0;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.bio = NULL;
#endif
/*
* Sanity check -- make sure we don't get a null peripheral
* driver name.
*/
if (*ccb->cgdl.periph_name == '\0') {
error = EINVAL;
break;
}
/* Keep the list from changing while we traverse it */
xpt_lock_buses();
/* first find our driver in the list of drivers */
for (p_drv = periph_drivers; *p_drv != NULL; p_drv++)
if (strcmp((*p_drv)->driver_name, name) == 0)
break;
if (*p_drv == NULL) {
xpt_unlock_buses();
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
ccb->cgdl.status = CAM_GDEVLIST_ERROR;
*ccb->cgdl.periph_name = '\0';
ccb->cgdl.unit_number = 0;
error = ENOENT;
break;
}
/*
* Run through every peripheral instance of this driver
* and check to see whether it matches the unit passed
* in by the user. If it does, get out of the loops and
* find the passthrough driver associated with that
* peripheral driver.
*/
for (periph = TAILQ_FIRST(&(*p_drv)->units); periph != NULL;
periph = TAILQ_NEXT(periph, unit_links)) {
if (periph->unit_number == unit)
break;
}
/*
* If we found the peripheral driver that the user passed
* in, go through all of the peripheral drivers for that
* particular device and look for a passthrough driver.
*/
if (periph != NULL) {
struct cam_ed *device;
int i;
base_periph_found = 1;
device = periph->path->device;
for (i = 0, periph = SLIST_FIRST(&device->periphs);
periph != NULL;
periph = SLIST_NEXT(periph, periph_links), i++) {
/*
* Check to see whether we have a
* passthrough device or not.
*/
if (strcmp(periph->periph_name, "pass") == 0) {
/*
* Fill in the getdevlist fields.
*/
strlcpy(ccb->cgdl.periph_name,
periph->periph_name,
sizeof(ccb->cgdl.periph_name));
ccb->cgdl.unit_number =
periph->unit_number;
if (SLIST_NEXT(periph, periph_links))
ccb->cgdl.status =
CAM_GDEVLIST_MORE_DEVS;
else
ccb->cgdl.status =
CAM_GDEVLIST_LAST_DEVICE;
ccb->cgdl.generation =
device->generation;
ccb->cgdl.index = i;
/*
* Fill in some CCB header fields
* that the user may want.
*/
ccb->ccb_h.path_id =
periph->path->bus->path_id;
ccb->ccb_h.target_id =
periph->path->target->target_id;
ccb->ccb_h.target_lun =
periph->path->device->lun_id;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
}
}
/*
* If the periph is null here, one of two things has
* happened. The first possibility is that we couldn't
* find the unit number of the particular peripheral driver
* that the user is asking about; e.g., the user asks for
* the passthrough driver for "da11". We find the list of
* "da" peripherals all right, but there is no unit 11.
* The other possibility is that we went through the list
* of peripheral drivers attached to the device structure,
* but didn't find one with the name "pass". Either way,
* we return ENOENT, since we couldn't find what was asked for.
*/
if (periph == NULL) {
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
ccb->cgdl.status = CAM_GDEVLIST_ERROR;
*ccb->cgdl.periph_name = '\0';
ccb->cgdl.unit_number = 0;
error = ENOENT;
/*
* It is unfortunate that this is even necessary,
* but there are many, many clueless users out there.
* If base_periph_found is set, the user is looking for
* the passthrough driver, but doesn't have one compiled
* into their kernel.
*/
if (base_periph_found == 1) {
printf("xptioctl: pass driver is not in the "
"kernel\n");
printf("xptioctl: put \"device pass\" in "
"your kernel config file\n");
}
}
xpt_unlock_buses();
break;
}
default:
error = ENOTTY;
break;
}
return(error);
}
static int
cam_module_event_handler(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
if ((error = xpt_init(NULL)) != 0)
return (error);
break;
case MOD_UNLOAD:
return EBUSY;
default:
return EOPNOTSUPP;
}
return 0;
}
static struct xpt_proto *
xpt_proto_find(cam_proto proto)
{
struct xpt_proto **pp;
SET_FOREACH(pp, cam_xpt_proto_set) {
if ((*pp)->proto == proto)
return *pp;
}
return NULL;
}
static void
xpt_rescan_done(struct cam_periph *periph, union ccb *done_ccb)
{
if (done_ccb->ccb_h.ppriv_ptr1 == NULL) {
xpt_free_path(done_ccb->ccb_h.path);
xpt_free_ccb(done_ccb);
} else {
done_ccb->ccb_h.cbfcnp = done_ccb->ccb_h.ppriv_ptr1;
(*done_ccb->ccb_h.cbfcnp)(periph, done_ccb);
}
xpt_release_boot();
}
/* thread to handle bus rescans */
static void
xpt_scanner_thread(void *dummy)
{
union ccb *ccb;
struct mtx *mtx;
struct cam_ed *device;
xpt_lock_buses();
for (;;) {
if (TAILQ_EMPTY(&xsoftc.ccb_scanq))
msleep(&xsoftc.ccb_scanq, &xsoftc.xpt_topo_lock, PRIBIO,
"-", 0);
if ((ccb = (union ccb *)TAILQ_FIRST(&xsoftc.ccb_scanq)) != NULL) {
TAILQ_REMOVE(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
xpt_unlock_buses();
/*
* We need to lock the device's mutex, which we use as
* the path mutex. We can't take it through the ccb's
* path because the cam_path in the ccb may go away: the
* path lock may be dropped and the path retired in the
* completion callback. Taking our own device reference
* and locking the mutex directly keeps the reference
* counts in cam_path sane. We also have to copy the
* device pointer because ccb_h.path may be freed in the
* callback.
*/
mtx = xpt_path_mtx(ccb->ccb_h.path);
device = ccb->ccb_h.path->device;
xpt_acquire_device(device);
mtx_lock(mtx);
xpt_action(ccb);
mtx_unlock(mtx);
xpt_release_device(device);
xpt_lock_buses();
}
}
}
void
xpt_rescan(union ccb *ccb)
{
struct ccb_hdr *hdr;
/* Prepare request */
if (ccb->ccb_h.path->target->target_id == CAM_TARGET_WILDCARD &&
ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
ccb->ccb_h.func_code = XPT_SCAN_BUS;
else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD)
ccb->ccb_h.func_code = XPT_SCAN_TGT;
else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD &&
ccb->ccb_h.path->device->lun_id != CAM_LUN_WILDCARD)
ccb->ccb_h.func_code = XPT_SCAN_LUN;
else {
xpt_print(ccb->ccb_h.path, "illegal scan path\n");
xpt_free_path(ccb->ccb_h.path);
xpt_free_ccb(ccb);
return;
}
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("xpt_rescan: func %#x %s\n", ccb->ccb_h.func_code,
xpt_action_name(ccb->ccb_h.func_code)));
ccb->ccb_h.ppriv_ptr1 = ccb->ccb_h.cbfcnp;
ccb->ccb_h.cbfcnp = xpt_rescan_done;
xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_XPT);
/* Don't make duplicate entries for the same paths. */
xpt_lock_buses();
if (ccb->ccb_h.ppriv_ptr1 == NULL) {
TAILQ_FOREACH(hdr, &xsoftc.ccb_scanq, sim_links.tqe) {
if (xpt_path_comp(hdr->path, ccb->ccb_h.path) == 0) {
wakeup(&xsoftc.ccb_scanq);
xpt_unlock_buses();
xpt_print(ccb->ccb_h.path, "rescan already queued\n");
xpt_free_path(ccb->ccb_h.path);
xpt_free_ccb(ccb);
return;
}
}
}
TAILQ_INSERT_TAIL(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe);
xpt_hold_boot_locked();
wakeup(&xsoftc.ccb_scanq);
xpt_unlock_buses();
}
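/*
 * A minimal sketch of a typical in-kernel caller (illustrative only;
 * "path_id" is an assumed bus to rescan, and the sketch assumes
 * xpt_alloc_ccb() returns a zeroed ccb so that no completion callback
 * is set).  In that case the ccb and its path are freed either here on
 * error or in xpt_rescan_done(), so the caller must not free them:
 *
 *	union ccb *ccb = xpt_alloc_ccb();
 *
 *	if (xpt_create_path(&ccb->ccb_h.path, NULL, path_id,
 *	    CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 *		xpt_free_ccb(ccb);
 *		return;
 *	}
 *	xpt_rescan(ccb);	/* becomes an XPT_SCAN_BUS for path_id */
 */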
/* Functions accessed by the peripheral drivers */
static int
xpt_init(void *dummy)
{
struct cam_sim *xpt_sim;
struct cam_path *path;
struct cam_devq *devq;
cam_status status;
int error, i;
TAILQ_INIT(&xsoftc.xpt_busses);
TAILQ_INIT(&xsoftc.ccb_scanq);
STAILQ_INIT(&xsoftc.highpowerq);
xsoftc.num_highpower = CAM_MAX_HIGHPOWER;
mtx_init(&xsoftc.xpt_highpower_lock, "XPT highpower lock", NULL, MTX_DEF);
xsoftc.xpt_taskq = taskqueue_create("CAM XPT task", M_WAITOK,
taskqueue_thread_enqueue, /*context*/&xsoftc.xpt_taskq);
#ifdef CAM_BOOT_DELAY
/*
* Override this value at compile time to assist our users
* who don't use loader to boot a kernel.
*/
xsoftc.boot_delay = CAM_BOOT_DELAY;
#endif
/*
* The xpt layer is, itself, the equivalent of a SIM.
* Allow 16 ccbs in the ccb pool for it. This should
* give decent parallelism when we probe buses and
* perform other XPT functions.
*/
devq = cam_simq_alloc(16);
xpt_sim = cam_sim_alloc(xptaction,
xptpoll,
"xpt",
/*softc*/NULL,
/*unit*/0,
/*mtx*/NULL,
/*max_dev_transactions*/0,
/*max_tagged_dev_transactions*/0,
devq);
if (xpt_sim == NULL)
return (ENOMEM);
if ((status = xpt_bus_register(xpt_sim, NULL, 0)) != CAM_SUCCESS) {
printf("xpt_init: xpt_bus_register failed with status %#x,"
" failing attach\n", status);
return (EINVAL);
}
/*
* Looking at the XPT from the SIM layer, the XPT is
* the equivalent of a peripheral driver. Allocate
* a peripheral driver entry for us.
*/
if ((status = xpt_create_path(&path, NULL, CAM_XPT_PATH_ID,
CAM_TARGET_WILDCARD,
CAM_LUN_WILDCARD)) != CAM_REQ_CMP) {
printf("xpt_init: xpt_create_path failed with status %#x,"
" failing attach\n", status);
return (EINVAL);
}
xpt_path_lock(path);
cam_periph_alloc(xptregister, NULL, NULL, NULL, "xpt", CAM_PERIPH_BIO,
path, NULL, 0, xpt_sim);
xpt_path_unlock(path);
xpt_free_path(path);
if (cam_num_doneqs < 1)
cam_num_doneqs = 1 + mp_ncpus / 6;
else if (cam_num_doneqs > MAXCPU)
cam_num_doneqs = MAXCPU;
for (i = 0; i < cam_num_doneqs; i++) {
mtx_init(&cam_doneqs[i].cam_doneq_mtx, "CAM doneq", NULL,
MTX_DEF);
STAILQ_INIT(&cam_doneqs[i].cam_doneq);
error = kproc_kthread_add(xpt_done_td, &cam_doneqs[i],
&cam_proc, NULL, 0, 0, "cam", "doneq%d", i);
if (error != 0) {
cam_num_doneqs = i;
break;
}
}
if (cam_num_doneqs < 1) {
printf("xpt_init: Cannot init completion queues "
"- failing attach\n");
return (ENOMEM);
}
/*
* Register a callback for when interrupts are enabled.
*/
config_intrhook_oneshot(xpt_config, NULL);
return (0);
}
static cam_status
xptregister(struct cam_periph *periph, void *arg)
{
struct cam_sim *xpt_sim;
if (periph == NULL) {
printf("xptregister: periph was NULL!!\n");
return(CAM_REQ_CMP_ERR);
}
xpt_sim = (struct cam_sim *)arg;
xpt_sim->softc = periph;
xpt_periph = periph;
periph->softc = NULL;
return(CAM_REQ_CMP);
}
int32_t
xpt_add_periph(struct cam_periph *periph)
{
struct cam_ed *device;
int32_t status;
TASK_INIT(&periph->periph_run_task, 0, xpt_run_allocq_task, periph);
device = periph->path->device;
status = CAM_REQ_CMP;
if (device != NULL) {
mtx_lock(&device->target->bus->eb_mtx);
device->generation++;
SLIST_INSERT_HEAD(&device->periphs, periph, periph_links);
mtx_unlock(&device->target->bus->eb_mtx);
atomic_add_32(&xsoftc.xpt_generation, 1);
}
return (status);
}
void
xpt_remove_periph(struct cam_periph *periph)
{
struct cam_ed *device;
device = periph->path->device;
if (device != NULL) {
mtx_lock(&device->target->bus->eb_mtx);
device->generation++;
SLIST_REMOVE(&device->periphs, periph, cam_periph, periph_links);
mtx_unlock(&device->target->bus->eb_mtx);
atomic_add_32(&xsoftc.xpt_generation, 1);
}
}
void
xpt_announce_periph(struct cam_periph *periph, char *announce_string)
{
struct cam_path *path = periph->path;
struct xpt_proto *proto;
cam_periph_assert(periph, MA_OWNED);
periph->flags |= CAM_PERIPH_ANNOUNCED;
printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
periph->periph_name, periph->unit_number,
path->bus->sim->sim_name,
path->bus->sim->unit_number,
path->bus->sim->bus_id,
path->bus->path_id,
path->target->target_id,
(uintmax_t)path->device->lun_id);
printf("%s%d: ", periph->periph_name, periph->unit_number);
proto = xpt_proto_find(path->device->protocol);
if (proto)
proto->ops->announce(path->device);
else
printf("%s%d: Unknown protocol device %d\n",
periph->periph_name, periph->unit_number,
path->device->protocol);
if (path->device->serial_num_len > 0) {
/* Don't wrap the screen - print only the first 60 chars */
printf("%s%d: Serial Number %.60s\n", periph->periph_name,
periph->unit_number, path->device->serial_num);
}
/* Announce transport details. */
path->bus->xport->ops->announce(periph);
/* Announce command queueing. */
if (path->device->inq_flags & SID_CmdQue
|| path->device->flags & CAM_DEV_TAG_AFTER_COUNT) {
printf("%s%d: Command Queueing enabled\n",
periph->periph_name, periph->unit_number);
}
/* Announce the caller's details if any were passed in. */
if (announce_string != NULL)
printf("%s%d: %s\n", periph->periph_name,
periph->unit_number, announce_string);
}
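/*
 * For reference, the output of the function above looks roughly like
 * this on the console (names and numbers are purely illustrative; the
 * second line comes from the protocol's announce method):
 *
 *	da0 at mpt0 bus 0 scbus0 target 1 lun 0
 *	da0: <VENDOR PRODUCT 0123> Fixed Direct Access SCSI device
 *	da0: Serial Number ABC123
 *	da0: Command Queueing enabled
 */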
void
xpt_announce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb,
char *announce_string)
{
struct cam_path *path = periph->path;
struct xpt_proto *proto;
cam_periph_assert(periph, MA_OWNED);
periph->flags |= CAM_PERIPH_ANNOUNCED;
/* Fall back to the non-sbuf method if necessary */
if (xsoftc.announce_nosbuf != 0) {
xpt_announce_periph(periph, announce_string);
return;
}
proto = xpt_proto_find(path->device->protocol);
if (((proto != NULL) && (proto->ops->announce_sbuf == NULL)) ||
(path->bus->xport->ops->announce_sbuf == NULL)) {
xpt_announce_periph(periph, announce_string);
return;
}
sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
periph->periph_name, periph->unit_number,
path->bus->sim->sim_name,
path->bus->sim->unit_number,
path->bus->sim->bus_id,
path->bus->path_id,
path->target->target_id,
(uintmax_t)path->device->lun_id);
sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number);
if (proto)
proto->ops->announce_sbuf(path->device, sb);
else
sbuf_printf(sb, "%s%d: Unknown protocol device %d\n",
periph->periph_name, periph->unit_number,
path->device->protocol);
if (path->device->serial_num_len > 0) {
/* Don't wrap the screen - print only the first 60 chars */
sbuf_printf(sb, "%s%d: Serial Number %.60s\n",
periph->periph_name, periph->unit_number,
path->device->serial_num);
}
/* Announce transport details. */
path->bus->xport->ops->announce_sbuf(periph, sb);
/* Announce command queueing. */
if (path->device->inq_flags & SID_CmdQue
|| path->device->flags & CAM_DEV_TAG_AFTER_COUNT) {
sbuf_printf(sb, "%s%d: Command Queueing enabled\n",
periph->periph_name, periph->unit_number);
}
/* Announce the caller's details if any were passed in. */
if (announce_string != NULL)
sbuf_printf(sb, "%s%d: %s\n", periph->periph_name,
periph->unit_number, announce_string);
}
void
xpt_announce_quirks(struct cam_periph *periph, int quirks, char *bit_string)
{
if (quirks != 0) {
printf("%s%d: quirks=0x%b\n", periph->periph_name,
periph->unit_number, quirks, bit_string);
}
}
void
xpt_announce_quirks_sbuf(struct cam_periph *periph, struct sbuf *sb,
int quirks, char *bit_string)
{
if (xsoftc.announce_nosbuf != 0) {
xpt_announce_quirks(periph, quirks, bit_string);
return;
}
if (quirks != 0) {
sbuf_printf(sb, "%s%d: quirks=0x%b\n", periph->periph_name,
periph->unit_number, quirks, bit_string);
}
}
void
xpt_denounce_periph(struct cam_periph *periph)
{
struct cam_path *path = periph->path;
struct xpt_proto *proto;
cam_periph_assert(periph, MA_OWNED);
printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
periph->periph_name, periph->unit_number,
path->bus->sim->sim_name,
path->bus->sim->unit_number,
path->bus->sim->bus_id,
path->bus->path_id,
path->target->target_id,
(uintmax_t)path->device->lun_id);
printf("%s%d: ", periph->periph_name, periph->unit_number);
proto = xpt_proto_find(path->device->protocol);
if (proto)
proto->ops->denounce(path->device);
else
printf("%s%d: Unknown protocol device %d\n",
periph->periph_name, periph->unit_number,
path->device->protocol);
if (path->device->serial_num_len > 0)
printf(" s/n %.60s", path->device->serial_num);
printf(" detached\n");
}
void
xpt_denounce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb)
{
struct cam_path *path = periph->path;
struct xpt_proto *proto;
cam_periph_assert(periph, MA_OWNED);
/* Fall back to the non-sbuf method if necessary */
if (xsoftc.announce_nosbuf != 0) {
xpt_denounce_periph(periph);
return;
}
proto = xpt_proto_find(path->device->protocol);
if ((proto != NULL) && (proto->ops->denounce_sbuf == NULL)) {
xpt_denounce_periph(periph);
return;
}
sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n",
periph->periph_name, periph->unit_number,
path->bus->sim->sim_name,
path->bus->sim->unit_number,
path->bus->sim->bus_id,
path->bus->path_id,
path->target->target_id,
(uintmax_t)path->device->lun_id);
sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number);
if (proto)
proto->ops->denounce_sbuf(path->device, sb);
else
sbuf_printf(sb, "%s%d: Unknown protocol device %d\n",
periph->periph_name, periph->unit_number,
path->device->protocol);
if (path->device->serial_num_len > 0)
sbuf_printf(sb, " s/n %.60s", path->device->serial_num);
sbuf_printf(sb, " detached\n");
}
int
xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path)
{
int ret = -1, l, o;
struct ccb_dev_advinfo cdai;
struct scsi_vpd_device_id *did;
struct scsi_vpd_id_descriptor *idd;
xpt_path_assert(path, MA_OWNED);
memset(&cdai, 0, sizeof(cdai));
xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL);
cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
cdai.flags = CDAI_FLAG_NONE;
cdai.bufsiz = len;
cdai.buf = buf;
if (!strcmp(attr, "GEOM::ident"))
cdai.buftype = CDAI_TYPE_SERIAL_NUM;
else if (!strcmp(attr, "GEOM::physpath"))
cdai.buftype = CDAI_TYPE_PHYS_PATH;
else if (strcmp(attr, "GEOM::lunid") == 0 ||
strcmp(attr, "GEOM::lunname") == 0) {
cdai.buftype = CDAI_TYPE_SCSI_DEVID;
cdai.bufsiz = CAM_SCSI_DEVID_MAXLEN;
cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT);
if (cdai.buf == NULL) {
ret = ENOMEM;
goto out;
}
} else
goto out;
xpt_action((union ccb *)&cdai); /* can only be synchronous */
if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
if (cdai.provsiz == 0)
goto out;
switch(cdai.buftype) {
case CDAI_TYPE_SCSI_DEVID:
did = (struct scsi_vpd_device_id *)cdai.buf;
if (strcmp(attr, "GEOM::lunid") == 0) {
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_naa);
if (idd == NULL)
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_eui64);
if (idd == NULL)
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_uuid);
if (idd == NULL)
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_md5);
} else
idd = NULL;
if (idd == NULL)
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_t10);
if (idd == NULL)
idd = scsi_get_devid(did, cdai.provsiz,
scsi_devid_is_lun_name);
if (idd == NULL)
break;
ret = 0;
if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) ==
SVPD_ID_CODESET_ASCII) {
if (idd->length < len) {
for (l = 0; l < idd->length; l++)
buf[l] = idd->identifier[l] ?
idd->identifier[l] : ' ';
buf[l] = 0;
} else
ret = EFAULT;
break;
}
if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) ==
SVPD_ID_CODESET_UTF8) {
l = strnlen(idd->identifier, idd->length);
if (l < len) {
bcopy(idd->identifier, buf, l);
buf[l] = 0;
} else
ret = EFAULT;
break;
}
if ((idd->id_type & SVPD_ID_TYPE_MASK) ==
SVPD_ID_TYPE_UUID && idd->identifier[0] == 0x10) {
if ((idd->length - 2) * 2 + 4 >= len) {
ret = EFAULT;
break;
}
for (l = 2, o = 0; l < idd->length; l++) {
if (l == 6 || l == 8 || l == 10 || l == 12)
o += sprintf(buf + o, "-");
o += sprintf(buf + o, "%02x",
idd->identifier[l]);
}
break;
}
if (idd->length * 2 < len) {
for (l = 0; l < idd->length; l++)
sprintf(buf + l * 2, "%02x",
idd->identifier[l]);
} else
ret = EFAULT;
break;
default:
if (cdai.provsiz < len) {
cdai.buf[cdai.provsiz] = 0;
ret = 0;
} else
ret = EFAULT;
break;
}
out:
if ((char *)cdai.buf != buf)
free(cdai.buf, M_CAMXPT);
return ret;
}
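/*
 * A minimal sketch of a caller (illustrative only; "periph" stands for
 * an assumed peripheral whose lock maps to the path mutex, which the
 * assertion at the top of xpt_getattr() requires to be held):
 *
 *	char buf[CAM_SCSI_DEVID_MAXLEN];
 *	int ret;
 *
 *	cam_periph_lock(periph);
 *	ret = xpt_getattr(buf, sizeof(buf), "GEOM::lunid", periph->path);
 *	cam_periph_unlock(periph);
 *	if (ret == 0)
 *		printf("lunid: %s\n", buf);
 */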
static dev_match_ret
xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns,
struct cam_eb *bus)
{
dev_match_ret retval;
u_int i;
retval = DM_RET_NONE;
/*
* If we aren't given something to match against, that's an error.
*/
if (bus == NULL)
return(DM_RET_ERROR);
/*
* If there are no match entries, then this bus matches no
* matter what.
*/
if ((patterns == NULL) || (num_patterns == 0))
return(DM_RET_DESCEND | DM_RET_COPY);
for (i = 0; i < num_patterns; i++) {
struct bus_match_pattern *cur_pattern;
/*
* If the pattern in question isn't for a bus node, we
* aren't interested. However, we do indicate to the
* calling routine that we should continue descending the
* tree, since the user wants to match against lower-level
* EDT elements.
*/
if (patterns[i].type != DEV_MATCH_BUS) {
if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
retval |= DM_RET_DESCEND;
continue;
}
cur_pattern = &patterns[i].pattern.bus_pattern;
/*
* If they want to match any bus node, we give them any
* bus node.
*/
if (cur_pattern->flags == BUS_MATCH_ANY) {
/* set the copy flag */
retval |= DM_RET_COPY;
/*
* If we've already decided on an action, go ahead
* and return.
*/
if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE)
return(retval);
}
/*
* Not sure why someone would do this...
*/
if (cur_pattern->flags == BUS_MATCH_NONE)
continue;
if (((cur_pattern->flags & BUS_MATCH_PATH) != 0)
&& (cur_pattern->path_id != bus->path_id))
continue;
if (((cur_pattern->flags & BUS_MATCH_BUS_ID) != 0)
&& (cur_pattern->bus_id != bus->sim->bus_id))
continue;
if (((cur_pattern->flags & BUS_MATCH_UNIT) != 0)
&& (cur_pattern->unit_number != bus->sim->unit_number))
continue;
if (((cur_pattern->flags & BUS_MATCH_NAME) != 0)
&& (strncmp(cur_pattern->dev_name, bus->sim->sim_name,
DEV_IDLEN) != 0))
continue;
/*
* If we get to this point, the user definitely wants
* information on this bus. So tell the caller to copy the
* data out.
*/
retval |= DM_RET_COPY;
/*
* If the return action has been set to descend, then we
* know that we've already seen a non-bus matching
* expression, therefore we need to further descend the tree.
* This won't change by continuing around the loop, so we
* go ahead and return. If we haven't seen a non-bus
* matching expression, we keep going around the loop until
* we exhaust the matching expressions. We'll set the stop
* flag once we fall out of the loop.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
return(retval);
}
/*
* If the return action hasn't been set to descend yet, that means
* we haven't seen anything other than bus matching patterns. So
* tell the caller to stop descending the tree -- the user doesn't
* want to match against lower level tree elements.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
retval |= DM_RET_STOP;
return(retval);
}
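/*
 * A minimal sketch of a pattern this function accepts (illustrative
 * only; the SIM name "ahcich" is an assumption):
 *
 *	struct dev_match_pattern pat;
 *
 *	bzero(&pat, sizeof(pat));
 *	pat.type = DEV_MATCH_BUS;
 *	pat.pattern.bus_pattern.flags = BUS_MATCH_NAME;
 *	strlcpy(pat.pattern.bus_pattern.dev_name, "ahcich",
 *	    sizeof(pat.pattern.bus_pattern.dev_name));
 *
 * With only this pattern supplied, every bus whose SIM is named
 * "ahcich" is copied out and, since there are no lower-level patterns,
 * the traversal stops at the bus level.
 */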
static dev_match_ret
xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns,
struct cam_ed *device)
{
dev_match_ret retval;
u_int i;
retval = DM_RET_NONE;
/*
* If we aren't given something to match against, that's an error.
*/
if (device == NULL)
return(DM_RET_ERROR);
/*
* If there are no match entries, then this device matches no
* matter what.
*/
if ((patterns == NULL) || (num_patterns == 0))
return(DM_RET_DESCEND | DM_RET_COPY);
for (i = 0; i < num_patterns; i++) {
struct device_match_pattern *cur_pattern;
struct scsi_vpd_device_id *device_id_page;
/*
* If the pattern in question isn't for a device node, we
* aren't interested.
*/
if (patterns[i].type != DEV_MATCH_DEVICE) {
if ((patterns[i].type == DEV_MATCH_PERIPH)
&& ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE))
retval |= DM_RET_DESCEND;
continue;
}
cur_pattern = &patterns[i].pattern.device_pattern;
/* Error out if mutually exclusive options are specified. */
if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
== (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
return(DM_RET_ERROR);
/*
* If they want to match any device node, we give them any
* device node.
*/
if (cur_pattern->flags == DEV_MATCH_ANY)
goto copy_dev_node;
/*
* Not sure why someone would do this...
*/
if (cur_pattern->flags == DEV_MATCH_NONE)
continue;
if (((cur_pattern->flags & DEV_MATCH_PATH) != 0)
&& (cur_pattern->path_id != device->target->bus->path_id))
continue;
if (((cur_pattern->flags & DEV_MATCH_TARGET) != 0)
&& (cur_pattern->target_id != device->target->target_id))
continue;
if (((cur_pattern->flags & DEV_MATCH_LUN) != 0)
&& (cur_pattern->target_lun != device->lun_id))
continue;
if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0)
&& (cam_quirkmatch((caddr_t)&device->inq_data,
(caddr_t)&cur_pattern->data.inq_pat,
1, sizeof(cur_pattern->data.inq_pat),
scsi_static_inquiry_match) == NULL))
continue;
device_id_page = (struct scsi_vpd_device_id *)device->device_id;
if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0)
&& (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN
|| scsi_devid_match((uint8_t *)device_id_page->desc_list,
device->device_id_len
- SVPD_DEVICE_ID_HDR_LEN,
cur_pattern->data.devid_pat.id,
cur_pattern->data.devid_pat.id_len) != 0))
continue;
copy_dev_node:
/*
* If we get to this point, the user definitely wants
* information on this device. So tell the caller to copy
* the data out.
*/
retval |= DM_RET_COPY;
/*
* If the return action has been set to descend, then we
* know that we've already seen a peripheral matching
* expression, therefore we need to further descend the tree.
* This won't change by continuing around the loop, so we
* go ahead and return. If we haven't seen a peripheral
* matching expression, we keep going around the loop until
* we exhaust the matching expressions. We'll set the stop
* flag once we fall out of the loop.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
return(retval);
}
/*
* If the return action hasn't been set to descend yet, that means
* we haven't seen any peripheral matching patterns. So tell the
* caller to stop descending the tree -- the user doesn't want to
* match against lower level tree elements.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
retval |= DM_RET_STOP;
return(retval);
}
/*
* Match a single peripheral against any number of match patterns.
*/
static dev_match_ret
xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns,
struct cam_periph *periph)
{
dev_match_ret retval;
u_int i;
/*
* If we aren't given something to match against, that's an error.
*/
if (periph == NULL)
return(DM_RET_ERROR);
/*
* If there are no match entries, then this peripheral matches no
* matter what.
*/
if ((patterns == NULL) || (num_patterns == 0))
return(DM_RET_STOP | DM_RET_COPY);
/*
* There aren't any nodes below a peripheral node, so there's no
* reason to descend the tree any further.
*/
retval = DM_RET_STOP;
for (i = 0; i < num_patterns; i++) {
struct periph_match_pattern *cur_pattern;
/*
* If the pattern in question isn't for a peripheral, we
* aren't interested.
*/
if (patterns[i].type != DEV_MATCH_PERIPH)
continue;
cur_pattern = &patterns[i].pattern.periph_pattern;
/*
* If they want to match on anything, then we will do so.
*/
if (cur_pattern->flags == PERIPH_MATCH_ANY) {
/* set the copy flag */
retval |= DM_RET_COPY;
/*
* We've already set the return action to stop,
* since there are no nodes below peripherals in
* the tree.
*/
return(retval);
}
/*
* Not sure why someone would do this...
*/
if (cur_pattern->flags == PERIPH_MATCH_NONE)
continue;
if (((cur_pattern->flags & PERIPH_MATCH_PATH) != 0)
&& (cur_pattern->path_id != periph->path->bus->path_id))
continue;
/*
* For the target and lun IDs, we have to make sure the
* target and lun pointers aren't NULL. The xpt peripheral
* has a wildcard target and device.
*/
if (((cur_pattern->flags & PERIPH_MATCH_TARGET) != 0)
&& ((periph->path->target == NULL)
||(cur_pattern->target_id != periph->path->target->target_id)))
continue;
if (((cur_pattern->flags & PERIPH_MATCH_LUN) != 0)
&& ((periph->path->device == NULL)
|| (cur_pattern->target_lun != periph->path->device->lun_id)))
continue;
if (((cur_pattern->flags & PERIPH_MATCH_UNIT) != 0)
&& (cur_pattern->unit_number != periph->unit_number))
continue;
if (((cur_pattern->flags & PERIPH_MATCH_NAME) != 0)
&& (strncmp(cur_pattern->periph_name, periph->periph_name,
DEV_IDLEN) != 0))
continue;
/*
* If we get to this point, the user definitely wants
* information on this peripheral. So tell the caller to
* copy the data out.
*/
retval |= DM_RET_COPY;
/*
* The return action has already been set to stop, since
* peripherals don't have any nodes below them in the EDT.
*/
return(retval);
}
/*
* If we get to this point, the peripheral that was passed in
* doesn't match any of the patterns.
*/
return(retval);
}
static int
xptedtbusfunc(struct cam_eb *bus, void *arg)
{
struct ccb_dev_match *cdm;
struct cam_et *target;
dev_match_ret retval;
cdm = (struct ccb_dev_match *)arg;
/*
* If our position is for something deeper in the tree, that means
* that we've already seen this node. So, we keep going down.
*/
if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
&& (cdm->pos.cookie.bus == bus)
&& (cdm->pos.position_type & CAM_DEV_POS_TARGET)
&& (cdm->pos.cookie.target != NULL))
retval = DM_RET_DESCEND;
else
retval = xptbusmatch(cdm->patterns, cdm->num_patterns, bus);
/*
* If we got an error, bail out of the search.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
cdm->status = CAM_DEV_MATCH_ERROR;
return(0);
}
/*
* If the copy flag is set, copy this bus out.
*/
if (retval & DM_RET_COPY) {
int spaceleft, j;
spaceleft = cdm->match_buf_len - (cdm->num_matches *
sizeof(struct dev_match_result));
/*
* If we don't have enough space to put in another
* match result, save our position and tell the
* user there are more devices to check.
*/
if (spaceleft < sizeof(struct dev_match_result)) {
bzero(&cdm->pos, sizeof(cdm->pos));
cdm->pos.position_type =
CAM_DEV_POS_EDT | CAM_DEV_POS_BUS;
cdm->pos.cookie.bus = bus;
cdm->pos.generations[CAM_BUS_GENERATION]=
xsoftc.bus_generation;
cdm->status = CAM_DEV_MATCH_MORE;
return(0);
}
j = cdm->num_matches;
cdm->num_matches++;
cdm->matches[j].type = DEV_MATCH_BUS;
cdm->matches[j].result.bus_result.path_id = bus->path_id;
cdm->matches[j].result.bus_result.bus_id = bus->sim->bus_id;
cdm->matches[j].result.bus_result.unit_number =
bus->sim->unit_number;
strlcpy(cdm->matches[j].result.bus_result.dev_name,
bus->sim->sim_name,
sizeof(cdm->matches[j].result.bus_result.dev_name));
}
/*
* If the user is only interested in buses, there's no
* reason to descend to the next level in the tree.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
return(1);
/*
* If there is a target generation recorded, check it to
* make sure the target list hasn't changed.
*/
mtx_lock(&bus->eb_mtx);
if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
&& (cdm->pos.cookie.bus == bus)
&& (cdm->pos.position_type & CAM_DEV_POS_TARGET)
&& (cdm->pos.cookie.target != NULL)) {
if ((cdm->pos.generations[CAM_TARGET_GENERATION] !=
bus->generation)) {
mtx_unlock(&bus->eb_mtx);
cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
return (0);
}
target = (struct cam_et *)cdm->pos.cookie.target;
target->refcount++;
} else
target = NULL;
mtx_unlock(&bus->eb_mtx);
return (xpttargettraverse(bus, target, xptedttargetfunc, arg));
}
static int
xptedttargetfunc(struct cam_et *target, void *arg)
{
struct ccb_dev_match *cdm;
struct cam_eb *bus;
struct cam_ed *device;
cdm = (struct ccb_dev_match *)arg;
bus = target->bus;
/*
* If there is a device list generation recorded, check it to
* make sure the device list hasn't changed.
*/
mtx_lock(&bus->eb_mtx);
if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
&& (cdm->pos.cookie.bus == bus)
&& (cdm->pos.position_type & CAM_DEV_POS_TARGET)
&& (cdm->pos.cookie.target == target)
&& (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
&& (cdm->pos.cookie.device != NULL)) {
if (cdm->pos.generations[CAM_DEV_GENERATION] !=
target->generation) {
mtx_unlock(&bus->eb_mtx);
cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
return(0);
}
device = (struct cam_ed *)cdm->pos.cookie.device;
device->refcount++;
} else
device = NULL;
mtx_unlock(&bus->eb_mtx);
return (xptdevicetraverse(target, device, xptedtdevicefunc, arg));
}
static int
xptedtdevicefunc(struct cam_ed *device, void *arg)
{
struct cam_eb *bus;
struct cam_periph *periph;
struct ccb_dev_match *cdm;
dev_match_ret retval;
cdm = (struct ccb_dev_match *)arg;
bus = device->target->bus;
/*
* If our position is for something deeper in the tree, that means
* that we've already seen this node. So, we keep going down.
*/
if ((cdm->pos.position_type & CAM_DEV_POS_DEVICE)
&& (cdm->pos.cookie.device == device)
&& (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
&& (cdm->pos.cookie.periph != NULL))
retval = DM_RET_DESCEND;
else
retval = xptdevicematch(cdm->patterns, cdm->num_patterns,
device);
if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
cdm->status = CAM_DEV_MATCH_ERROR;
return(0);
}
/*
* If the copy flag is set, copy this device out.
*/
if (retval & DM_RET_COPY) {
int spaceleft, j;
spaceleft = cdm->match_buf_len - (cdm->num_matches *
sizeof(struct dev_match_result));
/*
* If we don't have enough space to put in another
* match result, save our position and tell the
* user there are more devices to check.
*/
if (spaceleft < sizeof(struct dev_match_result)) {
bzero(&cdm->pos, sizeof(cdm->pos));
cdm->pos.position_type =
CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE;
cdm->pos.cookie.bus = device->target->bus;
cdm->pos.generations[CAM_BUS_GENERATION]=
xsoftc.bus_generation;
cdm->pos.cookie.target = device->target;
cdm->pos.generations[CAM_TARGET_GENERATION] =
device->target->bus->generation;
cdm->pos.cookie.device = device;
cdm->pos.generations[CAM_DEV_GENERATION] =
device->target->generation;
cdm->status = CAM_DEV_MATCH_MORE;
return(0);
}
j = cdm->num_matches;
cdm->num_matches++;
cdm->matches[j].type = DEV_MATCH_DEVICE;
cdm->matches[j].result.device_result.path_id =
device->target->bus->path_id;
cdm->matches[j].result.device_result.target_id =
device->target->target_id;
cdm->matches[j].result.device_result.target_lun =
device->lun_id;
cdm->matches[j].result.device_result.protocol =
device->protocol;
bcopy(&device->inq_data,
&cdm->matches[j].result.device_result.inq_data,
sizeof(struct scsi_inquiry_data));
bcopy(&device->ident_data,
&cdm->matches[j].result.device_result.ident_data,
sizeof(struct ata_params));
/* Let the user know whether this device is unconfigured */
if (device->flags & CAM_DEV_UNCONFIGURED)
cdm->matches[j].result.device_result.flags =
DEV_RESULT_UNCONFIGURED;
else
cdm->matches[j].result.device_result.flags =
DEV_RESULT_NOFLAG;
}
/*
* If the user isn't interested in peripherals, don't descend
* the tree any further.
*/
if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
return(1);
/*
* If there is a peripheral list generation recorded, make sure
* it hasn't changed.
*/
xpt_lock_buses();
mtx_lock(&bus->eb_mtx);
if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
&& (cdm->pos.cookie.bus == bus)
&& (cdm->pos.position_type & CAM_DEV_POS_TARGET)
&& (cdm->pos.cookie.target == device->target)
&& (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
&& (cdm->pos.cookie.device == device)
&& (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
&& (cdm->pos.cookie.periph != NULL)) {
if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
device->generation) {
mtx_unlock(&bus->eb_mtx);
xpt_unlock_buses();
cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
return(0);
}
periph = (struct cam_periph *)cdm->pos.cookie.periph;
periph->refcount++;
} else
periph = NULL;
mtx_unlock(&bus->eb_mtx);
xpt_unlock_buses();
return (xptperiphtraverse(device, periph, xptedtperiphfunc, arg));
}
static int
xptedtperiphfunc(struct cam_periph *periph, void *arg)
{
struct ccb_dev_match *cdm;
dev_match_ret retval;
cdm = (struct ccb_dev_match *)arg;
retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
cdm->status = CAM_DEV_MATCH_ERROR;
return(0);
}
/*
* If the copy flag is set, copy this peripheral out.
*/
if (retval & DM_RET_COPY) {
int spaceleft, j;
size_t l;
spaceleft = cdm->match_buf_len - (cdm->num_matches *
sizeof(struct dev_match_result));
/*
* If we don't have enough space to put in another
* match result, save our position and tell the
* user there are more devices to check.
*/
if (spaceleft < sizeof(struct dev_match_result)) {
bzero(&cdm->pos, sizeof(cdm->pos));
cdm->pos.position_type =
CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE |
CAM_DEV_POS_PERIPH;
cdm->pos.cookie.bus = periph->path->bus;
cdm->pos.generations[CAM_BUS_GENERATION]=
xsoftc.bus_generation;
cdm->pos.cookie.target = periph->path->target;
cdm->pos.generations[CAM_TARGET_GENERATION] =
periph->path->bus->generation;
cdm->pos.cookie.device = periph->path->device;
cdm->pos.generations[CAM_DEV_GENERATION] =
periph->path->target->generation;
cdm->pos.cookie.periph = periph;
cdm->pos.generations[CAM_PERIPH_GENERATION] =
periph->path->device->generation;
cdm->status = CAM_DEV_MATCH_MORE;
return(0);
}
j = cdm->num_matches;
cdm->num_matches++;
cdm->matches[j].type = DEV_MATCH_PERIPH;
cdm->matches[j].result.periph_result.path_id =
periph->path->bus->path_id;
cdm->matches[j].result.periph_result.target_id =
periph->path->target->target_id;
cdm->matches[j].result.periph_result.target_lun =
periph->path->device->lun_id;
cdm->matches[j].result.periph_result.unit_number =
periph->unit_number;
l = sizeof(cdm->matches[j].result.periph_result.periph_name);
strlcpy(cdm->matches[j].result.periph_result.periph_name,
periph->periph_name, l);
}
return(1);
}
static int
xptedtmatch(struct ccb_dev_match *cdm)
{
struct cam_eb *bus;
int ret;
cdm->num_matches = 0;
/*
* Check the bus list generation. If it has changed, the user
* needs to reset everything and start over.
*/
xpt_lock_buses();
if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
&& (cdm->pos.cookie.bus != NULL)) {
if (cdm->pos.generations[CAM_BUS_GENERATION] !=
xsoftc.bus_generation) {
xpt_unlock_buses();
cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
return(0);
}
bus = (struct cam_eb *)cdm->pos.cookie.bus;
bus->refcount++;
} else
bus = NULL;
xpt_unlock_buses();
ret = xptbustraverse(bus, xptedtbusfunc, cdm);
/*
* If we get back 0, that means that we had to stop before fully
* traversing the EDT. It also means that one of the subroutines
* has set the status field to the proper value. If we get back 1,
* we've fully traversed the EDT and copied out any matching entries.
*/
if (ret == 1)
cdm->status = CAM_DEV_MATCH_LAST;
return(ret);
}
static int
xptplistpdrvfunc(struct periph_driver **pdrv, void *arg)
{
struct cam_periph *periph;
struct ccb_dev_match *cdm;
cdm = (struct ccb_dev_match *)arg;
xpt_lock_buses();
if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
&& (cdm->pos.cookie.pdrv == pdrv)
&& (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
&& (cdm->pos.cookie.periph != NULL)) {
if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
(*pdrv)->generation) {
xpt_unlock_buses();
cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
return(0);
}
periph = (struct cam_periph *)cdm->pos.cookie.periph;
periph->refcount++;
} else
periph = NULL;
xpt_unlock_buses();
return (xptpdperiphtraverse(pdrv, periph, xptplistperiphfunc, arg));
}
static int
xptplistperiphfunc(struct cam_periph *periph, void *arg)
{
struct ccb_dev_match *cdm;
dev_match_ret retval;
cdm = (struct ccb_dev_match *)arg;
retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);
if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
cdm->status = CAM_DEV_MATCH_ERROR;
return(0);
}
/*
* If the copy flag is set, copy this peripheral out.
*/
if (retval & DM_RET_COPY) {
int spaceleft, j;
size_t l;
spaceleft = cdm->match_buf_len - (cdm->num_matches *
sizeof(struct dev_match_result));
/*
* If we don't have enough space to put in another
* match result, save our position and tell the
* user there are more devices to check.
*/
if (spaceleft < sizeof(struct dev_match_result)) {
struct periph_driver **pdrv;
pdrv = NULL;
bzero(&cdm->pos, sizeof(cdm->pos));
cdm->pos.position_type =
CAM_DEV_POS_PDRV | CAM_DEV_POS_PDPTR |
CAM_DEV_POS_PERIPH;
/*
* This may look a bit nonsensical, but it is
* actually quite logical. There are very few
* peripheral drivers, and bloating every peripheral
* structure with a pointer back to its parent
* peripheral driver linker set entry would cost
* more in the long run than doing this quick lookup.
*/
for (pdrv = periph_drivers; *pdrv != NULL; pdrv++) {
if (strcmp((*pdrv)->driver_name,
periph->periph_name) == 0)
break;
}
if (*pdrv == NULL) {
cdm->status = CAM_DEV_MATCH_ERROR;
return(0);
}
cdm->pos.cookie.pdrv = pdrv;
/*
* The periph generation slot does double duty, as
* does the periph pointer slot. They are used for
* both EDT and pdrv lookups and positioning.
*/
cdm->pos.cookie.periph = periph;
cdm->pos.generations[CAM_PERIPH_GENERATION] =
(*pdrv)->generation;
cdm->status = CAM_DEV_MATCH_MORE;
return(0);
}
j = cdm->num_matches;
cdm->num_matches++;
cdm->matches[j].type = DEV_MATCH_PERIPH;
cdm->matches[j].result.periph_result.path_id =
periph->path->bus->path_id;
/*
* The transport layer peripheral doesn't have a target or
* lun.
*/
if (periph->path->target)
cdm->matches[j].result.periph_result.target_id =
periph->path->target->target_id;
else
cdm->matches[j].result.periph_result.target_id =
CAM_TARGET_WILDCARD;
if (periph->path->device)
cdm->matches[j].result.periph_result.target_lun =
periph->path->device->lun_id;
else
cdm->matches[j].result.periph_result.target_lun =
CAM_LUN_WILDCARD;
cdm->matches[j].result.periph_result.unit_number =
periph->unit_number;
l = sizeof(cdm->matches[j].result.periph_result.periph_name);
strlcpy(cdm->matches[j].result.periph_result.periph_name,
periph->periph_name, l);
}
return(1);
}
static int
xptperiphlistmatch(struct ccb_dev_match *cdm)
{
int ret;
cdm->num_matches = 0;
/*
* At this point in the EDT traversal function, we check the bus
* list generation to make sure that no buses have been added or
* removed since the user last sent an XPT_DEV_MATCH ccb through.
* For the peripheral driver list traversal function, however, we
* don't have to worry about new peripheral driver types coming or
* going; they're in a linker set, and therefore can't change
* without a recompile.
*/
if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
&& (cdm->pos.cookie.pdrv != NULL))
ret = xptpdrvtraverse(
(struct periph_driver **)cdm->pos.cookie.pdrv,
xptplistpdrvfunc, cdm);
else
ret = xptpdrvtraverse(NULL, xptplistpdrvfunc, cdm);
/*
* If we get back 0, that means that we had to stop before fully
* traversing the peripheral driver tree. It also means that one of
* the subroutines has set the status field to the proper value. If
* we get back 1, we've fully traversed the peripheral driver list and
* copied out any matching entries.
*/
if (ret == 1)
cdm->status = CAM_DEV_MATCH_LAST;
return(ret);
}
static int
xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg)
{
struct cam_eb *bus, *next_bus;
int retval;
retval = 1;
if (start_bus)
bus = start_bus;
else {
xpt_lock_buses();
bus = TAILQ_FIRST(&xsoftc.xpt_busses);
if (bus == NULL) {
xpt_unlock_buses();
return (retval);
}
bus->refcount++;
xpt_unlock_buses();
}
for (; bus != NULL; bus = next_bus) {
retval = tr_func(bus, arg);
if (retval == 0) {
xpt_release_bus(bus);
break;
}
xpt_lock_buses();
next_bus = TAILQ_NEXT(bus, links);
if (next_bus)
next_bus->refcount++;
xpt_unlock_buses();
xpt_release_bus(bus);
}
return(retval);
}
static int
xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target,
xpt_targetfunc_t *tr_func, void *arg)
{
struct cam_et *target, *next_target;
int retval;
retval = 1;
if (start_target)
target = start_target;
else {
mtx_lock(&bus->eb_mtx);
target = TAILQ_FIRST(&bus->et_entries);
if (target == NULL) {
mtx_unlock(&bus->eb_mtx);
return (retval);
}
target->refcount++;
mtx_unlock(&bus->eb_mtx);
}
for (; target != NULL; target = next_target) {
retval = tr_func(target, arg);
if (retval == 0) {
xpt_release_target(target);
break;
}
mtx_lock(&bus->eb_mtx);
next_target = TAILQ_NEXT(target, links);
if (next_target)
next_target->refcount++;
mtx_unlock(&bus->eb_mtx);
xpt_release_target(target);
}
return(retval);
}
static int
xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device,
xpt_devicefunc_t *tr_func, void *arg)
{
struct cam_eb *bus;
struct cam_ed *device, *next_device;
int retval;
retval = 1;
bus = target->bus;
if (start_device)
device = start_device;
else {
mtx_lock(&bus->eb_mtx);
device = TAILQ_FIRST(&target->ed_entries);
if (device == NULL) {
mtx_unlock(&bus->eb_mtx);
return (retval);
}
device->refcount++;
mtx_unlock(&bus->eb_mtx);
}
for (; device != NULL; device = next_device) {
mtx_lock(&device->device_mtx);
retval = tr_func(device, arg);
mtx_unlock(&device->device_mtx);
if (retval == 0) {
xpt_release_device(device);
break;
}
mtx_lock(&bus->eb_mtx);
next_device = TAILQ_NEXT(device, links);
if (next_device)
next_device->refcount++;
mtx_unlock(&bus->eb_mtx);
xpt_release_device(device);
}
return(retval);
}
static int
xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph,
xpt_periphfunc_t *tr_func, void *arg)
{
struct cam_eb *bus;
struct cam_periph *periph, *next_periph;
int retval;
retval = 1;
bus = device->target->bus;
if (start_periph)
periph = start_periph;
else {
xpt_lock_buses();
mtx_lock(&bus->eb_mtx);
periph = SLIST_FIRST(&device->periphs);
while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
periph = SLIST_NEXT(periph, periph_links);
if (periph == NULL) {
mtx_unlock(&bus->eb_mtx);
xpt_unlock_buses();
return (retval);
}
periph->refcount++;
mtx_unlock(&bus->eb_mtx);
xpt_unlock_buses();
}
for (; periph != NULL; periph = next_periph) {
retval = tr_func(periph, arg);
if (retval == 0) {
cam_periph_release_locked(periph);
break;
}
xpt_lock_buses();
mtx_lock(&bus->eb_mtx);
next_periph = SLIST_NEXT(periph, periph_links);
while (next_periph != NULL &&
(next_periph->flags & CAM_PERIPH_FREE) != 0)
next_periph = SLIST_NEXT(next_periph, periph_links);
if (next_periph)
next_periph->refcount++;
mtx_unlock(&bus->eb_mtx);
xpt_unlock_buses();
cam_periph_release_locked(periph);
}
return(retval);
}
static int
xptpdrvtraverse(struct periph_driver **start_pdrv,
xpt_pdrvfunc_t *tr_func, void *arg)
{
struct periph_driver **pdrv;
int retval;
retval = 1;
/*
* We don't traverse the peripheral driver list like we do the
* other lists, because it is a linker set, and therefore cannot be
* changed during runtime. If the peripheral driver list is ever
* re-done to be something other than a linker set (i.e. it can
* change while the system is running), the list traversal should
* be modified to work like the other traversal functions.
*/
for (pdrv = (start_pdrv ? start_pdrv : periph_drivers);
*pdrv != NULL; pdrv++) {
retval = tr_func(pdrv, arg);
if (retval == 0)
return(retval);
}
return(retval);
}
static int
xptpdperiphtraverse(struct periph_driver **pdrv,
struct cam_periph *start_periph,
xpt_periphfunc_t *tr_func, void *arg)
{
struct cam_periph *periph, *next_periph;
int retval;
retval = 1;
if (start_periph)
periph = start_periph;
else {
xpt_lock_buses();
periph = TAILQ_FIRST(&(*pdrv)->units);
while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0)
periph = TAILQ_NEXT(periph, unit_links);
if (periph == NULL) {
xpt_unlock_buses();
return (retval);
}
periph->refcount++;
xpt_unlock_buses();
}
for (; periph != NULL; periph = next_periph) {
cam_periph_lock(periph);
retval = tr_func(periph, arg);
cam_periph_unlock(periph);
if (retval == 0) {
cam_periph_release(periph);
break;
}
xpt_lock_buses();
next_periph = TAILQ_NEXT(periph, unit_links);
while (next_periph != NULL &&
(next_periph->flags & CAM_PERIPH_FREE) != 0)
next_periph = TAILQ_NEXT(next_periph, unit_links);
if (next_periph)
next_periph->refcount++;
xpt_unlock_buses();
cam_periph_release(periph);
}
return(retval);
}
static int
xptdefbusfunc(struct cam_eb *bus, void *arg)
{
struct xpt_traverse_config *tr_config;
tr_config = (struct xpt_traverse_config *)arg;
if (tr_config->depth == XPT_DEPTH_BUS) {
xpt_busfunc_t *tr_func;
tr_func = (xpt_busfunc_t *)tr_config->tr_func;
return(tr_func(bus, tr_config->tr_arg));
} else
return(xpttargettraverse(bus, NULL, xptdeftargetfunc, arg));
}
static int
xptdeftargetfunc(struct cam_et *target, void *arg)
{
struct xpt_traverse_config *tr_config;
tr_config = (struct xpt_traverse_config *)arg;
if (tr_config->depth == XPT_DEPTH_TARGET) {
xpt_targetfunc_t *tr_func;
tr_func = (xpt_targetfunc_t *)tr_config->tr_func;
return(tr_func(target, tr_config->tr_arg));
} else
return(xptdevicetraverse(target, NULL, xptdefdevicefunc, arg));
}
static int
xptdefdevicefunc(struct cam_ed *device, void *arg)
{
struct xpt_traverse_config *tr_config;
tr_config = (struct xpt_traverse_config *)arg;
if (tr_config->depth == XPT_DEPTH_DEVICE) {
xpt_devicefunc_t *tr_func;
tr_func = (xpt_devicefunc_t *)tr_config->tr_func;
return(tr_func(device, tr_config->tr_arg));
} else
return(xptperiphtraverse(device, NULL, xptdefperiphfunc, arg));
}
static int
xptdefperiphfunc(struct cam_periph *periph, void *arg)
{
struct xpt_traverse_config *tr_config;
xpt_periphfunc_t *tr_func;
tr_config = (struct xpt_traverse_config *)arg;
tr_func = (xpt_periphfunc_t *)tr_config->tr_func;
/*
* Unlike the other default functions, we don't check for depth
* here. The peripheral driver level is the last level in the EDT,
* so if we're here, we should execute the function in question.
*/
return(tr_func(periph, tr_config->tr_arg));
}
/*
* Execute the given function for every bus in the EDT.
*/
static int
xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg)
{
struct xpt_traverse_config tr_config;
tr_config.depth = XPT_DEPTH_BUS;
tr_config.tr_func = tr_func;
tr_config.tr_arg = arg;
return(xptbustraverse(NULL, xptdefbusfunc, &tr_config));
}
/*
* Execute the given function for every device in the EDT.
*/
static int
xpt_for_all_devices(xpt_devicefunc_t *tr_func, void *arg)
{
struct xpt_traverse_config tr_config;
tr_config.depth = XPT_DEPTH_DEVICE;
tr_config.tr_func = tr_func;
tr_config.tr_arg = arg;
return(xptbustraverse(NULL, xptdefbusfunc, &tr_config));
}
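/*
 * A minimal sketch of a traversal callback following the xpt_devicefunc_t
 * convention used above (return 1 to keep walking, 0 to stop). The
 * xpt_count_dev_cb()/xpt_count_devices() names are hypothetical and only
 * illustrate how xpt_for_all_devices() might be driven.
 */
#if 0
static int
xpt_count_dev_cb(struct cam_ed *device, void *arg)
{
	int *count = arg;

	(*count)++;	/* tally every device currently in the EDT */
	return (1);	/* continue the traversal */
}

static int
xpt_count_devices(void)
{
	int count = 0;

	xpt_for_all_devices(xpt_count_dev_cb, &count);
	return (count);
}
#endif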
static int
xptsetasyncfunc(struct cam_ed *device, void *arg)
{
struct cam_path path;
struct ccb_getdev cgd;
struct ccb_setasync *csa = (struct ccb_setasync *)arg;
/*
* Don't report unconfigured devices (Wildcard devs,
* devices only for target mode, device instances
* that have been invalidated but are waiting for
* their last reference count to be released).
*/
if ((device->flags & CAM_DEV_UNCONFIGURED) != 0)
return (1);
xpt_compile_path(&path,
NULL,
device->target->bus->path_id,
device->target->target_id,
device->lun_id);
xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
csa->callback(csa->callback_arg,
AC_FOUND_DEVICE,
&path, &cgd);
xpt_release_path(&path);
return(1);
}
static int
xptsetasyncbusfunc(struct cam_eb *bus, void *arg)
{
struct cam_path path;
struct ccb_pathinq cpi;
struct ccb_setasync *csa = (struct ccb_setasync *)arg;
xpt_compile_path(&path, /*periph*/NULL,
bus->path_id,
CAM_TARGET_WILDCARD,
CAM_LUN_WILDCARD);
xpt_path_lock(&path);
xpt_path_inq(&cpi, &path);
csa->callback(csa->callback_arg,
AC_PATH_REGISTERED,
&path, &cpi);
xpt_path_unlock(&path);
xpt_release_path(&path);
return(1);
}
void
xpt_action(union ccb *start_ccb)
{
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE,
("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code)));
start_ccb->ccb_h.status = CAM_REQ_INPROG;
(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
}
void
xpt_action_default(union ccb *start_ccb)
{
struct cam_path *path;
struct cam_sim *sim;
struct mtx *mtx;
path = start_ccb->ccb_h.path;
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("xpt_action_default: func %#x %s\n", start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code)));
switch (start_ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
{
struct cam_ed *device;
/*
* For the sake of compatibility with SCSI-1
* devices that may not understand the identify
* message, we include lun information in the
* second byte of all commands. SCSI-1 specifies
* that luns are a 3 bit value and reserves only 3
* bits for lun information in the CDB. Later
* revisions of the SCSI spec allow for more than 8
* luns, but have deprecated lun information in the
* CDB. So, if the lun won't fit, we must omit it.
*
* Also be aware that during initial probing for devices,
* the inquiry information is unknown but initialized to 0.
* This means that this code will be exercised while probing
* devices with an ANSI revision greater than 2.
*/
device = path->device;
if (device->protocol_version <= SCSI_REV_2
&& start_ccb->ccb_h.target_lun < 8
&& (start_ccb->ccb_h.flags & CAM_CDB_POINTER) == 0) {
start_ccb->csio.cdb_io.cdb_bytes[1] |=
start_ccb->ccb_h.target_lun << 5;
}
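/* For example, LUN 2 on such a device is encoded as 0x40 in CDB byte 1 (bits 7-5). */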
start_ccb->csio.scsi_status = SCSI_STATUS_OK;
}
/* FALLTHROUGH */
case XPT_TARGET_IO:
case XPT_CONT_TARGET_IO:
start_ccb->csio.sense_resid = 0;
start_ccb->csio.resid = 0;
/* FALLTHROUGH */
case XPT_ATA_IO:
if (start_ccb->ccb_h.func_code == XPT_ATA_IO)
start_ccb->ataio.resid = 0;
/* FALLTHROUGH */
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
case XPT_MMC_IO:
case XPT_RESET_DEV:
case XPT_ENG_EXEC:
case XPT_SMP_IO:
{
struct cam_devq *devq;
devq = path->bus->sim->devq;
mtx_lock(&devq->send_mtx);
cam_ccbq_insert_ccb(&path->device->ccbq, start_ccb);
if (xpt_schedule_devq(devq, path->device) != 0)
xpt_run_devq(devq);
mtx_unlock(&devq->send_mtx);
break;
}
case XPT_CALC_GEOMETRY:
/* Filter out garbage */
if (start_ccb->ccg.block_size == 0
|| start_ccb->ccg.volume_size == 0) {
start_ccb->ccg.cylinders = 0;
start_ccb->ccg.heads = 0;
start_ccb->ccg.secs_per_track = 0;
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
goto call_sim;
case XPT_ABORT:
{
union ccb* abort_ccb;
abort_ccb = start_ccb->cab.abort_ccb;
if (XPT_FC_IS_DEV_QUEUED(abort_ccb)) {
struct cam_ed *device;
struct cam_devq *devq;
device = abort_ccb->ccb_h.path->device;
devq = device->sim->devq;
mtx_lock(&devq->send_mtx);
if (abort_ccb->ccb_h.pinfo.index > 0) {
cam_ccbq_remove_ccb(&device->ccbq, abort_ccb);
abort_ccb->ccb_h.status =
CAM_REQ_ABORTED|CAM_DEV_QFRZN;
xpt_freeze_devq_device(device, 1);
mtx_unlock(&devq->send_mtx);
xpt_done(abort_ccb);
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
mtx_unlock(&devq->send_mtx);
if (abort_ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX
&& (abort_ccb->ccb_h.status & CAM_SIM_QUEUED) == 0) {
/*
* We've caught this ccb en route to
* the SIM. Flag it for abort and the
* SIM will do so just before starting
* real work on the CCB.
*/
abort_ccb->ccb_h.status =
CAM_REQ_ABORTED|CAM_DEV_QFRZN;
xpt_freeze_devq(abort_ccb->ccb_h.path, 1);
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
}
if (XPT_FC_IS_QUEUED(abort_ccb)
&& (abort_ccb->ccb_h.pinfo.index == CAM_DONEQ_INDEX)) {
/*
* It's already completed but waiting
* for our SWI to get to it.
*/
start_ccb->ccb_h.status = CAM_UA_ABORT;
break;
}
/*
* If we weren't able to take care of the abort request
* in the XPT, pass the request down to the SIM for processing.
*/
}
/* FALLTHROUGH */
case XPT_ACCEPT_TARGET_IO:
case XPT_EN_LUN:
case XPT_IMMED_NOTIFY:
case XPT_NOTIFY_ACK:
case XPT_RESET_BUS:
case XPT_IMMEDIATE_NOTIFY:
case XPT_NOTIFY_ACKNOWLEDGE:
case XPT_GET_SIM_KNOB_OLD:
case XPT_GET_SIM_KNOB:
case XPT_SET_SIM_KNOB:
case XPT_GET_TRAN_SETTINGS:
case XPT_SET_TRAN_SETTINGS:
case XPT_PATH_INQ:
call_sim:
sim = path->bus->sim;
mtx = sim->mtx;
if (mtx && !mtx_owned(mtx))
mtx_lock(mtx);
else
mtx = NULL;
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("Calling sim->sim_action(): func=%#x\n", start_ccb->ccb_h.func_code));
(*(sim->sim_action))(sim, start_ccb);
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("sim->sim_action returned: status=%#x\n", start_ccb->ccb_h.status));
if (mtx)
mtx_unlock(mtx);
break;
case XPT_PATH_STATS:
start_ccb->cpis.last_reset = path->bus->last_reset;
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_GDEV_TYPE:
{
struct cam_ed *dev;
dev = path->device;
if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) {
start_ccb->ccb_h.status = CAM_DEV_NOT_THERE;
} else {
struct ccb_getdev *cgd;
cgd = &start_ccb->cgd;
cgd->protocol = dev->protocol;
cgd->inq_data = dev->inq_data;
cgd->ident_data = dev->ident_data;
cgd->inq_flags = dev->inq_flags;
cgd->ccb_h.status = CAM_REQ_CMP;
cgd->serial_num_len = dev->serial_num_len;
if ((dev->serial_num_len > 0)
&& (dev->serial_num != NULL))
bcopy(dev->serial_num, cgd->serial_num,
dev->serial_num_len);
}
break;
}
case XPT_GDEV_STATS:
{
struct ccb_getdevstats *cgds = &start_ccb->cgds;
struct cam_ed *dev = path->device;
struct cam_eb *bus = path->bus;
struct cam_et *tar = path->target;
struct cam_devq *devq = bus->sim->devq;
mtx_lock(&devq->send_mtx);
cgds->dev_openings = dev->ccbq.dev_openings;
cgds->dev_active = dev->ccbq.dev_active;
cgds->allocated = dev->ccbq.allocated;
cgds->queued = cam_ccbq_pending_ccb_count(&dev->ccbq);
cgds->held = cgds->allocated - cgds->dev_active - cgds->queued;
cgds->last_reset = tar->last_reset;
cgds->maxtags = dev->maxtags;
cgds->mintags = dev->mintags;
if (timevalcmp(&tar->last_reset, &bus->last_reset, <))
cgds->last_reset = bus->last_reset;
mtx_unlock(&devq->send_mtx);
cgds->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GDEVLIST:
{
struct cam_periph *nperiph;
struct periph_list *periph_head;
struct ccb_getdevlist *cgdl;
u_int i;
struct cam_ed *device;
int found;
found = 0;
/*
* Don't want anyone mucking with our data.
*/
device = path->device;
periph_head = &device->periphs;
cgdl = &start_ccb->cgdl;
/*
* Check and see if the list has changed since the user
* last requested a list member. If so, tell them that the
* list has changed, and therefore they need to start over
* from the beginning.
*/
if ((cgdl->index != 0) &&
(cgdl->generation != device->generation)) {
cgdl->status = CAM_GDEVLIST_LIST_CHANGED;
break;
}
/*
* Traverse the list of peripherals and attempt to find
* the requested peripheral.
*/
for (nperiph = SLIST_FIRST(periph_head), i = 0;
(nperiph != NULL) && (i <= cgdl->index);
nperiph = SLIST_NEXT(nperiph, periph_links), i++) {
if (i == cgdl->index) {
strlcpy(cgdl->periph_name,
nperiph->periph_name,
sizeof(cgdl->periph_name));
cgdl->unit_number = nperiph->unit_number;
found = 1;
}
}
if (found == 0) {
cgdl->status = CAM_GDEVLIST_ERROR;
break;
}
if (nperiph == NULL)
cgdl->status = CAM_GDEVLIST_LAST_DEVICE;
else
cgdl->status = CAM_GDEVLIST_MORE_DEVS;
cgdl->index++;
cgdl->generation = device->generation;
cgdl->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_DEV_MATCH:
{
dev_pos_type position_type;
struct ccb_dev_match *cdm;
cdm = &start_ccb->cdm;
/*
* There are two ways of getting at information in the EDT.
* The first way is via the primary EDT tree. It starts
* with a list of buses, then a list of targets on a bus,
* then devices/luns on a target, and then peripherals on a
* device/lun. The "other" way is by the peripheral driver
* lists. The peripheral driver lists are organized by
* peripheral driver (obviously), so it makes sense to
* use the peripheral driver list if the user is looking
* for something like "da1", or all "da" devices. If the
* user is looking for something on a particular bus/target
* or lun, it's generally better to go through the EDT tree.
*/
if (cdm->pos.position_type != CAM_DEV_POS_NONE)
position_type = cdm->pos.position_type;
else {
u_int i;
position_type = CAM_DEV_POS_NONE;
for (i = 0; i < cdm->num_patterns; i++) {
if ((cdm->patterns[i].type == DEV_MATCH_BUS)
||(cdm->patterns[i].type == DEV_MATCH_DEVICE)){
position_type = CAM_DEV_POS_EDT;
break;
}
}
if (cdm->num_patterns == 0)
position_type = CAM_DEV_POS_EDT;
else if (position_type == CAM_DEV_POS_NONE)
position_type = CAM_DEV_POS_PDRV;
}
switch(position_type & CAM_DEV_POS_TYPEMASK) {
case CAM_DEV_POS_EDT:
xptedtmatch(cdm);
break;
case CAM_DEV_POS_PDRV:
xptperiphlistmatch(cdm);
break;
default:
cdm->status = CAM_DEV_MATCH_ERROR;
break;
}
if (cdm->status == CAM_DEV_MATCH_ERROR)
start_ccb->ccb_h.status = CAM_REQ_CMP_ERR;
else
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_SASYNC_CB:
{
struct ccb_setasync *csa;
struct async_node *cur_entry;
struct async_list *async_head;
u_int32_t added;
csa = &start_ccb->csa;
added = csa->event_enable;
async_head = &path->device->asyncs;
/*
* If there is already an entry for us, simply
* update it.
*/
cur_entry = SLIST_FIRST(async_head);
while (cur_entry != NULL) {
if ((cur_entry->callback_arg == csa->callback_arg)
&& (cur_entry->callback == csa->callback))
break;
cur_entry = SLIST_NEXT(cur_entry, links);
}
if (cur_entry != NULL) {
/*
* If the request has no flags set,
* remove the entry.
*/
added &= ~cur_entry->event_enable;
if (csa->event_enable == 0) {
SLIST_REMOVE(async_head, cur_entry,
async_node, links);
xpt_release_device(path->device);
free(cur_entry, M_CAMXPT);
} else {
cur_entry->event_enable = csa->event_enable;
}
csa->event_enable = added;
} else {
cur_entry = malloc(sizeof(*cur_entry), M_CAMXPT,
M_NOWAIT);
if (cur_entry == NULL) {
csa->ccb_h.status = CAM_RESRC_UNAVAIL;
break;
}
cur_entry->event_enable = csa->event_enable;
cur_entry->event_lock = (path->bus->sim->mtx &&
mtx_owned(path->bus->sim->mtx)) ? 1 : 0;
cur_entry->callback_arg = csa->callback_arg;
cur_entry->callback = csa->callback;
SLIST_INSERT_HEAD(async_head, cur_entry, links);
xpt_acquire_device(path->device);
}
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_REL_SIMQ:
{
struct ccb_relsim *crs;
struct cam_ed *dev;
crs = &start_ccb->crs;
dev = path->device;
if (dev == NULL) {
crs->ccb_h.status = CAM_DEV_NOT_THERE;
break;
}
if ((crs->release_flags & RELSIM_ADJUST_OPENINGS) != 0) {
/* Don't ever go below one opening */
if (crs->openings > 0) {
xpt_dev_ccbq_resize(path, crs->openings);
if (bootverbose) {
xpt_print(path,
"number of openings is now %d\n",
crs->openings);
}
}
}
mtx_lock(&dev->sim->devq->send_mtx);
if ((crs->release_flags & RELSIM_RELEASE_AFTER_TIMEOUT) != 0) {
if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
/*
* Just extend the old timeout and decrement
* the freeze count so that a single timeout
* is sufficient for releasing the queue.
*/
start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
callout_stop(&dev->callout);
} else {
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
}
callout_reset_sbt(&dev->callout,
SBT_1MS * crs->release_timeout, 0,
xpt_release_devq_timeout, dev, 0);
dev->flags |= CAM_DEV_REL_TIMEOUT_PENDING;
}
if ((crs->release_flags & RELSIM_RELEASE_AFTER_CMDCMPLT) != 0) {
if ((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0) {
/*
* Decrement the freeze count so that a single
* completion is still sufficient to unfreeze
* the queue.
*/
start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
} else {
dev->flags |= CAM_DEV_REL_ON_COMPLETE;
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
}
}
if ((crs->release_flags & RELSIM_RELEASE_AFTER_QEMPTY) != 0) {
if ((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
|| (dev->ccbq.dev_active == 0)) {
start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE;
} else {
dev->flags |= CAM_DEV_REL_ON_QUEUE_EMPTY;
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
}
}
mtx_unlock(&dev->sim->devq->send_mtx);
if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) == 0)
xpt_release_devq(path, /*count*/1, /*run_queue*/TRUE);
start_ccb->crs.qfrozen_cnt = dev->ccbq.queue.qfrozen_cnt;
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_DEBUG: {
struct cam_path *oldpath;
/* Check that all request bits are supported. */
if (start_ccb->cdbg.flags & ~(CAM_DEBUG_COMPILE)) {
start_ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
break;
}
cam_dflags = CAM_DEBUG_NONE;
if (cam_dpath != NULL) {
oldpath = cam_dpath;
cam_dpath = NULL;
xpt_free_path(oldpath);
}
if (start_ccb->cdbg.flags != CAM_DEBUG_NONE) {
if (xpt_create_path(&cam_dpath, NULL,
start_ccb->ccb_h.path_id,
start_ccb->ccb_h.target_id,
start_ccb->ccb_h.target_lun) !=
CAM_REQ_CMP) {
start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
} else {
cam_dflags = start_ccb->cdbg.flags;
start_ccb->ccb_h.status = CAM_REQ_CMP;
xpt_print(cam_dpath, "debugging flags now %x\n",
cam_dflags);
}
} else
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_NOOP:
if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0)
xpt_freeze_devq(path, 1);
start_ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_REPROBE_LUN:
xpt_async(AC_INQ_CHANGED, path, NULL);
start_ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(start_ccb);
break;
case XPT_ASYNC:
start_ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(start_ccb);
break;
default:
case XPT_SDEV_TYPE:
case XPT_TERM_IO:
case XPT_ENG_INQ:
/* XXX Implement */
xpt_print(start_ccb->ccb_h.path,
"%s: CCB type %#x %s not supported\n", __func__,
start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code));
start_ccb->ccb_h.status = CAM_PROVIDE_FAIL;
if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) {
xpt_done(start_ccb);
}
break;
}
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("xpt_action_default: func= %#x %s status %#x\n",
start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code),
start_ccb->ccb_h.status));
}
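/*
 * A minimal sketch of an immediate (non-queued) CCB issued through
 * xpt_action(), mirroring the XPT_GDEV_TYPE handling above; the caller is
 * assumed to hold the appropriate path lock, and foo_get_ident() is a
 * hypothetical name.
 */
#if 0
static int
foo_get_ident(struct cam_path *path, struct ccb_getdev *cgd)
{
	xpt_setup_ccb(&cgd->ccb_h, path, CAM_PRIORITY_NORMAL);
	cgd->ccb_h.func_code = XPT_GDEV_TYPE;
	xpt_action((union ccb *)cgd);
	return ((cgd->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP ?
	    0 : ENXIO);
}
#endif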
/*
* Call the sim poll routine to allow the sim to complete
* any inflight requests, then call camisr_runqueue to
* complete any CCB that the polling completed.
*/
void
xpt_sim_poll(struct cam_sim *sim)
{
struct mtx *mtx;
mtx = sim->mtx;
if (mtx)
mtx_lock(mtx);
(*(sim->sim_poll))(sim);
if (mtx)
mtx_unlock(mtx);
camisr_runqueue();
}
uint32_t
xpt_poll_setup(union ccb *start_ccb)
{
u_int32_t timeout;
struct cam_sim *sim;
struct cam_devq *devq;
struct cam_ed *dev;
timeout = start_ccb->ccb_h.timeout * 10;
sim = start_ccb->ccb_h.path->bus->sim;
devq = sim->devq;
dev = start_ccb->ccb_h.path->device;
/*
* Steal an opening so that no other queued requests
* can get it before us while we simulate interrupts.
*/
mtx_lock(&devq->send_mtx);
dev->ccbq.dev_openings--;
while((devq->send_openings <= 0 || dev->ccbq.dev_openings < 0) &&
(--timeout > 0)) {
mtx_unlock(&devq->send_mtx);
DELAY(100);
xpt_sim_poll(sim);
mtx_lock(&devq->send_mtx);
}
dev->ccbq.dev_openings++;
mtx_unlock(&devq->send_mtx);
return (timeout);
}
void
xpt_pollwait(union ccb *start_ccb, uint32_t timeout)
{
while (--timeout > 0) {
xpt_sim_poll(start_ccb->ccb_h.path->bus->sim);
if ((start_ccb->ccb_h.status & CAM_STATUS_MASK)
!= CAM_REQ_INPROG)
break;
DELAY(100);
}
if (timeout == 0) {
/*
* XXX Is it worth adding a sim_timeout entry
* point so we can attempt recovery? If
* this is only used for dumps, I don't think
* it is.
*/
start_ccb->ccb_h.status = CAM_CMD_TIMEOUT;
}
}
void
xpt_polled_action(union ccb *start_ccb)
{
uint32_t timeout;
struct cam_ed *dev;
timeout = start_ccb->ccb_h.timeout * 10;
dev = start_ccb->ccb_h.path->device;
mtx_unlock(&dev->device_mtx);
timeout = xpt_poll_setup(start_ccb);
if (timeout > 0) {
xpt_action(start_ccb);
xpt_pollwait(start_ccb, timeout);
} else {
start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
}
mtx_lock(&dev->device_mtx);
}
/*
* Schedule a peripheral driver to receive a ccb when its
* target device has space for more transactions.
*/
void
xpt_schedule(struct cam_periph *periph, u_int32_t new_priority)
{
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("xpt_schedule\n"));
cam_periph_assert(periph, MA_OWNED);
if (new_priority < periph->scheduled_priority) {
periph->scheduled_priority = new_priority;
xpt_run_allocq(periph, 0);
}
}
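/*
 * A minimal sketch of how a peripheral driver typically asks for work:
 * after queueing a bio in its own softc, it calls xpt_schedule() under the
 * periph lock and is later handed a CCB at that priority via its
 * periph_start() method. The fooschedule() name is hypothetical.
 */
#if 0
static void
fooschedule(struct cam_periph *periph)
{
	cam_periph_lock(periph);
	xpt_schedule(periph, CAM_PRIORITY_NORMAL);
	cam_periph_unlock(periph);
}
#endif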
/*
* Schedule a device to run on a given queue.
* If the device was inserted as a new entry on the queue,
* return 1 meaning the device queue should be run. If we
* were already queued, implying someone else has already
* started the queue, return 0 so the caller doesn't attempt
* to run the queue.
*/
static int
xpt_schedule_dev(struct camq *queue, cam_pinfo *pinfo,
u_int32_t new_priority)
{
int retval;
u_int32_t old_priority;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_schedule_dev\n"));
old_priority = pinfo->priority;
/*
* Are we already queued?
*/
if (pinfo->index != CAM_UNQUEUED_INDEX) {
/* Simply reorder based on new priority */
if (new_priority < old_priority) {
camq_change_priority(queue, pinfo->index,
new_priority);
CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
("changed priority to %d\n",
new_priority));
retval = 1;
} else
retval = 0;
} else {
/* New entry on the queue */
if (new_priority < old_priority)
pinfo->priority = new_priority;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
("Inserting onto queue\n"));
pinfo->generation = ++queue->generation;
camq_insert(queue, pinfo);
retval = 1;
}
return (retval);
}
static void
xpt_run_allocq_task(void *context, int pending)
{
struct cam_periph *periph = context;
cam_periph_lock(periph);
periph->flags &= ~CAM_PERIPH_RUN_TASK;
xpt_run_allocq(periph, 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
}
static void
xpt_run_allocq(struct cam_periph *periph, int sleep)
{
struct cam_ed *device;
union ccb *ccb;
uint32_t prio;
cam_periph_assert(periph, MA_OWNED);
if (periph->periph_allocating)
return;
cam_periph_doacquire(periph);
periph->periph_allocating = 1;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_allocq(%p)\n", periph));
device = periph->path->device;
ccb = NULL;
restart:
while ((prio = min(periph->scheduled_priority,
periph->immediate_priority)) != CAM_PRIORITY_NONE &&
(periph->periph_allocated - (ccb != NULL ? 1 : 0) <
device->ccbq.total_openings || prio <= CAM_PRIORITY_OOB)) {
if (ccb == NULL &&
(ccb = xpt_get_ccb_nowait(periph)) == NULL) {
if (sleep) {
ccb = xpt_get_ccb(periph);
goto restart;
}
if (periph->flags & CAM_PERIPH_RUN_TASK)
break;
cam_periph_doacquire(periph);
periph->flags |= CAM_PERIPH_RUN_TASK;
taskqueue_enqueue(xsoftc.xpt_taskq,
&periph->periph_run_task);
break;
}
xpt_setup_ccb(&ccb->ccb_h, periph->path, prio);
if (prio == periph->immediate_priority) {
periph->immediate_priority = CAM_PRIORITY_NONE;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
("waking cam_periph_getccb()\n"));
SLIST_INSERT_HEAD(&periph->ccb_list, &ccb->ccb_h,
periph_links.sle);
wakeup(&periph->ccb_list);
} else {
periph->scheduled_priority = CAM_PRIORITY_NONE;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
("calling periph_start()\n"));
periph->periph_start(periph, ccb);
}
ccb = NULL;
}
if (ccb != NULL)
xpt_release_ccb(ccb);
periph->periph_allocating = 0;
cam_periph_release_locked(periph);
}
static void
xpt_run_devq(struct cam_devq *devq)
{
struct mtx *mtx;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_devq\n"));
devq->send_queue.qfrozen_cnt++;
while ((devq->send_queue.entries > 0)
&& (devq->send_openings > 0)
&& (devq->send_queue.qfrozen_cnt <= 1)) {
struct cam_ed *device;
union ccb *work_ccb;
struct cam_sim *sim;
struct xpt_proto *proto;
device = (struct cam_ed *)camq_remove(&devq->send_queue,
CAMQ_HEAD);
CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
("running device %p\n", device));
work_ccb = cam_ccbq_peek_ccb(&device->ccbq, CAMQ_HEAD);
if (work_ccb == NULL) {
printf("device on run queue with no ccbs???\n");
continue;
}
if ((work_ccb->ccb_h.flags & CAM_HIGH_POWER) != 0) {
mtx_lock(&xsoftc.xpt_highpower_lock);
if (xsoftc.num_highpower <= 0) {
/*
* We got a high power command, but we
* don't have any available slots. Freeze
* the device queue until we have a slot
* available.
*/
xpt_freeze_devq_device(device, 1);
STAILQ_INSERT_TAIL(&xsoftc.highpowerq, device,
highpowerq_entry);
mtx_unlock(&xsoftc.xpt_highpower_lock);
continue;
} else {
/*
* Consume a high power slot while
* this ccb runs.
*/
xsoftc.num_highpower--;
}
mtx_unlock(&xsoftc.xpt_highpower_lock);
}
cam_ccbq_remove_ccb(&device->ccbq, work_ccb);
cam_ccbq_send_ccb(&device->ccbq, work_ccb);
devq->send_openings--;
devq->send_active++;
xpt_schedule_devq(devq, device);
mtx_unlock(&devq->send_mtx);
if ((work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) {
/*
* The client wants to freeze the queue
* after this CCB is sent.
*/
xpt_freeze_devq(work_ccb->ccb_h.path, 1);
}
/* In Target mode, the peripheral driver knows best... */
if (work_ccb->ccb_h.func_code == XPT_SCSI_IO) {
if ((device->inq_flags & SID_CmdQue) != 0
&& work_ccb->csio.tag_action != CAM_TAG_ACTION_NONE)
work_ccb->ccb_h.flags |= CAM_TAG_ACTION_VALID;
else
/*
* Clear this in case of a retried CCB that
* failed due to a rejected tag.
*/
work_ccb->ccb_h.flags &= ~CAM_TAG_ACTION_VALID;
}
KASSERT(device == work_ccb->ccb_h.path->device,
("device (%p) / path->device (%p) mismatch",
device, work_ccb->ccb_h.path->device));
proto = xpt_proto_find(device->protocol);
if (proto && proto->ops->debug_out)
proto->ops->debug_out(work_ccb);
/*
* Device queues can be shared among multiple SIM instances
* that reside on different buses. Use the SIM from the
* queued device, rather than the one from the calling bus.
*/
sim = device->sim;
mtx = sim->mtx;
if (mtx && !mtx_owned(mtx))
mtx_lock(mtx);
else
mtx = NULL;
work_ccb->ccb_h.qos.periph_data = cam_iosched_now();
(*(sim->sim_action))(sim, work_ccb);
if (mtx)
mtx_unlock(mtx);
mtx_lock(&devq->send_mtx);
}
devq->send_queue.qfrozen_cnt--;
}
/*
* This function merges stuff from the src ccb into the dst ccb, while keeping
* important fields in the dst ccb constant.
*/
void
xpt_merge_ccb(union ccb *dst_ccb, union ccb *src_ccb)
{
/*
* Pull fields that are valid for peripheral drivers to set
* into the dst CCB along with the CCB "payload".
*/
dst_ccb->ccb_h.retry_count = src_ccb->ccb_h.retry_count;
dst_ccb->ccb_h.func_code = src_ccb->ccb_h.func_code;
dst_ccb->ccb_h.timeout = src_ccb->ccb_h.timeout;
dst_ccb->ccb_h.flags = src_ccb->ccb_h.flags;
bcopy(&(&src_ccb->ccb_h)[1], &(&dst_ccb->ccb_h)[1],
sizeof(union ccb) - sizeof(struct ccb_hdr));
}
void
xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path,
u_int32_t priority, u_int32_t flags)
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n"));
ccb_h->pinfo.priority = priority;
ccb_h->path = path;
ccb_h->path_id = path->bus->path_id;
if (path->target)
ccb_h->target_id = path->target->target_id;
else
ccb_h->target_id = CAM_TARGET_WILDCARD;
if (path->device) {
ccb_h->target_lun = path->device->lun_id;
ccb_h->pinfo.generation = ++path->device->ccbq.queue.generation;
} else {
ccb_h->target_lun = CAM_TARGET_WILDCARD;
}
ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
ccb_h->flags = flags;
ccb_h->xflags = 0;
}
void
xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority)
{
xpt_setup_ccb_flags(ccb_h, path, priority, /*flags*/ 0);
}
/* Path manipulation functions */
cam_status
xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph,
path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
{
struct cam_path *path;
cam_status status;
path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT);
if (path == NULL) {
status = CAM_RESRC_UNAVAIL;
return(status);
}
status = xpt_compile_path(path, perph, path_id, target_id, lun_id);
if (status != CAM_REQ_CMP) {
free(path, M_CAMPATH);
path = NULL;
}
*new_path_ptr = path;
return (status);
}
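/*
 * A minimal sketch of the usual create/use/free pattern for a path when
 * the ids are already known; error handling is reduced to the CAM_REQ_CMP
 * check and foo_with_path() is a hypothetical name.
 */
#if 0
static void
foo_with_path(path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
{
	struct cam_path *path;

	if (xpt_create_path(&path, /*periph*/NULL, path_id, target_id,
	    lun_id) != CAM_REQ_CMP)
		return;
	/* ... issue CCBs or async notifications against 'path' ... */
	xpt_free_path(path);
}
#endif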
cam_status
xpt_create_path_unlocked(struct cam_path **new_path_ptr,
struct cam_periph *periph, path_id_t path_id,
target_id_t target_id, lun_id_t lun_id)
{
return (xpt_create_path(new_path_ptr, periph, path_id, target_id,
lun_id));
}
cam_status
xpt_compile_path(struct cam_path *new_path, struct cam_periph *perph,
path_id_t path_id, target_id_t target_id, lun_id_t lun_id)
{
struct cam_eb *bus;
struct cam_et *target;
struct cam_ed *device;
cam_status status;
status = CAM_REQ_CMP; /* Completed without error */
target = NULL; /* Wildcarded */
device = NULL; /* Wildcarded */
/*
* We will potentially modify the EDT, so block interrupts
* that may attempt to create cam paths.
*/
bus = xpt_find_bus(path_id);
if (bus == NULL) {
status = CAM_PATH_INVALID;
} else {
xpt_lock_buses();
mtx_lock(&bus->eb_mtx);
target = xpt_find_target(bus, target_id);
if (target == NULL) {
/* Create one */
struct cam_et *new_target;
new_target = xpt_alloc_target(bus, target_id);
if (new_target == NULL) {
status = CAM_RESRC_UNAVAIL;
} else {
target = new_target;
}
}
xpt_unlock_buses();
if (target != NULL) {
device = xpt_find_device(target, lun_id);
if (device == NULL) {
/* Create one */
struct cam_ed *new_device;
new_device =
(*(bus->xport->ops->alloc_device))(bus,
target,
lun_id);
if (new_device == NULL) {
status = CAM_RESRC_UNAVAIL;
} else {
device = new_device;
}
}
}
mtx_unlock(&bus->eb_mtx);
}
/*
* Only touch the user's data if we are successful.
*/
if (status == CAM_REQ_CMP) {
new_path->periph = perph;
new_path->bus = bus;
new_path->target = target;
new_path->device = device;
CAM_DEBUG(new_path, CAM_DEBUG_TRACE, ("xpt_compile_path\n"));
} else {
if (device != NULL)
xpt_release_device(device);
if (target != NULL)
xpt_release_target(target);
if (bus != NULL)
xpt_release_bus(bus);
}
return (status);
}
cam_status
xpt_clone_path(struct cam_path **new_path_ptr, struct cam_path *path)
{
struct cam_path *new_path;
new_path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT);
if (new_path == NULL)
return(CAM_RESRC_UNAVAIL);
*new_path = *path;
if (path->bus != NULL)
xpt_acquire_bus(path->bus);
if (path->target != NULL)
xpt_acquire_target(path->target);
if (path->device != NULL)
xpt_acquire_device(path->device);
*new_path_ptr = new_path;
return (CAM_REQ_CMP);
}
void
xpt_release_path(struct cam_path *path)
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_path\n"));
if (path->device != NULL) {
xpt_release_device(path->device);
path->device = NULL;
}
if (path->target != NULL) {
xpt_release_target(path->target);
path->target = NULL;
}
if (path->bus != NULL) {
xpt_release_bus(path->bus);
path->bus = NULL;
}
}
void
xpt_free_path(struct cam_path *path)
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_free_path\n"));
xpt_release_path(path);
free(path, M_CAMPATH);
}
void
xpt_path_counts(struct cam_path *path, uint32_t *bus_ref,
uint32_t *periph_ref, uint32_t *target_ref, uint32_t *device_ref)
{
xpt_lock_buses();
if (bus_ref) {
if (path->bus)
*bus_ref = path->bus->refcount;
else
*bus_ref = 0;
}
if (periph_ref) {
if (path->periph)
*periph_ref = path->periph->refcount;
else
*periph_ref = 0;
}
xpt_unlock_buses();
if (target_ref) {
if (path->target)
*target_ref = path->target->refcount;
else
*target_ref = 0;
}
if (device_ref) {
if (path->device)
*device_ref = path->device->refcount;
else
*device_ref = 0;
}
}
/*
* Return -1 for failure, 0 for exact match, 1 for match with wildcards
* in path1, 2 for match with wildcards in path2.
*/
int
xpt_path_comp(struct cam_path *path1, struct cam_path *path2)
{
int retval = 0;
if (path1->bus != path2->bus) {
if (path1->bus->path_id == CAM_BUS_WILDCARD)
retval = 1;
else if (path2->bus->path_id == CAM_BUS_WILDCARD)
retval = 2;
else
return (-1);
}
if (path1->target != path2->target) {
if (path1->target->target_id == CAM_TARGET_WILDCARD) {
if (retval == 0)
retval = 1;
} else if (path2->target->target_id == CAM_TARGET_WILDCARD)
retval = 2;
else
return (-1);
}
if (path1->device != path2->device) {
if (path1->device->lun_id == CAM_LUN_WILDCARD) {
if (retval == 0)
retval = 1;
} else if (path2->device->lun_id == CAM_LUN_WILDCARD)
retval = 2;
else
return (-1);
}
return (retval);
}
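/*
 * A minimal sketch of interpreting xpt_path_comp() results, following the
 * -1/0/1/2 convention documented above; foo_paths_overlap() is a
 * hypothetical helper.
 */
#if 0
static int
foo_paths_overlap(struct cam_path *p1, struct cam_path *p2)
{
	switch (xpt_path_comp(p1, p2)) {
	case 0:		/* exact match */
	case 1:		/* match via wildcards in p1 */
	case 2:		/* match via wildcards in p2 */
		return (1);
	default:	/* -1: no match */
		return (0);
	}
}
#endif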
int
xpt_path_comp_dev(struct cam_path *path, struct cam_ed *dev)
{
int retval = 0;
if (path->bus != dev->target->bus) {
if (path->bus->path_id == CAM_BUS_WILDCARD)
retval = 1;
else if (dev->target->bus->path_id == CAM_BUS_WILDCARD)
retval = 2;
else
return (-1);
}
if (path->target != dev->target) {
if (path->target->target_id == CAM_TARGET_WILDCARD) {
if (retval == 0)
retval = 1;
} else if (dev->target->target_id == CAM_TARGET_WILDCARD)
retval = 2;
else
return (-1);
}
if (path->device != dev) {
if (path->device->lun_id == CAM_LUN_WILDCARD) {
if (retval == 0)
retval = 1;
} else if (dev->lun_id == CAM_LUN_WILDCARD)
retval = 2;
else
return (-1);
}
return (retval);
}
void
xpt_print_path(struct cam_path *path)
{
struct sbuf sb;
char buffer[XPT_PRINT_LEN];
sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN);
xpt_path_sbuf(path, &sb);
sbuf_finish(&sb);
printf("%s", sbuf_data(&sb));
sbuf_delete(&sb);
}
void
xpt_print_device(struct cam_ed *device)
{
if (device == NULL)
printf("(nopath): ");
else {
printf("(noperiph:%s%d:%d:%d:%jx): ", device->sim->sim_name,
device->sim->unit_number,
device->sim->bus_id,
device->target->target_id,
(uintmax_t)device->lun_id);
}
}
void
xpt_print(struct cam_path *path, const char *fmt, ...)
{
va_list ap;
struct sbuf sb;
char buffer[XPT_PRINT_LEN];
sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN);
xpt_path_sbuf(path, &sb);
va_start(ap, fmt);
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
printf("%s", sbuf_data(&sb));
sbuf_delete(&sb);
}
int
xpt_path_string(struct cam_path *path, char *str, size_t str_len)
{
struct sbuf sb;
int len;
sbuf_new(&sb, str, str_len, 0);
len = xpt_path_sbuf(path, &sb);
sbuf_finish(&sb);
return (len);
}
int
xpt_path_sbuf(struct cam_path *path, struct sbuf *sb)
{
if (path == NULL)
sbuf_printf(sb, "(nopath): ");
else {
if (path->periph != NULL)
sbuf_printf(sb, "(%s%d:", path->periph->periph_name,
path->periph->unit_number);
else
sbuf_printf(sb, "(noperiph:");
if (path->bus != NULL)
sbuf_printf(sb, "%s%d:%d:", path->bus->sim->sim_name,
path->bus->sim->unit_number,
path->bus->sim->bus_id);
else
sbuf_printf(sb, "nobus:");
if (path->target != NULL)
sbuf_printf(sb, "%d:", path->target->target_id);
else
sbuf_printf(sb, "X:");
if (path->device != NULL)
sbuf_printf(sb, "%jx): ",
(uintmax_t)path->device->lun_id);
else
sbuf_printf(sb, "X): ");
}
return(sbuf_len(sb));
}
path_id_t
xpt_path_path_id(struct cam_path *path)
{
return(path->bus->path_id);
}
target_id_t
xpt_path_target_id(struct cam_path *path)
{
if (path->target != NULL)
return (path->target->target_id);
else
return (CAM_TARGET_WILDCARD);
}
lun_id_t
xpt_path_lun_id(struct cam_path *path)
{
if (path->device != NULL)
return (path->device->lun_id);
else
return (CAM_LUN_WILDCARD);
}
struct cam_sim *
xpt_path_sim(struct cam_path *path)
{
return (path->bus->sim);
}
struct cam_periph*
xpt_path_periph(struct cam_path *path)
{
return (path->periph);
}
/*
* Release a CAM control block for the caller. Remit the cost of the structure
* to the device referenced by the path. If this device had no 'credits'
* and peripheral drivers have registered async callbacks for this notification,
* call them now.
*/
void
xpt_release_ccb(union ccb *free_ccb)
{
struct cam_ed *device;
struct cam_periph *periph;
CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_release_ccb\n"));
xpt_path_assert(free_ccb->ccb_h.path, MA_OWNED);
device = free_ccb->ccb_h.path->device;
periph = free_ccb->ccb_h.path->periph;
xpt_free_ccb(free_ccb);
periph->periph_allocated--;
cam_ccbq_release_opening(&device->ccbq);
xpt_run_allocq(periph, 0);
}
/* Functions accessed by SIM drivers */
static struct xpt_xport_ops xport_default_ops = {
.alloc_device = xpt_alloc_device_default,
.action = xpt_action_default,
.async = xpt_dev_async_default,
};
static struct xpt_xport xport_default = {
.xport = XPORT_UNKNOWN,
.name = "unknown",
.ops = &xport_default_ops,
};
CAM_XPT_XPORT(xport_default);
/*
* A sim structure, listing the SIM entry points and instance
* identification info, is passed to xpt_bus_register to hook the SIM
* into the CAM framework. xpt_bus_register creates a cam_eb entry
* for this new bus and places it in the array of buses and assigns
* it a path_id. The path_id may be influenced by "hard wiring"
* information specified by the user. Once interrupt services are
* available, the bus will be probed.
*/
int32_t
xpt_bus_register(struct cam_sim *sim, device_t parent, u_int32_t bus)
{
struct cam_eb *new_bus;
struct cam_eb *old_bus;
struct ccb_pathinq cpi;
struct cam_path *path;
cam_status status;
sim->bus_id = bus;
new_bus = (struct cam_eb *)malloc(sizeof(*new_bus),
M_CAMXPT, M_NOWAIT|M_ZERO);
if (new_bus == NULL) {
/* Couldn't satisfy request */
return (CAM_RESRC_UNAVAIL);
}
mtx_init(&new_bus->eb_mtx, "CAM bus lock", NULL, MTX_DEF);
TAILQ_INIT(&new_bus->et_entries);
cam_sim_hold(sim);
new_bus->sim = sim;
timevalclear(&new_bus->last_reset);
new_bus->flags = 0;
new_bus->refcount = 1; /* Held until a bus_deregister event */
new_bus->generation = 0;
xpt_lock_buses();
sim->path_id = new_bus->path_id =
xptpathid(sim->sim_name, sim->unit_number, sim->bus_id);
old_bus = TAILQ_FIRST(&xsoftc.xpt_busses);
while (old_bus != NULL
&& old_bus->path_id < new_bus->path_id)
old_bus = TAILQ_NEXT(old_bus, links);
if (old_bus != NULL)
TAILQ_INSERT_BEFORE(old_bus, new_bus, links);
else
TAILQ_INSERT_TAIL(&xsoftc.xpt_busses, new_bus, links);
xsoftc.bus_generation++;
xpt_unlock_buses();
/*
* Set a default transport so that a PATH_INQ can be issued to
* the SIM. This will then allow for probing and attaching of
* a more appropriate transport.
*/
new_bus->xport = &xport_default;
status = xpt_create_path(&path, /*periph*/NULL, sim->path_id,
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
if (status != CAM_REQ_CMP) {
xpt_release_bus(new_bus);
return (CAM_RESRC_UNAVAIL);
}
xpt_path_inq(&cpi, path);
if (cpi.ccb_h.status == CAM_REQ_CMP) {
struct xpt_xport **xpt;
SET_FOREACH(xpt, cam_xpt_xport_set) {
if ((*xpt)->xport == cpi.transport) {
new_bus->xport = *xpt;
break;
}
}
if (new_bus->xport == NULL) {
xpt_print(path,
"No transport found for %d\n", cpi.transport);
xpt_release_bus(new_bus);
free(path, M_CAMXPT);
return (CAM_RESRC_UNAVAIL);
}
}
/* Notify interested parties */
if (sim->path_id != CAM_XPT_PATH_ID) {
xpt_async(AC_PATH_REGISTERED, path, &cpi);
if ((cpi.hba_misc & PIM_NOSCAN) == 0) {
union ccb *scan_ccb;
/* Initiate bus rescan. */
scan_ccb = xpt_alloc_ccb_nowait();
if (scan_ccb != NULL) {
scan_ccb->ccb_h.path = path;
scan_ccb->ccb_h.func_code = XPT_SCAN_BUS;
scan_ccb->crcn.flags = 0;
xpt_rescan(scan_ccb);
} else {
xpt_print(path,
"Can't allocate CCB to scan bus\n");
xpt_free_path(path);
}
} else
xpt_free_path(path);
} else
xpt_free_path(path);
return (CAM_SUCCESS);
}
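/*
 * A minimal sketch of the registration sequence described above, as seen
 * from a SIM driver's attach routine. The mysim_action/mysim_poll/sc names
 * and the queue depth of 32 are hypothetical example values; the SIM lock
 * passed to cam_sim_alloc() is conventionally held across
 * xpt_bus_register().
 */
#if 0
static int
mysim_register(device_t dev, void *sc, struct mtx *lock)
{
	struct cam_devq *devq;
	struct cam_sim *sim;

	if ((devq = cam_simq_alloc(/*max_sim_transactions*/32)) == NULL)
		return (ENOMEM);
	sim = cam_sim_alloc(mysim_action, mysim_poll, "mysim", sc,
	    device_get_unit(dev), lock, /*max_dev_transactions*/1,
	    /*max_tagged_dev_transactions*/32, devq);
	if (sim == NULL) {
		cam_simq_free(devq);
		return (ENOMEM);
	}
	mtx_lock(lock);
	if (xpt_bus_register(sim, dev, /*bus*/0) != CAM_SUCCESS) {
		cam_sim_free(sim, /*free_devq*/TRUE);
		mtx_unlock(lock);
		return (ENXIO);
	}
	mtx_unlock(lock);
	return (0);
}
#endif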
int32_t
xpt_bus_deregister(path_id_t pathid)
{
struct cam_path bus_path;
cam_status status;
status = xpt_compile_path(&bus_path, NULL, pathid,
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
if (status != CAM_REQ_CMP)
return (status);
xpt_async(AC_LOST_DEVICE, &bus_path, NULL);
xpt_async(AC_PATH_DEREGISTERED, &bus_path, NULL);
/* Release the reference count held while registered. */
xpt_release_bus(bus_path.bus);
xpt_release_path(&bus_path);
return (CAM_REQ_CMP);
}
static path_id_t
xptnextfreepathid(void)
{
struct cam_eb *bus;
path_id_t pathid;
const char *strval;
mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
pathid = 0;
bus = TAILQ_FIRST(&xsoftc.xpt_busses);
retry:
/* Find an unoccupied pathid */
while (bus != NULL && bus->path_id <= pathid) {
if (bus->path_id == pathid)
pathid++;
bus = TAILQ_NEXT(bus, links);
}
/*
* Ensure that this pathid is not reserved for
* a bus that may be registered in the future.
*/
if (resource_string_value("scbus", pathid, "at", &strval) == 0) {
++pathid;
/* Start the search over */
goto retry;
}
return (pathid);
}
static path_id_t
xptpathid(const char *sim_name, int sim_unit, int sim_bus)
{
path_id_t pathid;
int i, dunit, val;
char buf[32];
const char *dname;
pathid = CAM_XPT_PATH_ID;
snprintf(buf, sizeof(buf), "%s%d", sim_name, sim_unit);
if (strcmp(buf, "xpt0") == 0 && sim_bus == 0)
return (pathid);
i = 0;
while ((resource_find_match(&i, &dname, &dunit, "at", buf)) == 0) {
if (strcmp(dname, "scbus")) {
/* Avoid a bit of foot shooting. */
continue;
}
if (dunit < 0) /* unwired?! */
continue;
if (resource_int_value("scbus", dunit, "bus", &val) == 0) {
if (sim_bus == val) {
pathid = dunit;
break;
}
} else if (sim_bus == 0) {
/* Unspecified matches bus 0 */
pathid = dunit;
break;
} else {
printf("Ambiguous scbus configuration for %s%d "
"bus %d, cannot wire down. The kernel "
"config entry for scbus%d should "
"specify a controller bus.\n"
"Scbus will be assigned dynamically.\n",
sim_name, sim_unit, sim_bus, dunit);
break;
}
}
if (pathid == CAM_XPT_PATH_ID)
pathid = xptnextfreepathid();
return (pathid);
}
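/*
 * The wiring consulted above comes from loader/device hints of the form
 * (controller name "ahc0" is only an example):
 *
 *	hint.scbus.0.at="ahc0"
 *	hint.scbus.0.bus="0"
 *
 * which pins path_id 0 to bus 0 of ahc0; unwired SIMs fall back to
 * xptnextfreepathid().
 */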
static const char *
xpt_async_string(u_int32_t async_code)
{
switch (async_code) {
case AC_BUS_RESET: return ("AC_BUS_RESET");
case AC_UNSOL_RESEL: return ("AC_UNSOL_RESEL");
case AC_SCSI_AEN: return ("AC_SCSI_AEN");
case AC_SENT_BDR: return ("AC_SENT_BDR");
case AC_PATH_REGISTERED: return ("AC_PATH_REGISTERED");
case AC_PATH_DEREGISTERED: return ("AC_PATH_DEREGISTERED");
case AC_FOUND_DEVICE: return ("AC_FOUND_DEVICE");
case AC_LOST_DEVICE: return ("AC_LOST_DEVICE");
case AC_TRANSFER_NEG: return ("AC_TRANSFER_NEG");
case AC_INQ_CHANGED: return ("AC_INQ_CHANGED");
case AC_GETDEV_CHANGED: return ("AC_GETDEV_CHANGED");
case AC_CONTRACT: return ("AC_CONTRACT");
case AC_ADVINFO_CHANGED: return ("AC_ADVINFO_CHANGED");
case AC_UNIT_ATTENTION: return ("AC_UNIT_ATTENTION");
}
return ("AC_UNKNOWN");
}
static int
xpt_async_size(u_int32_t async_code)
{
switch (async_code) {
case AC_BUS_RESET: return (0);
case AC_UNSOL_RESEL: return (0);
case AC_SCSI_AEN: return (0);
case AC_SENT_BDR: return (0);
case AC_PATH_REGISTERED: return (sizeof(struct ccb_pathinq));
case AC_PATH_DEREGISTERED: return (0);
case AC_FOUND_DEVICE: return (sizeof(struct ccb_getdev));
case AC_LOST_DEVICE: return (0);
case AC_TRANSFER_NEG: return (sizeof(struct ccb_trans_settings));
case AC_INQ_CHANGED: return (0);
case AC_GETDEV_CHANGED: return (0);
case AC_CONTRACT: return (sizeof(struct ac_contract));
case AC_ADVINFO_CHANGED: return (-1);
case AC_UNIT_ATTENTION: return (sizeof(struct ccb_scsiio));
}
return (0);
}
static int
xpt_async_process_dev(struct cam_ed *device, void *arg)
{
union ccb *ccb = arg;
struct cam_path *path = ccb->ccb_h.path;
void *async_arg = ccb->casync.async_arg_ptr;
u_int32_t async_code = ccb->casync.async_code;
int relock;
if (path->device != device
&& path->device->lun_id != CAM_LUN_WILDCARD
&& device->lun_id != CAM_LUN_WILDCARD)
return (1);
/*
* The async callback could free the device.
* If it is a broadcast async, it doesn't hold a
* device reference, so take our own reference.
*/
xpt_acquire_device(device);
/*
* If async for specific device is to be delivered to
* the wildcard client, take the specific device lock.
* XXX: We may need a way for the client to specify it.
*/
if ((device->lun_id == CAM_LUN_WILDCARD &&
path->device->lun_id != CAM_LUN_WILDCARD) ||
(device->target->target_id == CAM_TARGET_WILDCARD &&
path->target->target_id != CAM_TARGET_WILDCARD) ||
(device->target->bus->path_id == CAM_BUS_WILDCARD &&
path->target->bus->path_id != CAM_BUS_WILDCARD)) {
mtx_unlock(&device->device_mtx);
xpt_path_lock(path);
relock = 1;
} else
relock = 0;
(*(device->target->bus->xport->ops->async))(async_code,
device->target->bus, device->target, device, async_arg);
xpt_async_bcast(&device->asyncs, async_code, path, async_arg);
if (relock) {
xpt_path_unlock(path);
mtx_lock(&device->device_mtx);
}
xpt_release_device(device);
return (1);
}
static int
xpt_async_process_tgt(struct cam_et *target, void *arg)
{
union ccb *ccb = arg;
struct cam_path *path = ccb->ccb_h.path;
if (path->target != target
&& path->target->target_id != CAM_TARGET_WILDCARD
&& target->target_id != CAM_TARGET_WILDCARD)
return (1);
if (ccb->casync.async_code == AC_SENT_BDR) {
/* Update our notion of when the last reset occurred */
microtime(&target->last_reset);
}
return (xptdevicetraverse(target, NULL, xpt_async_process_dev, ccb));
}
static void
xpt_async_process(struct cam_periph *periph, union ccb *ccb)
{
struct cam_eb *bus;
struct cam_path *path;
void *async_arg;
u_int32_t async_code;
path = ccb->ccb_h.path;
async_code = ccb->casync.async_code;
async_arg = ccb->casync.async_arg_ptr;
CAM_DEBUG(path, CAM_DEBUG_TRACE | CAM_DEBUG_INFO,
("xpt_async(%s)\n", xpt_async_string(async_code)));
bus = path->bus;
if (async_code == AC_BUS_RESET) {
/* Update our notion of when the last reset occurred */
microtime(&bus->last_reset);
}
xpttargettraverse(bus, NULL, xpt_async_process_tgt, ccb);
/*
* If this wasn't a fully wildcarded async, tell all
* clients that want all async events.
*/
if (bus != xpt_periph->path->bus) {
xpt_path_lock(xpt_periph->path);
xpt_async_process_dev(xpt_periph->path->device, ccb);
xpt_path_unlock(xpt_periph->path);
}
if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
xpt_release_devq(path, 1, TRUE);
else
xpt_release_simq(path->bus->sim, TRUE);
if (ccb->casync.async_arg_size > 0)
free(async_arg, M_CAMXPT);
xpt_free_path(path);
xpt_free_ccb(ccb);
}
static void
xpt_async_bcast(struct async_list *async_head,
u_int32_t async_code,
struct cam_path *path, void *async_arg)
{
struct async_node *cur_entry;
struct mtx *mtx;
cur_entry = SLIST_FIRST(async_head);
while (cur_entry != NULL) {
struct async_node *next_entry;
/*
* Grab the next list entry before we call the current
* entry's callback. This is because the callback function
* can delete its async callback entry.
*/
next_entry = SLIST_NEXT(cur_entry, links);
if ((cur_entry->event_enable & async_code) != 0) {
mtx = cur_entry->event_lock ?
path->device->sim->mtx : NULL;
if (mtx)
mtx_lock(mtx);
cur_entry->callback(cur_entry->callback_arg,
async_code, path,
async_arg);
if (mtx)
mtx_unlock(mtx);
}
cur_entry = next_entry;
}
}
void
xpt_async(u_int32_t async_code, struct cam_path *path, void *async_arg)
{
union ccb *ccb;
int size;
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
xpt_print(path, "Can't allocate CCB to send %s\n",
xpt_async_string(async_code));
return;
}
if (xpt_clone_path(&ccb->ccb_h.path, path) != CAM_REQ_CMP) {
xpt_print(path, "Can't allocate path to send %s\n",
xpt_async_string(async_code));
xpt_free_ccb(ccb);
return;
}
ccb->ccb_h.path->periph = NULL;
ccb->ccb_h.func_code = XPT_ASYNC;
ccb->ccb_h.cbfcnp = xpt_async_process;
ccb->ccb_h.flags |= CAM_UNLOCKED;
ccb->casync.async_code = async_code;
ccb->casync.async_arg_size = 0;
size = xpt_async_size(async_code);
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("xpt_async: func %#x %s aync_code %d %s\n",
ccb->ccb_h.func_code,
xpt_action_name(ccb->ccb_h.func_code),
async_code,
xpt_async_string(async_code)));
if (size > 0 && async_arg != NULL) {
ccb->casync.async_arg_ptr = malloc(size, M_CAMXPT, M_NOWAIT);
if (ccb->casync.async_arg_ptr == NULL) {
xpt_print(path, "Can't allocate argument to send %s\n",
xpt_async_string(async_code));
xpt_free_path(ccb->ccb_h.path);
xpt_free_ccb(ccb);
return;
}
memcpy(ccb->casync.async_arg_ptr, async_arg, size);
ccb->casync.async_arg_size = size;
} else if (size < 0) {
ccb->casync.async_arg_ptr = async_arg;
ccb->casync.async_arg_size = size;
}
if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD)
xpt_freeze_devq(path, 1);
else
xpt_freeze_simq(path->bus->sim, 1);
xpt_action(ccb);
}
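/*
 * A minimal sketch of how a transport or SIM typically raises one of these
 * notifications, e.g. after noticing that a device has gone away; "path"
 * is assumed to reference the lost device and foo_device_gone() is a
 * hypothetical name.
 */
#if 0
static void
foo_device_gone(struct cam_path *path)
{
	xpt_async(AC_LOST_DEVICE, path, NULL);
}
#endif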
static void
xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus,
struct cam_et *target, struct cam_ed *device,
void *async_arg)
{
/*
* We only need to handle events for real devices.
*/
if (target->target_id == CAM_TARGET_WILDCARD
|| device->lun_id == CAM_LUN_WILDCARD)
return;
printf("%s called\n", __func__);
}
static uint32_t
xpt_freeze_devq_device(struct cam_ed *dev, u_int count)
{
struct cam_devq *devq;
uint32_t freeze;
devq = dev->sim->devq;
mtx_assert(&devq->send_mtx, MA_OWNED);
CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
("xpt_freeze_devq_device(%d) %u->%u\n", count,
dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt + count));
freeze = (dev->ccbq.queue.qfrozen_cnt += count);
/* Remove frozen device from sendq. */
if (device_is_queued(dev))
camq_remove(&devq->send_queue, dev->devq_entry.index);
return (freeze);
}
u_int32_t
xpt_freeze_devq(struct cam_path *path, u_int count)
{
struct cam_ed *dev = path->device;
struct cam_devq *devq;
uint32_t freeze;
devq = dev->sim->devq;
mtx_lock(&devq->send_mtx);
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_freeze_devq(%d)\n", count));
freeze = xpt_freeze_devq_device(dev, count);
mtx_unlock(&devq->send_mtx);
return (freeze);
}
u_int32_t
xpt_freeze_simq(struct cam_sim *sim, u_int count)
{
struct cam_devq *devq;
uint32_t freeze;
devq = sim->devq;
mtx_lock(&devq->send_mtx);
freeze = (devq->send_queue.qfrozen_cnt += count);
mtx_unlock(&devq->send_mtx);
return (freeze);
}
static void
xpt_release_devq_timeout(void *arg)
{
struct cam_ed *dev;
struct cam_devq *devq;
dev = (struct cam_ed *)arg;
CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_timeout\n"));
devq = dev->sim->devq;
mtx_assert(&devq->send_mtx, MA_OWNED);
if (xpt_release_devq_device(dev, /*count*/1, /*run_queue*/TRUE))
xpt_run_devq(devq);
}
void
xpt_release_devq(struct cam_path *path, u_int count, int run_queue)
{
struct cam_ed *dev;
struct cam_devq *devq;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_devq(%d, %d)\n",
count, run_queue));
dev = path->device;
devq = dev->sim->devq;
mtx_lock(&devq->send_mtx);
if (xpt_release_devq_device(dev, count, run_queue))
xpt_run_devq(dev->sim->devq);
mtx_unlock(&devq->send_mtx);
}
static int
xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue)
{
mtx_assert(&dev->sim->devq->send_mtx, MA_OWNED);
CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE,
("xpt_release_devq_device(%d, %d) %u->%u\n", count, run_queue,
dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt - count));
if (count > dev->ccbq.queue.qfrozen_cnt) {
#ifdef INVARIANTS
printf("xpt_release_devq(): requested %u > present %u\n",
count, dev->ccbq.queue.qfrozen_cnt);
#endif
count = dev->ccbq.queue.qfrozen_cnt;
}
dev->ccbq.queue.qfrozen_cnt -= count;
if (dev->ccbq.queue.qfrozen_cnt == 0) {
/*
* No longer need to wait for a successful
* command completion.
*/
dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
/*
* Remove any timeouts that might be scheduled
* to release this queue.
*/
if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) {
callout_stop(&dev->callout);
dev->flags &= ~CAM_DEV_REL_TIMEOUT_PENDING;
}
/*
* Now that we are unfrozen schedule the
* device so any pending transactions are
* run.
*/
xpt_schedule_devq(dev->sim->devq, dev);
} else
run_queue = 0;
return (run_queue);
}
void
xpt_release_simq(struct cam_sim *sim, int run_queue)
{
struct cam_devq *devq;
devq = sim->devq;
mtx_lock(&devq->send_mtx);
if (devq->send_queue.qfrozen_cnt <= 0) {
#ifdef INVARIANTS
printf("xpt_release_simq: requested 1 > present %u\n",
devq->send_queue.qfrozen_cnt);
#endif
} else
devq->send_queue.qfrozen_cnt--;
if (devq->send_queue.qfrozen_cnt == 0) {
/*
* If there is a timeout scheduled to release this
* sim queue, remove it. The queue frozen count is
* already at 0.
*/
if ((sim->flags & CAM_SIM_REL_TIMEOUT_PENDING) != 0){
callout_stop(&sim->callout);
sim->flags &= ~CAM_SIM_REL_TIMEOUT_PENDING;
}
if (run_queue) {
/*
* Now that we are unfrozen run the send queue.
*/
xpt_run_devq(sim->devq);
}
}
mtx_unlock(&devq->send_mtx);
}
void
xpt_done(union ccb *done_ccb)
{
struct cam_doneq *queue;
int run, hash;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (done_ccb->ccb_h.func_code == XPT_SCSI_IO &&
done_ccb->csio.bio != NULL)
biotrack(done_ccb->csio.bio, __func__);
#endif
CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
("xpt_done: func= %#x %s status %#x\n",
done_ccb->ccb_h.func_code,
xpt_action_name(done_ccb->ccb_h.func_code),
done_ccb->ccb_h.status));
if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
return;
/* Store the time the ccb was in the sim */
done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data);
hash = (u_int)(done_ccb->ccb_h.path_id + done_ccb->ccb_h.target_id +
done_ccb->ccb_h.target_lun) % cam_num_doneqs;
queue = &cam_doneqs[hash];
mtx_lock(&queue->cam_doneq_mtx);
run = (queue->cam_doneq_sleep && STAILQ_EMPTY(&queue->cam_doneq));
STAILQ_INSERT_TAIL(&queue->cam_doneq, &done_ccb->ccb_h, sim_links.stqe);
done_ccb->ccb_h.pinfo.index = CAM_DONEQ_INDEX;
mtx_unlock(&queue->cam_doneq_mtx);
if (run)
wakeup(&queue->cam_doneq);
}
void
xpt_done_direct(union ccb *done_ccb)
{
CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE,
("xpt_done_direct: status %#x\n", done_ccb->ccb_h.status));
if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0)
return;
/* Store the time the ccb was in the sim */
done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data);
xpt_done_process(&done_ccb->ccb_h);
}
union ccb *
xpt_alloc_ccb(void)
{
union ccb *new_ccb;
new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK);
return (new_ccb);
}
union ccb *
xpt_alloc_ccb_nowait(void)
{
union ccb *new_ccb;
new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT);
return (new_ccb);
}
void
xpt_free_ccb(union ccb *free_ccb)
{
free(free_ccb, M_CAMCCB);
}
/* Private XPT functions */
/*
* Get a CAM control block for the caller. Charge the structure to the device
* referenced by the path. If we don't have sufficient resources to allocate
* more ccbs, we return NULL.
*/
static union ccb *
xpt_get_ccb_nowait(struct cam_periph *periph)
{
union ccb *new_ccb;
new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT);
if (new_ccb == NULL)
return (NULL);
periph->periph_allocated++;
cam_ccbq_take_opening(&periph->path->device->ccbq);
return (new_ccb);
}
static union ccb *
xpt_get_ccb(struct cam_periph *periph)
{
union ccb *new_ccb;
cam_periph_unlock(periph);
new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK);
cam_periph_lock(periph);
periph->periph_allocated++;
cam_ccbq_take_opening(&periph->path->device->ccbq);
return (new_ccb);
}
union ccb *
cam_periph_getccb(struct cam_periph *periph, u_int32_t priority)
{
struct ccb_hdr *ccb_h;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cam_periph_getccb\n"));
cam_periph_assert(periph, MA_OWNED);
while ((ccb_h = SLIST_FIRST(&periph->ccb_list)) == NULL ||
ccb_h->pinfo.priority != priority) {
if (priority < periph->immediate_priority) {
periph->immediate_priority = priority;
xpt_run_allocq(periph, 0);
} else
cam_periph_sleep(periph, &periph->ccb_list, PRIBIO,
"cgticb", 0);
}
SLIST_REMOVE_HEAD(&periph->ccb_list, periph_links.sle);
return ((union ccb *)ccb_h);
}
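/*
 * A minimal sketch of the allocate/run/release cycle from a peripheral
 * driver's perspective, assuming the periph lock is held and using a
 * TEST UNIT READY as a stand-in command. The foo_tur()/fooerror() names
 * and the retry/timeout values are hypothetical examples.
 */
#if 0
static int
foo_tur(struct cam_periph *periph)
{
	union ccb *ccb;
	int error;

	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
	scsi_test_unit_ready(&ccb->csio, /*retries*/4, /*cbfcnp*/NULL,
	    MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, /*timeout*/5000);
	error = cam_periph_runccb(ccb, fooerror, CAM_RETRY_SELTO,
	    SF_RETRY_UA, /*devstat*/NULL);
	xpt_release_ccb(ccb);
	return (error);
}
#endif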
static void
xpt_acquire_bus(struct cam_eb *bus)
{
xpt_lock_buses();
bus->refcount++;
xpt_unlock_buses();
}
static void
xpt_release_bus(struct cam_eb *bus)
{
xpt_lock_buses();
KASSERT(bus->refcount >= 1, ("bus->refcount >= 1"));
if (--bus->refcount > 0) {
xpt_unlock_buses();
return;
}
TAILQ_REMOVE(&xsoftc.xpt_busses, bus, links);
xsoftc.bus_generation++;
xpt_unlock_buses();
KASSERT(TAILQ_EMPTY(&bus->et_entries),
("destroying bus, but target list is not empty"));
cam_sim_release(bus->sim);
mtx_destroy(&bus->eb_mtx);
free(bus, M_CAMXPT);
}
static struct cam_et *
xpt_alloc_target(struct cam_eb *bus, target_id_t target_id)
{
struct cam_et *cur_target, *target;
mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED);
mtx_assert(&bus->eb_mtx, MA_OWNED);
target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT,
M_NOWAIT|M_ZERO);
if (target == NULL)
return (NULL);
TAILQ_INIT(&target->ed_entries);
target->bus = bus;
target->target_id = target_id;
target->refcount = 1;
target->generation = 0;
target->luns = NULL;
mtx_init(&target->luns_mtx, "CAM LUNs lock", NULL, MTX_DEF);
timevalclear(&target->last_reset);
/*
* Hold a reference to our parent bus so it
* will not go away before we do.
*/
bus->refcount++;
/* Insertion sort into our bus's target list */
cur_target = TAILQ_FIRST(&bus->et_entries);
while (cur_target != NULL && cur_target->target_id < target_id)
cur_target = TAILQ_NEXT(cur_target, links);
if (cur_target != NULL) {
TAILQ_INSERT_BEFORE(cur_target, target, links);
} else {
TAILQ_INSERT_TAIL(&bus->et_entries, target, links);
}
bus->generation++;
return (target);
}
static void
xpt_acquire_target(struct cam_et *target)
{
struct cam_eb *bus = target->bus;
mtx_lock(&bus->eb_mtx);
target->refcount++;
mtx_unlock(&bus->eb_mtx);
}
static void
xpt_release_target(struct cam_et *target)
{
struct cam_eb *bus = target->bus;
mtx_lock(&bus->eb_mtx);
if (--target->refcount > 0) {
mtx_unlock(&bus->eb_mtx);
return;
}
TAILQ_REMOVE(&bus->et_entries, target, links);
bus->generation++;
mtx_unlock(&bus->eb_mtx);
KASSERT(TAILQ_EMPTY(&target->ed_entries),
("destroying target, but device list is not empty"));
xpt_release_bus(bus);
mtx_destroy(&target->luns_mtx);
if (target->luns)
free(target->luns, M_CAMXPT);
free(target, M_CAMXPT);
}
static struct cam_ed *
xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target,
lun_id_t lun_id)
{
struct cam_ed *device;
device = xpt_alloc_device(bus, target, lun_id);
if (device == NULL)
return (NULL);
device->mintags = 1;
device->maxtags = 1;
return (device);
}
static void
xpt_destroy_device(void *context, int pending)
{
struct cam_ed *device = context;
mtx_lock(&device->device_mtx);
mtx_destroy(&device->device_mtx);
free(device, M_CAMDEV);
}
struct cam_ed *
xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id)
{
struct cam_ed *cur_device, *device;
struct cam_devq *devq;
cam_status status;
mtx_assert(&bus->eb_mtx, MA_OWNED);
/* Make space for us in the device queue on our bus */
devq = bus->sim->devq;
mtx_lock(&devq->send_mtx);
status = cam_devq_resize(devq, devq->send_queue.array_size + 1);
mtx_unlock(&devq->send_mtx);
if (status != CAM_REQ_CMP)
return (NULL);
device = (struct cam_ed *)malloc(sizeof(*device),
M_CAMDEV, M_NOWAIT|M_ZERO);
if (device == NULL)
return (NULL);
cam_init_pinfo(&device->devq_entry);
device->target = target;
device->lun_id = lun_id;
device->sim = bus->sim;
if (cam_ccbq_init(&device->ccbq,
bus->sim->max_dev_openings) != 0) {
free(device, M_CAMDEV);
return (NULL);
}
SLIST_INIT(&device->asyncs);
SLIST_INIT(&device->periphs);
device->generation = 0;
device->flags = CAM_DEV_UNCONFIGURED;
device->tag_delay_count = 0;
device->tag_saved_openings = 0;
device->refcount = 1;
mtx_init(&device->device_mtx, "CAM device lock", NULL, MTX_DEF);
callout_init_mtx(&device->callout, &devq->send_mtx, 0);
TASK_INIT(&device->device_destroy_task, 0, xpt_destroy_device, device);
/*
* Hold a reference to our parent bus so it
* will not go away before we do.
*/
target->refcount++;
cur_device = TAILQ_FIRST(&target->ed_entries);
while (cur_device != NULL && cur_device->lun_id < lun_id)
cur_device = TAILQ_NEXT(cur_device, links);
if (cur_device != NULL)
TAILQ_INSERT_BEFORE(cur_device, device, links);
else
TAILQ_INSERT_TAIL(&target->ed_entries, device, links);
target->generation++;
return (device);
}
void
xpt_acquire_device(struct cam_ed *device)
{
struct cam_eb *bus = device->target->bus;
mtx_lock(&bus->eb_mtx);
device->refcount++;
mtx_unlock(&bus->eb_mtx);
}
void
xpt_release_device(struct cam_ed *device)
{
struct cam_eb *bus = device->target->bus;
struct cam_devq *devq;
mtx_lock(&bus->eb_mtx);
if (--device->refcount > 0) {
mtx_unlock(&bus->eb_mtx);
return;
}
TAILQ_REMOVE(&device->target->ed_entries, device, links);
device->target->generation++;
mtx_unlock(&bus->eb_mtx);
/* Release our slot in the devq */
devq = bus->sim->devq;
mtx_lock(&devq->send_mtx);
cam_devq_resize(devq, devq->send_queue.array_size - 1);
KASSERT(SLIST_EMPTY(&device->periphs),
("destroying device, but periphs list is not empty"));
KASSERT(device->devq_entry.index == CAM_UNQUEUED_INDEX,
("destroying device while still queued for ccbs"));
/* The send_mtx must be held when accessing the callout */
if ((device->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0)
callout_stop(&device->callout);
mtx_unlock(&devq->send_mtx);
xpt_release_target(device->target);
cam_ccbq_fini(&device->ccbq);
/*
* Free allocated memory. free(9) does nothing if the
* supplied pointer is NULL, so it is safe to call without
* checking.
*/
free(device->supported_vpds, M_CAMXPT);
free(device->device_id, M_CAMXPT);
free(device->ext_inq, M_CAMXPT);
free(device->physpath, M_CAMXPT);
free(device->rcap_buf, M_CAMXPT);
free(device->serial_num, M_CAMXPT);
free(device->nvme_data, M_CAMXPT);
free(device->nvme_cdata, M_CAMXPT);
taskqueue_enqueue(xsoftc.xpt_taskq, &device->device_destroy_task);
}
u_int32_t
xpt_dev_ccbq_resize(struct cam_path *path, int newopenings)
{
int result;
struct cam_ed *dev;
dev = path->device;
mtx_lock(&dev->sim->devq->send_mtx);
result = cam_ccbq_resize(&dev->ccbq, newopenings);
mtx_unlock(&dev->sim->devq->send_mtx);
if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0
|| (dev->inq_flags & SID_CmdQue) != 0)
dev->tag_saved_openings = newopenings;
return (result);
}
static struct cam_eb *
xpt_find_bus(path_id_t path_id)
{
struct cam_eb *bus;
xpt_lock_buses();
for (bus = TAILQ_FIRST(&xsoftc.xpt_busses);
bus != NULL;
bus = TAILQ_NEXT(bus, links)) {
if (bus->path_id == path_id) {
bus->refcount++;
break;
}
}
xpt_unlock_buses();
return (bus);
}
static struct cam_et *
xpt_find_target(struct cam_eb *bus, target_id_t target_id)
{
struct cam_et *target;
mtx_assert(&bus->eb_mtx, MA_OWNED);
for (target = TAILQ_FIRST(&bus->et_entries);
target != NULL;
target = TAILQ_NEXT(target, links)) {
if (target->target_id == target_id) {
target->refcount++;
break;
}
}
return (target);
}
static struct cam_ed *
xpt_find_device(struct cam_et *target, lun_id_t lun_id)
{
struct cam_ed *device;
mtx_assert(&target->bus->eb_mtx, MA_OWNED);
for (device = TAILQ_FIRST(&target->ed_entries);
device != NULL;
device = TAILQ_NEXT(device, links)) {
if (device->lun_id == lun_id) {
device->refcount++;
break;
}
}
return (device);
}
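/*
 * Each of the xpt_find_*() lookups above returns with an extra reference
 * held on the object it finds (bus, target, or device); the caller is
 * expected to drop that reference with the matching xpt_release_*() once
 * it is done with the object.
 */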
void
xpt_start_tags(struct cam_path *path)
{
struct ccb_relsim crs;
struct cam_ed *device;
struct cam_sim *sim;
int newopenings;
device = path->device;
sim = path->bus->sim;
device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
xpt_freeze_devq(path, /*count*/1);
device->inq_flags |= SID_CmdQue;
if (device->tag_saved_openings != 0)
newopenings = device->tag_saved_openings;
else
newopenings = min(device->maxtags,
sim->max_tagged_dev_openings);
xpt_dev_ccbq_resize(path, newopenings);
xpt_async(AC_GETDEV_CHANGED, path, NULL);
xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
crs.ccb_h.func_code = XPT_REL_SIMQ;
crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
crs.openings
= crs.release_timeout
= crs.qfrozen_cnt
= 0;
xpt_action((union ccb *)&crs);
}
void
xpt_stop_tags(struct cam_path *path)
{
struct ccb_relsim crs;
struct cam_ed *device;
struct cam_sim *sim;
device = path->device;
sim = path->bus->sim;
device->flags &= ~CAM_DEV_TAG_AFTER_COUNT;
device->tag_delay_count = 0;
xpt_freeze_devq(path, /*count*/1);
device->inq_flags &= ~SID_CmdQue;
xpt_dev_ccbq_resize(path, sim->max_dev_openings);
xpt_async(AC_GETDEV_CHANGED, path, NULL);
xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
crs.ccb_h.func_code = XPT_REL_SIMQ;
crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY;
crs.openings
= crs.release_timeout
= crs.qfrozen_cnt
= 0;
xpt_action((union ccb *)&crs);
}
/*
* Assume all possible buses are detected by this time, so allow boot
* as soon as they all are scanned.
*/
static void
xpt_boot_delay(void *arg)
{
xpt_release_boot();
}
/*
* Now that all config hooks have completed, start boot_delay timer,
* waiting for possibly still undetected buses (USB) to appear.
*/
static void
xpt_ch_done(void *arg)
{
callout_init(&xsoftc.boot_callout, 1);
callout_reset_sbt(&xsoftc.boot_callout, SBT_1MS * xsoftc.boot_delay, 0,
xpt_boot_delay, NULL, 0);
}
SYSINIT(xpt_hw_delay, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_ANY, xpt_ch_done, NULL);
/*
* Now that interrupts are enabled, go find our devices
*/
static void
xpt_config(void *arg)
{
if (taskqueue_start_threads(&xsoftc.xpt_taskq, 1, PRIBIO, "CAM taskq"))
printf("xpt_config: failed to create taskqueue thread.\n");
/* Setup debugging path */
if (cam_dflags != CAM_DEBUG_NONE) {
if (xpt_create_path(&cam_dpath, NULL,
CAM_DEBUG_BUS, CAM_DEBUG_TARGET,
CAM_DEBUG_LUN) != CAM_REQ_CMP) {
printf("xpt_config: xpt_create_path() failed for debug"
" target %d:%d:%d, debugging disabled\n",
CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN);
cam_dflags = CAM_DEBUG_NONE;
}
} else
cam_dpath = NULL;
periphdriver_init(1);
xpt_hold_boot();
/* Fire up rescan thread. */
if (kproc_kthread_add(xpt_scanner_thread, NULL, &cam_proc, NULL, 0, 0,
"cam", "scanner")) {
printf("xpt_config: failed to create rescan thread.\n");
}
}
void
xpt_hold_boot_locked(void)
{
if (xsoftc.buses_to_config++ == 0)
root_mount_hold_token("CAM", &xsoftc.xpt_rootmount);
}
void
xpt_hold_boot(void)
{
xpt_lock_buses();
xpt_hold_boot_locked();
xpt_unlock_buses();
}
void
xpt_release_boot(void)
{
xpt_lock_buses();
if (--xsoftc.buses_to_config == 0) {
if (xsoftc.buses_config_done == 0) {
xsoftc.buses_config_done = 1;
xsoftc.buses_to_config++;
TASK_INIT(&xsoftc.boot_task, 0, xpt_finishconfig_task,
NULL);
taskqueue_enqueue(taskqueue_thread, &xsoftc.boot_task);
} else
root_mount_rel(&xsoftc.xpt_rootmount);
}
xpt_unlock_buses();
}
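/*
 * Note on the accounting above: the first xpt_hold_boot() takes a root
 * mount hold token, and each further hold only bumps buses_to_config.
 * The first time xpt_release_boot() drops the count to zero it re-takes
 * one count and queues xpt_finishconfig_task(); that task's final call to
 * xpt_release_boot() brings the count back to zero with buses_config_done
 * set, which is when the root mount hold is finally released.
 */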
/*
* If the given device only has one peripheral attached to it, and if that
* peripheral is the passthrough driver, announce it. This ensures that the
* user sees some sort of announcement for every peripheral in their system.
*/
static int
xptpassannouncefunc(struct cam_ed *device, void *arg)
{
struct cam_periph *periph;
int i;
for (periph = SLIST_FIRST(&device->periphs), i = 0; periph != NULL;
periph = SLIST_NEXT(periph, periph_links), i++);
periph = SLIST_FIRST(&device->periphs);
if ((i == 1)
&& (strncmp(periph->periph_name, "pass", 4) == 0))
xpt_announce_periph(periph, NULL);
return(1);
}
static void
xpt_finishconfig_task(void *context, int pending)
{
periphdriver_init(2);
/*
* Check for devices with no "standard" peripheral driver
* attached. For any devices like that, announce the
* passthrough driver so the user will see something.
*/
if (!bootverbose)
xpt_for_all_devices(xptpassannouncefunc, NULL);
xpt_release_boot();
}
cam_status
xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg,
struct cam_path *path)
{
struct ccb_setasync csa;
cam_status status;
int xptpath = 0;
if (path == NULL) {
status = xpt_create_path(&path, /*periph*/NULL, CAM_XPT_PATH_ID,
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
if (status != CAM_REQ_CMP)
return (status);
xpt_path_lock(path);
xptpath = 1;
}
xpt_setup_ccb(&csa.ccb_h, path, CAM_PRIORITY_NORMAL);
csa.ccb_h.func_code = XPT_SASYNC_CB;
csa.event_enable = event;
csa.callback = cbfunc;
csa.callback_arg = cbarg;
xpt_action((union ccb *)&csa);
status = csa.ccb_h.status;
CAM_DEBUG(csa.ccb_h.path, CAM_DEBUG_TRACE,
("xpt_register_async: func %p\n", cbfunc));
if (xptpath) {
xpt_path_unlock(path);
xpt_free_path(path);
}
if ((status == CAM_REQ_CMP) &&
(csa.event_enable & AC_FOUND_DEVICE)) {
/*
* Get this peripheral up to date with all
* the currently existing devices.
*/
xpt_for_all_devices(xptsetasyncfunc, &csa);
}
if ((status == CAM_REQ_CMP) &&
(csa.event_enable & AC_PATH_REGISTERED)) {
/*
* Get this peripheral up to date with all
* the currently existing buses.
*/
xpt_for_all_busses(xptsetasyncbusfunc, &csa);
}
return (status);
}
static void
xptaction(struct cam_sim *sim, union ccb *work_ccb)
{
CAM_DEBUG(work_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xptaction\n"));
switch (work_ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi;
cpi = &work_ccb->cpi;
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = 0;
cpi->target_sprt = 0;
cpi->hba_misc = 0;
cpi->hba_eng_cnt = 0;
cpi->max_target = 0;
cpi->max_lun = 0;
cpi->initiator_id = 0;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "", HBA_IDLEN);
strlcpy(cpi->dev_name, sim->sim_name, DEV_IDLEN);
cpi->unit_number = sim->unit_number;
cpi->bus_id = sim->bus_id;
cpi->base_transfer_speed = 0;
cpi->protocol = PROTO_UNSPECIFIED;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
cpi->transport = XPORT_UNSPECIFIED;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
work_ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(work_ccb);
}
/*
* The xpt as a "controller" has no interrupt sources, so polling
* is a no-op.
*/
static void
xptpoll(struct cam_sim *sim)
{
}
void
xpt_lock_buses(void)
{
mtx_lock(&xsoftc.xpt_topo_lock);
}
void
xpt_unlock_buses(void)
{
mtx_unlock(&xsoftc.xpt_topo_lock);
}
struct mtx *
xpt_path_mtx(struct cam_path *path)
{
return (&path->device->device_mtx);
}
static void
xpt_done_process(struct ccb_hdr *ccb_h)
{
struct cam_sim *sim = NULL;
struct cam_devq *devq = NULL;
struct mtx *mtx = NULL;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
struct ccb_scsiio *csio;
if (ccb_h->func_code == XPT_SCSI_IO) {
csio = &((union ccb *)ccb_h)->csio;
if (csio->bio != NULL)
biotrack(csio->bio, __func__);
}
#endif
if (ccb_h->flags & CAM_HIGH_POWER) {
struct highpowerlist *hphead;
struct cam_ed *device;
mtx_lock(&xsoftc.xpt_highpower_lock);
hphead = &xsoftc.highpowerq;
device = STAILQ_FIRST(hphead);
/*
* Increment the count since this command is done.
*/
xsoftc.num_highpower++;
/*
* Any high powered commands queued up?
*/
if (device != NULL) {
STAILQ_REMOVE_HEAD(hphead, highpowerq_entry);
mtx_unlock(&xsoftc.xpt_highpower_lock);
mtx_lock(&device->sim->devq->send_mtx);
xpt_release_devq_device(device,
/*count*/1, /*runqueue*/TRUE);
mtx_unlock(&device->sim->devq->send_mtx);
} else
mtx_unlock(&xsoftc.xpt_highpower_lock);
}
/*
* Insulate against a race where the periph is destroyed but CCBs are
* still not all processed. This shouldn't happen, but handling it here
* gives us better bug diagnostics when it does.
*/
if (ccb_h->path->bus)
sim = ccb_h->path->bus->sim;
if (ccb_h->status & CAM_RELEASE_SIMQ) {
KASSERT(sim, ("sim missing for CAM_RELEASE_SIMQ request"));
xpt_release_simq(sim, /*run_queue*/FALSE);
ccb_h->status &= ~CAM_RELEASE_SIMQ;
}
if ((ccb_h->flags & CAM_DEV_QFRZDIS)
&& (ccb_h->status & CAM_DEV_QFRZN)) {
xpt_release_devq(ccb_h->path, /*count*/1, /*run_queue*/TRUE);
ccb_h->status &= ~CAM_DEV_QFRZN;
}
if ((ccb_h->func_code & XPT_FC_USER_CCB) == 0) {
struct cam_ed *dev = ccb_h->path->device;
if (sim)
devq = sim->devq;
KASSERT(devq, ("Periph disappeared with CCB %p %s request pending.",
ccb_h, xpt_action_name(ccb_h->func_code)));
mtx_lock(&devq->send_mtx);
devq->send_active--;
devq->send_openings++;
cam_ccbq_ccb_done(&dev->ccbq, (union ccb *)ccb_h);
if (((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0
&& (dev->ccbq.dev_active == 0))) {
dev->flags &= ~CAM_DEV_REL_ON_QUEUE_EMPTY;
xpt_release_devq_device(dev, /*count*/1,
/*run_queue*/FALSE);
}
if (((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0
&& (ccb_h->status&CAM_STATUS_MASK) != CAM_REQUEUE_REQ)) {
dev->flags &= ~CAM_DEV_REL_ON_COMPLETE;
xpt_release_devq_device(dev, /*count*/1,
/*run_queue*/FALSE);
}
if (!device_is_queued(dev))
(void)xpt_schedule_devq(devq, dev);
xpt_run_devq(devq);
mtx_unlock(&devq->send_mtx);
if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0) {
mtx = xpt_path_mtx(ccb_h->path);
mtx_lock(mtx);
if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0
&& (--dev->tag_delay_count == 0))
xpt_start_tags(ccb_h->path);
}
}
if ((ccb_h->flags & CAM_UNLOCKED) == 0) {
if (mtx == NULL) {
mtx = xpt_path_mtx(ccb_h->path);
mtx_lock(mtx);
}
} else {
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
}
/* Call the peripheral driver's callback */
ccb_h->pinfo.index = CAM_UNQUEUED_INDEX;
(*ccb_h->cbfcnp)(ccb_h->path->periph, (union ccb *)ccb_h);
if (mtx != NULL)
mtx_unlock(mtx);
}
void
xpt_done_td(void *arg)
{
struct cam_doneq *queue = arg;
struct ccb_hdr *ccb_h;
STAILQ_HEAD(, ccb_hdr) doneq;
STAILQ_INIT(&doneq);
mtx_lock(&queue->cam_doneq_mtx);
while (1) {
while (STAILQ_EMPTY(&queue->cam_doneq)) {
queue->cam_doneq_sleep = 1;
msleep(&queue->cam_doneq, &queue->cam_doneq_mtx,
PRIBIO, "-", 0);
queue->cam_doneq_sleep = 0;
}
STAILQ_CONCAT(&doneq, &queue->cam_doneq);
mtx_unlock(&queue->cam_doneq_mtx);
THREAD_NO_SLEEPING();
while ((ccb_h = STAILQ_FIRST(&doneq)) != NULL) {
STAILQ_REMOVE_HEAD(&doneq, sim_links.stqe);
xpt_done_process(ccb_h);
}
THREAD_SLEEPING_OK();
mtx_lock(&queue->cam_doneq_mtx);
}
}
static void
camisr_runqueue(void)
{
struct ccb_hdr *ccb_h;
struct cam_doneq *queue;
int i;
/* Process global queues. */
for (i = 0; i < cam_num_doneqs; i++) {
queue = &cam_doneqs[i];
mtx_lock(&queue->cam_doneq_mtx);
while ((ccb_h = STAILQ_FIRST(&queue->cam_doneq)) != NULL) {
STAILQ_REMOVE_HEAD(&queue->cam_doneq, sim_links.stqe);
mtx_unlock(&queue->cam_doneq_mtx);
xpt_done_process(ccb_h);
mtx_lock(&queue->cam_doneq_mtx);
}
mtx_unlock(&queue->cam_doneq_mtx);
}
}
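/*
 * Unlike xpt_done_td(), which batches completions onto a private list with
 * STAILQ_CONCAT() and processes them with sleeping disabled,
 * camisr_runqueue() drops cam_doneq_mtx around each xpt_done_process()
 * call and re-takes it before looking at the next CCB.
 */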
struct kv
{
uint32_t v;
const char *name;
};
static struct kv map[] = {
{ XPT_NOOP, "XPT_NOOP" },
{ XPT_SCSI_IO, "XPT_SCSI_IO" },
{ XPT_GDEV_TYPE, "XPT_GDEV_TYPE" },
{ XPT_GDEVLIST, "XPT_GDEVLIST" },
{ XPT_PATH_INQ, "XPT_PATH_INQ" },
{ XPT_REL_SIMQ, "XPT_REL_SIMQ" },
{ XPT_SASYNC_CB, "XPT_SASYNC_CB" },
{ XPT_SDEV_TYPE, "XPT_SDEV_TYPE" },
{ XPT_SCAN_BUS, "XPT_SCAN_BUS" },
{ XPT_DEV_MATCH, "XPT_DEV_MATCH" },
{ XPT_DEBUG, "XPT_DEBUG" },
{ XPT_PATH_STATS, "XPT_PATH_STATS" },
{ XPT_GDEV_STATS, "XPT_GDEV_STATS" },
{ XPT_DEV_ADVINFO, "XPT_DEV_ADVINFO" },
{ XPT_ASYNC, "XPT_ASYNC" },
{ XPT_ABORT, "XPT_ABORT" },
{ XPT_RESET_BUS, "XPT_RESET_BUS" },
{ XPT_RESET_DEV, "XPT_RESET_DEV" },
{ XPT_TERM_IO, "XPT_TERM_IO" },
{ XPT_SCAN_LUN, "XPT_SCAN_LUN" },
{ XPT_GET_TRAN_SETTINGS, "XPT_GET_TRAN_SETTINGS" },
{ XPT_SET_TRAN_SETTINGS, "XPT_SET_TRAN_SETTINGS" },
{ XPT_CALC_GEOMETRY, "XPT_CALC_GEOMETRY" },
{ XPT_ATA_IO, "XPT_ATA_IO" },
{ XPT_GET_SIM_KNOB, "XPT_GET_SIM_KNOB" },
{ XPT_SET_SIM_KNOB, "XPT_SET_SIM_KNOB" },
{ XPT_NVME_IO, "XPT_NVME_IO" },
{ XPT_MMC_IO, "XPT_MMC_IO" },
{ XPT_SMP_IO, "XPT_SMP_IO" },
{ XPT_SCAN_TGT, "XPT_SCAN_TGT" },
{ XPT_NVME_ADMIN, "XPT_NVME_ADMIN" },
{ XPT_ENG_INQ, "XPT_ENG_INQ" },
{ XPT_ENG_EXEC, "XPT_ENG_EXEC" },
{ XPT_EN_LUN, "XPT_EN_LUN" },
{ XPT_TARGET_IO, "XPT_TARGET_IO" },
{ XPT_ACCEPT_TARGET_IO, "XPT_ACCEPT_TARGET_IO" },
{ XPT_CONT_TARGET_IO, "XPT_CONT_TARGET_IO" },
{ XPT_IMMED_NOTIFY, "XPT_IMMED_NOTIFY" },
{ XPT_NOTIFY_ACK, "XPT_NOTIFY_ACK" },
{ XPT_IMMEDIATE_NOTIFY, "XPT_IMMEDIATE_NOTIFY" },
{ XPT_NOTIFY_ACKNOWLEDGE, "XPT_NOTIFY_ACKNOWLEDGE" },
{ 0, 0 }
};
const char *
xpt_action_name(uint32_t action)
{
static char buffer[32]; /* Only for unknown messages -- racy */
struct kv *walker = map;
while (walker->name != NULL) {
if (walker->v == action)
return (walker->name);
walker++;
}
snprintf(buffer, sizeof(buffer), "%#x", action);
return (buffer);
}
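/*
 * The static buffer above is shared by every caller, so two concurrent
 * lookups of unknown function codes can race, as the comment notes.  A
 * minimal sketch of a reentrant variant that formats into a caller-supplied
 * buffer follows; the name xpt_action_name_buf() is hypothetical and is not
 * part of the original change.
 */
static __unused const char *
xpt_action_name_buf(uint32_t action, char *buf, size_t len)
{
struct kv *walker = map;
while (walker->name != NULL) {
if (walker->v == action) {
strlcpy(buf, walker->name, len);
return (buf);
}
walker++;
}
snprintf(buf, len, "%#x", action);
return (buf);
}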
diff --git a/sys/cam/ctl/ctl_backend_block.c b/sys/cam/ctl/ctl_backend_block.c
index 2e79b166339b..cd4fab766844 100644
--- a/sys/cam/ctl/ctl_backend_block.c
+++ b/sys/cam/ctl/ctl_backend_block.c
@@ -1,2843 +1,2843 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2003 Silicon Graphics International Corp.
* Copyright (c) 2009-2011 Spectra Logic Corporation
* Copyright (c) 2012 The FreeBSD Foundation
* Copyright (c) 2014-2015 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed by Edward Tomasz Napierala
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon
* including a substantially similar Disclaimer requirement for further
* binary redistribution.
*
* NO WARRANTY
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGES.
*
* $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
*/
/*
* CAM Target Layer driver backend for block devices.
*
* Author: Ken Merry <ken@FreeBSD.org>
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/kthread.h>
#include <sys/bio.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/proc.h>
#include <sys/pcpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>
#include <sys/nv.h>
#include <sys/dnv.h>
#include <sys/sx.h>
#include <geom/geom.h>
#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_private.h>
#include <cam/ctl/ctl_error.h>
/*
* The idea here is that we'll allocate enough S/G space to hold a 1MB
* I/O. If we get an I/O larger than that, we'll split it.
*/
#define CTLBLK_HALF_IO_SIZE (512 * 1024)
#define CTLBLK_MAX_IO_SIZE (CTLBLK_HALF_IO_SIZE * 2)
-#define CTLBLK_MAX_SEG MIN(CTLBLK_HALF_IO_SIZE, MAXPHYS)
-#define CTLBLK_HALF_SEGS MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
+#define CTLBLK_MIN_SEG (128 * 1024)
+#define CTLBLK_MAX_SEG MIN(CTLBLK_HALF_IO_SIZE, maxphys)
+#define CTLBLK_HALF_SEGS MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MIN_SEG, 1)
#define CTLBLK_MAX_SEGS (CTLBLK_HALF_SEGS * 2)
+#define CTLBLK_NUM_SEGS (CTLBLK_MAX_IO_SIZE / CTLBLK_MAX_SEG)
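/*
 * Worked example (assuming the common maxphys default of 1 MiB):
 * CTLBLK_MAX_SEG = MIN(512 KiB, maxphys) = 512 KiB, so
 * CTLBLK_NUM_SEGS = 1 MiB / 512 KiB = 2, while
 * CTLBLK_HALF_SEGS = MAX(512 KiB / 128 KiB, 1) = 4 and CTLBLK_MAX_SEGS = 8.
 * A smaller maxphys shrinks CTLBLK_MAX_SEG and raises CTLBLK_NUM_SEGS
 * accordingly.
 */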
#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif
#define PRIV(io) \
((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define ARGS(io) \
((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])
SDT_PROVIDER_DEFINE(cbb);
typedef enum {
CTL_BE_BLOCK_LUN_UNCONFIGURED = 0x01,
CTL_BE_BLOCK_LUN_WAITING = 0x04,
} ctl_be_block_lun_flags;
typedef enum {
CTL_BE_BLOCK_NONE,
CTL_BE_BLOCK_DEV,
CTL_BE_BLOCK_FILE
} ctl_be_block_type;
struct ctl_be_block_filedata {
struct ucred *cred;
};
union ctl_be_block_bedata {
struct ctl_be_block_filedata file;
};
struct ctl_be_block_io;
struct ctl_be_block_lun;
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
const char *attrname);
/*
* Backend LUN structure. There is a 1:1 mapping between a block device
* and a backend block LUN, and between a backend block LUN and a CTL LUN.
*/
struct ctl_be_block_lun {
struct ctl_be_lun cbe_lun; /* Must be first element. */
struct ctl_lun_create_params params;
char *dev_path;
ctl_be_block_type dev_type;
struct vnode *vn;
union ctl_be_block_bedata backend;
cbb_dispatch_t dispatch;
cbb_dispatch_t lun_flush;
cbb_dispatch_t unmap;
cbb_dispatch_t get_lba_status;
cbb_getattr_t getattr;
uint64_t size_blocks;
uint64_t size_bytes;
struct ctl_be_block_softc *softc;
struct devstat *disk_stats;
ctl_be_block_lun_flags flags;
SLIST_ENTRY(ctl_be_block_lun) links;
struct taskqueue *io_taskqueue;
struct task io_task;
int num_threads;
STAILQ_HEAD(, ctl_io_hdr) input_queue;
STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
struct mtx_padalign io_lock;
struct mtx_padalign queue_lock;
};
/*
* Overall softc structure for the block backend module.
*/
struct ctl_be_block_softc {
struct sx modify_lock;
struct mtx lock;
int num_luns;
SLIST_HEAD(, ctl_be_block_lun) lun_list;
uma_zone_t beio_zone;
- uma_zone_t buf_zone;
-#if (CTLBLK_MAX_SEG > 131072)
- uma_zone_t buf128_zone;
-#endif
+ uma_zone_t bufmin_zone;
+ uma_zone_t bufmax_zone;
};
static struct ctl_be_block_softc backend_block_softc;
/*
* Per-I/O information.
*/
struct ctl_be_block_io {
union ctl_io *io;
struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS];
struct iovec xiovecs[CTLBLK_MAX_SEGS];
int refcnt;
int bio_cmd;
int two_sglists;
int num_segs;
int num_bios_sent;
int num_bios_done;
int send_complete;
int first_error;
uint64_t first_error_offset;
struct bintime ds_t0;
devstat_tag_type ds_tag_type;
devstat_trans_flags ds_trans_type;
uint64_t io_len;
uint64_t io_offset;
int io_arg;
struct ctl_be_block_softc *softc;
struct ctl_be_block_lun *lun;
void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
extern struct ctl_softc *control_softc;
static int cbb_num_threads = 14;
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
&cbb_num_threads, 0, "Number of threads per backing file");
static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
static void ctl_free_beio(struct ctl_be_block_io *beio);
static void ctl_complete_beio(struct ctl_be_block_io *beio);
static int ctl_be_block_move_done(union ctl_io *io);
static void ctl_be_block_biodone(struct bio *bio);
static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
const char *attrname);
static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio);
static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
const char *attrname);
static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io);
static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io);
static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io);
static void ctl_be_block_worker(void *context, int pending);
static int ctl_be_block_submit(union ctl_io *io);
static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
int flag, struct thread *td);
static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
struct ctl_lun_req *req);
static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
struct ctl_lun_req *req);
static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
static int ctl_be_block_open(struct ctl_be_block_lun *be_lun,
struct ctl_lun_req *req);
static int ctl_be_block_create(struct ctl_be_block_softc *softc,
struct ctl_lun_req *req);
static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
struct ctl_lun_req *req);
static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
struct ctl_lun_req *req);
static void ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun);
static int ctl_be_block_config_write(union ctl_io *io);
static int ctl_be_block_config_read(union ctl_io *io);
static int ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb);
static uint64_t ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname);
static int ctl_be_block_init(void);
static int ctl_be_block_shutdown(void);
static struct ctl_backend_driver ctl_be_block_driver =
{
.name = "block",
.flags = CTL_BE_FLAG_HAS_CONFIG,
.init = ctl_be_block_init,
.shutdown = ctl_be_block_shutdown,
.data_submit = ctl_be_block_submit,
.data_move_done = ctl_be_block_move_done,
.config_read = ctl_be_block_config_read,
.config_write = ctl_be_block_config_write,
.ioctl = ctl_be_block_ioctl,
.lun_info = ctl_be_block_lun_info,
.lun_attr = ctl_be_block_lun_attr
};
MALLOC_DEFINE(M_CTLBLK, "ctlblock", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
static void
ctl_alloc_seg(struct ctl_be_block_softc *softc, struct ctl_sg_entry *sg,
size_t len)
{
-#if (CTLBLK_MAX_SEG > 131072)
- if (len <= 131072)
- sg->addr = uma_zalloc(softc->buf128_zone, M_WAITOK);
- else
-#endif
- sg->addr = uma_zalloc(softc->buf_zone, M_WAITOK);
+ if (len <= CTLBLK_MIN_SEG) {
+ sg->addr = uma_zalloc(softc->bufmin_zone, M_WAITOK);
+ } else {
+ KASSERT(len <= CTLBLK_MAX_SEG,
+ ("Too large alloc %zu > %lu", len, CTLBLK_MAX_SEG));
+ sg->addr = uma_zalloc(softc->bufmax_zone, M_WAITOK);
+ }
sg->len = len;
}
static void
ctl_free_seg(struct ctl_be_block_softc *softc, struct ctl_sg_entry *sg)
{
-#if (CTLBLK_MAX_SEG > 131072)
- if (sg->len <= 131072)
- uma_zfree(softc->buf128_zone, sg->addr);
- else
-#endif
- uma_zfree(softc->buf_zone, sg->addr);
+ if (sg->len <= CTLBLK_MIN_SEG) {
+ uma_zfree(softc->bufmin_zone, sg->addr);
+ } else {
+ KASSERT(sg->len <= CTLBLK_MAX_SEG,
+ ("Too large free %zu > %lu", sg->len, CTLBLK_MAX_SEG));
+ uma_zfree(softc->bufmax_zone, sg->addr);
+ }
}
static struct ctl_be_block_io *
ctl_alloc_beio(struct ctl_be_block_softc *softc)
{
struct ctl_be_block_io *beio;
beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
beio->softc = softc;
beio->refcnt = 1;
return (beio);
}
static void
ctl_real_free_beio(struct ctl_be_block_io *beio)
{
struct ctl_be_block_softc *softc = beio->softc;
int i;
for (i = 0; i < beio->num_segs; i++) {
ctl_free_seg(softc, &beio->sg_segs[i]);
/* For compare we had two equal S/G lists. */
if (beio->two_sglists) {
ctl_free_seg(softc,
&beio->sg_segs[i + CTLBLK_HALF_SEGS]);
}
}
uma_zfree(softc->beio_zone, beio);
}
static void
ctl_refcnt_beio(void *arg, int diff)
{
struct ctl_be_block_io *beio = arg;
if (atomic_fetchadd_int(&beio->refcnt, diff) + diff == 0)
ctl_real_free_beio(beio);
}
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
ctl_refcnt_beio(beio, -1);
}
static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
union ctl_io *io = beio->io;
if (beio->beio_cont != NULL) {
beio->beio_cont(beio);
} else {
ctl_free_beio(beio);
ctl_data_submit_done(io);
}
}
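/*
 * cmp() returns the number of leading bytes at which the two buffers match
 * (equal to size when they are identical).  ctl_be_block_compare() below
 * uses that count to compute the byte offset of the first miscompare, which
 * is reported in the INFORMATION field of the MISCOMPARE sense data.
 */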
static size_t
cmp(uint8_t *a, uint8_t *b, size_t size)
{
size_t i;
for (i = 0; i < size; i++) {
if (a[i] != b[i])
break;
}
return (i);
}
static void
ctl_be_block_compare(union ctl_io *io)
{
struct ctl_be_block_io *beio;
uint64_t off, res;
int i;
uint8_t info[8];
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
off = 0;
for (i = 0; i < beio->num_segs; i++) {
res = cmp(beio->sg_segs[i].addr,
beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
beio->sg_segs[i].len);
off += res;
if (res < beio->sg_segs[i].len)
break;
}
if (i < beio->num_segs) {
scsi_u64to8b(off, info);
ctl_set_sense(&io->scsiio, /*current_error*/ 1,
/*sense_key*/ SSD_KEY_MISCOMPARE,
/*asc*/ 0x1D, /*ascq*/ 0x00,
/*type*/ SSD_ELEM_INFO,
/*size*/ sizeof(info), /*data*/ &info,
/*type*/ SSD_ELEM_NONE);
} else
ctl_set_success(&io->scsiio);
}
static int
ctl_be_block_move_done(union ctl_io *io)
{
struct ctl_be_block_io *beio;
struct ctl_be_block_lun *be_lun;
struct ctl_lba_len_flags *lbalen;
#ifdef CTL_TIME_IO
struct bintime cur_bt;
#endif
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
be_lun = beio->lun;
DPRINTF("entered\n");
#ifdef CTL_TIME_IO
getbinuptime(&cur_bt);
bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
bintime_add(&io->io_hdr.dma_bt, &cur_bt);
#endif
io->io_hdr.num_dmas++;
io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;
/*
* We set status at this point for read commands, and write
* commands with errors.
*/
if (io->io_hdr.flags & CTL_FLAG_ABORT) {
;
} else if ((io->io_hdr.port_status != 0) &&
((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
(io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
/*retry_count*/ io->io_hdr.port_status);
} else if (io->scsiio.kern_data_resid != 0 &&
(io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT &&
((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
(io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
ctl_set_invalid_field_ciu(&io->scsiio);
} else if ((io->io_hdr.port_status == 0) &&
((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
lbalen = ARGS(beio->io);
if (lbalen->flags & CTL_LLF_READ) {
ctl_set_success(&io->scsiio);
} else if (lbalen->flags & CTL_LLF_COMPARE) {
/* We have two data blocks ready for comparison. */
ctl_be_block_compare(io);
}
}
/*
* If this is a read, or a write with errors, it is done.
*/
if ((beio->bio_cmd == BIO_READ)
|| ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
|| ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
ctl_complete_beio(beio);
return (0);
}
/*
* At this point, we have a write and the DMA completed
* successfully. We now have to queue it to the task queue to
* execute the backend I/O. That is because we do blocking
* memory allocations, and in the file backing case, blocking I/O.
* This move done routine is generally called in the SIM's
* interrupt context, and therefore we cannot block.
*/
mtx_lock(&be_lun->queue_lock);
STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
return (0);
}
static void
ctl_be_block_biodone(struct bio *bio)
{
struct ctl_be_block_io *beio;
struct ctl_be_block_lun *be_lun;
union ctl_io *io;
int error;
beio = bio->bio_caller1;
be_lun = beio->lun;
io = beio->io;
DPRINTF("entered\n");
error = bio->bio_error;
mtx_lock(&be_lun->io_lock);
if (error != 0 &&
(beio->first_error == 0 ||
bio->bio_offset < beio->first_error_offset)) {
beio->first_error = error;
beio->first_error_offset = bio->bio_offset;
}
beio->num_bios_done++;
/*
* XXX KDM will this cause WITNESS to complain? Holding a lock
* during the free might cause it to complain.
*/
g_destroy_bio(bio);
/*
* If the send complete bit isn't set, or we aren't the last I/O to
* complete, then we're done.
*/
if ((beio->send_complete == 0)
|| (beio->num_bios_done < beio->num_bios_sent)) {
mtx_unlock(&be_lun->io_lock);
return;
}
/*
* At this point, we've verified that we are the last I/O to
* complete, so it's safe to drop the lock.
*/
devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
beio->ds_tag_type, beio->ds_trans_type,
/*now*/ NULL, /*then*/&beio->ds_t0);
mtx_unlock(&be_lun->io_lock);
/*
* If there are any errors from the backing device, we fail the
* entire I/O with a medium error.
*/
error = beio->first_error;
if (error != 0) {
if (error == EOPNOTSUPP) {
ctl_set_invalid_opcode(&io->scsiio);
} else if (error == ENOSPC || error == EDQUOT) {
ctl_set_space_alloc_fail(&io->scsiio);
} else if (error == EROFS || error == EACCES) {
ctl_set_hw_write_protected(&io->scsiio);
} else if (beio->bio_cmd == BIO_FLUSH) {
/* XXX KDM is there a better error here? */
ctl_set_internal_failure(&io->scsiio,
/*sks_valid*/ 1,
/*retry_count*/ 0xbad2);
} else {
ctl_set_medium_error(&io->scsiio,
beio->bio_cmd == BIO_READ);
}
ctl_complete_beio(beio);
return;
}
/*
* If this is a write, a flush, a delete or verify, we're all done.
* If this is a read, we can now send the data to the user.
*/
if ((beio->bio_cmd == BIO_WRITE)
|| (beio->bio_cmd == BIO_FLUSH)
|| (beio->bio_cmd == BIO_DELETE)
|| (ARGS(io)->flags & CTL_LLF_VERIFY)) {
ctl_set_success(&io->scsiio);
ctl_complete_beio(beio);
} else {
if ((ARGS(io)->flags & CTL_LLF_READ) &&
beio->beio_cont == NULL) {
ctl_set_success(&io->scsiio);
ctl_serseq_done(io);
}
#ifdef CTL_TIME_IO
getbinuptime(&io->io_hdr.dma_start_bt);
#endif
ctl_datamove(io);
}
}
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
union ctl_io *io = beio->io;
struct mount *mountpoint;
int error, lock_flags;
DPRINTF("entered\n");
binuptime(&beio->ds_t0);
devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
if (MNT_SHARED_WRITES(mountpoint) ||
((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
lock_flags = LK_SHARED;
else
lock_flags = LK_EXCLUSIVE;
vn_lock(be_lun->vn, lock_flags | LK_RETRY);
error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
curthread);
VOP_UNLOCK(be_lun->vn);
vn_finished_write(mountpoint);
mtx_lock(&be_lun->io_lock);
devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
beio->ds_tag_type, beio->ds_trans_type,
/*now*/ NULL, /*then*/&beio->ds_t0);
mtx_unlock(&be_lun->io_lock);
if (error == 0)
ctl_set_success(&io->scsiio);
else {
/* XXX KDM is there a better error here? */
ctl_set_internal_failure(&io->scsiio,
/*sks_valid*/ 1,
/*retry_count*/ 0xbad1);
}
ctl_complete_beio(beio);
}
SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, file_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
struct ctl_be_block_filedata *file_data;
union ctl_io *io;
struct uio xuio;
struct iovec *xiovec;
size_t s;
int error, flags, i;
DPRINTF("entered\n");
file_data = &be_lun->backend.file;
io = beio->io;
flags = 0;
if (ARGS(io)->flags & CTL_LLF_DPO)
flags |= IO_DIRECT;
if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
flags |= IO_SYNC;
bzero(&xuio, sizeof(xuio));
if (beio->bio_cmd == BIO_READ) {
SDT_PROBE0(cbb, , read, file_start);
xuio.uio_rw = UIO_READ;
} else {
SDT_PROBE0(cbb, , write, file_start);
xuio.uio_rw = UIO_WRITE;
}
xuio.uio_offset = beio->io_offset;
xuio.uio_resid = beio->io_len;
xuio.uio_segflg = UIO_SYSSPACE;
xuio.uio_iov = beio->xiovecs;
xuio.uio_iovcnt = beio->num_segs;
xuio.uio_td = curthread;
for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
xiovec->iov_base = beio->sg_segs[i].addr;
xiovec->iov_len = beio->sg_segs[i].len;
}
binuptime(&beio->ds_t0);
devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
if (beio->bio_cmd == BIO_READ) {
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
/*
* UFS pays attention to IO_DIRECT for reads. If the
* DIRECTIO option is configured into the kernel, it calls
* ffs_rawread(). But that only works for single-segment
* uios with user space addresses. In our case, with a
* kernel uio, it still reads into the buffer cache, but it
* will just try to release the buffer from the cache later
* on in ffs_read().
*
* ZFS does not pay attention to IO_DIRECT for reads.
*
* UFS does not pay attention to IO_SYNC for reads.
*
* ZFS pays attention to IO_SYNC (which translates into the
* Solaris define FRSYNC for zfs_read()) for reads. It
* attempts to sync the file before reading.
*/
error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);
VOP_UNLOCK(be_lun->vn);
SDT_PROBE0(cbb, , read, file_done);
if (error == 0 && xuio.uio_resid > 0) {
/*
* If we read less than requested (EOF), then
* we should zero the rest of the buffer.
*/
s = beio->io_len - xuio.uio_resid;
for (i = 0; i < beio->num_segs; i++) {
if (s >= beio->sg_segs[i].len) {
s -= beio->sg_segs[i].len;
continue;
}
bzero((uint8_t *)beio->sg_segs[i].addr + s,
beio->sg_segs[i].len - s);
s = 0;
}
}
} else {
struct mount *mountpoint;
int lock_flags;
(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL)
&& MNT_SHARED_WRITES(be_lun->vn->v_mount)))
lock_flags = LK_SHARED;
else
lock_flags = LK_EXCLUSIVE;
vn_lock(be_lun->vn, lock_flags | LK_RETRY);
/*
* UFS pays attention to IO_DIRECT for writes. The write
* is done asynchronously. (Normally the write would just
* get put into the cache.)
*
* UFS pays attention to IO_SYNC for writes. It will
* attempt to write the buffer out synchronously if that
* flag is set.
*
* ZFS does not pay attention to IO_DIRECT for writes.
*
* ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
* for writes. It will flush the transaction from the
* cache before returning.
*/
error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
VOP_UNLOCK(be_lun->vn);
vn_finished_write(mountpoint);
SDT_PROBE0(cbb, , write, file_done);
}
mtx_lock(&be_lun->io_lock);
devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
beio->ds_tag_type, beio->ds_trans_type,
/*now*/ NULL, /*then*/&beio->ds_t0);
mtx_unlock(&be_lun->io_lock);
/*
* If we got an error, set the sense data to "MEDIUM ERROR" and
* return the I/O to the user.
*/
if (error != 0) {
if (error == ENOSPC || error == EDQUOT) {
ctl_set_space_alloc_fail(&io->scsiio);
} else if (error == EROFS || error == EACCES) {
ctl_set_hw_write_protected(&io->scsiio);
} else {
ctl_set_medium_error(&io->scsiio,
beio->bio_cmd == BIO_READ);
}
ctl_complete_beio(beio);
return;
}
/*
* If this is a write or a verify, we're all done.
* If this is a read, we can now send the data to the user.
*/
if ((beio->bio_cmd == BIO_WRITE) ||
(ARGS(io)->flags & CTL_LLF_VERIFY)) {
ctl_set_success(&io->scsiio);
ctl_complete_beio(beio);
} else {
if ((ARGS(io)->flags & CTL_LLF_READ) &&
beio->beio_cont == NULL) {
ctl_set_success(&io->scsiio);
ctl_serseq_done(io);
}
#ifdef CTL_TIME_IO
getbinuptime(&io->io_hdr.dma_start_bt);
#endif
ctl_datamove(io);
}
}
static void
ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
union ctl_io *io = beio->io;
struct ctl_lba_len_flags *lbalen = ARGS(io);
struct scsi_get_lba_status_data *data;
off_t roff, off;
int error, status;
DPRINTF("entered\n");
off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
0, curthread->td_ucred, curthread);
if (error == 0 && off > roff)
status = 0; /* mapped up to off */
else {
error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
0, curthread->td_ucred, curthread);
if (error == 0 && off > roff)
status = 1; /* deallocated up to off */
else {
status = 0; /* unknown up to the end */
off = be_lun->size_bytes;
}
}
VOP_UNLOCK(be_lun->vn);
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
lbalen->lba), data->descr[0].length);
data->descr[0].status = status;
ctl_complete_beio(beio);
}
static uint64_t
ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
{
struct vattr vattr;
struct statfs statfs;
uint64_t val;
int error;
val = UINT64_MAX;
if (be_lun->vn == NULL)
return (val);
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
if (strcmp(attrname, "blocksused") == 0) {
error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
if (error == 0)
val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
}
if (strcmp(attrname, "blocksavail") == 0 &&
!VN_IS_DOOMED(be_lun->vn)) {
error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
if (error == 0)
val = statfs.f_bavail * statfs.f_bsize /
be_lun->cbe_lun.blocksize;
}
VOP_UNLOCK(be_lun->vn);
return (val);
}
static void
ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
union ctl_io *io;
struct cdevsw *csw;
struct cdev *dev;
struct uio xuio;
struct iovec *xiovec;
int error, flags, i, ref;
DPRINTF("entered\n");
io = beio->io;
flags = 0;
if (ARGS(io)->flags & CTL_LLF_DPO)
flags |= IO_DIRECT;
if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
flags |= IO_SYNC;
bzero(&xuio, sizeof(xuio));
if (beio->bio_cmd == BIO_READ) {
SDT_PROBE0(cbb, , read, file_start);
xuio.uio_rw = UIO_READ;
} else {
SDT_PROBE0(cbb, , write, file_start);
xuio.uio_rw = UIO_WRITE;
}
xuio.uio_offset = beio->io_offset;
xuio.uio_resid = beio->io_len;
xuio.uio_segflg = UIO_SYSSPACE;
xuio.uio_iov = beio->xiovecs;
xuio.uio_iovcnt = beio->num_segs;
xuio.uio_td = curthread;
for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
xiovec->iov_base = beio->sg_segs[i].addr;
xiovec->iov_len = beio->sg_segs[i].len;
}
binuptime(&beio->ds_t0);
devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
csw = devvn_refthread(be_lun->vn, &dev, &ref);
if (csw) {
if (beio->bio_cmd == BIO_READ)
error = csw->d_read(dev, &xuio, flags);
else
error = csw->d_write(dev, &xuio, flags);
dev_relthread(dev, ref);
} else
error = ENXIO;
if (beio->bio_cmd == BIO_READ)
SDT_PROBE0(cbb, , read, file_done);
else
SDT_PROBE0(cbb, , write, file_done);
mtx_lock(&be_lun->io_lock);
devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
beio->ds_tag_type, beio->ds_trans_type,
/*now*/ NULL, /*then*/&beio->ds_t0);
mtx_unlock(&be_lun->io_lock);
/*
* If we got an error, set the sense data to "MEDIUM ERROR" and
* return the I/O to the user.
*/
if (error != 0) {
if (error == ENOSPC || error == EDQUOT) {
ctl_set_space_alloc_fail(&io->scsiio);
} else if (error == EROFS || error == EACCES) {
ctl_set_hw_write_protected(&io->scsiio);
} else {
ctl_set_medium_error(&io->scsiio,
beio->bio_cmd == BIO_READ);
}
ctl_complete_beio(beio);
return;
}
/*
* If this is a write or a verify, we're all done.
* If this is a read, we can now send the data to the user.
*/
if ((beio->bio_cmd == BIO_WRITE) ||
(ARGS(io)->flags & CTL_LLF_VERIFY)) {
ctl_set_success(&io->scsiio);
ctl_complete_beio(beio);
} else {
if ((ARGS(io)->flags & CTL_LLF_READ) &&
beio->beio_cont == NULL) {
ctl_set_success(&io->scsiio);
ctl_serseq_done(io);
}
#ifdef CTL_TIME_IO
getbinuptime(&io->io_hdr.dma_start_bt);
#endif
ctl_datamove(io);
}
}
static void
ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
union ctl_io *io = beio->io;
struct cdevsw *csw;
struct cdev *dev;
struct ctl_lba_len_flags *lbalen = ARGS(io);
struct scsi_get_lba_status_data *data;
off_t roff, off;
int error, ref, status;
DPRINTF("entered\n");
csw = devvn_refthread(be_lun->vn, &dev, &ref);
if (csw == NULL) {
status = 0; /* unknown up to the end */
off = be_lun->size_bytes;
goto done;
}
off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
curthread);
if (error == 0 && off > roff)
status = 0; /* mapped up to off */
else {
error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
curthread);
if (error == 0 && off > roff)
status = 1; /* deallocated up to off */
else {
status = 0; /* unknown up to the end */
off = be_lun->size_bytes;
}
}
dev_relthread(dev, ref);
done:
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
lbalen->lba), data->descr[0].length);
data->descr[0].status = status;
ctl_complete_beio(beio);
}
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
struct bio *bio;
struct cdevsw *csw;
struct cdev *dev;
int ref;
DPRINTF("entered\n");
/* This can't fail, it's a blocking allocation. */
bio = g_alloc_bio();
bio->bio_cmd = BIO_FLUSH;
bio->bio_offset = 0;
bio->bio_data = 0;
bio->bio_done = ctl_be_block_biodone;
bio->bio_caller1 = beio;
bio->bio_pblkno = 0;
/*
* We don't need to acquire the LUN lock here, because we are only
* sending one bio, and so there is no other context to synchronize
* with.
*/
beio->num_bios_sent = 1;
beio->send_complete = 1;
binuptime(&beio->ds_t0);
devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
csw = devvn_refthread(be_lun->vn, &dev, &ref);
if (csw) {
bio->bio_dev = dev;
csw->d_strategy(bio);
dev_relthread(dev, ref);
} else {
bio->bio_error = ENXIO;
ctl_be_block_biodone(bio);
}
}
static void
ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio,
uint64_t off, uint64_t len, int last)
{
struct bio *bio;
uint64_t maxlen;
struct cdevsw *csw;
struct cdev *dev;
int ref;
csw = devvn_refthread(be_lun->vn, &dev, &ref);
maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
while (len > 0) {
bio = g_alloc_bio();
bio->bio_cmd = BIO_DELETE;
bio->bio_dev = dev;
bio->bio_offset = off;
bio->bio_length = MIN(len, maxlen);
bio->bio_data = 0;
bio->bio_done = ctl_be_block_biodone;
bio->bio_caller1 = beio;
bio->bio_pblkno = off / be_lun->cbe_lun.blocksize;
off += bio->bio_length;
len -= bio->bio_length;
mtx_lock(&be_lun->io_lock);
beio->num_bios_sent++;
if (last && len == 0)
beio->send_complete = 1;
mtx_unlock(&be_lun->io_lock);
if (csw) {
csw->d_strategy(bio);
} else {
bio->bio_error = ENXIO;
ctl_be_block_biodone(bio);
}
}
if (csw)
dev_relthread(dev, ref);
}
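/*
 * maxlen above is LONG_MAX rounded down to a multiple of the LUN block
 * size, so each BIO_DELETE length stays block-aligned while still fitting
 * in bio_length; ranges larger than that are simply split across several
 * bios, with send_complete set only once the last chunk has been issued.
 */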
static void
ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
union ctl_io *io;
struct ctl_ptr_len_flags *ptrlen;
struct scsi_unmap_desc *buf, *end;
uint64_t len;
io = beio->io;
DPRINTF("entered\n");
binuptime(&beio->ds_t0);
devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
if (beio->io_offset == -1) {
beio->io_len = 0;
ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
buf = (struct scsi_unmap_desc *)ptrlen->ptr;
end = buf + ptrlen->len / sizeof(*buf);
for (; buf < end; buf++) {
len = (uint64_t)scsi_4btoul(buf->length) *
be_lun->cbe_lun.blocksize;
beio->io_len += len;
ctl_be_block_unmap_dev_range(be_lun, beio,
scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
len, (end - buf < 2) ? TRUE : FALSE);
}
} else
ctl_be_block_unmap_dev_range(be_lun, beio,
beio->io_offset, beio->io_len, TRUE);
}
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
struct ctl_be_block_io *beio)
{
TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
struct bio *bio;
struct cdevsw *csw;
struct cdev *dev;
off_t cur_offset;
int i, max_iosize, ref;
DPRINTF("entered\n");
csw = devvn_refthread(be_lun->vn, &dev, &ref);
/*
* We have to limit our I/O size to the maximum supported by the
* backend device.
*/
if (csw) {
max_iosize = dev->si_iosize_max;
if (max_iosize < PAGE_SIZE)
max_iosize = DFLTPHYS;
} else
max_iosize = DFLTPHYS;
cur_offset = beio->io_offset;
for (i = 0; i < beio->num_segs; i++) {
size_t cur_size;
uint8_t *cur_ptr;
cur_size = beio->sg_segs[i].len;
cur_ptr = beio->sg_segs[i].addr;
while (cur_size > 0) {
/* This can't fail, it's a blocking allocation. */
bio = g_alloc_bio();
KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
bio->bio_cmd = beio->bio_cmd;
bio->bio_dev = dev;
bio->bio_caller1 = beio;
bio->bio_length = min(cur_size, max_iosize);
bio->bio_offset = cur_offset;
bio->bio_data = cur_ptr;
bio->bio_done = ctl_be_block_biodone;
bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;
cur_offset += bio->bio_length;
cur_ptr += bio->bio_length;
cur_size -= bio->bio_length;
TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
beio->num_bios_sent++;
}
}
beio->send_complete = 1;
binuptime(&beio->ds_t0);
devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
/*
* Fire off all allocated requests!
*/
while ((bio = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bio, bio_queue);
if (csw)
csw->d_strategy(bio);
else {
bio->bio_error = ENXIO;
ctl_be_block_biodone(bio);
}
}
if (csw)
dev_relthread(dev, ref);
}
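/*
 * Each S/G segment above is carved into bios of at most max_iosize bytes.
 * For example, a 512 KiB segment sent to a device advertising an
 * si_iosize_max of 128 KiB would be issued as four bios; devices reporting
 * an implausibly small limit (below PAGE_SIZE) fall back to DFLTPHYS.
 */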
static uint64_t
ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
{
struct diocgattr_arg arg;
struct cdevsw *csw;
struct cdev *dev;
int error, ref;
csw = devvn_refthread(be_lun->vn, &dev, &ref);
if (csw == NULL)
return (UINT64_MAX);
strlcpy(arg.name, attrname, sizeof(arg.name));
arg.len = sizeof(arg.value.off);
if (csw->d_ioctl) {
error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
curthread);
} else
error = ENODEV;
dev_relthread(dev, ref);
if (error != 0)
return (UINT64_MAX);
return (arg.value.off);
}
static void
ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
struct ctl_be_block_io *beio;
struct ctl_lba_len_flags *lbalen;
DPRINTF("entered\n");
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
beio->io_len = lbalen->len * cbe_lun->blocksize;
beio->io_offset = lbalen->lba * cbe_lun->blocksize;
beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
beio->bio_cmd = BIO_FLUSH;
beio->ds_trans_type = DEVSTAT_NO_DATA;
DPRINTF("SYNC\n");
be_lun->lun_flush(be_lun, beio);
}
static void
ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
{
union ctl_io *io;
io = beio->io;
ctl_free_beio(beio);
if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
(io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
ctl_config_write_done(io);
return;
}
ctl_be_block_config_write(io);
}
static void
ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_block_softc *softc = be_lun->softc;
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
struct ctl_be_block_io *beio;
struct ctl_lba_len_flags *lbalen;
uint64_t len_left, lba;
uint32_t pb, pbo, adj;
int i, seglen;
uint8_t *buf, *end;
DPRINTF("entered\n");
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
lbalen = ARGS(beio->io);
if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
(lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
ctl_free_beio(beio);
ctl_set_invalid_field(&io->scsiio,
/*sks_valid*/ 1,
/*command*/ 1,
/*field*/ 1,
/*bit_valid*/ 0,
/*bit*/ 0);
ctl_config_write_done(io);
return;
}
if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
beio->io_offset = lbalen->lba * cbe_lun->blocksize;
beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
beio->bio_cmd = BIO_DELETE;
beio->ds_trans_type = DEVSTAT_FREE;
be_lun->unmap(be_lun, beio);
return;
}
beio->bio_cmd = BIO_WRITE;
beio->ds_trans_type = DEVSTAT_WRITE;
DPRINTF("WRITE SAME at LBA %jx len %u\n",
(uintmax_t)lbalen->lba, lbalen->len);
pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
if (be_lun->cbe_lun.pblockoff > 0)
pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
else
pbo = 0;
len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
- for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
+ for (i = 0, lba = 0; i < CTLBLK_NUM_SEGS && len_left > 0; i++) {
/*
* Setup the S/G entry for this chunk.
*/
seglen = MIN(CTLBLK_MAX_SEG, len_left);
if (pb > cbe_lun->blocksize) {
adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
seglen - pbo) % pb;
if (seglen > adj)
seglen -= adj;
else
seglen -= seglen % cbe_lun->blocksize;
} else
seglen -= seglen % cbe_lun->blocksize;
ctl_alloc_seg(softc, &beio->sg_segs[i], seglen);
DPRINTF("segment %d addr %p len %zd\n", i,
beio->sg_segs[i].addr, beio->sg_segs[i].len);
beio->num_segs++;
len_left -= seglen;
buf = beio->sg_segs[i].addr;
end = buf + seglen;
for (; buf < end; buf += cbe_lun->blocksize) {
if (lbalen->flags & SWS_NDOB) {
memset(buf, 0, cbe_lun->blocksize);
} else {
memcpy(buf, io->scsiio.kern_data_ptr,
cbe_lun->blocksize);
}
if (lbalen->flags & SWS_LBDATA)
scsi_ulto4b(lbalen->lba + lba, buf);
lba++;
}
}
beio->io_offset = lbalen->lba * cbe_lun->blocksize;
beio->io_len = lba * cbe_lun->blocksize;
/* We cannot do it all in one run. Correct the residual and schedule a rerun. */
if (len_left > 0) {
lbalen->lba += lba;
lbalen->len -= lba;
beio->beio_cont = ctl_be_block_cw_done_ws;
}
be_lun->dispatch(be_lun, beio);
}
static void
ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_block_io *beio;
struct ctl_ptr_len_flags *ptrlen;
DPRINTF("entered\n");
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
ctl_free_beio(beio);
ctl_set_invalid_field(&io->scsiio,
/*sks_valid*/ 0,
/*command*/ 1,
/*field*/ 0,
/*bit_valid*/ 0,
/*bit*/ 0);
ctl_config_write_done(io);
return;
}
beio->io_len = 0;
beio->io_offset = -1;
beio->bio_cmd = BIO_DELETE;
beio->ds_trans_type = DEVSTAT_FREE;
DPRINTF("UNMAP\n");
be_lun->unmap(be_lun, beio);
}
static void
ctl_be_block_cr_done(struct ctl_be_block_io *beio)
{
union ctl_io *io;
io = beio->io;
ctl_free_beio(beio);
ctl_config_read_done(io);
}
static void
ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_block_io *beio;
struct ctl_be_block_softc *softc;
DPRINTF("entered\n");
softc = be_lun->softc;
beio = ctl_alloc_beio(softc);
beio->io = io;
beio->lun = be_lun;
beio->beio_cont = ctl_be_block_cr_done;
PRIV(io)->ptr = (void *)beio;
switch (io->scsiio.cdb[0]) {
case SERVICE_ACTION_IN: /* GET LBA STATUS */
beio->bio_cmd = -1;
beio->ds_trans_type = DEVSTAT_NO_DATA;
beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
beio->io_len = 0;
if (be_lun->get_lba_status)
be_lun->get_lba_status(be_lun, beio);
else
ctl_be_block_cr_done(beio);
break;
default:
panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
break;
}
}
static void
ctl_be_block_cw_done(struct ctl_be_block_io *beio)
{
union ctl_io *io;
io = beio->io;
ctl_free_beio(beio);
ctl_config_write_done(io);
}
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_block_io *beio;
struct ctl_be_block_softc *softc;
DPRINTF("entered\n");
softc = be_lun->softc;
beio = ctl_alloc_beio(softc);
beio->io = io;
beio->lun = be_lun;
beio->beio_cont = ctl_be_block_cw_done;
switch (io->scsiio.tag_type) {
case CTL_TAG_ORDERED:
beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
break;
case CTL_TAG_HEAD_OF_QUEUE:
beio->ds_tag_type = DEVSTAT_TAG_HEAD;
break;
case CTL_TAG_UNTAGGED:
case CTL_TAG_SIMPLE:
case CTL_TAG_ACA:
default:
beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
break;
}
PRIV(io)->ptr = (void *)beio;
switch (io->scsiio.cdb[0]) {
case SYNCHRONIZE_CACHE:
case SYNCHRONIZE_CACHE_16:
ctl_be_block_cw_dispatch_sync(be_lun, io);
break;
case WRITE_SAME_10:
case WRITE_SAME_16:
ctl_be_block_cw_dispatch_ws(be_lun, io);
break;
case UNMAP:
ctl_be_block_cw_dispatch_unmap(be_lun, io);
break;
default:
panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
break;
}
}
SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t");
static void
ctl_be_block_next(struct ctl_be_block_io *beio)
{
struct ctl_be_block_lun *be_lun;
union ctl_io *io;
io = beio->io;
be_lun = beio->lun;
ctl_free_beio(beio);
if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
(io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
ctl_data_submit_done(io);
return;
}
io->io_hdr.status &= ~CTL_STATUS_MASK;
io->io_hdr.status |= CTL_STATUS_NONE;
mtx_lock(&be_lun->queue_lock);
STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
union ctl_io *io)
{
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
struct ctl_be_block_io *beio;
struct ctl_be_block_softc *softc;
struct ctl_lba_len_flags *lbalen;
struct ctl_ptr_len_flags *bptrlen;
uint64_t len_left, lbas;
int i;
softc = be_lun->softc;
DPRINTF("entered\n");
lbalen = ARGS(io);
if (lbalen->flags & CTL_LLF_WRITE) {
SDT_PROBE0(cbb, , write, start);
} else {
SDT_PROBE0(cbb, , read, start);
}
beio = ctl_alloc_beio(softc);
beio->io = io;
beio->lun = be_lun;
bptrlen = PRIV(io);
bptrlen->ptr = (void *)beio;
switch (io->scsiio.tag_type) {
case CTL_TAG_ORDERED:
beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
break;
case CTL_TAG_HEAD_OF_QUEUE:
beio->ds_tag_type = DEVSTAT_TAG_HEAD;
break;
case CTL_TAG_UNTAGGED:
case CTL_TAG_SIMPLE:
case CTL_TAG_ACA:
default:
beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
break;
}
if (lbalen->flags & CTL_LLF_WRITE) {
beio->bio_cmd = BIO_WRITE;
beio->ds_trans_type = DEVSTAT_WRITE;
} else {
beio->bio_cmd = BIO_READ;
beio->ds_trans_type = DEVSTAT_READ;
}
DPRINTF("%s at LBA %jx len %u @%ju\n",
(beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
(uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
if (lbalen->flags & CTL_LLF_COMPARE) {
beio->two_sglists = 1;
lbas = CTLBLK_HALF_IO_SIZE;
} else {
lbas = CTLBLK_MAX_IO_SIZE;
}
lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
beio->io_len = lbas * cbe_lun->blocksize;
bptrlen->len += lbas;
for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
i, CTLBLK_MAX_SEGS));
/*
		 * Set up the S/G entry for this chunk.
*/
ctl_alloc_seg(softc, &beio->sg_segs[i],
- min(CTLBLK_MAX_SEG, len_left));
+ MIN(CTLBLK_MAX_SEG, len_left));
DPRINTF("segment %d addr %p len %zd\n", i,
beio->sg_segs[i].addr, beio->sg_segs[i].len);
/* Set up second segment for compare operation. */
if (beio->two_sglists) {
ctl_alloc_seg(softc,
&beio->sg_segs[i + CTLBLK_HALF_SEGS],
beio->sg_segs[i].len);
}
beio->num_segs++;
len_left -= beio->sg_segs[i].len;
}
if (bptrlen->len < lbalen->len)
beio->beio_cont = ctl_be_block_next;
io->scsiio.be_move_done = ctl_be_block_move_done;
/* For compare we have separate S/G lists for read and datamove. */
if (beio->two_sglists)
io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
else
io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
io->scsiio.kern_data_len = beio->io_len;
io->scsiio.kern_sg_entries = beio->num_segs;
io->scsiio.kern_data_ref = ctl_refcnt_beio;
io->scsiio.kern_data_arg = beio;
io->io_hdr.flags |= CTL_FLAG_ALLOCATED;
/*
* For the read case, we need to read the data into our buffers and
* then we can send it back to the user. For the write case, we
* need to get the data from the user first.
*/
if (beio->bio_cmd == BIO_READ) {
SDT_PROBE0(cbb, , read, alloc_done);
be_lun->dispatch(be_lun, beio);
} else {
SDT_PROBE0(cbb, , write, alloc_done);
#ifdef CTL_TIME_IO
getbinuptime(&io->io_hdr.dma_start_bt);
#endif
ctl_datamove(io);
}
}
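/*
 * Editor's sketch, not part of the driver: ctl_be_block_dispatch() above
 * issues a request in passes of at most CTLBLK_MAX_IO_SIZE bytes (the
 * smaller CTLBLK_HALF_IO_SIZE when CTL_LLF_COMPARE needs two S/G lists),
 * and ctl_be_block_next() requeues the same I/O until bptrlen->len
 * reaches lbalen->len.  The sizes below are assumptions for illustration.
 */
#if 0	/* editor's illustration only -- never compiled */
static uint64_t
example_lbas_per_pass(void)
{
	uint64_t max_io = 512 * 1024;	/* assumed CTLBLK_MAX_IO_SIZE */
	uint32_t blocksize = 512;	/* assumed LUN block size */

	/* A 1 MiB (2048-LBA) request would take two passes of 1024 LBAs. */
	return (max_io / blocksize);
}
#endif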
static void
ctl_be_block_worker(void *context, int pending)
{
struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
union ctl_io *io;
struct ctl_be_block_io *beio;
DPRINTF("entered\n");
/*
	 * Fetch and process I/Os from all queues.  If we detect the LUN
	 * CTL_LUN_FLAG_NO_MEDIA status here, it is the result of a race,
	 * so make the response maximally opaque to avoid confusing the
	 * initiator.
*/
for (;;) {
mtx_lock(&be_lun->queue_lock);
io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
if (io != NULL) {
DPRINTF("datamove queue\n");
STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
ctl_io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
ctl_set_busy(&io->scsiio);
ctl_complete_beio(beio);
return;
}
be_lun->dispatch(be_lun, beio);
continue;
}
io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
if (io != NULL) {
DPRINTF("config write queue\n");
STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
ctl_io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
ctl_set_busy(&io->scsiio);
ctl_config_write_done(io);
return;
}
ctl_be_block_cw_dispatch(be_lun, io);
continue;
}
io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
if (io != NULL) {
DPRINTF("config read queue\n");
STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
ctl_io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
ctl_set_busy(&io->scsiio);
ctl_config_read_done(io);
return;
}
ctl_be_block_cr_dispatch(be_lun, io);
continue;
}
io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
if (io != NULL) {
DPRINTF("input queue\n");
STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
ctl_io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
ctl_set_busy(&io->scsiio);
ctl_data_submit_done(io);
return;
}
ctl_be_block_dispatch(be_lun, io);
continue;
}
/*
* If we get here, there is no work left in the queues, so
* just break out and let the task queue go to sleep.
*/
mtx_unlock(&be_lun->queue_lock);
break;
}
}
/*
* Entry point from CTL to the backend for I/O. We queue everything to a
* work thread, so this just puts the I/O on a queue and wakes up the
* thread.
*/
static int
ctl_be_block_submit(union ctl_io *io)
{
struct ctl_be_block_lun *be_lun;
DPRINTF("entered\n");
be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
/*
* Make sure we only get SCSI I/O.
*/
KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
"%#x) encountered", io->io_hdr.io_type));
PRIV(io)->len = 0;
mtx_lock(&be_lun->queue_lock);
STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
return (CTL_RETVAL_COMPLETE);
}
static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
int flag, struct thread *td)
{
struct ctl_be_block_softc *softc = &backend_block_softc;
int error;
error = 0;
switch (cmd) {
case CTL_LUN_REQ: {
struct ctl_lun_req *lun_req;
lun_req = (struct ctl_lun_req *)addr;
switch (lun_req->reqtype) {
case CTL_LUNREQ_CREATE:
error = ctl_be_block_create(softc, lun_req);
break;
case CTL_LUNREQ_RM:
error = ctl_be_block_rm(softc, lun_req);
break;
case CTL_LUNREQ_MODIFY:
error = ctl_be_block_modify(softc, lun_req);
break;
default:
lun_req->status = CTL_LUN_ERROR;
snprintf(lun_req->error_str, sizeof(lun_req->error_str),
"invalid LUN request type %d",
lun_req->reqtype);
break;
}
break;
}
default:
error = ENOTTY;
break;
}
return (error);
}
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
struct ctl_be_lun *cbe_lun;
struct ctl_be_block_filedata *file_data;
struct ctl_lun_create_params *params;
const char *value;
struct vattr vattr;
off_t ps, pss, po, pos, us, uss, uo, uos;
int error;
cbe_lun = &be_lun->cbe_lun;
file_data = &be_lun->backend.file;
params = &be_lun->params;
be_lun->dev_type = CTL_BE_BLOCK_FILE;
be_lun->dispatch = ctl_be_block_dispatch_file;
be_lun->lun_flush = ctl_be_block_flush_file;
be_lun->get_lba_status = ctl_be_block_gls_file;
be_lun->getattr = ctl_be_block_getattr_file;
be_lun->unmap = NULL;
cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
if (error != 0) {
snprintf(req->error_str, sizeof(req->error_str),
"error calling VOP_GETATTR() for file %s",
be_lun->dev_path);
return (error);
}
file_data->cred = crhold(curthread->td_ucred);
if (params->lun_size_bytes != 0)
be_lun->size_bytes = params->lun_size_bytes;
else
be_lun->size_bytes = vattr.va_size;
/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If the file's vattr.va_blocksize
	 * (preferred I/O block size) is larger than and a multiple of the
	 * chosen logical block size, report it as the physical block size.
*/
if (params->blocksize_bytes != 0)
cbe_lun->blocksize = params->blocksize_bytes;
else if (cbe_lun->lun_type == T_CDROM)
cbe_lun->blocksize = 2048;
else
cbe_lun->blocksize = 512;
be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
0 : (be_lun->size_blocks - 1);
us = ps = vattr.va_blocksize;
uo = po = 0;
value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
if (value != NULL)
ctl_expand_number(value, &ps);
value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
if (value != NULL)
ctl_expand_number(value, &po);
pss = ps / cbe_lun->blocksize;
pos = po / cbe_lun->blocksize;
if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
cbe_lun->pblockexp = fls(pss) - 1;
cbe_lun->pblockoff = (pss - pos) % pss;
}
value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
if (value != NULL)
ctl_expand_number(value, &us);
value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
if (value != NULL)
ctl_expand_number(value, &uo);
uss = us / cbe_lun->blocksize;
uos = uo / cbe_lun->blocksize;
if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
cbe_lun->ublockexp = fls(uss) - 1;
cbe_lun->ublockoff = (uss - uos) % uss;
}
/*
* Sanity check. The media size has to be at least one
* sector long.
*/
if (be_lun->size_bytes < cbe_lun->blocksize) {
error = EINVAL;
snprintf(req->error_str, sizeof(req->error_str),
"file %s size %ju < block size %u", be_lun->dev_path,
(uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
}
cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
return (error);
}
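/*
 * Editor's sketch, not part of the driver: the pblockexp/pblockoff math
 * above (repeated for devices in ctl_be_block_open_dev() below) reports a
 * physical block size only when it is a power-of-two multiple of the
 * logical block size.  The values below are assumptions for illustration.
 */
#if 0	/* editor's illustration only -- never compiled */
static int
example_pblockexp(void)
{
	off_t ps = 4096, po = 0;	/* assumed "pblocksize"/"pblockoffset" */
	uint32_t blocksize = 512;	/* chosen logical block size */
	off_t pss = ps / blocksize;	/* 8 sectors per physical block */
	off_t pos = po / blocksize;	/* 0 */

	if (pss > 0 && pss * blocksize == ps && pss >= pos &&
	    (pss & (pss - 1)) == 0 && pos * blocksize == po)
		return (fls(pss) - 1);	/* fls(8) - 1 == 3 */
	return (0);
}
#endif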
static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
struct ctl_lun_create_params *params;
struct cdevsw *csw;
struct cdev *dev;
const char *value;
int error, atomic, maxio, ref, unmap, tmp;
off_t ps, pss, po, pos, us, uss, uo, uos, otmp;
params = &be_lun->params;
be_lun->dev_type = CTL_BE_BLOCK_DEV;
csw = devvn_refthread(be_lun->vn, &dev, &ref);
if (csw == NULL)
return (ENXIO);
if (strcmp(csw->d_name, "zvol") == 0) {
be_lun->dispatch = ctl_be_block_dispatch_zvol;
be_lun->get_lba_status = ctl_be_block_gls_zvol;
atomic = maxio = CTLBLK_MAX_IO_SIZE;
} else {
be_lun->dispatch = ctl_be_block_dispatch_dev;
be_lun->get_lba_status = NULL;
atomic = 0;
maxio = dev->si_iosize_max;
if (maxio <= 0)
maxio = DFLTPHYS;
if (maxio > CTLBLK_MAX_SEG)
maxio = CTLBLK_MAX_SEG;
}
be_lun->lun_flush = ctl_be_block_flush_dev;
be_lun->getattr = ctl_be_block_getattr_dev;
be_lun->unmap = ctl_be_block_unmap_dev;
if (!csw->d_ioctl) {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"no d_ioctl for device %s!", be_lun->dev_path);
return (ENODEV);
}
error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
curthread);
if (error) {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"error %d returned for DIOCGSECTORSIZE ioctl "
"on %s!", error, be_lun->dev_path);
return (error);
}
/*
* If the user has asked for a blocksize that is greater than the
* backing device's blocksize, we can do it only if the blocksize
* the user is asking for is an even multiple of the underlying
* device's blocksize.
*/
if ((params->blocksize_bytes != 0) &&
(params->blocksize_bytes >= tmp)) {
if (params->blocksize_bytes % tmp == 0) {
cbe_lun->blocksize = params->blocksize_bytes;
} else {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"requested blocksize %u is not an even "
"multiple of backing device blocksize %u",
params->blocksize_bytes, tmp);
return (EINVAL);
}
} else if (params->blocksize_bytes != 0) {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"requested blocksize %u < backing device "
"blocksize %u", params->blocksize_bytes, tmp);
return (EINVAL);
} else if (cbe_lun->lun_type == T_CDROM)
cbe_lun->blocksize = MAX(tmp, 2048);
else
cbe_lun->blocksize = tmp;
error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
curthread);
if (error) {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"error %d returned for DIOCGMEDIASIZE "
" ioctl on %s!", error,
be_lun->dev_path);
return (error);
}
if (params->lun_size_bytes != 0) {
if (params->lun_size_bytes > otmp) {
dev_relthread(dev, ref);
snprintf(req->error_str, sizeof(req->error_str),
"requested LUN size %ju > backing device "
"size %ju",
(uintmax_t)params->lun_size_bytes,
(uintmax_t)otmp);
return (EINVAL);
}
be_lun->size_bytes = params->lun_size_bytes;
} else
be_lun->size_bytes = otmp;
be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
0 : (be_lun->size_blocks - 1);
error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
curthread);
if (error)
ps = po = 0;
else {
error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
FREAD, curthread);
if (error)
po = 0;
}
us = ps;
uo = po;
value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
if (value != NULL)
ctl_expand_number(value, &ps);
value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
if (value != NULL)
ctl_expand_number(value, &po);
pss = ps / cbe_lun->blocksize;
pos = po / cbe_lun->blocksize;
if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
cbe_lun->pblockexp = fls(pss) - 1;
cbe_lun->pblockoff = (pss - pos) % pss;
}
value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
if (value != NULL)
ctl_expand_number(value, &us);
value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
if (value != NULL)
ctl_expand_number(value, &uo);
uss = us / cbe_lun->blocksize;
uos = uo / cbe_lun->blocksize;
if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
cbe_lun->ublockexp = fls(uss) - 1;
cbe_lun->ublockoff = (uss - uos) % uss;
}
cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;
if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
unmap = 1;
} else {
struct diocgattr_arg arg;
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
arg.len = sizeof(arg.value.i);
error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
curthread);
unmap = (error == 0) ? arg.value.i : 0;
}
value = dnvlist_get_string(cbe_lun->options, "unmap", NULL);
if (value != NULL)
unmap = (strcmp(value, "on") == 0);
if (unmap)
cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
else
cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
dev_relthread(dev, ref);
return (0);
}
static int
ctl_be_block_close(struct ctl_be_block_lun *be_lun)
{
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
int flags;
if (be_lun->vn) {
flags = FREAD;
if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
flags |= FWRITE;
(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
be_lun->vn = NULL;
switch (be_lun->dev_type) {
case CTL_BE_BLOCK_DEV:
break;
case CTL_BE_BLOCK_FILE:
if (be_lun->backend.file.cred != NULL) {
crfree(be_lun->backend.file.cred);
be_lun->backend.file.cred = NULL;
}
break;
case CTL_BE_BLOCK_NONE:
break;
default:
panic("Unexpected backend type %d", be_lun->dev_type);
break;
}
be_lun->dev_type = CTL_BE_BLOCK_NONE;
}
return (0);
}
static int
ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
struct nameidata nd;
const char *value;
int error, flags;
error = 0;
if (rootvnode == NULL) {
snprintf(req->error_str, sizeof(req->error_str),
"Root filesystem is not mounted");
return (1);
}
pwd_ensure_dirs();
value = dnvlist_get_string(cbe_lun->options, "file", NULL);
if (value == NULL) {
snprintf(req->error_str, sizeof(req->error_str),
"no file argument specified");
return (1);
}
free(be_lun->dev_path, M_CTLBLK);
be_lun->dev_path = strdup(value, M_CTLBLK);
flags = FREAD;
value = dnvlist_get_string(cbe_lun->options, "readonly", NULL);
if (value != NULL) {
if (strcmp(value, "on") != 0)
flags |= FWRITE;
} else if (cbe_lun->lun_type == T_DIRECT)
flags |= FWRITE;
again:
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
error = vn_open(&nd, &flags, 0, NULL);
if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
flags &= ~FWRITE;
goto again;
}
if (error) {
/*
		 * This is the only reasonable guess we can make for the
		 * path if the user doesn't give us a fully qualified path.
* If they want to specify a file, they need to specify the
* full path.
*/
if (be_lun->dev_path[0] != '/') {
char *dev_name;
asprintf(&dev_name, M_CTLBLK, "/dev/%s",
be_lun->dev_path);
free(be_lun->dev_path, M_CTLBLK);
be_lun->dev_path = dev_name;
goto again;
}
snprintf(req->error_str, sizeof(req->error_str),
"error opening %s: %d", be_lun->dev_path, error);
return (error);
}
if (flags & FWRITE)
cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
else
cbe_lun->flags |= CTL_LUN_FLAG_READONLY;
NDFREE(&nd, NDF_ONLY_PNBUF);
be_lun->vn = nd.ni_vp;
/* We only support disks and files. */
if (vn_isdisk_error(be_lun->vn, &error)) {
error = ctl_be_block_open_dev(be_lun, req);
} else if (be_lun->vn->v_type == VREG) {
error = ctl_be_block_open_file(be_lun, req);
} else {
error = EINVAL;
snprintf(req->error_str, sizeof(req->error_str),
"%s is not a disk or plain file", be_lun->dev_path);
}
VOP_UNLOCK(be_lun->vn);
if (error != 0)
ctl_be_block_close(be_lun);
cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
if (be_lun->dispatch != ctl_be_block_dispatch_dev)
cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
value = dnvlist_get_string(cbe_lun->options, "serseq", NULL);
if (value != NULL && strcmp(value, "on") == 0)
cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
else if (value != NULL && strcmp(value, "read") == 0)
cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
else if (value != NULL && strcmp(value, "off") == 0)
cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
return (0);
}
static int
ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
struct ctl_be_lun *cbe_lun;
struct ctl_be_block_lun *be_lun;
struct ctl_lun_create_params *params;
char tmpstr[32];
const char *value;
int retval, num_threads;
int tmp_num_threads;
params = &req->reqdata.create;
retval = 0;
req->status = CTL_LUN_OK;
be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
cbe_lun = &be_lun->cbe_lun;
be_lun->params = req->reqdata.create;
be_lun->softc = softc;
STAILQ_INIT(&be_lun->input_queue);
STAILQ_INIT(&be_lun->config_read_queue);
STAILQ_INIT(&be_lun->config_write_queue);
STAILQ_INIT(&be_lun->datamove_queue);
mtx_init(&be_lun->io_lock, "ctlblock io", NULL, MTX_DEF);
mtx_init(&be_lun->queue_lock, "ctlblock queue", NULL, MTX_DEF);
cbe_lun->options = nvlist_clone(req->args_nvl);
if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
cbe_lun->lun_type = params->device_type;
else
cbe_lun->lun_type = T_DIRECT;
be_lun->flags = 0;
cbe_lun->flags = 0;
value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
if (value != NULL) {
if (strcmp(value, "primary") == 0)
cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
if (cbe_lun->lun_type == T_DIRECT ||
cbe_lun->lun_type == T_CDROM) {
be_lun->size_bytes = params->lun_size_bytes;
if (params->blocksize_bytes != 0)
cbe_lun->blocksize = params->blocksize_bytes;
else if (cbe_lun->lun_type == T_CDROM)
cbe_lun->blocksize = 2048;
else
cbe_lun->blocksize = 512;
be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
0 : (be_lun->size_blocks - 1);
if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
retval = ctl_be_block_open(be_lun, req);
if (retval != 0) {
retval = 0;
req->status = CTL_LUN_WARNING;
}
}
num_threads = cbb_num_threads;
} else {
num_threads = 1;
}
value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL);
if (value != NULL) {
tmp_num_threads = strtol(value, NULL, 0);
/*
* We don't let the user specify less than one
* thread, but hope he's clueful enough not to
* specify 1000 threads.
*/
if (tmp_num_threads < 1) {
snprintf(req->error_str, sizeof(req->error_str),
"invalid number of threads %s",
			    value);
goto bailout_error;
}
num_threads = tmp_num_threads;
}
if (be_lun->vn == NULL)
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
/* Tell the user the blocksize we ended up using */
params->lun_size_bytes = be_lun->size_bytes;
params->blocksize_bytes = cbe_lun->blocksize;
if (params->flags & CTL_LUN_FLAG_ID_REQ) {
cbe_lun->req_lun_id = params->req_lun_id;
cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
} else
cbe_lun->req_lun_id = 0;
cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
cbe_lun->be = &ctl_be_block_driver;
if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d",
softc->num_luns);
strncpy((char *)cbe_lun->serial_num, tmpstr,
MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
/* Tell the user what we used for a serial number */
strncpy((char *)params->serial_num, tmpstr,
MIN(sizeof(params->serial_num), sizeof(tmpstr)));
} else {
strncpy((char *)cbe_lun->serial_num, params->serial_num,
MIN(sizeof(cbe_lun->serial_num),
sizeof(params->serial_num)));
}
if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d", softc->num_luns);
strncpy((char *)cbe_lun->device_id, tmpstr,
MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
/* Tell the user what we used for a device ID */
strncpy((char *)params->device_id, tmpstr,
MIN(sizeof(params->device_id), sizeof(tmpstr)));
} else {
strncpy((char *)cbe_lun->device_id, params->device_id,
MIN(sizeof(cbe_lun->device_id),
sizeof(params->device_id)));
}
TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
be_lun->io_taskqueue = taskqueue_create("ctlblocktq", M_WAITOK,
taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
if (be_lun->io_taskqueue == NULL) {
snprintf(req->error_str, sizeof(req->error_str),
"unable to create taskqueue");
goto bailout_error;
}
/*
* Note that we start the same number of threads by default for
* both the file case and the block device case. For the file
* case, we need multiple threads to allow concurrency, because the
* vnode interface is designed to be a blocking interface. For the
* block device case, ZFS zvols at least will block the caller's
* context in many instances, and so we need multiple threads to
* overcome that problem. Other block devices don't need as many
* threads, but they shouldn't cause too many problems.
*
* If the user wants to just have a single thread for a block
* device, he can specify that when the LUN is created, or change
* the tunable/sysctl to alter the default number of threads.
*/
retval = taskqueue_start_threads_in_proc(&be_lun->io_taskqueue,
/*num threads*/num_threads,
/*priority*/PUSER,
/*proc*/control_softc->ctl_proc,
/*thread name*/"block");
if (retval != 0)
goto bailout_error;
be_lun->num_threads = num_threads;
retval = ctl_add_lun(&be_lun->cbe_lun);
if (retval != 0) {
snprintf(req->error_str, sizeof(req->error_str),
"ctl_add_lun() returned error %d, see dmesg for "
"details", retval);
retval = 0;
goto bailout_error;
}
be_lun->disk_stats = devstat_new_entry("cbb", cbe_lun->lun_id,
cbe_lun->blocksize,
DEVSTAT_ALL_SUPPORTED,
cbe_lun->lun_type
| DEVSTAT_TYPE_IF_OTHER,
DEVSTAT_PRIORITY_OTHER);
mtx_lock(&softc->lock);
softc->num_luns++;
SLIST_INSERT_HEAD(&softc->lun_list, be_lun, links);
mtx_unlock(&softc->lock);
params->req_lun_id = cbe_lun->lun_id;
return (retval);
bailout_error:
req->status = CTL_LUN_ERROR;
if (be_lun->io_taskqueue != NULL)
taskqueue_free(be_lun->io_taskqueue);
ctl_be_block_close(be_lun);
if (be_lun->dev_path != NULL)
free(be_lun->dev_path, M_CTLBLK);
nvlist_destroy(cbe_lun->options);
mtx_destroy(&be_lun->queue_lock);
mtx_destroy(&be_lun->io_lock);
free(be_lun, M_CTLBLK);
return (retval);
}
static int
ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
struct ctl_lun_rm_params *params;
struct ctl_be_block_lun *be_lun;
struct ctl_be_lun *cbe_lun;
int retval;
params = &req->reqdata.rm;
sx_xlock(&softc->modify_lock);
mtx_lock(&softc->lock);
SLIST_FOREACH(be_lun, &softc->lun_list, links) {
if (be_lun->cbe_lun.lun_id == params->lun_id) {
SLIST_REMOVE(&softc->lun_list, be_lun,
ctl_be_block_lun, links);
softc->num_luns--;
break;
}
}
mtx_unlock(&softc->lock);
sx_xunlock(&softc->modify_lock);
if (be_lun == NULL) {
snprintf(req->error_str, sizeof(req->error_str),
"LUN %u is not managed by the block backend",
params->lun_id);
goto bailout_error;
}
cbe_lun = &be_lun->cbe_lun;
if (be_lun->vn != NULL) {
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_no_media(cbe_lun);
taskqueue_drain_all(be_lun->io_taskqueue);
ctl_be_block_close(be_lun);
}
mtx_lock(&softc->lock);
be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
mtx_unlock(&softc->lock);
retval = ctl_remove_lun(cbe_lun);
if (retval != 0) {
snprintf(req->error_str, sizeof(req->error_str),
"error %d returned from ctl_remove_lun() for "
"LUN %d", retval, params->lun_id);
mtx_lock(&softc->lock);
be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
mtx_unlock(&softc->lock);
goto bailout_error;
}
mtx_lock(&softc->lock);
while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblockrm", 0);
if (retval == EINTR)
break;
}
be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
if (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
mtx_unlock(&softc->lock);
free(be_lun, M_CTLBLK);
} else {
mtx_unlock(&softc->lock);
return (EINTR);
}
req->status = CTL_LUN_OK;
return (0);
bailout_error:
req->status = CTL_LUN_ERROR;
return (0);
}
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
struct ctl_lun_modify_params *params;
struct ctl_be_block_lun *be_lun;
struct ctl_be_lun *cbe_lun;
const char *value;
uint64_t oldsize;
int error, wasprim;
params = &req->reqdata.modify;
sx_xlock(&softc->modify_lock);
mtx_lock(&softc->lock);
SLIST_FOREACH(be_lun, &softc->lun_list, links) {
if (be_lun->cbe_lun.lun_id == params->lun_id)
break;
}
mtx_unlock(&softc->lock);
if (be_lun == NULL) {
snprintf(req->error_str, sizeof(req->error_str),
"LUN %u is not managed by the block backend",
params->lun_id);
goto bailout_error;
}
cbe_lun = &be_lun->cbe_lun;
if (params->lun_size_bytes != 0)
be_lun->params.lun_size_bytes = params->lun_size_bytes;
if (req->args_nvl != NULL) {
nvlist_destroy(cbe_lun->options);
cbe_lun->options = nvlist_clone(req->args_nvl);
}
wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
if (value != NULL) {
if (strcmp(value, "primary") == 0)
cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
else
cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
else
cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
ctl_lun_primary(cbe_lun);
else
ctl_lun_secondary(cbe_lun);
}
oldsize = be_lun->size_blocks;
if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
if (be_lun->vn == NULL)
error = ctl_be_block_open(be_lun, req);
else if (vn_isdisk_error(be_lun->vn, &error))
error = ctl_be_block_open_dev(be_lun, req);
else if (be_lun->vn->v_type == VREG) {
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
error = ctl_be_block_open_file(be_lun, req);
VOP_UNLOCK(be_lun->vn);
} else
error = EINVAL;
if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) &&
be_lun->vn != NULL) {
cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_has_media(cbe_lun);
} else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 &&
be_lun->vn == NULL) {
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_no_media(cbe_lun);
}
cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
} else {
if (be_lun->vn != NULL) {
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_no_media(cbe_lun);
taskqueue_drain_all(be_lun->io_taskqueue);
error = ctl_be_block_close(be_lun);
} else
error = 0;
}
if (be_lun->size_blocks != oldsize)
ctl_lun_capacity_changed(cbe_lun);
/* Tell the user the exact size we ended up using */
params->lun_size_bytes = be_lun->size_bytes;
sx_xunlock(&softc->modify_lock);
req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
return (0);
bailout_error:
sx_xunlock(&softc->modify_lock);
req->status = CTL_LUN_ERROR;
return (0);
}
static void
ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun)
{
struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)cbe_lun;
struct ctl_be_block_softc *softc = be_lun->softc;
taskqueue_drain_all(be_lun->io_taskqueue);
taskqueue_free(be_lun->io_taskqueue);
if (be_lun->disk_stats != NULL)
devstat_remove_entry(be_lun->disk_stats);
nvlist_destroy(be_lun->cbe_lun.options);
free(be_lun->dev_path, M_CTLBLK);
mtx_destroy(&be_lun->queue_lock);
mtx_destroy(&be_lun->io_lock);
mtx_lock(&softc->lock);
be_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
if (be_lun->flags & CTL_BE_BLOCK_LUN_WAITING)
wakeup(be_lun);
else
free(be_lun, M_CTLBLK);
mtx_unlock(&softc->lock);
}
static int
ctl_be_block_config_write(union ctl_io *io)
{
struct ctl_be_block_lun *be_lun;
struct ctl_be_lun *cbe_lun;
int retval;
DPRINTF("entered\n");
cbe_lun = CTL_BACKEND_LUN(io);
be_lun = (struct ctl_be_block_lun *)cbe_lun;
retval = 0;
switch (io->scsiio.cdb[0]) {
case SYNCHRONIZE_CACHE:
case SYNCHRONIZE_CACHE_16:
case WRITE_SAME_10:
case WRITE_SAME_16:
case UNMAP:
/*
* The upper level CTL code will filter out any CDBs with
* the immediate bit set and return the proper error.
*
* We don't really need to worry about what LBA range the
* user asked to be synced out. When they issue a sync
* cache command, we'll sync out the whole thing.
*/
mtx_lock(&be_lun->queue_lock);
STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
links);
mtx_unlock(&be_lun->queue_lock);
taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
break;
case START_STOP_UNIT: {
struct scsi_start_stop_unit *cdb;
struct ctl_lun_req req;
cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
if ((cdb->how & SSS_PC_MASK) != 0) {
ctl_set_success(&io->scsiio);
ctl_config_write_done(io);
break;
}
if (cdb->how & SSS_START) {
if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) {
retval = ctl_be_block_open(be_lun, &req);
cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
if (retval == 0) {
cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_has_media(cbe_lun);
} else {
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
ctl_lun_no_media(cbe_lun);
}
}
ctl_start_lun(cbe_lun);
} else {
ctl_stop_lun(cbe_lun);
if (cdb->how & SSS_LOEJ) {
cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
cbe_lun->flags |= CTL_LUN_FLAG_EJECTED;
ctl_lun_ejected(cbe_lun);
if (be_lun->vn != NULL)
ctl_be_block_close(be_lun);
}
}
ctl_set_success(&io->scsiio);
ctl_config_write_done(io);
break;
}
case PREVENT_ALLOW:
ctl_set_success(&io->scsiio);
ctl_config_write_done(io);
break;
default:
ctl_set_invalid_opcode(&io->scsiio);
ctl_config_write_done(io);
retval = CTL_RETVAL_COMPLETE;
break;
}
return (retval);
}
static int
ctl_be_block_config_read(union ctl_io *io)
{
struct ctl_be_block_lun *be_lun;
int retval = 0;
DPRINTF("entered\n");
be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
switch (io->scsiio.cdb[0]) {
case SERVICE_ACTION_IN:
if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
mtx_lock(&be_lun->queue_lock);
STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
&io->io_hdr, links);
mtx_unlock(&be_lun->queue_lock);
taskqueue_enqueue(be_lun->io_taskqueue,
&be_lun->io_task);
retval = CTL_RETVAL_QUEUED;
break;
}
ctl_set_invalid_field(&io->scsiio,
/*sks_valid*/ 1,
/*command*/ 1,
/*field*/ 1,
/*bit_valid*/ 1,
/*bit*/ 4);
ctl_config_read_done(io);
retval = CTL_RETVAL_COMPLETE;
break;
default:
ctl_set_invalid_opcode(&io->scsiio);
ctl_config_read_done(io);
retval = CTL_RETVAL_COMPLETE;
break;
}
return (retval);
}
static int
ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb)
{
struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
int retval;
retval = sbuf_printf(sb, "\t<num_threads>");
if (retval != 0)
goto bailout;
retval = sbuf_printf(sb, "%d", lun->num_threads);
if (retval != 0)
goto bailout;
retval = sbuf_printf(sb, "</num_threads>\n");
bailout:
return (retval);
}
static uint64_t
ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname)
{
struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
if (lun->getattr == NULL)
return (UINT64_MAX);
return (lun->getattr(lun, attrname));
}
static int
ctl_be_block_init(void)
{
struct ctl_be_block_softc *softc = &backend_block_softc;
sx_init(&softc->modify_lock, "ctlblock modify");
mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
softc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- softc->buf_zone = uma_zcreate("ctlblock", CTLBLK_MAX_SEG,
+ softc->bufmin_zone = uma_zcreate("ctlblockmin", CTLBLK_MIN_SEG,
NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
-#if (CTLBLK_MAX_SEG > 131072)
- softc->buf128_zone = uma_zcreate("ctlblock128", 131072,
- NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
-#endif
+ if (CTLBLK_MIN_SEG < CTLBLK_MAX_SEG)
+ softc->bufmax_zone = uma_zcreate("ctlblockmax", CTLBLK_MAX_SEG,
+ NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
SLIST_INIT(&softc->lun_list);
return (0);
}
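/*
 * Editor's sketch, not part of the driver: with the two zones created
 * above, a buffer allocation would presumably come from the
 * CTLBLK_MIN_SEG zone when the requested length fits there and from the
 * CTLBLK_MAX_SEG zone otherwise.  The real allocator is not shown in this
 * hunk; the helper below is hypothetical.
 */
#if 0	/* editor's illustration only -- never compiled */
static void *
example_pick_buf_zone(struct ctl_be_block_softc *softc, size_t len)
{
	if (len <= CTLBLK_MIN_SEG)
		return (uma_zalloc(softc->bufmin_zone, M_WAITOK));
	return (uma_zalloc(softc->bufmax_zone, M_WAITOK));
}
#endif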
static int
ctl_be_block_shutdown(void)
{
struct ctl_be_block_softc *softc = &backend_block_softc;
struct ctl_be_block_lun *lun;
mtx_lock(&softc->lock);
while ((lun = SLIST_FIRST(&softc->lun_list)) != NULL) {
SLIST_REMOVE_HEAD(&softc->lun_list, links);
softc->num_luns--;
/*
* Drop our lock here. Since ctl_remove_lun() can call
* back into us, this could potentially lead to a recursive
* lock of the same mutex, which would cause a hang.
*/
mtx_unlock(&softc->lock);
ctl_remove_lun(&lun->cbe_lun);
mtx_lock(&softc->lock);
}
mtx_unlock(&softc->lock);
- uma_zdestroy(softc->buf_zone);
-#if (CTLBLK_MAX_SEG > 131072)
- uma_zdestroy(softc->buf128_zone);
-#endif
+ uma_zdestroy(softc->bufmin_zone);
+ if (CTLBLK_MIN_SEG < CTLBLK_MAX_SEG)
+ uma_zdestroy(softc->bufmax_zone);
uma_zdestroy(softc->beio_zone);
mtx_destroy(&softc->lock);
sx_destroy(&softc->modify_lock);
return (0);
}
diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c
index 1f6d116510f6..127d1cb48602 100644
--- a/sys/cam/mmc/mmc_da.c
+++ b/sys/cam/mmc/mmc_da.c
@@ -1,2026 +1,2026 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Bernd Walter <tisco@FreeBSD.org> All rights reserved.
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org> All rights reserved.
* Copyright (c) 2015-2017 Ilya Bakulin <kibab@FreeBSD.org> All rights reserved.
* Copyright (c) 2006 M. Warner Losh <imp@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Some code derived from the sys/dev/mmc and sys/cam/ata
* Thanks to Warner Losh <imp@FreeBSD.org>, Alexander Motin <mav@FreeBSD.org>
* Bernd Walter <tisco@FreeBSD.org>, and other authors.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
//#include "opt_sdda.h"
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/endian.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <geom/geom_disk.h>
#include <machine/_inttypes.h> /* for PRIu64 */
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_queue.h>
#include <cam/cam_periph.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_xpt_internal.h>
#include <cam/cam_debug.h>
#include <cam/mmc/mmc_all.h>
#ifdef _KERNEL
typedef enum {
SDDA_FLAG_OPEN = 0x0002,
SDDA_FLAG_DIRTY = 0x0004
} sdda_flags;
typedef enum {
SDDA_STATE_INIT,
SDDA_STATE_INVALID,
SDDA_STATE_NORMAL,
SDDA_STATE_PART_SWITCH,
} sdda_state;
#define SDDA_FMT_BOOT "sdda%dboot"
#define SDDA_FMT_GP "sdda%dgp"
#define SDDA_FMT_RPMB "sdda%drpmb"
#define SDDA_LABEL_ENH "enh"
#define SDDA_PART_NAMELEN (16 + 1)
struct sdda_softc;
struct sdda_part {
struct disk *disk;
struct bio_queue_head bio_queue;
sdda_flags flags;
struct sdda_softc *sc;
u_int cnt;
u_int type;
bool ro;
char name[SDDA_PART_NAMELEN];
};
struct sdda_softc {
int outstanding_cmds; /* Number of active commands */
int refcount; /* Active xpt_action() calls */
sdda_state state;
struct mmc_data *mmcdata;
struct cam_periph *periph;
// sdda_quirks quirks;
struct task start_init_task;
uint32_t raw_csd[4];
uint8_t raw_ext_csd[512]; /* MMC only? */
struct mmc_csd csd;
struct mmc_cid cid;
struct mmc_scr scr;
/* Calculated from CSD */
uint64_t sector_count;
uint64_t mediasize;
/* Calculated from CID */
char card_id_string[64];/* Formatted CID info (serial, MFG, etc) */
char card_sn_string[16];/* Formatted serial # for disk->d_ident */
	/* Determined from CSD and whether the card is high speed */
uint32_t card_f_max;
/* Generic switch timeout */
uint32_t cmd6_time;
uint32_t timings; /* Mask of bus timings supported */
uint32_t vccq_120; /* Mask of bus timings at VCCQ of 1.2 V */
uint32_t vccq_180; /* Mask of bus timings at VCCQ of 1.8 V */
/* MMC partitions support */
struct sdda_part *part[MMC_PART_MAX];
uint8_t part_curr; /* Partition currently switched to */
uint8_t part_requested; /* What partition we're currently switching to */
uint32_t part_time; /* Partition switch timeout [us] */
off_t enh_base; /* Enhanced user data area slice base ... */
off_t enh_size; /* ... and size [bytes] */
int log_count;
struct timeval log_time;
};
static const char *mmc_errmsg[] =
{
"None",
"Timeout",
"Bad CRC",
"Fifo",
"Failed",
"Invalid",
"NO MEMORY"
};
#define ccb_bp ppriv_ptr1
static disk_strategy_t sddastrategy;
static periph_init_t sddainit;
static void sddaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static periph_ctor_t sddaregister;
static periph_dtor_t sddacleanup;
static periph_start_t sddastart;
static periph_oninv_t sddaoninvalidate;
static void sddadone(struct cam_periph *periph,
union ccb *done_ccb);
static int sddaerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int mmc_handle_reply(union ccb *ccb);
static uint16_t get_rca(struct cam_periph *periph);
static void sdda_start_init(void *context, union ccb *start_ccb);
static void sdda_start_init_task(void *context, int pending);
static void sdda_process_mmc_partitions(struct cam_periph *periph, union ccb *start_ccb);
static uint32_t sdda_get_host_caps(struct cam_periph *periph, union ccb *ccb);
static int mmc_select_card(struct cam_periph *periph, union ccb *ccb, uint32_t rca);
static inline uint32_t mmc_get_sector_size(struct cam_periph *periph) {return MMC_SECTOR_SIZE;}
static SYSCTL_NODE(_kern_cam, OID_AUTO, sdda, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Direct Access Disk driver");
static int sdda_mmcsd_compat = 1;
SYSCTL_INT(_kern_cam_sdda, OID_AUTO, mmcsd_compat, CTLFLAG_RDTUN,
&sdda_mmcsd_compat, 1, "Enable creation of mmcsd aliases.");
/* TODO: actually issue GET_TRAN_SETTINGS to get R/O status */
static inline bool sdda_get_read_only(struct cam_periph *periph, union ccb *start_ccb)
{
return (false);
}
static uint32_t mmc_get_spec_vers(struct cam_periph *periph);
static uint64_t mmc_get_media_size(struct cam_periph *periph);
static uint32_t mmc_get_cmd6_timeout(struct cam_periph *periph);
static bool sdda_add_part(struct cam_periph *periph, u_int type,
const char *name, u_int cnt, off_t media_size, bool ro);
static struct periph_driver sddadriver =
{
sddainit, "sdda",
TAILQ_HEAD_INITIALIZER(sddadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(sdda, sddadriver);
static MALLOC_DEFINE(M_SDDA, "sd_da", "sd_da buffers");
static const int exp[8] = {
1, 10, 100, 1000, 10000, 100000, 1000000, 10000000
};
static const int mant[16] = {
0, 10, 12, 13, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80
};
static const int cur_min[8] = {
500, 1000, 5000, 10000, 25000, 35000, 60000, 100000
};
static const int cur_max[8] = {
1000, 5000, 10000, 25000, 35000, 45000, 800000, 200000
};
static uint16_t
get_rca(struct cam_periph *periph) {
return periph->path->device->mmc_ident_data.card_rca;
}
/*
 * Figure out whether CCB execution resulted in an error.
 * Look at both CAM-level errors and MMC protocol errors.
 *
 * The return value is always an MMC error code.
*/
static int
mmc_handle_reply(union ccb *ccb)
{
KASSERT(ccb->ccb_h.func_code == XPT_MMC_IO,
("ccb %p: cannot handle non-XPT_MMC_IO errors, got func_code=%d",
ccb, ccb->ccb_h.func_code));
/* CAM-level error should always correspond to MMC-level error */
if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) &&
(ccb->mmcio.cmd.error != MMC_ERR_NONE))
panic("CCB status is OK but MMC error != MMC_ERR_NONE");
if (ccb->mmcio.cmd.error != MMC_ERR_NONE) {
xpt_print_path(ccb->ccb_h.path);
printf("CMD%d failed, err %d (%s)\n",
ccb->mmcio.cmd.opcode,
ccb->mmcio.cmd.error,
mmc_errmsg[ccb->mmcio.cmd.error]);
}
return (ccb->mmcio.cmd.error);
}
static uint32_t
mmc_get_bits(uint32_t *bits, int bit_len, int start, int size)
{
const int i = (bit_len / 32) - (start / 32) - 1;
const int shift = start & 31;
uint32_t retval = bits[i] >> shift;
if (size + shift > 32)
retval |= bits[i - 1] << (32 - shift);
return (retval & ((1llu << size) - 1));
}
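/*
 * Editor's note: mmc_get_bits() addresses the 128-bit CSD/CID registers
 * by bit position counted from the least significant bit, with raw[0]
 * holding the most significant 32 bits.  A minimal usage sketch,
 * mirroring the decoders below:
 */
#if 0	/* editor's illustration only -- never compiled */
static uint32_t
example_decode_tran_speed(uint32_t *raw_csd)
{
	uint32_t m = mmc_get_bits(raw_csd, 128, 99, 4);	/* mantissa, bits 102:99 */
	uint32_t e = mmc_get_bits(raw_csd, 128, 96, 3);	/* exponent, bits 98:96 */

	return (exp[e] * 10000 * mant[m]);	/* same math as mmc_decode_csd_sd() */
}
#endif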
static void
mmc_decode_csd_sd(uint32_t *raw_csd, struct mmc_csd *csd)
{
int v;
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = v = mmc_get_bits(raw_csd, 128, 126, 2);
if (v == 0) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
} else if (v == 1) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->capacity = ((uint64_t)mmc_get_bits(raw_csd, 128, 48, 22) + 1) *
512 * 1024;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
} else
panic("unknown SD CSD version");
}
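/*
 * Editor's worked example of the two SD capacity formulas decoded above;
 * the field values below are assumptions for illustration.
 */
#if 0	/* editor's illustration only -- never compiled */
static void
example_sd_capacity(void)
{
	/* CSD v2.0 (SDHC/SDXC): (C_SIZE + 1) * 512 KiB; C_SIZE = 8191 -> 4 GiB. */
	uint64_t cap_v2 = (8191ULL + 1) * 512 * 1024;
	/* CSD v1.0: (C_SIZE + 1) << (C_SIZE_MULT + 2), times READ_BL_LEN;
	 * C_SIZE = 4095, C_SIZE_MULT = 7, READ_BL_LEN = 512 -> 1 GiB. */
	uint64_t cap_v1 = ((uint64_t)(4095 + 1) << (7 + 2)) * 512;

	(void)cap_v1;
	(void)cap_v2;
}
#endif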
static void
mmc_decode_csd_mmc(uint32_t *raw_csd, struct mmc_csd *csd)
{
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = mmc_get_bits(raw_csd, 128, 126, 2);
csd->spec_vers = mmc_get_bits(raw_csd, 128, 122, 4);
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
	csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = 0;
csd->erase_sector = (mmc_get_bits(raw_csd, 128, 42, 5) + 1) *
(mmc_get_bits(raw_csd, 128, 37, 5) + 1);
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 5);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
}
static void
mmc_decode_cid_sd(uint32_t *raw_cid, struct mmc_cid *cid)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 16);
for (i = 0; i < 5; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[5] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 56, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 24, 32);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 12, 8) + 2000;
cid->mdt_month = mmc_get_bits(raw_cid, 128, 8, 4);
}
static void
mmc_decode_cid_mmc(uint32_t *raw_cid, struct mmc_cid *cid)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 8);
for (i = 0; i < 6; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[6] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 48, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 16, 32);
cid->mdt_month = mmc_get_bits(raw_cid, 128, 12, 4);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 8, 4) + 1997;
}
static void
mmc_format_card_id_string(struct sdda_softc *sc, struct mmc_params *mmcp)
{
char oidstr[8];
uint8_t c1;
uint8_t c2;
/*
* Format a card ID string for use by the mmcsd driver, it's what
* appears between the <> in the following:
* mmcsd0: 968MB <SD SD01G 8.0 SN 2686905 Mfg 08/2008 by 3 TN> at mmc0
* 22.5MHz/4bit/128-block
*
* Also format just the card serial number, which the mmcsd driver will
* use as the disk->d_ident string.
*
* The card_id_string in mmc_ivars is currently allocated as 64 bytes,
* and our max formatted length is currently 55 bytes if every field
* contains the largest value.
*
* Sometimes the oid is two printable ascii chars; when it's not,
* format it as 0xnnnn instead.
*/
c1 = (sc->cid.oid >> 8) & 0x0ff;
c2 = sc->cid.oid & 0x0ff;
if (c1 > 0x1f && c1 < 0x7f && c2 > 0x1f && c2 < 0x7f)
snprintf(oidstr, sizeof(oidstr), "%c%c", c1, c2);
else
snprintf(oidstr, sizeof(oidstr), "0x%04x", sc->cid.oid);
snprintf(sc->card_sn_string, sizeof(sc->card_sn_string),
"%08X", sc->cid.psn);
snprintf(sc->card_id_string, sizeof(sc->card_id_string),
"%s%s %s %d.%d SN %08X MFG %02d/%04d by %d %s",
mmcp->card_features & CARD_FEATURE_MMC ? "MMC" : "SD",
mmcp->card_features & CARD_FEATURE_SDHC ? "HC" : "",
sc->cid.pnm, sc->cid.prv >> 4, sc->cid.prv & 0x0f,
sc->cid.psn, sc->cid.mdt_month, sc->cid.mdt_year,
sc->cid.mid, oidstr);
}
static int
sddaopen(struct disk *dp)
{
struct sdda_part *part;
struct cam_periph *periph;
struct sdda_softc *softc;
int error;
part = (struct sdda_part *)dp->d_drv1;
softc = part->sc;
periph = softc->periph;
if (cam_periph_acquire(periph) != 0) {
return(ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddaopen\n"));
part->flags |= SDDA_FLAG_OPEN;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
sddaclose(struct disk *dp)
{
struct sdda_part *part;
struct cam_periph *periph;
struct sdda_softc *softc;
part = (struct sdda_part *)dp->d_drv1;
softc = part->sc;
periph = softc->periph;
part->flags &= ~SDDA_FLAG_OPEN;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddaclose\n"));
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "sddaclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
sddaschedule(struct cam_periph *periph)
{
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
struct sdda_part *part;
struct bio *bp;
int i;
/* Check if we have more work to do. */
/* Find partition that has outstanding commands. Prefer current partition. */
bp = bioq_first(&softc->part[softc->part_curr]->bio_queue);
if (bp == NULL) {
for (i = 0; i < MMC_PART_MAX; i++) {
if ((part = softc->part[i]) != NULL &&
(bp = bioq_first(&softc->part[i]->bio_queue)) != NULL)
break;
}
}
if (bp != NULL) {
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
sddastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct sdda_part *part;
struct sdda_softc *softc;
part = (struct sdda_part *)bp->bio_disk->d_drv1;
softc = part->sc;
periph = softc->periph;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddastrategy(%p)\n", bp));
/*
* If the device has been made invalid, error out
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
/*
* Place it in the queue of disk activities for this disk
*/
bioq_disksort(&part->bio_queue, bp);
/*
* Schedule ourselves for performing the work.
*/
sddaschedule(periph);
cam_periph_unlock(periph);
return;
}
static void
sddainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, sddaasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("sdda: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
sddadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
struct sdda_part *part;
part = (struct sdda_part *)dp->d_drv1;
periph = part->sc->periph;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddadiskgonecb\n"));
cam_periph_release(periph);
}
static void
sddaoninvalidate(struct cam_periph *periph)
{
struct sdda_softc *softc;
struct sdda_part *part;
softc = (struct sdda_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddaoninvalidate\n"));
/*
* De-register any async callbacks.
*/
xpt_register_async(0, sddaasync, periph, periph->path);
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("bioq_flush start\n"));
for (int i = 0; i < MMC_PART_MAX; i++) {
if ((part = softc->part[i]) != NULL) {
bioq_flush(&part->bio_queue, NULL, ENXIO);
disk_gone(part->disk);
}
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("bioq_flush end\n"));
}
static void
sddacleanup(struct cam_periph *periph)
{
struct sdda_softc *softc;
struct sdda_part *part;
int i;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddacleanup\n"));
softc = (struct sdda_softc *)periph->softc;
cam_periph_unlock(periph);
for (i = 0; i < MMC_PART_MAX; i++) {
if ((part = softc->part[i]) != NULL) {
disk_destroy(part->disk);
free(part, M_DEVBUF);
softc->part[i] = NULL;
}
}
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
sddaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct ccb_getdev cgd;
struct cam_periph *periph;
struct sdda_softc *softc;
periph = (struct cam_periph *)callback_arg;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("sddaasync(code=%d)\n", code));
switch (code) {
case AC_FOUND_DEVICE:
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> AC_FOUND_DEVICE\n"));
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_MMCSD)
break;
if (!(path->device->mmc_ident_data.card_features & CARD_FEATURE_MEMORY)) {
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("No memory on the card!\n"));
break;
}
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(sddaregister, sddaoninvalidate,
sddacleanup, sddastart,
"sdda", CAM_PERIPH_BIO,
path, sddaasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("sddaasync: Unable to attach to new device "
"due to status 0x%x\n", status);
break;
}
case AC_GETDEV_CHANGED:
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> AC_GETDEV_CHANGED\n"));
softc = (struct sdda_softc *)periph->softc;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
cam_periph_async(periph, code, path, arg);
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
int i;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> AC_ADVINFO_CHANGED\n"));
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct sdda_softc *softc;
struct sdda_part *part;
softc = periph->softc;
for (i = 0; i < MMC_PART_MAX; i++) {
if ((part = softc->part[i]) != NULL) {
disk_attr_changed(part->disk, "GEOM::physpath",
M_NOWAIT);
}
}
}
break;
}
default:
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> default?!\n"));
cam_periph_async(periph, code, path, arg);
break;
}
}
static int
sddagetattr(struct bio *bp)
{
struct cam_periph *periph;
struct sdda_softc *softc;
struct sdda_part *part;
int ret;
part = (struct sdda_part *)bp->bio_disk->d_drv1;
softc = part->sc;
periph = softc->periph;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return (ret);
}
static cam_status
sddaregister(struct cam_periph *periph, void *arg)
{
struct sdda_softc *softc;
struct ccb_getdev *cgd;
union ccb *request_ccb; /* CCB representing the probe request */
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddaregister\n"));
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("sddaregister: no getdev CCB, can't register device\n");
return (CAM_REQ_CMP_ERR);
}
softc = (struct sdda_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("sddaregister: Unable to probe new device. "
"Unable to allocate softc\n");
return (CAM_REQ_CMP_ERR);
}
softc->state = SDDA_STATE_INIT;
softc->mmcdata =
(struct mmc_data *)malloc(sizeof(struct mmc_data), M_DEVBUF, M_NOWAIT|M_ZERO);
if (softc->mmcdata == NULL) {
printf("sddaregister: Unable to probe new device. "
"Unable to allocate mmcdata\n");
free(softc, M_DEVBUF);
return (CAM_REQ_CMP_ERR);
}
periph->softc = softc;
softc->periph = periph;
request_ccb = (union ccb*) arg;
xpt_schedule(periph, CAM_PRIORITY_XPT);
TASK_INIT(&softc->start_init_task, 0, sdda_start_init_task, periph);
taskqueue_enqueue(taskqueue_thread, &softc->start_init_task);
return (CAM_REQ_CMP);
}
static int
mmc_exec_app_cmd(struct cam_periph *periph, union ccb *ccb,
struct mmc_command *cmd) {
int err;
/* Send APP_CMD first */
memset(&ccb->mmcio.cmd, 0, sizeof(struct mmc_command));
memset(&ccb->mmcio.stop, 0, sizeof(struct mmc_command));
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ CAM_DIR_NONE,
/*mmc_opcode*/ MMC_APP_CMD,
/*mmc_arg*/ get_rca(periph) << 16,
/*mmc_flags*/ MMC_RSP_R1 | MMC_CMD_AC,
/*mmc_data*/ NULL,
/*timeout*/ 0);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
if (err != 0)
return (err);
if (!(ccb->mmcio.cmd.resp[0] & R1_APP_CMD))
return (EIO);
/* Now exec actual command */
int flags = 0;
if (cmd->data != NULL) {
ccb->mmcio.cmd.data = cmd->data;
if (cmd->data->flags & MMC_DATA_READ)
flags |= CAM_DIR_IN;
if (cmd->data->flags & MMC_DATA_WRITE)
flags |= CAM_DIR_OUT;
} else flags = CAM_DIR_NONE;
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ flags,
/*mmc_opcode*/ cmd->opcode,
/*mmc_arg*/ cmd->arg,
/*mmc_flags*/ cmd->flags,
/*mmc_data*/ cmd->data,
/*timeout*/ 0);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
if (err != 0)
return (err);
memcpy(cmd->resp, ccb->mmcio.cmd.resp, sizeof(cmd->resp));
cmd->error = ccb->mmcio.cmd.error;
return (0);
}
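/*
 * For reference: per the SD specification, application-specific commands
 * (ACMDs) must be prefixed by CMD55 (APP_CMD) carrying the card's RCA, and
 * the card acknowledges by setting the APP_CMD bit in its R1 response,
 * which is what the check above enforces before the actual ACMD is sent.
 * mmc_app_get_scr() below is a typical caller of this helper.
 */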
static int
mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) {
int err;
struct mmc_command cmd;
struct mmc_data d;
memset(&cmd, 0, sizeof(cmd));
memset(&d, 0, sizeof(d));
memset(rawscr, 0, 8);
cmd.opcode = ACMD_SEND_SCR;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = 0;
d.data = rawscr;
d.len = 8;
d.flags = MMC_DATA_READ;
cmd.data = &d;
err = mmc_exec_app_cmd(periph, ccb, &cmd);
rawscr[0] = be32toh(rawscr[0]);
rawscr[1] = be32toh(rawscr[1]);
return (err);
}
static int
mmc_send_ext_csd(struct cam_periph *periph, union ccb *ccb,
uint8_t *rawextcsd, size_t buf_len) {
int err;
struct mmc_data d;
KASSERT(buf_len == 512, ("Buffer for ext csd must be 512 bytes"));
memset(&d, 0, sizeof(d));
d.data = rawextcsd;
d.len = buf_len;
d.flags = MMC_DATA_READ;
memset(d.data, 0, d.len);
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ CAM_DIR_IN,
/*mmc_opcode*/ MMC_SEND_EXT_CSD,
/*mmc_arg*/ 0,
/*mmc_flags*/ MMC_RSP_R1 | MMC_CMD_ADTC,
/*mmc_data*/ &d,
/*timeout*/ 0);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
return (err);
}
static void
mmc_app_decode_scr(uint32_t *raw_scr, struct mmc_scr *scr)
{
unsigned int scr_struct;
memset(scr, 0, sizeof(*scr));
scr_struct = mmc_get_bits(raw_scr, 64, 60, 4);
if (scr_struct != 0) {
printf("Unrecognised SCR structure version %d\n",
scr_struct);
return;
}
scr->sda_vsn = mmc_get_bits(raw_scr, 64, 56, 4);
scr->bus_widths = mmc_get_bits(raw_scr, 64, 48, 4);
}
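/*
 * For reference: mmc_get_bits(raw_scr, 64, 56, 4) extracts SCR bits [59:56]
 * (SD_SPEC) and the 48/4 extraction yields bits [51:48] (SD_BUS_WIDTHS);
 * the bit positions quoted here come from the SD physical-layer spec, not
 * from this file.
 */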
static inline void
mmc_switch_fill_mmcio(union ccb *ccb,
uint8_t set, uint8_t index, uint8_t value, u_int timeout)
{
int arg = (MMC_SWITCH_FUNC_WR << 24) |
(index << 16) |
(value << 8) |
set;
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ CAM_DIR_NONE,
/*mmc_opcode*/ MMC_SWITCH_FUNC,
/*mmc_arg*/ arg,
/*mmc_flags*/ MMC_RSP_R1B | MMC_CMD_AC,
/*mmc_data*/ NULL,
/*timeout*/ timeout);
}
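/*
 * Illustrative encoding (constant values assumed from the JEDEC eMMC spec,
 * not taken from this file): a "write byte" switch (access mode 0x03) of
 * HS_TIMING (EXT_CSD byte 185) to 1 with command set 0 would yield
 *   arg = (0x03 << 24) | (185 << 16) | (1 << 8) | 0 = 0x03b90100.
 */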
static int
mmc_select_card(struct cam_periph *periph, union ccb *ccb, uint32_t rca)
{
int flags, err;
flags = (rca ? MMC_RSP_R1B : MMC_RSP_NONE) | MMC_CMD_AC;
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ CAM_DIR_IN,
/*mmc_opcode*/ MMC_SELECT_CARD,
/*mmc_arg*/ rca << 16,
/*mmc_flags*/ flags,
/*mmc_data*/ NULL,
/*timeout*/ 0);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
return (err);
}
static int
mmc_switch(struct cam_periph *periph, union ccb *ccb,
uint8_t set, uint8_t index, uint8_t value, u_int timeout)
{
int err;
mmc_switch_fill_mmcio(ccb, set, index, value, timeout);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
return (err);
}
static uint32_t
mmc_get_spec_vers(struct cam_periph *periph) {
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
return (softc->csd.spec_vers);
}
static uint64_t
mmc_get_media_size(struct cam_periph *periph) {
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
return (softc->mediasize);
}
static uint32_t
mmc_get_cmd6_timeout(struct cam_periph *periph)
{
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
if (mmc_get_spec_vers(periph) >= 6)
return (softc->raw_ext_csd[EXT_CSD_GEN_CMD6_TIME] * 10);
return (500 * 1000);
}
static int
mmc_sd_switch(struct cam_periph *periph, union ccb *ccb,
uint8_t mode, uint8_t grp, uint8_t value,
uint8_t *res) {
struct mmc_data mmc_d;
uint32_t arg;
int err;
memset(res, 0, 64);
memset(&mmc_d, 0, sizeof(mmc_d));
mmc_d.len = 64;
mmc_d.data = res;
mmc_d.flags = MMC_DATA_READ;
arg = mode << 31; /* 0 - check, 1 - set */
arg |= 0x00FFFFFF;
arg &= ~(0xF << (grp * 4));
arg |= value << (grp * 4);
cam_fill_mmcio(&ccb->mmcio,
/*retries*/ 0,
/*cbfcnp*/ NULL,
/*flags*/ CAM_DIR_IN,
/*mmc_opcode*/ SD_SWITCH_FUNC,
/*mmc_arg*/ arg,
/*mmc_flags*/ MMC_RSP_R1 | MMC_CMD_ADTC,
/*mmc_data*/ &mmc_d,
/*timeout*/ 0);
cam_periph_runccb(ccb, sddaerror, CAM_FLAG_NONE, /*sense_flags*/0, NULL);
err = mmc_handle_reply(ccb);
return (err);
}
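/*
 * Illustrative encoding (assuming SD_SWITCH_GROUP1 == 0, i.e. the access
 * mode function group): a "set" request selecting high speed (value 1)
 * while leaving all other groups unchanged (0xF) computes
 *   arg = 0x80000000 | 0x00ffffff, nibble 0 cleared and set to 1,
 *       = 0x80fffff1.
 */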
static int
mmc_set_timing(struct cam_periph *periph,
union ccb *ccb,
enum mmc_bus_timing timing)
{
u_char switch_res[64];
int err;
uint8_t value;
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("mmc_set_timing(timing=%d)", timing));
switch (timing) {
case bus_timing_normal:
value = 0;
break;
case bus_timing_hs:
value = 1;
break;
default:
return (MMC_ERR_INVALID);
}
if (mmcp->card_features & CARD_FEATURE_MMC) {
err = mmc_switch(periph, ccb, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_HS_TIMING, value, softc->cmd6_time);
} else {
err = mmc_sd_switch(periph, ccb, SD_SWITCH_MODE_SET, SD_SWITCH_GROUP1, value, switch_res);
}
/* Set high-speed timing on the host */
struct ccb_trans_settings_mmc *cts;
cts = &ccb->cts.proto_specific.mmc;
ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
ccb->ccb_h.flags = CAM_DIR_NONE;
ccb->ccb_h.retry_count = 0;
ccb->ccb_h.timeout = 100;
ccb->ccb_h.cbfcnp = NULL;
cts->ios.timing = timing;
cts->ios_valid = MMC_BT;
xpt_action(ccb);
return (err);
}
static void
sdda_start_init_task(void *context, int pending) {
union ccb *new_ccb;
struct cam_periph *periph;
periph = (struct cam_periph *)context;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n"));
new_ccb = xpt_alloc_ccb();
xpt_setup_ccb(&new_ccb->ccb_h, periph->path,
CAM_PRIORITY_NONE);
cam_periph_lock(periph);
cam_periph_hold(periph, PRIBIO|PCATCH);
sdda_start_init(context, new_ccb);
cam_periph_unhold(periph);
cam_periph_unlock(periph);
xpt_free_ccb(new_ccb);
}
static void
sdda_set_bus_width(struct cam_periph *periph, union ccb *ccb, int width) {
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
int err;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_set_bus_width\n"));
/* First set for the card, then for the host */
if (mmcp->card_features & CARD_FEATURE_MMC) {
uint8_t value;
switch (width) {
case bus_width_1:
value = EXT_CSD_BUS_WIDTH_1;
break;
case bus_width_4:
value = EXT_CSD_BUS_WIDTH_4;
break;
case bus_width_8:
value = EXT_CSD_BUS_WIDTH_8;
break;
default:
panic("Invalid bus width %d", width);
}
err = mmc_switch(periph, ccb, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_BUS_WIDTH, value, softc->cmd6_time);
} else {
/* For SD cards we send ACMD6 with the required bus width in arg */
struct mmc_command cmd;
memset(&cmd, 0, sizeof(struct mmc_command));
cmd.opcode = ACMD_SET_BUS_WIDTH;
cmd.arg = width;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
err = mmc_exec_app_cmd(periph, ccb, &cmd);
}
if (err != MMC_ERR_NONE) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Error %d when setting bus width on the card\n", err));
return;
}
/* Now card is done, set the host to the same width */
struct ccb_trans_settings_mmc *cts;
cts = &ccb->cts.proto_specific.mmc;
ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
ccb->ccb_h.flags = CAM_DIR_NONE;
ccb->ccb_h.retry_count = 0;
ccb->ccb_h.timeout = 100;
ccb->ccb_h.cbfcnp = NULL;
cts->ios.bus_width = width;
cts->ios_valid = MMC_BW;
xpt_action(ccb);
}
static inline const char *
part_type(u_int type)
{
switch (type) {
case EXT_CSD_PART_CONFIG_ACC_RPMB:
return ("RPMB");
case EXT_CSD_PART_CONFIG_ACC_DEFAULT:
return ("default");
case EXT_CSD_PART_CONFIG_ACC_BOOT0:
return ("boot0");
case EXT_CSD_PART_CONFIG_ACC_BOOT1:
return ("boot1");
case EXT_CSD_PART_CONFIG_ACC_GP0:
case EXT_CSD_PART_CONFIG_ACC_GP1:
case EXT_CSD_PART_CONFIG_ACC_GP2:
case EXT_CSD_PART_CONFIG_ACC_GP3:
return ("general purpose");
default:
return ("(unknown type)");
}
}
static inline const char *
bus_width_str(enum mmc_bus_width w)
{
switch (w) {
case bus_width_1:
return ("1-bit");
case bus_width_4:
return ("4-bit");
case bus_width_8:
return ("8-bit");
}
}
static uint32_t
sdda_get_host_caps(struct cam_periph *periph, union ccb *ccb)
{
struct ccb_trans_settings_mmc *cts;
cts = &ccb->cts.proto_specific.mmc;
ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
ccb->ccb_h.flags = CAM_DIR_NONE;
ccb->ccb_h.retry_count = 0;
ccb->ccb_h.timeout = 100;
ccb->ccb_h.cbfcnp = NULL;
xpt_action(ccb);
if (ccb->ccb_h.status != CAM_REQ_CMP)
panic("Cannot get host caps");
return (cts->host_caps);
}
static uint32_t
sdda_get_max_data(struct cam_periph *periph, union ccb *ccb)
{
struct ccb_trans_settings_mmc *cts;
cts = &ccb->cts.proto_specific.mmc;
memset(cts, 0, sizeof(struct ccb_trans_settings_mmc));
ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
ccb->ccb_h.flags = CAM_DIR_NONE;
ccb->ccb_h.retry_count = 0;
ccb->ccb_h.timeout = 100;
ccb->ccb_h.cbfcnp = NULL;
xpt_action(ccb);
if (ccb->ccb_h.status != CAM_REQ_CMP)
panic("Cannot get host max data");
KASSERT(cts->host_max_data != 0, ("host_max_data == 0?!"));
return (cts->host_max_data);
}
static void
sdda_start_init(void *context, union ccb *start_ccb)
{
struct cam_periph *periph = (struct cam_periph *)context;
struct ccb_trans_settings_mmc *cts;
uint32_t host_caps;
uint32_t sec_count;
int err;
int host_f_max;
uint8_t card_type;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init\n"));
/* periph was held for us when this task was enqueued */
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_release(periph);
return;
}
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
//struct ccb_mmcio *mmcio = &start_ccb->mmcio;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
struct cam_ed *device = periph->path->device;
if (mmcp->card_features & CARD_FEATURE_MMC) {
mmc_decode_csd_mmc(mmcp->card_csd, &softc->csd);
mmc_decode_cid_mmc(mmcp->card_cid, &softc->cid);
if (mmc_get_spec_vers(periph) >= 4) {
err = mmc_send_ext_csd(periph, start_ccb,
(uint8_t *)&softc->raw_ext_csd,
sizeof(softc->raw_ext_csd));
if (err != 0) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Cannot read EXT_CSD, err %d", err));
return;
}
}
} else {
mmc_decode_csd_sd(mmcp->card_csd, &softc->csd);
mmc_decode_cid_sd(mmcp->card_cid, &softc->cid);
}
softc->sector_count = softc->csd.capacity / 512;
softc->mediasize = softc->csd.capacity;
softc->cmd6_time = mmc_get_cmd6_timeout(periph);
/* MMC >= 4.x have EXT_CSD that has its own opinion about capacity */
if (mmc_get_spec_vers(periph) >= 4) {
sec_count = softc->raw_ext_csd[EXT_CSD_SEC_CNT] +
(softc->raw_ext_csd[EXT_CSD_SEC_CNT + 1] << 8) +
(softc->raw_ext_csd[EXT_CSD_SEC_CNT + 2] << 16) +
(softc->raw_ext_csd[EXT_CSD_SEC_CNT + 3] << 24);
if (sec_count != 0) {
softc->sector_count = sec_count;
softc->mediasize = softc->sector_count * 512;
/* FIXME: there should be a better name for this option...*/
mmcp->card_features |= CARD_FEATURE_SDHC;
}
}
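/*
 * For reference: SEC_COUNT lives in EXT_CSD bytes 212-215 (little endian).
 * On eMMC devices larger than 2 GB the CSD C_SIZE field saturates, so the
 * EXT_CSD sector count is the authoritative capacity; the byte offsets
 * quoted here come from the JEDEC spec, not from this file.
 */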
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Capacity: %"PRIu64", sectors: %"PRIu64"\n",
softc->mediasize,
softc->sector_count));
mmc_format_card_id_string(softc, mmcp);
/* Update info for CAM */
device->serial_num_len = strlen(softc->card_sn_string);
device->serial_num = (u_int8_t *)malloc((device->serial_num_len + 1),
M_CAMXPT, M_NOWAIT);
strlcpy(device->serial_num, softc->card_sn_string, device->serial_num_len + 1);
device->device_id_len = strlen(softc->card_id_string);
device->device_id = (u_int8_t *)malloc((device->device_id_len + 1),
M_CAMXPT, M_NOWAIT);
strlcpy(device->device_id, softc->card_id_string, device->device_id_len + 1);
strlcpy(mmcp->model, softc->card_id_string, sizeof(mmcp->model));
/* Set the clock frequency that the card can handle */
cts = &start_ccb->cts.proto_specific.mmc;
/* First, get the host's max freq */
start_ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
start_ccb->ccb_h.flags = CAM_DIR_NONE;
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 100;
start_ccb->ccb_h.cbfcnp = NULL;
xpt_action(start_ccb);
if (start_ccb->ccb_h.status != CAM_REQ_CMP)
panic("Cannot get max host freq");
host_f_max = cts->host_f_max;
host_caps = cts->host_caps;
if (cts->ios.bus_width != bus_width_1)
panic("Bus width in ios is not 1-bit");
/* Now check if the card supports High-speed */
softc->card_f_max = softc->csd.tran_speed;
if (host_caps & MMC_CAP_HSPEED) {
/* Find out if the card supports High speed timing */
if (mmcp->card_features & CARD_FEATURE_SD20) {
/* Get and decode SCR */
uint32_t rawscr[2];
uint8_t res[64];
if (mmc_app_get_scr(periph, start_ccb, rawscr)) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Cannot get SCR\n"));
goto finish_hs_tests;
}
mmc_app_decode_scr(rawscr, &softc->scr);
if ((softc->scr.sda_vsn >= 1) && (softc->csd.ccc & (1<<10))) {
mmc_sd_switch(periph, start_ccb, SD_SWITCH_MODE_CHECK,
SD_SWITCH_GROUP1, SD_SWITCH_NOCHANGE, res);
if (res[13] & 2) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Card supports HS\n"));
softc->card_f_max = SD_HS_MAX;
}
/*
* We deselect then reselect the card here. Some cards
* become unselected and timeout with the above two
* commands, although the state tables / diagrams in the
* standard suggest they go back to the transfer state.
* Other cards don't become deselected, and if we
* attempt to blindly re-select them, we get timeout
* errors from some controllers. So we deselect then
* reselect to handle all situations.
*/
mmc_select_card(periph, start_ccb, 0);
mmc_select_card(periph, start_ccb, get_rca(periph));
} else {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Not trying the switch\n"));
goto finish_hs_tests;
}
}
if (mmcp->card_features & CARD_FEATURE_MMC && mmc_get_spec_vers(periph) >= 4) {
card_type = softc->raw_ext_csd[EXT_CSD_CARD_TYPE];
if (card_type & EXT_CSD_CARD_TYPE_HS_52)
softc->card_f_max = MMC_TYPE_HS_52_MAX;
else if (card_type & EXT_CSD_CARD_TYPE_HS_26)
softc->card_f_max = MMC_TYPE_HS_26_MAX;
if ((card_type & EXT_CSD_CARD_TYPE_DDR_52_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&softc->timings, bus_timing_mmc_ddr52);
setbit(&softc->vccq_120, bus_timing_mmc_ddr52);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Card supports DDR52 at 1.2V\n"));
}
if ((card_type & EXT_CSD_CARD_TYPE_DDR_52_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&softc->timings, bus_timing_mmc_ddr52);
setbit(&softc->vccq_180, bus_timing_mmc_ddr52);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Card supports DDR52 at 1.8V\n"));
}
if ((card_type & EXT_CSD_CARD_TYPE_HS200_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&softc->timings, bus_timing_mmc_hs200);
setbit(&softc->vccq_120, bus_timing_mmc_hs200);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Card supports HS200 at 1.2V\n"));
}
if ((card_type & EXT_CSD_CARD_TYPE_HS200_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&softc->timings, bus_timing_mmc_hs200);
setbit(&softc->vccq_180, bus_timing_mmc_hs200);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Card supports HS200 at 1.8V\n"));
}
}
}
int f_max;
finish_hs_tests:
f_max = min(host_f_max, softc->card_f_max);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("Set SD freq to %d MHz (min out of host f=%d MHz and card f=%d MHz)\n", f_max / 1000000, host_f_max / 1000000, softc->card_f_max / 1000000));
/* Enable high-speed timing on the card */
if (f_max > 25000000) {
err = mmc_set_timing(periph, start_ccb, bus_timing_hs);
if (err != MMC_ERR_NONE) {
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("Cannot switch card to high-speed mode"));
f_max = 25000000;
}
}
/* If possible, set lower-level signaling */
enum mmc_bus_timing timing;
/* FIXME: MMCCAM supports max. bus_timing_mmc_ddr52 at the moment. */
for (timing = bus_timing_mmc_ddr52; timing > bus_timing_normal; timing--) {
if (isset(&softc->vccq_120, timing)) {
/* Set VCCQ = 1.2V */
start_ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
start_ccb->ccb_h.flags = CAM_DIR_NONE;
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 100;
start_ccb->ccb_h.cbfcnp = NULL;
cts->ios.vccq = vccq_120;
cts->ios_valid = MMC_VCCQ;
xpt_action(start_ccb);
break;
} else if (isset(&softc->vccq_180, timing)) {
/* Set VCCQ = 1.8V */
start_ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
start_ccb->ccb_h.flags = CAM_DIR_NONE;
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 100;
start_ccb->ccb_h.cbfcnp = NULL;
cts->ios.vccq = vccq_180;
cts->ios_valid = MMC_VCCQ;
xpt_action(start_ccb);
break;
} else {
/* Set VCCQ = 3.3V */
start_ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
start_ccb->ccb_h.flags = CAM_DIR_NONE;
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 100;
start_ccb->ccb_h.cbfcnp = NULL;
cts->ios.vccq = vccq_330;
cts->ios_valid = MMC_VCCQ;
xpt_action(start_ccb);
break;
}
}
/* Set frequency on the controller */
start_ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
start_ccb->ccb_h.flags = CAM_DIR_NONE;
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 100;
start_ccb->ccb_h.cbfcnp = NULL;
cts->ios.clock = f_max;
cts->ios_valid = MMC_CLK;
xpt_action(start_ccb);
/* Set bus width */
enum mmc_bus_width desired_bus_width = bus_width_1;
enum mmc_bus_width max_host_bus_width =
(host_caps & MMC_CAP_8_BIT_DATA ? bus_width_8 :
host_caps & MMC_CAP_4_BIT_DATA ? bus_width_4 : bus_width_1);
enum mmc_bus_width max_card_bus_width = bus_width_1;
if (mmcp->card_features & CARD_FEATURE_SD20 &&
softc->scr.bus_widths & SD_SCR_BUS_WIDTH_4)
max_card_bus_width = bus_width_4;
/*
* Unlike SD, MMC cards don't have any information about supported bus width...
* So we need to perform read/write test to find out the width.
*/
/* TODO: figure out bus width for MMC; use 8-bit for now (to test on BBB) */
if (mmcp->card_features & CARD_FEATURE_MMC)
max_card_bus_width = bus_width_8;
desired_bus_width = min(max_host_bus_width, max_card_bus_width);
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Set bus width to %s (min of host %s and card %s)\n",
bus_width_str(desired_bus_width),
bus_width_str(max_host_bus_width),
bus_width_str(max_card_bus_width)));
sdda_set_bus_width(periph, start_ccb, desired_bus_width);
softc->state = SDDA_STATE_NORMAL;
cam_periph_unhold(periph);
/* MMC partitions support */
if (mmcp->card_features & CARD_FEATURE_MMC && mmc_get_spec_vers(periph) >= 4) {
sdda_process_mmc_partitions(periph, start_ccb);
} else if (mmcp->card_features & CARD_FEATURE_SD20) {
/* For SD[HC] cards, just add one partition that is the whole card */
if (sdda_add_part(periph, 0, "sdda",
periph->unit_number,
mmc_get_media_size(periph),
sdda_get_read_only(periph, start_ccb)) == false)
return;
softc->part_curr = 0;
}
cam_periph_hold(periph, PRIBIO|PCATCH);
xpt_announce_periph(periph, softc->card_id_string);
/*
* Add async callbacks for bus reset and bus device reset calls.
* I don't bother checking if this fails as, in most cases,
* the system will function just fine without them and the only
* alternative would be to not attach the device on failure.
*/
xpt_register_async(AC_LOST_DEVICE | AC_GETDEV_CHANGED |
AC_ADVINFO_CHANGED, sddaasync, periph, periph->path);
}
static bool
sdda_add_part(struct cam_periph *periph, u_int type, const char *name,
u_int cnt, off_t media_size, bool ro)
{
struct sdda_softc *sc = (struct sdda_softc *)periph->softc;
struct sdda_part *part;
struct ccb_pathinq cpi;
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Partition type '%s', size %ju %s\n",
part_type(type),
media_size,
ro ? "(read-only)" : ""));
part = sc->part[type] = malloc(sizeof(*part), M_DEVBUF,
M_NOWAIT | M_ZERO);
if (part == NULL) {
printf("Cannot add partition for sdda\n");
return (false);
}
part->cnt = cnt;
part->type = type;
part->ro = ro;
part->sc = sc;
snprintf(part->name, sizeof(part->name), name, periph->unit_number);
/*
* Due to the nature of the RPMB partition, it doesn't make much sense
* to add it as a disk. It would be more appropriate to create a
* userland tool to operate on the partition or leverage the existing
* tools from sysutils/mmc-utils.
*/
if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
/* TODO: Create device, assign IOCTL handler */
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Don't know what to do with RPMB partitions yet\n"));
return (false);
}
bioq_init(&part->bio_queue);
bzero(&cpi, sizeof(cpi));
xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NONE);
cpi.ccb_h.func_code = XPT_PATH_INQ;
xpt_action((union ccb *)&cpi);
/*
* Register this media as a disk
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
part->disk = disk_alloc();
part->disk->d_rotation_rate = DISK_RR_NON_ROTATING;
part->disk->d_devstat = devstat_new_entry(part->name,
cnt, 512,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
part->disk->d_open = sddaopen;
part->disk->d_close = sddaclose;
part->disk->d_strategy = sddastrategy;
part->disk->d_getattr = sddagetattr;
// sc->disk->d_dump = sddadump;
part->disk->d_gone = sddadiskgonecb;
part->disk->d_name = part->name;
part->disk->d_drv1 = part;
part->disk->d_maxsize =
- MIN(MAXPHYS, sdda_get_max_data(periph,
+ MIN(maxphys, sdda_get_max_data(periph,
(union ccb *)&cpi) * mmc_get_sector_size(periph));
part->disk->d_unit = cnt;
part->disk->d_flags = 0;
strlcpy(part->disk->d_descr, sc->card_id_string,
MIN(sizeof(part->disk->d_descr), sizeof(sc->card_id_string)));
strlcpy(part->disk->d_ident, sc->card_sn_string,
MIN(sizeof(part->disk->d_ident), sizeof(sc->card_sn_string)));
part->disk->d_hba_vendor = cpi.hba_vendor;
part->disk->d_hba_device = cpi.hba_device;
part->disk->d_hba_subvendor = cpi.hba_subvendor;
part->disk->d_hba_subdevice = cpi.hba_subdevice;
snprintf(part->disk->d_attachment, sizeof(part->disk->d_attachment),
"%s%d", cpi.dev_name, cpi.unit_number);
part->disk->d_sectorsize = mmc_get_sector_size(periph);
part->disk->d_mediasize = media_size;
part->disk->d_stripesize = 0;
part->disk->d_fwsectors = 0;
part->disk->d_fwheads = 0;
if (sdda_mmcsd_compat)
disk_add_alias(part->disk, "mmcsd");
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* sddadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (false);
}
disk_create(part->disk, DISK_VERSION);
cam_periph_lock(periph);
cam_periph_unhold(periph);
return (true);
}
/*
* For MMC cards, process EXT_CSD and add partitions that are supported by
* this device.
*/
static void
sdda_process_mmc_partitions(struct cam_periph *periph, union ccb *ccb)
{
struct sdda_softc *sc = (struct sdda_softc *)periph->softc;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
off_t erase_size, sector_size, size, wp_size;
int i;
const uint8_t *ext_csd;
uint8_t rev;
bool comp, ro;
ext_csd = sc->raw_ext_csd;
/*
* Enhanced user data area and general purpose partitions are only
* supported in revision 1.4 (EXT_CSD_REV == 4) and later, the RPMB
* partition in revision 1.5 (MMC v4.41, EXT_CSD_REV == 5) and later.
*/
rev = ext_csd[EXT_CSD_REV];
/*
* Ignore user-creatable enhanced user data area and general purpose
* partitions as long as partitioning hasn't been finished.
*/
comp = (ext_csd[EXT_CSD_PART_SET] & EXT_CSD_PART_SET_COMPLETED) != 0;
/*
* Add enhanced user data area slice, unless it spans the entirety of
* the user data area. The enhanced area is of a multiple of high
* capacity write protect groups ((ERASE_GRP_SIZE * HC_WP_GRP_SIZE) *
* 512 KB) and its offset given in either sectors or bytes, depending
* on whether it's a high capacity device or not.
* NB: The slicer and its slices need to be registered before adding
* the disk for the corresponding user data area as re-tasting is
* racy.
*/
sector_size = mmc_get_sector_size(periph);
size = ext_csd[EXT_CSD_ENH_SIZE_MULT] +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 1] << 8) +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 2] << 16);
if (rev >= 4 && comp == TRUE && size > 0 &&
(ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_ENH_ATTR_EN) != 0 &&
(ext_csd[EXT_CSD_PART_ATTR] & (EXT_CSD_PART_ATTR_ENH_USR)) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
size *= erase_size * wp_size;
if (size != mmc_get_media_size(periph) * sector_size) {
sc->enh_size = size;
sc->enh_base = (ext_csd[EXT_CSD_ENH_START_ADDR] +
(ext_csd[EXT_CSD_ENH_START_ADDR + 1] << 8) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 2] << 16) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 3] << 24)) *
((mmcp->card_features & CARD_FEATURE_SDHC) ? 1: MMC_SECTOR_SIZE);
} else
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("enhanced user data area spans entire device"));
}
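/*
 * Worked example (illustrative values only): with HC_ERASE_GRP_SIZE == 1
 * (512 KB erase groups) and HC_WP_GRP_SIZE == 16, one write protect group
 * is 8 MB, so ENH_SIZE_MULT == 4 describes a 32 MB enhanced area.
 */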
/*
* Add default partition. This may be the only one or the user
* data area in case partitions are supported.
*/
ro = sdda_get_read_only(periph, ccb);
sdda_add_part(periph, EXT_CSD_PART_CONFIG_ACC_DEFAULT, "sdda",
periph->unit_number, mmc_get_media_size(periph), ro);
sc->part_curr = EXT_CSD_PART_CONFIG_ACC_DEFAULT;
if (mmc_get_spec_vers(periph) < 3)
return;
/* Belatedly announce enhanced user data slice. */
if (sc->enh_size != 0) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("enhanced user data area off 0x%jx size %ju bytes\n",
sc->enh_base, sc->enh_size));
}
/*
* Determine partition switch timeout (provided in units of 10 ms)
* and ensure it's at least 300 ms as some eMMC chips lie.
*/
sc->part_time = max(ext_csd[EXT_CSD_PART_SWITCH_TO] * 10 * 1000,
300 * 1000);
/* Add boot partitions, which are of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_BOOT_SIZE_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (size > 0 && (sdda_get_host_caps(periph, ccb) & MMC_CAP_BOOT_NOACC) == 0) {
sdda_add_part(periph, EXT_CSD_PART_CONFIG_ACC_BOOT0,
SDDA_FMT_BOOT, 0, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT0_MASK) != 0));
sdda_add_part(periph, EXT_CSD_PART_CONFIG_ACC_BOOT1,
SDDA_FMT_BOOT, 1, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT1_MASK) != 0));
}
/* Add RPMB partition, which also is of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_RPMB_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (rev >= 5 && size > 0)
sdda_add_part(periph, EXT_CSD_PART_CONFIG_ACC_RPMB,
SDDA_FMT_RPMB, 0, size, ro);
if (rev <= 3 || comp == FALSE)
return;
/*
* Add general purpose partitions, which are of a multiple of high
* capacity write protect groups, too.
*/
if ((ext_csd[EXT_CSD_PART_SUPPORT] & EXT_CSD_PART_SUPPORT_EN) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
for (i = 0; i < MMC_PART_GP_MAX; i++) {
size = ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3] +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 1] << 8) +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 2] << 16);
if (size == 0)
continue;
sdda_add_part(periph, EXT_CSD_PART_CONFIG_ACC_GP0 + i,
SDDA_FMT_GP, i, size * erase_size * wp_size, ro);
}
}
}
/*
* We cannot just call mmc_switch() since it will sleep, and we are in
* GEOM context and cannot sleep. Instead, create an MMCIO request to switch
* partitions and send it to h/w, and upon completion resume processing
* the I/O queue.
* This function cannot fail; instead, check for switch errors in sddadone().
*/
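/*
 * For reference: the PARTITION_ACCESS field occupies the low bits of
 * EXT_CSD_PART_CONFIG, so the computation below keeps the remaining
 * configuration bits and only replaces the access selector.  Assuming the
 * JEDEC encoding (0 = user area, 1/2 = boot, 3 = RPMB, 4-7 = GP1-GP4),
 * switching from the user area to boot0 merely rewrites those low bits.
 */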
static void
sdda_init_switch_part(struct cam_periph *periph, union ccb *start_ccb,
uint8_t part)
{
struct sdda_softc *sc = (struct sdda_softc *)periph->softc;
uint8_t value;
KASSERT(part < MMC_PART_MAX, ("%s: invalid partition index", __func__));
sc->part_requested = part;
value = (sc->raw_ext_csd[EXT_CSD_PART_CONFIG] &
~EXT_CSD_PART_CONFIG_ACC_MASK) | part;
mmc_switch_fill_mmcio(start_ccb, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_PART_CONFIG, value, sc->part_time);
start_ccb->ccb_h.cbfcnp = sddadone;
sc->outstanding_cmds++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
}
/* Called with periph lock held! */
static void
sddastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct bio *bp;
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
struct sdda_part *part;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
uint8_t part_index;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sddastart\n"));
if (softc->state != SDDA_STATE_NORMAL) {
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("device is not in SDDA_STATE_NORMAL yet\n"));
xpt_release_ccb(start_ccb);
return;
}
/* Find partition that has outstanding commands. Prefer current partition. */
part_index = softc->part_curr;
part = softc->part[softc->part_curr];
bp = bioq_first(&part->bio_queue);
if (bp == NULL) {
for (part_index = 0; part_index < MMC_PART_MAX; part_index++) {
if ((part = softc->part[part_index]) != NULL &&
(bp = bioq_first(&softc->part[part_index]->bio_queue)) != NULL)
break;
}
}
if (bp == NULL) {
xpt_release_ccb(start_ccb);
return;
}
if (part_index != softc->part_curr) {
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH,
("Partition %d -> %d\n", softc->part_curr, part_index));
/*
* According to section "6.2.2 Command restrictions" of the eMMC
* specification v5.1, CMD19/CMD21 aren't allowed to be used with
* RPMB partitions. So we pause re-tuning along with triggering
* it up-front to decrease the likelihood of re-tuning becoming
* necessary while accessing an RPMB partition. Consequently, an
* RPMB partition should immediately be switched away from again
* after an access in order to allow for re-tuning to take place
* anew.
*/
/* TODO: pause retune if switching to RPMB partition */
softc->state = SDDA_STATE_PART_SWITCH;
sdda_init_switch_part(periph, start_ccb, part_index);
return;
}
bioq_remove(&part->bio_queue, bp);
switch (bp->bio_cmd) {
case BIO_WRITE:
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("BIO_WRITE\n"));
part->flags |= SDDA_FLAG_DIRTY;
/* FALLTHROUGH */
case BIO_READ:
{
struct ccb_mmcio *mmcio;
uint64_t blockno = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / 512;
uint16_t opcode;
if (bp->bio_cmd == BIO_READ)
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("BIO_READ\n"));
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE,
("Block %"PRIu64" cnt %u\n", blockno, count));
/* Construct new MMC command */
if (bp->bio_cmd == BIO_READ) {
if (count > 1)
opcode = MMC_READ_MULTIPLE_BLOCK;
else
opcode = MMC_READ_SINGLE_BLOCK;
} else {
if (count > 1)
opcode = MMC_WRITE_MULTIPLE_BLOCK;
else
opcode = MMC_WRITE_BLOCK;
}
start_ccb->ccb_h.func_code = XPT_MMC_IO;
start_ccb->ccb_h.flags = (bp->bio_cmd == BIO_READ ? CAM_DIR_IN : CAM_DIR_OUT);
start_ccb->ccb_h.retry_count = 0;
start_ccb->ccb_h.timeout = 15 * 1000;
start_ccb->ccb_h.cbfcnp = sddadone;
mmcio = &start_ccb->mmcio;
mmcio->cmd.opcode = opcode;
mmcio->cmd.arg = blockno;
if (!(mmcp->card_features & CARD_FEATURE_SDHC))
mmcio->cmd.arg <<= 9;
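/*
 * Standard-capacity (non-SDHC) cards are byte addressed, hence the shift
 * by 9 above; e.g. block 2048 becomes byte offset 2048 * 512 = 1 MiB.
 * High-capacity cards take the block number directly.
 */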
mmcio->cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
mmcio->cmd.data = softc->mmcdata;
memset(mmcio->cmd.data, 0, sizeof(struct mmc_data));
mmcio->cmd.data->data = bp->bio_data;
mmcio->cmd.data->len = 512 * count;
mmcio->cmd.data->flags = (bp->bio_cmd == BIO_READ ? MMC_DATA_READ : MMC_DATA_WRITE);
/* Direct h/w to issue CMD12 upon completion */
if (count > 1) {
mmcio->cmd.data->flags |= MMC_DATA_MULTI;
mmcio->stop.opcode = MMC_STOP_TRANSMISSION;
mmcio->stop.flags = MMC_RSP_R1B | MMC_CMD_AC;
mmcio->stop.arg = 0;
}
break;
}
case BIO_FLUSH:
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("BIO_FLUSH\n"));
sddaschedule(periph);
break;
case BIO_DELETE:
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("BIO_DELETE\n"));
sddaschedule(periph);
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
return;
}
start_ccb->ccb_h.ccb_bp = bp;
softc->outstanding_cmds++;
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
/* May have more work to do, so ensure we stay scheduled */
sddaschedule(periph);
}
static void
sddadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct bio *bp;
struct sdda_softc *softc;
struct ccb_mmcio *mmcio;
struct cam_path *path;
uint32_t card_status;
int error = 0;
softc = (struct sdda_softc *)periph->softc;
mmcio = &done_ccb->mmcio;
path = done_ccb->ccb_h.path;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("sddadone\n"));
// cam_periph_lock(periph);
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("Error!!!\n"));
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
error = 5; /* EIO */
} else {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
error = 0;
}
card_status = mmcio->cmd.resp[0];
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("Card status: %08x\n", R1_STATUS(card_status)));
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("Current state: %d\n", R1_CURRENT_STATE(card_status)));
/* Process result of switching MMC partitions */
if (softc->state == SDDA_STATE_PART_SWITCH) {
CAM_DEBUG(path, CAM_DEBUG_TRACE,
("Completing partition switch to %d\n",
softc->part_requested));
softc->outstanding_cmds--;
/* Complete partition switch */
softc->state = SDDA_STATE_NORMAL;
if (error != MMC_ERR_NONE) {
/* TODO: Unpause retune if accessing RPMB */
xpt_release_ccb(done_ccb);
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
softc->raw_ext_csd[EXT_CSD_PART_CONFIG] =
(softc->raw_ext_csd[EXT_CSD_PART_CONFIG] &
~EXT_CSD_PART_CONFIG_ACC_MASK) | softc->part_requested;
/* TODO: Unpause retune if accessing RPMB */
softc->part_curr = softc->part_requested;
xpt_release_ccb(done_ccb);
/* Return to processing BIO requests */
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
bp->bio_error = error;
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
} else {
/* XXX: How many bytes remaining? */
bp->bio_resid = 0;
if (bp->bio_resid > 0)
bp->bio_flags |= BIO_ERROR;
}
softc->outstanding_cmds--;
xpt_release_ccb(done_ccb);
/*
* Release the periph refcount taken in sddastart() for each CCB.
*/
KASSERT(softc->refcount >= 1, ("sddadone softc %p refcount %d", softc, softc->refcount));
softc->refcount--;
biodone(bp);
}
static int
sddaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
#endif /* _KERNEL */
diff --git a/sys/cam/nvme/nvme_da.c b/sys/cam/nvme/nvme_da.c
index 73079f90d781..8e7f7318ce3b 100644
--- a/sys/cam/nvme/nvme_da.c
+++ b/sys/cam/nvme/nvme_da.c
@@ -1,1362 +1,1362 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2015 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Derived from ata_da.c:
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/nvme/nvme_all.h>
typedef enum {
NDA_STATE_NORMAL
} nda_state;
typedef enum {
NDA_FLAG_OPEN = 0x0001,
NDA_FLAG_DIRTY = 0x0002,
NDA_FLAG_SCTX_INIT = 0x0004,
} nda_flags;
#define NDA_FLAG_STRING \
"\020" \
"\001OPEN" \
"\002DIRTY" \
"\003SCTX_INIT"
typedef enum {
NDA_Q_4K = 0x01,
NDA_Q_NONE = 0x00,
} nda_quirks;
#define NDA_Q_BIT_STRING \
"\020" \
"\001Bit 0"
typedef enum {
NDA_CCB_BUFFER_IO = 0x01,
NDA_CCB_DUMP = 0x02,
NDA_CCB_TRIM = 0x03,
NDA_CCB_PASS = 0x04,
NDA_CCB_TYPE_MASK = 0x0F,
} nda_ccb_state;
/* Offsets into our private area for storing information */
#define ccb_state ccb_h.ppriv_field0
#define ccb_bp ccb_h.ppriv_ptr1 /* For NDA_CCB_BUFFER_IO */
#define ccb_trim ccb_h.ppriv_ptr1 /* For NDA_CCB_TRIM */
struct nda_softc {
struct cam_iosched_softc *cam_iosched;
int outstanding_cmds; /* Number of active commands */
int refcount; /* Active xpt_action() calls */
nda_state state;
nda_flags flags;
nda_quirks quirks;
int unmappedio;
quad_t deletes;
uint32_t nsid; /* Namespace ID for this nda device */
struct disk *disk;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
uint64_t trim_count;
uint64_t trim_ranges;
uint64_t trim_lbas;
#ifdef CAM_TEST_FAILURE
int force_read_error;
int force_write_error;
int periodic_read_error;
int periodic_read_count;
#endif
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int timeouts;
u_int errors;
u_int invalidations;
#endif
};
struct nda_trim_request {
struct nvme_dsm_range dsm[NVME_MAX_DSM_TRIM / sizeof(struct nvme_dsm_range)];
TAILQ_HEAD(, bio) bps;
};
_Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0,
"NVME_MAX_DSM_TRIM must be an integral number of ranges");
/* Need quirk table */
static disk_ioctl_t ndaioctl;
static disk_strategy_t ndastrategy;
static dumper_t ndadump;
static periph_init_t ndainit;
static void ndaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void ndasysctlinit(void *context, int pending);
static int ndaflagssysctl(SYSCTL_HANDLER_ARGS);
static periph_ctor_t ndaregister;
static periph_dtor_t ndacleanup;
static periph_start_t ndastart;
static periph_oninv_t ndaoninvalidate;
static void ndadone(struct cam_periph *periph,
union ccb *done_ccb);
static int ndaerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void ndashutdown(void *arg, int howto);
static void ndasuspend(void *arg);
#ifndef NDA_DEFAULT_SEND_ORDERED
#define NDA_DEFAULT_SEND_ORDERED 1
#endif
#ifndef NDA_DEFAULT_TIMEOUT
#define NDA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */
#endif
#ifndef NDA_DEFAULT_RETRY
#define NDA_DEFAULT_RETRY 4
#endif
#ifndef NDA_MAX_TRIM_ENTRIES
#define NDA_MAX_TRIM_ENTRIES (NVME_MAX_DSM_TRIM / sizeof(struct nvme_dsm_range)) /* Number of DSM trims to use, max 256 */
#endif
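/*
 * For reference: assuming NVME_MAX_DSM_TRIM is 4096 bytes and each
 * struct nvme_dsm_range is 16 bytes (attributes, length, starting LBA),
 * this works out to 256 entries, matching the NVMe limit of 256 ranges
 * per Dataset Management command.
 */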
static SYSCTL_NODE(_kern_cam, OID_AUTO, nda, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Direct Access Disk driver");
//static int nda_retry_count = NDA_DEFAULT_RETRY;
static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED;
static int nda_default_timeout = NDA_DEFAULT_TIMEOUT;
static int nda_max_trim_entries = NDA_MAX_TRIM_ENTRIES;
static int nda_enable_biospeedup = 1;
static int nda_nvd_compat = 1;
SYSCTL_INT(_kern_cam_nda, OID_AUTO, max_trim, CTLFLAG_RDTUN,
&nda_max_trim_entries, NDA_MAX_TRIM_ENTRIES,
"Maximum number of BIO_DELETE to send down as a DSM TRIM.");
SYSCTL_INT(_kern_cam_nda, OID_AUTO, enable_biospeedup, CTLFLAG_RDTUN,
&nda_enable_biospeedup, 0, "Enable BIO_SPEEDUP processing.");
SYSCTL_INT(_kern_cam_nda, OID_AUTO, nvd_compat, CTLFLAG_RDTUN,
&nda_nvd_compat, 1, "Enable creation of nvd aliases.");
/*
* All NVMe media is non-rotational, so all nvme device instances
* share this to implement the sysctl.
*/
static int nda_rotating_media = 0;
static struct periph_driver ndadriver =
{
ndainit, "nda",
TAILQ_HEAD_INITIALIZER(ndadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(nda, ndadriver);
static MALLOC_DEFINE(M_NVMEDA, "nvme_da", "nvme_da buffers");
/*
* nice wrappers. Maybe these belong in nvme_all.c instead of
* here, but this is the only place that uses these. Should
* we ever grow another NVME periph, we should move them
* all there wholesale.
*/
static void
nda_nvme_flush(struct nda_softc *softc, struct ccb_nvmeio *nvmeio)
{
cam_fill_nvmeio(nvmeio,
0, /* retries */
ndadone, /* cbfcnp */
CAM_DIR_NONE, /* flags */
NULL, /* data_ptr */
0, /* dxfer_len */
nda_default_timeout * 1000); /* timeout 30s */
nvme_ns_flush_cmd(&nvmeio->cmd, softc->nsid);
}
static void
nda_nvme_trim(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
void *payload, uint32_t num_ranges)
{
cam_fill_nvmeio(nvmeio,
0, /* retries */
ndadone, /* cbfcnp */
CAM_DIR_OUT, /* flags */
payload, /* data_ptr */
num_ranges * sizeof(struct nvme_dsm_range), /* dxfer_len */
nda_default_timeout * 1000); /* timeout 30s */
nvme_ns_trim_cmd(&nvmeio->cmd, softc->nsid, num_ranges);
}
static void
nda_nvme_write(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
void *payload, uint64_t lba, uint32_t len, uint32_t count)
{
cam_fill_nvmeio(nvmeio,
0, /* retries */
ndadone, /* cbfcnp */
CAM_DIR_OUT, /* flags */
payload, /* data_ptr */
len, /* dxfer_len */
nda_default_timeout * 1000); /* timeout 30s */
nvme_ns_write_cmd(&nvmeio->cmd, softc->nsid, lba, count);
}
static void
nda_nvme_rw_bio(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
struct bio *bp, uint32_t rwcmd)
{
int flags = rwcmd == NVME_OPC_READ ? CAM_DIR_IN : CAM_DIR_OUT;
void *payload;
uint64_t lba;
uint32_t count;
if (bp->bio_flags & BIO_UNMAPPED) {
flags |= CAM_DATA_BIO;
payload = bp;
} else {
payload = bp->bio_data;
}
lba = bp->bio_pblkno;
count = bp->bio_bcount / softc->disk->d_sectorsize;
cam_fill_nvmeio(nvmeio,
0, /* retries */
ndadone, /* cbfcnp */
flags, /* flags */
payload, /* data_ptr */
bp->bio_bcount, /* dxfer_len */
nda_default_timeout * 1000); /* timeout 30s */
nvme_ns_rw_cmd(&nvmeio->cmd, rwcmd, softc->nsid, lba, count);
}
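/*
 * Illustrative sizing: bio_bcount is in bytes, so a 128 KiB request on a
 * 512-byte-sector namespace issues a single NVMe read/write with
 * count = 131072 / 512 = 256 blocks starting at bio_pblkno.
 */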
static int
ndaopen(struct disk *dp)
{
struct cam_periph *periph;
struct nda_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != 0) {
return(ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("ndaopen\n"));
softc = (struct nda_softc *)periph->softc;
softc->flags |= NDA_FLAG_OPEN;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
ndaclose(struct disk *dp)
{
struct cam_periph *periph;
struct nda_softc *softc;
union ccb *ccb;
int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct nda_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("ndaclose\n"));
if ((softc->flags & NDA_FLAG_DIRTY) != 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0 &&
cam_periph_hold(periph, PRIBIO) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
nda_nvme_flush(softc, &ccb->nvmeio);
error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0,
/*sense_flags*/0, softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
else
softc->flags &= ~NDA_FLAG_DIRTY;
xpt_release_ccb(ccb);
cam_periph_unhold(periph);
}
softc->flags &= ~NDA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "ndaclose", 1);
KASSERT(softc->outstanding_cmds == 0,
("nda %d outstanding commands", softc->outstanding_cmds));
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
ndaschedule(struct cam_periph *periph)
{
struct nda_softc *softc = (struct nda_softc *)periph->softc;
if (softc->state != NDA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
static int
ndaioctl(struct disk *dp, u_long cmd, void *data, int fflag,
struct thread *td)
{
struct cam_periph *periph;
struct nda_softc *softc;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct nda_softc *)periph->softc;
switch (cmd) {
case NVME_IO_TEST:
case NVME_BIO_TEST:
/*
* These don't map well to the underlying CCBs, so
* they are unsupported via CAM.
*/
return (ENOTTY);
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)data;
struct ccb_pathinq cpi;
xpt_path_inq(&cpi, periph->path);
strncpy(gnsid->cdev, cpi.xport_specific.nvme.dev_name,
sizeof(gnsid->cdev));
gnsid->nsid = cpi.xport_specific.nvme.nsid;
return (0);
}
case NVME_PASSTHROUGH_CMD:
{
struct nvme_pt_command *pt;
union ccb *ccb;
struct cam_periph_map_info mapinfo;
u_int maxmap = dp->d_maxsize;
int error;
/*
* Create a NVME_IO CCB to do the passthrough command.
*/
pt = (struct nvme_pt_command *)data;
ccb = xpt_alloc_ccb();
xpt_setup_ccb(&ccb->ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ccb->ccb_state = NDA_CCB_PASS;
cam_fill_nvmeio(&ccb->nvmeio,
0, /* Retries */
ndadone,
(pt->is_read ? CAM_DIR_IN : CAM_DIR_OUT) | CAM_DATA_VADDR,
pt->buf,
pt->len,
nda_default_timeout * 1000);
memcpy(&ccb->nvmeio.cmd, &pt->cmd, sizeof(pt->cmd));
/*
* Wire the user memory in this request for the I/O
*/
memset(&mapinfo, 0, sizeof(mapinfo));
error = cam_periph_mapmem(ccb, &mapinfo, maxmap);
if (error)
goto out;
/*
* Lock the periph and run the command.
*/
cam_periph_lock(periph);
cam_periph_runccb(ccb, NULL, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT, NULL);
/*
* Tear down mapping and return status.
*/
cam_periph_unlock(periph);
cam_periph_unmapmem(ccb, &mapinfo);
error = (ccb->ccb_h.status == CAM_REQ_CMP) ? 0 : EIO;
out:
cam_periph_lock(periph);
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
return (error);
}
default:
break;
}
return (ENOTTY);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
ndastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct nda_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct nda_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastrategy(%p)\n", bp));
/*
* If the device has been made invalid, error out
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
if (bp->bio_cmd == BIO_DELETE)
softc->deletes++;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
ndaschedule(periph);
cam_periph_unlock(periph);
return;
}
static int
ndadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct nda_softc *softc;
u_int secsize;
struct ccb_nvmeio nvmeio;
struct disk *dp;
uint64_t lba;
uint32_t count;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct nda_softc *)periph->softc;
secsize = softc->disk->d_sectorsize;
lba = offset / secsize;
count = length / secsize;
if ((periph->flags & CAM_PERIPH_INVALID) != 0)
return (ENXIO);
/* xpt_get_ccb returns a zero'd allocation for the ccb, mimic that here */
memset(&nvmeio, 0, sizeof(nvmeio));
if (length > 0) {
xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
nvmeio.ccb_state = NDA_CCB_DUMP;
nda_nvme_write(softc, &nvmeio, virtual, lba, length, count);
error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error %d.\n", error);
return (error);
}
/* Flush */
xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
nvmeio.ccb_state = NDA_CCB_DUMP;
nda_nvme_flush(softc, &nvmeio);
error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "flush cmd failed\n");
return (error);
}
static void
ndainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, ndaasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("nda: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (nda_send_ordered) {
/* Register our event handlers */
if ((EVENTHANDLER_REGISTER(power_suspend, ndasuspend,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("ndainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, ndashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("ndainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
ndadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
ndaoninvalidate(struct cam_periph *periph)
{
struct nda_softc *softc;
softc = (struct nda_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, ndaasync, periph, periph->path);
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
disk_gone(softc->disk);
}
static void
ndacleanup(struct cam_periph *periph)
{
struct nda_softc *softc;
softc = (struct nda_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & NDA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
disk_destroy(softc->disk);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
ndaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_NVME)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(ndaregister, ndaoninvalidate,
ndacleanup, ndastart,
"nda", CAM_PERIPH_BIO,
path, ndaasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("ndaasync: Unable to attach to new device "
"due to status 0x%x\n", status);
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct nda_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_LOST_DEVICE:
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static void
ndasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct nda_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
/* periph was held for us when this task was enqueued */
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_release(periph);
return;
}
softc = (struct nda_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM NDA unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= NDA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_nda), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("ndasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "unmapped_io", CTLFLAG_RD,
&softc->unmappedio, 0, "Unmapped I/O leaf");
SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "deletes", CTLFLAG_RD,
&softc->deletes, "Number of BIO_DELETE requests");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_count", CTLFLAG_RD, &softc->trim_count,
"Total number of unmap/dsm commands sent");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_ranges", CTLFLAG_RD, &softc->trim_ranges,
"Total number of ranges in unmap/dsm commands");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_lbas", CTLFLAG_RD, &softc->trim_lbas,
"Total lbas in the unmap/dsm commands sent");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "rotating", CTLFLAG_RD, &nda_rotating_media, 1,
"Rotating media");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
softc, 0, ndaflagssysctl, "A",
"Flags for drive");
#ifdef CAM_IO_STATS
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Statistics");
if (softc->sysctl_stats_tree == NULL) {
printf("ndasysctlinit: unable to allocate sysctl tree for stats\n");
cam_periph_release(periph);
return;
}
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "timeouts", CTLFLAG_RD,
&softc->timeouts, 0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "errors", CTLFLAG_RD,
&softc->errors, 0,
"Transport errors reported by the SIM.");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "pack_invalidations", CTLFLAG_RD,
&softc->invalidations, 0,
"Device pack invalidations.");
#endif
#ifdef CAM_TEST_FAILURE
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "invalidate", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
periph, 0, cam_periph_invalidate_sysctl, "I",
"Write 1 to invalidate the drive immediately");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
ndaflagssysctl(SYSCTL_HANDLER_ARGS)
{
struct sbuf sbuf;
struct nda_softc *softc = arg1;
int error;
sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
if (softc->flags != 0)
sbuf_printf(&sbuf, "0x%b", (unsigned)softc->flags, NDA_FLAG_STRING);
else
sbuf_printf(&sbuf, "0");
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
static int
ndagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
if (g_handleattr_int(bp, "GEOM::canspeedup", nda_enable_biospeedup))
return (EJUSTRETURN);
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return (ret);
}
static cam_status
ndaregister(struct cam_periph *periph, void *arg)
{
struct nda_softc *softc;
struct disk *disk;
struct ccb_pathinq cpi;
const struct nvme_namespace_data *nsd;
const struct nvme_controller_data *cd;
char announce_buf[80];
uint8_t flbas_fmt, lbads, vwc_present;
u_int maxio;
int quirks;
nsd = nvme_get_identify_ns(periph);
cd = nvme_get_identify_cntrl(periph);
softc = (struct nda_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT | M_ZERO);
if (softc == NULL) {
printf("ndaregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("ndaregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
/* ident_data parsing */
periph->softc = softc;
softc->quirks = NDA_Q_NONE;
xpt_path_inq(&cpi, periph->path);
TASK_INIT(&softc->sysctl_task, 0, ndasysctlinit, periph);
/*
* The namespace ID is the LUN; save it for later I/O
*/
softc->nsid = (uint32_t)xpt_path_lun_id(periph->path);
/*
* Register this media as a disk
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
snprintf(announce_buf, sizeof(announce_buf),
"kern.cam.nda.%d.quirks", periph->unit_number);
quirks = softc->quirks;
TUNABLE_INT_FETCH(announce_buf, &quirks);
softc->quirks = quirks;
cam_iosched_set_sort_queue(softc->cam_iosched, 0);
softc->disk = disk = disk_alloc();
disk->d_rotation_rate = DISK_RR_NON_ROTATING;
disk->d_open = ndaopen;
disk->d_close = ndaclose;
disk->d_strategy = ndastrategy;
disk->d_ioctl = ndaioctl;
disk->d_getattr = ndagetattr;
disk->d_dump = ndadump;
disk->d_gone = ndadiskgonecb;
disk->d_name = "nda";
disk->d_drv1 = periph;
disk->d_unit = periph->unit_number;
maxio = cpi.maxio; /* Honor max I/O size of SIM */
if (maxio == 0)
maxio = DFLTPHYS; /* traditional default */
- else if (maxio > MAXPHYS)
- maxio = MAXPHYS; /* for safety */
+ else if (maxio > maxphys)
+ maxio = maxphys; /* for safety */
disk->d_maxsize = maxio;
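/*
* The formatted LBA size (FLBAS) field selects which LBA format entry
* applies; LBADS in that entry is log2 of the sector size.
*/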
flbas_fmt = (nsd->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
NVME_NS_DATA_FLBAS_FORMAT_MASK;
lbads = (nsd->lbaf[flbas_fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
NVME_NS_DATA_LBAF_LBADS_MASK;
disk->d_sectorsize = 1 << lbads;
disk->d_mediasize = (off_t)(disk->d_sectorsize * nsd->nsze);
disk->d_delmaxsize = disk->d_mediasize;
disk->d_flags = DISKFLAG_DIRECT_COMPLETION;
if (nvme_ctrlr_has_dataset_mgmt(cd))
disk->d_flags |= DISKFLAG_CANDELETE;
vwc_present = (cd->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) &
NVME_CTRLR_DATA_VWC_PRESENT_MASK;
if (vwc_present)
disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
softc->unmappedio = 1;
}
/*
* d_ident and d_descr are both far bigger than the length of either
* the serial or model number strings.
*/
cam_strvis(disk->d_descr, cd->mn,
NVME_MODEL_NUMBER_LENGTH, sizeof(disk->d_descr));
cam_strvis(disk->d_ident, cd->sn,
NVME_SERIAL_NUMBER_LENGTH, sizeof(disk->d_ident));
disk->d_hba_vendor = cpi.hba_vendor;
disk->d_hba_device = cpi.hba_device;
disk->d_hba_subvendor = cpi.hba_subvendor;
disk->d_hba_subdevice = cpi.hba_subdevice;
snprintf(disk->d_attachment, sizeof(disk->d_attachment),
"%s%d", cpi.dev_name, cpi.unit_number);
if (((nsd->nsfeat >> NVME_NS_DATA_NSFEAT_NPVALID_SHIFT) &
NVME_NS_DATA_NSFEAT_NPVALID_MASK) != 0 && nsd->npwg != 0)
disk->d_stripesize = ((nsd->npwg + 1) * disk->d_sectorsize);
else
disk->d_stripesize = nsd->noiob * disk->d_sectorsize;
disk->d_stripeoffset = 0;
disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, disk->d_sectorsize,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
/*
* Add alias for older nvd drives to ease transition.
*/
if (nda_nvd_compat)
disk_add_alias(disk, "nvd");
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* ndadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
cam_periph_unhold(periph);
snprintf(announce_buf, sizeof(announce_buf),
"%juMB (%ju %u byte sectors)",
(uintmax_t)((uintmax_t)disk->d_mediasize / (1024*1024)),
(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
disk->d_sectorsize);
xpt_announce_periph(periph, announce_buf);
xpt_announce_quirks(periph, softc->quirks, NDA_Q_BIT_STRING);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
if (cam_periph_acquire(periph) == 0)
taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
/*
* Register for device going away and info about the drive
* changing (though with NVMe, it can't)
*/
xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
ndaasync, periph, periph->path);
softc->state = NDA_STATE_NORMAL;
return(CAM_REQ_CMP);
}
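/*
* Start routine: pull the next bio from the I/O scheduler and translate
* it into an NVMe read/write, DSM (deallocate), or flush command.
*/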
static void
ndastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct nda_softc *softc = (struct nda_softc *)periph->softc;
struct ccb_nvmeio *nvmeio = &start_ccb->nvmeio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart\n"));
switch (softc->state) {
case NDA_STATE_NORMAL:
{
struct bio *bp;
bp = cam_iosched_next_bio(softc->cam_iosched);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart: bio %p\n", bp));
if (bp == NULL) {
xpt_release_ccb(start_ccb);
break;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
softc->flags |= NDA_FLAG_DIRTY;
/* FALLTHROUGH */
case BIO_READ:
{
#ifdef CAM_TEST_FAILURE
int fail = 0;
/*
* Support the failure ioctls. If the command is a
* read and there are pending forced read errors, or if it is a
* write and there are pending forced write errors, then fail this
* operation with EIO. This is useful for testing
* purposes. Also, support having every Nth read fail.
*
* This is a rather blunt tool.
*/
if (bp->bio_cmd == BIO_READ) {
if (softc->force_read_error) {
softc->force_read_error--;
fail = 1;
}
if (softc->periodic_read_error > 0) {
if (++softc->periodic_read_count >=
softc->periodic_read_error) {
softc->periodic_read_count = 0;
fail = 1;
}
}
} else {
if (softc->force_write_error) {
softc->force_write_error--;
fail = 1;
}
}
if (fail) {
biofinish(bp, NULL, EIO);
xpt_release_ccb(start_ccb);
ndaschedule(periph);
return;
}
#endif
KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
round_page(bp->bio_bcount + bp->bio_ma_offset) /
PAGE_SIZE == bp->bio_ma_n,
("Short bio %p", bp));
nda_nvme_rw_bio(softc, &start_ccb->nvmeio, bp, bp->bio_cmd == BIO_READ ?
NVME_OPC_READ : NVME_OPC_WRITE);
break;
}
case BIO_DELETE:
{
struct nvme_dsm_range *dsm_range, *dsm_end;
struct nda_trim_request *trim;
struct bio *bp1;
int ents;
uint32_t totalcount = 0, ranges = 0;
trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT);
if (trim == NULL) {
biofinish(bp, NULL, ENOMEM);
xpt_release_ccb(start_ccb);
ndaschedule(periph);
return;
}
TAILQ_INIT(&trim->bps);
bp1 = bp;
ents = min(nitems(trim->dsm), nda_max_trim_entries);
ents = max(ents, 1);
dsm_range = trim->dsm;
dsm_end = dsm_range + ents;
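/*
* Gather additional queued deletes from the I/O scheduler, turning each
* bio into one DSM range, until the range table fills up.
*/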
do {
TAILQ_INSERT_TAIL(&trim->bps, bp1, bio_queue);
dsm_range->length =
htole32(bp1->bio_bcount / softc->disk->d_sectorsize);
dsm_range->starting_lba =
htole64(bp1->bio_offset / softc->disk->d_sectorsize);
ranges++;
totalcount += dsm_range->length;
dsm_range++;
if (dsm_range >= dsm_end)
break;
bp1 = cam_iosched_next_trim(softc->cam_iosched);
/* XXX -- Could collapse adjacent ranges, but we don't for now */
/* XXX -- Could limit based on total payload size */
} while (bp1 != NULL);
start_ccb->ccb_trim = trim;
nda_nvme_trim(softc, &start_ccb->nvmeio, trim->dsm,
dsm_range - trim->dsm);
start_ccb->ccb_state = NDA_CCB_TRIM;
softc->trim_count++;
softc->trim_ranges += ranges;
softc->trim_lbas += totalcount;
/*
* Note: We can have multiple TRIMs in flight, so we don't call
* cam_iosched_submit_trim(softc->cam_iosched);
* since that forces the I/O scheduler to only schedule one at a time.
* On NVMe drives, this is a performance disaster.
*/
goto out;
}
case BIO_FLUSH:
nda_nvme_flush(softc, nvmeio);
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
ndaschedule(periph);
return;
}
start_ccb->ccb_state = NDA_CCB_BUFFER_IO;
start_ccb->ccb_bp = bp;
out:
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
softc->outstanding_cmds++;
softc->refcount++; /* For submission only */
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
softc->refcount--; /* Submission done */
/* May have more work to do, so ensure we stay scheduled */
ndaschedule(periph);
break;
}
}
}
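/*
* Completion routine: handle any error, account the I/O with the
* scheduler, and complete the bio (or, for a trim, every bio folded
* into the request).
*/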
static void
ndadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct nda_softc *softc;
struct ccb_nvmeio *nvmeio = &done_ccb->nvmeio;
struct cam_path *path;
int state;
softc = (struct nda_softc *)periph->softc;
path = done_ccb->ccb_h.path;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n"));
state = nvmeio->ccb_state & NDA_CCB_TYPE_MASK;
switch (state) {
case NDA_CCB_BUFFER_IO:
case NDA_CCB_TRIM:
{
int error;
cam_periph_lock(periph);
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = ndaerror(done_ccb, 0, 0);
if (error == ERESTART) {
/* A retry was scheduled, so just return. */
cam_periph_unlock(periph);
return;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
} else {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
error = 0;
}
if (state == NDA_CCB_BUFFER_IO) {
struct bio *bp;
bp = (struct bio *)done_ccb->ccb_bp;
bp->bio_error = error;
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
} else {
bp->bio_resid = 0;
}
softc->outstanding_cmds--;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
ndaschedule(periph);
cam_periph_unlock(periph);
biodone(bp);
} else { /* state == NDA_CCB_TRIM */
struct nda_trim_request *trim;
struct bio *bp1, *bp2;
TAILQ_HEAD(, bio) queue;
trim = nvmeio->ccb_trim;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &trim->bps, bio_queue);
free(trim, M_NVMEDA);
/*
* Since we can have multiple trims in flight, we don't
* need to call this here.
* cam_iosched_trim_done(softc->cam_iosched);
*/
/*
* Tell the I/O scheduler that we're finishing this I/O so it can keep
* its books. For the first bio we pass in the CCB, which carries the
* timing information; for the rest we pass in NULL so the counts stay
* correct.
*/
bp1 = TAILQ_FIRST(&queue);
cam_iosched_bio_complete(softc->cam_iosched, bp1, done_ccb);
xpt_release_ccb(done_ccb);
softc->outstanding_cmds--;
ndaschedule(periph);
cam_periph_unlock(periph);
while ((bp2 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp2, bio_queue);
bp2->bio_error = error;
if (error != 0) {
bp2->bio_flags |= BIO_ERROR;
bp2->bio_resid = bp1->bio_bcount;
} else
bp2->bio_resid = 0;
if (bp1 != bp2)
cam_iosched_bio_complete(softc->cam_iosched, bp2, NULL);
biodone(bp2);
}
}
return;
}
case NDA_CCB_DUMP:
/* No-op. We're polling */
return;
case NDA_CCB_PASS:
/* NVME_PASSTHROUGH_CMD runs this CCB and releases it */
return;
default:
break;
}
xpt_release_ccb(done_ccb);
}
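/*
* Error callback: count timeouts and transport errors for the
* CAM_IO_STATS sysctls, then defer to the generic cam_periph_error()
* handling.
*/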
static int
ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct nda_softc *softc;
struct cam_periph *periph;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct nda_softc *)periph->softc;
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
#ifdef CAM_IO_STATS
softc->timeouts++;
#endif
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
case CAM_ATA_STATUS_ERROR:
#ifdef CAM_IO_STATS
softc->errors++;
#endif
break;
default:
break;
}
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
/*
* Step through all NDA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
ndaflush(void)
{
struct cam_periph *periph;
struct nda_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &ndadriver) {
softc = (struct nda_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/*
* If we panicked with the lock held, or the periph is not
* open, do not recurse. Otherwise, call ndadump(), since that
* avoids the sleep that cam_periph_getccb() does if no CCBs
* are available.
*/
if (!cam_periph_owned(periph) &&
(softc->flags & NDA_FLAG_OPEN)) {
ndadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
/*
* We only sync the cache if the drive is still open
*/
cam_periph_lock(periph);
if ((softc->flags & NDA_FLAG_OPEN) == 0) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
nda_nvme_flush(softc, &ccb->nvmeio);
error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
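/*
* Shutdown and suspend event handlers: both simply flush the caches of
* any drives that are still open.
*/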
static void
ndashutdown(void *arg, int howto)
{
ndaflush();
}
static void
ndasuspend(void *arg)
{
ndaflush();
}
diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c
index 2b4bdc6020fd..e009b0a586c3 100644
--- a/sys/cam/scsi/scsi_cd.c
+++ b/sys/cam/scsi/scsi_cd.c
@@ -1,4248 +1,4248 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Portions of this driver taken from the original FreeBSD cd driver.
* Written by Julian Elischer (julian@tfs.com)
* for TRW Financial Systems for use under the MACH(2.5) operating system.
*
* TRW Financial Systems, in accordance with their agreement with Carnegie
* Mellon University, makes this software available to CMU to distribute
* or use in any manner that they see fit as long as this message is kept with
* the software. For this reason TFS also grants any other persons or
* organisations permission to use or modify this software.
*
* TFS supplies this software to be publicly redistributed
* on the understanding that TFS is not responsible for the correct
* functioning of this software in any circumstances.
*
* Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
*
* from: cd.c,v 1.83 1997/05/04 15:24:22 joerg Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_cd.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <sys/cdio.h>
#include <sys/cdrio.h>
#include <sys/dvdio.h>
#include <sys/devicestat.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/taskqueue.h>
#include <geom/geom_disk.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_sim.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_da.h>
#include <cam/scsi/scsi_cd.h>
#define LEADOUT 0xaa /* leadout toc entry */
struct cd_params {
u_int32_t blksize;
u_long disksize;
};
typedef enum {
CD_Q_NONE = 0x00,
CD_Q_NO_TOUCH = 0x01,
CD_Q_BCD_TRACKS = 0x02,
CD_Q_10_BYTE_ONLY = 0x10,
CD_Q_RETRY_BUSY = 0x40
} cd_quirks;
#define CD_Q_BIT_STRING \
"\020" \
"\001NO_TOUCH" \
"\002BCD_TRACKS" \
"\00510_BYTE_ONLY" \
"\007RETRY_BUSY"
typedef enum {
CD_FLAG_INVALID = 0x0001,
CD_FLAG_NEW_DISC = 0x0002,
CD_FLAG_DISC_LOCKED = 0x0004,
CD_FLAG_DISC_REMOVABLE = 0x0008,
CD_FLAG_SAW_MEDIA = 0x0010,
CD_FLAG_ACTIVE = 0x0080,
CD_FLAG_SCHED_ON_COMP = 0x0100,
CD_FLAG_RETRY_UA = 0x0200,
CD_FLAG_VALID_MEDIA = 0x0400,
CD_FLAG_VALID_TOC = 0x0800,
CD_FLAG_SCTX_INIT = 0x1000,
CD_FLAG_MEDIA_WAIT = 0x2000,
CD_FLAG_MEDIA_SCAN_ACT = 0x4000
} cd_flags;
typedef enum {
CD_CCB_PROBE = 0x01,
CD_CCB_BUFFER_IO = 0x02,
CD_CCB_TUR = 0x03,
CD_CCB_MEDIA_PREVENT = 0x04,
CD_CCB_MEDIA_ALLOW = 0x05,
CD_CCB_MEDIA_SIZE = 0x06,
CD_CCB_MEDIA_TOC_HDR = 0x07,
CD_CCB_MEDIA_TOC_FULL = 0x08,
CD_CCB_MEDIA_TOC_LEAD = 0x09,
CD_CCB_TYPE_MASK = 0x0F,
CD_CCB_RETRY_UA = 0x10
} cd_ccb_state;
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
struct cd_tocdata {
struct ioc_toc_header header;
struct cd_toc_entry entries[100];
};
struct cd_toc_single {
struct ioc_toc_header header;
struct cd_toc_entry entry;
};
typedef enum {
CD_STATE_PROBE,
CD_STATE_NORMAL,
CD_STATE_MEDIA_PREVENT,
CD_STATE_MEDIA_ALLOW,
CD_STATE_MEDIA_SIZE,
CD_STATE_MEDIA_TOC_HDR,
CD_STATE_MEDIA_TOC_FULL,
CD_STATE_MEDIA_TOC_LEAD
} cd_state;
struct cd_softc {
cam_pinfo pinfo;
cd_state state;
volatile cd_flags flags;
struct bio_queue_head bio_queue;
LIST_HEAD(, ccb_hdr) pending_ccbs;
struct cd_params params;
union ccb saved_ccb;
cd_quirks quirks;
struct cam_periph *periph;
int minimum_command_size;
int outstanding_cmds;
int tur;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
STAILQ_HEAD(, cd_mode_params) mode_queue;
struct cd_tocdata toc;
int toc_read_len;
struct cd_toc_single leadout;
struct disk *disk;
struct callout mediapoll_c;
#define CD_ANNOUNCETMP_SZ 120
char announce_temp[CD_ANNOUNCETMP_SZ];
#define CD_ANNOUNCE_SZ 400
char announce_buf[CD_ANNOUNCE_SZ];
};
struct cd_page_sizes {
int page;
int page_size;
};
static struct cd_page_sizes cd_page_size_table[] =
{
{ AUDIO_PAGE, sizeof(struct cd_audio_page)}
};
struct cd_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
cd_quirks quirks;
};
/*
* NOTE ON 10_BYTE_ONLY quirks: Any 10_BYTE_ONLY quirks MUST be because
* your device hangs when it gets a 10 byte command. Adding a quirk just
* to get rid of the informative diagnostic message is not acceptable. All
* 10_BYTE_ONLY quirks must be documented in full in a PR (which should be
* referenced in a comment along with the quirk), and must be approved by
* ken@FreeBSD.org. Any quirks added that don't adhere to this policy may
* be removed until the submitter can explain why they are needed.
* 10_BYTE_ONLY quirks will be removed (as they will no longer be necessary)
* when the CAM_NEW_TRAN_CODE work is done.
*/
static struct cd_quirk_entry cd_quirk_table[] =
{
{
{ T_CDROM, SIP_MEDIA_REMOVABLE, "CHINON", "CD-ROM CDS-535","*"},
/* quirks */ CD_Q_BCD_TRACKS
},
{
/*
* VMware returns BUSY status when storage has transient
* connectivity problems, so better wait.
*/
{T_CDROM, SIP_MEDIA_REMOVABLE, "NECVMWar", "VMware IDE CDR10", "*"},
/*quirks*/ CD_Q_RETRY_BUSY
}
};
#ifdef COMPAT_FREEBSD32
struct ioc_read_toc_entry32 {
u_char address_format;
u_char starting_track;
u_short data_len;
uint32_t data; /* (struct cd_toc_entry *) */
};
#define CDIOREADTOCENTRYS_32 \
_IOC_NEWTYPE(CDIOREADTOCENTRYS, struct ioc_read_toc_entry32)
#endif
static disk_open_t cdopen;
static disk_close_t cdclose;
static disk_ioctl_t cdioctl;
static disk_strategy_t cdstrategy;
static periph_init_t cdinit;
static periph_ctor_t cdregister;
static periph_dtor_t cdcleanup;
static periph_start_t cdstart;
static periph_oninv_t cdoninvalidate;
static void cdasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static int cdcmdsizesysctl(SYSCTL_HANDLER_ARGS);
static int cdrunccb(union ccb *ccb,
int (*error_routine)(union ccb *ccb,
u_int32_t cam_flags,
u_int32_t sense_flags),
u_int32_t cam_flags, u_int32_t sense_flags);
static void cddone(struct cam_periph *periph,
union ccb *start_ccb);
static union cd_pages *cdgetpage(struct cd_mode_params *mode_params);
static int cdgetpagesize(int page_num);
static void cdprevent(struct cam_periph *periph, int action);
static void cdmediaprobedone(struct cam_periph *periph);
static int cdcheckmedia(struct cam_periph *periph, int do_wait);
#if 0
static int cdsize(struct cam_periph *periph, u_int32_t *size);
#endif
static int cd6byteworkaround(union ccb *ccb);
static int cderror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int cdreadtoc(struct cam_periph *periph, u_int32_t mode,
u_int32_t start, u_int8_t *data,
u_int32_t len, u_int32_t sense_flags);
static int cdgetmode(struct cam_periph *periph,
struct cd_mode_params *data, u_int32_t page);
static int cdsetmode(struct cam_periph *periph,
struct cd_mode_params *data);
static int cdplay(struct cam_periph *periph, u_int32_t blk,
u_int32_t len);
static int cdreadsubchannel(struct cam_periph *periph,
u_int32_t mode, u_int32_t format,
int track,
struct cd_sub_channel_info *data,
u_int32_t len);
static int cdplaymsf(struct cam_periph *periph, u_int32_t startm,
u_int32_t starts, u_int32_t startf,
u_int32_t endm, u_int32_t ends,
u_int32_t endf);
static int cdplaytracks(struct cam_periph *periph,
u_int32_t strack, u_int32_t sindex,
u_int32_t etrack, u_int32_t eindex);
static int cdpause(struct cam_periph *periph, u_int32_t go);
static int cdstopunit(struct cam_periph *periph, u_int32_t eject);
static int cdstartunit(struct cam_periph *periph, int load);
static int cdsetspeed(struct cam_periph *periph,
u_int32_t rdspeed, u_int32_t wrspeed);
static int cdreportkey(struct cam_periph *periph,
struct dvd_authinfo *authinfo);
static int cdsendkey(struct cam_periph *periph,
struct dvd_authinfo *authinfo);
static int cdreaddvdstructure(struct cam_periph *periph,
struct dvd_struct *dvdstruct);
static callout_func_t cdmediapoll;
static struct periph_driver cddriver =
{
cdinit, "cd",
TAILQ_HEAD_INITIALIZER(cddriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(cd, cddriver);
#ifndef CD_DEFAULT_POLL_PERIOD
#define CD_DEFAULT_POLL_PERIOD 3
#endif
#ifndef CD_DEFAULT_RETRY
#define CD_DEFAULT_RETRY 4
#endif
#ifndef CD_DEFAULT_TIMEOUT
#define CD_DEFAULT_TIMEOUT 30000
#endif
static int cd_poll_period = CD_DEFAULT_POLL_PERIOD;
static int cd_retry_count = CD_DEFAULT_RETRY;
static int cd_timeout = CD_DEFAULT_TIMEOUT;
static SYSCTL_NODE(_kern_cam, OID_AUTO, cd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM CDROM driver");
SYSCTL_INT(_kern_cam_cd, OID_AUTO, poll_period, CTLFLAG_RWTUN,
&cd_poll_period, 0, "Media polling period in seconds");
SYSCTL_INT(_kern_cam_cd, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&cd_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_cd, OID_AUTO, timeout, CTLFLAG_RWTUN,
&cd_timeout, 0, "Timeout, in us, for read operations");
static MALLOC_DEFINE(M_SCSICD, "scsi_cd", "scsi_cd buffers");
static void
cdinit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, cdasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("cd: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
cddiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
cdoninvalidate(struct cam_periph *periph)
{
struct cd_softc *softc;
softc = (struct cd_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, cdasync, periph, periph->path);
softc->flags |= CD_FLAG_INVALID;
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
bioq_flush(&softc->bio_queue, NULL, ENXIO);
disk_gone(softc->disk);
}
static void
cdcleanup(struct cam_periph *periph)
{
struct cd_softc *softc;
softc = (struct cd_softc *)periph->softc;
cam_periph_unlock(periph);
if ((softc->flags & CD_FLAG_SCTX_INIT) != 0
&& sysctl_ctx_free(&softc->sysctl_ctx) != 0) {
xpt_print(periph->path, "can't remove sysctl context\n");
}
callout_drain(&softc->mediapoll_c);
disk_destroy(softc->disk);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
cdasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
struct cd_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
break;
if (SID_TYPE(&cgd->inq_data) != T_CDROM
&& SID_TYPE(&cgd->inq_data) != T_WORM)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(cdregister, cdoninvalidate,
cdcleanup, cdstart,
"cd", CAM_PERIPH_BIO,
path, cdasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("cdasync: Unable to attach new device "
"due to status 0x%x\n", status);
break;
}
case AC_UNIT_ATTENTION:
{
union ccb *ccb;
int error_code, sense_key, asc, ascq;
softc = (struct cd_softc *)periph->softc;
ccb = (union ccb *)arg;
/*
* Handle all media change UNIT ATTENTIONs except
* our own, as they will be handled by cderror().
*/
if (xpt_path_periph(ccb->ccb_h.path) != periph &&
scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (asc == 0x28 && ascq == 0x00)
disk_media_changed(softc->disk, M_NOWAIT);
}
cam_periph_async(periph, code, path, arg);
break;
}
case AC_SCSI_AEN:
softc = (struct cd_softc *)periph->softc;
if (softc->state == CD_STATE_NORMAL && !softc->tur) {
if (cam_periph_acquire(periph) == 0) {
softc->tur = 1;
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
}
/* FALLTHROUGH */
case AC_SENT_BDR:
case AC_BUS_RESET:
{
struct ccb_hdr *ccbh;
softc = (struct cd_softc *)periph->softc;
/*
* Don't fail on the expected unit attention
* that will occur.
*/
softc->flags |= CD_FLAG_RETRY_UA;
LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
ccbh->ccb_state |= CD_CCB_RETRY_UA;
/* FALLTHROUGH */
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
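/*
* Deferred task that creates this unit's sysctl tree under
* kern.cam.cd.<unit>.
*/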
static void
cdsysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct cd_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
if (cam_periph_acquire(periph) != 0)
return;
softc = (struct cd_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM CD unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= CD_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_cd), OID_AUTO,
tmpstr2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr,
"device_index");
if (softc->sysctl_tree == NULL) {
printf("cdsysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
/*
* Now register the sysctl handler, so the user can change the value
* on the fly.
*/
SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "minimum_cmd_size",
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&softc->minimum_command_size, 0, cdcmdsizesysctl, "I",
"Minimum CDB size");
cam_periph_release(periph);
}
/*
* We have a handler function for this so we can check the values when the
* user sets them, instead of every time we look at them.
*/
static int
cdcmdsizesysctl(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if ((error != 0)
|| (req->newptr == NULL))
return (error);
/*
* The only real values we can have here are 6 or 10. I don't
* really foresee having 12 be an option at any time in the future.
* So if the user sets something less than or equal to 6, we'll set
* it to 6. If he sets something greater than 6, we'll set it to 10.
*
* I suppose we could just return an error here for the wrong values,
* but I don't think it's necessary to do so, as long as we can
* determine the user's intent without too much trouble.
*/
if (value < 6)
value = 6;
else if (value > 6)
value = 10;
*(int *)arg1 = value;
return (0);
}
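/*
* Periph constructor: allocate the softc, apply quirks, set up the GEOM
* disk, and register for async events and media polling.
*/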
static cam_status
cdregister(struct cam_periph *periph, void *arg)
{
struct cd_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
char tmpstr[80];
caddr_t match;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("cdregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct cd_softc *)malloc(sizeof(*softc),M_DEVBUF,
M_NOWAIT | M_ZERO);
if (softc == NULL) {
printf("cdregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
LIST_INIT(&softc->pending_ccbs);
STAILQ_INIT(&softc->mode_queue);
softc->state = CD_STATE_PROBE;
bioq_init(&softc->bio_queue);
if (SID_IS_REMOVABLE(&cgd->inq_data))
softc->flags |= CD_FLAG_DISC_REMOVABLE;
periph->softc = softc;
softc->periph = periph;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->inq_data,
(caddr_t)cd_quirk_table,
nitems(cd_quirk_table),
sizeof(*cd_quirk_table), scsi_inquiry_match);
if (match != NULL)
softc->quirks = ((struct cd_quirk_entry *)match)->quirks;
else
softc->quirks = CD_Q_NONE;
/* Check if the SIM does not want 6 byte commands */
xpt_path_inq(&cpi, periph->path);
if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
softc->quirks |= CD_Q_10_BYTE_ONLY;
TASK_INIT(&softc->sysctl_task, 0, cdsysctlinit, periph);
/* The default is 6 byte commands, unless quirked otherwise */
if (softc->quirks & CD_Q_10_BYTE_ONLY)
softc->minimum_command_size = 10;
else
softc->minimum_command_size = 6;
/*
* Refcount and block open attempts until we are set up.
* Can't block
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
/*
* Load the user's default, if any.
*/
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.cd.%d.minimum_cmd_size",
periph->unit_number);
TUNABLE_INT_FETCH(tmpstr, &softc->minimum_command_size);
/* 6 and 10 are the only permissible values here. */
if (softc->minimum_command_size < 6)
softc->minimum_command_size = 6;
else if (softc->minimum_command_size > 6)
softc->minimum_command_size = 10;
/*
* We need to register the statistics structure for this device,
* but we don't have the blocksize yet for it. So, we register
* the structure and indicate that we don't have the blocksize
* yet. Unlike other SCSI peripheral drivers, we explicitly set
* the device type here to be CDROM, rather than just ORing in
* the device type. This is because this driver can attach to either
* CDROM or WORM devices, and we want this peripheral driver to
* show up in the devstat list as a CD peripheral driver, not a
* WORM peripheral driver. WORM drives will also have the WORM
* driver attached to them.
*/
softc->disk = disk_alloc();
softc->disk->d_devstat = devstat_new_entry("cd",
periph->unit_number, 0,
DEVSTAT_BS_UNAVAILABLE,
DEVSTAT_TYPE_CDROM |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_CD);
softc->disk->d_open = cdopen;
softc->disk->d_close = cdclose;
softc->disk->d_strategy = cdstrategy;
softc->disk->d_gone = cddiskgonecb;
softc->disk->d_ioctl = cdioctl;
softc->disk->d_name = "cd";
cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
cgd->inq_data.product, sizeof(cgd->inq_data.product),
sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
softc->disk->d_unit = periph->unit_number;
softc->disk->d_drv1 = periph;
if (cpi.maxio == 0)
softc->disk->d_maxsize = DFLTPHYS; /* traditional default */
- else if (cpi.maxio > MAXPHYS)
- softc->disk->d_maxsize = MAXPHYS; /* for safety */
+ else if (cpi.maxio > maxphys)
+ softc->disk->d_maxsize = maxphys; /* for safety */
else
softc->disk->d_maxsize = cpi.maxio;
softc->disk->d_flags = 0;
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
snprintf(softc->disk->d_attachment, sizeof(softc->disk->d_attachment),
"%s%d", cpi.dev_name, cpi.unit_number);
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* dadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
/*
* Add an async callback so that we get
* notified if this device goes away.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_SCSI_AEN | AC_UNIT_ATTENTION, cdasync, periph, periph->path);
/*
* Schedule periodic media polling events.
*/
callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0);
if ((softc->flags & CD_FLAG_DISC_REMOVABLE) &&
(cgd->inq_flags & SID_AEN) == 0 &&
cd_poll_period != 0)
callout_reset(&softc->mediapoll_c, cd_poll_period * hz,
cdmediapoll, periph);
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
static int
cdopen(struct disk *dp)
{
struct cam_periph *periph;
struct cd_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct cd_softc *)periph->softc;
if (cam_periph_acquire(periph) != 0)
return(ENXIO);
cam_periph_lock(periph);
if (softc->flags & CD_FLAG_INVALID) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(ENXIO);
}
if ((error = cam_periph_hold(periph, PRIBIO | PCATCH)) != 0) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("cdopen\n"));
/*
* Check for media, and set the appropriate flags. We don't bail
* if we don't have media, but then we don't allow anything but the
* CDIOCEJECT/CDIOCCLOSE ioctls if there is no media.
*/
cdcheckmedia(periph, /*do_wait*/ 1);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("leaving cdopen\n"));
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
cdclose(struct disk *dp)
{
struct cam_periph *periph;
struct cd_softc *softc;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct cd_softc *)periph->softc;
cam_periph_lock(periph);
if (cam_periph_hold(periph, PRIBIO) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("cdclose\n"));
if ((softc->flags & CD_FLAG_DISC_REMOVABLE) != 0)
cdprevent(periph, PR_ALLOW);
/*
* Since we're closing this CD, mark the blocksize as unavailable.
* It will be marked as available when the CD is opened again.
*/
softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
/*
* We'll check the media and toc again at the next open().
*/
softc->flags &= ~(CD_FLAG_VALID_MEDIA|CD_FLAG_VALID_TOC);
cam_periph_unhold(periph);
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return (0);
}
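/*
* Run a CCB synchronously, charging the I/O to this unit's devstat entry.
*/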
static int
cdrunccb(union ccb *ccb, int (*error_routine)(union ccb *ccb,
u_int32_t cam_flags,
u_int32_t sense_flags),
u_int32_t cam_flags, u_int32_t sense_flags)
{
struct cd_softc *softc;
struct cam_periph *periph;
int error;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct cd_softc *)periph->softc;
error = cam_periph_runccb(ccb, error_routine, cam_flags, sense_flags,
softc->disk->d_devstat);
return(error);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
cdstrategy(struct bio *bp)
{
struct cam_periph *periph;
struct cd_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE,
("cdstrategy(%p)\n", bp));
softc = (struct cd_softc *)periph->softc;
/*
* If the device has been made invalid, error out
*/
if ((softc->flags & CD_FLAG_INVALID)) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
/*
* Place it in the queue of disk activities for this disk
*/
bioq_disksort(&softc->bio_queue, bp);
/*
* If we don't know that we have valid media, schedule the media
* check first. The I/O will get executed after the media check.
*/
if ((softc->flags & CD_FLAG_VALID_MEDIA) == 0)
cdcheckmedia(periph, /*do_wait*/ 0);
else
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
cam_periph_unlock(periph);
return;
}
static void
cdstart(struct cam_periph *periph, union ccb *start_ccb)
{
struct cd_softc *softc;
struct bio *bp;
struct ccb_scsiio *csio;
softc = (struct cd_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdstart\n"));
switch (softc->state) {
case CD_STATE_NORMAL:
{
bp = bioq_first(&softc->bio_queue);
if (bp == NULL) {
if (softc->tur) {
softc->tur = 0;
csio = &start_ccb->csio;
scsi_test_unit_ready(csio,
/*retries*/ cd_retry_count,
cddone,
MSG_SIMPLE_Q_TAG,
SSD_FULL_SIZE,
cd_timeout);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = CD_CCB_TUR;
xpt_action(start_ccb);
} else
xpt_release_ccb(start_ccb);
} else {
if (softc->tur) {
softc->tur = 0;
cam_periph_release_locked(periph);
}
bioq_remove(&softc->bio_queue, bp);
if ((bp->bio_cmd != BIO_READ) &&
(bp->bio_cmd != BIO_WRITE)) {
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
return;
}
scsi_read_write(&start_ccb->csio,
/*retries*/ cd_retry_count,
/* cbfcnp */ cddone,
MSG_SIMPLE_Q_TAG,
/* read */bp->bio_cmd == BIO_READ ?
SCSI_RW_READ : SCSI_RW_WRITE,
/* byte2 */ 0,
/* minimum_cmd_size */ 10,
/* lba */ bp->bio_offset /
softc->params.blksize,
bp->bio_bcount / softc->params.blksize,
/* data_ptr */ bp->bio_data,
/* dxfer_len */ bp->bio_bcount,
/* sense_len */ cd_retry_count ?
SSD_FULL_SIZE : SF_NO_PRINT,
/* timeout */ cd_timeout);
/* Use READ CD command for audio tracks. */
if (softc->params.blksize == 2352) {
start_ccb->csio.cdb_io.cdb_bytes[0] = READ_CD;
start_ccb->csio.cdb_io.cdb_bytes[9] = 0xf8;
start_ccb->csio.cdb_io.cdb_bytes[10] = 0;
start_ccb->csio.cdb_io.cdb_bytes[11] = 0;
start_ccb->csio.cdb_len = 12;
}
start_ccb->ccb_h.ccb_state = CD_CCB_BUFFER_IO;
LIST_INSERT_HEAD(&softc->pending_ccbs,
&start_ccb->ccb_h, periph_links.le);
softc->outstanding_cmds++;
/* We expect a unit attention from this device */
if ((softc->flags & CD_FLAG_RETRY_UA) != 0) {
start_ccb->ccb_h.ccb_state |= CD_CCB_RETRY_UA;
softc->flags &= ~CD_FLAG_RETRY_UA;
}
start_ccb->ccb_h.ccb_bp = bp;
bp = bioq_first(&softc->bio_queue);
xpt_action(start_ccb);
}
if (bp != NULL || softc->tur) {
/* Have more work to do, so ensure we stay scheduled */
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
break;
}
case CD_STATE_PROBE:
case CD_STATE_MEDIA_SIZE:
{
struct scsi_read_capacity_data *rcap;
rcap = (struct scsi_read_capacity_data *)malloc(sizeof(*rcap),
M_SCSICD, M_NOWAIT | M_ZERO);
if (rcap == NULL) {
xpt_print(periph->path,
"%s: Couldn't malloc read_capacity data\n",
__func__);
xpt_release_ccb(start_ccb);
/*
* We can't probe because we can't allocate memory,
* so invalidate the peripheral. The system probably
* has larger problems at this stage. If we've
* already probed (and are re-probing capacity), we
* don't need to invalidate.
*
* XXX KDM need to reset probe state and kick out
* pending I/O.
*/
if (softc->state == CD_STATE_PROBE)
cam_periph_invalidate(periph);
break;
}
/*
* Set the default capacity and sector size to something that
* GEOM can handle. This will get reset when a read capacity
* completes successfully.
*/
softc->disk->d_sectorsize = 2048;
softc->disk->d_mediasize = 0;
csio = &start_ccb->csio;
scsi_read_capacity(csio,
/*retries*/ cd_retry_count,
cddone,
MSG_SIMPLE_Q_TAG,
rcap,
SSD_FULL_SIZE,
/*timeout*/20000);
start_ccb->ccb_h.ccb_bp = NULL;
if (softc->state == CD_STATE_PROBE)
start_ccb->ccb_h.ccb_state = CD_CCB_PROBE;
else
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_SIZE;
xpt_action(start_ccb);
break;
}
case CD_STATE_MEDIA_ALLOW:
case CD_STATE_MEDIA_PREVENT:
{
/*
* If the CD is already locked, we don't need to do this.
* Move on to the capacity check.
*/
if (softc->state == CD_STATE_MEDIA_PREVENT
&& (softc->flags & CD_FLAG_DISC_LOCKED) != 0) {
softc->state = CD_STATE_MEDIA_SIZE;
xpt_release_ccb(start_ccb);
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
break;
}
scsi_prevent(&start_ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/ cddone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*action*/ (softc->state == CD_STATE_MEDIA_ALLOW) ?
PR_ALLOW : PR_PREVENT,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ 60000);
start_ccb->ccb_h.ccb_bp = NULL;
if (softc->state == CD_STATE_MEDIA_ALLOW)
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_ALLOW;
else
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_PREVENT;
xpt_action(start_ccb);
break;
}
case CD_STATE_MEDIA_TOC_HDR: {
struct ioc_toc_header *toch;
bzero(&softc->toc, sizeof(softc->toc));
toch = &softc->toc.header;
scsi_read_toc(&start_ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/ cddone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*byte1_flags*/ 0,
/*format*/ SRTOC_FORMAT_TOC,
/*track*/ 0,
/*data_ptr*/ (uint8_t *)toch,
/*dxfer_len*/ sizeof(*toch),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ 50000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_HDR;
xpt_action(start_ccb);
break;
}
case CD_STATE_MEDIA_TOC_FULL: {
bzero(&softc->toc, sizeof(softc->toc));
scsi_read_toc(&start_ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/ cddone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*byte1_flags*/ 0,
/*format*/ SRTOC_FORMAT_TOC,
/*track*/ 0,
/*data_ptr*/ (uint8_t *)&softc->toc,
/*dxfer_len*/ softc->toc_read_len ?
softc->toc_read_len :
sizeof(softc->toc),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ 50000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_FULL;
xpt_action(start_ccb);
break;
}
case CD_STATE_MEDIA_TOC_LEAD: {
struct cd_toc_single *leadout;
leadout = &softc->leadout;
bzero(leadout, sizeof(*leadout));
scsi_read_toc(&start_ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/ cddone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*byte1_flags*/ CD_MSF,
/*format*/ SRTOC_FORMAT_TOC,
/*track*/ LEADOUT,
/*data_ptr*/ (uint8_t *)leadout,
/*dxfer_len*/ sizeof(*leadout),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ 50000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_LEAD;
xpt_action(start_ccb);
break;
}
}
}
static void
cddone(struct cam_periph *periph, union ccb *done_ccb)
{
struct cd_softc *softc;
struct ccb_scsiio *csio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cddone\n"));
softc = (struct cd_softc *)periph->softc;
csio = &done_ccb->csio;
switch (csio->ccb_h.ccb_state & CD_CCB_TYPE_MASK) {
case CD_CCB_BUFFER_IO:
{
struct bio *bp;
int error;
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
error = 0;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
int sf;
if ((done_ccb->ccb_h.ccb_state & CD_CCB_RETRY_UA) != 0)
sf = SF_RETRY_UA;
else
sf = 0;
error = cderror(done_ccb, CAM_RETRY_SELTO, sf);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
}
}
if (error != 0) {
xpt_print(periph->path,
"cddone: got error %#x back\n", error);
bioq_flush(&softc->bio_queue, NULL, EIO);
bp->bio_resid = bp->bio_bcount;
bp->bio_error = error;
bp->bio_flags |= BIO_ERROR;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
} else {
bp->bio_resid = csio->resid;
bp->bio_error = 0;
if (bp->bio_resid != 0) {
/*
* Short transfer ???
* XXX: not sure this is correct for partial
* transfers at EOM
*/
bp->bio_flags |= BIO_ERROR;
}
}
LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
softc->outstanding_cmds--;
biofinish(bp, NULL, 0);
break;
}
case CD_CCB_PROBE:
{
struct scsi_read_capacity_data *rdcap;
char *announce_buf;
struct cd_params *cdp;
int error;
cdp = &softc->params;
announce_buf = softc->announce_temp;
bzero(announce_buf, CD_ANNOUNCETMP_SZ);
rdcap = (struct scsi_read_capacity_data *)csio->data_ptr;
cdp->disksize = scsi_4btoul (rdcap->addr) + 1;
cdp->blksize = scsi_4btoul (rdcap->length);
/*
* Retry any UNIT ATTENTION type errors. They
* are expected at boot.
*/
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP ||
(error = cderror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT)) == 0) {
snprintf(announce_buf, CD_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)cdp->disksize * cdp->blksize) /
(1024 * 1024),
(uintmax_t)cdp->disksize, cdp->blksize);
} else {
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
} else {
int asc, ascq;
int sense_key, error_code;
int have_sense;
cam_status status;
struct ccb_getdev cgd;
/* Don't wedge this device's queue */
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
status = done_ccb->ccb_h.status;
xpt_setup_ccb(&cgd.ccb_h,
done_ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
have_sense = TRUE;
else
have_sense = FALSE;
/*
* Attach to anything that claims to be a
* CDROM or WORM device, as long as it
* doesn't return a "Logical unit not
* supported" (0x25) error.
*/
if ((have_sense) && (asc != 0x25)
&& (error_code == SSD_CURRENT_ERROR
|| error_code == SSD_DESC_CURRENT_ERROR)) {
const char *sense_key_desc;
const char *asc_desc;
scsi_sense_desc(sense_key, asc, ascq,
&cgd.inq_data,
&sense_key_desc,
&asc_desc);
snprintf(announce_buf,
CD_ANNOUNCETMP_SZ,
"Attempt to query device "
"size failed: %s, %s",
sense_key_desc,
asc_desc);
} else if ((have_sense == 0)
&& ((status & CAM_STATUS_MASK) ==
CAM_SCSI_STATUS_ERROR)
&& (csio->scsi_status ==
SCSI_STATUS_BUSY)) {
snprintf(announce_buf,
CD_ANNOUNCETMP_SZ,
"Attempt to query device "
"size failed: SCSI Status: %s",
scsi_status_string(csio));
} else if (SID_TYPE(&cgd.inq_data) == T_CDROM) {
/*
* We only print out an error for
* CDROM type devices. For WORM
* devices, we don't print out an
* error since a few WORM devices
* don't support CDROM commands.
* If we have sense information, go
* ahead and print it out.
* Otherwise, just say that we
* couldn't attach.
*/
/*
* Just print out the error, not
* the full probe message, when we
* don't attach.
*/
if (have_sense)
scsi_sense_print(
&done_ccb->csio);
else {
xpt_print(periph->path,
"got CAM status %#x\n",
done_ccb->ccb_h.status);
}
xpt_print(periph->path, "fatal error, "
"failed to attach to device\n");
/*
* Invalidate this peripheral.
*/
cam_periph_invalidate(periph);
announce_buf = NULL;
} else {
/*
* Invalidate this peripheral.
*/
cam_periph_invalidate(periph);
announce_buf = NULL;
}
}
}
free(rdcap, M_SCSICD);
if (announce_buf != NULL) {
struct sbuf sb;
sbuf_new(&sb, softc->announce_buf, CD_ANNOUNCE_SZ,
SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks,
CD_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
taskqueue_enqueue(taskqueue_thread,&softc->sysctl_task);
}
softc->state = CD_STATE_NORMAL;
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the probe lock on the peripheral.
* The peripheral will only go away once the last lock
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(done_ccb);
cam_periph_unhold(periph);
return;
}
case CD_CCB_TUR:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (cderror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) ==
ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
xpt_release_ccb(done_ccb);
cam_periph_release_locked(periph);
return;
}
case CD_CCB_MEDIA_ALLOW:
case CD_CCB_MEDIA_PREVENT:
{
int error;
int is_prevent;
error = 0;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = cderror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
}
if (error == ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
/*
* Note that just like the original cdcheckmedia(), we do
* a prevent without failing the whole operation if the
* prevent fails. We try, but keep going if it doesn't
* work.
*/
if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) ==
CD_CCB_MEDIA_PREVENT)
is_prevent = 1;
else
is_prevent = 0;
xpt_release_ccb(done_ccb);
if (is_prevent != 0) {
if (error == 0)
softc->flags |= CD_FLAG_DISC_LOCKED;
else
softc->flags &= ~CD_FLAG_DISC_LOCKED;
softc->state = CD_STATE_MEDIA_SIZE;
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
} else {
if (error == 0)
softc->flags &= ~CD_FLAG_DISC_LOCKED;
softc->state = CD_STATE_NORMAL;
if (bioq_first(&softc->bio_queue) != NULL)
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
return;
}
case CD_CCB_MEDIA_SIZE:
{
struct scsi_read_capacity_data *rdcap;
int error;
error = 0;
if ((csio->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = cderror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
}
if (error == ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
rdcap = (struct scsi_read_capacity_data *)csio->data_ptr;
if (error == 0) {
softc->params.disksize =scsi_4btoul(rdcap->addr) + 1;
softc->params.blksize = scsi_4btoul(rdcap->length);
/* Make sure we got at least some block size. */
if (softc->params.blksize == 0)
error = EIO;
/*
* SCSI-3 mandates that the reported blocksize shall be
* 2048. Older drives sometimes report funny values, so trim
* those down to 2048, or other parts of the kernel
* will get confused.
*
* XXX we leave drives alone that might report 512
* bytes, as well as drives reporting more weird
* sizes like perhaps 4K.
*/
if (softc->params.blksize > 2048
&& softc->params.blksize <= 2352)
softc->params.blksize = 2048;
}
free(rdcap, M_SCSICD);
if (error == 0) {
softc->disk->d_sectorsize = softc->params.blksize;
softc->disk->d_mediasize =
(off_t)softc->params.blksize *
softc->params.disksize;
softc->flags |= CD_FLAG_SAW_MEDIA | CD_FLAG_VALID_MEDIA;
softc->state = CD_STATE_MEDIA_TOC_HDR;
} else {
softc->flags &= ~(CD_FLAG_VALID_MEDIA |
CD_FLAG_VALID_TOC);
bioq_flush(&softc->bio_queue, NULL, EINVAL);
softc->state = CD_STATE_MEDIA_ALLOW;
cdmediaprobedone(periph);
}
xpt_release_ccb(done_ccb);
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
case CD_CCB_MEDIA_TOC_HDR:
case CD_CCB_MEDIA_TOC_FULL:
case CD_CCB_MEDIA_TOC_LEAD:
{
int error;
struct ioc_toc_header *toch;
int num_entries;
int cdindex;
error = 0;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = cderror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
}
if (error == ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
/*
* We will get errors here for media that doesn't have a table
* of contents. According to the MMC-3 spec: "When a Read
* TOC/PMA/ATIP command is presented for a DDCD/CD-R/RW media,
* where the first TOC has not been recorded (no complete
* session) and the Format codes 0000b, 0001b, or 0010b are
* specified, this command shall be rejected with an INVALID
* FIELD IN CDB. Devices that are not capable of reading an
* incomplete session on DDC/CD-R/RW media shall report
* CANNOT READ MEDIUM - INCOMPATIBLE FORMAT."
*
* So this isn't fatal if we can't read the table of contents,
* it just means that the user won't be able to issue the
* play tracks ioctl, and likely lots of other stuff won't
* work either. They need to burn the CD before we can do
* a whole lot with it. So we don't print anything here if
* we get an error back.
*
* We also bail out if the drive doesn't at least give us
* the full TOC header.
*/
if ((error != 0)
|| ((csio->dxfer_len - csio->resid) <
sizeof(struct ioc_toc_header))) {
softc->flags &= ~CD_FLAG_VALID_TOC;
bzero(&softc->toc, sizeof(softc->toc));
/*
* Failing the TOC read is not an error.
*/
softc->state = CD_STATE_NORMAL;
xpt_release_ccb(done_ccb);
cdmediaprobedone(periph);
/*
* Go ahead and schedule I/O execution if there is
* anything in the queue. It'll probably get
* kicked out with an error.
*/
if (bioq_first(&softc->bio_queue) != NULL)
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
/*
* Note that this is NOT the storage location used for the
* leadout!
*/
toch = &softc->toc.header;
if (softc->quirks & CD_Q_BCD_TRACKS) {
toch->starting_track = bcd2bin(toch->starting_track);
toch->ending_track = bcd2bin(toch->ending_track);
}
/* Number of TOC entries, plus leadout */
num_entries = (toch->ending_track - toch->starting_track) + 2;
cdindex = toch->starting_track + num_entries -1;
if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) ==
CD_CCB_MEDIA_TOC_HDR) {
if (num_entries <= 0) {
softc->flags &= ~CD_FLAG_VALID_TOC;
bzero(&softc->toc, sizeof(softc->toc));
/*
* Failing the TOC read is not an error.
*/
softc->state = CD_STATE_NORMAL;
xpt_release_ccb(done_ccb);
cdmediaprobedone(periph);
/*
* Go ahead and schedule I/O execution if
* there is anything in the queue. It'll
* probably get kicked out with an error.
*/
if (bioq_first(&softc->bio_queue) != NULL)
xpt_schedule(periph,
CAM_PRIORITY_NORMAL);
} else {
softc->toc_read_len = num_entries *
sizeof(struct cd_toc_entry);
softc->toc_read_len += sizeof(*toch);
softc->state = CD_STATE_MEDIA_TOC_FULL;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
return;
} else if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) ==
CD_CCB_MEDIA_TOC_LEAD) {
struct cd_toc_single *leadout;
leadout = (struct cd_toc_single *)csio->data_ptr;
softc->toc.entries[cdindex - toch->starting_track] =
leadout->entry;
} else if (((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) ==
CD_CCB_MEDIA_TOC_FULL)
&& (cdindex == toch->ending_track + 1)) {
/*
* XXX KDM is this necessary? Probably only if the
* drive doesn't return leadout information with the
* table of contents.
*/
softc->state = CD_STATE_MEDIA_TOC_LEAD;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
for (cdindex = 0; cdindex < num_entries - 1; cdindex++){
softc->toc.entries[cdindex].track =
bcd2bin(softc->toc.entries[cdindex].track);
}
}
softc->flags |= CD_FLAG_VALID_TOC;
/* If the first track is audio, correct sector size. */
if ((softc->toc.entries[0].control & 4) == 0) {
softc->disk->d_sectorsize =softc->params.blksize = 2352;
softc->disk->d_mediasize =
(off_t)softc->params.blksize *
softc->params.disksize;
}
softc->state = CD_STATE_NORMAL;
/*
* We unconditionally (re)set the blocksize each time the
* CD device is opened. This is because the CD can change,
* and therefore the blocksize might change.
* XXX problems here if some slice or partition is still
* open with the old size?
*/
if ((softc->disk->d_devstat->flags & DEVSTAT_BS_UNAVAILABLE)!=0)
softc->disk->d_devstat->flags &=
~DEVSTAT_BS_UNAVAILABLE;
softc->disk->d_devstat->block_size = softc->params.blksize;
xpt_release_ccb(done_ccb);
cdmediaprobedone(periph);
if (bioq_first(&softc->bio_queue) != NULL)
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
return;
}
default:
break;
}
xpt_release_ccb(done_ccb);
}
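/*
* Locate the requested mode page within a previously fetched mode buffer,
* accounting for the 6- versus 10-byte mode header layout.
*/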
static union cd_pages *
cdgetpage(struct cd_mode_params *mode_params)
{
union cd_pages *page;
if (mode_params->cdb_size == 10)
page = (union cd_pages *)find_mode_page_10(
(struct scsi_mode_header_10 *)mode_params->mode_buf);
else
page = (union cd_pages *)find_mode_page_6(
(struct scsi_mode_header_6 *)mode_params->mode_buf);
return (page);
}
static int
cdgetpagesize(int page_num)
{
u_int i;
for (i = 0; i < nitems(cd_page_size_table); i++) {
if (cd_page_size_table[i].page == page_num)
return (cd_page_size_table[i].page_size);
}
return (-1);
}
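/*
* Return the user data pointer from either the native or the 32-bit
* compat layout of struct ioc_read_toc_entry, based on the ioctl's
* parameter length.
*/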
static struct cd_toc_entry *
te_data_get_ptr(void *irtep, u_long cmd)
{
union {
struct ioc_read_toc_entry irte;
#ifdef COMPAT_FREEBSD32
struct ioc_read_toc_entry32 irte32;
#endif
} *irteup;
irteup = irtep;
switch (IOCPARM_LEN(cmd)) {
case sizeof(irteup->irte):
return (irteup->irte.data);
#ifdef COMPAT_FREEBSD32
case sizeof(irteup->irte32):
return ((struct cd_toc_entry *)(uintptr_t)irteup->irte32.data);
#endif
default:
panic("Unhandled ioctl command %ld", cmd);
}
}
static int
cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
{
struct cam_periph *periph;
struct cd_softc *softc;
int error = 0;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_lock(periph);
softc = (struct cd_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE,
("cdioctl(%#lx)\n", cmd));
if ((error = cam_periph_hold(periph, PRIBIO | PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
/*
* If we don't have media loaded, check for it. If still don't
* have media loaded, we can only do a load or eject.
*
* We only care whether media is loaded if this is a cd-specific ioctl
* (thus the IOCGROUP check below). Note that this will break if
* anyone adds any ioctls into the switch statement below that don't
* have their ioctl group set to 'c'.
*/
if (((softc->flags & CD_FLAG_VALID_MEDIA) == 0)
&& ((cmd != CDIOCCLOSE)
&& (cmd != CDIOCEJECT))
&& (IOCGROUP(cmd) == 'c')) {
error = cdcheckmedia(periph, /*do_wait*/ 1);
if (error != 0) {
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (error);
}
}
/*
* Drop the lock here so later mallocs can use WAITOK. The periph
* is essentially locked still with the cam_periph_hold call above.
*/
cam_periph_unlock(periph);
switch (cmd) {
case CDIOCPLAYTRACKS:
{
struct ioc_play_track *args
= (struct ioc_play_track *) addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCPLAYTRACKS\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.flags &= ~CD_PA_SOTC;
page->audio.flags |= CD_PA_IMMED;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
if (error) {
cam_periph_unlock(periph);
break;
}
/*
* This was originally implemented with the PLAY
* AUDIO TRACK INDEX command, but that command was
* deprecated after SCSI-2. Most (all?) SCSI CDROM
* drives support it but ATAPI and ATAPI-derivative
* drives don't seem to support it. So we keep a
* cache of the table of contents and translate
* track numbers to MSF format.
*/
if (softc->flags & CD_FLAG_VALID_TOC) {
union msf_lba *sentry, *eentry;
int st, et;
if (args->end_track <
softc->toc.header.ending_track + 1)
args->end_track++;
if (args->end_track >
softc->toc.header.ending_track + 1)
args->end_track =
softc->toc.header.ending_track + 1;
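/* Convert track numbers into zero-based indices into the cached TOC. */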
st = args->start_track -
softc->toc.header.starting_track;
et = args->end_track -
softc->toc.header.starting_track;
if ((st < 0)
|| (et < 0)
|| (st > (softc->toc.header.ending_track -
softc->toc.header.starting_track))) {
error = EINVAL;
cam_periph_unlock(periph);
break;
}
sentry = &softc->toc.entries[st].addr;
eentry = &softc->toc.entries[et].addr;
error = cdplaymsf(periph,
sentry->msf.minute,
sentry->msf.second,
sentry->msf.frame,
eentry->msf.minute,
eentry->msf.second,
eentry->msf.frame);
} else {
/*
* If we don't have a valid TOC, try the
* play track index command. It is part of
* the SCSI-2 spec, but was removed in the
* MMC specs. ATAPI and ATAPI-derived
* drives don't support it.
*/
if (softc->quirks & CD_Q_BCD_TRACKS) {
args->start_track =
bin2bcd(args->start_track);
args->end_track =
bin2bcd(args->end_track);
}
error = cdplaytracks(periph,
args->start_track,
args->start_index,
args->end_track,
args->end_index);
}
cam_periph_unlock(periph);
}
break;
case CDIOCPLAYMSF:
{
struct ioc_play_msf *args
= (struct ioc_play_msf *) addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCPLAYMSF\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.flags &= ~CD_PA_SOTC;
page->audio.flags |= CD_PA_IMMED;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
if (error) {
cam_periph_unlock(periph);
break;
}
error = cdplaymsf(periph,
args->start_m,
args->start_s,
args->start_f,
args->end_m,
args->end_s,
args->end_f);
cam_periph_unlock(periph);
}
break;
case CDIOCPLAYBLOCKS:
{
struct ioc_play_blocks *args
= (struct ioc_play_blocks *) addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCPLAYBLOCKS\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.flags &= ~CD_PA_SOTC;
page->audio.flags |= CD_PA_IMMED;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
if (error) {
cam_periph_unlock(periph);
break;
}
error = cdplay(periph, args->blk, args->len);
cam_periph_unlock(periph);
}
break;
case CDIOCREADSUBCHANNEL:
{
struct ioc_read_subchannel *args
= (struct ioc_read_subchannel *) addr;
struct cd_sub_channel_info *data;
u_int32_t len = args->data_len;
data = malloc(sizeof(struct cd_sub_channel_info),
M_SCSICD, M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCREADSUBCHANNEL\n"));
if ((len > sizeof(struct cd_sub_channel_info)) ||
(len < sizeof(struct cd_sub_channel_header))) {
printf(
"scsi_cd: cdioctl: "
"cdioreadsubchannel: error, len=%d\n",
len);
error = EINVAL;
free(data, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS)
args->track = bin2bcd(args->track);
error = cdreadsubchannel(periph, args->address_format,
args->data_format, args->track, data, len);
if (error) {
free(data, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS)
data->what.track_info.track_number =
bcd2bin(data->what.track_info.track_number);
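/*
 * Clamp the copyout length to what the drive actually returned:
 * the big-endian data_len from the subchannel header plus the
 * header itself.
 */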
len = min(len, ((data->header.data_len[0] << 8) +
data->header.data_len[1] +
sizeof(struct cd_sub_channel_header)));
cam_periph_unlock(periph);
error = copyout(data, args->data, len);
free(data, M_SCSICD);
}
break;
case CDIOREADTOCHEADER:
{
struct ioc_toc_header *th;
th = malloc(sizeof(struct ioc_toc_header), M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOREADTOCHEADER\n"));
error = cdreadtoc(periph, 0, 0, (u_int8_t *)th,
sizeof (*th), /*sense_flags*/SF_NO_PRINT);
if (error) {
free(th, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
/* we are going to have to convert the BCD
* encoding on the cd to what is expected
*/
th->starting_track =
bcd2bin(th->starting_track);
th->ending_track = bcd2bin(th->ending_track);
}
th->len = ntohs(th->len);
bcopy(th, addr, sizeof(*th));
free(th, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOREADTOCENTRYS:
#ifdef COMPAT_FREEBSD32
case CDIOREADTOCENTRYS_32:
#endif
{
struct cd_tocdata *data;
struct cd_toc_single *lead;
struct ioc_read_toc_entry *te =
(struct ioc_read_toc_entry *) addr;
struct ioc_toc_header *th;
u_int32_t len, readlen, idx, num;
u_int32_t starting_track = te->starting_track;
data = malloc(sizeof(*data), M_SCSICD, M_WAITOK | M_ZERO);
lead = malloc(sizeof(*lead), M_SCSICD, M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOREADTOCENTRYS\n"));
if (te->data_len < sizeof(struct cd_toc_entry)
|| (te->data_len % sizeof(struct cd_toc_entry)) != 0
|| (te->address_format != CD_MSF_FORMAT
&& te->address_format != CD_LBA_FORMAT)) {
error = EINVAL;
printf("scsi_cd: error in readtocentries, "
"returning EINVAL\n");
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
break;
}
th = &data->header;
error = cdreadtoc(periph, 0, 0, (u_int8_t *)th,
sizeof (*th), /*sense_flags*/0);
if (error) {
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
/* we are going to have to convert the BCD
* encoding on the cd to what is expected
*/
th->starting_track =
bcd2bin(th->starting_track);
th->ending_track = bcd2bin(th->ending_track);
}
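/* A starting track of 0 means the first track; LEADOUT selects the leadout. */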
if (starting_track == 0)
starting_track = th->starting_track;
else if (starting_track == LEADOUT)
starting_track = th->ending_track + 1;
else if (starting_track < th->starting_track ||
starting_track > th->ending_track + 1) {
printf("scsi_cd: error in readtocentries, "
"returning EINVAL\n");
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
error = EINVAL;
break;
}
/* calculate reading length without leadout entry */
readlen = (th->ending_track - starting_track + 1) *
sizeof(struct cd_toc_entry);
/* and with leadout entry */
len = readlen + sizeof(struct cd_toc_entry);
if (te->data_len < len) {
len = te->data_len;
if (readlen > len)
readlen = len;
}
if (len > sizeof(data->entries)) {
printf("scsi_cd: error in readtocentries, "
"returning EINVAL\n");
error = EINVAL;
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
break;
}
num = len / sizeof(struct cd_toc_entry);
if (readlen > 0) {
error = cdreadtoc(periph, te->address_format,
starting_track,
(u_int8_t *)data,
readlen + sizeof (*th),
/*sense_flags*/0);
if (error) {
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
break;
}
}
/* make leadout entry if needed */
idx = starting_track + num - 1;
if (softc->quirks & CD_Q_BCD_TRACKS)
th->ending_track = bcd2bin(th->ending_track);
if (idx == th->ending_track + 1) {
error = cdreadtoc(periph, te->address_format,
LEADOUT, (u_int8_t *)lead,
sizeof(*lead),
/*sense_flags*/0);
if (error) {
free(data, M_SCSICD);
free(lead, M_SCSICD);
cam_periph_unlock(periph);
break;
}
data->entries[idx - starting_track] =
lead->entry;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
for (idx = 0; idx < num - 1; idx++) {
data->entries[idx].track =
bcd2bin(data->entries[idx].track);
}
}
cam_periph_unlock(periph);
error = copyout(data->entries, te_data_get_ptr(te, cmd),
len);
free(data, M_SCSICD);
free(lead, M_SCSICD);
}
break;
case CDIOREADTOCENTRY:
{
struct cd_toc_single *data;
struct ioc_read_toc_single_entry *te =
(struct ioc_read_toc_single_entry *) addr;
struct ioc_toc_header *th;
u_int32_t track;
data = malloc(sizeof(*data), M_SCSICD, M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOREADTOCENTRY\n"));
if (te->address_format != CD_MSF_FORMAT
&& te->address_format != CD_LBA_FORMAT) {
printf("error in readtocentry, "
" returning EINVAL\n");
free(data, M_SCSICD);
error = EINVAL;
cam_periph_unlock(periph);
break;
}
th = &data->header;
error = cdreadtoc(periph, 0, 0, (u_int8_t *)th,
sizeof (*th), /*sense_flags*/0);
if (error) {
free(data, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
/* we are going to have to convert the BCD
* encoding on the cd to what is expected
*/
th->starting_track =
bcd2bin(th->starting_track);
th->ending_track = bcd2bin(th->ending_track);
}
track = te->track;
if (track == 0)
track = th->starting_track;
else if (track == LEADOUT)
/* OK */;
else if (track < th->starting_track ||
track > th->ending_track + 1) {
printf("error in readtocentry, "
" returning EINVAL\n");
free(data, M_SCSICD);
error = EINVAL;
cam_periph_unlock(periph);
break;
}
error = cdreadtoc(periph, te->address_format, track,
(u_int8_t *)data, sizeof(*data),
/*sense_flags*/0);
if (error) {
free(data, M_SCSICD);
cam_periph_unlock(periph);
break;
}
if (softc->quirks & CD_Q_BCD_TRACKS)
data->entry.track = bcd2bin(data->entry.track);
bcopy(&data->entry, &te->entry,
sizeof(struct cd_toc_entry));
free(data, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCSETPATCH:
{
struct ioc_patch *arg = (struct ioc_patch *)addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETPATCH\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels =
arg->patch[0];
page->audio.port[RIGHT_PORT].channels =
arg->patch[1];
page->audio.port[2].channels = arg->patch[2];
page->audio.port[3].channels = arg->patch[3];
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCGETVOL:
{
struct ioc_vol *arg = (struct ioc_vol *) addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCGETVOL\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
arg->vol[LEFT_PORT] =
page->audio.port[LEFT_PORT].volume;
arg->vol[RIGHT_PORT] =
page->audio.port[RIGHT_PORT].volume;
arg->vol[2] = page->audio.port[2].volume;
arg->vol[3] = page->audio.port[3].volume;
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCSETVOL:
{
struct ioc_vol *arg = (struct ioc_vol *) addr;
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETVOL\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels = CHANNEL_0;
page->audio.port[LEFT_PORT].volume =
arg->vol[LEFT_PORT];
page->audio.port[RIGHT_PORT].channels = CHANNEL_1;
page->audio.port[RIGHT_PORT].volume =
arg->vol[RIGHT_PORT];
page->audio.port[2].volume = arg->vol[2];
page->audio.port[3].volume = arg->vol[3];
error = cdsetmode(periph, &params);
cam_periph_unlock(periph);
free(params.mode_buf, M_SCSICD);
}
break;
case CDIOCSETMONO:
{
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETMONO\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels =
LEFT_CHANNEL | RIGHT_CHANNEL;
page->audio.port[RIGHT_PORT].channels =
LEFT_CHANNEL | RIGHT_CHANNEL;
page->audio.port[2].channels = 0;
page->audio.port[3].channels = 0;
error = cdsetmode(periph, &params);
cam_periph_unlock(periph);
free(params.mode_buf, M_SCSICD);
}
break;
case CDIOCSETSTEREO:
{
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETSTEREO\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels =
LEFT_CHANNEL;
page->audio.port[RIGHT_PORT].channels =
RIGHT_CHANNEL;
page->audio.port[2].channels = 0;
page->audio.port[3].channels = 0;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCSETMUTE:
{
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETMUTE\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels = 0;
page->audio.port[RIGHT_PORT].channels = 0;
page->audio.port[2].channels = 0;
page->audio.port[3].channels = 0;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCSETLEFT:
{
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETLEFT\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels = LEFT_CHANNEL;
page->audio.port[RIGHT_PORT].channels = LEFT_CHANNEL;
page->audio.port[2].channels = 0;
page->audio.port[3].channels = 0;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCSETRIGHT:
{
struct cd_mode_params params;
union cd_pages *page;
params.alloc_len = sizeof(union cd_mode_data_6_10);
params.mode_buf = malloc(params.alloc_len, M_SCSICD,
M_WAITOK | M_ZERO);
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
("trying to do CDIOCSETRIGHT\n"));
error = cdgetmode(periph, &params, AUDIO_PAGE);
if (error) {
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
break;
}
page = cdgetpage(&params);
page->audio.port[LEFT_PORT].channels = RIGHT_CHANNEL;
page->audio.port[RIGHT_PORT].channels = RIGHT_CHANNEL;
page->audio.port[2].channels = 0;
page->audio.port[3].channels = 0;
error = cdsetmode(periph, &params);
free(params.mode_buf, M_SCSICD);
cam_periph_unlock(periph);
}
break;
case CDIOCRESUME:
cam_periph_lock(periph);
error = cdpause(periph, 1);
cam_periph_unlock(periph);
break;
case CDIOCPAUSE:
cam_periph_lock(periph);
error = cdpause(periph, 0);
cam_periph_unlock(periph);
break;
case CDIOCSTART:
cam_periph_lock(periph);
error = cdstartunit(periph, 0);
cam_periph_unlock(periph);
break;
case CDIOCCLOSE:
cam_periph_lock(periph);
error = cdstartunit(periph, 1);
cam_periph_unlock(periph);
break;
case CDIOCSTOP:
cam_periph_lock(periph);
error = cdstopunit(periph, 0);
cam_periph_unlock(periph);
break;
case CDIOCEJECT:
cam_periph_lock(periph);
error = cdstopunit(periph, 1);
cam_periph_unlock(periph);
break;
case CDIOCALLOW:
cam_periph_lock(periph);
cdprevent(periph, PR_ALLOW);
cam_periph_unlock(periph);
break;
case CDIOCPREVENT:
cam_periph_lock(periph);
cdprevent(periph, PR_PREVENT);
cam_periph_unlock(periph);
break;
case CDIOCSETDEBUG:
/* sc_link->flags |= (SDEV_DB1 | SDEV_DB2); */
error = ENOTTY;
break;
case CDIOCCLRDEBUG:
/* sc_link->flags &= ~(SDEV_DB1 | SDEV_DB2); */
error = ENOTTY;
break;
case CDIOCRESET:
/* return (cd_reset(periph)); */
error = ENOTTY;
break;
case CDRIOCREADSPEED:
cam_periph_lock(periph);
error = cdsetspeed(periph, *(u_int32_t *)addr, CDR_MAX_SPEED);
cam_periph_unlock(periph);
break;
case CDRIOCWRITESPEED:
cam_periph_lock(periph);
error = cdsetspeed(periph, CDR_MAX_SPEED, *(u_int32_t *)addr);
cam_periph_unlock(periph);
break;
case CDRIOCGETBLOCKSIZE:
*(int *)addr = softc->params.blksize;
break;
case CDRIOCSETBLOCKSIZE:
if (*(int *)addr <= 0) {
error = EINVAL;
break;
}
softc->disk->d_sectorsize = softc->params.blksize = *(int *)addr;
break;
case DVDIOCSENDKEY:
case DVDIOCREPORTKEY: {
struct dvd_authinfo *authinfo;
authinfo = (struct dvd_authinfo *)addr;
if (cmd == DVDIOCREPORTKEY)
error = cdreportkey(periph, authinfo);
else
error = cdsendkey(periph, authinfo);
break;
}
case DVDIOCREADSTRUCTURE: {
struct dvd_struct *dvdstruct;
dvdstruct = (struct dvd_struct *)addr;
error = cdreaddvdstructure(periph, dvdstruct);
break;
}
default:
cam_periph_lock(periph);
error = cam_periph_ioctl(periph, cmd, addr, cderror);
cam_periph_unlock(periph);
break;
}
cam_periph_lock(periph);
cam_periph_unhold(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("leaving cdioctl\n"));
if (error && bootverbose) {
printf("scsi_cd.c::ioctl cmd=%08lx error=%d\n", cmd, error);
}
cam_periph_unlock(periph);
return (error);
}
static void
cdprevent(struct cam_periph *periph, int action)
{
union ccb *ccb;
struct cd_softc *softc;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdprevent\n"));
softc = (struct cd_softc *)periph->softc;
if (((action == PR_ALLOW)
&& (softc->flags & CD_FLAG_DISC_LOCKED) == 0)
|| ((action == PR_PREVENT)
&& (softc->flags & CD_FLAG_DISC_LOCKED) != 0)) {
return;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_prevent(&ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/NULL,
MSG_SIMPLE_Q_TAG,
action,
SSD_FULL_SIZE,
/* timeout */60000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA|SF_NO_PRINT);
xpt_release_ccb(ccb);
if (error == 0) {
if (action == PR_ALLOW)
softc->flags &= ~CD_FLAG_DISC_LOCKED;
else
softc->flags |= CD_FLAG_DISC_LOCKED;
}
}
static void
cdmediaprobedone(struct cam_periph *periph)
{
struct cd_softc *softc;
softc = (struct cd_softc *)periph->softc;
softc->flags &= ~CD_FLAG_MEDIA_SCAN_ACT;
if ((softc->flags & CD_FLAG_MEDIA_WAIT) != 0) {
softc->flags &= ~CD_FLAG_MEDIA_WAIT;
wakeup(&softc->toc);
}
}
/*
* XXX: the disk media and sector size can really only change
* XXX: while the device is closed.
*/
static int
cdcheckmedia(struct cam_periph *periph, int do_wait)
{
struct cd_softc *softc;
int error;
softc = (struct cd_softc *)periph->softc;
error = 0;
if ((do_wait != 0)
&& ((softc->flags & CD_FLAG_MEDIA_WAIT) == 0)) {
softc->flags |= CD_FLAG_MEDIA_WAIT;
}
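/*
 * Kick off a media probe if one isn't already running; the probe
 * path wakes any waiters via cdmediaprobedone().
 */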
if ((softc->flags & CD_FLAG_MEDIA_SCAN_ACT) == 0) {
softc->state = CD_STATE_MEDIA_PREVENT;
softc->flags |= CD_FLAG_MEDIA_SCAN_ACT;
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
if (do_wait == 0)
goto bailout;
error = msleep(&softc->toc, cam_periph_mtx(periph), PRIBIO,
    "cdmedia", 0);
if (error != 0)
goto bailout;
/*
* Check to see whether we have a valid size from the media. We
* may or may not have a valid TOC.
*/
if ((softc->flags & CD_FLAG_VALID_MEDIA) == 0)
error = EINVAL;
bailout:
return (error);
}
#if 0
static int
cdcheckmedia(struct cam_periph *periph)
{
struct cd_softc *softc;
struct ioc_toc_header *toch;
struct cd_toc_single leadout;
u_int32_t size, toclen;
int error, num_entries, cdindex;
softc = (struct cd_softc *)periph->softc;
cdprevent(periph, PR_PREVENT);
softc->disk->d_sectorsize = 2048;
softc->disk->d_mediasize = 0;
/*
* Get the disc size and block size. If we can't get it, we don't
* have media, most likely.
*/
if ((error = cdsize(periph, &size)) != 0) {
softc->flags &= ~(CD_FLAG_VALID_MEDIA|CD_FLAG_VALID_TOC);
cdprevent(periph, PR_ALLOW);
return (error);
} else {
softc->flags |= CD_FLAG_SAW_MEDIA | CD_FLAG_VALID_MEDIA;
softc->disk->d_sectorsize = softc->params.blksize;
softc->disk->d_mediasize =
(off_t)softc->params.blksize * softc->params.disksize;
}
/*
* Now we check the table of contents. This (currently) is only
* used for the CDIOCPLAYTRACKS ioctl. It may be used later to do
* things like present a separate entry in /dev for each track,
* like that acd(4) driver does.
*/
bzero(&softc->toc, sizeof(softc->toc));
toch = &softc->toc.header;
/*
* We will get errors here for media that doesn't have a table of
* contents. According to the MMC-3 spec: "When a Read TOC/PMA/ATIP
* command is presented for a DDCD/CD-R/RW media, where the first TOC
* has not been recorded (no complete session) and the Format codes
* 0000b, 0001b, or 0010b are specified, this command shall be rejected
* with an INVALID FIELD IN CDB. Devices that are not capable of
* reading an incomplete session on DDC/CD-R/RW media shall report
* CANNOT READ MEDIUM - INCOMPATIBLE FORMAT."
*
* So this isn't fatal if we can't read the table of contents, it
* just means that the user won't be able to issue the play tracks
* ioctl, and likely lots of other stuff won't work either. They
* need to burn the CD before we can do a whole lot with it. So
* we don't print anything here if we get an error back.
*/
error = cdreadtoc(periph, 0, 0, (u_int8_t *)toch, sizeof(*toch),
SF_NO_PRINT);
/*
* Errors in reading the table of contents aren't fatal, we just
* won't have a valid table of contents cached.
*/
if (error != 0) {
error = 0;
bzero(&softc->toc, sizeof(softc->toc));
goto bailout;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
toch->starting_track = bcd2bin(toch->starting_track);
toch->ending_track = bcd2bin(toch->ending_track);
}
/* Number of TOC entries, plus leadout */
num_entries = (toch->ending_track - toch->starting_track) + 2;
if (num_entries <= 0)
goto bailout;
toclen = num_entries * sizeof(struct cd_toc_entry);
error = cdreadtoc(periph, CD_MSF_FORMAT, toch->starting_track,
(u_int8_t *)&softc->toc, toclen + sizeof(*toch),
SF_NO_PRINT);
if (error != 0) {
error = 0;
bzero(&softc->toc, sizeof(softc->toc));
goto bailout;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
toch->starting_track = bcd2bin(toch->starting_track);
toch->ending_track = bcd2bin(toch->ending_track);
}
/*
* XXX KDM is this necessary? Probably only if the drive doesn't
* return leadout information with the table of contents.
*/
cdindex = toch->starting_track + num_entries - 1;
if (cdindex == toch->ending_track + 1) {
error = cdreadtoc(periph, CD_MSF_FORMAT, LEADOUT,
(u_int8_t *)&leadout, sizeof(leadout),
SF_NO_PRINT);
if (error != 0) {
error = 0;
goto bailout;
}
softc->toc.entries[cdindex - toch->starting_track] =
leadout.entry;
}
if (softc->quirks & CD_Q_BCD_TRACKS) {
for (cdindex = 0; cdindex < num_entries - 1; cdindex++) {
softc->toc.entries[cdindex].track =
bcd2bin(softc->toc.entries[cdindex].track);
}
}
softc->flags |= CD_FLAG_VALID_TOC;
/* If the first track is audio, correct sector size. */
if ((softc->toc.entries[0].control & 4) == 0) {
softc->disk->d_sectorsize = softc->params.blksize = 2352;
softc->disk->d_mediasize =
(off_t)softc->params.blksize * softc->params.disksize;
}
bailout:
/*
* We unconditionally (re)set the blocksize each time the
* CD device is opened. This is because the CD can change,
* and therefore the blocksize might change.
* XXX problems here if some slice or partition is still
* open with the old size?
*/
if ((softc->disk->d_devstat->flags & DEVSTAT_BS_UNAVAILABLE) != 0)
softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
softc->disk->d_devstat->block_size = softc->params.blksize;
return (error);
}
static int
cdsize(struct cam_periph *periph, u_int32_t *size)
{
struct cd_softc *softc;
union ccb *ccb;
struct scsi_read_capacity_data *rcap_buf;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdsize\n"));
softc = (struct cd_softc *)periph->softc;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
/* XXX Should be M_WAITOK */
rcap_buf = malloc(sizeof(struct scsi_read_capacity_data),
M_SCSICD, M_NOWAIT | M_ZERO);
if (rcap_buf == NULL)
return (ENOMEM);
scsi_read_capacity(&ccb->csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/NULL,
MSG_SIMPLE_Q_TAG,
rcap_buf,
SSD_FULL_SIZE,
/* timeout */20000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA|SF_NO_PRINT);
xpt_release_ccb(ccb);
softc->params.disksize = scsi_4btoul(rcap_buf->addr) + 1;
softc->params.blksize = scsi_4btoul(rcap_buf->length);
/* Make sure we got at least some block size. */
if (error == 0 && softc->params.blksize == 0)
error = EIO;
/*
* SCSI-3 mandates that the reported blocksize shall be 2048.
* Older drives sometimes report funny values; trim them down to
* 2048, or other parts of the kernel will get confused.
*
* XXX we leave drives alone that might report 512 bytes, as
* well as drives reporting more weird sizes like perhaps 4K.
*/
if (softc->params.blksize > 2048 && softc->params.blksize <= 2352)
softc->params.blksize = 2048;
free(rcap_buf, M_SCSICD);
*size = softc->params.disksize;
return (error);
}
#endif
static int
cd6byteworkaround(union ccb *ccb)
{
u_int8_t *cdb;
struct cam_periph *periph;
struct cd_softc *softc;
struct cd_mode_params *params;
int frozen, found;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct cd_softc *)periph->softc;
cdb = ccb->csio.cdb_io.cdb_bytes;
if ((ccb->ccb_h.flags & CAM_CDB_POINTER)
|| ((cdb[0] != MODE_SENSE_6)
&& (cdb[0] != MODE_SELECT_6)))
return (0);
/*
* Because there is no convenient place to stash the overall
* cd_mode_params structure pointer, we have to grab it like this.
* This means that ALL MODE_SENSE and MODE_SELECT requests in the
* cd(4) driver MUST go through cdgetmode() and cdsetmode()!
*
* XXX It would be nice if, at some point, we could increase the
* number of available peripheral private pointers. Both pointers
* are currently used in most every peripheral driver.
*/
found = 0;
STAILQ_FOREACH(params, &softc->mode_queue, links) {
if (params->mode_buf == ccb->csio.data_ptr) {
found = 1;
break;
}
}
/*
* This shouldn't happen. All mode sense and mode select
* operations in the cd(4) driver MUST go through cdgetmode() and
* cdsetmode()!
*/
if (found == 0) {
xpt_print(periph->path,
"mode buffer not found in mode queue!\n");
return (0);
}
params->cdb_size = 10;
softc->minimum_command_size = 10;
xpt_print(ccb->ccb_h.path,
"%s(6) failed, increasing minimum CDB size to 10 bytes\n",
(cdb[0] == MODE_SENSE_6) ? "MODE_SENSE" : "MODE_SELECT");
if (cdb[0] == MODE_SENSE_6) {
struct scsi_mode_sense_10 ms10;
struct scsi_mode_sense_6 *ms6;
int len;
ms6 = (struct scsi_mode_sense_6 *)cdb;
bzero(&ms10, sizeof(ms10));
ms10.opcode = MODE_SENSE_10;
ms10.byte2 = ms6->byte2;
ms10.page = ms6->page;
/*
* 10 byte mode header, block descriptor,
* sizeof(union cd_pages)
*/
len = sizeof(struct cd_mode_data_10);
ccb->csio.dxfer_len = len;
scsi_ulto2b(len, ms10.length);
ms10.control = ms6->control;
bcopy(&ms10, cdb, 10);
ccb->csio.cdb_len = 10;
} else {
struct scsi_mode_select_10 ms10;
struct scsi_mode_select_6 *ms6;
struct scsi_mode_header_6 *header6;
struct scsi_mode_header_10 *header10;
struct scsi_mode_page_header *page_header;
int blk_desc_len, page_num, page_size, len;
ms6 = (struct scsi_mode_select_6 *)cdb;
bzero(&ms10, sizeof(ms10));
ms10.opcode = MODE_SELECT_10;
ms10.byte2 = ms6->byte2;
header6 = (struct scsi_mode_header_6 *)params->mode_buf;
header10 = (struct scsi_mode_header_10 *)params->mode_buf;
page_header = find_mode_page_6(header6);
page_num = page_header->page_code;
blk_desc_len = header6->blk_desc_len;
page_size = cdgetpagesize(page_num);
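/*
 * If the drive's reported page length disagrees with our table,
 * believe the drive.
 */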
if (page_size != (page_header->page_length +
sizeof(*page_header)))
page_size = page_header->page_length +
sizeof(*page_header);
len = sizeof(*header10) + blk_desc_len + page_size;
len = min(params->alloc_len, len);
/*
* Since the 6 byte parameter header is shorter than the 10
* byte parameter header, we need to copy the actual mode
* page data, and the block descriptor, if any, so things wind
* up in the right place. The regions will overlap, but
* bcopy() does the right thing.
*/
bcopy(params->mode_buf + sizeof(*header6),
params->mode_buf + sizeof(*header10),
len - sizeof(*header10));
/* Make sure these fields are set correctly. */
scsi_ulto2b(0, header10->data_length);
header10->medium_type = 0;
scsi_ulto2b(blk_desc_len, header10->blk_desc_len);
ccb->csio.dxfer_len = len;
scsi_ulto2b(len, ms10.length);
ms10.control = ms6->control;
bcopy(&ms10, cdb, 10);
ccb->csio.cdb_len = 10;
}
frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_action(ccb);
if (frozen) {
cam_release_devq(ccb->ccb_h.path,
/*relsim_flags*/0,
/*openings*/0,
/*timeout*/0,
/*getcount_only*/0);
}
return (ERESTART);
}
static int
cderror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct cd_softc *softc;
struct cam_periph *periph;
int error, error_code, sense_key, asc, ascq;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct cd_softc *)periph->softc;
error = 0;
/*
* We use a status of CAM_REQ_INVALID as shorthand -- if a 6 byte
* CDB comes back with this particular error, try transforming it
* into the 10 byte version.
*/
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
error = cd6byteworkaround(ccb);
} else if (scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (sense_key == SSD_KEY_ILLEGAL_REQUEST)
error = cd6byteworkaround(ccb);
else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x28 && ascq == 0x00)
disk_media_changed(softc->disk, M_NOWAIT);
else if (sense_key == SSD_KEY_NOT_READY &&
asc == 0x3a && (softc->flags & CD_FLAG_SAW_MEDIA)) {
softc->flags &= ~CD_FLAG_SAW_MEDIA;
disk_media_gone(softc->disk, M_NOWAIT);
}
}
if (error == ERESTART)
return (error);
/*
* XXX
* Until we have a better way of doing pack validation,
* don't treat UAs as errors.
*/
sense_flags |= SF_RETRY_UA;
if (softc->quirks & CD_Q_RETRY_BUSY)
sense_flags |= SF_RETRY_BUSY;
return (cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
cdmediapoll(void *arg)
{
struct cam_periph *periph = arg;
struct cd_softc *softc = periph->softc;
if (softc->state == CD_STATE_NORMAL && !softc->tur &&
softc->outstanding_cmds == 0) {
if (cam_periph_acquire(periph) == 0) {
softc->tur = 1;
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
}
/* Queue us up again */
if (cd_poll_period != 0)
callout_schedule(&softc->mediapoll_c, cd_poll_period * hz);
}
/*
* Read table of contents
*/
static int
cdreadtoc(struct cam_periph *periph, u_int32_t mode, u_int32_t start,
u_int8_t *data, u_int32_t len, u_int32_t sense_flags)
{
u_int32_t ntoc;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
ntoc = len;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
scsi_read_toc(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* byte1_flags */ (mode == CD_MSF_FORMAT) ? CD_MSF : 0,
/* format */ SRTOC_FORMAT_TOC,
/* track*/ start,
/* data_ptr */ data,
/* dxfer_len */ len,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA | sense_flags);
xpt_release_ccb(ccb);
return(error);
}
static int
cdreadsubchannel(struct cam_periph *periph, u_int32_t mode,
u_int32_t format, int track,
struct cd_sub_channel_info *data, u_int32_t len)
{
struct scsi_read_subchannel *scsi_cmd;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
cam_fill_csio(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* flags */ CAM_DIR_IN,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* data_ptr */ (u_int8_t *)data,
/* dxfer_len */ len,
/* sense_len */ SSD_FULL_SIZE,
sizeof(struct scsi_read_subchannel),
/* timeout */ 50000);
scsi_cmd = (struct scsi_read_subchannel *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = READ_SUBCHANNEL;
if (mode == CD_MSF_FORMAT)
scsi_cmd->byte1 |= CD_MSF;
scsi_cmd->byte2 = SRS_SUBQ;
scsi_cmd->subchan_format = format;
scsi_cmd->track = track;
scsi_ulto2b(len, (u_int8_t *)scsi_cmd->data_len);
scsi_cmd->control = 0;
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
/*
* All MODE_SENSE requests in the cd(4) driver MUST go through this
* routine. See comments in cd6byteworkaround() for details.
*/
static int
cdgetmode(struct cam_periph *periph, struct cd_mode_params *data,
u_int32_t page)
{
struct ccb_scsiio *csio;
struct cd_softc *softc;
union ccb *ccb;
int param_len;
int error;
softc = (struct cd_softc *)periph->softc;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
data->cdb_size = softc->minimum_command_size;
if (data->cdb_size < 10)
param_len = sizeof(struct cd_mode_data);
else
param_len = sizeof(struct cd_mode_data_10);
/* Don't say we've got more room than we actually allocated */
param_len = min(param_len, data->alloc_len);
scsi_mode_sense_len(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* dbd */ 0,
/* page_code */ SMS_PAGE_CTRL_CURRENT,
/* page */ page,
/* param_buf */ data->mode_buf,
/* param_len */ param_len,
/* minimum_cmd_size */ softc->minimum_command_size,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
/*
* It would be nice not to have to do this, but there's no
* available pointer in the CCB that would allow us to stuff the
* mode params structure in there and retrieve it in
* cd6byteworkaround(), so we can set the cdb size. The cdb size
* lets the caller know what CDB size we ended up using, so they
* can find the actual mode page offset.
*/
STAILQ_INSERT_TAIL(&softc->mode_queue, data, links);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
STAILQ_REMOVE(&softc->mode_queue, data, cd_mode_params, links);
/*
* This is a bit of belt-and-suspenders checking, but if we run
* into a situation where the target sends back multiple block
* descriptors, we might not have enough space in the buffer to
* see the whole mode page. Better to return an error than
* potentially access memory beyond our malloced region.
*/
if (error == 0) {
u_int32_t data_len;
if (data->cdb_size == 10) {
struct scsi_mode_header_10 *hdr10;
hdr10 = (struct scsi_mode_header_10 *)data->mode_buf;
data_len = scsi_2btoul(hdr10->data_length);
data_len += sizeof(hdr10->data_length);
} else {
struct scsi_mode_header_6 *hdr6;
hdr6 = (struct scsi_mode_header_6 *)data->mode_buf;
data_len = hdr6->data_length;
data_len += sizeof(hdr6->data_length);
}
/*
* Complain if there is more mode data available than we
* allocated space for. This could potentially happen if
* we miscalculated the page length for some reason, if the
* drive returns multiple block descriptors, or if it sets
* the data length incorrectly.
*/
if (data_len > data->alloc_len) {
xpt_print(periph->path, "allocated modepage %d length "
"%d < returned length %d\n", page, data->alloc_len,
data_len);
error = ENOSPC;
}
}
return (error);
}
/*
* All MODE_SELECT requests in the cd(4) driver MUST go through this
* routine. See comments in cd6byteworkaround() for details.
*/
static int
cdsetmode(struct cam_periph *periph, struct cd_mode_params *data)
{
struct ccb_scsiio *csio;
struct cd_softc *softc;
union ccb *ccb;
int cdb_size, param_len;
int error;
softc = (struct cd_softc *)periph->softc;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
error = 0;
/*
* If the data is formatted for the 10 byte version of the mode
* select parameter list, we need to use the 10 byte CDB.
* Otherwise, we use the stored minimum command size.
*/
if (data->cdb_size == 10)
cdb_size = data->cdb_size;
else
cdb_size = softc->minimum_command_size;
if (cdb_size >= 10) {
struct scsi_mode_header_10 *mode_header;
u_int32_t data_len;
mode_header = (struct scsi_mode_header_10 *)data->mode_buf;
data_len = scsi_2btoul(mode_header->data_length);
scsi_ulto2b(0, mode_header->data_length);
/*
* SONY drives do not allow a mode select with a medium_type
* value that has just been returned by a mode sense; use a
* medium_type of 0 (Default) instead.
*/
mode_header->medium_type = 0;
/*
* Pass back whatever the drive passed to us, plus the size
* of the data length field.
*/
param_len = data_len + sizeof(mode_header->data_length);
} else {
struct scsi_mode_header_6 *mode_header;
mode_header = (struct scsi_mode_header_6 *)data->mode_buf;
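/* The 6 byte header's data_length excludes the single byte it occupies. */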
param_len = mode_header->data_length + 1;
mode_header->data_length = 0;
/*
* SONY drives do not allow a mode select with a medium_type
* value that has just been returned by a mode sense; use a
* medium_type of 0 (Default) instead.
*/
mode_header->medium_type = 0;
}
/* Don't say we've got more room than we actually allocated */
param_len = min(param_len, data->alloc_len);
scsi_mode_select_len(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* scsi_page_fmt */ 1,
/* save_pages */ 0,
/* param_buf */ data->mode_buf,
/* param_len */ param_len,
/* minimum_cmd_size */ cdb_size,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
/* See comments in cdgetmode() and cd6byteworkaround(). */
STAILQ_INSERT_TAIL(&softc->mode_queue, data, links);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
STAILQ_REMOVE(&softc->mode_queue, data, cd_mode_params, links);
return (error);
}
static int
cdplay(struct cam_periph *periph, u_int32_t blk, u_int32_t len)
{
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
u_int8_t cdb_len;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
/*
* Use the smallest possible command to perform the operation.
*/
if ((len & 0xffff0000) == 0) {
/*
* We can fit in a 10 byte cdb.
*/
struct scsi_play_10 *scsi_cmd;
scsi_cmd = (struct scsi_play_10 *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = PLAY_10;
scsi_ulto4b(blk, (u_int8_t *)scsi_cmd->blk_addr);
scsi_ulto2b(len, (u_int8_t *)scsi_cmd->xfer_len);
cdb_len = sizeof(*scsi_cmd);
} else {
struct scsi_play_12 *scsi_cmd;
scsi_cmd = (struct scsi_play_12 *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = PLAY_12;
scsi_ulto4b(blk, (u_int8_t *)scsi_cmd->blk_addr);
scsi_ulto4b(len, (u_int8_t *)scsi_cmd->xfer_len);
cdb_len = sizeof(*scsi_cmd);
}
cam_fill_csio(csio,
/*retries*/ cd_retry_count,
/*cbfcnp*/NULL,
/*flags*/CAM_DIR_NONE,
MSG_SIMPLE_Q_TAG,
/*dataptr*/NULL,
/*datalen*/0,
/*sense_len*/SSD_FULL_SIZE,
cdb_len,
/*timeout*/50 * 1000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdplaymsf(struct cam_periph *periph, u_int32_t startm, u_int32_t starts,
u_int32_t startf, u_int32_t endm, u_int32_t ends, u_int32_t endf)
{
struct scsi_play_msf *scsi_cmd;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
cam_fill_csio(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* flags */ CAM_DIR_NONE,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* data_ptr */ NULL,
/* dxfer_len */ 0,
/* sense_len */ SSD_FULL_SIZE,
sizeof(struct scsi_play_msf),
/* timeout */ 50000);
scsi_cmd = (struct scsi_play_msf *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = PLAY_MSF;
scsi_cmd->start_m = startm;
scsi_cmd->start_s = starts;
scsi_cmd->start_f = startf;
scsi_cmd->end_m = endm;
scsi_cmd->end_s = ends;
scsi_cmd->end_f = endf;
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdplaytracks(struct cam_periph *periph, u_int32_t strack, u_int32_t sindex,
u_int32_t etrack, u_int32_t eindex)
{
struct scsi_play_track *scsi_cmd;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
cam_fill_csio(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* flags */ CAM_DIR_NONE,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* data_ptr */ NULL,
/* dxfer_len */ 0,
/* sense_len */ SSD_FULL_SIZE,
sizeof(struct scsi_play_track),
/* timeout */ 50000);
scsi_cmd = (struct scsi_play_track *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = PLAY_TRACK;
scsi_cmd->start_track = strack;
scsi_cmd->start_index = sindex;
scsi_cmd->end_track = etrack;
scsi_cmd->end_index = eindex;
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdpause(struct cam_periph *periph, u_int32_t go)
{
struct scsi_pause *scsi_cmd;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
cam_fill_csio(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* flags */ CAM_DIR_NONE,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* data_ptr */ NULL,
/* dxfer_len */ 0,
/* sense_len */ SSD_FULL_SIZE,
sizeof(struct scsi_pause),
/* timeout */ 50000);
scsi_cmd = (struct scsi_pause *)&csio->cdb_io.cdb_bytes;
bzero (scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = PAUSE;
scsi_cmd->resume = go;
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdstartunit(struct cam_periph *periph, int load)
{
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_start_stop(&ccb->csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* start */ TRUE,
/* load_eject */ load,
/* immediate */ FALSE,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdstopunit(struct cam_periph *periph, u_int32_t eject)
{
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_start_stop(&ccb->csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* start */ FALSE,
/* load_eject */ eject,
/* immediate */ FALSE,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdsetspeed(struct cam_periph *periph, u_int32_t rdspeed, u_int32_t wrspeed)
{
struct scsi_set_speed *scsi_cmd;
struct ccb_scsiio *csio;
union ccb *ccb;
int error;
error = 0;
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
/* Preserve old behavior: units in multiples of CDROM speed */
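/* (1x CD speed is roughly 176.4 kB/s, hence the factor of 177.) */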
if (rdspeed < 177)
rdspeed *= 177;
if (wrspeed < 177)
wrspeed *= 177;
cam_fill_csio(csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* flags */ CAM_DIR_NONE,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* data_ptr */ NULL,
/* dxfer_len */ 0,
/* sense_len */ SSD_FULL_SIZE,
sizeof(struct scsi_set_speed),
/* timeout */ 50000);
scsi_cmd = (struct scsi_set_speed *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = SET_CD_SPEED;
scsi_ulto2b(rdspeed, scsi_cmd->readspeed);
scsi_ulto2b(wrspeed, scsi_cmd->writespeed);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
return(error);
}
static int
cdreportkey(struct cam_periph *periph, struct dvd_authinfo *authinfo)
{
union ccb *ccb;
u_int8_t *databuf;
u_int32_t lba;
int error;
int length;
error = 0;
databuf = NULL;
lba = 0;
switch (authinfo->format) {
case DVD_REPORT_AGID:
length = sizeof(struct scsi_report_key_data_agid);
break;
case DVD_REPORT_CHALLENGE:
length = sizeof(struct scsi_report_key_data_challenge);
break;
case DVD_REPORT_KEY1:
length = sizeof(struct scsi_report_key_data_key1_key2);
break;
case DVD_REPORT_TITLE_KEY:
length = sizeof(struct scsi_report_key_data_title);
/* The lba field is only set for the title key */
lba = authinfo->lba;
break;
case DVD_REPORT_ASF:
length = sizeof(struct scsi_report_key_data_asf);
break;
case DVD_REPORT_RPC:
length = sizeof(struct scsi_report_key_data_rpc);
break;
case DVD_INVALIDATE_AGID:
length = 0;
break;
default:
return (EINVAL);
}
if (length != 0) {
databuf = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO);
} else
databuf = NULL;
cam_periph_lock(periph);
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_report_key(&ccb->csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* lba */ lba,
/* agid */ authinfo->agid,
/* key_format */ authinfo->format,
/* data_ptr */ databuf,
/* dxfer_len */ length,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
if (error != 0)
goto bailout;
if (ccb->csio.resid != 0) {
xpt_print(periph->path, "warning, residual for report key "
"command is %d\n", ccb->csio.resid);
}
switch(authinfo->format) {
case DVD_REPORT_AGID: {
struct scsi_report_key_data_agid *agid_data;
agid_data = (struct scsi_report_key_data_agid *)databuf;
authinfo->agid = (agid_data->agid & RKD_AGID_MASK) >>
RKD_AGID_SHIFT;
break;
}
case DVD_REPORT_CHALLENGE: {
struct scsi_report_key_data_challenge *chal_data;
chal_data = (struct scsi_report_key_data_challenge *)databuf;
bcopy(chal_data->challenge_key, authinfo->keychal,
min(sizeof(chal_data->challenge_key),
sizeof(authinfo->keychal)));
break;
}
case DVD_REPORT_KEY1: {
struct scsi_report_key_data_key1_key2 *key1_data;
key1_data = (struct scsi_report_key_data_key1_key2 *)databuf;
bcopy(key1_data->key1, authinfo->keychal,
min(sizeof(key1_data->key1), sizeof(authinfo->keychal)));
break;
}
case DVD_REPORT_TITLE_KEY: {
struct scsi_report_key_data_title *title_data;
title_data = (struct scsi_report_key_data_title *)databuf;
authinfo->cpm = (title_data->byte0 & RKD_TITLE_CPM) >>
RKD_TITLE_CPM_SHIFT;
authinfo->cp_sec = (title_data->byte0 & RKD_TITLE_CP_SEC) >>
RKD_TITLE_CP_SEC_SHIFT;
authinfo->cgms = (title_data->byte0 & RKD_TITLE_CMGS_MASK) >>
RKD_TITLE_CMGS_SHIFT;
bcopy(title_data->title_key, authinfo->keychal,
min(sizeof(title_data->title_key),
sizeof(authinfo->keychal)));
break;
}
case DVD_REPORT_ASF: {
struct scsi_report_key_data_asf *asf_data;
asf_data = (struct scsi_report_key_data_asf *)databuf;
authinfo->asf = asf_data->success & RKD_ASF_SUCCESS;
break;
}
case DVD_REPORT_RPC: {
struct scsi_report_key_data_rpc *rpc_data;
rpc_data = (struct scsi_report_key_data_rpc *)databuf;
authinfo->reg_type = (rpc_data->byte4 & RKD_RPC_TYPE_MASK) >>
RKD_RPC_TYPE_SHIFT;
authinfo->vend_rsts =
(rpc_data->byte4 & RKD_RPC_VENDOR_RESET_MASK) >>
RKD_RPC_VENDOR_RESET_SHIFT;
authinfo->user_rsts = rpc_data->byte4 & RKD_RPC_USER_RESET_MASK;
authinfo->region = rpc_data->region_mask;
authinfo->rpc_scheme = rpc_data->rpc_scheme1;
break;
}
case DVD_INVALIDATE_AGID:
break;
default:
/* This should be impossible, since we checked above */
error = EINVAL;
goto bailout;
break; /* NOTREACHED */
}
bailout:
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
if (databuf != NULL)
free(databuf, M_DEVBUF);
return(error);
}
static int
cdsendkey(struct cam_periph *periph, struct dvd_authinfo *authinfo)
{
union ccb *ccb;
u_int8_t *databuf;
int length;
int error;
error = 0;
databuf = NULL;
switch(authinfo->format) {
case DVD_SEND_CHALLENGE: {
struct scsi_report_key_data_challenge *challenge_data;
length = sizeof(*challenge_data);
challenge_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO);
databuf = (u_int8_t *)challenge_data;
scsi_ulto2b(length - sizeof(challenge_data->data_len),
challenge_data->data_len);
bcopy(authinfo->keychal, challenge_data->challenge_key,
min(sizeof(authinfo->keychal),
sizeof(challenge_data->challenge_key)));
break;
}
case DVD_SEND_KEY2: {
struct scsi_report_key_data_key1_key2 *key2_data;
length = sizeof(*key2_data);
key2_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO);
databuf = (u_int8_t *)key2_data;
scsi_ulto2b(length - sizeof(key2_data->data_len),
key2_data->data_len);
bcopy(authinfo->keychal, key2_data->key1,
min(sizeof(authinfo->keychal), sizeof(key2_data->key1)));
break;
}
case DVD_SEND_RPC: {
struct scsi_send_key_data_rpc *rpc_data;
length = sizeof(*rpc_data);
rpc_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO);
databuf = (u_int8_t *)rpc_data;
scsi_ulto2b(length - sizeof(rpc_data->data_len),
rpc_data->data_len);
rpc_data->region_code = authinfo->region;
break;
}
default:
return (EINVAL);
}
cam_periph_lock(periph);
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_send_key(&ccb->csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* agid */ authinfo->agid,
/* key_format */ authinfo->format,
/* data_ptr */ databuf,
/* dxfer_len */ length,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
if (databuf != NULL)
free(databuf, M_DEVBUF);
return(error);
}
static int
cdreaddvdstructure(struct cam_periph *periph, struct dvd_struct *dvdstruct)
{
union ccb *ccb;
u_int8_t *databuf;
u_int32_t address;
int error;
int length;
error = 0;
databuf = NULL;
/* The address is reserved for many of the formats */
address = 0;
switch(dvdstruct->format) {
case DVD_STRUCT_PHYSICAL:
length = sizeof(struct scsi_read_dvd_struct_data_physical);
break;
case DVD_STRUCT_COPYRIGHT:
length = sizeof(struct scsi_read_dvd_struct_data_copyright);
break;
case DVD_STRUCT_DISCKEY:
length = sizeof(struct scsi_read_dvd_struct_data_disc_key);
break;
case DVD_STRUCT_BCA:
length = sizeof(struct scsi_read_dvd_struct_data_bca);
break;
case DVD_STRUCT_MANUFACT:
length = sizeof(struct scsi_read_dvd_struct_data_manufacturer);
break;
case DVD_STRUCT_CMI:
return (ENODEV);
case DVD_STRUCT_PROTDISCID:
length = sizeof(struct scsi_read_dvd_struct_data_prot_discid);
break;
case DVD_STRUCT_DISCKEYBLOCK:
length = sizeof(struct scsi_read_dvd_struct_data_disc_key_blk);
break;
case DVD_STRUCT_DDS:
length = sizeof(struct scsi_read_dvd_struct_data_dds);
break;
case DVD_STRUCT_MEDIUM_STAT:
length = sizeof(struct scsi_read_dvd_struct_data_medium_status);
break;
case DVD_STRUCT_SPARE_AREA:
length = sizeof(struct scsi_read_dvd_struct_data_spare_area);
break;
case DVD_STRUCT_RMD_LAST:
return (ENODEV);
case DVD_STRUCT_RMD_RMA:
return (ENODEV);
case DVD_STRUCT_PRERECORDED:
length = sizeof(struct scsi_read_dvd_struct_data_leadin);
break;
case DVD_STRUCT_UNIQUEID:
length = sizeof(struct scsi_read_dvd_struct_data_disc_id);
break;
case DVD_STRUCT_DCB:
return (ENODEV);
case DVD_STRUCT_LIST:
/*
* This is the maximum allocation length for the READ DVD
* STRUCTURE command. There's nothing in the MMC3 spec
* that indicates a limit in the amount of data that can
* be returned from this call, other than the limits
* imposed by the 2-byte length variables.
*/
length = 65535;
break;
default:
return (EINVAL);
}
if (length != 0) {
databuf = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO);
} else
databuf = NULL;
cam_periph_lock(periph);
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_read_dvd_structure(&ccb->csio,
/* retries */ cd_retry_count,
/* cbfcnp */ NULL,
/* tag_action */ MSG_SIMPLE_Q_TAG,
/* lba */ address,
/* layer_number */ dvdstruct->layer_num,
/* key_format */ dvdstruct->format,
/* agid */ dvdstruct->agid,
/* data_ptr */ databuf,
/* dxfer_len */ length,
/* sense_len */ SSD_FULL_SIZE,
/* timeout */ 50000);
error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO,
/*sense_flags*/SF_RETRY_UA);
if (error != 0)
goto bailout;
switch(dvdstruct->format) {
case DVD_STRUCT_PHYSICAL: {
struct scsi_read_dvd_struct_data_layer_desc *inlayer;
struct dvd_layer *outlayer;
struct scsi_read_dvd_struct_data_physical *phys_data;
phys_data =
(struct scsi_read_dvd_struct_data_physical *)databuf;
inlayer = &phys_data->layer_desc;
outlayer = (struct dvd_layer *)&dvdstruct->data;
dvdstruct->length = sizeof(*inlayer);
outlayer->book_type = (inlayer->book_type_version &
RDSD_BOOK_TYPE_MASK) >> RDSD_BOOK_TYPE_SHIFT;
outlayer->book_version = (inlayer->book_type_version &
RDSD_BOOK_VERSION_MASK);
outlayer->disc_size = (inlayer->disc_size_max_rate &
RDSD_DISC_SIZE_MASK) >> RDSD_DISC_SIZE_SHIFT;
outlayer->max_rate = (inlayer->disc_size_max_rate &
RDSD_MAX_RATE_MASK);
outlayer->nlayers = (inlayer->layer_info &
RDSD_NUM_LAYERS_MASK) >> RDSD_NUM_LAYERS_SHIFT;
outlayer->track_path = (inlayer->layer_info &
RDSD_TRACK_PATH_MASK) >> RDSD_TRACK_PATH_SHIFT;
outlayer->layer_type = (inlayer->layer_info &
RDSD_LAYER_TYPE_MASK);
outlayer->linear_density = (inlayer->density &
RDSD_LIN_DENSITY_MASK) >> RDSD_LIN_DENSITY_SHIFT;
outlayer->track_density = (inlayer->density &
RDSD_TRACK_DENSITY_MASK);
outlayer->bca = (inlayer->bca & RDSD_BCA_MASK) >>
RDSD_BCA_SHIFT;
outlayer->start_sector = scsi_3btoul(inlayer->main_data_start);
outlayer->end_sector = scsi_3btoul(inlayer->main_data_end);
outlayer->end_sector_l0 =
scsi_3btoul(inlayer->end_sector_layer0);
break;
}
case DVD_STRUCT_COPYRIGHT: {
struct scsi_read_dvd_struct_data_copyright *copy_data;
copy_data = (struct scsi_read_dvd_struct_data_copyright *)
databuf;
dvdstruct->cpst = copy_data->cps_type;
dvdstruct->rmi = copy_data->region_info;
dvdstruct->length = 0;
break;
}
default:
/*
* Tell the user what the overall length is, no matter
* what we can actually fit in the data buffer.
*/
dvdstruct->length = length - ccb->csio.resid -
sizeof(struct scsi_read_dvd_struct_data_header);
/*
* But only actually copy out the smaller of what we read
* in or what the structure can take.
*/
bcopy(databuf + sizeof(struct scsi_read_dvd_struct_data_header),
dvdstruct->data,
min(sizeof(dvdstruct->data), dvdstruct->length));
break;
}
bailout:
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
if (databuf != NULL)
free(databuf, M_DEVBUF);
return(error);
}
void
scsi_report_key(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int32_t lba, u_int8_t agid,
u_int8_t key_format, u_int8_t *data_ptr, u_int32_t dxfer_len,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_report_key *scsi_cmd;
scsi_cmd = (struct scsi_report_key *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = REPORT_KEY;
scsi_ulto4b(lba, scsi_cmd->lba);
scsi_ulto2b(dxfer_len, scsi_cmd->alloc_len);
scsi_cmd->agid_keyformat = (agid << RK_KF_AGID_SHIFT) |
(key_format & RK_KF_KEYFORMAT_MASK);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len == 0) ? CAM_DIR_NONE : CAM_DIR_IN,
tag_action,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_send_key(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t agid, u_int8_t key_format,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_send_key *scsi_cmd;
scsi_cmd = (struct scsi_send_key *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = SEND_KEY;
scsi_ulto2b(dxfer_len, scsi_cmd->param_len);
scsi_cmd->agid_keyformat = (agid << RK_KF_AGID_SHIFT) |
(key_format & RK_KF_KEYFORMAT_MASK);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_OUT,
tag_action,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_read_dvd_structure(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int32_t address,
u_int8_t layer_number, u_int8_t format, u_int8_t agid,
u_int8_t *data_ptr, u_int32_t dxfer_len,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_read_dvd_structure *scsi_cmd;
scsi_cmd = (struct scsi_read_dvd_structure *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = READ_DVD_STRUCTURE;
scsi_ulto4b(address, scsi_cmd->address);
scsi_cmd->layer_number = layer_number;
scsi_cmd->format = format;
scsi_ulto2b(dxfer_len, scsi_cmd->alloc_len);
/* The AGID is the top two bits of this byte */
scsi_cmd->agid = agid << 6;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_read_toc(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t byte1_flags, uint8_t format,
uint8_t track, uint8_t *data_ptr, uint32_t dxfer_len,
int sense_len, int timeout)
{
struct scsi_read_toc *scsi_cmd;
scsi_cmd = (struct scsi_read_toc *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->op_code = READ_TOC;
/*
 * The structure calls this field byte 2 (counting from 1), while the
 * function argument counts from 0; the spec counts from 0.  In MMC-6 there
 * is only one flag defined here, the MSF flag, but we pass the whole byte
 * through for a bit of future-proofing.
*/
scsi_cmd->byte2 = byte1_flags;
scsi_cmd->format = format;
scsi_cmd->from_track = track;
scsi_ulto2b(dxfer_len, scsi_cmd->data_len);
cam_fill_csio(csio,
/* retries */ retries,
/* cbfcnp */ cbfcnp,
/* flags */ CAM_DIR_IN,
/* tag_action */ tag_action,
/* data_ptr */ data_ptr,
/* dxfer_len */ dxfer_len,
/* sense_len */ sense_len,
sizeof(*scsi_cmd),
/* timeout */ timeout);
}
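/*
 * Illustrative usage sketch (not part of the driver): a caller in cd(4)
 * would typically fill and dispatch a READ TOC request along these lines,
 * mirroring the READ DVD STRUCTURE path above.  The "periph", "databuf",
 * "length" and "error" variables are assumed to be supplied by that caller;
 * format 0 requests the fully formatted TOC and a zero byte1_flags selects
 * LBA rather than MSF addressing.
 *
 *	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 *	scsi_read_toc(&ccb->csio, cd_retry_count, NULL, MSG_SIMPLE_Q_TAG,
 *	    0, 0, 0, databuf, length, SSD_FULL_SIZE, 50000);
 *	error = cdrunccb(ccb, cderror, CAM_RETRY_SELTO, SF_RETRY_UA);
 *	xpt_release_ccb(ccb);
 */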
diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c
index ac1d9456a5d1..490f75336efd 100644
--- a/sys/cam/scsi/scsi_da.c
+++ b/sys/cam/scsi/scsi_da.c
@@ -1,6639 +1,6639 @@
/*-
* Implementation of SCSI Direct Access Peripheral driver for CAM.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 Justin T. Gibbs.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#ifdef _KERNEL
#include "opt_da.h"
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
#include <sys/endian.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <machine/atomic.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#ifdef _KERNEL
#include <cam/cam_xpt_internal.h>
#endif /* _KERNEL */
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_da.h>
#ifdef _KERNEL
/*
* Note that there are probe ordering dependencies here. The order isn't
* controlled by this enumeration, but by explicit state transitions in
* dastart() and dadone(). Here are some of the dependencies:
*
* 1. RC should come first, before RC16, unless there is evidence that RC16
* is supported.
* 2. BDC needs to come before any of the ATA probes, or the ZONE probe.
* 3. The ATA probes should go in this order:
* ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE
*/
typedef enum {
DA_STATE_PROBE_WP,
DA_STATE_PROBE_RC,
DA_STATE_PROBE_RC16,
DA_STATE_PROBE_LBP,
DA_STATE_PROBE_BLK_LIMITS,
DA_STATE_PROBE_BDC,
DA_STATE_PROBE_ATA,
DA_STATE_PROBE_ATA_LOGDIR,
DA_STATE_PROBE_ATA_IDDIR,
DA_STATE_PROBE_ATA_SUP,
DA_STATE_PROBE_ATA_ZONE,
DA_STATE_PROBE_ZONE,
DA_STATE_NORMAL
} da_state;
typedef enum {
DA_FLAG_PACK_INVALID = 0x000001,
DA_FLAG_NEW_PACK = 0x000002,
DA_FLAG_PACK_LOCKED = 0x000004,
DA_FLAG_PACK_REMOVABLE = 0x000008,
DA_FLAG_ROTATING = 0x000010,
DA_FLAG_NEED_OTAG = 0x000020,
DA_FLAG_WAS_OTAG = 0x000040,
DA_FLAG_RETRY_UA = 0x000080,
DA_FLAG_OPEN = 0x000100,
DA_FLAG_SCTX_INIT = 0x000200,
DA_FLAG_CAN_RC16 = 0x000400,
DA_FLAG_PROBED = 0x000800,
DA_FLAG_DIRTY = 0x001000,
DA_FLAG_ANNOUNCED = 0x002000,
DA_FLAG_CAN_ATA_DMA = 0x004000,
DA_FLAG_CAN_ATA_LOG = 0x008000,
DA_FLAG_CAN_ATA_IDLOG = 0x010000,
DA_FLAG_CAN_ATA_SUPCAP = 0x020000,
DA_FLAG_CAN_ATA_ZONE = 0x040000,
DA_FLAG_TUR_PENDING = 0x080000,
DA_FLAG_UNMAPPEDIO = 0x100000
} da_flags;
#define DA_FLAG_STRING \
"\020" \
"\001PACK_INVALID" \
"\002NEW_PACK" \
"\003PACK_LOCKED" \
"\004PACK_REMOVABLE" \
"\005ROTATING" \
"\006NEED_OTAG" \
"\007WAS_OTAG" \
"\010RETRY_UA" \
"\011OPEN" \
"\012SCTX_INIT" \
"\013CAN_RC16" \
"\014PROBED" \
"\015DIRTY" \
"\016ANNOUCNED" \
"\017CAN_ATA_DMA" \
"\020CAN_ATA_LOG" \
"\021CAN_ATA_IDLOG" \
"\022CAN_ATA_SUPACP" \
"\023CAN_ATA_ZONE" \
"\024TUR_PENDING" \
"\025UNMAPPEDIO"
typedef enum {
DA_Q_NONE = 0x00,
DA_Q_NO_SYNC_CACHE = 0x01,
DA_Q_NO_6_BYTE = 0x02,
DA_Q_NO_PREVENT = 0x04,
DA_Q_4K = 0x08,
DA_Q_NO_RC16 = 0x10,
DA_Q_NO_UNMAP = 0x20,
DA_Q_RETRY_BUSY = 0x40,
DA_Q_SMR_DM = 0x80,
DA_Q_STRICT_UNMAP = 0x100,
DA_Q_128KB = 0x200
} da_quirks;
#define DA_Q_BIT_STRING \
"\020" \
"\001NO_SYNC_CACHE" \
"\002NO_6_BYTE" \
"\003NO_PREVENT" \
"\0044K" \
"\005NO_RC16" \
"\006NO_UNMAP" \
"\007RETRY_BUSY" \
"\010SMR_DM" \
"\011STRICT_UNMAP" \
"\012128KB"
typedef enum {
DA_CCB_PROBE_RC = 0x01,
DA_CCB_PROBE_RC16 = 0x02,
DA_CCB_PROBE_LBP = 0x03,
DA_CCB_PROBE_BLK_LIMITS = 0x04,
DA_CCB_PROBE_BDC = 0x05,
DA_CCB_PROBE_ATA = 0x06,
DA_CCB_BUFFER_IO = 0x07,
DA_CCB_DUMP = 0x0A,
DA_CCB_DELETE = 0x0B,
DA_CCB_TUR = 0x0C,
DA_CCB_PROBE_ZONE = 0x0D,
DA_CCB_PROBE_ATA_LOGDIR = 0x0E,
DA_CCB_PROBE_ATA_IDDIR = 0x0F,
DA_CCB_PROBE_ATA_SUP = 0x10,
DA_CCB_PROBE_ATA_ZONE = 0x11,
DA_CCB_PROBE_WP = 0x12,
DA_CCB_TYPE_MASK = 0x1F,
DA_CCB_RETRY_UA = 0x20
} da_ccb_state;
/*
 * Order here is important for method choice.
 *
 * We prefer ATA_TRIM: tests run against a Sandforce 2281 SSD attached to an
 * LSI 2008 (mps) controller (FW: v12, Drv: v14) showed deletes roughly 20%
 * quicker with ATA_TRIM than with the corresponding UNMAP for a real-world
 * mysql import taking 5 minutes.
*
*/
typedef enum {
DA_DELETE_NONE,
DA_DELETE_DISABLE,
DA_DELETE_ATA_TRIM,
DA_DELETE_UNMAP,
DA_DELETE_WS16,
DA_DELETE_WS10,
DA_DELETE_ZERO,
DA_DELETE_MIN = DA_DELETE_ATA_TRIM,
DA_DELETE_MAX = DA_DELETE_ZERO
} da_delete_methods;
/*
* For SCSI, host managed drives show up as a separate device type. For
* ATA, host managed drives also have a different device signature.
* XXX KDM figure out the ATA host managed signature.
*/
typedef enum {
DA_ZONE_NONE = 0x00,
DA_ZONE_DRIVE_MANAGED = 0x01,
DA_ZONE_HOST_AWARE = 0x02,
DA_ZONE_HOST_MANAGED = 0x03
} da_zone_mode;
/*
* We distinguish between these interface cases in addition to the drive type:
* o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC
* o ATA drive behind a SCSI translation layer that does not know about
* ZBC/ZAC, and so needs to be managed via ATA passthrough. In this
* case, we would need to share the ATA code with the ada(4) driver.
* o SCSI drive.
*/
typedef enum {
DA_ZONE_IF_SCSI,
DA_ZONE_IF_ATA_PASS,
DA_ZONE_IF_ATA_SAT,
} da_zone_interface;
typedef enum {
DA_ZONE_FLAG_RZ_SUP = 0x0001,
DA_ZONE_FLAG_OPEN_SUP = 0x0002,
DA_ZONE_FLAG_CLOSE_SUP = 0x0004,
DA_ZONE_FLAG_FINISH_SUP = 0x0008,
DA_ZONE_FLAG_RWP_SUP = 0x0010,
DA_ZONE_FLAG_SUP_MASK = (DA_ZONE_FLAG_RZ_SUP |
DA_ZONE_FLAG_OPEN_SUP |
DA_ZONE_FLAG_CLOSE_SUP |
DA_ZONE_FLAG_FINISH_SUP |
DA_ZONE_FLAG_RWP_SUP),
DA_ZONE_FLAG_URSWRZ = 0x0020,
DA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
DA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
DA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
DA_ZONE_FLAG_SET_MASK = (DA_ZONE_FLAG_OPT_SEQ_SET |
DA_ZONE_FLAG_OPT_NONSEQ_SET |
DA_ZONE_FLAG_MAX_SEQ_SET)
} da_zone_flags;
static struct da_zone_desc {
da_zone_flags value;
const char *desc;
} da_zone_desc_table[] = {
{DA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{DA_ZONE_FLAG_OPEN_SUP, "Open" },
{DA_ZONE_FLAG_CLOSE_SUP, "Close" },
{DA_ZONE_FLAG_FINISH_SUP, "Finish" },
{DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb,
struct bio *bp);
static da_delete_func_t da_delete_trim;
static da_delete_func_t da_delete_unmap;
static da_delete_func_t da_delete_ws;
static const void * da_delete_functions[] = {
NULL,
NULL,
da_delete_trim,
da_delete_unmap,
da_delete_ws,
da_delete_ws,
da_delete_ws
};
static const char *da_delete_method_names[] =
{ "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" };
static const char *da_delete_method_desc[] =
{ "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP",
"WRITE SAME(10) with UNMAP", "ZERO" };
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
struct disk_params {
u_int8_t heads;
u_int32_t cylinders;
u_int8_t secs_per_track;
u_int32_t secsize; /* Number of bytes/sector */
u_int64_t sectors; /* total number sectors */
u_int stripesize;
u_int stripeoffset;
};
#define UNMAP_RANGE_MAX 0xffffffff
#define UNMAP_HEAD_SIZE 8
#define UNMAP_RANGE_SIZE 16
#define UNMAP_MAX_RANGES 2048 /* Protocol Max is 4095 */
#define UNMAP_BUF_SIZE ((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \
UNMAP_HEAD_SIZE)
#define WS10_MAX_BLKS 0xffff
#define WS16_MAX_BLKS 0xffffffff
#define ATA_TRIM_MAX_RANGES ((UNMAP_BUF_SIZE / \
(ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE)
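/*
 * Worked out with the values above, UNMAP_BUF_SIZE is 2048 * 16 + 8 =
 * 32776 bytes.  Assuming the stock ATA_DSM_RANGE_SIZE (8 bytes) and
 * ATA_DSM_BLK_SIZE (512 bytes), ATA_TRIM_MAX_RANGES then works out to
 * (32776 / 4096) * 512 = 4096 DSM ranges per TRIM request.
 */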
#define DA_WORK_TUR (1 << 16)
typedef enum {
DA_REF_OPEN = 1,
DA_REF_OPEN_HOLD,
DA_REF_CLOSE_HOLD,
DA_REF_PROBE_HOLD,
DA_REF_TUR,
DA_REF_GEOM,
DA_REF_SYSCTL,
DA_REF_REPROBE,
DA_REF_MAX /* KEEP LAST */
} da_ref_token;
struct da_softc {
struct cam_iosched_softc *cam_iosched;
struct bio_queue_head delete_run_queue;
LIST_HEAD(, ccb_hdr) pending_ccbs;
int refcount; /* Active xpt_action() calls */
da_state state;
da_flags flags;
da_quirks quirks;
int minimum_cmd_size;
int error_inject;
int trim_max_ranges;
int delete_available; /* Delete methods possibly available */
da_zone_mode zone_mode;
da_zone_interface zone_interface;
da_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
u_int maxio;
uint32_t unmap_max_ranges;
uint32_t unmap_max_lba; /* Max LBAs in UNMAP req */
uint32_t unmap_gran;
uint32_t unmap_gran_align;
uint64_t ws_max_blks;
uint64_t trim_count;
uint64_t trim_ranges;
uint64_t trim_lbas;
da_delete_methods delete_method_pref;
da_delete_methods delete_method;
da_delete_func_t *delete_func;
int p_type;
struct disk_params params;
struct disk *disk;
union ccb saved_ccb;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
uint64_t wwpn;
uint8_t unmap_buf[UNMAP_BUF_SIZE];
struct scsi_read_capacity_data_long rcaplong;
struct callout mediapoll_c;
int ref_flags[DA_REF_MAX];
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int errors;
u_int timeouts;
u_int invalidations;
#endif
#define DA_ANNOUNCETMP_SZ 160
char announce_temp[DA_ANNOUNCETMP_SZ];
#define DA_ANNOUNCE_SZ 400
char announcebuf[DA_ANNOUNCE_SZ];
};
#define dadeleteflag(softc, delete_method, enable) \
if (enable) { \
softc->delete_available |= (1 << delete_method); \
} else { \
softc->delete_available &= ~(1 << delete_method); \
}
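/*
 * For example, dadeleteflag(softc, DA_DELETE_UNMAP, 1) sets bit
 * (1 << DA_DELETE_UNMAP) in softc->delete_available, marking UNMAP as an
 * available delete method; calling it with 0 as the last argument clears
 * that bit again.
 */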
struct da_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
da_quirks quirks;
};
static const char quantum[] = "QUANTUM";
static const char microp[] = "MICROP";
static struct da_quirk_entry da_quirk_table[] =
{
/* SPI, FC devices */
{
/*
* Fujitsu M2513A MO drives.
* Tested devices: M2513A2 firmware versions 1200 & 1300.
* (dip switch selects whether T_DIRECT or T_OPTICAL device)
* Reported by: W.Scholten <whs@xs4all.nl>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/* See above. */
{T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This particular Fujitsu drive doesn't like the
* synchronize cache command.
* Reported by: Tom Jackson <toj@gorilla.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Matthew Jacob <mjacob@feral.com>
* in NetBSD PR kern/6027, August 24, 1998.
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Hellmuth Michaelis (hm@kts.org)
* (PR 8882).
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't work correctly with 6 byte reads/writes.
* Returns illegal request, and points to byte 9 of the
* 6-byte CDB.
* Reported by: Adam McDougall <bsdx@spawnet.com>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/* See above. */
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The CISS RAID controllers do not support SYNC_CACHE
*/
{T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The STEC SSDs sometimes hang on UNMAP.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"},
/*quirks*/ DA_Q_NO_UNMAP
},
{
/*
 * VMware returns BUSY status when storage has transient
 * connectivity problems, so it is better to wait.
 * VMware also returns odd errors on misaligned UNMAPs.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"},
/*quirks*/ DA_Q_RETRY_BUSY | DA_Q_STRICT_UNMAP
},
/* USB mass storage devices supported by umass(4) */
{
/*
* EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player
* PR: kern/51675
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Power Quotient Int. (PQI) USB flash key
* PR: kern/53067
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative Nomad MUVO mp3 player (USB)
* PR: kern/53094
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Jungsoft NEXDISK USB flash key
* PR: kern/54737
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* FreeDik USB Mini Data Drive
* PR: kern/54786
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sigmatel USB Flash MP3 Player
* PR: kern/57046
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Neuros USB Digital Audio Computer
* PR: kern/63645
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SEAGRAND NP-900 MP3 Player
* PR: kern/64563
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* iRiver iFP MP3 player (with UMS Firmware)
* PR: kern/54881, i386/63941, kern/66124
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01
* PR: kern/70158
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ZICPlay USB MP3 Player with FM
* PR: kern/75057
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TEAC USB floppy mechanisms
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler II+ USB Pen-Drive.
* Reported by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* USB DISK Pro PMAP
* Reported by: jhs
* PR: usb/96381
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Motorola E398 Mobile Phone (TransFlash memory card).
* Reported by: Wojciech A. Koszek <dunstan@FreeBSD.czest.pl>
* PR: usb/89889
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Qware BeatZkey! Pro
* PR: usb/79164
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Time DPA20B 1GB MP3 Player
* PR: usb/81846
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung USB key 128Mb
* PR: usb/90081
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler 2.0 USB Flash memory.
* PR: usb/89196
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative MUVO Slim mp3 player (USB)
* PR: usb/86131
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3)
* PR: usb/80487
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SanDisk Micro Cruzer 128MB
* PR: usb/75970
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TOSHIBA TransMemory USB sticks
* PR: kern/94660
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* PNY USB 3.0 Flash Drives
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16
},
{
/*
* PNY USB Flash keys
* PR: usb/75578, usb/72344, usb/65436
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Genesys GL3224
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"120?"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_4K | DA_Q_NO_RC16
},
{
/*
* Genesys 6-in-1 Card Reader
* PR: usb/94647
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Rekam Digital CAMERA
* PR: usb/98713
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver H10 MP3 player
* PR: usb/102547
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver U10 MP3 player
* PR: usb/92306
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* X-Micro Flash Disk
* PR: usb/96901
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* EasyMP3 EM732X USB 2.0 Flash MP3 Player
* PR: usb/96546
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*",
"1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Denver MP3 player
* PR: usb/107101
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Philips USB Key Audio KEY013
* PR: usb/68412
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
/*
* JNC MP3 Player
* PR: usb/94439
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SAMSUNG MP0402H
* PR: usb/108427
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* I/O Magic USB flash - Giga Bank
* PR: usb/108810
*/
{T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* JoyFly 128mb USB Flash Drive
* PR: 96133
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ChipsBnk usb stick
* PR: 103702
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A
* PR: 129858
*/
{T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung YP-U3 mp3-player
* PR: 125398
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*",
"2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sony Cyber-Shot DSC cameras
* PR: usb/137035
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3",
"1.00"}, /*quirks*/ DA_Q_NO_PREVENT
},
{
/* At least several Transcend USB sticks lie on RC16. */
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
{
/*
* I-O Data USB Flash Disk
* PR: usb/211716
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
{
/*
* SLC CHIPFANCIER USB drives
* PR: usb/234503 (RC10 right, RC16 wrong)
* 16GB, 32GB and 128GB confirmed to have same issue
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "*SLC", "CHIPFANCIER",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
/* ATA/SATA devices over SAS/USB/... */
{
/* Sandisk X400 */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SanDisk SD8SB8U1*", "*" },
/*quirks*/DA_Q_128KB
},
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Micron Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Olympus digital cameras (C-3040ZOOM, C-2040ZOOM, C-1)
* PR: usb/97472
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "C*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE | DA_Q_NO_SYNC_CACHE
},
{
/*
* Olympus digital cameras (D-370)
* PR: usb/97472
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "D*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/*
* Olympus digital cameras (E-100RS, E-10).
* PR: usb/97472
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "E*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE | DA_Q_NO_SYNC_CACHE
},
{
/*
* Olympus FE-210 camera
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Pentax Digital Camera
* PR: usb/93389
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PENTAX", "DIGITAL CAMERA",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* LG UP3S MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Laser MP3-2GA13 MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
 * LaCie external 250GB hard drive, design by Porsche
* Submitted by: Ben Stuyts <ben@altesco.nl>
* PR: 121474
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
/* SATA SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/DA_Q_4K
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 750 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 750*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 840 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 845 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 845*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 850 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
 * Same as for SAMSUNG MZ7*, but also enable the quirks for SSDs
 * whose product string starts with MZ7*
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
 * Same as above, but enable the quirks for SAMSUNG MZ7* SSDs
 * connected via a SATA-to-SAS interposer, which therefore report
 * without the leading "ATA"
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Hama Innostor USB-Stick
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" },
/*quirks*/DA_Q_NO_RC16
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS000[23]*", "*" },
/*quirks*/DA_Q_SMR_DM
},
{
/*
* MX-ES USB Drive by Mach Xtreme
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"},
/*quirks*/DA_Q_NO_RC16
},
};
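/*
 * A new quirk is added by appending an entry of the same shape to the table
 * above; the three strings are glob-style patterns matched against the
 * INQUIRY vendor, product and revision fields.  A purely hypothetical
 * example entry (the product name here is made up):
 *
 *	{
 *		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Example Disk*", "*" },
 *		DA_Q_4K
 *	},
 */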
static disk_strategy_t dastrategy;
static dumper_t dadump;
static periph_init_t dainit;
static void daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void dasysctlinit(void *context, int pending);
static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS);
static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
static int dabitsysctl(SYSCTL_HANDLER_ARGS);
static int daflagssysctl(SYSCTL_HANDLER_ARGS);
static int dazonemodesysctl(SYSCTL_HANDLER_ARGS);
static int dazonesupsysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS);
static void dadeletemethodset(struct da_softc *softc,
da_delete_methods delete_method);
static off_t dadeletemaxsize(struct da_softc *softc,
da_delete_methods delete_method);
static void dadeletemethodchoose(struct da_softc *softc,
da_delete_methods default_method);
static void daprobedone(struct cam_periph *periph, union ccb *ccb);
static periph_ctor_t daregister;
static periph_dtor_t dacleanup;
static periph_start_t dastart;
static periph_oninv_t daoninvalidate;
static void dazonedone(struct cam_periph *periph, union ccb *ccb);
static void dadone(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probewp(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_proberc(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probelbp(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeblklimits(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probebdc(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeata(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeatalogdir(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeataiddir(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeatasup(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probeatazone(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_probezone(struct cam_periph *periph,
union ccb *done_ccb);
static void dadone_tur(struct cam_periph *periph,
union ccb *done_ccb);
static int daerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void daprevent(struct cam_periph *periph, int action);
static void dareprobe(struct cam_periph *periph);
static void dasetgeom(struct cam_periph *periph, uint32_t block_len,
uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong,
size_t rcap_size);
static callout_func_t dasendorderedtag;
static void dashutdown(void *arg, int howto);
static callout_func_t damediapoll;
#ifndef DA_DEFAULT_POLL_PERIOD
#define DA_DEFAULT_POLL_PERIOD 3
#endif
#ifndef DA_DEFAULT_TIMEOUT
#define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */
#endif
#ifndef DA_DEFAULT_SOFTTIMEOUT
#define DA_DEFAULT_SOFTTIMEOUT 0
#endif
#ifndef DA_DEFAULT_RETRY
#define DA_DEFAULT_RETRY 4
#endif
#ifndef DA_DEFAULT_SEND_ORDERED
#define DA_DEFAULT_SEND_ORDERED 1
#endif
static int da_poll_period = DA_DEFAULT_POLL_PERIOD;
static int da_retry_count = DA_DEFAULT_RETRY;
static int da_default_timeout = DA_DEFAULT_TIMEOUT;
static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT;
static int da_send_ordered = DA_DEFAULT_SEND_ORDERED;
static int da_disable_wp_detection = 0;
static int da_enable_biospeedup = 1;
static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN,
&da_poll_period, 0, "Media polling period in seconds");
SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&da_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&da_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&da_send_ordered, 0, "Send Ordered Tags");
SYSCTL_INT(_kern_cam_da, OID_AUTO, disable_wp_detection, CTLFLAG_RWTUN,
&da_disable_wp_detection, 0,
"Disable detection of write-protected disks");
SYSCTL_INT(_kern_cam_da, OID_AUTO, enable_biospeedup, CTLFLAG_RDTUN,
&da_enable_biospeedup, 0, "Enable BIO_SPEEDUP processing");
SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
dasysctlsofttimeout, "I",
"Soft I/O timeout (ms)");
TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout);
/*
* DA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* meets the requirement "don't send an ordered tag" test so it takes
* us two intervals to determine that a tag must be sent.
*/
#ifndef DA_ORDEREDTAG_INTERVAL
#define DA_ORDEREDTAG_INTERVAL 4
#endif
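/*
 * Worked example of the reasoning above: with the default 60 second timeout
 * and DA_ORDEREDTAG_INTERVAL of 4, the ordered-tag check runs roughly every
 * 15 seconds.  In the worst case a starved request is only recognized on the
 * second check, about 30 seconds after it was issued, which still leaves
 * half of the timeout for it to complete once the ordered tag has been sent.
 */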
static struct periph_driver dadriver =
{
dainit, "da",
TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(da, dadriver);
static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");
/*
* This driver takes out references / holds in well defined pairs, never
* recursively. These macros / inline functions enforce those rules. They
* are only enabled with DA_TRACK_REFS or INVARIANTS. If DA_TRACK_REFS is
* defined to be 2 or larger, the tracking also includes debug printfs.
*/
#if defined(DA_TRACK_REFS) || defined(INVARIANTS)
#ifndef DA_TRACK_REFS
#define DA_TRACK_REFS 1
#endif
#if DA_TRACK_REFS > 1
static const char *da_ref_text[] = {
"bogus",
"open",
"open hold",
"close hold",
"reprobe hold",
"Test Unit Ready",
"Geom",
"sysctl",
"reprobe",
"max -- also bogus"
};
#define DA_PERIPH_PRINT(periph, msg, args...) \
CAM_PERIPH_PRINT(periph, msg, ##args)
#else
#define DA_PERIPH_PRINT(periph, msg, args...)
#endif
static inline void
token_sanity(da_ref_token token)
{
if ((unsigned)token >= DA_REF_MAX)
panic("Bad token value passed in %d\n", token);
}
static inline int
da_periph_hold(struct cam_periph *periph, int priority, da_ref_token token)
{
int err = cam_periph_hold(periph, priority);
token_sanity(token);
DA_PERIPH_PRINT(periph, "Holding device %s (%d): %d\n",
da_ref_text[token], token, err);
if (err == 0) {
int cnt;
struct da_softc *softc = periph->softc;
cnt = atomic_fetchadd_int(&softc->ref_flags[token], 1);
if (cnt != 0)
panic("Re-holding for reason %d, cnt = %d", token, cnt);
}
return (err);
}
static inline void
da_periph_unhold(struct cam_periph *periph, da_ref_token token)
{
int cnt;
struct da_softc *softc = periph->softc;
token_sanity(token);
DA_PERIPH_PRINT(periph, "Unholding device %s (%d)\n",
da_ref_text[token], token);
cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1);
if (cnt != 1)
panic("Unholding %d with cnt = %d", token, cnt);
cam_periph_unhold(periph);
}
static inline int
da_periph_acquire(struct cam_periph *periph, da_ref_token token)
{
int err = cam_periph_acquire(periph);
token_sanity(token);
DA_PERIPH_PRINT(periph, "acquiring device %s (%d): %d\n",
da_ref_text[token], token, err);
if (err == 0) {
int cnt;
struct da_softc *softc = periph->softc;
cnt = atomic_fetchadd_int(&softc->ref_flags[token], 1);
if (cnt != 0)
panic("Re-refing for reason %d, cnt = %d", token, cnt);
}
return (err);
}
static inline void
da_periph_release(struct cam_periph *periph, da_ref_token token)
{
int cnt;
struct da_softc *softc = periph->softc;
token_sanity(token);
DA_PERIPH_PRINT(periph, "releasing device %s (%d)\n",
da_ref_text[token], token);
cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1);
if (cnt != 1)
panic("Releasing %d with cnt = %d", token, cnt);
cam_periph_release(periph);
}
static inline void
da_periph_release_locked(struct cam_periph *periph, da_ref_token token)
{
int cnt;
struct da_softc *softc = periph->softc;
token_sanity(token);
DA_PERIPH_PRINT(periph, "releasing device (locked) %s (%d)\n",
da_ref_text[token], token);
cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1);
if (cnt != 1)
panic("releasing (locked) %d with cnt = %d", token, cnt);
cam_periph_release_locked(periph);
}
#define cam_periph_hold POISON
#define cam_periph_unhold POISON
#define cam_periph_acquire POISON
#define cam_periph_release POISON
#define cam_periph_release_locked POISON
#else
#define da_periph_hold(periph, prio, token) cam_periph_hold((periph), (prio))
#define da_periph_unhold(periph, token) cam_periph_unhold((periph))
#define da_periph_acquire(periph, token) cam_periph_acquire((periph))
#define da_periph_release(periph, token) cam_periph_release((periph))
#define da_periph_release_locked(periph, token) cam_periph_release_locked((periph))
#endif
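/*
 * The pairing rule in practice: every da_periph_acquire(..., token) must be
 * balanced by a da_periph_release*(..., token) with the same token, and
 * every da_periph_hold() by a da_periph_unhold().  daopen() below is a
 * complete example: it acquires DA_REF_OPEN, holds DA_REF_OPEN_HOLD for the
 * duration of the open, unholds it, and releases DA_REF_OPEN again only on
 * failure; the successful-open reference is dropped later in daclose().
 */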
static int
daopen(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (da_periph_acquire(periph, DA_REF_OPEN) != 0) {
return (ENXIO);
}
cam_periph_lock(periph);
if ((error = da_periph_hold(periph, PRIBIO|PCATCH, DA_REF_OPEN_HOLD)) != 0) {
cam_periph_unlock(periph);
da_periph_release(periph, DA_REF_OPEN);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daopen\n"));
softc = (struct da_softc *)periph->softc;
dareprobe(periph);
/* Wait for the disk size update. */
error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO,
"dareprobe", 0);
if (error != 0)
xpt_print(periph->path, "unable to retrieve capacity data\n");
if (periph->flags & CAM_PERIPH_INVALID)
error = ENXIO;
if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_PREVENT);
if (error == 0) {
softc->flags &= ~DA_FLAG_PACK_INVALID;
softc->flags |= DA_FLAG_OPEN;
}
da_periph_unhold(periph, DA_REF_OPEN_HOLD);
cam_periph_unlock(periph);
if (error != 0)
da_periph_release(periph, DA_REF_OPEN);
return (error);
}
static int
daclose(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daclose\n"));
if (da_periph_hold(periph, PRIBIO, DA_REF_CLOSE_HOLD) == 0) {
/* Flush disk cache. */
if ((softc->flags & DA_FLAG_DIRTY) != 0 &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 &&
(softc->flags & DA_FLAG_PACK_INVALID) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio, /*retries*/1,
/*cbfcnp*/NULL, MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE,
5 * 60 * 1000);
cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
softc->disk->d_devstat);
softc->flags &= ~DA_FLAG_DIRTY;
xpt_release_ccb(ccb);
}
/* Allow medium removal. */
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_ALLOW);
da_periph_unhold(periph, DA_REF_CLOSE_HOLD);
}
/*
* If we've got removable media, mark the blocksize as
* unavailable, since it could change when new media is
* inserted.
*/
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0)
softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
softc->flags &= ~DA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1);
cam_periph_unlock(periph);
da_periph_release(periph, DA_REF_OPEN);
return (0);
}
static void
daschedule(struct cam_periph *periph)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
if (softc->state != DA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
 * can understand.  The transfer is described by a bio and will include
* only one physical transfer.
*/
static void
dastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
/*
* If the device has been made invalid, error out
*/
if ((softc->flags & DA_FLAG_PACK_INVALID)) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp));
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
daschedule(periph);
cam_periph_unlock(periph);
return;
}
static int
dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct da_softc *softc;
u_int secsize;
struct ccb_scsiio csio;
struct disk *dp;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct da_softc *)periph->softc;
secsize = softc->params.secsize;
if ((softc->flags & DA_FLAG_PACK_INVALID) != 0)
return (ENXIO);
memset(&csio, 0, sizeof(csio));
if (length > 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_read_write(&csio,
/*retries*/0,
/*cbfcnp*/NULL,
MSG_ORDERED_Q_TAG,
/*read*/SCSI_RW_WRITE,
/*byte2*/0,
/*minimum_cmd_size*/ softc->minimum_cmd_size,
offset / secsize,
length / secsize,
/*data_ptr*/(u_int8_t *) virtual,
/*dxfer_len*/length,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
return (error);
}
/*
* Sync the disk cache contents to the physical media.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_synchronize_cache(&csio,
/*retries*/0,
/*cbfcnp*/NULL,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0,/* Cover the whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
5 * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
return (error);
}
static int
dagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
if (g_handleattr_int(bp, "GEOM::canspeedup", da_enable_biospeedup))
return (EJUSTRETURN);
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
static void
dainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("da: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (da_send_ordered) {
/* Register our shutdown event handler */
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("dainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
dadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
da_periph_release(periph, DA_REF_GEOM);
}
static void
daoninvalidate(struct cam_periph *periph)
{
struct da_softc *softc;
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, daasync, periph, periph->path);
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
/*
* Tell GEOM that we've gone away, we'll get a callback when it is
* done cleaning up its resources.
*/
disk_gone(softc->disk);
}
static void
dacleanup(struct cam_periph *periph)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
callout_drain(&softc->mediapoll_c);
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE: /* callback to create periph, no locking yet */
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
break;
if (SID_TYPE(&cgd->inq_data) != T_DIRECT
&& SID_TYPE(&cgd->inq_data) != T_RBC
&& SID_TYPE(&cgd->inq_data) != T_OPTICAL
&& SID_TYPE(&cgd->inq_data) != T_ZBC_HM)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(daregister, daoninvalidate,
dacleanup, dastart,
"da", CAM_PERIPH_BIO,
path, daasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("daasync: Unable to attach to new device "
"due to status 0x%x\n", status);
return;
}
case AC_ADVINFO_CHANGED: /* Doesn't touch periph */
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct da_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_UNIT_ATTENTION:
{
union ccb *ccb;
int error_code, sense_key, asc, ascq;
softc = (struct da_softc *)periph->softc;
ccb = (union ccb *)arg;
/*
* Handle all UNIT ATTENTIONs except our own, as they will be
 * handled by daerror().  Since this comes from a different periph,
 * that periph's lock is held, not ours, so we have to take our own
 * lock to touch the softc flags.
*/
if (xpt_path_periph(ccb->ccb_h.path) != periph &&
scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (asc == 0x2A && ascq == 0x09) {
xpt_print(ccb->ccb_h.path,
"Capacity data has changed\n");
cam_periph_lock(periph);
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
cam_periph_unlock(periph);
} else if (asc == 0x28 && ascq == 0x00) {
cam_periph_lock(periph);
softc->flags &= ~DA_FLAG_PROBED;
cam_periph_unlock(periph);
disk_media_changed(softc->disk, M_NOWAIT);
} else if (asc == 0x3F && ascq == 0x03) {
xpt_print(ccb->ccb_h.path,
"INQUIRY data has changed\n");
cam_periph_lock(periph);
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
cam_periph_unlock(periph);
}
}
break;
}
case AC_SCSI_AEN: /* Called for this path: periph locked */
/*
 * Appears to be currently unused for SCSI devices; only ATA SIMs
* generate this.
*/
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) &&
(softc->flags & DA_FLAG_TUR_PENDING) == 0) {
if (da_periph_acquire(periph, DA_REF_TUR) == 0) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* FALLTHROUGH */
case AC_SENT_BDR: /* Called for this path: periph locked */
case AC_BUS_RESET: /* Called for this path: periph locked */
{
struct ccb_hdr *ccbh;
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
/*
* Don't fail on the expected unit attention
* that will occur.
*/
softc->flags |= DA_FLAG_RETRY_UA;
LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
ccbh->ccb_state |= DA_CCB_RETRY_UA;
break;
}
case AC_INQ_CHANGED: /* Called for this path: periph locked */
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
break;
default:
break;
}
cam_periph_async(periph, code, path, arg);
}
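/*
 * Taskqueue handler that creates the per-unit kern.cam.da.N sysctl tree
 * and its leaves; the periph reference taken when the task was enqueued
 * is released before returning.
 */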
static void
dasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct da_softc *softc;
char tmpstr[32], tmpstr2[16];
struct ccb_trans_settings cts;
periph = (struct cam_periph *)context;
/*
* periph was held for us when this task was enqueued
*/
if (periph->flags & CAM_PERIPH_INVALID) {
da_periph_release(periph, DA_REF_SYSCTL);
return;
}
softc = (struct da_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
cam_periph_lock(periph);
softc->flags |= DA_FLAG_SCTX_INIT;
cam_periph_unlock(periph);
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("dasysctlinit: unable to allocate sysctl tree\n");
da_periph_release(periph, DA_REF_SYSCTL);
return;
}
/*
* Now register the sysctl handler, so the user can change the value on
* the fly.
*/
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method",
CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT,
softc, 0, dadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_max",
CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
softc, 0, dadeletemaxsysctl, "Q",
"Maximum BIO_DELETE size");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "minimum_cmd_size",
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&softc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
"Minimum CDB size");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_count", CTLFLAG_RD, &softc->trim_count,
"Total number of unmap/dsm commands sent");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_ranges", CTLFLAG_RD, &softc->trim_ranges,
"Total number of ranges in unmap/dsm commands");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"trim_lbas", CTLFLAG_RD, &softc->trim_lbas,
"Total lbas in the unmap/dsm commands sent");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
softc, 0, dazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
softc, 0, dazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"error_inject",
CTLFLAG_RW,
&softc->error_inject,
0,
"error_inject leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"p_type",
CTLFLAG_RD,
&softc->p_type,
0,
"DIF protection type");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
softc, 0, daflagssysctl, "A",
"Flags for drive");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "rotating", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->flags, (u_int)DA_FLAG_ROTATING, dabitsysctl, "I",
"Rotating media *DEPRECATED* gone in FreeBSD 14");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "unmapped_io", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->flags, (u_int)DA_FLAG_UNMAPPEDIO, dabitsysctl, "I",
"Unmapped I/O support *DEPRECATED* gone in FreeBSD 14");
#ifdef CAM_TEST_FAILURE
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "invalidate", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
periph, 0, cam_periph_invalidate_sysctl, "I",
"Write 1 to invalidate the drive immediately");
#endif
/*
* Add some addressing info.
*/
memset(&cts, 0, sizeof (cts));
xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
cts.type = CTS_TYPE_CURRENT_SETTINGS;
cam_periph_lock(periph);
xpt_action((union ccb *)&cts);
cam_periph_unlock(periph);
if (cts.ccb_h.status != CAM_REQ_CMP) {
da_periph_release(periph, DA_REF_SYSCTL);
return;
}
if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) {
struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc;
if (fc->valid & CTS_FC_VALID_WWPN) {
softc->wwpn = fc->wwpn;
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "wwpn", CTLFLAG_RD,
&softc->wwpn, "World Wide Port Name");
}
}
#ifdef CAM_IO_STATS
/*
* Now add some useful stats.
* XXX These should live in cam_periph and be common to all periphs
*/
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"errors",
CTLFLAG_RD,
&softc->errors,
0,
"Transport errors reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"timeouts",
CTLFLAG_RD,
&softc->timeouts,
0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"pack_invalidations",
CTLFLAG_RD,
&softc->invalidations,
0,
"Device pack invalidations");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
da_periph_release(periph, DA_REF_SYSCTL);
}
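/*
 * Sysctl handler for delete_max: report the current maximum BIO_DELETE
 * size and accept a new value, provided it does not exceed what the
 * selected delete method can handle.
 */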
static int
dadeletemaxsysctl(SYSCTL_HANDLER_ARGS)
{
int error;
uint64_t value;
struct da_softc *softc;
softc = (struct da_softc *)arg1;
value = softc->disk->d_delmaxsize;
error = sysctl_handle_64(oidp, &value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* only accept values smaller than the calculated value */
if (value > dadeletemaxsize(softc, softc->delete_method)) {
return (EINVAL);
}
softc->disk->d_delmaxsize = value;
return (0);
}
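/*
 * Sysctl handler for minimum_cmd_size: any written value is rounded up
 * to the nearest supported CDB size (6, 10, 12 or 16).
 */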
static int
dacmdsizesysctl(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if ((error != 0)
|| (req->newptr == NULL))
return (error);
/*
* Acceptable values here are 6, 10, 12 or 16.
*/
if (value < 6)
value = 6;
else if ((value > 6)
&& (value <= 10))
value = 10;
else if ((value > 10)
&& (value <= 12))
value = 12;
else if (value > 12)
value = 16;
*(int *)arg1 = value;
return (0);
}
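/*
 * Sysctl handler for the driver-wide soft timeout, expressed in
 * milliseconds; values longer than the regular command timeout are
 * rejected.
 */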
static int
dasysctlsofttimeout(SYSCTL_HANDLER_ARGS)
{
sbintime_t value;
int error;
value = da_default_softtimeout / SBT_1MS;
error = sysctl_handle_int(oidp, (int *)&value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* XXX Should clip this to a reasonable level */
if (value > da_default_timeout * 1000)
return (EINVAL);
da_default_softtimeout = value * SBT_1MS;
return (0);
}
static void
dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method)
{
softc->delete_method = delete_method;
softc->disk->d_delmaxsize = dadeletemaxsize(softc, delete_method);
softc->delete_func = da_delete_functions[delete_method];
if (softc->delete_method > DA_DELETE_DISABLE)
softc->disk->d_flags |= DISKFLAG_CANDELETE;
else
softc->disk->d_flags &= ~DISKFLAG_CANDELETE;
}
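/*
 * Return the largest BIO_DELETE, in bytes, that a single request using
 * the given delete method can cover, capped at the size of the device.
 */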
static off_t
dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method)
{
off_t sectors;
switch(delete_method) {
case DA_DELETE_UNMAP:
sectors = (off_t)softc->unmap_max_lba;
break;
case DA_DELETE_ATA_TRIM:
sectors = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges;
break;
case DA_DELETE_WS16:
sectors = omin(softc->ws_max_blks, WS16_MAX_BLKS);
break;
case DA_DELETE_ZERO:
case DA_DELETE_WS10:
sectors = omin(softc->ws_max_blks, WS10_MAX_BLKS);
break;
default:
return 0;
}
return (off_t)softc->params.secsize *
omin(sectors, softc->params.sectors);
}
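/*
 * Finish the probe: choose the delete method, log delete-method and
 * write-protect information on the first probe, then release the probe
 * CCB and the probe hold/reference.
 */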
static void
daprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
dadeletemethodchoose(softc, DA_DELETE_NONE);
if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) {
char buf[80];
int i, sep;
snprintf(buf, sizeof(buf), "Delete methods: <");
sep = 0;
for (i = 0; i <= DA_DELETE_MAX; i++) {
if ((softc->delete_available & (1 << i)) == 0 &&
i != softc->delete_method)
continue;
if (sep)
strlcat(buf, ",", sizeof(buf));
strlcat(buf, da_delete_method_names[i],
sizeof(buf));
if (i == softc->delete_method)
strlcat(buf, "(*)", sizeof(buf));
sep = 1;
}
strlcat(buf, ">", sizeof(buf));
printf("%s%d: %s\n", periph->periph_name,
periph->unit_number, buf);
}
if ((softc->disk->d_flags & DISKFLAG_WRITE_PROTECT) != 0 &&
(softc->flags & DA_FLAG_ANNOUNCED) == 0) {
printf("%s%d: Write Protected\n", periph->periph_name,
periph->unit_number);
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the probe lock on the peripheral.
* The peripheral will only go away once the last lock
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(ccb);
softc->state = DA_STATE_NORMAL;
softc->flags |= DA_FLAG_PROBED;
daschedule(periph);
wakeup(&softc->disk->d_mediasize);
if ((softc->flags & DA_FLAG_ANNOUNCED) == 0) {
softc->flags |= DA_FLAG_ANNOUNCED;
da_periph_unhold(periph, DA_REF_PROBE_HOLD);
} else
da_periph_release_locked(periph, DA_REF_REPROBE);
}
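/*
 * Pick the active BIO_DELETE implementation: honor the user's preference
 * if that method is available, otherwise take the first available method
 * in the predefined performance order (DA_DELETE_ZERO is never chosen
 * automatically), falling back to default_method.
 */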
static void
dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method)
{
int i, methods;
/* If available, prefer the method requested by user. */
i = softc->delete_method_pref;
methods = softc->delete_available | (1 << DA_DELETE_DISABLE);
if (methods & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
/* Use the pre-defined order to choose the best performing delete. */
for (i = DA_DELETE_MIN; i <= DA_DELETE_MAX; i++) {
if (i == DA_DELETE_ZERO)
continue;
if (softc->delete_available & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
}
/* Fallback to default. */
dadeletemethodset(softc, default_method);
}
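/*
 * Read-only sysctl handler reporting whether the softc flag bit passed
 * in arg2 is set; attempts to write return EPERM.
 */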
static int
dabitsysctl(SYSCTL_HANDLER_ARGS)
{
u_int *flags = arg1;
u_int test = arg2;
int tmpout, error;
tmpout = !!(*flags & test);
error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
if (error || !req->newptr)
return (error);
return (EPERM);
}
static int
daflagssysctl(SYSCTL_HANDLER_ARGS)
{
struct sbuf sbuf;
struct da_softc *softc = arg1;
int error;
sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
if (softc->flags != 0)
sbuf_printf(&sbuf, "0x%b", (unsigned)softc->flags, DA_FLAG_STRING);
else
sbuf_printf(&sbuf, "0");
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
static int
dadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct da_softc *softc;
int i, error, value;
softc = (struct da_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > DA_DELETE_MAX)
p = "UNKNOWN";
else
p = da_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; i <= DA_DELETE_MAX; i++) {
if (strcmp(buf, da_delete_method_names[i]) == 0)
break;
}
if (i > DA_DELETE_MAX)
return (EINVAL);
softc->delete_method_pref = i;
dadeletemethodchoose(softc, DA_DELETE_NONE);
return (0);
}
static int
dazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct da_softc *softc;
int error;
softc = (struct da_softc *)arg1;
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case DA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case DA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case DA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
dazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct da_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct da_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(da_zone_desc_table) /
sizeof(da_zone_desc_table[0]); i++) {
if (softc->zone_flags & da_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, da_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
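/*
 * Periph registration: allocate and initialize the softc, apply quirk
 * table and tunable overrides, create the GEOM disk and devstat entry,
 * register async callbacks and kick off the probe state machine.
 */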
static cam_status
daregister(struct cam_periph *periph, void *arg)
{
struct da_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
char tmpstr[80];
caddr_t match;
int quirks;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("daregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("daregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("daregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
LIST_INIT(&softc->pending_ccbs);
softc->state = DA_STATE_PROBE_WP;
bioq_init(&softc->delete_run_queue);
if (SID_IS_REMOVABLE(&cgd->inq_data))
softc->flags |= DA_FLAG_PACK_REMOVABLE;
softc->unmap_max_ranges = UNMAP_MAX_RANGES;
softc->unmap_max_lba = UNMAP_RANGE_MAX;
softc->unmap_gran = 0;
softc->unmap_gran_align = 0;
softc->ws_max_blks = WS16_MAX_BLKS;
softc->trim_max_ranges = ATA_TRIM_MAX_RANGES;
softc->flags |= DA_FLAG_ROTATING;
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->inq_data,
(caddr_t)da_quirk_table,
nitems(da_quirk_table),
sizeof(*da_quirk_table), scsi_inquiry_match);
if (match != NULL)
softc->quirks = ((struct da_quirk_entry *)match)->quirks;
else
softc->quirks = DA_Q_NONE;
/* Check if the SIM does not want 6 byte commands */
xpt_path_inq(&cpi, periph->path);
if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
softc->quirks |= DA_Q_NO_6_BYTE;
/* Override quirks if tunable is set */
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.quirks",
periph->unit_number);
quirks = softc->quirks;
TUNABLE_INT_FETCH(tmpstr, &quirks);
softc->quirks = quirks;
if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM)
softc->zone_mode = DA_ZONE_HOST_MANAGED;
else if (softc->quirks & DA_Q_SMR_DM)
softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = DA_ZONE_NONE;
if (softc->zone_mode != DA_ZONE_NONE) {
if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC))
softc->zone_interface = DA_ZONE_IF_ATA_SAT;
else
softc->zone_interface = DA_ZONE_IF_ATA_PASS;
} else
softc->zone_interface = DA_ZONE_IF_SCSI;
}
TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph);
/*
* Take an exclusive section lock on the periph while dastart is called
* to finish the probe. The lock will be dropped in dadone at the end
* of probe. This locks out daopen and daclose from racing with the
* probe.
*
* XXX if cam_periph_hold returns an error, we don't hold a refcount.
*/
(void)da_periph_hold(periph, PRIBIO, DA_REF_PROBE_HOLD);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, periph);
cam_periph_unlock(periph);
/*
* RBC devices don't have to support READ(6), only READ(10).
*/
if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/*
* Load the user's default, if any.
*/
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size",
periph->unit_number);
TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size);
/*
* 6, 10, 12 and 16 are the currently permissible values.
*/
if (softc->minimum_cmd_size > 12)
softc->minimum_cmd_size = 16;
else if (softc->minimum_cmd_size > 10)
softc->minimum_cmd_size = 12;
else if (softc->minimum_cmd_size > 6)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/* Predict whether device may support READ CAPACITY(16). */
if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 &&
(softc->quirks & DA_Q_NO_RC16) == 0) {
softc->flags |= DA_FLAG_CAN_RC16;
}
/*
* Register this media as a disk.
*/
softc->disk = disk_alloc();
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, 0,
DEVSTAT_BS_UNAVAILABLE,
SID_TYPE(&cgd->inq_data) |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = daopen;
softc->disk->d_close = daclose;
softc->disk->d_strategy = dastrategy;
softc->disk->d_dump = dadump;
softc->disk->d_getattr = dagetattr;
softc->disk->d_gone = dadiskgonecb;
softc->disk->d_name = "da";
softc->disk->d_drv1 = periph;
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
- else if (cpi.maxio > MAXPHYS)
- softc->maxio = MAXPHYS; /* for safety */
+ else if (cpi.maxio > maxphys)
+ softc->maxio = maxphys; /* for safety */
else
softc->maxio = cpi.maxio;
if (softc->quirks & DA_Q_128KB)
softc->maxio = min(softc->maxio, 128 * 1024);
softc->disk->d_maxsize = softc->maxio;
softc->disk->d_unit = periph->unit_number;
softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0)
softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->flags |= DA_FLAG_UNMAPPEDIO;
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
}
cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
cgd->inq_data.product, sizeof(cgd->inq_data.product),
sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
snprintf(softc->disk->d_attachment, sizeof(softc->disk->d_attachment),
"%s%d", cpi.dev_name, cpi.unit_number);
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* dadiskgonecb()) telling us that our provider has been freed.
*/
if (da_periph_acquire(periph, DA_REF_GEOM) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
/*
* Add async callbacks for events of interest.
* I don't bother checking if this fails as,
* in most cases, the system will function just
* fine without them and the only alternative
* would be to not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION |
AC_INQ_CHANGED, daasync, periph, periph->path);
/*
* Emit an attribute changed notification just in case
* physical path information arrived before our async
* event handler was registered, but after anyone attaching
* to our disk device polled it.
*/
disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT);
/*
* Schedule periodic media polling events.
*/
callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0);
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) &&
(cgd->inq_flags & SID_AEN) == 0 &&
da_poll_period != 0)
callout_reset(&softc->mediapoll_c, da_poll_period * hz,
damediapoll, periph);
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
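/*
 * Translate a BIO_ZONE command into the corresponding ZBC OUT service
 * action, or -1 if there is no SCSI equivalent.
 */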
static int
da_zone_bio_to_scsi(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ZBC_OUT_SA_OPEN;
case DISK_ZONE_CLOSE:
return ZBC_OUT_SA_CLOSE;
case DISK_ZONE_FINISH:
return ZBC_OUT_SA_FINISH;
case DISK_ZONE_RWP:
return ZBC_OUT_SA_RWP;
}
return -1;
}
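/*
 * Fill the CCB for a BIO_ZONE request, using ZBC IN/OUT commands for
 * SCSI and SAT-translated devices, or ATA ZAC management commands via
 * passthrough otherwise. *queue_ccb is set when the CCB should actually
 * be issued to the device.
 */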
static int
da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct da_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to SCSI\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_out() returned an "
"error!");
goto bailout;
}
}
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ ZBC_IN_SA_REPORT_ZONES,
/*zone_start_lba*/ rep->starting_id,
/*zone_options*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_in() returned an "
"error!");
goto bailout;
}
}
/*
* For BIO_ZONE, this isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case DA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case DA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case DA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
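/*
 * dastart() is the periph's work function: in DA_STATE_NORMAL it turns
 * queued bios into SCSI CCBs, and in the DA_STATE_PROBE_* states it
 * issues the next command of the probe sequence.
 */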
static void
dastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct da_softc *softc;
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n"));
skipstate:
switch (softc->state) {
case DA_STATE_NORMAL:
{
struct bio *bp;
uint8_t tag_code;
more:
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
if (cam_iosched_has_work_flags(softc->cam_iosched,
DA_WORK_TUR)) {
softc->flags |= DA_FLAG_TUR_PENDING;
cam_iosched_clr_work_flags(softc->cam_iosched,
DA_WORK_TUR);
scsi_test_unit_ready(&start_ccb->csio,
/*retries*/ da_retry_count,
dadone_tur,
MSG_SIMPLE_Q_TAG,
SSD_FULL_SIZE,
da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_TUR;
xpt_action(start_ccb);
} else
xpt_release_ccb(start_ccb);
break;
}
if (bp->bio_cmd == BIO_DELETE) {
if (softc->delete_func != NULL) {
softc->delete_func(periph, start_ccb, bp);
goto out;
} else {
/*
* Not sure this is possible, but failsafe by
* lying and saying "sure, done."
*/
biofinish(bp, NULL, 0);
goto more;
}
}
if (cam_iosched_has_work_flags(softc->cam_iosched,
DA_WORK_TUR)) {
cam_iosched_clr_work_flags(softc->cam_iosched,
DA_WORK_TUR);
da_periph_release_locked(periph, DA_REF_TUR);
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(softc->flags & DA_FLAG_NEED_OTAG) != 0) {
softc->flags &= ~DA_FLAG_NEED_OTAG;
softc->flags |= DA_FLAG_WAS_OTAG;
tag_code = MSG_ORDERED_Q_TAG;
} else {
tag_code = MSG_SIMPLE_Q_TAG;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
void *data_ptr;
int rw_op;
biotrack(bp, __func__);
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= DA_FLAG_DIRTY;
rw_op = SCSI_RW_WRITE;
} else {
rw_op = SCSI_RW_READ;
}
data_ptr = bp->bio_data;
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= SCSI_RW_BIO;
data_ptr = bp;
}
scsi_read_write(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/tag_code,
rw_op,
/*byte2*/0,
softc->minimum_cmd_size,
/*lba*/bp->bio_pblkno,
/*block_count*/bp->bio_bcount /
softc->params.secsize,
data_ptr,
/*dxfer_len*/ bp->bio_bcount,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
start_ccb->csio.bio = bp;
#endif
break;
}
case BIO_FLUSH:
/*
* If we don't support sync cache, or the disk
* isn't dirty, FLUSH is a no-op. Use the
* allocated CCB for the next bio if one is
* available.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) != 0 ||
(softc->flags & DA_FLAG_DIRTY) == 0) {
biodone(bp);
goto skipstate;
}
/*
* BIO_FLUSH doesn't currently communicate
* range data, so we synchronize the cache
* over the whole disk.
*/
scsi_synchronize_cache(&start_ccb->csio,
/*retries*/1,
/*cbfcnp*/dadone,
/*tag_action*/tag_code,
/*begin_lba*/0,
/*lb_count*/0,
SSD_FULL_SIZE,
da_default_timeout*1000);
/*
* Clear the dirty flag before sending the command.
* Either this sync cache will be successful, or it
* will fail after a retry. If it fails, it is
* unlikely to be successful if retried later, so
* we'll save ourselves time by just marking the
* device clean.
*/
softc->flags &= ~DA_FLAG_DIRTY;
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = da_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
return;
}
start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout);
out:
LIST_INSERT_HEAD(&softc->pending_ccbs,
&start_ccb->ccb_h, periph_links.le);
/* We expect a unit attention from this device */
if ((softc->flags & DA_FLAG_RETRY_UA) != 0) {
start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA;
softc->flags &= ~DA_FLAG_RETRY_UA;
}
start_ccb->ccb_h.ccb_bp = bp;
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
/* May have more work to do, so ensure we stay scheduled */
daschedule(periph);
break;
}
case DA_STATE_PROBE_WP:
{
void *mode_buf;
int mode_buf_len;
if (da_disable_wp_detection) {
if ((softc->flags & DA_FLAG_CAN_RC16) != 0)
softc->state = DA_STATE_PROBE_RC16;
else
softc->state = DA_STATE_PROBE_RC;
goto skipstate;
}
mode_buf_len = 192;
mode_buf = malloc(mode_buf_len, M_SCSIDA, M_NOWAIT);
if (mode_buf == NULL) {
xpt_print(periph->path, "Unable to send mode sense - "
"malloc failure\n");
if ((softc->flags & DA_FLAG_CAN_RC16) != 0)
softc->state = DA_STATE_PROBE_RC16;
else
softc->state = DA_STATE_PROBE_RC;
goto skipstate;
}
scsi_mode_sense_len(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_probewp,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*dbd*/ FALSE,
/*pc*/ SMS_PAGE_CTRL_CURRENT,
/*page*/ SMS_ALL_PAGES_PAGE,
/*param_buf*/ mode_buf,
/*param_len*/ mode_buf_len,
/*minimum_cmd_size*/ softc->minimum_cmd_size,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_WP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_RC:
{
struct scsi_read_capacity_data *rcap;
rcap = (struct scsi_read_capacity_data *)
malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcap == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity(&start_ccb->csio,
/*retries*/da_retry_count,
dadone_proberc,
MSG_SIMPLE_Q_TAG,
rcap,
SSD_FULL_SIZE,
/*timeout*/5000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_RC16:
{
struct scsi_read_capacity_data_long *rcaplong;
rcaplong = (struct scsi_read_capacity_data_long *)
malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcaplong == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity_16(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_proberc,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*lba*/ 0,
/*reladr*/ 0,
/*pmi*/ 0,
/*rcap_buf*/ (uint8_t *)rcaplong,
/*rcap_buf_len*/ sizeof(*rcaplong),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_LBP:
{
struct scsi_vpd_logical_block_prov *lbp;
if (!scsi_vpd_supported_page(periph, SVPD_LBP)) {
/*
* If we get here we don't support any SBC-3 delete
* methods with UNMAP as the Logical Block Provisioning
* VPD page support is required for devices which
* support it according to T10/1799-D Revision 31
* however older revisions of the spec don't mandate
* this so we currently don't remove these methods
* from the available set.
*/
softc->state = DA_STATE_PROBE_BLK_LIMITS;
goto skipstate;
}
lbp = (struct scsi_vpd_logical_block_prov *)
malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO);
if (lbp == NULL) {
printf("dastart: Couldn't malloc lbp data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone_probelbp,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)lbp,
/*inq_len*/sizeof(*lbp),
/*evpd*/TRUE,
/*page_code*/SVPD_LBP,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BLK_LIMITS:
{
struct scsi_vpd_block_limits *block_limits;
if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) {
/* Not supported, skip to next probe */
softc->state = DA_STATE_PROBE_BDC;
goto skipstate;
}
block_limits = (struct scsi_vpd_block_limits *)
malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO);
if (block_limits == NULL) {
printf("dastart: Couldn't malloc block_limits data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone_probeblklimits,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)block_limits,
/*inq_len*/sizeof(*block_limits),
/*evpd*/TRUE,
/*page_code*/SVPD_BLOCK_LIMITS,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BDC:
{
struct scsi_vpd_block_characteristics *bdc;
if (!scsi_vpd_supported_page(periph, SVPD_BDC)) {
softc->state = DA_STATE_PROBE_ATA;
goto skipstate;
}
bdc = (struct scsi_vpd_block_characteristics *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
printf("dastart: Couldn't malloc bdc data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone_probebdc,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_BDC,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA:
{
struct ata_params *ata_params;
if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* Note that if the ATA VPD page isn't
* supported, we aren't talking to an ATA
* device anyway. Support for that VPD
* page is mandatory for SCSI to ATA (SAT)
* translation layers.
*/
softc->state = DA_STATE_PROBE_ZONE;
goto skipstate;
}
daprobedone(periph, start_ccb);
break;
}
ata_params = &periph->path->device->ident_data;
scsi_ata_identify(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone_probeata,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*data_ptr*/(u_int8_t *)ata_params,
/*dxfer_len*/sizeof(*ata_params),
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
int retval;
retval = 0;
if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) {
/*
* If we don't have log support, not much point in
* trying to probe zone support.
*/
daprobedone(periph, start_ccb);
break;
}
/*
* If we have an ATA device (the SCSI ATA Information VPD
* page should be present and the ATA identify should have
* succeeded) and it supports logs, ask for the log directory.
*/
log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_probeatalogdir,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/ sizeof(*log_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(log_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_IDDIR:
{
struct ata_identify_log_pages *id_dir;
int retval;
retval = 0;
/*
* Check here to see whether the Identify Device log is
* supported in the directory of logs. If so, continue
* with requesting the log of identify device pages.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) {
daprobedone(periph, start_ccb);
break;
}
id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_probeataiddir,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(id_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_SUP:
{
struct ata_identify_log_sup_cap *sup_cap;
int retval;
retval = 0;
/*
* Check here to see whether the Supported Capabilities log
* is in the list of Identify Device logs.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) {
daprobedone(periph, start_ccb);
break;
}
sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_probeatasup,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(sup_cap, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_ZONE:
{
struct ata_zoned_info_log *ata_zone;
int retval;
retval = 0;
/*
* Check here to see whether the zoned device information
* page is supported. If so, continue on to request it.
* If not, skip to DA_STATE_PROBE_LOG or done.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) {
daprobedone(periph, start_ccb);
break;
}
ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA,
M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone_probeatazone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(ata_zone, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ZONE:
{
struct scsi_vpd_zoned_bdc *bdc;
/*
* Note that this page will be supported for SCSI protocol
* devices that support ZBC (SMR devices), as well as ATA
* protocol devices that are behind a SAT (SCSI to ATA
* Translation) layer that supports converting ZBC commands
* to their ZAC equivalents.
*/
if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) {
daprobedone(periph, start_ccb);
break;
}
bdc = (struct scsi_vpd_zoned_bdc *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
xpt_release_ccb(start_ccb);
xpt_print(periph->path, "Couldn't malloc zone VPD "
"data\n");
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone_probezone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_ZONED_BDC,
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE;
xpt_action(start_ccb);
break;
}
}
}
/*
* In each of the methods below, while it's the caller's
* responsibility to ensure the request will fit into a
* single device request, we might have changed the delete
* method due to the device incorrectly advertising either
* its supported methods or limits.
*
* To prevent this causing further issues, we validate the
* request against the chosen method's limits and warn when
* we have to shorten it, which would otherwise be
* unnecessary.
*/
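/*
 * UNMAP-based BIO_DELETE: coalesce queued delete requests into LBA
 * ranges in unmap_buf, honoring the device's range and LBA count limits
 * (and the unmap granularity when DA_Q_STRICT_UNMAP is set).
 */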
static void
da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
struct scsi_unmap_desc *d = (void *)&buf[UNMAP_HEAD_SIZE];
uint64_t lba, lastlba = (uint64_t)-1;
uint64_t totalcount = 0;
uint64_t count;
uint32_t c, lastcount = 0, ranges = 0;
/*
* Currently this doesn't take the UNMAP
* Granularity and Granularity Alignment
* fields into account.
*
* This could result in suboptimal unmap requests as
* well as UNMAP calls unmapping fewer LBAs than
* requested.
*/
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
bp1 = bp;
do {
/*
* Note: ada and da are different in how they store the
* pending bp's in a trim. ada stores all of them in the
* trim_req.bps. da stores all but the first one in the
* delete_run_queue. ada then completes all the bps in
* its adadone() loop. da completes all the bps in the
* delete_run_queue in dadone, and relies on the biodone
* after to complete. This should be reconciled since there's
* no real reason to do it differently. XXX
*/
if (bp1 != bp)
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, UNMAP_RANGE_MAX - lastcount);
lastlba += c;
lastcount += c;
scsi_ulto4b(lastcount, d[ranges - 1].length);
count -= c;
lba += c;
totalcount += c;
} else if ((softc->quirks & DA_Q_STRICT_UNMAP) &&
softc->unmap_gran != 0) {
/* Align length of the previous range. */
if ((c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c) {
totalcount -= lastcount;
lastlba = (uint64_t)-1;
lastcount = 0;
ranges--;
} else {
totalcount -= c;
lastlba -= c;
lastcount -= c;
scsi_ulto4b(lastcount,
d[ranges - 1].length);
}
}
/* Align beginning of the new range. */
c = (lba - softc->unmap_gran_align) % softc->unmap_gran;
if (c != 0) {
c = softc->unmap_gran - c;
if (count <= c) {
count = 0;
} else {
lba += c;
count -= c;
}
}
}
while (count > 0) {
c = omin(count, UNMAP_RANGE_MAX);
if (totalcount + c > softc->unmap_max_lba ||
ranges >= softc->unmap_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld"
"|| %d >= %d",
da_delete_method_desc[softc->delete_method],
totalcount + c, softc->unmap_max_lba,
ranges, softc->unmap_max_ranges);
break;
}
scsi_u64to8b(lba, d[ranges].lba);
scsi_ulto4b(c, d[ranges].length);
lba += c;
totalcount += c;
ranges++;
count -= c;
lastlba = lba;
lastcount = c;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (ranges >= softc->unmap_max_ranges ||
totalcount + bp1->bio_bcount /
softc->params.secsize > softc->unmap_max_lba) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
/* Align length of the last range. */
if ((softc->quirks & DA_Q_STRICT_UNMAP) && softc->unmap_gran != 0 &&
(c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c)
ranges--;
else
scsi_ulto4b(lastcount - c, d[ranges - 1].length);
}
scsi_ulto2b(ranges * 16 + 6, &buf[0]);
scsi_ulto2b(ranges * 16, &buf[2]);
scsi_unmap(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/0,
/*data_ptr*/ buf,
/*dxfer_len*/ ranges * 16 + 8,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
softc->trim_count++;
softc->trim_ranges += ranges;
softc->trim_lbas += totalcount;
cam_iosched_submit_trim(softc->cam_iosched);
}
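/*
 * ATA DSM TRIM based BIO_DELETE (via SAT): pack LBA/count pairs into
 * 8-byte DSM range entries and issue them with scsi_ata_trim().
 */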
static void
da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
uint64_t lastlba = (uint64_t)-1;
uint64_t count;
uint64_t lba;
uint32_t lastcount = 0, c, requestcount;
int ranges = 0, off, block_count;
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
bp1 = bp;
do {
if (bp1 != bp)//XXX imp XXX
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
requestcount = count;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * 8;
buf[off + 6] = lastcount & 0xff;
buf[off + 7] = (lastcount >> 8) & 0xff;
count -= c;
lba += c;
}
while (count > 0) {
c = omin(count, ATA_DSM_RANGE_MAX);
off = ranges * 8;
buf[off + 0] = lba & 0xff;
buf[off + 1] = (lba >> 8) & 0xff;
buf[off + 2] = (lba >> 16) & 0xff;
buf[off + 3] = (lba >> 24) & 0xff;
buf[off + 4] = (lba >> 32) & 0xff;
buf[off + 5] = (lba >> 40) & 0xff;
buf[off + 6] = c & 0xff;
buf[off + 7] = (c >> 8) & 0xff;
lba += c;
ranges++;
count -= c;
lastcount = c;
if (count != 0 && ranges == softc->trim_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
requestcount,
(softc->trim_max_ranges - ranges) *
ATA_DSM_RANGE_MAX);
break;
}
}
lastlba = lba;
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (bp1->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
block_count = howmany(ranges, ATA_DSM_BLK_RANGES);
scsi_ata_trim(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
block_count,
/*data_ptr*/buf,
/*dxfer_len*/block_count * ATA_DSM_BLK_SIZE,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
/*
* We calculate ws_max_blks here based on d_delmaxsize instead
* of using softc->ws_max_blks, as the latter is the absolute
* max for the device, not the protocol max, which may well be
* lower.
*/
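/*
 * WRITE SAME(10/16) based BIO_DELETE: DA_DELETE_ZERO writes zeroes,
 * the other WRITE SAME methods set the UNMAP bit.
 */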
static void
da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc;
struct bio *bp1;
uint64_t ws_max_blks;
uint64_t lba;
uint64_t count; /* forward compat with WS32 */
softc = (struct da_softc *)periph->softc;
ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize;
lba = bp->bio_pblkno;
count = 0;
bp1 = bp;
do {
if (bp1 != bp)//XXX imp XXX
bioq_insert_tail(&softc->delete_run_queue, bp1);
count += bp1->bio_bcount / softc->params.secsize;
if (count > ws_max_blks) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
count, ws_max_blks);
count = omin(count, ws_max_blks);
break;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (lba + count != bp1->bio_pblkno ||
count + bp1->bio_bcount /
softc->params.secsize > ws_max_blks) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
scsi_write_same(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/softc->delete_method ==
DA_DELETE_ZERO ? 0 : SWS_UNMAP,
softc->delete_method == DA_DELETE_WS16 ? 16 : 10,
/*lba*/lba,
/*block_count*/count,
/*data_ptr*/ __DECONST(void *, zero_region),
/*dxfer_len*/ softc->params.secsize,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
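/*
 * Handle commands the device rejected: switch to another delete method
 * on a failed BIO_DELETE, remember unsupported PREVENT ALLOW and
 * SYNCHRONIZE CACHE(10), or rewrite a READ(6)/WRITE(6) CDB as its
 * 10-byte equivalent and requeue the request.
 */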
static int
cmd6workaround(union ccb *ccb)
{
struct scsi_rw_6 cmd6;
struct scsi_rw_10 *cmd10;
struct da_softc *softc;
u_int8_t *cdb;
struct bio *bp;
int frozen;
cdb = ccb->csio.cdb_io.cdb_bytes;
softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
da_delete_methods old_method = softc->delete_method;
/*
* Typically there are two reasons for failure here
* 1. Delete method was detected as supported but isn't
* 2. Delete failed due to invalid params e.g. too big
*
* While we will attempt to choose an alternative delete method,
* this may result in short deletes if the existing delete
* requests from GEOM are too big for the new method chosen.
*
* This method assumes that the error which triggered this
* will not retry the I/O; otherwise a panic will occur.
*/
dadeleteflag(softc, old_method, 0);
dadeletemethodchoose(softc, DA_DELETE_DISABLE);
if (softc->delete_method == DA_DELETE_DISABLE)
xpt_print(ccb->ccb_h.path,
"%s failed, disabling BIO_DELETE\n",
da_delete_method_desc[old_method]);
else
xpt_print(ccb->ccb_h.path,
"%s failed, switching to %s BIO_DELETE\n",
da_delete_method_desc[old_method],
da_delete_method_desc[softc->delete_method]);
while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL)
cam_iosched_queue_work(softc->cam_iosched, bp);
cam_iosched_queue_work(softc->cam_iosched,
(struct bio *)ccb->ccb_h.ccb_bp);
ccb->ccb_h.ccb_bp = NULL;
return (0);
}
/* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == PREVENT_ALLOW) &&
(softc->quirks & DA_Q_NO_PREVENT) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"PREVENT ALLOW MEDIUM REMOVAL not supported.\n");
softc->quirks |= DA_Q_NO_PREVENT;
return (0);
}
/* Detect unsupported SYNCHRONIZE CACHE(10). */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == SYNCHRONIZE_CACHE) &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"SYNCHRONIZE CACHE(10) not supported.\n");
softc->quirks |= DA_Q_NO_SYNC_CACHE;
softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE;
return (0);
}
/* Translation only possible if CDB is an array and cmd is R/W6 */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
(*cdb != READ_6 && *cdb != WRITE_6))
return 0;
xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
"increasing minimum_cmd_size to 10.\n");
softc->minimum_cmd_size = 10;
bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
cmd10 = (struct scsi_rw_10 *)cdb;
cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
cmd10->byte2 = 0;
scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
cmd10->reserved = 0;
scsi_ulto2b(cmd6.length, cmd10->length);
cmd10->control = cmd6.control;
ccb->csio.cdb_len = sizeof(*cmd10);
/* Requeue request, unfreezing queue if necessary */
frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_action(ccb);
if (frozen) {
cam_release_devq(ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
return (ERESTART);
}
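/*
 * Complete a BIO_ZONE command; for REPORT ZONES this converts the
 * SCSI/ATA zone descriptors returned by the drive into the
 * disk_zone_rep_entry format consumed by GEOM.
 */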
static void
dazonedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
struct bio *bp;
softc = periph->softc;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
int ata;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->csio.dxfer_len - ccb->csio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
bp->bio_resid = ccb->csio.resid;
hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
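/*
 * The ATA pass-through interface returns the report header and
 * zone descriptors in little-endian byte order, while the SCSI
 * interface uses big-endian, so pick the matching decode routines.
 */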
if (softc->zone_interface == DA_ZONE_IF_ATA_PASS)
ata = 1;
else
ata = 0;
hdr_len = ata ? le32dec(hdr->length) :
scsi_4btoul(hdr->length);
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = ata ? le64dec(hdr->maximum_lba) :
scsi_8btou64(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
break;
}
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry = &rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
* definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length =
ata ? le64dec(desc->zone_length) :
scsi_8btou64(desc->zone_length);
entry->zone_start_lba =
ata ? le64dec(desc->zone_start_lba) :
scsi_8btou64(desc->zone_start_lba);
entry->write_pointer_lba =
ata ? le64dec(desc->write_pointer_lba) :
scsi_8btou64(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->csio.data_ptr, M_SCSIDA);
}
static void
dadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct bio *bp, *bp1;
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
da_ccb_state state;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (csio->bio != NULL)
biotrack(csio->bio, __func__);
#endif
state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
int error;
int sf;
if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0)
sf = SF_RETRY_UA;
else
sf = 0;
error = daerror(done_ccb, CAM_RETRY_SELTO, sf);
if (error == ERESTART) {
/* A retry was scheduled, so just return. */
cam_periph_unlock(periph);
return;
}
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if (error != 0) {
int queued_error;
/*
* return all queued I/O with EIO, so that
* the client can retry these I/Os in the
* proper order should it attempt to recover.
*/
queued_error = EIO;
if (error == ENXIO
&& (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
/*
* Catastrophic error. Mark our pack as
* invalid.
*
* XXX See if this is really a media
* XXX change first?
*/
xpt_print(periph->path, "Invalidating pack\n");
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
queued_error = ENXIO;
}
cam_iosched_flush(softc->cam_iosched, NULL,
queued_error);
if (bp != NULL) {
bp->bio_error = error;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
}
} else if (bp != NULL) {
if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
bp->bio_error = 0;
if (bp->bio_resid != 0)
bp->bio_flags |= BIO_ERROR;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
} else if (bp != NULL) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
if (bp->bio_cmd == BIO_ZONE)
dazonedone(periph, done_ccb);
else if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
if ((csio->resid > 0) && (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
if (softc->error_inject != 0) {
bp->bio_error = softc->error_inject;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
softc->error_inject = 0;
}
}
if (bp != NULL)
biotrack(bp, __func__);
LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
if (LIST_EMPTY(&softc->pending_ccbs))
softc->flags |= DA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we don't
* measure any activity that happens in the completion routine, which in
* the case of sendfile can be quite extensive. Release the periph
* refcount taken in dastart() for each CCB.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
KASSERT(softc->refcount >= 1, ("dadone softc %p refcount %d", softc, softc->refcount));
softc->refcount--;
if (state == DA_CCB_DELETE) {
TAILQ_HEAD(, bio) queue;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue);
softc->delete_run_queue.insert_point = NULL;
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* delete_running set to 0 before the call above to
* allow other I/O to progress when many BIO_DELETE
* requests are pushed down. We set delete_running to 0
* and call daschedule again so that we don't stall if
* there are no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
daschedule(periph);
cam_periph_unlock(periph);
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = bp->bio_error;
if (bp->bio_flags & BIO_ERROR) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
daschedule(periph);
cam_periph_unlock(periph);
}
if (bp != NULL)
biodone(bp);
return;
}
static void
dadone_probewp(struct cam_periph *periph, union ccb *done_ccb)
{
struct scsi_mode_header_6 *mode_hdr6;
struct scsi_mode_header_10 *mode_hdr10;
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
uint8_t dev_spec;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probewp\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
KASSERT(softc->state == DA_STATE_PROBE_WP,
("State (%d) not PROBE_WP in dadone_probewp, periph %p ccb %p",
softc->state, periph, done_ccb));
KASSERT((csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK) == DA_CCB_PROBE_WP,
("CCB State (%lu) not PROBE_WP in dadone_probewp, periph %p ccb %p",
(unsigned long)csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK, periph,
done_ccb));
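/*
 * The write-protect state is reported in the device-specific
 * parameter byte of the mode parameter header, whose layout
 * depends on whether a 6 or 10 byte MODE SENSE was issued.
 */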
if (softc->minimum_cmd_size > 6) {
mode_hdr10 = (struct scsi_mode_header_10 *)csio->data_ptr;
dev_spec = mode_hdr10->dev_spec;
} else {
mode_hdr6 = (struct scsi_mode_header_6 *)csio->data_ptr;
dev_spec = mode_hdr6->dev_spec;
}
if (cam_ccb_status(done_ccb) == CAM_REQ_CMP) {
if ((dev_spec & 0x80) != 0)
softc->disk->d_flags |= DISKFLAG_WRITE_PROTECT;
else
softc->disk->d_flags &= ~DISKFLAG_WRITE_PROTECT;
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((softc->flags & DA_FLAG_CAN_RC16) != 0)
softc->state = DA_STATE_PROBE_RC16;
else
softc->state = DA_STATE_PROBE_RC;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
static void
dadone_proberc(struct cam_periph *periph, union ccb *done_ccb)
{
struct scsi_read_capacity_data *rdcap;
struct scsi_read_capacity_data_long *rcaplong;
struct da_softc *softc;
struct ccb_scsiio *csio;
da_ccb_state state;
char *announce_buf;
u_int32_t priority;
int lbp, n;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_proberc\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
KASSERT(softc->state == DA_STATE_PROBE_RC || softc->state == DA_STATE_PROBE_RC16,
("State (%d) not PROBE_RC* in dadone_proberc, periph %p ccb %p",
softc->state, periph, done_ccb));
KASSERT(state == DA_CCB_PROBE_RC || state == DA_CCB_PROBE_RC16,
("CCB State (%lu) not PROBE_RC* in dadone_probewp, periph %p ccb %p",
(unsigned long)state, periph, done_ccb));
lbp = 0;
rdcap = NULL;
rcaplong = NULL;
/* XXX TODO: can this be a malloc? */
announce_buf = softc->announce_temp;
bzero(announce_buf, DA_ANNOUNCETMP_SZ);
if (state == DA_CCB_PROBE_RC)
rdcap = (struct scsi_read_capacity_data *)csio->data_ptr;
else
rcaplong = (struct scsi_read_capacity_data_long *)
csio->data_ptr;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct disk_params *dp;
uint32_t block_size;
uint64_t maxsector;
u_int lalba; /* Lowest aligned LBA. */
if (state == DA_CCB_PROBE_RC) {
block_size = scsi_4btoul(rdcap->length);
maxsector = scsi_4btoul(rdcap->addr);
lalba = 0;
/*
* According to SBC-2, if the standard 10
* byte READ CAPACITY command returns 2^32,
* we should issue the 16 byte version of
* the command, since the device in question
* has more sectors than can be represented
* with the short version of the command.
*/
if (maxsector == 0xffffffff) {
free(rdcap, M_SCSIDA);
softc->state = DA_STATE_PROBE_RC16;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
} else {
block_size = scsi_4btoul(rcaplong->length);
maxsector = scsi_8btou64(rcaplong->addr);
lalba = scsi_2btoul(rcaplong->lalba_lbp);
}
/*
* Because the GEOM code will panic if we give it an
* 'illegal' value, we avoid that here.
*/
if (block_size == 0) {
block_size = 512;
if (maxsector == 0)
maxsector = -1;
}
- if (block_size >= MAXPHYS) {
+ if (block_size >= maxphys) {
xpt_print(periph->path,
"unsupportable block size %ju\n",
(uintmax_t) block_size);
announce_buf = NULL;
cam_periph_invalidate(periph);
} else {
/*
* We pass rcaplong into dasetgeom(),
* because it will only use it if it is
* non-NULL.
*/
dasetgeom(periph, block_size, maxsector,
rcaplong, sizeof(*rcaplong));
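/*
 * Remember whether the LBPME bit reports logical block
 * provisioning support; it decides whether we probe the
 * LBP VPD pages below.
 */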
lbp = (lalba & SRC16_LBPME_A);
dp = &softc->params;
n = snprintf(announce_buf, DA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors",
((uintmax_t)dp->secsize * dp->sectors) /
(1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
if (softc->p_type != 0) {
n += snprintf(announce_buf + n,
DA_ANNOUNCETMP_SZ - n,
", DIF type %d", softc->p_type);
}
snprintf(announce_buf + n, DA_ANNOUNCETMP_SZ - n, ")");
}
} else {
int error;
/*
* Retry any UNIT ATTENTION type errors. They
* are expected at boot.
*/
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
} else if (error != 0) {
int asc, ascq;
int sense_key, error_code;
int have_sense;
cam_status status;
struct ccb_getdev cgd;
/* Don't wedge this device's queue */
status = done_ccb->ccb_h.status;
if ((status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
have_sense = TRUE;
else
have_sense = FALSE;
/*
* If we tried READ CAPACITY(16) and failed,
* fallback to READ CAPACITY(10).
*/
if ((state == DA_CCB_PROBE_RC16) &&
(softc->flags & DA_FLAG_CAN_RC16) &&
(((csio->ccb_h.status & CAM_STATUS_MASK) ==
CAM_REQ_INVALID) ||
((have_sense) &&
(error_code == SSD_CURRENT_ERROR ||
error_code == SSD_DESC_CURRENT_ERROR) &&
(sense_key == SSD_KEY_ILLEGAL_REQUEST)))) {
cam_periph_assert(periph, MA_OWNED);
softc->flags &= ~DA_FLAG_CAN_RC16;
free(rdcap, M_SCSIDA);
softc->state = DA_STATE_PROBE_RC;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
/*
* Attach to anything that claims to be a
* direct access or optical disk device,
* as long as it doesn't return a "Logical
* unit not supported" (0x25) error.
* "Internal Target Failure" (0x44) is also
* special and typically means that the
* device is a SATA drive behind a SATL
* translation that's fallen into a
* terminally fatal state.
*/
if ((have_sense)
&& (asc != 0x25) && (asc != 0x44)
&& (error_code == SSD_CURRENT_ERROR
|| error_code == SSD_DESC_CURRENT_ERROR)) {
const char *sense_key_desc;
const char *asc_desc;
dasetgeom(periph, 512, -1, NULL, 0);
scsi_sense_desc(sense_key, asc, ascq,
&cgd.inq_data, &sense_key_desc,
&asc_desc);
snprintf(announce_buf, DA_ANNOUNCETMP_SZ,
"Attempt to query device "
"size failed: %s, %s",
sense_key_desc, asc_desc);
} else {
if (have_sense)
scsi_sense_print(&done_ccb->csio);
else {
xpt_print(periph->path,
"got CAM status %#x\n",
done_ccb->ccb_h.status);
}
xpt_print(periph->path, "fatal error, "
"failed to attach to device\n");
announce_buf = NULL;
/*
* Free up resources.
*/
cam_periph_invalidate(periph);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if (announce_buf != NULL &&
((softc->flags & DA_FLAG_ANNOUNCED) == 0)) {
struct sbuf sb;
sbuf_new(&sb, softc->announcebuf, DA_ANNOUNCE_SZ,
SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks,
DA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
/* increase the refcount */
if (da_periph_acquire(periph, DA_REF_SYSCTL) == 0) {
taskqueue_enqueue(taskqueue_thread,
&softc->sysctl_task);
} else {
/* XXX This message is useless! */
xpt_print(periph->path, "fatal error, "
"could not acquire reference count\n");
}
}
/* We already probed the device. */
if (softc->flags & DA_FLAG_PROBED) {
daprobedone(periph, done_ccb);
return;
}
/* Ensure re-probe doesn't see old delete. */
softc->delete_available = 0;
dadeleteflag(softc, DA_DELETE_ZERO, 1);
if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) {
/*
* Based on older SBC-3 spec revisions
* any of the UNMAP methods "may" be
* available via LBP given this flag so
* we flag all of them as available and
* then remove those which further
* probes confirm aren't available
* later.
*
* We could also check readcap(16) p_type
* flag to exclude one or more invalid
* write same (X) types here
*/
dadeleteflag(softc, DA_DELETE_WS16, 1);
dadeleteflag(softc, DA_DELETE_WS10, 1);
dadeleteflag(softc, DA_DELETE_UNMAP, 1);
softc->state = DA_STATE_PROBE_LBP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
softc->state = DA_STATE_PROBE_BDC;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
static void
dadone_probelbp(struct cam_periph *periph, union ccb *done_ccb)
{
struct scsi_vpd_logical_block_prov *lbp;
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probelbp\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
/*
* T10/1799-D Revision 31 states at least one of these
* must be supported but we don't currently enforce this.
*/
dadeleteflag(softc, DA_DELETE_WS16,
(lbp->flags & SVPD_LBP_WS16));
dadeleteflag(softc, DA_DELETE_WS10,
(lbp->flags & SVPD_LBP_WS10));
dadeleteflag(softc, DA_DELETE_UNMAP,
(lbp->flags & SVPD_LBP_UNMAP));
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure indicates we don't support any SBC-3
* delete methods with UNMAP
*/
}
}
free(lbp, M_SCSIDA);
softc->state = DA_STATE_PROBE_BLK_LIMITS;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
static void
dadone_probeblklimits(struct cam_periph *periph, union ccb *done_ccb)
{
struct scsi_vpd_block_limits *block_limits;
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeblklimits\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t max_txfer_len = scsi_4btoul(
block_limits->max_txfer_len);
uint32_t max_unmap_lba_cnt = scsi_4btoul(
block_limits->max_unmap_lba_cnt);
uint32_t max_unmap_blk_cnt = scsi_4btoul(
block_limits->max_unmap_blk_cnt);
uint32_t unmap_gran = scsi_4btoul(
block_limits->opt_unmap_grain);
uint32_t unmap_gran_align = scsi_4btoul(
block_limits->unmap_grain_align);
uint64_t ws_max_blks = scsi_8btou64(
block_limits->max_write_same_length);
if (max_txfer_len != 0) {
softc->disk->d_maxsize = MIN(softc->maxio,
(off_t)max_txfer_len * softc->params.secsize);
}
/*
* We should already support UNMAP but we check lba
* and block count to be sure
*/
if (max_unmap_lba_cnt != 0x00L &&
max_unmap_blk_cnt != 0x00L) {
softc->unmap_max_lba = max_unmap_lba_cnt;
softc->unmap_max_ranges = min(max_unmap_blk_cnt,
UNMAP_MAX_RANGES);
if (unmap_gran > 1) {
softc->unmap_gran = unmap_gran;
if (unmap_gran_align & 0x80000000) {
softc->unmap_gran_align =
unmap_gran_align & 0x7fffffff;
}
}
} else {
/*
* Unexpected UNMAP limits which means the
* device doesn't actually support UNMAP
*/
dadeleteflag(softc, DA_DELETE_UNMAP, 0);
}
if (ws_max_blks != 0x00L)
softc->ws_max_blks = ws_max_blks;
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure here doesn't mean UNMAP is not
* supported as this is an optional page.
*/
softc->unmap_max_lba = 1;
softc->unmap_max_ranges = 1;
}
}
free(block_limits, M_SCSIDA);
softc->state = DA_STATE_PROBE_BDC;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
static void
dadone_probebdc(struct cam_periph *periph, union ccb *done_ccb)
{
struct scsi_vpd_block_device_characteristics *bdc;
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probebdc\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
bdc = (struct scsi_vpd_block_device_characteristics *)csio->data_ptr;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
/*
* Disable queue sorting for non-rotational media
* by default.
*/
u_int16_t old_rate = softc->disk->d_rotation_rate;
valid_len = csio->dxfer_len - csio->resid;
if (SBDC_IS_PRESENT(bdc, valid_len,
medium_rotation_rate)) {
softc->disk->d_rotation_rate =
scsi_2btoul(bdc->medium_rotation_rate);
if (softc->disk->d_rotation_rate ==
SVPD_BDC_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(
softc->cam_iosched, 0);
softc->flags &= ~DA_FLAG_ROTATING;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
}
if ((SBDC_IS_PRESENT(bdc, valid_len, flags))
&& (softc->zone_mode == DA_ZONE_NONE)) {
int ata_proto;
if (scsi_vpd_supported_page(periph,
SVPD_ATA_INFORMATION))
ata_proto = 1;
else
ata_proto = 0;
/*
* The Zoned field will only be set for
* Drive Managed and Host Aware drives. If
* they are Host Managed, the device type
* in the standard INQUIRY data should be
* set to T_ZBC_HM (0x14).
*/
if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_HAW_ZBC) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_DM_ZBC) {
softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) !=
SVPD_ZBC_NR) {
xpt_print(periph->path, "Unknown zoned "
"type %#x",
bdc->flags & SVPD_ZBC_MASK);
}
}
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(bdc, M_SCSIDA);
softc->state = DA_STATE_PROBE_ATA;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
static void
dadone_probeata(struct cam_periph *periph, union ccb *done_ccb)
{
struct ata_params *ata_params;
struct ccb_scsiio *csio;
struct da_softc *softc;
u_int32_t priority;
int continue_probe;
int error;
int16_t *ptr;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeata\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
ata_params = (struct ata_params *)csio->data_ptr;
ptr = (int16_t *)ata_params;
continue_probe = 0;
error = 0;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint16_t old_rate;
ata_param_fixup(ata_params);
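/*
 * If the drive advertises DSM TRIM, enable ATA TRIM deletes and
 * clamp the number of TRIM ranges to what the drive's DSM block
 * count allows.
 */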
if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM &&
(softc->quirks & DA_Q_NO_UNMAP) == 0) {
dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1);
if (ata_params->max_dsm_blocks != 0)
softc->trim_max_ranges = min(
softc->trim_max_ranges,
ata_params->max_dsm_blocks *
ATA_DSM_BLK_RANGES);
}
/*
* Disable queue sorting for non-rotational media
* by default.
*/
old_rate = softc->disk->d_rotation_rate;
softc->disk->d_rotation_rate = ata_params->media_rotation_rate;
if (softc->disk->d_rotation_rate == ATA_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(softc->cam_iosched, 0);
softc->flags &= ~DA_FLAG_ROTATING;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
cam_periph_assert(periph, MA_OWNED);
if (ata_params->capabilities1 & ATA_SUPPORT_DMA)
softc->flags |= DA_FLAG_CAN_ATA_DMA;
if (ata_params->support.extension & ATA_SUPPORT_GENLOG)
softc->flags |= DA_FLAG_CAN_ATA_LOG;
/*
* At this point, if we have a SATA host aware drive,
* we communicate via ATA passthrough unless the
* SAT layer supports ZBC -> ZAC translation. In that case, we
* use the SAT layer's translation and the SCSI zone commands.
*
* XXX KDM figure out how to detect a host managed
* SATA drive.
*/
if (softc->zone_mode == DA_ZONE_NONE) {
/*
* Note that we don't override the zone
* mode or interface if it has already been
* set. This is because it has either been
* set as a quirk, or when we probed the
* SCSI Block Device Characteristics page,
* the zoned field was set. The latter
* means that the SAT layer supports ZBC to
* ZAC translation, and we would prefer to
* use that if it is available.
*/
if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface =
DA_ZONE_IF_ATA_PASS;
} else if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED) {
softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
softc->zone_interface = DA_ZONE_IF_ATA_PASS;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* If the ATA IDENTIFY failed, we could be talking
* to a SCSI drive, although that seems unlikely,
* since the drive did report that it supported the
* ATA Information VPD page. If the ATA IDENTIFY
* succeeded, and the SAT layer doesn't support
* ZBC -> ZAC translation, continue on to get the
* directory of ATA logs, and complete the rest of
* the ZAC probe. If the SAT layer does support
* ZBC -> ZAC translation, we want to use that,
* and we'll probe the SCSI Zoned Block Device
* Characteristics VPD page next.
*/
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_LOG)
&& (softc->zone_interface == DA_ZONE_IF_ATA_PASS))
softc->state = DA_STATE_PROBE_ATA_LOGDIR;
else
softc->state = DA_STATE_PROBE_ZONE;
continue_probe = 1;
}
if (continue_probe != 0) {
xpt_schedule(periph, priority);
xpt_release_ccb(done_ccb);
return;
} else
daprobedone(periph, done_ccb);
return;
}
static void
dadone_probeatalogdir(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeatalogdir\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len = csio->dxfer_len - csio->resid;
if (softc->valid_logdir_len > 0)
bcopy(csio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= DA_FLAG_CAN_ATA_IDLOG;
} else {
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(DA_FLAG_CAN_ATA_LOG |
DA_FLAG_CAN_ATA_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) {
softc->state = DA_STATE_PROBE_ATA_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
static void
dadone_probeataiddir(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeataiddir\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP |
DA_FLAG_CAN_ATA_ZONE);
softc->valid_iddir_len = csio->dxfer_len - csio->resid;
if (softc->valid_iddir_len > 0)
bcopy(csio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages, entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) == ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries && i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |= DA_FLAG_CAN_ATA_SUPCAP;
else if (softc->ata_iddir.entries[i] ==
ATA_IDL_ZDI)
softc->flags |= DA_FLAG_CAN_ATA_ZONE;
if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP)
&& (softc->flags & DA_FLAG_CAN_ATA_ZONE))
break;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory
* lists a non-zero number of pages present for
* this log.
*/
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) {
softc->state = DA_STATE_PROBE_ATA_SUP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
static void
dadone_probeatasup(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeatasup\n"));
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size = __offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode = DA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
DA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) {
softc->state = DA_STATE_PROBE_ATA_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
static void
dadone_probeatazone(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probeatazone\n"));
softc = (struct da_softc *)periph->softc;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~DA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &= ~DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar = le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &= ~DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~DA_FLAG_CAN_ATA_ZONE;
softc->flags &= ~DA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
daprobedone(periph, done_ccb);
return;
}
static void
dadone_probezone(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
int error;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_probezone\n"));
softc = (struct da_softc *)periph->softc;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_len;
struct scsi_vpd_zoned_bdc *zoned_bdc;
error = 0;
zoned_bdc = (struct scsi_vpd_zoned_bdc *)csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_len = __offsetof(struct scsi_vpd_zoned_bdc,
max_seq_req_zones) + 1 +
sizeof(zoned_bdc->max_seq_req_zones);
if ((valid_len >= needed_len)
&& (scsi_2btoul(zoned_bdc->page_length) >= SVPD_ZBDC_PL)) {
if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ)
softc->zone_flags |= DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &= ~DA_ZONE_FLAG_URSWRZ;
softc->optimal_seq_zones =
scsi_4btoul(zoned_bdc->optimal_seq_zones);
softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_nonseq_zones = scsi_4btoul(
zoned_bdc->optimal_nonseq_zones);
softc->zone_flags |= DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->max_seq_zones =
scsi_4btoul(zoned_bdc->max_seq_req_zones);
softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET;
}
/*
* All of the zone commands are mandatory for SCSI
* devices.
*
* XXX KDM this is valid as of September 2015.
* Re-check this assumption once the SAT spec is
* updated to support SCSI ZBC to ATA ZAC mapping.
* Since ATA allows zone commands to be reported
* as supported or not, this may not necessarily
* be true for an ATA device behind a SAT (SCSI to
* ATA Translation) layer.
*/
softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK;
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
daprobedone(periph, done_ccb);
return;
}
static void
dadone_tur(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone_tur\n"));
softc = (struct da_softc *)periph->softc;
csio = &done_ccb->csio;
cam_periph_assert(periph, MA_OWNED);
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) == ERESTART)
return; /* Will complete again, keep reference */
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
softc->flags &= ~DA_FLAG_TUR_PENDING;
xpt_release_ccb(done_ccb);
da_periph_release_locked(periph, DA_REF_TUR);
return;
}
static void
dareprobe(struct cam_periph *periph)
{
struct da_softc *softc;
int status;
softc = (struct da_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
/* Probe in progress; don't interfere. */
if (softc->state != DA_STATE_NORMAL)
return;
status = da_periph_acquire(periph, DA_REF_REPROBE);
KASSERT(status == 0, ("dareprobe: cam_periph_acquire failed"));
softc->state = DA_STATE_PROBE_WP;
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
static int
daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct da_softc *softc;
struct cam_periph *periph;
int error, error_code, sense_key, asc, ascq;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->csio.bio != NULL)
biotrack(ccb->csio.bio, __func__);
#endif
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct da_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
/*
* Automatically detect devices that do not support
* READ(6)/WRITE(6) and upgrade to using 10 byte cdbs.
*/
error = 0;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
error = cmd6workaround(ccb);
} else if (scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (sense_key == SSD_KEY_ILLEGAL_REQUEST)
error = cmd6workaround(ccb);
/*
* If the target replied with CAPACITY DATA HAS CHANGED UA,
* query the capacity and notify upper layers.
*/
else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x2A && ascq == 0x09) {
xpt_print(periph->path, "Capacity data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x28 && ascq == 0x00) {
softc->flags &= ~DA_FLAG_PROBED;
disk_media_changed(softc->disk, M_NOWAIT);
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x3F && ascq == 0x03) {
xpt_print(periph->path, "INQUIRY data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_NOT_READY &&
asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
softc->flags |= DA_FLAG_PACK_INVALID;
disk_media_gone(softc->disk, M_NOWAIT);
}
}
if (error == ERESTART)
return (ERESTART);
#ifdef CAM_IO_STATS
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
softc->errors++;
break;
default:
break;
}
#endif
/*
* XXX
* Until we have a better way of doing pack validation,
* don't treat UAs as errors.
*/
sense_flags |= SF_RETRY_UA;
if (softc->quirks & DA_Q_RETRY_BUSY)
sense_flags |= SF_RETRY_BUSY;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
damediapoll(void *arg)
{
struct cam_periph *periph = arg;
struct da_softc *softc = periph->softc;
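/*
 * Queue a TEST UNIT READY only when the device is idle so that
 * media changes are still noticed on an otherwise quiet device.
 */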
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) &&
(softc->flags & DA_FLAG_TUR_PENDING) == 0 &&
softc->state == DA_STATE_NORMAL &&
LIST_EMPTY(&softc->pending_ccbs)) {
if (da_periph_acquire(periph, DA_REF_TUR) == 0) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* Queue us up again */
if (da_poll_period != 0)
callout_schedule(&softc->mediapoll_c, da_poll_period * hz);
}
static void
daprevent(struct cam_periph *periph, int action)
{
struct da_softc *softc;
union ccb *ccb;
int error;
cam_periph_assert(periph, MA_OWNED);
softc = (struct da_softc *)periph->softc;
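/* Nothing to do if the pack is already in the requested state. */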
if (((action == PR_ALLOW)
&& (softc->flags & DA_FLAG_PACK_LOCKED) == 0)
|| ((action == PR_PREVENT)
&& (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) {
return;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_prevent(&ccb->csio,
/*retries*/1,
/*cbcfp*/NULL,
MSG_SIMPLE_Q_TAG,
action,
SSD_FULL_SIZE,
5000);
error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat);
if (error == 0) {
if (action == PR_ALLOW)
softc->flags &= ~DA_FLAG_PACK_LOCKED;
else
softc->flags |= DA_FLAG_PACK_LOCKED;
}
xpt_release_ccb(ccb);
}
static void
dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len)
{
struct ccb_calc_geometry ccg;
struct da_softc *softc;
struct disk_params *dp;
u_int lbppbe, lalba;
int error;
softc = (struct da_softc *)periph->softc;
dp = &softc->params;
dp->secsize = block_len;
dp->sectors = maxsector + 1;
if (rcaplong != NULL) {
lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
lalba = scsi_2btoul(rcaplong->lalba_lbp);
lalba &= SRC16_LALBA_A;
if (rcaplong->prot & SRC16_PROT_EN)
softc->p_type = ((rcaplong->prot & SRC16_P_TYPE) >>
SRC16_P_TYPE_SHIFT) + 1;
else
softc->p_type = 0;
} else {
lbppbe = 0;
lalba = 0;
softc->p_type = 0;
}
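/*
 * Derive the stripe size: prefer the physical block exponent from
 * READ CAPACITY(16), then the 4K quirk, then the UNMAP granularity.
 */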
if (lbppbe > 0) {
dp->stripesize = block_len << lbppbe;
dp->stripeoffset = (dp->stripesize - block_len * lalba) %
dp->stripesize;
} else if (softc->quirks & DA_Q_4K) {
dp->stripesize = 4096;
dp->stripeoffset = 0;
} else if (softc->unmap_gran != 0) {
dp->stripesize = block_len * softc->unmap_gran;
dp->stripeoffset = (dp->stripesize - block_len *
softc->unmap_gran_align) % dp->stripesize;
} else {
dp->stripesize = 0;
dp->stripeoffset = 0;
}
/*
* Have the controller provide us with a geometry
* for this disk. The only time the geometry
* matters is when we boot and the controller
* is the only one knowledgeable enough to come
* up with something that will make this a bootable
* device.
*/
xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ccg.ccb_h.func_code = XPT_CALC_GEOMETRY;
ccg.block_size = dp->secsize;
ccg.volume_size = dp->sectors;
ccg.heads = 0;
ccg.secs_per_track = 0;
ccg.cylinders = 0;
xpt_action((union ccb*)&ccg);
if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
/*
* We don't know what went wrong here- but just pick
* a geometry so we don't have nasty things like divide
* by zero.
*/
dp->heads = 255;
dp->secs_per_track = 255;
dp->cylinders = dp->sectors / (255 * 255);
if (dp->cylinders == 0) {
dp->cylinders = 1;
}
} else {
dp->heads = ccg.heads;
dp->secs_per_track = ccg.secs_per_track;
dp->cylinders = ccg.cylinders;
}
/*
* If the user supplied a read capacity buffer, and if it is
* different than the previous buffer, update the data in the EDT.
* If it's the same, we don't bother. This avoids sending an
* update every time someone opens this device.
*/
if ((rcaplong != NULL)
&& (bcmp(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len)) != 0)) {
struct ccb_dev_advinfo cdai;
xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
cdai.buftype = CDAI_TYPE_RCAPLONG;
cdai.flags = CDAI_FLAG_STORE;
cdai.bufsiz = rcap_len;
cdai.buf = (uint8_t *)rcaplong;
xpt_action((union ccb *)&cdai);
if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
if (cdai.ccb_h.status != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: failed to set read "
"capacity advinfo\n", __func__);
/* Use cam_error_print() to decode the status */
cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS,
CAM_EPF_ALL);
} else {
bcopy(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len));
}
}
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors;
softc->disk->d_stripesize = softc->params.stripesize;
softc->disk->d_stripeoffset = softc->params.stripeoffset;
/* XXX: these are not actually "firmware" values, so they may be wrong */
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
softc->disk->d_devstat->block_size = softc->params.secsize;
softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
error = disk_resize(softc->disk, M_NOWAIT);
if (error != 0)
xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error);
}
static void
dasendorderedtag(void *arg)
{
struct cam_periph *periph = arg;
struct da_softc *softc = periph->softc;
cam_periph_assert(periph, MA_OWNED);
if (da_send_ordered) {
if (!LIST_EMPTY(&softc->pending_ccbs)) {
if ((softc->flags & DA_FLAG_WAS_OTAG) == 0)
softc->flags |= DA_FLAG_NEED_OTAG;
softc->flags &= ~DA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, periph);
}
/*
* Step through all DA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
dashutdown(void * arg, int howto)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &dadriver) {
softc = (struct da_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & DA_FLAG_OPEN)) {
dadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & DA_FLAG_OPEN) == 0)
|| (softc->quirks & DA_Q_NO_SYNC_CACHE)) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio,
/*retries*/0,
/*cbfcnp*/NULL,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /* whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
60 * 60 * 1000);
error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
#else /* !_KERNEL */
/*
* XXX These are only left out of the kernel build to silence warnings. If,
* for some reason, these functions are used in the kernel, the ifdefs should
* be moved so they are included both in the kernel and userland.
*/
void
scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_format_unit *scsi_cmd;
scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = FORMAT_UNIT;
scsi_cmd->byte2 = byte2;
scsi_ulto2b(ileave, scsi_cmd->interleave);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t list_format,
uint32_t addr_desc_index, uint8_t *data_ptr,
uint32_t dxfer_len, int minimum_cmd_size,
uint8_t sense_len, uint32_t timeout)
{
uint8_t cdb_len;
/*
* These conditions allow using the 10 byte command. Otherwise we
* need to use the 12 byte command.
*/
if ((minimum_cmd_size <= 10)
&& (addr_desc_index == 0)
&& (dxfer_len <= SRDD10_MAX_LENGTH)) {
struct scsi_read_defect_data_10 *cdb10;
cdb10 = (struct scsi_read_defect_data_10 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb10);
bzero(cdb10, cdb_len);
cdb10->opcode = READ_DEFECT_DATA_10;
cdb10->format = list_format;
scsi_ulto2b(dxfer_len, cdb10->alloc_length);
} else {
struct scsi_read_defect_data_12 *cdb12;
cdb12 = (struct scsi_read_defect_data_12 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb12);
bzero(cdb12, cdb_len);
cdb12->opcode = READ_DEFECT_DATA_12;
cdb12->format = list_format;
scsi_ulto4b(dxfer_len, cdb12->alloc_length);
scsi_ulto4b(addr_desc_index, cdb12->address_descriptor_index);
}
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
data_ptr,
dxfer_len,
sense_len,
cdb_len,
timeout);
}
void
scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t control,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_sanitize *scsi_cmd;
scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = SANITIZE;
scsi_cmd->byte2 = byte2;
scsi_cmd->control = control;
scsi_ulto2b(dxfer_len, scsi_cmd->length);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
#endif /* _KERNEL */
void
scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_id,
uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_out *scsi_cmd;
scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_OUT;
scsi_cmd->service_action = service_action;
scsi_u64to8b(zone_id, scsi_cmd->zone_id);
scsi_cmd->zone_flags = zone_flags;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba,
uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_in *scsi_cmd;
scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_IN;
scsi_cmd->service_action = service_action;
scsi_ulto4b(dxfer_len, scsi_cmd->length);
scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba);
scsi_cmd->zone_options = zone_options;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_IN : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
int
scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol, ata_flags;
uint16_t features_out;
uint32_t sectors_out, auxiliary;
int retval;
retval = 0;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_OUT;
features_out = (zm_action & 0xf) | (zone_flags << 8);
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
protocol = AP_PROTO_NON_DATA;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
sectors_out = 0;
} else {
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT |
AP_FLAG_TDIR_TO_DEV;
sectors_out = ((dxfer_len >> 9) & 0xffff);
}
auxiliary = 0;
} else {
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
command_out = ATA_NCQ_NON_DATA;
features_out = ATA_NCQ_ZAC_MGMT_OUT;
/*
* We're assuming the SCSI to ATA translation layer
* will set the NCQ tag number in the tag field.
* That isn't clear from the SAT-4 spec (as of rev 05).
*/
sectors_out = 0;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
} else {
command_out = ATA_SEND_FPDMA_QUEUED;
/*
* Note that we're defaulting to normal priority,
* and assuming that the SCSI to ATA translation
* layer will insert the NCQ tag number in the tag
* field. That isn't clear in the SAT-4 spec (as
* of rev 05).
*/
sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8;
ata_flags |= AP_FLAG_TLEN_FEAT |
AP_FLAG_TDIR_TO_DEV;
/*
* For SEND FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse the
* SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
int
scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol;
uint16_t features_out, sectors_out;
uint32_t auxiliary;
int ata_flags;
int retval;
retval = 0;
ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_IN;
/* XXX KDM put a macro here */
features_out = (zm_action & 0xf) | (zone_flags << 8);
sectors_out = dxfer_len >> 9; /* XXX KDM macro */
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT;
auxiliary = 0;
} else {
ata_flags |= AP_FLAG_TLEN_FEAT;
command_out = ATA_RECV_FPDMA_QUEUED;
sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8;
/*
* For RECEIVE FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse the
* SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c
index 79e86ccaf267..146ff8f26700 100644
--- a/sys/cam/scsi/scsi_pass.c
+++ b/sys/cam/scsi/scsi_pass.c
@@ -1,2250 +1,2250 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, 1998, 2000 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/devicestat.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/sdt.h>
#include <sys/sysent.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <machine/bus.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_xpt.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_debug.h>
#include <cam/cam_compat.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_pass.h>
typedef enum {
PASS_FLAG_OPEN = 0x01,
PASS_FLAG_LOCKED = 0x02,
PASS_FLAG_INVALID = 0x04,
PASS_FLAG_INITIAL_PHYSPATH = 0x08,
PASS_FLAG_ZONE_INPROG = 0x10,
PASS_FLAG_ZONE_VALID = 0x20,
PASS_FLAG_UNMAPPED_CAPABLE = 0x40,
PASS_FLAG_ABANDONED_REF_SET = 0x80
} pass_flags;
typedef enum {
PASS_STATE_NORMAL
} pass_state;
typedef enum {
PASS_CCB_BUFFER_IO,
PASS_CCB_QUEUED_IO
} pass_ccb_types;
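/*
 * Per-CCB peripheral private fields: ccb_type distinguishes the two
 * pass(4) CCB types declared above, and ccb_ioreq points a queued CCB
 * back at its owning pass_io_req (see passstart() and passdone()).
 */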
#define ccb_type ppriv_field0
#define ccb_ioreq ppriv_ptr1
/*
* The maximum number of memory segments we preallocate.
*/
#define PASS_MAX_SEGS 16
typedef enum {
PASS_IO_NONE = 0x00,
PASS_IO_USER_SEG_MALLOC = 0x01,
PASS_IO_KERN_SEG_MALLOC = 0x02,
PASS_IO_ABANDONED = 0x04
} pass_io_flags;
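/*
 * State for a single asynchronous (CAMIOQUEUE) request.  The user's CCB
 * is copied into 'ccb', the original user pointers and S/G lists are
 * saved so they can be restored at CAMIOGET time, and 'links' threads
 * the request onto the softc's incoming, active, done, or abandoned
 * queue.
 */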
struct pass_io_req {
union ccb ccb;
union ccb *alloced_ccb;
union ccb *user_ccb_ptr;
camq_entry user_periph_links;
ccb_ppriv_area user_periph_priv;
struct cam_periph_map_info mapinfo;
pass_io_flags flags;
ccb_flags data_flags;
int num_user_segs;
bus_dma_segment_t user_segs[PASS_MAX_SEGS];
int num_kern_segs;
bus_dma_segment_t kern_segs[PASS_MAX_SEGS];
bus_dma_segment_t *user_segptr;
bus_dma_segment_t *kern_segptr;
int num_bufs;
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint8_t *user_bufs[CAM_PERIPH_MAXMAPS];
uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS];
struct bintime start_time;
TAILQ_ENTRY(pass_io_req) links;
};
struct pass_softc {
pass_state state;
pass_flags flags;
u_int8_t pd_type;
union ccb saved_ccb;
int open_count;
u_int maxio;
struct devstat *device_stats;
struct cdev *dev;
struct cdev *alias_dev;
struct task add_physpath_task;
struct task shutdown_kqueue_task;
struct selinfo read_select;
TAILQ_HEAD(, pass_io_req) incoming_queue;
TAILQ_HEAD(, pass_io_req) active_queue;
TAILQ_HEAD(, pass_io_req) abandoned_queue;
TAILQ_HEAD(, pass_io_req) done_queue;
struct cam_periph *periph;
char zone_name[12];
char io_zone_name[12];
uma_zone_t pass_zone;
uma_zone_t pass_io_zone;
size_t io_zone_size;
};
static d_open_t passopen;
static d_close_t passclose;
static d_ioctl_t passioctl;
static d_ioctl_t passdoioctl;
static d_poll_t passpoll;
static d_kqfilter_t passkqfilter;
static void passreadfiltdetach(struct knote *kn);
static int passreadfilt(struct knote *kn, long hint);
static periph_init_t passinit;
static periph_ctor_t passregister;
static periph_oninv_t passoninvalidate;
static periph_dtor_t passcleanup;
static periph_start_t passstart;
static void pass_shutdown_kqueue(void *context, int pending);
static void pass_add_physpath(void *context, int pending);
static void passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void passdone(struct cam_periph *periph,
union ccb *done_ccb);
static int passcreatezone(struct cam_periph *periph);
static void passiocleanup(struct pass_softc *softc,
struct pass_io_req *io_req);
static int passcopysglist(struct cam_periph *periph,
struct pass_io_req *io_req,
ccb_flags direction);
static int passmemsetup(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passmemdone(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int passsendccb(struct cam_periph *periph, union ccb *ccb,
union ccb *inccb);
static struct periph_driver passdriver =
{
passinit, "pass",
TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(pass, passdriver);
static struct cdevsw pass_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE,
.d_open = passopen,
.d_close = passclose,
.d_ioctl = passioctl,
.d_poll = passpoll,
.d_kqfilter = passkqfilter,
.d_name = "pass",
};
static struct filterops passread_filtops = {
.f_isfd = 1,
.f_detach = passreadfiltdetach,
.f_event = passreadfilt
};
static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers");
static void
passinit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("pass: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
static void
passrejectios(struct cam_periph *periph)
{
struct pass_io_req *io_req, *io_req2;
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* The user can no longer get status for I/O on the done queue, so
* clean up all outstanding I/O on the done queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* The underlying device is gone, so we can't issue these I/Os.
* The devfs node has been shut down, so we can't return status to
* the user. Free any I/O left on the incoming queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* Normally we would put I/Os on the abandoned queue and acquire a
* reference when we saw the final close. But, the device went
* away and devfs may have moved everything off to deadfs by the
* time the I/O done callback is called; as a result, we won't see
* any more closes. So, if we have any active I/Os, we need to put
* them on the abandoned queue. When the abandoned queue is empty,
* we'll release the remaining reference (see below) to the peripheral.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links);
}
/*
* If we put any I/O on the abandoned queue, acquire a reference.
*/
if ((!TAILQ_EMPTY(&softc->abandoned_queue))
&& ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
}
static void
passdevgonecb(void *arg)
{
struct cam_periph *periph;
struct mtx *mtx;
struct pass_softc *softc;
int i;
periph = (struct cam_periph *)arg;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = (struct pass_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
* Accordingly, inform all queued I/Os of their fate.
*/
cam_periph_release_locked(periph);
passrejectios(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(mtx);
/*
* We have to tear down our kqueue context from a taskqueue thread
* because the teardown may sleep.  It would be nice if we could get
* a callback from
* kqueue when it is done cleaning up resources.
*/
taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task);
}
static void
passoninvalidate(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, passasync, periph, periph->path);
softc->flags |= PASS_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, passdevgonecb, periph);
}
static void
passcleanup(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(TAILQ_EMPTY(&softc->active_queue),
("%s called when there are commands on the active queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->abandoned_queue),
("%s called when there are commands on the abandoned queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->incoming_queue),
("%s called when there are commands on the incoming queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->done_queue),
("%s called when there are commands on the done queue!\n",
__func__));
devstat_remove_entry(softc->device_stats);
cam_periph_unlock(periph);
/*
* We call taskqueue_drain() for the physpath task to make sure it
* is complete. We drop the lock because this can potentially
* sleep. XXX KDM that is bad. Need a way to get a callback when
* a taskqueue is drained.
*
* Note that we don't drain the kqueue shutdown task queue. This
* is because we hold a reference on the periph for kqueue, and
* release that reference from the kqueue shutdown task queue. So
* we cannot come into this routine unless we've released that
* reference. Also, because that could be the last reference, we
* could be called from the cam_periph_release() call in
* pass_shutdown_kqueue(). In that case, the taskqueue_drain()
* would deadlock. It would be preferable if we had a way to
* get a callback when a taskqueue is done.
*/
taskqueue_drain(taskqueue_thread, &softc->add_physpath_task);
cam_periph_lock(periph);
free(softc, M_DEVBUF);
}
static void
pass_shutdown_kqueue(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = context;
softc = periph->softc;
knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0);
knlist_destroy(&softc->read_select.si_note);
/*
* Release the reference we held for kqueue.
*/
cam_periph_release(periph);
}
static void
pass_add_physpath(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
char *physpath;
/*
* If we have one, create a devfs alias for our
* physical path.
*/
periph = context;
softc = periph->softc;
physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
if (periph->flags & CAM_PERIPH_INVALID)
goto out;
if (xpt_getattr(physpath, MAXPATHLEN,
"GEOM::physpath", periph->path) == 0
&& strlen(physpath) != 0) {
mtx_unlock(mtx);
make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev,
softc->dev, softc->alias_dev, physpath);
mtx_lock(mtx);
}
out:
/*
* Now that we've made our alias, we no longer have to have a
* reference to the device.
*/
if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0)
softc->flags |= PASS_FLAG_INITIAL_PHYSPATH;
/*
* We always acquire a reference to the periph before queueing this
* task queue function, so it won't go away before we run.
*/
while (pending-- > 0)
cam_periph_release_locked(periph);
mtx_unlock(mtx);
free(physpath, M_DEVBUF);
}
static void
passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(passregister, passoninvalidate,
passcleanup, passstart, "pass",
CAM_PERIPH_BIO, path,
passasync, AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG) {
const struct cam_status_entry *entry;
entry = cam_fetch_status_entry(status);
printf("passasync: Unable to attach new device "
"due to status %#x: %s\n", status, entry ?
entry->status_text : "Unknown");
}
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* Acquire a reference to the periph before we
* start the taskqueue, so that we don't run into
* a situation where the periph goes away before
* the task queue has a chance to run.
*/
if (cam_periph_acquire(periph) != 0)
break;
taskqueue_enqueue(taskqueue_thread,
&softc->add_physpath_task);
}
break;
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static cam_status
passregister(struct cam_periph *periph, void *arg)
{
struct pass_softc *softc;
struct ccb_getdev *cgd;
struct ccb_pathinq cpi;
struct make_dev_args args;
int error, no_tags;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("%s: no getdev CCB, can't register device\n", __func__);
return(CAM_REQ_CMP_ERR);
}
softc = (struct pass_softc *)malloc(sizeof(*softc),
M_DEVBUF, M_NOWAIT);
if (softc == NULL) {
printf("%s: Unable to probe new device. "
"Unable to allocate softc\n", __func__);
return(CAM_REQ_CMP_ERR);
}
bzero(softc, sizeof(*softc));
softc->state = PASS_STATE_NORMAL;
if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI)
softc->pd_type = SID_TYPE(&cgd->inq_data);
else if (cgd->protocol == PROTO_SATAPM)
softc->pd_type = T_ENCLOSURE;
else
softc->pd_type = T_DIRECT;
periph->softc = softc;
softc->periph = periph;
TAILQ_INIT(&softc->incoming_queue);
TAILQ_INIT(&softc->active_queue);
TAILQ_INIT(&softc->abandoned_queue);
TAILQ_INIT(&softc->done_queue);
snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d",
periph->periph_name, periph->unit_number);
snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO",
periph->periph_name, periph->unit_number);
- softc->io_zone_size = MAXPHYS;
+ softc->io_zone_size = maxphys;
knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph));
xpt_path_inq(&cpi, periph->path);
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
- else if (cpi.maxio > MAXPHYS)
- softc->maxio = MAXPHYS; /* for safety */
+ else if (cpi.maxio > maxphys)
+ softc->maxio = maxphys; /* for safety */
else
softc->maxio = cpi.maxio; /* real value */
if (cpi.hba_misc & PIM_UNMAPPED)
softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE;
/*
* We pass in 0 for a blocksize, since we don't
* know what the blocksize of this device is, if
* it even has a blocksize.
*/
cam_periph_unlock(periph);
no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0;
softc->device_stats = devstat_new_entry("pass",
periph->unit_number, 0,
DEVSTAT_NO_BLOCKSIZE
| (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0),
softc->pd_type |
XPORT_DEVSTAT_TYPE(cpi.transport) |
DEVSTAT_TYPE_PASS,
DEVSTAT_PRIORITY_PASS);
/*
* Initialize the taskqueue handler for shutting down kqueue.
*/
TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0,
pass_shutdown_kqueue, periph);
/*
* Acquire a reference to the periph that we can release once we've
* cleaned up the kqueue.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */
make_dev_args_init(&args);
args.mda_devsw = &pass_cdevsw;
args.mda_unit = periph->unit_number;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0600;
args.mda_si_drv1 = periph;
args.mda_flags = MAKEDEV_NOWAIT;
error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name,
periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
cam_periph_release_locked(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Hold a reference to the periph before we create the physical
* path alias so it can't go away.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
cam_periph_lock(periph);
TASK_INIT(&softc->add_physpath_task, /*priority*/0,
pass_add_physpath, periph);
/*
* See if physical path information is already available.
*/
taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task);
/*
* Add an async callback so that we get notified if
* this device goes away or its physical path
* (stored in the advanced info data of the EDT) has
* changed.
*/
xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
passasync, periph, periph->path);
if (bootverbose)
xpt_announce_periph(periph, NULL);
return(CAM_REQ_CMP);
}
static int
passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
periph = (struct cam_periph *)dev->si_drv1;
if (cam_periph_acquire(periph) != 0)
return (ENXIO);
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
if (softc->flags & PASS_FLAG_INVALID) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(ENXIO);
}
/*
* Don't allow access when we're running at a high securelevel.
*/
error = securelevel_gt(td->td_ucred, 1);
if (error) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(error);
}
/*
* Only allow read-write access.
*/
if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EPERM);
}
/*
* We don't allow nonblocking access.
*/
if ((flags & O_NONBLOCK) != 0) {
xpt_print(periph->path, "can't do nonblocking access\n");
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EINVAL);
}
softc->open_count++;
cam_periph_unlock(periph);
return (error);
}
static int
passclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
periph = (struct cam_periph *)dev->si_drv1;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = periph->softc;
softc->open_count--;
if (softc->open_count == 0) {
struct pass_io_req *io_req, *io_req2;
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* If there are any active I/Os, we need to forcibly acquire a
* reference to the peripheral so that we don't go away
* before they complete. We'll release the reference when
* the abandoned queue is empty.
*/
io_req = TAILQ_FIRST(&softc->active_queue);
if ((io_req != NULL)
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
/*
* Since the I/O in the active queue is not under our
* control, just set a flag so that we can clean it up when
* it completes and put it on the abandoned queue. This
* will prevent our sending spurious completions in the
* event that the device is opened again before these I/Os
* complete.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req,
links);
}
}
cam_periph_release_locked(periph);
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(mtx);
return (0);
}
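/*
 * Called by the transport layer when a CCB slot is available.  Pull the
 * next request off the incoming queue, merge the user's CCB into the
 * newly allocated CCB, and dispatch it; reschedule ourselves if more
 * requests are waiting.
 */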
static void
passstart(struct cam_periph *periph, union ccb *start_ccb)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
switch (softc->state) {
case PASS_STATE_NORMAL: {
struct pass_io_req *io_req;
/*
* Check for any queued I/O requests that require an
* allocated slot.
*/
io_req = TAILQ_FIRST(&softc->incoming_queue);
if (io_req == NULL) {
xpt_release_ccb(start_ccb);
break;
}
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
/*
* Merge the user's CCB into the allocated CCB.
*/
xpt_merge_ccb(start_ccb, &io_req->ccb);
start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO;
start_ccb->ccb_h.ccb_ioreq = io_req;
start_ccb->ccb_h.cbfcnp = passdone;
io_req->alloced_ccb = start_ccb;
binuptime(&io_req->start_time);
devstat_start_transaction(softc->device_stats,
&io_req->start_time);
xpt_action(start_ccb);
/*
* If we have any more I/O waiting, schedule ourselves again.
*/
if (!TAILQ_EMPTY(&softc->incoming_queue))
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
break;
}
default:
break;
}
}
static void
passdone(struct cam_periph *periph, union ccb *done_ccb)
{
struct pass_softc *softc;
struct ccb_scsiio *csio;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
csio = &done_ccb->csio;
switch (csio->ccb_h.ccb_type) {
case PASS_CCB_QUEUED_IO: {
struct pass_io_req *io_req;
io_req = done_ccb->ccb_h.ccb_ioreq;
#if 0
xpt_print(periph->path, "%s: called for user CCB %p\n",
__func__, io_req->user_ccb_ptr);
#endif
if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
&& (done_ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER)
&& ((io_req->flags & PASS_IO_ABANDONED) == 0)) {
int error;
error = passerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
}
}
/*
* Copy the allocated CCB contents back to the malloced CCB
* so we can give status back to the user when he requests it.
*/
bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb));
/*
* Log data/transaction completion with devstat(9).
*/
switch (done_ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->csio.dxfer_len - done_ccb->csio.resid,
done_ccb->csio.tag_action & 0x3,
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_ATA_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->ataio.dxfer_len - done_ccb->ataio.resid,
0, /* Not used in ATA */
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_SMP_IO:
/*
* XXX KDM this isn't quite right, but there isn't
* currently an easy way to represent a bidirectional
* transfer in devstat. The only way to do it
* and have the byte counts come out right would
* mean that we would have to record two
* transactions, one for the request and one for the
* response. For now, so that we report something,
* just treat the entire thing as a read.
*/
devstat_end_transaction(softc->device_stats,
done_ccb->smpio.smp_request_len +
done_ccb->smpio.smp_response_len,
DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL,
&io_req->start_time);
break;
default:
devstat_end_transaction(softc->device_stats, 0,
DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL,
&io_req->start_time);
break;
}
/*
* In the normal case, take the completed I/O off of the
* active queue and put it on the done queue. Notify the
* user that we have a completed I/O.
*/
if ((io_req->flags & PASS_IO_ABANDONED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
selwakeuppri(&softc->read_select, PRIBIO);
KNOTE_LOCKED(&softc->read_select.si_note, 0);
} else {
/*
* In the case of an abandoned I/O (final close
* without fetching the I/O), take it off of the
* abandoned queue and free it.
*/
TAILQ_REMOVE(&softc->abandoned_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
/*
* Release the done_ccb here, since we may wind up
* freeing the peripheral when we decrement the
* reference count below.
*/
xpt_release_ccb(done_ccb);
/*
* If the abandoned queue is empty, we can release
* our reference to the periph since we won't have
* any more completions coming.
*/
if ((TAILQ_EMPTY(&softc->abandoned_queue))
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) {
softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET;
cam_periph_release_locked(periph);
}
/*
* We have already released the CCB, so we can
* return.
*/
return;
}
break;
}
}
xpt_release_ccb(done_ccb);
}
static int
passcreatezone(struct cam_periph *periph)
{
struct pass_softc *softc;
int error;
error = 0;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0),
("%s called when the pass(4) zone is valid!\n", __func__));
KASSERT((softc->pass_zone == NULL),
("%s called when the pass(4) zone is allocated!\n", __func__));
if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) {
/*
* We're the first context through, so we need to create
* the pass(4) UMA zone for I/O requests.
*/
softc->flags |= PASS_FLAG_ZONE_INPROG;
/*
* uma_zcreate() does a blocking (M_WAITOK) allocation,
* so we cannot hold a mutex while we call it.
*/
cam_periph_unlock(periph);
softc->pass_zone = uma_zcreate(softc->zone_name,
sizeof(struct pass_io_req), NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
softc->pass_io_zone = uma_zcreate(softc->io_zone_name,
softc->io_zone_size, NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
cam_periph_lock(periph);
if ((softc->pass_zone == NULL)
|| (softc->pass_io_zone == NULL)) {
if (softc->pass_zone == NULL)
xpt_print(periph->path, "unable to allocate "
"IO Req UMA zone\n");
else
xpt_print(periph->path, "unable to allocate "
"IO UMA zone\n");
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
goto bailout;
}
/*
* Set the flags appropriately and notify any other waiters.
*/
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
softc->flags |= PASS_FLAG_ZONE_VALID;
wakeup(&softc->pass_zone);
} else {
/*
* In this case, the UMA zone has not yet been created, but
* another context is in the process of creating it. We
* need to sleep until the creation is either done or has
* failed.
*/
while ((softc->flags & PASS_FLAG_ZONE_INPROG)
&& ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) {
error = msleep(&softc->pass_zone,
cam_periph_mtx(periph), PRIBIO,
"paszon", 0);
if (error != 0)
goto bailout;
}
/*
* If the zone creation failed, no luck for the user.
*/
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = ENOMEM;
goto bailout;
}
}
bailout:
return (error);
}
static void
passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req)
{
union ccb *ccb;
u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
int i, numbufs;
ccb = &io_req->ccb;
switch (ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
numbufs = min(io_req->num_bufs, 2);
if (numbufs == 1) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
}
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
data_ptrs[0] = &ccb->csio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_ATA_IO:
data_ptrs[0] = &ccb->ataio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_SMP_IO:
numbufs = min(io_req->num_bufs, 2);
data_ptrs[0] = &ccb->smpio.smp_request;
data_ptrs[1] = &ccb->smpio.smp_response;
break;
case XPT_DEV_ADVINFO:
numbufs = min(io_req->num_bufs, 1);
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
break;
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
data_ptrs[0] = &ccb->nvmeio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
default:
/* allow ourselves to be swapped once again */
return;
break; /* NOTREACHED */
}
if (io_req->flags & PASS_IO_USER_SEG_MALLOC) {
free(io_req->user_segptr, M_SCSIPASS);
io_req->user_segptr = NULL;
}
/*
* We only want to free memory we malloced.
*/
if (io_req->data_flags == CAM_DATA_VADDR) {
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->kern_bufs[i] == NULL)
continue;
free(io_req->kern_bufs[i], M_SCSIPASS);
io_req->kern_bufs[i] = NULL;
}
} else if (io_req->data_flags == CAM_DATA_SG) {
for (i = 0; i < io_req->num_kern_segs; i++) {
if ((uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr == NULL)
continue;
uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr);
io_req->kern_segptr[i].ds_addr = 0;
}
}
if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) {
free(io_req->kern_segptr, M_SCSIPASS);
io_req->kern_segptr = NULL;
}
if (io_req->data_flags != CAM_DATA_PADDR) {
for (i = 0; i < numbufs; i++) {
/*
* Restore the user's buffer pointers to their
* previous values.
*/
if (io_req->user_bufs[i] != NULL)
*data_ptrs[i] = io_req->user_bufs[i];
}
}
}
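/*
 * Copy data between the user's S/G list and the kernel's S/G list.  The
 * two lists may use different segment sizes, so walk both with
 * per-segment watermarks, copying the overlap of the current user and
 * kernel segments on each pass (copyout for reads, copyin for writes).
 */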
static int
passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req,
ccb_flags direction)
{
bus_size_t kern_watermark, user_watermark, len_copied, len_to_copy;
bus_dma_segment_t *user_sglist, *kern_sglist;
int i, j, error;
error = 0;
kern_watermark = 0;
user_watermark = 0;
len_to_copy = 0;
len_copied = 0;
user_sglist = io_req->user_segptr;
kern_sglist = io_req->kern_segptr;
for (i = 0, j = 0; i < io_req->num_user_segs &&
j < io_req->num_kern_segs;) {
uint8_t *user_ptr, *kern_ptr;
len_to_copy = min(user_sglist[i].ds_len - user_watermark,
kern_sglist[j].ds_len - kern_watermark);
user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr;
user_ptr = user_ptr + user_watermark;
kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr;
kern_ptr = kern_ptr + kern_watermark;
user_watermark += len_to_copy;
kern_watermark += len_to_copy;
if (direction == CAM_DIR_IN) {
error = copyout(kern_ptr, user_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyout of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
kern_ptr, user_ptr, error);
goto bailout;
}
} else {
error = copyin(user_ptr, kern_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyin of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
user_ptr, kern_ptr, error);
goto bailout;
}
}
len_copied += len_to_copy;
if (user_sglist[i].ds_len == user_watermark) {
i++;
user_watermark = 0;
}
if (kern_sglist[j].ds_len == kern_watermark) {
j++;
kern_watermark = 0;
}
}
bailout:
return (error);
}
static int
passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req)
{
union ccb *ccb;
struct pass_softc *softc;
int numbufs, i;
uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t num_segs;
uint16_t *seg_cnt_ptr;
size_t maxmap;
int error;
cam_periph_assert(periph, MA_NOTOWNED);
softc = periph->softc;
error = 0;
ccb = &io_req->ccb;
maxmap = 0;
num_segs = 0;
seg_cnt_ptr = NULL;
switch(ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
if (ccb->cdm.match_buf_len == 0) {
printf("%s: invalid match buffer length 0\n", __func__);
return(EINVAL);
}
if (ccb->cdm.pattern_buf_len > 0) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
lengths[0] = ccb->cdm.pattern_buf_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
lengths[1] = ccb->cdm.match_buf_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
lengths[0] = ccb->cdm.match_buf_len;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
}
io_req->data_flags = CAM_DATA_VADDR;
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* The user shouldn't be able to supply a bio.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
return (EINVAL);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->csio.data_ptr;
lengths[0] = ccb->csio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->csio.sglist_cnt;
seg_cnt_ptr = &ccb->csio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_ATA_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* We only support a single virtual address for ATA I/O.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->ataio.data_ptr;
lengths[0] = ccb->ataio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_SMP_IO:
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->smpio.smp_request;
lengths[0] = ccb->smpio.smp_request_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = &ccb->smpio.smp_response;
lengths[1] = ccb->smpio.smp_response_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
maxmap = softc->maxio;
break;
case XPT_DEV_ADVINFO:
if (ccb->cdai.bufsiz == 0)
return (0);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
lengths[0] = ccb->cdai.bufsiz;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
break;
case XPT_NVME_ADMIN:
case XPT_NVME_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return (0);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->nvmeio.data_ptr;
lengths[0] = ccb->nvmeio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->nvmeio.sglist_cnt;
seg_cnt_ptr = &ccb->nvmeio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
default:
return(EINVAL);
break; /* NOTREACHED */
}
io_req->num_bufs = numbufs;
/*
* If there is a maximum, check to make sure that the user's
* request fits within the limit. In general, we should only have
* a maximum length for requests that go to hardware. Otherwise it
* is whatever we're able to malloc.
*/
for (i = 0; i < numbufs; i++) {
io_req->user_bufs[i] = *data_ptrs[i];
io_req->dirs[i] = dirs[i];
io_req->lengths[i] = lengths[i];
if (maxmap == 0)
continue;
if (lengths[i] <= maxmap)
continue;
xpt_print(periph->path, "%s: data length %u > max allowed %u "
"bytes\n", __func__, lengths[i], maxmap);
error = EINVAL;
goto bailout;
}
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/* Map or copy the buffer into kernel address space */
for (i = 0; i < numbufs; i++) {
uint8_t *tmp_buf;
/*
* If for some reason no length is specified, we
* don't need to allocate anything.
*/
if (io_req->lengths[i] == 0)
continue;
tmp_buf = malloc(lengths[i], M_SCSIPASS,
M_WAITOK | M_ZERO);
io_req->kern_bufs[i] = tmp_buf;
*data_ptrs[i] = tmp_buf;
#if 0
xpt_print(periph->path, "%s: malloced %p len %u, user "
"buffer %p, operation: %s\n", __func__,
tmp_buf, lengths[i], io_req->user_bufs[i],
(dirs[i] == CAM_DIR_IN) ? "read" : "write");
#endif
/*
* We only need to copy in if the user is writing.
*/
if (dirs[i] != CAM_DIR_OUT)
continue;
error = copyin(io_req->user_bufs[i],
io_req->kern_bufs[i], lengths[i]);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user "
"buffer from %p to %p failed with "
"error %d\n", __func__,
io_req->user_bufs[i],
io_req->kern_bufs[i], error);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Pass down the pointer as-is */
break;
case CAM_DATA_SG: {
size_t sg_length, size_to_go, alloc_size;
uint32_t num_segs_needed;
/*
* Copy the user S/G list in, and then copy in the
* individual segments.
*/
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
xpt_print(periph->path, "%s: cannot currently handle "
"more than one S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG flag set, "
"but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We allocate buffers in io_zone_size increments for an
- * S/G list. This will generally be MAXPHYS.
+ * S/G list. This will generally be maxphys.
*/
if (lengths[0] <= softc->io_zone_size)
num_segs_needed = 1;
else {
num_segs_needed = lengths[0] / softc->io_zone_size;
if ((lengths[0] % softc->io_zone_size) != 0)
num_segs_needed++;
}
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = num_segs_needed;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
/*
* If the preallocated array is large enough to hold the user's
* S/G list, use it; otherwise allocate one big enough for the
* number of user segments.
*/
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
if (num_segs_needed > PASS_MAX_SEGS) {
io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_KERN_SEG_MALLOC;
} else {
io_req->kern_segptr = io_req->kern_segs;
}
/*
* Allocate the kernel S/G list.
*/
for (size_to_go = lengths[0], i = 0;
size_to_go > 0 && i < num_segs_needed;
i++, size_to_go -= alloc_size) {
uint8_t *kern_ptr;
alloc_size = min(size_to_go, softc->io_zone_size);
kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK);
io_req->kern_segptr[i].ds_addr =
(bus_addr_t)(uintptr_t)kern_ptr;
io_req->kern_segptr[i].ds_len = alloc_size;
}
if (size_to_go > 0) {
printf("%s: size_to_go = %zu, software error!\n",
__func__, size_to_go);
error = EINVAL;
goto bailout;
}
*data_ptrs[0] = (uint8_t *)io_req->kern_segptr;
*seg_cnt_ptr = io_req->num_kern_segs;
/*
* We only need to copy data here if the user is writing.
*/
if (dirs[0] == CAM_DIR_OUT)
error = passcopysglist(periph, io_req, dirs[0]);
break;
}
case CAM_DATA_SG_PADDR: {
size_t sg_length;
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
printf("%s: cannot currently handle more than one "
"S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag "
"set, but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = io_req->num_user_segs;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
io_req->kern_segptr = io_req->user_segptr;
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
break;
}
default:
case CAM_DATA_BIO:
/*
* A user shouldn't be attaching a bio to the CCB. It
* isn't a user-accessible structure.
*/
error = EINVAL;
break;
}
bailout:
if (error != 0)
passiocleanup(softc, io_req);
return (error);
}
static int
passmemdone(struct cam_periph *periph, struct pass_io_req *io_req)
{
struct pass_softc *softc;
int error;
int i;
error = 0;
softc = (struct pass_softc *)periph->softc;
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/*
* Copy back to the user buffer if this was a read.
*/
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->dirs[i] != CAM_DIR_IN)
continue;
error = copyout(io_req->kern_bufs[i],
io_req->user_bufs[i], io_req->lengths[i]);
if (error != 0) {
xpt_print(periph->path, "Unable to copy %u "
"bytes from %p to user address %p\n",
io_req->lengths[i],
io_req->kern_bufs[i],
io_req->user_bufs[i]);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Do nothing. The pointer is a physical address already */
break;
case CAM_DATA_SG:
/*
* Copy back to the user buffer if this was a read.
* Restore the user's S/G list buffer pointer.
*/
if (io_req->dirs[0] == CAM_DIR_IN)
error = passcopysglist(periph, io_req, io_req->dirs[0]);
break;
case CAM_DATA_SG_PADDR:
/*
* Restore the user's S/G list buffer pointer. No need to
* copy.
*/
break;
default:
case CAM_DATA_BIO:
error = EINVAL;
break;
}
bailout:
/*
* Reset the user's pointers to their original values and free
* allocated memory.
*/
passiocleanup(softc, io_req);
return (error);
}
static int
passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
int error;
if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) {
error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl);
}
return (error);
}
static int
passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
uint32_t priority;
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
error = 0;
switch (cmd) {
case CAMIOCOMMAND:
{
union ccb *inccb;
union ccb *ccb;
int ccb_malloced;
inccb = (union ccb *)addr;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (inccb->ccb_h.func_code == XPT_SCSI_IO)
inccb->csio.bio = NULL;
#endif
if (inccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
break;
}
/*
* Some CCB types, like scan bus and scan lun can only go
* through the transport layer device.
*/
if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
inccb->ccb_h.func_code);
error = ENODEV;
break;
}
/* Compatibility for RL/priority-unaware code. */
priority = inccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Non-immediate CCBs need a CCB from the per-device pool
* of CCBs, which is scheduled by the transport layer.
* Immediate CCBs and user-supplied CCBs should just be
* malloced.
*/
if ((inccb->ccb_h.func_code & XPT_FC_QUEUED)
&& ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) {
ccb = cam_periph_getccb(periph, priority);
ccb_malloced = 0;
} else {
ccb = xpt_alloc_ccb_nowait();
if (ccb != NULL)
xpt_setup_ccb(&ccb->ccb_h, periph->path,
priority);
ccb_malloced = 1;
}
if (ccb == NULL) {
xpt_print(periph->path, "unable to allocate CCB\n");
error = ENOMEM;
break;
}
error = passsendccb(periph, ccb, inccb);
if (ccb_malloced)
xpt_free_ccb(ccb);
else
xpt_release_ccb(ccb);
break;
}
case CAMIOQUEUE:
{
struct pass_io_req *io_req;
union ccb **user_ccb, *ccb;
xpt_opcode fc;
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
error = ENOTTY;
goto bailout;
}
#endif
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = passcreatezone(periph);
if (error != 0)
goto bailout;
}
/*
* We're going to do a blocking allocation for this I/O
* request, so we have to drop the lock.
*/
cam_periph_unlock(periph);
io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO);
ccb = &io_req->ccb;
user_ccb = (union ccb **)addr;
/*
* Unlike the CAMIOCOMMAND ioctl above, we only have a
* pointer to the user's CCB, so we have to copy the whole
* thing in to a buffer we have allocated (above) instead
* of allowing the ioctl code to malloc a buffer and copy
* it in.
*
* This is an advantage for this asynchronous interface,
* since we don't want the memory to get freed while the
* CCB is outstanding.
*/
#if 0
xpt_print(periph->path, "Copying user CCB %p to "
"kernel address %p\n", *user_ccb, ccb);
#endif
error = copyin(*user_ccb, ccb, sizeof(*ccb));
if (error != 0) {
xpt_print(periph->path, "Copy of user CCB %p to "
"kernel address %p failed with error %d\n",
*user_ccb, ccb, error);
goto camioqueue_error;
}
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.bio = NULL;
#endif
if (ccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
goto camioqueue_error;
}
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
if (ccb->csio.cdb_len > IOCDBLEN) {
error = EINVAL;
goto camioqueue_error;
}
error = copyin(ccb->csio.cdb_io.cdb_ptr,
ccb->csio.cdb_io.cdb_bytes, ccb->csio.cdb_len);
if (error != 0)
goto camioqueue_error;
ccb->ccb_h.flags &= ~CAM_CDB_POINTER;
}
/*
* Some CCB types, like scan bus and scan lun can only go
* through the transport layer device.
*/
if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
ccb->ccb_h.func_code);
error = ENODEV;
goto camioqueue_error;
}
/*
* Save the user's CCB pointer as well as his linked list
* pointers and peripheral private area so that we can
* restore these later.
*/
io_req->user_ccb_ptr = *user_ccb;
io_req->user_periph_links = ccb->ccb_h.periph_links;
io_req->user_periph_priv = ccb->ccb_h.periph_priv;
/*
* Now that we've saved the user's values, we can set our
* own peripheral private entry.
*/
ccb->ccb_h.ccb_ioreq = io_req;
/* Compatibility for RL/priority-unaware code. */
priority = ccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Setup fields in the CCB like the path and the priority.
* The path in particular cannot be done in userland, since
* it is a pointer to a kernel data structure.
*/
xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority,
ccb->ccb_h.flags);
/*
* Setup our done routine. There is no way for the user to
* have a valid pointer here.
*/
ccb->ccb_h.cbfcnp = passdone;
fc = ccb->ccb_h.func_code;
/*
* If this function code has memory that can be mapped in
* or out, we need to call passmemsetup().
*/
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO)
|| (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH)
|| (fc == XPT_DEV_ADVINFO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
error = passmemsetup(periph, io_req);
if (error != 0)
goto camioqueue_error;
} else
io_req->mapinfo.num_bufs_used = 0;
cam_periph_lock(periph);
/*
* Everything goes on the incoming queue initially.
*/
TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links);
/*
* If the CCB is queued, and is not a user CCB, then
* we need to allocate a slot for it. Call xpt_schedule()
* so that our start routine will get called when a CCB is
* available.
*/
if ((fc & XPT_FC_QUEUED)
&& ((fc & XPT_FC_USER_CCB) == 0)) {
xpt_schedule(periph, priority);
break;
}
/*
* At this point, the CCB in question is either an
* immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB
* and therefore should be malloced, not allocated via a slot.
* Remove the CCB from the incoming queue and add it to the
* active queue.
*/
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
xpt_action(ccb);
/*
* If this is not a queued CCB (i.e. it is an immediate CCB),
* then it is already done. We need to put it on the done
* queue for the user to fetch.
*/
if ((fc & XPT_FC_QUEUED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
}
break;
camioqueue_error:
uma_zfree(softc->pass_zone, io_req);
cam_periph_lock(periph);
break;
}
case CAMIOGET:
{
union ccb **user_ccb;
struct pass_io_req *io_req;
int old_error;
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
error = ENOTTY;
goto bailout;
}
#endif
user_ccb = (union ccb **)addr;
old_error = 0;
io_req = TAILQ_FIRST(&softc->done_queue);
if (io_req == NULL) {
error = ENOENT;
break;
}
/*
* Remove the I/O from the done queue.
*/
TAILQ_REMOVE(&softc->done_queue, io_req, links);
/*
* We have to drop the lock during the copyout because the
* copyout can result in VM faults that require sleeping.
*/
cam_periph_unlock(periph);
/*
* Do any needed copies (e.g. for reads) and revert the
* pointers in the CCB back to the user's pointers.
*/
error = passmemdone(periph, io_req);
old_error = error;
io_req->ccb.ccb_h.periph_links = io_req->user_periph_links;
io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv;
#if 0
xpt_print(periph->path, "Copying to user CCB %p from "
"kernel address %p\n", *user_ccb, &io_req->ccb);
#endif
error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb));
if (error != 0) {
xpt_print(periph->path, "Copy to user CCB %p from "
"kernel address %p failed with error %d\n",
*user_ccb, &io_req->ccb, error);
}
/*
* Prefer the first error we got back, and make sure we
* don't overwrite bad status with good.
*/
if (old_error != 0)
error = old_error;
cam_periph_lock(periph);
/*
* At this point, if there was an error, we could potentially
* re-queue the I/O and try again. But why? The error
* would almost certainly happen again. We might as well
* not leak memory.
*/
uma_zfree(softc->pass_zone, io_req);
break;
}
default:
error = cam_periph_ioctl(periph, cmd, addr, passerror);
break;
}
bailout:
cam_periph_unlock(periph);
return(error);
}
static int
passpoll(struct cdev *dev, int poll_events, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int revents;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
revents = poll_events & (POLLOUT | POLLWRNORM);
if ((poll_events & (POLLIN | POLLRDNORM)) != 0) {
cam_periph_lock(periph);
if (!TAILQ_EMPTY(&softc->done_queue)) {
revents |= poll_events & (POLLIN | POLLRDNORM);
}
cam_periph_unlock(periph);
if (revents == 0)
selrecord(td, &softc->read_select);
}
return (revents);
}
static int
passkqfilter(struct cdev *dev, struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
kn->kn_hook = (caddr_t)periph;
kn->kn_fop = &passread_filtops;
knlist_add(&softc->read_select.si_note, kn, 0);
return (0);
}
static void
passreadfiltdetach(struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
knlist_remove(&softc->read_select.si_note, kn, 0);
}
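/*
 * The pass(4) device is considered readable for kevent(2) purposes once
 * at least one completed I/O is waiting on the done queue.
 */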
static int
passreadfilt(struct knote *kn, long hint)
{
struct cam_periph *periph;
struct pass_softc *softc;
int retval;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
if (TAILQ_EMPTY(&softc->done_queue))
retval = 0;
else
retval = 1;
return (retval);
}
/*
* Generally, "ccb" should be the CCB supplied by the kernel. "inccb"
* should be the CCB that is copied in from the user.
*/
static int
passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb)
{
struct pass_softc *softc;
struct cam_periph_map_info mapinfo;
uint8_t *cmd;
xpt_opcode fc;
int error;
softc = (struct pass_softc *)periph->softc;
/*
* There are some fields in the CCB header that need to be
* preserved, the rest we get from the user.
*/
xpt_merge_ccb(ccb, inccb);
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
cmd = __builtin_alloca(ccb->csio.cdb_len);
error = copyin(ccb->csio.cdb_io.cdb_ptr, cmd, ccb->csio.cdb_len);
if (error)
return (error);
ccb->csio.cdb_io.cdb_ptr = cmd;
}
/*
* Let cam_periph_mapmem do a sanity check on the data pointer format.
* Even if no data transfer is needed, it's a cheap check and it
* simplifies the code.
*/
fc = ccb->ccb_h.func_code;
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO)
|| (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_MMC_IO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
bzero(&mapinfo, sizeof(mapinfo));
/*
* cam_periph_mapmem calls into proc and vm functions that can
* sleep as well as trigger I/O, so we can't hold the lock.
* Dropping it here is reasonably safe.
*/
cam_periph_unlock(periph);
error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio);
cam_periph_lock(periph);
/*
* cam_periph_mapmem returned an error, we can't continue.
* Return the error to the user.
*/
if (error)
return(error);
} else
/* Ensure that the unmap call later on is a no-op. */
mapinfo.num_bufs_used = 0;
/*
* If the user wants us to perform any error recovery, then honor
* that request. Otherwise, it's up to the user to perform any
* error recovery.
*/
cam_periph_runccb(ccb, (ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) ?
passerror : NULL, /* cam_flags */ CAM_RETRY_SELTO,
/* sense_flags */ SF_RETRY_UA | SF_NO_PRINT,
softc->device_stats);
cam_periph_unlock(periph);
cam_periph_unmapmem(ccb, &mapinfo);
cam_periph_lock(periph);
ccb->ccb_h.cbfcnp = NULL;
ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv;
bcopy(ccb, inccb, sizeof(union ccb));
return(0);
}
static int
passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct pass_softc *)periph->softc;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
diff --git a/sys/cam/scsi/scsi_sa.c b/sys/cam/scsi/scsi_sa.c
index fe2912c8b52c..9441e0d4673b 100644
--- a/sys/cam/scsi/scsi_sa.c
+++ b/sys/cam/scsi/scsi_sa.c
@@ -1,5904 +1,5904 @@
/*-
* Implementation of SCSI Sequential Access Peripheral driver for CAM.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1999, 2000 Matthew Jacob
* Copyright (c) 2013, 2014, 2015 Spectra Logic Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#endif
#include <sys/types.h>
#include <sys/time.h>
#include <sys/bio.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mtio.h>
#ifdef _KERNEL
#include <sys/conf.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#endif
#include <sys/fcntl.h>
#include <sys/devicestat.h>
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_debug.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_sa.h>
#ifdef _KERNEL
#include "opt_sa.h"
#ifndef SA_IO_TIMEOUT
#define SA_IO_TIMEOUT 32
#endif
#ifndef SA_SPACE_TIMEOUT
#define SA_SPACE_TIMEOUT 1 * 60
#endif
#ifndef SA_REWIND_TIMEOUT
#define SA_REWIND_TIMEOUT 2 * 60
#endif
#ifndef SA_ERASE_TIMEOUT
#define SA_ERASE_TIMEOUT 4 * 60
#endif
#ifndef SA_REP_DENSITY_TIMEOUT
#define SA_REP_DENSITY_TIMEOUT 90
#endif
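/*
* The SA_*_TIMEOUT values above are specified in minutes and converted to
* milliseconds below; SCSIOP_TIMEOUT is already in milliseconds.
*/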
#define SCSIOP_TIMEOUT (60 * 1000) /* not an option */
#define IO_TIMEOUT (SA_IO_TIMEOUT * 60 * 1000)
#define REWIND_TIMEOUT (SA_REWIND_TIMEOUT * 60 * 1000)
#define ERASE_TIMEOUT (SA_ERASE_TIMEOUT * 60 * 1000)
#define SPACE_TIMEOUT (SA_SPACE_TIMEOUT * 60 * 1000)
#define REP_DENSITY_TIMEOUT (SA_REP_DENSITY_TIMEOUT * 60 * 1000)
/*
* Additional options that can be set for config: SA_1FM_AT_EOT
*/
#ifndef UNUSED_PARAMETER
#define UNUSED_PARAMETER(x) x = x
#endif
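/*
* Release the device queue if this CCB left it frozen.
*/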
#define QFRLS(ccb) \
if (((ccb)->ccb_h.status & CAM_DEV_QFRZN) != 0) \
cam_release_devq((ccb)->ccb_h.path, 0, 0, 0, FALSE)
/*
* Driver states
*/
static MALLOC_DEFINE(M_SCSISA, "SCSI sa", "SCSI sequential access buffers");
typedef enum {
SA_STATE_NORMAL, SA_STATE_ABNORMAL
} sa_state;
#define ccb_pflags ppriv_field0
#define ccb_bp ppriv_ptr1
/* bits in ccb_pflags */
#define SA_POSITION_UPDATED 0x1
typedef enum {
SA_FLAG_OPEN = 0x0001,
SA_FLAG_FIXED = 0x0002,
SA_FLAG_TAPE_LOCKED = 0x0004,
SA_FLAG_TAPE_MOUNTED = 0x0008,
SA_FLAG_TAPE_WP = 0x0010,
SA_FLAG_TAPE_WRITTEN = 0x0020,
SA_FLAG_EOM_PENDING = 0x0040,
SA_FLAG_EIO_PENDING = 0x0080,
SA_FLAG_EOF_PENDING = 0x0100,
SA_FLAG_ERR_PENDING = (SA_FLAG_EOM_PENDING|SA_FLAG_EIO_PENDING|
SA_FLAG_EOF_PENDING),
SA_FLAG_INVALID = 0x0200,
SA_FLAG_COMP_ENABLED = 0x0400,
SA_FLAG_COMP_SUPP = 0x0800,
SA_FLAG_COMP_UNSUPP = 0x1000,
SA_FLAG_TAPE_FROZEN = 0x2000,
SA_FLAG_PROTECT_SUPP = 0x4000,
SA_FLAG_COMPRESSION = (SA_FLAG_COMP_SUPP|SA_FLAG_COMP_ENABLED|
SA_FLAG_COMP_UNSUPP),
SA_FLAG_SCTX_INIT = 0x8000
} sa_flags;
typedef enum {
SA_MODE_REWIND = 0x00,
SA_MODE_NOREWIND = 0x01,
SA_MODE_OFFLINE = 0x02
} sa_mode;
typedef enum {
SA_PARAM_NONE = 0x000,
SA_PARAM_BLOCKSIZE = 0x001,
SA_PARAM_DENSITY = 0x002,
SA_PARAM_COMPRESSION = 0x004,
SA_PARAM_BUFF_MODE = 0x008,
SA_PARAM_NUMBLOCKS = 0x010,
SA_PARAM_WP = 0x020,
SA_PARAM_SPEED = 0x040,
SA_PARAM_DENSITY_EXT = 0x080,
SA_PARAM_LBP = 0x100,
SA_PARAM_ALL = 0x1ff
} sa_params;
typedef enum {
SA_QUIRK_NONE = 0x000,
SA_QUIRK_NOCOMP = 0x001, /* Can't deal with compression at all*/
SA_QUIRK_FIXED = 0x002, /* Force fixed mode */
SA_QUIRK_VARIABLE = 0x004, /* Force variable mode */
SA_QUIRK_2FM = 0x008, /* Needs Two File Marks at EOD */
SA_QUIRK_1FM = 0x010, /* No more than 1 File Mark at EOD */
SA_QUIRK_NODREAD = 0x020, /* Don't try and dummy read density */
SA_QUIRK_NO_MODESEL = 0x040, /* Don't do mode select at all */
SA_QUIRK_NO_CPAGE = 0x080, /* Don't use DEVICE COMPRESSION page */
SA_QUIRK_NO_LONG_POS = 0x100 /* No long position information */
} sa_quirks;
#define SA_QUIRK_BIT_STRING \
"\020" \
"\001NOCOMP" \
"\002FIXED" \
"\003VARIABLE" \
"\0042FM" \
"\0051FM" \
"\006NODREAD" \
"\007NO_MODESEL" \
"\010NO_CPAGE" \
"\011NO_LONG_POS"
#define SAMODE(z) (dev2unit(z) & 0x3)
#define SA_IS_CTRL(z) (dev2unit(z) & (1 << 4))
#define SA_NOT_CTLDEV 0
#define SA_CTLDEV 1
#define SA_ATYPE_R 0
#define SA_ATYPE_NR 1
#define SA_ATYPE_ER 2
#define SA_NUM_ATYPES 3
#define SAMINOR(ctl, access) \
((ctl << 4) | (access & 0x3))
struct sa_devs {
struct cdev *ctl_dev;
struct cdev *r_dev;
struct cdev *nr_dev;
struct cdev *er_dev;
};
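/*
* Helper macros that emit XML-style elements into an sbuf; these are used
* to report driver and drive state for the MTIOCEXTGET and MTIOCPARAMGET
* ioctls.
*/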
#define SASBADDBASE(sb, indent, data, xfmt, name, type, xsize, desc) \
sbuf_printf(sb, "%*s<%s type=\"%s\" size=\"%zd\" " \
"fmt=\"%s\" desc=\"%s\">" #xfmt "</%s>\n", indent, "", \
#name, #type, xsize, #xfmt, desc ? desc : "", data, #name);
#define SASBADDINT(sb, indent, data, fmt, name) \
SASBADDBASE(sb, indent, data, fmt, name, int, sizeof(data), \
NULL)
#define SASBADDINTDESC(sb, indent, data, fmt, name, desc) \
SASBADDBASE(sb, indent, data, fmt, name, int, sizeof(data), \
desc)
#define SASBADDUINT(sb, indent, data, fmt, name) \
SASBADDBASE(sb, indent, data, fmt, name, uint, sizeof(data), \
NULL)
#define SASBADDUINTDESC(sb, indent, data, fmt, name, desc) \
SASBADDBASE(sb, indent, data, fmt, name, uint, sizeof(data), \
desc)
#define SASBADDFIXEDSTR(sb, indent, data, fmt, name) \
SASBADDBASE(sb, indent, data, fmt, name, str, sizeof(data), \
NULL)
#define SASBADDFIXEDSTRDESC(sb, indent, data, fmt, name, desc) \
SASBADDBASE(sb, indent, data, fmt, name, str, sizeof(data), \
desc)
#define SASBADDVARSTR(sb, indent, data, fmt, name, maxlen) \
SASBADDBASE(sb, indent, data, fmt, name, str, maxlen, NULL)
#define SASBADDVARSTRDESC(sb, indent, data, fmt, name, maxlen, desc) \
SASBADDBASE(sb, indent, data, fmt, name, str, maxlen, desc)
#define SASBADDNODE(sb, indent, name) { \
sbuf_printf(sb, "%*s<%s type=\"%s\">\n", indent, "", #name, \
"node"); \
indent += 2; \
}
#define SASBADDNODENUM(sb, indent, name, num) { \
sbuf_printf(sb, "%*s<%s type=\"%s\" num=\"%d\">\n", indent, "", \
#name, "node", num); \
indent += 2; \
}
#define SASBENDNODE(sb, indent, name) { \
indent -= 2; \
sbuf_printf(sb, "%*s</%s>\n", indent, "", #name); \
}
#define SA_DENSITY_TYPES 4
struct sa_prot_state {
int initialized;
uint32_t prot_method;
uint32_t pi_length;
uint32_t lbp_w;
uint32_t lbp_r;
uint32_t rbdp;
};
struct sa_prot_info {
struct sa_prot_state cur_prot_state;
struct sa_prot_state pending_prot_state;
};
/*
* A table mapping protection parameters to their types and values.
*/
struct sa_prot_map {
char *name;
mt_param_set_type param_type;
off_t offset;
uint32_t min_val;
uint32_t max_val;
uint32_t *value;
} sa_prot_table[] = {
{ "prot_method", MT_PARAM_SET_UNSIGNED,
__offsetof(struct sa_prot_state, prot_method),
/*min_val*/ 0, /*max_val*/ 255, NULL },
{ "pi_length", MT_PARAM_SET_UNSIGNED,
__offsetof(struct sa_prot_state, pi_length),
/*min_val*/ 0, /*max_val*/ SA_CTRL_DP_PI_LENGTH_MASK, NULL },
{ "lbp_w", MT_PARAM_SET_UNSIGNED,
__offsetof(struct sa_prot_state, lbp_w),
/*min_val*/ 0, /*max_val*/ 1, NULL },
{ "lbp_r", MT_PARAM_SET_UNSIGNED,
__offsetof(struct sa_prot_state, lbp_r),
/*min_val*/ 0, /*max_val*/ 1, NULL },
{ "rbdp", MT_PARAM_SET_UNSIGNED,
__offsetof(struct sa_prot_state, rbdp),
/*min_val*/ 0, /*max_val*/ 1, NULL }
};
#define SA_NUM_PROT_ENTS nitems(sa_prot_table)
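/*
* Protection information is considered active only when the drive supports
* it, the cached protection state has been initialized, and a non-zero
* protection method is set.
*/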
#define SA_PROT_ENABLED(softc) ((softc->flags & SA_FLAG_PROTECT_SUPP) \
&& (softc->prot_info.cur_prot_state.initialized != 0) \
&& (softc->prot_info.cur_prot_state.prot_method != 0))
#define SA_PROT_LEN(softc) softc->prot_info.cur_prot_state.pi_length
struct sa_softc {
sa_state state;
sa_flags flags;
sa_quirks quirks;
u_int si_flags;
struct cam_periph *periph;
struct bio_queue_head bio_queue;
int queue_count;
struct devstat *device_stats;
struct sa_devs devs;
int open_count;
int num_devs_to_destroy;
int blk_gran;
int blk_mask;
int blk_shift;
u_int32_t max_blk;
u_int32_t min_blk;
u_int32_t maxio;
u_int32_t cpi_maxio;
int allow_io_split;
int inject_eom;
int set_pews_status;
u_int32_t comp_algorithm;
u_int32_t saved_comp_algorithm;
u_int32_t media_blksize;
u_int32_t last_media_blksize;
u_int32_t media_numblks;
u_int8_t media_density;
u_int8_t speed;
u_int8_t scsi_rev;
u_int8_t dsreg; /* mtio mt_dsreg, redux */
int buffer_mode;
int filemarks;
union ccb saved_ccb;
int last_resid_was_io;
uint8_t density_type_bits[SA_DENSITY_TYPES];
int density_info_valid[SA_DENSITY_TYPES];
uint8_t density_info[SA_DENSITY_TYPES][SRDS_MAX_LENGTH];
struct sa_prot_info prot_info;
int sili;
int eot_warn;
/*
* Current position information. -1 means that the given value is
* unknown. fileno and blkno are always calculated. blkno is
* relative to the previous file mark. rep_fileno and rep_blkno
* are as reported by the drive, if it supports the long form
* report for the READ POSITION command. rep_blkno is relative to
* the beginning of the partition.
*
* bop means that the drive is at the beginning of the partition.
* eop means that the drive is between early warning and end of
* partition, inside the current partition.
* bpew means that the position is in a PEWZ (Programmable Early
* Warning Zone)
*/
daddr_t partition; /* Absolute from BOT */
daddr_t fileno; /* Relative to beginning of partition */
daddr_t blkno; /* Relative to last file mark */
daddr_t rep_blkno; /* Relative to beginning of partition */
daddr_t rep_fileno; /* Relative to beginning of partition */
int bop; /* Beginning of Partition */
int eop; /* End of Partition */
int bpew; /* Beyond Programmable Early Warning */
/*
* Latched Error Info
*/
struct {
struct scsi_sense_data _last_io_sense;
u_int64_t _last_io_resid;
u_int8_t _last_io_cdb[CAM_MAX_CDBLEN];
struct scsi_sense_data _last_ctl_sense;
u_int64_t _last_ctl_resid;
u_int8_t _last_ctl_cdb[CAM_MAX_CDBLEN];
#define last_io_sense errinfo._last_io_sense
#define last_io_resid errinfo._last_io_resid
#define last_io_cdb errinfo._last_io_cdb
#define last_ctl_sense errinfo._last_ctl_sense
#define last_ctl_resid errinfo._last_ctl_resid
#define last_ctl_cdb errinfo._last_ctl_cdb
} errinfo;
/*
* Misc other flags/state
*/
u_int32_t
: 29,
open_rdonly : 1, /* open read-only */
open_pending_mount : 1, /* open pending mount */
ctrl_mode : 1; /* control device open */
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
};
struct sa_quirk_entry {
struct scsi_inquiry_pattern inq_pat; /* matching pattern */
sa_quirks quirks; /* specific quirk type */
u_int32_t prefblk; /* preferred blocksize when in fixed mode */
};
static struct sa_quirk_entry sa_quirk_table[] =
{
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "OnStream",
"ADR*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_NODREAD |
SA_QUIRK_1FM|SA_QUIRK_NO_MODESEL, 32768
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"Python 06408*", "*"}, SA_QUIRK_NODREAD, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"Python 25601*", "*"}, SA_QUIRK_NOCOMP|SA_QUIRK_NODREAD, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"Python*", "*"}, SA_QUIRK_NODREAD, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"VIPER 150*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"VIPER 2525 25462", "-011"},
SA_QUIRK_NOCOMP|SA_QUIRK_1FM|SA_QUIRK_NODREAD, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "ARCHIVE",
"VIPER 2525*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 1024
},
#if 0
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "HP",
"C15*", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_NO_CPAGE, 0,
},
#endif
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "HP",
"C56*", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_2FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "HP",
"T20*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "HP",
"T4000*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "HP",
"HP-88780*", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_2FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "KENNEDY",
"*", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_2FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "M4 DATA",
"123107 SCSI*", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_2FM, 0
},
{ /* jreynold@primenet.com */
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "Seagate",
"STT8000N*", "*"}, SA_QUIRK_1FM, 0
},
{ /* mike@sentex.net */
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "Seagate",
"STT20000*", "*"}, SA_QUIRK_1FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "SEAGATE",
"DAT 06241-XXX", "*"}, SA_QUIRK_VARIABLE|SA_QUIRK_2FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "TANDBERG",
" TDC 3600", "U07:"}, SA_QUIRK_NOCOMP|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "TANDBERG",
" TDC 3800", "*"}, SA_QUIRK_NOCOMP|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "TANDBERG",
" TDC 4100", "*"}, SA_QUIRK_NOCOMP|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "TANDBERG",
" TDC 4200", "*"}, SA_QUIRK_NOCOMP|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "TANDBERG",
" SLR*", "*"}, SA_QUIRK_1FM, 0
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "WANGTEK",
"5525ES*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 512
},
{
{ T_SEQUENTIAL, SIP_MEDIA_REMOVABLE, "WANGTEK",
"51000*", "*"}, SA_QUIRK_FIXED|SA_QUIRK_1FM, 1024
}
};
static d_open_t saopen;
static d_close_t saclose;
static d_strategy_t sastrategy;
static d_ioctl_t saioctl;
static periph_init_t sainit;
static periph_ctor_t saregister;
static periph_oninv_t saoninvalidate;
static periph_dtor_t sacleanup;
static periph_start_t sastart;
static void saasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void sadone(struct cam_periph *periph,
union ccb *start_ccb);
static int saerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int samarkswanted(struct cam_periph *);
static int sacheckeod(struct cam_periph *periph);
static int sagetparams(struct cam_periph *periph,
sa_params params_to_get,
u_int32_t *blocksize, u_int8_t *density,
u_int32_t *numblocks, int *buff_mode,
u_int8_t *write_protect, u_int8_t *speed,
int *comp_supported, int *comp_enabled,
u_int32_t *comp_algorithm,
sa_comp_t *comp_page,
struct scsi_control_data_prot_subpage
*prot_page, int dp_size,
int prot_changeable);
static int sasetprot(struct cam_periph *periph,
struct sa_prot_state *new_prot);
static int sasetparams(struct cam_periph *periph,
sa_params params_to_set,
u_int32_t blocksize, u_int8_t density,
u_int32_t comp_algorithm,
u_int32_t sense_flags);
static int sasetsili(struct cam_periph *periph,
struct mtparamset *ps, int num_params);
static int saseteotwarn(struct cam_periph *periph,
struct mtparamset *ps, int num_params);
static void safillprot(struct sa_softc *softc, int *indent,
struct sbuf *sb);
static void sapopulateprots(struct sa_prot_state *cur_state,
struct sa_prot_map *new_table,
int table_ents);
static struct sa_prot_map *safindprotent(char *name, struct sa_prot_map *table,
int table_ents);
static int sasetprotents(struct cam_periph *periph,
struct mtparamset *ps, int num_params);
static struct sa_param_ent *safindparament(struct mtparamset *ps);
static int saparamsetlist(struct cam_periph *periph,
struct mtsetlist *list, int need_copy);
static int saextget(struct cdev *dev, struct cam_periph *periph,
struct sbuf *sb, struct mtextget *g);
static int saparamget(struct sa_softc *softc, struct sbuf *sb);
static void saprevent(struct cam_periph *periph, int action);
static int sarewind(struct cam_periph *periph);
static int saspace(struct cam_periph *periph, int count,
scsi_space_code code);
static void sadevgonecb(void *arg);
static void sasetupdev(struct sa_softc *softc, struct cdev *dev);
static int samount(struct cam_periph *, int, struct cdev *);
static int saretension(struct cam_periph *periph);
static int sareservereleaseunit(struct cam_periph *periph,
int reserve);
static int saloadunload(struct cam_periph *periph, int load);
static int saerase(struct cam_periph *periph, int longerase);
static int sawritefilemarks(struct cam_periph *periph,
int nmarks, int setmarks, int immed);
static int sagetpos(struct cam_periph *periph);
static int sardpos(struct cam_periph *periph, int, u_int32_t *);
static int sasetpos(struct cam_periph *periph, int,
struct mtlocate *);
static void safilldenstypesb(struct sbuf *sb, int *indent,
uint8_t *buf, int buf_len,
int is_density);
static void safilldensitysb(struct sa_softc *softc, int *indent,
struct sbuf *sb);
#ifndef SA_DEFAULT_IO_SPLIT
#define SA_DEFAULT_IO_SPLIT 0
#endif
static int sa_allow_io_split = SA_DEFAULT_IO_SPLIT;
/*
* Tunable to allow the user to set a global allow_io_split value. Note
* that this WILL GO AWAY in FreeBSD 11.0. Silently splitting the I/O up
* is bad behavior, because it hides the true tape block size from the
* application.
*/
static SYSCTL_NODE(_kern_cam, OID_AUTO, sa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"CAM Sequential Access Tape Driver");
SYSCTL_INT(_kern_cam_sa, OID_AUTO, allow_io_split, CTLFLAG_RDTUN,
&sa_allow_io_split, 0, "Default I/O split value");
static struct periph_driver sadriver =
{
sainit, "sa",
TAILQ_HEAD_INITIALIZER(sadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(sa, sadriver);
/* For 2.2-stable support */
#ifndef D_TAPE
#define D_TAPE 0
#endif
static struct cdevsw sa_cdevsw = {
.d_version = D_VERSION,
.d_open = saopen,
.d_close = saclose,
.d_read = physread,
.d_write = physwrite,
.d_ioctl = saioctl,
.d_strategy = sastrategy,
.d_name = "sa",
.d_flags = D_TAPE | D_TRACKCLOSE,
};
static int
saopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct sa_softc *softc;
int error;
periph = (struct cam_periph *)dev->si_drv1;
if (cam_periph_acquire(periph) != 0) {
return (ENXIO);
}
cam_periph_lock(periph);
softc = (struct sa_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE|CAM_DEBUG_INFO,
("saopen(%s): softc=0x%x\n", devtoname(dev), softc->flags));
if (SA_IS_CTRL(dev)) {
softc->ctrl_mode = 1;
softc->open_count++;
cam_periph_unlock(periph);
return (0);
}
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
if (softc->flags & SA_FLAG_OPEN) {
error = EBUSY;
} else if (softc->flags & SA_FLAG_INVALID) {
error = ENXIO;
} else {
/*
* Preserve whether this is a read_only open.
*/
softc->open_rdonly = (flags & O_RDWR) == O_RDONLY;
/*
* The function samount ensures media is loaded and ready.
* It also does a device RESERVE if the tape isn't yet mounted.
*
* If the mount fails and this was a non-blocking open,
* make this an 'open_pending_mount' action.
*/
error = samount(periph, flags, dev);
if (error && (flags & O_NONBLOCK)) {
softc->flags |= SA_FLAG_OPEN;
softc->open_pending_mount = 1;
softc->open_count++;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
}
if (error) {
cam_periph_unhold(periph);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
saprevent(periph, PR_PREVENT);
softc->flags |= SA_FLAG_OPEN;
softc->open_count++;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (error);
}
static int
saclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct sa_softc *softc;
int mode, error, writing, tmp, i;
int closedbits = SA_FLAG_OPEN;
mode = SAMODE(dev);
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct sa_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE|CAM_DEBUG_INFO,
("saclose(%s): softc=0x%x\n", devtoname(dev), softc->flags));
softc->open_rdonly = 0;
if (SA_IS_CTRL(dev)) {
softc->ctrl_mode = 0;
softc->open_count--;
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
if (softc->open_pending_mount) {
softc->flags &= ~SA_FLAG_OPEN;
softc->open_pending_mount = 0;
softc->open_count--;
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
if ((error = cam_periph_hold(periph, PRIBIO)) != 0) {
cam_periph_unlock(periph);
return (error);
}
/*
* Were we writing the tape?
*/
writing = (softc->flags & SA_FLAG_TAPE_WRITTEN) != 0;
/*
* See whether or not we need to write filemarks. If this
* fails, we probably have to assume we've lost tape
* position.
*/
error = sacheckeod(periph);
if (error) {
xpt_print(periph->path,
"failed to write terminating filemark(s)\n");
softc->flags |= SA_FLAG_TAPE_FROZEN;
}
/*
* Whatever we end up doing, allow users to eject tapes from here on.
*/
saprevent(periph, PR_ALLOW);
/*
* Decide how to end...
*/
if ((softc->flags & SA_FLAG_TAPE_MOUNTED) == 0) {
closedbits |= SA_FLAG_TAPE_FROZEN;
} else switch (mode) {
case SA_MODE_OFFLINE:
/*
* An 'offline' close is an unconditional release of
* frozen && mount conditions, irrespective of whether
* these operations succeeded. The reason for this is
* to allow at least some kind of programmatic way
* around our state getting all fouled up. If somebody
* issues an 'offline' command, that will be allowed
* to clear state.
*/
(void) sarewind(periph);
(void) saloadunload(periph, FALSE);
closedbits |= SA_FLAG_TAPE_MOUNTED|SA_FLAG_TAPE_FROZEN;
break;
case SA_MODE_REWIND:
/*
* If the rewind fails, return an error (if anyone cares), but
* do not overwrite any previous error.
*
* We don't clear the notion of mounted here, but we do
* clear the notion of frozen if we successfully rewound.
*/
tmp = sarewind(periph);
if (tmp) {
if (error != 0)
error = tmp;
} else {
closedbits |= SA_FLAG_TAPE_FROZEN;
}
break;
case SA_MODE_NOREWIND:
/*
* If we're not rewinding/unloading the tape, find out
* whether we need to back up over one of two filemarks
* we wrote (if we wrote two filemarks) so that appends
* from this point on will be sane.
*/
if (error == 0 && writing && (softc->quirks & SA_QUIRK_2FM)) {
tmp = saspace(periph, -1, SS_FILEMARKS);
if (tmp) {
xpt_print(periph->path, "unable to backspace "
"over one of double filemarks at end of "
"tape\n");
xpt_print(periph->path, "it is possible that "
"this device needs a SA_QUIRK_1FM quirk set"
"for it\n");
softc->flags |= SA_FLAG_TAPE_FROZEN;
}
}
break;
default:
xpt_print(periph->path, "unknown mode 0x%x in saclose\n", mode);
/* NOTREACHED */
break;
}
/*
* We wish to note here that there are no more filemarks to be written.
*/
softc->filemarks = 0;
softc->flags &= ~SA_FLAG_TAPE_WRITTEN;
/*
* And we are no longer open for business.
*/
softc->flags &= ~closedbits;
softc->open_count--;
/*
* Invalidate any density information that depends on having tape
* media in the drive.
*/
for (i = 0; i < SA_DENSITY_TYPES; i++) {
if (softc->density_type_bits[i] & SRDS_MEDIA)
softc->density_info_valid[i] = 0;
}
/*
* Inform users if the tape state is frozen....
*/
if (softc->flags & SA_FLAG_TAPE_FROZEN) {
xpt_print(periph->path, "tape is now frozen- use an OFFLINE, "
"REWIND or MTEOM command to clear this state.\n");
}
/* release the device if it is no longer mounted */
if ((softc->flags & SA_FLAG_TAPE_MOUNTED) == 0)
sareservereleaseunit(periph, FALSE);
cam_periph_unhold(periph);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
sastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct sa_softc *softc;
bp->bio_resid = bp->bio_bcount;
if (SA_IS_CTRL(bp->bio_dev)) {
biofinish(bp, NULL, EINVAL);
return;
}
periph = (struct cam_periph *)bp->bio_dev->si_drv1;
cam_periph_lock(periph);
softc = (struct sa_softc *)periph->softc;
if (softc->flags & SA_FLAG_INVALID) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
if (softc->flags & SA_FLAG_TAPE_FROZEN) {
cam_periph_unlock(periph);
biofinish(bp, NULL, EPERM);
return;
}
/*
* This should actually never occur as the write(2)
* system call traps attempts to write to a read-only
* file descriptor.
*/
if (bp->bio_cmd == BIO_WRITE && softc->open_rdonly) {
cam_periph_unlock(periph);
biofinish(bp, NULL, EBADF);
return;
}
if (softc->open_pending_mount) {
int error = samount(periph, 0, bp->bio_dev);
if (error) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
saprevent(periph, PR_PREVENT);
softc->open_pending_mount = 0;
}
/*
* If it's a null transfer, return immediately
*/
if (bp->bio_bcount == 0) {
cam_periph_unlock(periph);
biodone(bp);
return;
}
/* valid request? */
if (softc->flags & SA_FLAG_FIXED) {
/*
* Fixed block device. The byte count must
* be a multiple of our block size.
*/
if (((softc->blk_mask != ~0) &&
((bp->bio_bcount & softc->blk_mask) != 0)) ||
((softc->blk_mask == ~0) &&
((bp->bio_bcount % softc->min_blk) != 0))) {
xpt_print(periph->path, "Invalid request. Fixed block "
"device requests must be a multiple of %d bytes\n",
softc->min_blk);
cam_periph_unlock(periph);
biofinish(bp, NULL, EINVAL);
return;
}
} else if ((bp->bio_bcount > softc->max_blk) ||
(bp->bio_bcount < softc->min_blk) ||
(bp->bio_bcount & softc->blk_mask) != 0) {
xpt_print_path(periph->path);
printf("Invalid request. Variable block "
"device requests must be ");
if (softc->blk_mask != 0) {
printf("a multiple of %d ", (0x1 << softc->blk_gran));
}
printf("between %d and %d bytes\n", softc->min_blk,
softc->max_blk);
cam_periph_unlock(periph);
biofinish(bp, NULL, EINVAL);
return;
}
/*
* Place it at the end of the queue.
*/
bioq_insert_tail(&softc->bio_queue, bp);
softc->queue_count++;
#if 0
CAM_DEBUG(periph->path, CAM_DEBUG_INFO,
("sastrategy: queuing a %ld %s byte %s\n", bp->bio_bcount,
(softc->flags & SA_FLAG_FIXED)? "fixed" : "variable",
(bp->bio_cmd == BIO_READ)? "read" : "write"));
#endif
if (softc->queue_count > 1) {
CAM_DEBUG(periph->path, CAM_DEBUG_INFO,
("sastrategy: queue count now %d\n", softc->queue_count));
}
/*
* Schedule ourselves for performing the work.
*/
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
cam_periph_unlock(periph);
return;
}
static int
sasetsili(struct cam_periph *periph, struct mtparamset *ps, int num_params)
{
uint32_t sili_blocksize;
struct sa_softc *softc;
int error;
error = 0;
softc = (struct sa_softc *)periph->softc;
if (ps->value_type != MT_PARAM_SET_SIGNED) {
snprintf(ps->error_str, sizeof(ps->error_str),
"sili is a signed parameter");
goto bailout;
}
if ((ps->value.value_signed < 0)
|| (ps->value.value_signed > 1)) {
snprintf(ps->error_str, sizeof(ps->error_str),
"invalid sili value %jd", (intmax_t)ps->value.value_signed);
goto bailout_error;
}
/*
* We only set the SILI flag in variable block
* mode. You'll get a check condition in fixed
* block mode if things don't line up in any case.
*/
if (softc->flags & SA_FLAG_FIXED) {
snprintf(ps->error_str, sizeof(ps->error_str),
"can't set sili bit in fixed block mode");
goto bailout_error;
}
if (softc->sili == ps->value.value_signed)
goto bailout;
if (ps->value.value_signed == 1)
sili_blocksize = 4;
else
sili_blocksize = 0;
error = sasetparams(periph, SA_PARAM_BLOCKSIZE,
sili_blocksize, 0, 0, SF_QUIET_IR);
if (error != 0) {
snprintf(ps->error_str, sizeof(ps->error_str),
"sasetparams() returned error %d", error);
goto bailout_error;
}
softc->sili = ps->value.value_signed;
bailout:
ps->status = MT_PARAM_STATUS_OK;
return (error);
bailout_error:
ps->status = MT_PARAM_STATUS_ERROR;
if (error == 0)
error = EINVAL;
return (error);
}
static int
saseteotwarn(struct cam_periph *periph, struct mtparamset *ps, int num_params)
{
struct sa_softc *softc;
int error;
error = 0;
softc = (struct sa_softc *)periph->softc;
if (ps->value_type != MT_PARAM_SET_SIGNED) {
snprintf(ps->error_str, sizeof(ps->error_str),
"eot_warn is a signed parameter");
ps->status = MT_PARAM_STATUS_ERROR;
goto bailout;
}
if ((ps->value.value_signed < 0)
|| (ps->value.value_signed > 1)) {
snprintf(ps->error_str, sizeof(ps->error_str),
"invalid eot_warn value %jd\n",
(intmax_t)ps->value.value_signed);
ps->status = MT_PARAM_STATUS_ERROR;
goto bailout;
}
softc->eot_warn = ps->value.value_signed;
ps->status = MT_PARAM_STATUS_OK;
bailout:
if (ps->status != MT_PARAM_STATUS_OK)
error = EINVAL;
return (error);
}
static void
safillprot(struct sa_softc *softc, int *indent, struct sbuf *sb)
{
int tmpint;
SASBADDNODE(sb, *indent, protection);
if (softc->flags & SA_FLAG_PROTECT_SUPP)
tmpint = 1;
else
tmpint = 0;
SASBADDINTDESC(sb, *indent, tmpint, %d, protection_supported,
"Set to 1 if protection information is supported");
if ((tmpint != 0)
&& (softc->prot_info.cur_prot_state.initialized != 0)) {
struct sa_prot_state *prot;
prot = &softc->prot_info.cur_prot_state;
SASBADDUINTDESC(sb, *indent, prot->prot_method, %u,
prot_method, "Current Protection Method");
SASBADDUINTDESC(sb, *indent, prot->pi_length, %u,
pi_length, "Length of Protection Information");
SASBADDUINTDESC(sb, *indent, prot->lbp_w, %u,
lbp_w, "Check Protection on Writes");
SASBADDUINTDESC(sb, *indent, prot->lbp_r, %u,
lbp_r, "Check and Include Protection on Reads");
SASBADDUINTDESC(sb, *indent, prot->rbdp, %u,
rbdp, "Transfer Protection Information for RECOVER "
"BUFFERED DATA command");
}
SASBENDNODE(sb, *indent, protection);
}
static void
sapopulateprots(struct sa_prot_state *cur_state, struct sa_prot_map *new_table,
int table_ents)
{
int i;
bcopy(sa_prot_table, new_table, min(table_ents * sizeof(*new_table),
sizeof(sa_prot_table)));
table_ents = min(table_ents, SA_NUM_PROT_ENTS);
for (i = 0; i < table_ents; i++)
new_table[i].value = (uint32_t *)((uint8_t *)cur_state +
new_table[i].offset);
return;
}
static struct sa_prot_map *
safindprotent(char *name, struct sa_prot_map *table, int table_ents)
{
char *prot_name = "protection.";
int i, prot_len;
prot_len = strlen(prot_name);
/*
* This shouldn't happen, but we check just in case.
*/
if (strncmp(name, prot_name, prot_len) != 0)
goto bailout;
for (i = 0; i < table_ents; i++) {
if (strcmp(&name[prot_len], table[i].name) != 0)
continue;
return (&table[i]);
}
bailout:
return (NULL);
}
static int
sasetprotents(struct cam_periph *periph, struct mtparamset *ps, int num_params)
{
struct sa_softc *softc;
struct sa_prot_map prot_ents[SA_NUM_PROT_ENTS];
struct sa_prot_state new_state;
int error;
int i;
softc = (struct sa_softc *)periph->softc;
error = 0;
/*
* Make sure that this tape drive supports protection information.
* Otherwise we can't set anything.
*/
if ((softc->flags & SA_FLAG_PROTECT_SUPP) == 0) {
snprintf(ps[0].error_str, sizeof(ps[0].error_str),
"Protection information is not supported for this device");
ps[0].status = MT_PARAM_STATUS_ERROR;
goto bailout;
}
/*
* We can't operate with physio(9) splitting enabled, because there
* is no way to ensure (especially in variable block mode) that
* what the user writes (with a checksum block at the end) will
* make it into the sa(4) driver intact.
*/
if ((softc->si_flags & SI_NOSPLIT) == 0) {
snprintf(ps[0].error_str, sizeof(ps[0].error_str),
"Protection information cannot be enabled with I/O "
"splitting");
ps[0].status = MT_PARAM_STATUS_ERROR;
goto bailout;
}
/*
* Take the current cached protection state and use that as the
* basis for our new entries.
*/
bcopy(&softc->prot_info.cur_prot_state, &new_state, sizeof(new_state));
/*
* Populate the table mapping property names to pointers into the
* state structure.
*/
sapopulateprots(&new_state, prot_ents, SA_NUM_PROT_ENTS);
/*
* For each parameter the user passed in, make sure the name, type
* and value are valid.
*/
for (i = 0; i < num_params; i++) {
struct sa_prot_map *ent;
ent = safindprotent(ps[i].value_name, prot_ents,
SA_NUM_PROT_ENTS);
if (ent == NULL) {
ps[i].status = MT_PARAM_STATUS_ERROR;
snprintf(ps[i].error_str, sizeof(ps[i].error_str),
"Invalid protection entry name %s",
ps[i].value_name);
error = EINVAL;
goto bailout;
}
if (ent->param_type != ps[i].value_type) {
ps[i].status = MT_PARAM_STATUS_ERROR;
snprintf(ps[i].error_str, sizeof(ps[i].error_str),
"Supplied type %d does not match actual type %d",
ps[i].value_type, ent->param_type);
error = EINVAL;
goto bailout;
}
if ((ps[i].value.value_unsigned < ent->min_val)
|| (ps[i].value.value_unsigned > ent->max_val)) {
ps[i].status = MT_PARAM_STATUS_ERROR;
snprintf(ps[i].error_str, sizeof(ps[i].error_str),
"Value %ju is outside valid range %u - %u",
(uintmax_t)ps[i].value.value_unsigned, ent->min_val,
ent->max_val);
error = EINVAL;
goto bailout;
}
*(ent->value) = ps[i].value.value_unsigned;
}
/*
* Actually send the protection settings to the drive.
*/
error = sasetprot(periph, &new_state);
if (error != 0) {
for (i = 0; i < num_params; i++) {
ps[i].status = MT_PARAM_STATUS_ERROR;
snprintf(ps[i].error_str, sizeof(ps[i].error_str),
"Unable to set parameter, see dmesg(8)");
}
goto bailout;
}
/*
* Let the user know that his settings were stored successfully.
*/
for (i = 0; i < num_params; i++)
ps[i].status = MT_PARAM_STATUS_OK;
bailout:
return (error);
}
/*
* Entry handlers generally only handle a single entry. Node handlers will
* handle a contiguous range of parameters to set in a single call.
*/
typedef enum {
SA_PARAM_TYPE_ENTRY,
SA_PARAM_TYPE_NODE
} sa_param_type;
struct sa_param_ent {
char *name;
sa_param_type param_type;
int (*set_func)(struct cam_periph *periph, struct mtparamset *ps,
int num_params);
} sa_param_table[] = {
{"sili", SA_PARAM_TYPE_ENTRY, sasetsili },
{"eot_warn", SA_PARAM_TYPE_ENTRY, saseteotwarn },
{"protection.", SA_PARAM_TYPE_NODE, sasetprotents }
};
static struct sa_param_ent *
safindparament(struct mtparamset *ps)
{
unsigned int i;
for (i = 0; i < nitems(sa_param_table); i++){
/*
* For entries, we compare all of the characters. For
* nodes, we only compare the first N characters. The node
* handler will decode the rest.
*/
if (sa_param_table[i].param_type == SA_PARAM_TYPE_ENTRY) {
if (strcmp(ps->value_name, sa_param_table[i].name) != 0)
continue;
} else {
if (strncmp(ps->value_name, sa_param_table[i].name,
strlen(sa_param_table[i].name)) != 0)
continue;
}
return (&sa_param_table[i]);
}
return (NULL);
}
/*
* Go through a list of parameters, coalescing contiguous parameters with
* the same parent node into a single call to a set_func.
*/
static int
saparamsetlist(struct cam_periph *periph, struct mtsetlist *list,
int need_copy)
{
int i, contig_ents;
int error;
struct mtparamset *params, *first;
struct sa_param_ent *first_ent;
error = 0;
params = NULL;
if (list->num_params == 0)
/* Nothing to do */
goto bailout;
/*
* Verify that the user has the correct structure size.
*/
if ((list->num_params * sizeof(struct mtparamset)) !=
list->param_len) {
xpt_print(periph->path, "%s: length of params %d != "
"sizeof(struct mtparamset) %zd * num_params %d\n",
__func__, list->param_len, sizeof(struct mtparamset),
list->num_params);
error = EINVAL;
goto bailout;
}
if (need_copy != 0) {
/*
* XXX KDM will dropping the lock cause an issue here?
*/
cam_periph_unlock(periph);
params = malloc(list->param_len, M_SCSISA, M_WAITOK | M_ZERO);
error = copyin(list->params, params, list->param_len);
cam_periph_lock(periph);
if (error != 0)
goto bailout;
} else {
params = list->params;
}
contig_ents = 0;
first = NULL;
first_ent = NULL;
for (i = 0; i < list->num_params; i++) {
struct sa_param_ent *ent;
ent = safindparament(&params[i]);
if (ent == NULL) {
snprintf(params[i].error_str,
sizeof(params[i].error_str),
"%s: cannot find parameter %s", __func__,
params[i].value_name);
params[i].status = MT_PARAM_STATUS_ERROR;
break;
}
if (first != NULL) {
if (first_ent == ent) {
/*
* We're still in a contiguous list of
* parameters that can be handled by one
* node handler.
*/
contig_ents++;
continue;
} else {
error = first_ent->set_func(periph, first,
contig_ents);
first = NULL;
first_ent = NULL;
contig_ents = 0;
if (error != 0) {
error = 0;
break;
}
}
}
if (ent->param_type == SA_PARAM_TYPE_NODE) {
first = &params[i];
first_ent = ent;
contig_ents = 1;
} else {
error = ent->set_func(periph, &params[i], 1);
if (error != 0) {
error = 0;
break;
}
}
}
if (first != NULL)
first_ent->set_func(periph, first, contig_ents);
bailout:
if (need_copy != 0) {
if (error != EFAULT) {
cam_periph_unlock(periph);
copyout(params, list->params, list->param_len);
cam_periph_lock(periph);
}
free(params, M_SCSISA);
}
return (error);
}
static int
sagetparams_common(struct cdev *dev, struct cam_periph *periph)
{
struct sa_softc *softc;
u_int8_t write_protect;
int comp_enabled, comp_supported, error;
softc = (struct sa_softc *)periph->softc;
if (softc->open_pending_mount)
return (0);
/* The control device may issue getparams() if there are no opens. */
if (SA_IS_CTRL(dev) && (softc->flags & SA_FLAG_OPEN) != 0)
return (0);
error = sagetparams(periph, SA_PARAM_ALL, &softc->media_blksize,
&softc->media_density, &softc->media_numblks, &softc->buffer_mode,
&write_protect, &softc->speed, &comp_supported, &comp_enabled,
&softc->comp_algorithm, NULL, NULL, 0, 0);
if (error)
return (error);
if (write_protect)
softc->flags |= SA_FLAG_TAPE_WP;
else
softc->flags &= ~SA_FLAG_TAPE_WP;
softc->flags &= ~SA_FLAG_COMPRESSION;
if (comp_supported) {
if (softc->saved_comp_algorithm == 0)
softc->saved_comp_algorithm =
softc->comp_algorithm;
softc->flags |= SA_FLAG_COMP_SUPP;
if (comp_enabled)
softc->flags |= SA_FLAG_COMP_ENABLED;
} else
softc->flags |= SA_FLAG_COMP_UNSUPP;
return (0);
}
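/*
* If an open deferred the mount (non-blocking open), complete the deferred
* mount and prevent media removal before performing the requested operation.
*/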
#define PENDING_MOUNT_CHECK(softc, periph, dev) \
if (softc->open_pending_mount) { \
error = samount(periph, 0, dev); \
if (error) { \
break; \
} \
saprevent(periph, PR_PREVENT); \
softc->open_pending_mount = 0; \
}
static int
saioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct cam_periph *periph;
struct sa_softc *softc;
scsi_space_code spaceop;
int didlockperiph = 0;
int mode;
int error = 0;
mode = SAMODE(dev);
error = 0; /* shut up gcc */
spaceop = 0; /* shut up gcc */
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct sa_softc *)periph->softc;
/*
* Check for control mode accesses. We allow MTIOCGET and
* MTIOCERRSTAT (but need to be the only one open in order
* to clear latched status), and MTSETBSIZE, MTSETDNSTY
* and MTCOMP (but need to be the only one accessing this
* device to run those).
*/
if (SA_IS_CTRL(dev)) {
switch (cmd) {
case MTIOCGETEOTMODEL:
case MTIOCGET:
case MTIOCEXTGET:
case MTIOCPARAMGET:
case MTIOCRBLIM:
break;
case MTIOCERRSTAT:
/*
* If the periph isn't already locked, lock it
* so our MTIOCERRSTAT can reset latched error stats.
*
* If the periph is already locked, skip it because
* we're just getting status and it'll be up to the
* other thread that has this device open to do
* an MTIOCERRSTAT that would clear latched status.
*/
if ((periph->flags & CAM_PERIPH_LOCKED) == 0) {
error = cam_periph_hold(periph, PRIBIO|PCATCH);
if (error != 0) {
cam_periph_unlock(periph);
return (error);
}
didlockperiph = 1;
}
break;
case MTIOCTOP:
{
struct mtop *mt = (struct mtop *) arg;
/*
* Check to make sure it's an OP we can perform
* with no media inserted.
*/
switch (mt->mt_op) {
case MTSETBSIZ:
case MTSETDNSTY:
case MTCOMP:
mt = NULL;
/* FALLTHROUGH */
default:
break;
}
if (mt != NULL) {
break;
}
/* FALLTHROUGH */
}
case MTIOCSETEOTMODEL:
/*
* We need to acquire the peripheral here rather
* than at open time because we are sharing writable
* access to data structures.
*/
error = cam_periph_hold(periph, PRIBIO|PCATCH);
if (error != 0) {
cam_periph_unlock(periph);
return (error);
}
didlockperiph = 1;
break;
default:
cam_periph_unlock(periph);
return (EINVAL);
}
}
/*
* Find the device that the user is talking about
*/
switch (cmd) {
case MTIOCGET:
{
struct mtget *g = (struct mtget *)arg;
error = sagetparams_common(dev, periph);
if (error)
break;
bzero(g, sizeof(struct mtget));
g->mt_type = MT_ISAR;
if (softc->flags & SA_FLAG_COMP_UNSUPP) {
g->mt_comp = MT_COMP_UNSUPP;
g->mt_comp0 = MT_COMP_UNSUPP;
g->mt_comp1 = MT_COMP_UNSUPP;
g->mt_comp2 = MT_COMP_UNSUPP;
g->mt_comp3 = MT_COMP_UNSUPP;
} else {
if ((softc->flags & SA_FLAG_COMP_ENABLED) == 0) {
g->mt_comp = MT_COMP_DISABLED;
} else {
g->mt_comp = softc->comp_algorithm;
}
g->mt_comp0 = softc->comp_algorithm;
g->mt_comp1 = softc->comp_algorithm;
g->mt_comp2 = softc->comp_algorithm;
g->mt_comp3 = softc->comp_algorithm;
}
g->mt_density = softc->media_density;
g->mt_density0 = softc->media_density;
g->mt_density1 = softc->media_density;
g->mt_density2 = softc->media_density;
g->mt_density3 = softc->media_density;
g->mt_blksiz = softc->media_blksize;
g->mt_blksiz0 = softc->media_blksize;
g->mt_blksiz1 = softc->media_blksize;
g->mt_blksiz2 = softc->media_blksize;
g->mt_blksiz3 = softc->media_blksize;
g->mt_fileno = softc->fileno;
g->mt_blkno = softc->blkno;
g->mt_dsreg = (short) softc->dsreg;
/*
* Yes, we know that this is likely to overflow
*/
if (softc->last_resid_was_io) {
if ((g->mt_resid = (short) softc->last_io_resid) != 0) {
if (SA_IS_CTRL(dev) == 0 || didlockperiph) {
softc->last_io_resid = 0;
}
}
} else {
if ((g->mt_resid = (short)softc->last_ctl_resid) != 0) {
if (SA_IS_CTRL(dev) == 0 || didlockperiph) {
softc->last_ctl_resid = 0;
}
}
}
error = 0;
break;
}
case MTIOCEXTGET:
case MTIOCPARAMGET:
{
struct mtextget *g = (struct mtextget *)arg;
char *tmpstr2;
struct sbuf *sb;
/*
* Report drive status using an XML format.
*/
/*
* XXX KDM will dropping the lock cause any problems here?
*/
cam_periph_unlock(periph);
sb = sbuf_new(NULL, NULL, g->alloc_len, SBUF_FIXEDLEN);
if (sb == NULL) {
g->status = MT_EXT_GET_ERROR;
snprintf(g->error_str, sizeof(g->error_str),
"Unable to allocate %d bytes for status info",
g->alloc_len);
cam_periph_lock(periph);
goto extget_bailout;
}
cam_periph_lock(periph);
if (cmd == MTIOCEXTGET)
error = saextget(dev, periph, sb, g);
else
error = saparamget(softc, sb);
if (error != 0)
goto extget_bailout;
error = sbuf_finish(sb);
if (error == ENOMEM) {
g->status = MT_EXT_GET_NEED_MORE_SPACE;
error = 0;
} else if (error != 0) {
g->status = MT_EXT_GET_ERROR;
snprintf(g->error_str, sizeof(g->error_str),
"Error %d returned from sbuf_finish()", error);
} else
g->status = MT_EXT_GET_OK;
error = 0;
tmpstr2 = sbuf_data(sb);
g->fill_len = strlen(tmpstr2) + 1;
cam_periph_unlock(periph);
error = copyout(tmpstr2, g->status_xml, g->fill_len);
cam_periph_lock(periph);
extget_bailout:
sbuf_delete(sb);
break;
}
case MTIOCPARAMSET:
{
struct mtsetlist list;
struct mtparamset *ps = (struct mtparamset *)arg;
bzero(&list, sizeof(list));
list.num_params = 1;
list.param_len = sizeof(*ps);
list.params = ps;
error = saparamsetlist(periph, &list, /*need_copy*/ 0);
break;
}
case MTIOCSETLIST:
{
struct mtsetlist *list = (struct mtsetlist *)arg;
error = saparamsetlist(periph, list, /*need_copy*/ 1);
break;
}
case MTIOCERRSTAT:
{
struct scsi_tape_errors *sep =
&((union mterrstat *)arg)->scsi_errstat;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE,
("saioctl: MTIOCERRSTAT\n"));
bzero(sep, sizeof(*sep));
sep->io_resid = softc->last_io_resid;
bcopy((caddr_t) &softc->last_io_sense, sep->io_sense,
sizeof (sep->io_sense));
bcopy((caddr_t) &softc->last_io_cdb, sep->io_cdb,
sizeof (sep->io_cdb));
sep->ctl_resid = softc->last_ctl_resid;
bcopy((caddr_t) &softc->last_ctl_sense, sep->ctl_sense,
sizeof (sep->ctl_sense));
bcopy((caddr_t) &softc->last_ctl_cdb, sep->ctl_cdb,
sizeof (sep->ctl_cdb));
if ((SA_IS_CTRL(dev) == 0 && !softc->open_pending_mount) ||
didlockperiph)
bzero((caddr_t) &softc->errinfo,
sizeof (softc->errinfo));
error = 0;
break;
}
case MTIOCTOP:
{
struct mtop *mt;
int count;
PENDING_MOUNT_CHECK(softc, periph, dev);
mt = (struct mtop *)arg;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE,
("saioctl: op=0x%x count=0x%x\n",
mt->mt_op, mt->mt_count));
count = mt->mt_count;
switch (mt->mt_op) {
case MTWEOF: /* write an end-of-file marker */
/*
* We don't need to clear the SA_FLAG_TAPE_WRITTEN
* flag because, by keeping track of the filemarks we
* have last written, we know whether or not
* we need to write more when we close the device.
*/
error = sawritefilemarks(periph, count, FALSE, FALSE);
break;
case MTWEOFI:
/* write an end-of-file marker without waiting */
error = sawritefilemarks(periph, count, FALSE, TRUE);
break;
case MTWSS: /* write a setmark */
error = sawritefilemarks(periph, count, TRUE, FALSE);
break;
case MTBSR: /* backward space record */
case MTFSR: /* forward space record */
case MTBSF: /* backward space file */
case MTFSF: /* forward space file */
case MTBSS: /* backward space setmark */
case MTFSS: /* forward space setmark */
case MTEOD: /* space to end of recorded medium */
{
int nmarks;
spaceop = SS_FILEMARKS;
nmarks = softc->filemarks;
error = sacheckeod(periph);
if (error) {
xpt_print(periph->path,
"EOD check prior to spacing failed\n");
softc->flags |= SA_FLAG_EIO_PENDING;
break;
}
nmarks -= softc->filemarks;
switch(mt->mt_op) {
case MTBSR:
count = -count;
/* FALLTHROUGH */
case MTFSR:
spaceop = SS_BLOCKS;
break;
case MTBSF:
count = -count;
/* FALLTHROUGH */
case MTFSF:
break;
case MTBSS:
count = -count;
/* FALLTHROUGH */
case MTFSS:
spaceop = SS_SETMARKS;
break;
case MTEOD:
spaceop = SS_EOD;
count = 0;
nmarks = 0;
break;
default:
error = EINVAL;
break;
}
if (error)
break;
nmarks = softc->filemarks;
/*
* XXX: Why are we checking again?
*/
error = sacheckeod(periph);
if (error)
break;
nmarks -= softc->filemarks;
error = saspace(periph, count - nmarks, spaceop);
/*
* At this point, clear that we've written the tape
* and that we've written any filemarks. We really
* don't know what the application wishes to do next;
* the sacheckeod calls will make sure we terminated the
* tape correctly if we'd been writing, but the next
* action the user application takes will again determine
* whether we need to write filemarks.
*/
softc->flags &=
~(SA_FLAG_TAPE_WRITTEN|SA_FLAG_TAPE_FROZEN);
softc->filemarks = 0;
break;
}
case MTREW: /* rewind */
PENDING_MOUNT_CHECK(softc, periph, dev);
(void) sacheckeod(periph);
error = sarewind(periph);
/* see above */
softc->flags &=
~(SA_FLAG_TAPE_WRITTEN|SA_FLAG_TAPE_FROZEN);
softc->flags &= ~SA_FLAG_ERR_PENDING;
softc->filemarks = 0;
break;
case MTERASE: /* erase */
PENDING_MOUNT_CHECK(softc, periph, dev);
error = saerase(periph, count);
softc->flags &=
~(SA_FLAG_TAPE_WRITTEN|SA_FLAG_TAPE_FROZEN);
softc->flags &= ~SA_FLAG_ERR_PENDING;
break;
case MTRETENS: /* re-tension tape */
PENDING_MOUNT_CHECK(softc, periph, dev);
error = saretension(periph);
softc->flags &=
~(SA_FLAG_TAPE_WRITTEN|SA_FLAG_TAPE_FROZEN);
softc->flags &= ~SA_FLAG_ERR_PENDING;
break;
case MTOFFL: /* rewind and put the drive offline */
PENDING_MOUNT_CHECK(softc, periph, dev);
(void) sacheckeod(periph);
/* see above */
softc->flags &= ~SA_FLAG_TAPE_WRITTEN;
softc->filemarks = 0;
error = sarewind(periph);
/* clear the frozen flag anyway */
softc->flags &= ~SA_FLAG_TAPE_FROZEN;
/*
* Be sure to allow media removal before ejecting.
*/
saprevent(periph, PR_ALLOW);
if (error == 0) {
error = saloadunload(periph, FALSE);
if (error == 0) {
softc->flags &= ~SA_FLAG_TAPE_MOUNTED;
}
}
break;
case MTLOAD:
error = saloadunload(periph, TRUE);
break;
case MTNOP: /* no operation, sets status only */
case MTCACHE: /* enable controller cache */
case MTNOCACHE: /* disable controller cache */
error = 0;
break;
case MTSETBSIZ: /* Set block size for device */
PENDING_MOUNT_CHECK(softc, periph, dev);
if ((softc->sili != 0)
&& (count != 0)) {
xpt_print(periph->path, "Can't enter fixed "
"block mode with SILI enabled\n");
error = EINVAL;
break;
}
error = sasetparams(periph, SA_PARAM_BLOCKSIZE, count,
0, 0, 0);
if (error == 0) {
softc->last_media_blksize =
softc->media_blksize;
softc->media_blksize = count;
if (count) {
softc->flags |= SA_FLAG_FIXED;
if (powerof2(count)) {
softc->blk_shift =
ffs(count) - 1;
softc->blk_mask = count - 1;
} else {
softc->blk_mask = ~0;
softc->blk_shift = 0;
}
/*
* Make the user's desire 'persistent'.
*/
softc->quirks &= ~SA_QUIRK_VARIABLE;
softc->quirks |= SA_QUIRK_FIXED;
} else {
softc->flags &= ~SA_FLAG_FIXED;
if (softc->max_blk == 0) {
softc->max_blk = ~0;
}
softc->blk_shift = 0;
if (softc->blk_gran != 0) {
softc->blk_mask =
softc->blk_gran - 1;
} else {
softc->blk_mask = 0;
}
/*
* Make the user's desire 'persistent'.
*/
softc->quirks |= SA_QUIRK_VARIABLE;
softc->quirks &= ~SA_QUIRK_FIXED;
}
}
break;
case MTSETDNSTY: /* Set density for device and mode */
PENDING_MOUNT_CHECK(softc, periph, dev);
if (count > UCHAR_MAX) {
error = EINVAL;
break;
} else {
error = sasetparams(periph, SA_PARAM_DENSITY,
0, count, 0, 0);
}
break;
case MTCOMP: /* enable compression */
PENDING_MOUNT_CHECK(softc, periph, dev);
/*
* Some devices don't support compression, and
* don't like it if you ask them for the
* compression page.
*/
if ((softc->quirks & SA_QUIRK_NOCOMP) ||
(softc->flags & SA_FLAG_COMP_UNSUPP)) {
error = ENODEV;
break;
}
error = sasetparams(periph, SA_PARAM_COMPRESSION,
0, 0, count, SF_NO_PRINT);
break;
default:
error = EINVAL;
}
break;
}
case MTIOCIEOT:
case MTIOCEEOT:
error = 0;
break;
case MTIOCRDSPOS:
PENDING_MOUNT_CHECK(softc, periph, dev);
error = sardpos(periph, 0, (u_int32_t *) arg);
break;
case MTIOCRDHPOS:
PENDING_MOUNT_CHECK(softc, periph, dev);
error = sardpos(periph, 1, (u_int32_t *) arg);
break;
case MTIOCSLOCATE:
case MTIOCHLOCATE: {
struct mtlocate locate_info;
int hard;
bzero(&locate_info, sizeof(locate_info));
locate_info.logical_id = *((uint32_t *)arg);
if (cmd == MTIOCSLOCATE)
hard = 0;
else
hard = 1;
PENDING_MOUNT_CHECK(softc, periph, dev);
error = sasetpos(periph, hard, &locate_info);
break;
}
case MTIOCEXTLOCATE:
PENDING_MOUNT_CHECK(softc, periph, dev);
error = sasetpos(periph, /*hard*/ 0, (struct mtlocate *)arg);
softc->flags &=
~(SA_FLAG_TAPE_WRITTEN|SA_FLAG_TAPE_FROZEN);
softc->flags &= ~SA_FLAG_ERR_PENDING;
softc->filemarks = 0;
break;
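/*
* Get or set the EOT model: whether one or two filemarks are written
* at end of tape.
*/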
case MTIOCGETEOTMODEL:
error = 0;
if (softc->quirks & SA_QUIRK_1FM)
mode = 1;
else
mode = 2;
*((u_int32_t *) arg) = mode;
break;
case MTIOCSETEOTMODEL:
error = 0;
switch (*((u_int32_t *) arg)) {
case 1:
softc->quirks &= ~SA_QUIRK_2FM;
softc->quirks |= SA_QUIRK_1FM;
break;
case 2:
softc->quirks &= ~SA_QUIRK_1FM;
softc->quirks |= SA_QUIRK_2FM;
break;
default:
error = EINVAL;
break;
}
break;
case MTIOCRBLIM: {
struct mtrblim *rblim;
rblim = (struct mtrblim *)arg;
rblim->granularity = softc->blk_gran;
rblim->min_block_length = softc->min_blk;
rblim->max_block_length = softc->max_blk;
break;
}
default:
error = cam_periph_ioctl(periph, cmd, arg, saerror);
break;
}
/*
* Check to see if we cleared a frozen state
*/
if (error == 0 && (softc->flags & SA_FLAG_TAPE_FROZEN)) {
switch(cmd) {
case MTIOCRDSPOS:
case MTIOCRDHPOS:
case MTIOCSLOCATE:
case MTIOCHLOCATE:
/*
* XXX KDM look at this.
*/
softc->fileno = (daddr_t) -1;
softc->blkno = (daddr_t) -1;
softc->rep_blkno = (daddr_t) -1;
softc->rep_fileno = (daddr_t) -1;
softc->partition = (daddr_t) -1;
softc->flags &= ~SA_FLAG_TAPE_FROZEN;
xpt_print(periph->path,
"tape state now unfrozen.\n");
break;
default:
break;
}
}
if (didlockperiph) {
cam_periph_unhold(periph);
}
cam_periph_unlock(periph);
return (error);
}
static void
sainit(void)
{
cam_status status;
/*
* Install a global async callback.
*/
status = xpt_register_async(AC_FOUND_DEVICE, saasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("sa: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
static void
sadevgonecb(void *arg)
{
struct cam_periph *periph;
struct mtx *mtx;
struct sa_softc *softc;
periph = (struct cam_periph *)arg;
softc = (struct sa_softc *)periph->softc;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc->num_devs_to_destroy--;
if (softc->num_devs_to_destroy == 0) {
int i;
/*
* When we have gotten all of our callbacks, we will get
* no more close calls from devfs. So if we have any
* dangling opens, we need to release the reference held
* for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for devfs, all of our
* instances are gone now.
*/
cam_periph_release_locked(periph);
}
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(mtx);
}
static void
saoninvalidate(struct cam_periph *periph)
{
struct sa_softc *softc;
softc = (struct sa_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, saasync, periph, periph->path);
softc->flags |= SA_FLAG_INVALID;
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
bioq_flush(&softc->bio_queue, NULL, ENXIO);
softc->queue_count = 0;
/*
* Tell devfs that all of our devices have gone away, and ask for a
* callback when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->devs.ctl_dev, sadevgonecb, periph);
destroy_dev_sched_cb(softc->devs.r_dev, sadevgonecb, periph);
destroy_dev_sched_cb(softc->devs.nr_dev, sadevgonecb, periph);
destroy_dev_sched_cb(softc->devs.er_dev, sadevgonecb, periph);
}
static void
sacleanup(struct cam_periph *periph)
{
struct sa_softc *softc;
softc = (struct sa_softc *)periph->softc;
cam_periph_unlock(periph);
if ((softc->flags & SA_FLAG_SCTX_INIT) != 0
&& sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path, "can't remove sysctl context\n");
cam_periph_lock(periph);
devstat_remove_entry(softc->device_stats);
free(softc, M_SCSISA);
}
static void
saasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
break;
if (SID_TYPE(&cgd->inq_data) != T_SEQUENTIAL)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(saregister, saoninvalidate,
sacleanup, sastart,
"sa", CAM_PERIPH_BIO, path,
saasync, AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("saasync: Unable to probe new device "
"due to status 0x%x\n", status);
break;
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static void
sasetupdev(struct sa_softc *softc, struct cdev *dev)
{
dev->si_iosize_max = softc->maxio;
dev->si_flags |= softc->si_flags;
/*
* Keep a count of how many non-alias devices we have created,
* so we can make sure we clean them all up on shutdown. Aliases
* are cleaned up when we destroy the device they're an alias for.
*/
if ((dev->si_flags & SI_ALIAS) == 0)
softc->num_devs_to_destroy++;
}
static void
sasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct sa_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
/*
* If the periph is invalid, no need to setup the sysctls.
*/
if (periph->flags & CAM_PERIPH_INVALID)
goto bailout;
softc = (struct sa_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM SA unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%u", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= SA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_sa), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL)
goto bailout;
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "allow_io_split", CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&softc->allow_io_split, 0, "Allow Splitting I/O");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "maxio", CTLFLAG_RD,
&softc->maxio, 0, "Maximum I/O size");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "cpi_maxio", CTLFLAG_RD,
&softc->cpi_maxio, 0, "Maximum Controller I/O size");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "inject_eom", CTLFLAG_RW,
&softc->inject_eom, 0, "Queue EOM for the next write/read");
bailout:
/*
* Release the reference that was held when this task was enqueued.
*/
cam_periph_release(periph);
}
static cam_status
saregister(struct cam_periph *periph, void *arg)
{
struct sa_softc *softc;
struct ccb_getdev *cgd;
struct ccb_pathinq cpi;
struct make_dev_args args;
caddr_t match;
char tmpstr[80];
int error;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("saregister: no getdev CCB, can't register device\n");
return (CAM_REQ_CMP_ERR);
}
softc = (struct sa_softc *)
malloc(sizeof (*softc), M_SCSISA, M_NOWAIT | M_ZERO);
if (softc == NULL) {
printf("saregister: Unable to probe new device. "
"Unable to allocate softc\n");
return (CAM_REQ_CMP_ERR);
}
softc->scsi_rev = SID_ANSI_REV(&cgd->inq_data);
softc->state = SA_STATE_NORMAL;
softc->fileno = (daddr_t) -1;
softc->blkno = (daddr_t) -1;
softc->rep_fileno = (daddr_t) -1;
softc->rep_blkno = (daddr_t) -1;
softc->partition = (daddr_t) -1;
softc->bop = -1;
softc->eop = -1;
softc->bpew = -1;
bioq_init(&softc->bio_queue);
softc->periph = periph;
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->inq_data,
(caddr_t)sa_quirk_table,
nitems(sa_quirk_table),
sizeof(*sa_quirk_table), scsi_inquiry_match);
if (match != NULL) {
softc->quirks = ((struct sa_quirk_entry *)match)->quirks;
softc->last_media_blksize =
((struct sa_quirk_entry *)match)->prefblk;
} else
softc->quirks = SA_QUIRK_NONE;
/*
* Long format data for READ POSITION was introduced in SSC, which
* was after SCSI-2. (Roughly equivalent to SCSI-3.) If the drive
* reports that it is SCSI-2 or older, it is unlikely to support
* long position data, but it might. Some drives from that era
* claim to be SCSI-2, but do support long position information.
* So, instead of immediately disabling long position information
* for SCSI-2 devices, we'll try one pass through sagetpos(), and
* then disable long position information if we get an error.
*/
if (cgd->inq_data.version <= SCSI_REV_CCS)
softc->quirks |= SA_QUIRK_NO_LONG_POS;
if (cgd->inq_data.spc3_flags & SPC3_SID_PROTECT) {
struct ccb_dev_advinfo cdai;
struct scsi_vpd_extended_inquiry_data ext_inq;
bzero(&ext_inq, sizeof(ext_inq));
xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
cdai.flags = CDAI_FLAG_NONE;
cdai.buftype = CDAI_TYPE_EXT_INQ;
cdai.bufsiz = sizeof(ext_inq);
cdai.buf = (uint8_t *)&ext_inq;
xpt_action((union ccb *)&cdai);
if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
if ((cdai.ccb_h.status == CAM_REQ_CMP)
&& (ext_inq.flags1 & SVPD_EID_SA_SPT_LBP))
softc->flags |= SA_FLAG_PROTECT_SUPP;
}
xpt_path_inq(&cpi, periph->path);
/*
* The SA driver supports a blocksize, but we don't know the
* blocksize until media is inserted. So, set a flag to
* indicate that the blocksize is unavailable right now.
*/
cam_periph_unlock(periph);
softc->device_stats = devstat_new_entry("sa", periph->unit_number, 0,
DEVSTAT_BS_UNAVAILABLE, SID_TYPE(&cgd->inq_data) |
XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_TAPE);
/*
* Load the default value that is either compiled in, or loaded
* in the global kern.cam.sa.allow_io_split tunable.
*/
softc->allow_io_split = sa_allow_io_split;
/*
* Load a per-instance tunable, if it exists. NOTE that this
* tunable WILL GO AWAY in FreeBSD 11.0.
*/
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.sa.%u.allow_io_split",
periph->unit_number);
TUNABLE_INT_FETCH(tmpstr, &softc->allow_io_split);
/*
* If maxio isn't set, we fall back to DFLTPHYS. Otherwise we take
- * the smaller of cpi.maxio or MAXPHYS.
+ * the smaller of cpi.maxio or maxphys.
*/
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS;
- else if (cpi.maxio > MAXPHYS)
- softc->maxio = MAXPHYS;
+ else if (cpi.maxio > maxphys)
+ softc->maxio = maxphys;
else
softc->maxio = cpi.maxio;
/*
* Record the controller's maximum I/O size so we can report it to
* the user later.
*/
softc->cpi_maxio = cpi.maxio;
/*
* By default we tell physio that we do not want our I/O split.
* The user needs to have a 1:1 mapping between the size of his
* write to a tape character device and the size of the write
* that actually goes down to the drive.
*/
if (softc->allow_io_split == 0)
softc->si_flags = SI_NOSPLIT;
else
softc->si_flags = 0;
TASK_INIT(&softc->sysctl_task, 0, sasysctlinit, periph);
/*
* If the SIM supports unmapped I/O, let physio know that we can
* handle unmapped buffers.
*/
if (cpi.hba_misc & PIM_UNMAPPED)
softc->si_flags |= SI_UNMAPPED;
/*
* Acquire a reference to the periph before we create the devfs
* instances for it. We'll release this reference once the devfs
* instances have been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
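/*
* Create the control device node first, then the data device nodes.
* Each one is run through sasetupdev() so we know how many nodes
* must be destroyed when the periph goes away.
*/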
make_dev_args_init(&args);
args.mda_devsw = &sa_cdevsw;
args.mda_si_drv1 = softc->periph;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0660;
args.mda_unit = SAMINOR(SA_CTLDEV, SA_ATYPE_R);
error = make_dev_s(&args, &softc->devs.ctl_dev, "%s%d.ctl",
periph->periph_name, periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
sasetupdev(softc, softc->devs.ctl_dev);
args.mda_unit = SAMINOR(SA_NOT_CTLDEV, SA_ATYPE_R);
error = make_dev_s(&args, &softc->devs.r_dev, "%s%d",
periph->periph_name, periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
sasetupdev(softc, softc->devs.r_dev);
args.mda_unit = SAMINOR(SA_NOT_CTLDEV, SA_ATYPE_NR);
error = make_dev_s(&args, &softc->devs.nr_dev, "n%s%d",
periph->periph_name, periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
sasetupdev(softc, softc->devs.nr_dev);
args.mda_unit = SAMINOR(SA_NOT_CTLDEV, SA_ATYPE_ER);
error = make_dev_s(&args, &softc->devs.er_dev, "e%s%d",
periph->periph_name, periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
sasetupdev(softc, softc->devs.er_dev);
cam_periph_lock(periph);
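/*
* Set up the four REPORT DENSITY SUPPORT variations queried in
* sagetparams(): supported densities, densities of the current
* media, supported medium types, and medium types of the current
* media.
*/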
softc->density_type_bits[0] = 0;
softc->density_type_bits[1] = SRDS_MEDIA;
softc->density_type_bits[2] = SRDS_MEDIUM_TYPE;
softc->density_type_bits[3] = SRDS_MEDIUM_TYPE | SRDS_MEDIA;
/*
* Bump the peripheral refcount for the sysctl thread, in case we
* get invalidated before the thread has a chance to run.
*/
cam_periph_acquire(periph);
taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
/*
* Add an async callback so that we get
* notified if this device goes away.
*/
xpt_register_async(AC_LOST_DEVICE, saasync, periph, periph->path);
xpt_announce_periph(periph, NULL);
xpt_announce_quirks(periph, softc->quirks, SA_QUIRK_BIT_STRING);
return (CAM_REQ_CMP);
}
static void
sastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct sa_softc *softc;
softc = (struct sa_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sastart\n"));
switch (softc->state) {
case SA_STATE_NORMAL:
{
/* Pull a buffer from the queue and get going on it */
struct bio *bp;
/*
* See if there is a buf with work for us to do..
*/
bp = bioq_first(&softc->bio_queue);
if (bp == NULL) {
xpt_release_ccb(start_ccb);
} else if (((softc->flags & SA_FLAG_ERR_PENDING) != 0)
|| (softc->inject_eom != 0)) {
struct bio *done_bp;
if (softc->inject_eom != 0) {
softc->flags |= SA_FLAG_EOM_PENDING;
softc->inject_eom = 0;
/*
* If we're injecting EOM for writes, we
* need to keep PEWS set for 3 queries
* to cover 2 position requests from the
* kernel via sagetpos(), and then allow
* for one for the user to see the BPEW
* flag (e.g. via mt status). After that,
* it will be cleared.
*/
if (bp->bio_cmd == BIO_WRITE)
softc->set_pews_status = 3;
else
softc->set_pews_status = 1;
}
again:
softc->queue_count--;
bioq_remove(&softc->bio_queue, bp);
bp->bio_resid = bp->bio_bcount;
done_bp = bp;
if ((softc->flags & SA_FLAG_EOM_PENDING) != 0) {
/*
* We have two different behaviors for
* writes when we hit either Early Warning
* or the PEWZ (Programmable Early Warning
* Zone). The default behavior is that
* for all writes that are currently
* queued after the write where we saw the
* early warning, we will return the write
* with the residual equal to the count.
* i.e. tell the application that 0 bytes
* were written.
*
* The alternate behavior, which is enabled
* when eot_warn is set, is that in
* addition to setting the residual equal
* to the count, we will set the error
* to ENOSPC.
*
* In either case, once queued writes are
* cleared out, we clear the error flag
* (see below) and the application is free to
* attempt to write more.
*/
if (softc->eot_warn != 0) {
bp->bio_flags |= BIO_ERROR;
bp->bio_error = ENOSPC;
} else
bp->bio_error = 0;
} else if ((softc->flags & SA_FLAG_EOF_PENDING) != 0) {
/*
* This can only happen if we're reading
* in fixed length mode. In this case,
* we dump the rest of the list the
* same way.
*/
bp->bio_error = 0;
if (bioq_first(&softc->bio_queue) != NULL) {
biodone(done_bp);
goto again;
}
} else if ((softc->flags & SA_FLAG_EIO_PENDING) != 0) {
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
}
bp = bioq_first(&softc->bio_queue);
/*
* Only if we have no other buffers queued up
* do we clear the pending error flag.
*/
if (bp == NULL)
softc->flags &= ~SA_FLAG_ERR_PENDING;
CAM_DEBUG(periph->path, CAM_DEBUG_INFO,
("sastart- ERR_PENDING now 0x%x, bp is %sNULL, "
"%d more buffers queued up\n",
(softc->flags & SA_FLAG_ERR_PENDING),
(bp != NULL)? "not " : " ", softc->queue_count));
xpt_release_ccb(start_ccb);
biodone(done_bp);
} else {
u_int32_t length;
bioq_remove(&softc->bio_queue, bp);
softc->queue_count--;
if ((bp->bio_cmd != BIO_READ) &&
(bp->bio_cmd != BIO_WRITE)) {
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
return;
}
length = bp->bio_bcount;
if ((softc->flags & SA_FLAG_FIXED) != 0) {
if (softc->blk_shift != 0) {
length = length >> softc->blk_shift;
} else if (softc->media_blksize != 0) {
length = length / softc->media_blksize;
} else {
bp->bio_error = EIO;
xpt_print(periph->path, "zero blocksize"
" for FIXED length writes?\n");
biodone(bp);
break;
}
#if 0
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_INFO,
("issuing a %d fixed record %s\n",
length, (bp->bio_cmd == BIO_READ)? "read" :
"write"));
#endif
} else {
#if 0
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_INFO,
("issuing a %d variable byte %s\n",
length, (bp->bio_cmd == BIO_READ)? "read" :
"write"));
#endif
}
devstat_start_transaction_bio(softc->device_stats, bp);
/*
* Some people have theorized that we should
* suppress illegal length indication if we are
* running in variable block mode so that we don't
* have to request sense every time our requested
* block size is larger than the written block.
* The residual information from the ccb allows
* us to identify this situation anyway. The only
* problem with this is that we will not get
* information about blocks that are larger than
* our read buffer unless we set the block size
* in the mode page to something other than 0.
*
* I believe that this is a non-issue. If user apps
* don't adjust their read size to match our record
* size, that's just life. Anyway, the typical usage
* would be to issue, e.g., 64KB reads and occasionally
* have to deal with 512 byte or 1KB intermediate
* records.
*
* That said, though, we now support setting the
* SILI bit on reads, and we set the blocksize to 4
* bytes when we do that. This gives us
* compatibility with software that wants this,
* although the only real difference between that
* and not setting the SILI bit on reads is that we
* won't get a check condition on reads where our
* request size is larger than the block on tape.
* That probably only makes a real difference in
* non-packetized SCSI, where you have to go back
* to the drive to request sense and thus incur
* more latency.
*/
softc->dsreg = (bp->bio_cmd == BIO_READ)?
MTIO_DSREG_RD : MTIO_DSREG_WR;
scsi_sa_read_write(&start_ccb->csio, 0, sadone,
MSG_SIMPLE_Q_TAG, (bp->bio_cmd == BIO_READ ?
SCSI_RW_READ : SCSI_RW_WRITE) |
((bp->bio_flags & BIO_UNMAPPED) != 0 ?
SCSI_RW_BIO : 0), softc->sili,
(softc->flags & SA_FLAG_FIXED) != 0, length,
(bp->bio_flags & BIO_UNMAPPED) != 0 ? (void *)bp :
bp->bio_data, bp->bio_bcount, SSD_FULL_SIZE,
IO_TIMEOUT);
start_ccb->ccb_h.ccb_pflags &= ~SA_POSITION_UPDATED;
start_ccb->ccb_h.ccb_bp = bp;
bp = bioq_first(&softc->bio_queue);
xpt_action(start_ccb);
}
if (bp != NULL) {
/* Have more work to do, so ensure we stay scheduled */
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}
break;
}
case SA_STATE_ABNORMAL:
default:
panic("state 0x%x in sastart", softc->state);
break;
}
}
static void
sadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct sa_softc *softc;
struct ccb_scsiio *csio;
struct bio *bp;
int error;
softc = (struct sa_softc *)periph->softc;
csio = &done_ccb->csio;
softc->dsreg = MTIO_DSREG_REST;
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
error = 0;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if ((error = saerror(done_ccb, 0, 0)) == ERESTART) {
/*
* A retry was scheduled, so just return.
*/
return;
}
}
if (error == EIO) {
/*
* Catastrophic error. Mark the tape as frozen
* (we no longer know tape position).
*
* Return all queued I/O with EIO, and unfreeze
* our queue so that future transactions that
* attempt to fix this problem can get to the
* device.
*
*/
softc->flags |= SA_FLAG_TAPE_FROZEN;
bioq_flush(&softc->bio_queue, NULL, EIO);
}
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_error = error;
bp->bio_flags |= BIO_ERROR;
/*
* In the error case, position is updated in saerror.
*/
} else {
bp->bio_resid = csio->resid;
bp->bio_error = 0;
if (csio->resid != 0) {
bp->bio_flags |= BIO_ERROR;
}
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= SA_FLAG_TAPE_WRITTEN;
softc->filemarks = 0;
}
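/*
* Unless saerror() already updated the position, advance our
* cached block number: by the number of blocks transferred in
* fixed mode, or by one record in variable mode.
*/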
if (!(csio->ccb_h.ccb_pflags & SA_POSITION_UPDATED) &&
(softc->blkno != (daddr_t) -1)) {
if ((softc->flags & SA_FLAG_FIXED) != 0) {
u_int32_t l;
if (softc->blk_shift != 0) {
l = bp->bio_bcount >>
softc->blk_shift;
} else {
l = bp->bio_bcount /
softc->media_blksize;
}
softc->blkno += (daddr_t) l;
} else {
softc->blkno++;
}
}
}
/*
* If we had an error (immediate or pending),
* release the device queue now.
*/
if (error || (softc->flags & SA_FLAG_ERR_PENDING))
cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
if (error || bp->bio_resid) {
CAM_DEBUG(periph->path, CAM_DEBUG_INFO,
("error %d resid %ld count %ld\n", error,
bp->bio_resid, bp->bio_bcount));
}
biofinish(bp, softc->device_stats, 0);
xpt_release_ccb(done_ccb);
}
/*
* Mount the tape (make sure it's ready for I/O).
*/
static int
samount(struct cam_periph *periph, int oflags, struct cdev *dev)
{
struct sa_softc *softc;
union ccb *ccb;
int error;
/*
* oflags can be checked for 'kind' of open (read-only check) - later
* dev can be checked for a control-mode or compression open - later
*/
UNUSED_PARAMETER(oflags);
UNUSED_PARAMETER(dev);
softc = (struct sa_softc *)periph->softc;
/*
* This should determine if something has happened since the last
* open/mount that would invalidate the mount. We do *not* want
* to retry this command- we just want the status. But we only
* do this if we're mounted already- if we're not mounted,
* we don't care about the unit read state and can instead use
* this opportunity to attempt to reserve the tape unit.
*/
if (softc->flags & SA_FLAG_TAPE_MOUNTED) {
ccb = cam_periph_getccb(periph, 1);
scsi_test_unit_ready(&ccb->csio, 0, NULL,
MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, IO_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
if (error == ENXIO) {
softc->flags &= ~SA_FLAG_TAPE_MOUNTED;
scsi_test_unit_ready(&ccb->csio, 0, NULL,
MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, IO_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
} else if (error) {
/*
* We don't need to freeze the tape because we
* will now attempt to rewind/load it.
*/
softc->flags &= ~SA_FLAG_TAPE_MOUNTED;
if (CAM_DEBUGGED(periph->path, CAM_DEBUG_INFO)) {
xpt_print(periph->path,
"error %d on TUR in samount\n", error);
}
}
} else {
error = sareservereleaseunit(periph, TRUE);
if (error) {
return (error);
}
ccb = cam_periph_getccb(periph, 1);
scsi_test_unit_ready(&ccb->csio, 0, NULL,
MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, IO_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
}
if ((softc->flags & SA_FLAG_TAPE_MOUNTED) == 0) {
struct scsi_read_block_limits_data *rblim = NULL;
int comp_enabled, comp_supported;
u_int8_t write_protect, guessing = 0;
/*
* Clear out old state.
*/
softc->flags &= ~(SA_FLAG_TAPE_WP|SA_FLAG_TAPE_WRITTEN|
SA_FLAG_ERR_PENDING|SA_FLAG_COMPRESSION);
softc->filemarks = 0;
/*
* *Very* first off, make sure we're loaded to BOT.
*/
scsi_load_unload(&ccb->csio, 2, NULL, MSG_SIMPLE_Q_TAG, FALSE,
FALSE, FALSE, 1, SSD_FULL_SIZE, REWIND_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
/*
* In case this doesn't work, do a REWIND instead
*/
if (error) {
scsi_rewind(&ccb->csio, 2, NULL, MSG_SIMPLE_Q_TAG,
FALSE, SSD_FULL_SIZE, REWIND_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
}
if (error) {
xpt_release_ccb(ccb);
goto exit;
}
/*
* Do a dummy test read to force access to the
* media so that the drive will really know what's
* there. We don't really care what the
* blocksize on tape is and don't expect to
* read a full record.
*/
rblim = (struct scsi_read_block_limits_data *)
malloc(8192, M_SCSISA, M_NOWAIT);
if (rblim == NULL) {
xpt_print(periph->path, "no memory for test read\n");
xpt_release_ccb(ccb);
error = ENOMEM;
goto exit;
}
if ((softc->quirks & SA_QUIRK_NODREAD) == 0) {
scsi_sa_read_write(&ccb->csio, 0, NULL,
MSG_SIMPLE_Q_TAG, 1, FALSE, 0, 8192,
(void *) rblim, 8192, SSD_FULL_SIZE,
IO_TIMEOUT);
(void) cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
scsi_rewind(&ccb->csio, 1, NULL, MSG_SIMPLE_Q_TAG,
FALSE, SSD_FULL_SIZE, REWIND_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, CAM_RETRY_SELTO,
SF_NO_PRINT | SF_RETRY_UA,
softc->device_stats);
if (error) {
xpt_print(periph->path,
"unable to rewind after test read\n");
xpt_release_ccb(ccb);
goto exit;
}
}
/*
* Next off, determine block limits.
*/
scsi_read_block_limits(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG,
rblim, SSD_FULL_SIZE, SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, CAM_RETRY_SELTO,
SF_NO_PRINT | SF_RETRY_UA, softc->device_stats);
xpt_release_ccb(ccb);
if (error != 0) {
/*
* If it's less than SCSI-2, READ BLOCK LIMITS is not
* a MANDATORY command. Either way, it doesn't matter-
* we can proceed anyway.
*/
softc->blk_gran = 0;
softc->max_blk = ~0;
softc->min_blk = 0;
} else {
if (softc->scsi_rev >= SCSI_REV_SPC) {
softc->blk_gran = RBL_GRAN(rblim);
} else {
softc->blk_gran = 0;
}
/*
* We take max_blk == min_blk to mean a default to
* fixed mode- but note that whatever we get out of
* sagetparams below will actually determine whether
* we are actually *in* fixed mode.
*/
softc->max_blk = scsi_3btoul(rblim->maximum);
softc->min_blk = scsi_2btoul(rblim->minimum);
}
/*
* Next, perform a mode sense to determine
* current density, blocksize, compression etc.
*/
error = sagetparams(periph, SA_PARAM_ALL,
&softc->media_blksize,
&softc->media_density,
&softc->media_numblks,
&softc->buffer_mode, &write_protect,
&softc->speed, &comp_supported,
&comp_enabled, &softc->comp_algorithm,
NULL, NULL, 0, 0);
if (error != 0) {
/*
* We could work a little harder here. We could
* adjust our attempts to get information. It
* might be an ancient tape drive. If someone
* nudges us, we'll do that.
*/
goto exit;
}
/*
* If no quirk has determined that this is a device that is
* preferred to be in fixed or variable mode, now is the time
* to find out.
*/
if ((softc->quirks & (SA_QUIRK_FIXED|SA_QUIRK_VARIABLE)) == 0) {
guessing = 1;
/*
* This could be expensive to find out. Luckily we
* only need to do this once. If we start out in
* 'default' mode, try and set ourselves to one
* of the densities that would determine a wad
* of other stuff. Go from highest to lowest.
*/
if (softc->media_density == SCSI_DEFAULT_DENSITY) {
int i;
static u_int8_t ctry[] = {
SCSI_DENSITY_HALFINCH_PE,
SCSI_DENSITY_HALFINCH_6250C,
SCSI_DENSITY_HALFINCH_6250,
SCSI_DENSITY_HALFINCH_1600,
SCSI_DENSITY_HALFINCH_800,
SCSI_DENSITY_QIC_4GB,
SCSI_DENSITY_QIC_2GB,
SCSI_DENSITY_QIC_525_320,
SCSI_DENSITY_QIC_150,
SCSI_DENSITY_QIC_120,
SCSI_DENSITY_QIC_24,
SCSI_DENSITY_QIC_11_9TRK,
SCSI_DENSITY_QIC_11_4TRK,
SCSI_DENSITY_QIC_1320,
SCSI_DENSITY_QIC_3080,
0
};
for (i = 0; ctry[i]; i++) {
error = sasetparams(periph,
SA_PARAM_DENSITY, 0, ctry[i],
0, SF_NO_PRINT);
if (error == 0) {
softc->media_density = ctry[i];
break;
}
}
}
switch (softc->media_density) {
case SCSI_DENSITY_QIC_11_4TRK:
case SCSI_DENSITY_QIC_11_9TRK:
case SCSI_DENSITY_QIC_24:
case SCSI_DENSITY_QIC_120:
case SCSI_DENSITY_QIC_150:
case SCSI_DENSITY_QIC_525_320:
case SCSI_DENSITY_QIC_1320:
case SCSI_DENSITY_QIC_3080:
softc->quirks &= ~SA_QUIRK_2FM;
softc->quirks |= SA_QUIRK_FIXED|SA_QUIRK_1FM;
softc->last_media_blksize = 512;
break;
case SCSI_DENSITY_QIC_4GB:
case SCSI_DENSITY_QIC_2GB:
softc->quirks &= ~SA_QUIRK_2FM;
softc->quirks |= SA_QUIRK_FIXED|SA_QUIRK_1FM;
softc->last_media_blksize = 1024;
break;
default:
softc->last_media_blksize =
softc->media_blksize;
softc->quirks |= SA_QUIRK_VARIABLE;
break;
}
}
/*
* If no quirk has determined that this is a device that needs
* to have 2 Filemarks at EOD, now is the time to find out.
*/
if ((softc->quirks & SA_QUIRK_2FM) == 0) {
switch (softc->media_density) {
case SCSI_DENSITY_HALFINCH_800:
case SCSI_DENSITY_HALFINCH_1600:
case SCSI_DENSITY_HALFINCH_6250:
case SCSI_DENSITY_HALFINCH_6250C:
case SCSI_DENSITY_HALFINCH_PE:
softc->quirks &= ~SA_QUIRK_1FM;
softc->quirks |= SA_QUIRK_2FM;
break;
default:
break;
}
}
/*
* Now validate that some info we got makes sense.
*/
if ((softc->max_blk < softc->media_blksize) ||
(softc->min_blk > softc->media_blksize &&
softc->media_blksize)) {
xpt_print(periph->path,
"BLOCK LIMITS (%d..%d) could not match current "
"block settings (%d)- adjusting\n", softc->min_blk,
softc->max_blk, softc->media_blksize);
softc->max_blk = softc->min_blk =
softc->media_blksize;
}
/*
* Now put ourselves into the right frame of mind based
* upon quirks...
*/
tryagain:
/*
* If we want to be in FIXED mode and our current blocksize
* is not equal to our last blocksize (if nonzero), try and
* set ourselves to this last blocksize (as the 'preferred'
* block size). The initial quirkmatch at registry sets the
* initial 'last' blocksize. If, for whatever reason, this
* 'last' blocksize is zero, set the blocksize to 512,
* or min_blk if that's larger.
*/
if ((softc->quirks & SA_QUIRK_FIXED) &&
(softc->quirks & SA_QUIRK_NO_MODESEL) == 0 &&
(softc->media_blksize != softc->last_media_blksize)) {
softc->media_blksize = softc->last_media_blksize;
if (softc->media_blksize == 0) {
softc->media_blksize = 512;
if (softc->media_blksize < softc->min_blk) {
softc->media_blksize = softc->min_blk;
}
}
error = sasetparams(periph, SA_PARAM_BLOCKSIZE,
softc->media_blksize, 0, 0, SF_NO_PRINT);
if (error) {
xpt_print(periph->path,
"unable to set fixed blocksize to %d\n",
softc->media_blksize);
goto exit;
}
}
if ((softc->quirks & SA_QUIRK_VARIABLE) &&
(softc->media_blksize != 0)) {
softc->last_media_blksize = softc->media_blksize;
softc->media_blksize = 0;
error = sasetparams(periph, SA_PARAM_BLOCKSIZE,
0, 0, 0, SF_NO_PRINT);
if (error) {
/*
* If this fails and we were guessing, just
* assume that we got it wrong and go try
* fixed block mode. Don't even check against
* density code at this point.
*/
if (guessing) {
softc->quirks &= ~SA_QUIRK_VARIABLE;
softc->quirks |= SA_QUIRK_FIXED;
if (softc->last_media_blksize == 0)
softc->last_media_blksize = 512;
goto tryagain;
}
xpt_print(periph->path,
"unable to set variable blocksize\n");
goto exit;
}
}
/*
* Now that we have the current block size,
* set up some parameters for sastart's usage.
*/
if (softc->media_blksize) {
softc->flags |= SA_FLAG_FIXED;
if (powerof2(softc->media_blksize)) {
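/* e.g. a 1024-byte blocksize gives blk_shift 10, blk_mask 0x3ff */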
softc->blk_shift =
ffs(softc->media_blksize) - 1;
softc->blk_mask = softc->media_blksize - 1;
} else {
softc->blk_mask = ~0;
softc->blk_shift = 0;
}
} else {
/*
* The SCSI-3 spec allows 0 to mean "unspecified".
* The SCSI-1 spec allows 0 to mean 'infinite'.
*
* Either works here.
*/
if (softc->max_blk == 0) {
softc->max_blk = ~0;
}
softc->blk_shift = 0;
if (softc->blk_gran != 0) {
softc->blk_mask = softc->blk_gran - 1;
} else {
softc->blk_mask = 0;
}
}
if (write_protect)
softc->flags |= SA_FLAG_TAPE_WP;
if (comp_supported) {
if (softc->saved_comp_algorithm == 0)
softc->saved_comp_algorithm =
softc->comp_algorithm;
softc->flags |= SA_FLAG_COMP_SUPP;
if (comp_enabled)
softc->flags |= SA_FLAG_COMP_ENABLED;
} else
softc->flags |= SA_FLAG_COMP_UNSUPP;
if ((softc->buffer_mode == SMH_SA_BUF_MODE_NOBUF) &&
(softc->quirks & SA_QUIRK_NO_MODESEL) == 0) {
error = sasetparams(periph, SA_PARAM_BUFF_MODE, 0,
0, 0, SF_NO_PRINT);
if (error == 0) {
softc->buffer_mode = SMH_SA_BUF_MODE_SIBUF;
} else {
xpt_print(periph->path,
"unable to set buffered mode\n");
}
error = 0; /* not an error */
}
if (error == 0) {
softc->flags |= SA_FLAG_TAPE_MOUNTED;
}
exit:
if (rblim != NULL)
free(rblim, M_SCSISA);
if (error != 0) {
softc->dsreg = MTIO_DSREG_NIL;
} else {
softc->fileno = softc->blkno = 0;
softc->rep_fileno = softc->rep_blkno = -1;
softc->partition = 0;
softc->dsreg = MTIO_DSREG_REST;
}
#ifdef SA_1FM_AT_EOD
if ((softc->quirks & SA_QUIRK_2FM) == 0)
softc->quirks |= SA_QUIRK_1FM;
#else
if ((softc->quirks & SA_QUIRK_1FM) == 0)
softc->quirks |= SA_QUIRK_2FM;
#endif
} else
xpt_release_ccb(ccb);
/*
* If we return an error, we're not mounted any more,
* so release any device reservation.
*/
if (error != 0) {
(void) sareservereleaseunit(periph, FALSE);
} else {
/*
* Clear I/O residual.
*/
softc->last_io_resid = 0;
softc->last_ctl_resid = 0;
}
return (error);
}
/*
* How many filemarks do we need to write if we were to terminate the
* tape session right now? Note that this can be a negative number.
*/
static int
samarkswanted(struct cam_periph *periph)
{
int markswanted;
struct sa_softc *softc;
softc = (struct sa_softc *)periph->softc;
markswanted = 0;
if ((softc->flags & SA_FLAG_TAPE_WRITTEN) != 0) {
markswanted++;
if (softc->quirks & SA_QUIRK_2FM)
markswanted++;
}
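/*
* Subtract the filemarks we've already written; if that is more
* than we need, the result goes negative.
*/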
markswanted -= softc->filemarks;
return (markswanted);
}
static int
sacheckeod(struct cam_periph *periph)
{
int error;
int markswanted;
markswanted = samarkswanted(periph);
if (markswanted > 0) {
error = sawritefilemarks(periph, markswanted, FALSE, FALSE);
} else {
error = 0;
}
return (error);
}
static int
saerror(union ccb *ccb, u_int32_t cflgs, u_int32_t sflgs)
{
static const char *toobig =
"%d-byte tape record bigger than supplied buffer\n";
struct cam_periph *periph;
struct sa_softc *softc;
struct ccb_scsiio *csio;
struct scsi_sense_data *sense;
uint64_t resid = 0;
int64_t info = 0;
cam_status status;
int error_code, sense_key, asc, ascq, error, aqvalid, stream_valid;
int sense_len;
uint8_t stream_bits;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct sa_softc *)periph->softc;
csio = &ccb->csio;
sense = &csio->sense_data;
sense_len = csio->sense_len - csio->sense_resid;
scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key,
&asc, &ascq, /*show_errors*/ 1);
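/*
* Note whether we got valid ASC/ASCQ values and valid stream
* (filemark/EOM/ILI) bits out of the sense data.
*/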
if (asc != -1 && ascq != -1)
aqvalid = 1;
else
aqvalid = 0;
if (scsi_get_stream_info(sense, sense_len, NULL, &stream_bits) == 0)
stream_valid = 1;
else
stream_valid = 0;
error = 0;
status = csio->ccb_h.status & CAM_STATUS_MASK;
/*
* Calculate/latch up any residuals... We do this in a funny 2-step
* so we can print stuff here if we have CAM_DEBUG enabled for this
* unit.
*/
if (status == CAM_SCSI_STATUS_ERROR) {
if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &resid,
&info) == 0) {
if ((softc->flags & SA_FLAG_FIXED) != 0)
resid *= softc->media_blksize;
} else {
resid = csio->dxfer_len;
info = resid;
if ((softc->flags & SA_FLAG_FIXED) != 0) {
if (softc->media_blksize)
info /= softc->media_blksize;
}
}
if (csio->cdb_io.cdb_bytes[0] == SA_READ ||
csio->cdb_io.cdb_bytes[0] == SA_WRITE) {
bcopy((caddr_t) sense, (caddr_t) &softc->last_io_sense,
sizeof (struct scsi_sense_data));
bcopy(csio->cdb_io.cdb_bytes, softc->last_io_cdb,
(int) csio->cdb_len);
softc->last_io_resid = resid;
softc->last_resid_was_io = 1;
} else {
bcopy((caddr_t) sense, (caddr_t) &softc->last_ctl_sense,
sizeof (struct scsi_sense_data));
bcopy(csio->cdb_io.cdb_bytes, softc->last_ctl_cdb,
(int) csio->cdb_len);
softc->last_ctl_resid = resid;
softc->last_resid_was_io = 0;
}
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("CDB[0]=0x%x Key 0x%x "
"ASC/ASCQ 0x%x/0x%x CAM STATUS 0x%x flags 0x%x resid %jd "
"dxfer_len %d\n", csio->cdb_io.cdb_bytes[0] & 0xff,
sense_key, asc, ascq, status,
(stream_valid) ? stream_bits : 0, (intmax_t)resid,
csio->dxfer_len));
} else {
CAM_DEBUG(periph->path, CAM_DEBUG_INFO,
("Cam Status 0x%x\n", status));
}
switch (status) {
case CAM_REQ_CMP:
return (0);
case CAM_SCSI_STATUS_ERROR:
/*
* If a read/write command, we handle it here.
*/
if (csio->cdb_io.cdb_bytes[0] == SA_READ ||
csio->cdb_io.cdb_bytes[0] == SA_WRITE) {
break;
}
/*
* If this was just EOM/EOP, Filemark, Setmark, ILI or
* PEW detected on a non read/write command, we assume
* it's not an error and propagate the residual and return.
*/
if ((aqvalid && asc == 0 && ((ascq > 0 && ascq <= 5)
|| (ascq == 0x07)))
|| (aqvalid == 0 && sense_key == SSD_KEY_NO_SENSE)) {
csio->resid = resid;
QFRLS(ccb);
return (0);
}
/*
* Otherwise, we let the common code handle this.
*/
return (cam_periph_error(ccb, cflgs, sflgs));
/*
* XXX: To Be Fixed
* We cannot depend upon CAM honoring retry counts for these.
*/
case CAM_SCSI_BUS_RESET:
case CAM_BDR_SENT:
if (ccb->ccb_h.retry_count <= 0) {
return (EIO);
}
/* FALLTHROUGH */
default:
return (cam_periph_error(ccb, cflgs, sflgs));
}
/*
* Handle filemark, end of tape, mismatched record sizes....
* From this point out, we're only handling read/write cases.
* Handle writes && reads differently.
*/
if (csio->cdb_io.cdb_bytes[0] == SA_WRITE) {
if (sense_key == SSD_KEY_VOLUME_OVERFLOW) {
csio->resid = resid;
error = ENOSPC;
} else if ((stream_valid != 0) && (stream_bits & SSD_EOM)) {
softc->flags |= SA_FLAG_EOM_PENDING;
/*
* Grotesque as it seems, the few times
* I've actually seen a non-zero resid,
* the tape drive actually lied and had
* written all the data!
*/
csio->resid = 0;
}
} else {
csio->resid = resid;
if (sense_key == SSD_KEY_BLANK_CHECK) {
if (softc->quirks & SA_QUIRK_1FM) {
error = 0;
softc->flags |= SA_FLAG_EOM_PENDING;
} else {
error = EIO;
}
} else if ((stream_valid != 0) && (stream_bits & SSD_FILEMARK)){
if (softc->flags & SA_FLAG_FIXED) {
error = -1;
softc->flags |= SA_FLAG_EOF_PENDING;
}
/*
* Unconditionally, if we detected a filemark on a read,
* mark that we've moved a file ahead.
*/
if (softc->fileno != (daddr_t) -1) {
softc->fileno++;
softc->blkno = 0;
csio->ccb_h.ccb_pflags |= SA_POSITION_UPDATED;
}
}
}
/*
* Incorrect Length usually applies to read, but can apply to writes.
*/
if (error == 0 && (stream_valid != 0) && (stream_bits & SSD_ILI)) {
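/*
* A negative information value means the record on tape was
* larger than our request; dxfer_len - info gives the record's
* actual size.
*/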
if (info < 0) {
xpt_print(csio->ccb_h.path, toobig,
csio->dxfer_len - info);
csio->resid = csio->dxfer_len;
error = EIO;
} else {
csio->resid = resid;
if (softc->flags & SA_FLAG_FIXED) {
softc->flags |= SA_FLAG_EIO_PENDING;
}
/*
* Bump the block number if we hadn't seen a filemark.
* Do this independent of errors (we've moved anyway).
*/
if ((stream_valid == 0) ||
(stream_bits & SSD_FILEMARK) == 0) {
if (softc->blkno != (daddr_t) -1) {
softc->blkno++;
csio->ccb_h.ccb_pflags |=
SA_POSITION_UPDATED;
}
}
}
}
if (error <= 0) {
/*
* Unfreeze the queue if frozen as we're not returning anything
* to our waiters that would indicate an I/O error has occurred
* (yet).
*/
QFRLS(ccb);
error = 0;
}
return (error);
}
static int
sagetparams(struct cam_periph *periph, sa_params params_to_get,
u_int32_t *blocksize, u_int8_t *density, u_int32_t *numblocks,
int *buff_mode, u_int8_t *write_protect, u_int8_t *speed,
int *comp_supported, int *comp_enabled, u_int32_t *comp_algorithm,
sa_comp_t *tcs, struct scsi_control_data_prot_subpage *prot_page,
int dp_size, int prot_changeable)
{
union ccb *ccb;
void *mode_buffer;
struct scsi_mode_header_6 *mode_hdr;
struct scsi_mode_blk_desc *mode_blk;
int mode_buffer_len;
struct sa_softc *softc;
u_int8_t cpage;
int error;
cam_status status;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
if (softc->quirks & SA_QUIRK_NO_CPAGE)
cpage = SA_DEVICE_CONFIGURATION_PAGE;
else
cpage = SA_DATA_COMPRESSION_PAGE;
retry:
mode_buffer_len = sizeof(*mode_hdr) + sizeof(*mode_blk);
if (params_to_get & SA_PARAM_COMPRESSION) {
if (softc->quirks & SA_QUIRK_NOCOMP) {
*comp_supported = FALSE;
params_to_get &= ~SA_PARAM_COMPRESSION;
} else
mode_buffer_len += sizeof (sa_comp_t);
}
/* XXX Fix M_NOWAIT */
mode_buffer = malloc(mode_buffer_len, M_SCSISA, M_NOWAIT | M_ZERO);
if (mode_buffer == NULL) {
xpt_release_ccb(ccb);
return (ENOMEM);
}
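/*
* The mode sense data is laid out as the MODE SENSE(6) header,
* followed by the block descriptor, optionally followed by the
* compression (or device configuration) mode page.
*/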
mode_hdr = (struct scsi_mode_header_6 *)mode_buffer;
mode_blk = (struct scsi_mode_blk_desc *)&mode_hdr[1];
/* it is safe to retry this */
scsi_mode_sense(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG, FALSE,
SMS_PAGE_CTRL_CURRENT, (params_to_get & SA_PARAM_COMPRESSION) ?
cpage : SMS_VENDOR_SPECIFIC_PAGE, mode_buffer, mode_buffer_len,
SSD_FULL_SIZE, SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
status = ccb->ccb_h.status & CAM_STATUS_MASK;
if (error == EINVAL && (params_to_get & SA_PARAM_COMPRESSION) != 0) {
/*
* Hmm. Let's see if we can try another page...
* If we've already done that, give up on compression
* for this device and remember this for the future
* and attempt the request without asking for compression
* info.
*/
if (cpage == SA_DATA_COMPRESSION_PAGE) {
cpage = SA_DEVICE_CONFIGURATION_PAGE;
goto retry;
}
softc->quirks |= SA_QUIRK_NOCOMP;
free(mode_buffer, M_SCSISA);
goto retry;
} else if (status == CAM_SCSI_STATUS_ERROR) {
/* Tell the user about the fatal error. */
scsi_sense_print(&ccb->csio);
goto sagetparamsexit;
}
/*
* If the user only wants the compression information, and
* the device doesn't send back the block descriptor, it's
* no big deal. If the user wants more than just
* compression, though, and the device doesn't pass back the
* block descriptor, we need to send another mode sense to
* get the block descriptor.
*/
if ((mode_hdr->blk_desc_len == 0) &&
(params_to_get & SA_PARAM_COMPRESSION) &&
(params_to_get & ~(SA_PARAM_COMPRESSION))) {
/*
* Decrease the mode buffer length by the size of
* the compression page, to make sure the data
* there doesn't get overwritten.
*/
mode_buffer_len -= sizeof (sa_comp_t);
/*
* Now move the compression page that we presumably
* got back down the memory chunk a little bit so
* it doesn't get spammed.
*/
bcopy(&mode_hdr[0], &mode_hdr[1], sizeof (sa_comp_t));
bzero(&mode_hdr[0], sizeof (mode_hdr[0]));
/*
* Now, we issue another mode sense and just ask
* for the block descriptor, etc.
*/
scsi_mode_sense(&ccb->csio, 2, NULL, MSG_SIMPLE_Q_TAG, FALSE,
SMS_PAGE_CTRL_CURRENT, SMS_VENDOR_SPECIFIC_PAGE,
mode_buffer, mode_buffer_len, SSD_FULL_SIZE,
SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
if (error != 0)
goto sagetparamsexit;
}
if (params_to_get & SA_PARAM_BLOCKSIZE)
*blocksize = scsi_3btoul(mode_blk->blklen);
if (params_to_get & SA_PARAM_NUMBLOCKS)
*numblocks = scsi_3btoul(mode_blk->nblocks);
if (params_to_get & SA_PARAM_BUFF_MODE)
*buff_mode = mode_hdr->dev_spec & SMH_SA_BUF_MODE_MASK;
if (params_to_get & SA_PARAM_DENSITY)
*density = mode_blk->density;
if (params_to_get & SA_PARAM_WP)
*write_protect = (mode_hdr->dev_spec & SMH_SA_WP)? TRUE : FALSE;
if (params_to_get & SA_PARAM_SPEED)
*speed = mode_hdr->dev_spec & SMH_SA_SPEED_MASK;
if (params_to_get & SA_PARAM_COMPRESSION) {
sa_comp_t *ntcs = (sa_comp_t *) &mode_blk[1];
if (cpage == SA_DATA_COMPRESSION_PAGE) {
struct scsi_data_compression_page *cp = &ntcs->dcomp;
*comp_supported =
(cp->dce_and_dcc & SA_DCP_DCC)? TRUE : FALSE;
*comp_enabled =
(cp->dce_and_dcc & SA_DCP_DCE)? TRUE : FALSE;
*comp_algorithm = scsi_4btoul(cp->comp_algorithm);
} else {
struct scsi_dev_conf_page *cp = &ntcs->dconf;
/*
* We don't really know whether this device supports
* Data Compression if the algorithm field is
* zero. Just say we do.
*/
*comp_supported = TRUE;
*comp_enabled =
(cp->sel_comp_alg != SA_COMP_NONE)? TRUE : FALSE;
*comp_algorithm = cp->sel_comp_alg;
}
if (tcs != NULL)
bcopy(ntcs, tcs, sizeof (sa_comp_t));
}
if ((params_to_get & SA_PARAM_DENSITY_EXT)
&& (softc->scsi_rev >= SCSI_REV_SPC)) {
int i;
for (i = 0; i < SA_DENSITY_TYPES; i++) {
scsi_report_density_support(&ccb->csio,
/*retries*/ 1,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*media*/ softc->density_type_bits[i] & SRDS_MEDIA,
/*medium_type*/ softc->density_type_bits[i] &
SRDS_MEDIUM_TYPE,
/*data_ptr*/ softc->density_info[i],
/*length*/ sizeof(softc->density_info[i]),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ REP_DENSITY_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
status = ccb->ccb_h.status & CAM_STATUS_MASK;
/*
* Some tape drives won't support this command at
* all, but hopefully we'll minimize that with the
* check for SPC or greater support above. If they
* don't support the default report (neither the
* MEDIA or MEDIUM_TYPE bits set), then there is
* really no point in continuing on to look for
* other reports.
*/
if ((error != 0)
|| (status != CAM_REQ_CMP)) {
error = 0;
softc->density_info_valid[i] = 0;
if (softc->density_type_bits[i] == 0)
break;
else
continue;
}
softc->density_info_valid[i] = ccb->csio.dxfer_len -
ccb->csio.resid;
}
}
/*
* Get logical block protection parameters if the drive supports it.
*/
if ((params_to_get & SA_PARAM_LBP)
&& (softc->flags & SA_FLAG_PROTECT_SUPP)) {
struct scsi_mode_header_10 *mode10_hdr;
struct scsi_control_data_prot_subpage *dp_page;
struct scsi_mode_sense_10 *cdb;
struct sa_prot_state *prot;
int dp_len, returned_len;
if (dp_size == 0)
dp_size = sizeof(*dp_page);
dp_len = sizeof(*mode10_hdr) + dp_size;
mode10_hdr = malloc(dp_len, M_SCSISA, M_NOWAIT | M_ZERO);
if (mode10_hdr == NULL) {
error = ENOMEM;
goto sagetparamsexit;
}
scsi_mode_sense_len(&ccb->csio,
/*retries*/ 5,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*dbd*/ TRUE,
/*page_code*/ (prot_changeable == 0) ?
SMS_PAGE_CTRL_CURRENT :
SMS_PAGE_CTRL_CHANGEABLE,
/*page*/ SMS_CONTROL_MODE_PAGE,
/*param_buf*/ (uint8_t *)mode10_hdr,
/*param_len*/ dp_len,
/*minimum_cmd_size*/ 10,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ SCSIOP_TIMEOUT);
/*
* XXX KDM we need to be able to set the subpage in the
* fill function.
*/
cdb = (struct scsi_mode_sense_10 *)ccb->csio.cdb_io.cdb_bytes;
cdb->subpage = SA_CTRL_DP_SUBPAGE_CODE;
error = cam_periph_runccb(ccb, saerror, 0, SF_NO_PRINT,
softc->device_stats);
if (error != 0) {
free(mode10_hdr, M_SCSISA);
goto sagetparamsexit;
}
status = ccb->ccb_h.status & CAM_STATUS_MASK;
if (status != CAM_REQ_CMP) {
error = EINVAL;
free(mode10_hdr, M_SCSISA);
goto sagetparamsexit;
}
/*
* The returned data length at least has to be long enough
* for us to look at length in the mode page header.
*/
returned_len = ccb->csio.dxfer_len - ccb->csio.resid;
if (returned_len < sizeof(mode10_hdr->data_length)) {
error = EINVAL;
free(mode10_hdr, M_SCSISA);
goto sagetparamsexit;
}
returned_len = min(returned_len,
sizeof(mode10_hdr->data_length) +
scsi_2btoul(mode10_hdr->data_length));
dp_page = (struct scsi_control_data_prot_subpage *)
&mode10_hdr[1];
/*
* We also have to have enough data to include the prot_bits
* in the subpage.
*/
if (returned_len < (sizeof(*mode10_hdr) +
__offsetof(struct scsi_control_data_prot_subpage, prot_bits)
+ sizeof(dp_page->prot_bits))) {
error = EINVAL;
free(mode10_hdr, M_SCSISA);
goto sagetparamsexit;
}
prot = &softc->prot_info.cur_prot_state;
prot->prot_method = dp_page->prot_method;
prot->pi_length = dp_page->pi_length &
SA_CTRL_DP_PI_LENGTH_MASK;
prot->lbp_w = (dp_page->prot_bits & SA_CTRL_DP_LBP_W) ? 1 :0;
prot->lbp_r = (dp_page->prot_bits & SA_CTRL_DP_LBP_R) ? 1 :0;
prot->rbdp = (dp_page->prot_bits & SA_CTRL_DP_RBDP) ? 1 :0;
prot->initialized = 1;
if (prot_page != NULL)
bcopy(dp_page, prot_page, min(sizeof(*prot_page),
sizeof(*dp_page)));
free(mode10_hdr, M_SCSISA);
}
if (CAM_DEBUGGED(periph->path, CAM_DEBUG_INFO)) {
int idx;
char *xyz = mode_buffer;
xpt_print_path(periph->path);
printf("Mode Sense Data=");
for (idx = 0; idx < mode_buffer_len; idx++)
printf(" 0x%02x", xyz[idx] & 0xff);
printf("\n");
}
sagetparamsexit:
xpt_release_ccb(ccb);
free(mode_buffer, M_SCSISA);
return (error);
}
/*
* Set protection information to the pending protection information stored
* in the softc.
*/
static int
sasetprot(struct cam_periph *periph, struct sa_prot_state *new_prot)
{
struct sa_softc *softc;
struct scsi_control_data_prot_subpage *dp_page, *dp_changeable;
struct scsi_mode_header_10 *mode10_hdr, *mode10_changeable;
union ccb *ccb;
uint8_t current_speed;
size_t dp_size, dp_page_length;
int dp_len, buff_mode;
int error;
softc = (struct sa_softc *)periph->softc;
mode10_hdr = NULL;
mode10_changeable = NULL;
ccb = NULL;
/*
* Start off with the size set to the actual length of the page
* that we have defined.
*/
dp_size = sizeof(*dp_changeable);
dp_page_length = dp_size -
__offsetof(struct scsi_control_data_prot_subpage, prot_method);
retry_length:
dp_len = sizeof(*mode10_changeable) + dp_size;
mode10_changeable = malloc(dp_len, M_SCSISA, M_NOWAIT | M_ZERO);
if (mode10_changeable == NULL) {
error = ENOMEM;
goto bailout;
}
dp_changeable =
(struct scsi_control_data_prot_subpage *)&mode10_changeable[1];
/*
* First get the data protection page changeable parameters mask.
* We need to know which parameters the drive supports changing.
* We also need to know what the drive claims that its page length
* is. The reason is that IBM drives in particular are very picky
* about the page length. They want it (the length set in the
* page structure itself) to be 28 bytes, and they want the
* parameter list length specified in the mode select header to be
* 40 bytes. So, to work with IBM drives as well as any other tape
* drive, find out what the drive claims the page length is, and
* make sure that we match that.
*/
error = sagetparams(periph, SA_PARAM_SPEED | SA_PARAM_LBP,
NULL, NULL, NULL, &buff_mode, NULL, &current_speed, NULL, NULL,
NULL, NULL, dp_changeable, dp_size, /*prot_changeable*/ 1);
if (error != 0)
goto bailout;
if (scsi_2btoul(dp_changeable->length) > dp_page_length) {
dp_page_length = scsi_2btoul(dp_changeable->length);
dp_size = dp_page_length +
__offsetof(struct scsi_control_data_prot_subpage,
prot_method);
free(mode10_changeable, M_SCSISA);
mode10_changeable = NULL;
goto retry_length;
}
mode10_hdr = malloc(dp_len, M_SCSISA, M_NOWAIT | M_ZERO);
if (mode10_hdr == NULL) {
error = ENOMEM;
goto bailout;
}
dp_page = (struct scsi_control_data_prot_subpage *)&mode10_hdr[1];
/*
* Now grab the actual current settings in the page.
*/
error = sagetparams(periph, SA_PARAM_SPEED | SA_PARAM_LBP,
NULL, NULL, NULL, &buff_mode, NULL, &current_speed, NULL, NULL,
NULL, NULL, dp_page, dp_size, /*prot_changeable*/ 0);
if (error != 0)
goto bailout;
/* These two fields need to be 0 for MODE SELECT */
scsi_ulto2b(0, mode10_hdr->data_length);
mode10_hdr->medium_type = 0;
/* We are not including a block descriptor */
scsi_ulto2b(0, mode10_hdr->blk_desc_len);
mode10_hdr->dev_spec = current_speed;
/* if set, set single-initiator buffering mode */
if (softc->buffer_mode == SMH_SA_BUF_MODE_SIBUF) {
mode10_hdr->dev_spec |= SMH_SA_BUF_MODE_SIBUF;
}
/*
* For each field, make sure that the drive allows changing it
* before bringing in the user's setting.
*/
if (dp_changeable->prot_method != 0)
dp_page->prot_method = new_prot->prot_method;
if (dp_changeable->pi_length & SA_CTRL_DP_PI_LENGTH_MASK) {
dp_page->pi_length &= ~SA_CTRL_DP_PI_LENGTH_MASK;
dp_page->pi_length |= (new_prot->pi_length &
SA_CTRL_DP_PI_LENGTH_MASK);
}
if (dp_changeable->prot_bits & SA_CTRL_DP_LBP_W) {
if (new_prot->lbp_w)
dp_page->prot_bits |= SA_CTRL_DP_LBP_W;
else
dp_page->prot_bits &= ~SA_CTRL_DP_LBP_W;
}
if (dp_changeable->prot_bits & SA_CTRL_DP_LBP_R) {
if (new_prot->lbp_r)
dp_page->prot_bits |= SA_CTRL_DP_LBP_R;
else
dp_page->prot_bits &= ~SA_CTRL_DP_LBP_R;
}
if (dp_changeable->prot_bits & SA_CTRL_DP_RBDP) {
if (new_prot->rbdp)
dp_page->prot_bits |= SA_CTRL_DP_RBDP;
else
dp_page->prot_bits &= ~SA_CTRL_DP_RBDP;
}
ccb = cam_periph_getccb(periph, 1);
scsi_mode_select_len(&ccb->csio,
/*retries*/ 5,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*scsi_page_fmt*/ TRUE,
/*save_pages*/ FALSE,
/*param_buf*/ (uint8_t *)mode10_hdr,
/*param_len*/ dp_len,
/*minimum_cmd_size*/ 10,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
if (error != 0)
goto bailout;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = EINVAL;
goto bailout;
}
/*
* The operation was successful. We could just copy the settings
* the user requested, but just in case the drive ignored some of
* our settings, let's ask for status again.
*/
error = sagetparams(periph, SA_PARAM_SPEED | SA_PARAM_LBP,
NULL, NULL, NULL, &buff_mode, NULL, &current_speed, NULL, NULL,
NULL, NULL, dp_page, dp_size, 0);
bailout:
if (ccb != NULL)
xpt_release_ccb(ccb);
free(mode10_hdr, M_SCSISA);
free(mode10_changeable, M_SCSISA);
return (error);
}
/*
* The purpose of this function is to set one of four different parameters
* for a tape drive:
* - blocksize
* - density
* - compression / compression algorithm
* - buffering mode
*
* The assumption is that this will be called from saioctl(), and therefore
* from a process context. Thus the waiting malloc calls below. If that
* assumption ever changes, the malloc calls should be changed to be
* NOWAIT mallocs.
*
* Any or all of the four parameters may be set when this function is
* called. It should handle setting more than one parameter at once.
*/
static int
sasetparams(struct cam_periph *periph, sa_params params_to_set,
u_int32_t blocksize, u_int8_t density, u_int32_t calg,
u_int32_t sense_flags)
{
struct sa_softc *softc;
u_int32_t current_blocksize;
u_int32_t current_calg;
u_int8_t current_density;
u_int8_t current_speed;
int comp_enabled, comp_supported;
void *mode_buffer;
int mode_buffer_len;
struct scsi_mode_header_6 *mode_hdr;
struct scsi_mode_blk_desc *mode_blk;
sa_comp_t *ccomp, *cpage;
int buff_mode;
union ccb *ccb = NULL;
int error;
softc = (struct sa_softc *)periph->softc;
ccomp = malloc(sizeof (sa_comp_t), M_SCSISA, M_NOWAIT);
if (ccomp == NULL)
return (ENOMEM);
/*
* Since it doesn't make sense to set the number of blocks, or
* write protection, we won't try to get the current value. We
* always want to get the blocksize, so we can set it back to the
* proper value.
*/
error = sagetparams(periph,
params_to_set | SA_PARAM_BLOCKSIZE | SA_PARAM_SPEED,
&current_blocksize, &current_density, NULL, &buff_mode, NULL,
&current_speed, &comp_supported, &comp_enabled,
&current_calg, ccomp, NULL, 0, 0);
if (error != 0) {
free(ccomp, M_SCSISA);
return (error);
}
mode_buffer_len = sizeof(*mode_hdr) + sizeof(*mode_blk);
if (params_to_set & SA_PARAM_COMPRESSION)
mode_buffer_len += sizeof (sa_comp_t);
mode_buffer = malloc(mode_buffer_len, M_SCSISA, M_NOWAIT | M_ZERO);
if (mode_buffer == NULL) {
free(ccomp, M_SCSISA);
return (ENOMEM);
}
mode_hdr = (struct scsi_mode_header_6 *)mode_buffer;
mode_blk = (struct scsi_mode_blk_desc *)&mode_hdr[1];
ccb = cam_periph_getccb(periph, 1);
retry:
if (params_to_set & SA_PARAM_COMPRESSION) {
if (mode_blk) {
cpage = (sa_comp_t *)&mode_blk[1];
} else {
cpage = (sa_comp_t *)&mode_hdr[1];
}
bcopy(ccomp, cpage, sizeof (sa_comp_t));
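/* Clear the PS bit; it must be zero in MODE SELECT data. */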
cpage->hdr.pagecode &= ~0x80;
} else
cpage = NULL;
/*
* If the caller wants us to set the blocksize, use the one they
* pass in. Otherwise, use the blocksize we got back from the
* mode select above.
*/
if (mode_blk) {
if (params_to_set & SA_PARAM_BLOCKSIZE)
scsi_ulto3b(blocksize, mode_blk->blklen);
else
scsi_ulto3b(current_blocksize, mode_blk->blklen);
/*
* Set density if requested, else preserve old density.
* SCSI_SAME_DENSITY only applies to SCSI-2 or better
* devices; otherwise use the density we've latched up in our softc.
*/
if (params_to_set & SA_PARAM_DENSITY) {
mode_blk->density = density;
} else if (softc->scsi_rev > SCSI_REV_CCS) {
mode_blk->density = SCSI_SAME_DENSITY;
} else {
mode_blk->density = softc->media_density;
}
}
/*
* For mode selects, these two fields must be zero.
*/
mode_hdr->data_length = 0;
mode_hdr->medium_type = 0;
/* set the speed to the current value */
mode_hdr->dev_spec = current_speed;
/* if set, set single-initiator buffering mode */
if (softc->buffer_mode == SMH_SA_BUF_MODE_SIBUF) {
mode_hdr->dev_spec |= SMH_SA_BUF_MODE_SIBUF;
}
if (mode_blk)
mode_hdr->blk_desc_len = sizeof(struct scsi_mode_blk_desc);
else
mode_hdr->blk_desc_len = 0;
/*
* First, if the user wants us to set the compression algorithm or
* just turn compression on, check to make sure that this drive
* supports compression.
*/
if (params_to_set & SA_PARAM_COMPRESSION) {
/*
* If the compression algorithm is 0, disable compression.
* If the compression algorithm is non-zero, enable
* compression and set the compression type to the
* specified compression algorithm, unless the algorithm is
* MT_COMP_ENABLE. In that case, we look at the
* compression algorithm that is currently set and if it is
* non-zero, we leave it as-is. If it is zero, and we have
* saved a compression algorithm from a time when
* compression was enabled before, set the compression to
* the saved value.
*/
switch (ccomp->hdr.pagecode & ~0x80) {
case SA_DEVICE_CONFIGURATION_PAGE:
{
struct scsi_dev_conf_page *dcp = &cpage->dconf;
if (calg == 0) {
dcp->sel_comp_alg = SA_COMP_NONE;
break;
}
if (calg != MT_COMP_ENABLE) {
dcp->sel_comp_alg = calg;
} else if (dcp->sel_comp_alg == SA_COMP_NONE &&
softc->saved_comp_algorithm != 0) {
dcp->sel_comp_alg = softc->saved_comp_algorithm;
}
break;
}
case SA_DATA_COMPRESSION_PAGE:
if (ccomp->dcomp.dce_and_dcc & SA_DCP_DCC) {
struct scsi_data_compression_page *dcp = &cpage->dcomp;
if (calg == 0) {
/*
* Disable compression, but leave the
* decompression and the capability bit
* alone.
*/
dcp->dce_and_dcc = SA_DCP_DCC;
dcp->dde_and_red |= SA_DCP_DDE;
break;
}
/* enable compression && decompression */
dcp->dce_and_dcc = SA_DCP_DCE | SA_DCP_DCC;
dcp->dde_and_red |= SA_DCP_DDE;
/*
* If the caller supplied one, use that compression algorithm.
* Otherwise, if there's a saved compression algorithm
* and there is no current algorithm, use the saved
* algorithm. Else parrot back what we got and hope
* for the best.
*/
if (calg != MT_COMP_ENABLE) {
scsi_ulto4b(calg, dcp->comp_algorithm);
scsi_ulto4b(calg, dcp->decomp_algorithm);
} else if (scsi_4btoul(dcp->comp_algorithm) == 0 &&
softc->saved_comp_algorithm != 0) {
scsi_ulto4b(softc->saved_comp_algorithm,
dcp->comp_algorithm);
scsi_ulto4b(softc->saved_comp_algorithm,
dcp->decomp_algorithm);
}
break;
}
/*
* Compression does not appear to be supported-
* at least via the DATA COMPRESSION page. It
* would be too much to ask us to believe that
* the page itself is supported, but incorrectly
* reports an ability to manipulate data compression,
* so we'll assume that this device doesn't support
* compression. We can just fall through for that.
*/
/* FALLTHROUGH */
default:
/*
* The drive doesn't seem to support compression,
* so turn off the set compression bit.
*/
params_to_set &= ~SA_PARAM_COMPRESSION;
xpt_print(periph->path,
"device does not seem to support compression\n");
/*
* If that was the only thing the user wanted us to set,
* clean up allocated resources and return with
* 'operation not supported'.
*/
if (params_to_set == SA_PARAM_NONE) {
free(mode_buffer, M_SCSISA);
xpt_release_ccb(ccb);
return (ENODEV);
}
/*
* That wasn't the only thing the user wanted us to set.
* So, decrease the stated mode buffer length by the
* size of the compression mode page.
*/
mode_buffer_len -= sizeof(sa_comp_t);
}
}
/* It is safe to retry this operation */
scsi_mode_select(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG,
(params_to_set & SA_PARAM_COMPRESSION)? TRUE : FALSE,
FALSE, mode_buffer, mode_buffer_len, SSD_FULL_SIZE, SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0,
sense_flags, softc->device_stats);
if (CAM_DEBUGGED(periph->path, CAM_DEBUG_INFO)) {
int idx;
char *xyz = mode_buffer;
xpt_print_path(periph->path);
printf("Err%d, Mode Select Data=", error);
for (idx = 0; idx < mode_buffer_len; idx++)
printf(" 0x%02x", xyz[idx] & 0xff);
printf("\n");
}
if (error) {
/*
* If we can, try without setting density/blocksize.
*/
if (mode_blk) {
if ((params_to_set &
(SA_PARAM_DENSITY|SA_PARAM_BLOCKSIZE)) == 0) {
mode_blk = NULL;
goto retry;
}
} else {
mode_blk = (struct scsi_mode_blk_desc *)&mode_hdr[1];
cpage = (sa_comp_t *)&mode_blk[1];
}
/*
* If we were setting the blocksize, and that failed, we
* want to set it to its original value. If we weren't
* setting the blocksize, we don't want to change it.
*/
scsi_ulto3b(current_blocksize, mode_blk->blklen);
/*
* Set density if requested, else preserve old density.
* SCSI_SAME_DENSITY only applies to SCSI-2 or better
* devices; otherwise use the density we've latched up in our softc.
*/
if (params_to_set & SA_PARAM_DENSITY) {
mode_blk->density = current_density;
} else if (softc->scsi_rev > SCSI_REV_CCS) {
mode_blk->density = SCSI_SAME_DENSITY;
} else {
mode_blk->density = softc->media_density;
}
if (params_to_set & SA_PARAM_COMPRESSION)
bcopy(ccomp, cpage, sizeof (sa_comp_t));
/*
* The retry count is the only CCB field that might have been
* changed that we care about, so reset it back to 1.
*/
ccb->ccb_h.retry_count = 1;
cam_periph_runccb(ccb, saerror, 0, sense_flags,
softc->device_stats);
}
xpt_release_ccb(ccb);
if (ccomp != NULL)
free(ccomp, M_SCSISA);
if (params_to_set & SA_PARAM_COMPRESSION) {
if (error) {
softc->flags &= ~SA_FLAG_COMP_ENABLED;
/*
* Even if we get an error setting compression,
* do not say that we don't support it. We could
* have been wrong, or it may be media specific.
* softc->flags &= ~SA_FLAG_COMP_SUPP;
*/
softc->saved_comp_algorithm = softc->comp_algorithm;
softc->comp_algorithm = 0;
} else {
softc->flags |= SA_FLAG_COMP_ENABLED;
softc->comp_algorithm = calg;
}
}
free(mode_buffer, M_SCSISA);
return (error);
}
static int
saextget(struct cdev *dev, struct cam_periph *periph, struct sbuf *sb,
struct mtextget *g)
{
int indent, error;
char tmpstr[80];
struct sa_softc *softc;
int tmpint;
uint32_t maxio_tmp;
struct ccb_getdev cgd;
softc = (struct sa_softc *)periph->softc;
error = 0;
error = sagetparams_common(dev, periph);
if (error)
goto extget_bailout;
if (!SA_IS_CTRL(dev) && !softc->open_pending_mount)
sagetpos(periph);
indent = 0;
SASBADDNODE(sb, indent, mtextget);
/*
* Basic CAM peripheral information.
*/
SASBADDVARSTR(sb, indent, periph->periph_name, %s, periph_name,
strlen(periph->periph_name) + 1);
SASBADDUINT(sb, indent, periph->unit_number, %u, unit_number);
xpt_setup_ccb(&cgd.ccb_h,
periph->path,
CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if ((cgd.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
g->status = MT_EXT_GET_ERROR;
snprintf(g->error_str, sizeof(g->error_str),
"Error %#x returned for XPT_GDEV_TYPE CCB",
cgd.ccb_h.status);
goto extget_bailout;
}
cam_strvis(tmpstr, cgd.inq_data.vendor,
sizeof(cgd.inq_data.vendor), sizeof(tmpstr));
SASBADDVARSTRDESC(sb, indent, tmpstr, %s, vendor,
sizeof(cgd.inq_data.vendor) + 1, "SCSI Vendor ID");
cam_strvis(tmpstr, cgd.inq_data.product,
sizeof(cgd.inq_data.product), sizeof(tmpstr));
SASBADDVARSTRDESC(sb, indent, tmpstr, %s, product,
sizeof(cgd.inq_data.product) + 1, "SCSI Product ID");
cam_strvis(tmpstr, cgd.inq_data.revision,
sizeof(cgd.inq_data.revision), sizeof(tmpstr));
SASBADDVARSTRDESC(sb, indent, tmpstr, %s, revision,
sizeof(cgd.inq_data.revision) + 1, "SCSI Revision");
if (cgd.serial_num_len > 0) {
char *tmpstr2;
size_t ts2_len;
int ts2_malloc;
ts2_len = 0;
if (cgd.serial_num_len > sizeof(tmpstr)) {
ts2_len = cgd.serial_num_len + 1;
ts2_malloc = 1;
tmpstr2 = malloc(ts2_len, M_SCSISA, M_NOWAIT | M_ZERO);
/*
* The 80 characters allocated on the stack above
* will handle the vast majority of serial numbers.
* If we run into one that is larger than that, and
* we can't malloc the length without blocking,
* bail out with an out of memory error.
*/
if (tmpstr2 == NULL) {
error = ENOMEM;
goto extget_bailout;
}
} else {
ts2_len = sizeof(tmpstr);
ts2_malloc = 0;
tmpstr2 = tmpstr;
}
cam_strvis(tmpstr2, cgd.serial_num, cgd.serial_num_len,
ts2_len);
SASBADDVARSTRDESC(sb, indent, tmpstr2, %s, serial_num,
(ssize_t)cgd.serial_num_len + 1, "Serial Number");
if (ts2_malloc != 0)
free(tmpstr2, M_SCSISA);
} else {
/*
* We return a serial_num element in any case, but it will
* be empty if the device has no serial number.
*/
tmpstr[0] = '\0';
SASBADDVARSTRDESC(sb, indent, tmpstr, %s, serial_num,
(ssize_t)0, "Serial Number");
}
SASBADDUINTDESC(sb, indent, softc->maxio, %u, maxio,
"Maximum I/O size allowed by driver and controller");
SASBADDUINTDESC(sb, indent, softc->cpi_maxio, %u, cpi_maxio,
"Maximum I/O size reported by controller");
SASBADDUINTDESC(sb, indent, softc->max_blk, %u, max_blk,
"Maximum block size supported by tape drive and media");
SASBADDUINTDESC(sb, indent, softc->min_blk, %u, min_blk,
"Minimum block size supported by tape drive and media");
SASBADDUINTDESC(sb, indent, softc->blk_gran, %u, blk_gran,
"Block granularity supported by tape drive and media");
maxio_tmp = min(softc->max_blk, softc->maxio);
SASBADDUINTDESC(sb, indent, maxio_tmp, %u, max_effective_iosize,
"Maximum possible I/O size");
SASBADDINTDESC(sb, indent, softc->flags & SA_FLAG_FIXED ? 1 : 0, %d,
fixed_mode, "Set to 1 for fixed block mode, 0 for variable block");
/*
* XXX KDM include SIM, bus, target, LUN?
*/
if (softc->flags & SA_FLAG_COMP_UNSUPP)
tmpint = 0;
else
tmpint = 1;
SASBADDINTDESC(sb, indent, tmpint, %d, compression_supported,
"Set to 1 if compression is supported, 0 if not");
if (softc->flags & SA_FLAG_COMP_ENABLED)
tmpint = 1;
else
tmpint = 0;
SASBADDINTDESC(sb, indent, tmpint, %d, compression_enabled,
"Set to 1 if compression is enabled, 0 if not");
SASBADDUINTDESC(sb, indent, softc->comp_algorithm, %u,
compression_algorithm, "Numeric compression algorithm");
safillprot(softc, &indent, sb);
SASBADDUINTDESC(sb, indent, softc->media_blksize, %u,
media_blocksize, "Block size reported by drive or set by user");
SASBADDINTDESC(sb, indent, (intmax_t)softc->fileno, %jd,
calculated_fileno, "Calculated file number, -1 if unknown");
SASBADDINTDESC(sb, indent, (intmax_t)softc->blkno, %jd,
calculated_rel_blkno, "Calculated block number relative to file, "
"set to -1 if unknown");
SASBADDINTDESC(sb, indent, (intmax_t)softc->rep_fileno, %jd,
reported_fileno, "File number reported by drive, -1 if unknown");
SASBADDINTDESC(sb, indent, (intmax_t)softc->rep_blkno, %jd,
reported_blkno, "Block number relative to BOP/BOT reported by "
"drive, -1 if unknown");
SASBADDINTDESC(sb, indent, (intmax_t)softc->partition, %jd,
partition, "Current partition number, 0 is the default");
SASBADDINTDESC(sb, indent, softc->bop, %d, bop,
"Set to 1 if drive is at the beginning of partition/tape, 0 if "
"not, -1 if unknown");
SASBADDINTDESC(sb, indent, softc->eop, %d, eop,
"Set to 1 if drive is past early warning, 0 if not, -1 if unknown");
SASBADDINTDESC(sb, indent, softc->bpew, %d, bpew,
"Set to 1 if drive is past programmable early warning, 0 if not, "
"-1 if unknown");
SASBADDINTDESC(sb, indent, (intmax_t)softc->last_io_resid, %jd,
residual, "Residual for the last I/O");
/*
* XXX KDM should we send a string with the current driver
* status already decoded instead of a numeric value?
*/
SASBADDINTDESC(sb, indent, softc->dsreg, %d, dsreg,
"Current state of the driver");
safilldensitysb(softc, &indent, sb);
SASBENDNODE(sb, indent, mtextget);
extget_bailout:
return (error);
}
static int
saparamget(struct sa_softc *softc, struct sbuf *sb)
{
int indent;
indent = 0;
SASBADDNODE(sb, indent, mtparamget);
SASBADDINTDESC(sb, indent, softc->sili, %d, sili,
"Suppress an error on underlength variable reads");
SASBADDINTDESC(sb, indent, softc->eot_warn, %d, eot_warn,
"Return an error to warn that end of tape is approaching");
safillprot(softc, &indent, sb);
SASBENDNODE(sb, indent, mtparamget);
return (0);
}
static void
saprevent(struct cam_periph *periph, int action)
{
struct sa_softc *softc;
union ccb *ccb;
int error, sf;
softc = (struct sa_softc *)periph->softc;
if ((action == PR_ALLOW) && (softc->flags & SA_FLAG_TAPE_LOCKED) == 0)
return;
if ((action == PR_PREVENT) && (softc->flags & SA_FLAG_TAPE_LOCKED) != 0)
return;
/*
* We can be quiet about illegal requests.
*/
if (CAM_DEBUGGED(periph->path, CAM_DEBUG_INFO)) {
sf = 0;
} else
sf = SF_QUIET_IR;
ccb = cam_periph_getccb(periph, 1);
/* It is safe to retry this operation */
scsi_prevent(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG, action,
SSD_FULL_SIZE, SCSIOP_TIMEOUT);
error = cam_periph_runccb(ccb, saerror, 0, sf, softc->device_stats);
if (error == 0) {
if (action == PR_ALLOW)
softc->flags &= ~SA_FLAG_TAPE_LOCKED;
else
softc->flags |= SA_FLAG_TAPE_LOCKED;
}
xpt_release_ccb(ccb);
}
static int
sarewind(struct cam_periph *periph)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
/* It is safe to retry this operation */
scsi_rewind(&ccb->csio, 2, NULL, MSG_SIMPLE_Q_TAG, FALSE,
SSD_FULL_SIZE, REWIND_TIMEOUT);
softc->dsreg = MTIO_DSREG_REW;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
if (error == 0) {
softc->partition = softc->fileno = softc->blkno = (daddr_t) 0;
softc->rep_fileno = softc->rep_blkno = (daddr_t) 0;
} else {
softc->fileno = softc->blkno = (daddr_t) -1;
softc->partition = (daddr_t) -1;
softc->rep_fileno = softc->rep_blkno = (daddr_t) -1;
}
return (error);
}
static int
saspace(struct cam_periph *periph, int count, scsi_space_code code)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
/* This cannot be retried */
scsi_space(&ccb->csio, 0, NULL, MSG_SIMPLE_Q_TAG, code, count,
SSD_FULL_SIZE, SPACE_TIMEOUT);
/*
* Clear residual because we will be using it.
*/
softc->last_ctl_resid = 0;
softc->dsreg = (count < 0)? MTIO_DSREG_REV : MTIO_DSREG_FWD;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
/*
* If a spacing operation has failed, we need to invalidate
* this mount.
*
* If the spacing operation was setmarks or to end of recorded data,
* we no longer know our relative position.
*
* If the spacing operation was spacing files in reverse, we
* take account of the residual, but still check against less
* than zero- if we've gone negative, we must have hit BOT.
*
* If the spacing operation was spacing records in reverse and
* we have a residual, we've either hit BOT or hit a filemark.
* In the former case, we know our new record number (0). In
* the latter case, we have absolutely no idea what the real
* record number is- we've stopped between the end of the last
* record in the previous file and the filemark that stopped
* our spacing backwards.
*/
if (error) {
softc->fileno = softc->blkno = (daddr_t) -1;
softc->rep_blkno = softc->partition = (daddr_t) -1;
softc->rep_fileno = (daddr_t) -1;
} else if (code == SS_SETMARKS || code == SS_EOD) {
softc->fileno = softc->blkno = (daddr_t) -1;
} else if (code == SS_FILEMARKS && softc->fileno != (daddr_t) -1) {
softc->fileno += (count - softc->last_ctl_resid);
if (softc->fileno < 0) /* we must have hit BOT */
softc->fileno = 0;
softc->blkno = 0;
} else if (code == SS_BLOCKS && softc->blkno != (daddr_t) -1) {
softc->blkno += (count - softc->last_ctl_resid);
if (count < 0) {
if (softc->last_ctl_resid || softc->blkno < 0) {
if (softc->fileno == 0) {
softc->blkno = 0;
} else {
softc->blkno = (daddr_t) -1;
}
}
}
}
if (error == 0)
sagetpos(periph);
return (error);
}
static int
sawritefilemarks(struct cam_periph *periph, int nmarks, int setmarks, int immed)
{
union ccb *ccb;
struct sa_softc *softc;
int error, nwm = 0;
softc = (struct sa_softc *)periph->softc;
if (softc->open_rdonly)
return (EBADF);
ccb = cam_periph_getccb(periph, 1);
/*
* Clear residual because we will be using it.
*/
softc->last_ctl_resid = 0;
softc->dsreg = MTIO_DSREG_FMK;
/* this *must* not be retried */
scsi_write_filemarks(&ccb->csio, 0, NULL, MSG_SIMPLE_Q_TAG,
immed, setmarks, nmarks, SSD_FULL_SIZE, IO_TIMEOUT);
softc->dsreg = MTIO_DSREG_REST;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
if (error == 0 && nmarks) {
struct sa_softc *softc = (struct sa_softc *)periph->softc;
nwm = nmarks - softc->last_ctl_resid;
softc->filemarks += nwm;
}
xpt_release_ccb(ccb);
/*
* Update relative positions (if we're doing that).
*/
if (error) {
softc->fileno = softc->blkno = softc->partition = (daddr_t) -1;
} else if (softc->fileno != (daddr_t) -1) {
softc->fileno += nwm;
softc->blkno = 0;
}
/*
* Ask the tape drive for position information.
*/
sagetpos(periph);
/*
* If we got valid position information, since we just wrote a file
* mark, we know we're at the file mark and block 0 after that
* filemark.
*/
if (softc->rep_fileno != (daddr_t) -1) {
softc->fileno = softc->rep_fileno;
softc->blkno = 0;
}
return (error);
}
static int
sagetpos(struct cam_periph *periph)
{
union ccb *ccb;
struct scsi_tape_position_long_data long_pos;
struct sa_softc *softc = (struct sa_softc *)periph->softc;
int error;
if (softc->quirks & SA_QUIRK_NO_LONG_POS) {
softc->rep_fileno = (daddr_t) -1;
softc->rep_blkno = (daddr_t) -1;
softc->bop = softc->eop = softc->bpew = -1;
return (EOPNOTSUPP);
}
bzero(&long_pos, sizeof(long_pos));
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_read_position_10(&ccb->csio,
/*retries*/ 1,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ SA_RPOS_LONG_FORM,
/*data_ptr*/ (uint8_t *)&long_pos,
/*length*/ sizeof(long_pos),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ SCSIOP_TIMEOUT);
softc->dsreg = MTIO_DSREG_RBSY;
error = cam_periph_runccb(ccb, saerror, 0, SF_QUIET_IR,
softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
if (error == 0) {
if (long_pos.flags & SA_RPOS_LONG_MPU) {
/*
* If the drive doesn't know what file mark it is
* on, our calculated filemark isn't going to be
* accurate either.
*/
softc->fileno = (daddr_t) -1;
softc->rep_fileno = (daddr_t) -1;
} else {
softc->fileno = softc->rep_fileno =
scsi_8btou64(long_pos.logical_file_num);
}
if (long_pos.flags & SA_RPOS_LONG_LONU) {
softc->partition = (daddr_t) -1;
softc->rep_blkno = (daddr_t) -1;
/*
* If the tape drive doesn't know its block
* position, we can't claim to know it either.
*/
softc->blkno = (daddr_t) -1;
} else {
softc->partition = scsi_4btoul(long_pos.partition);
softc->rep_blkno =
scsi_8btou64(long_pos.logical_object_num);
}
if (long_pos.flags & SA_RPOS_LONG_BOP)
softc->bop = 1;
else
softc->bop = 0;
if (long_pos.flags & SA_RPOS_LONG_EOP)
softc->eop = 1;
else
softc->eop = 0;
if ((long_pos.flags & SA_RPOS_LONG_BPEW)
|| (softc->set_pews_status != 0)) {
softc->bpew = 1;
if (softc->set_pews_status > 0)
softc->set_pews_status--;
} else
softc->bpew = 0;
} else if (error == EINVAL) {
/*
* If this drive returned an invalid-request type error,
* then it likely doesn't support the long form report.
*/
softc->quirks |= SA_QUIRK_NO_LONG_POS;
}
if (error != 0) {
softc->rep_fileno = softc->rep_blkno = (daddr_t) -1;
softc->partition = (daddr_t) -1;
softc->bop = softc->eop = softc->bpew = -1;
}
xpt_release_ccb(ccb);
return (error);
}
static int
sardpos(struct cam_periph *periph, int hard, u_int32_t *blkptr)
{
struct scsi_tape_position_data loc;
union ccb *ccb;
struct sa_softc *softc = (struct sa_softc *)periph->softc;
int error;
/*
* We try and flush any buffered writes here if we were writing
* and we're trying to get hardware block position. It eats
* up performance substantially, but I'm wary of drive firmware.
*
* I think that *logical* block position is probably okay-
* but hardware block position might have to wait for data
* to hit media to be valid. Caveat Emptor.
*/
if (hard && (softc->flags & SA_FLAG_TAPE_WRITTEN)) {
error = sawritefilemarks(periph, 0, 0, 0);
if (error && error != EACCES)
return (error);
}
ccb = cam_periph_getccb(periph, 1);
scsi_read_position(&ccb->csio, 1, NULL, MSG_SIMPLE_Q_TAG,
hard, &loc, SSD_FULL_SIZE, SCSIOP_TIMEOUT);
softc->dsreg = MTIO_DSREG_RBSY;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
if (error == 0) {
if (loc.flags & SA_RPOS_UNCERTAIN) {
error = EINVAL; /* nothing is certain */
} else {
*blkptr = scsi_4btoul(loc.firstblk);
}
}
xpt_release_ccb(ccb);
return (error);
}
static int
sasetpos(struct cam_periph *periph, int hard, struct mtlocate *locate_info)
{
union ccb *ccb;
struct sa_softc *softc;
int locate16;
int immed, cp;
int error;
/*
* We used to try and flush any buffered writes here.
* Now we push this onto user applications to either
* flush the pending writes themselves (via a zero count
* WRITE FILEMARKS command) or they can trust their tape
* drive to do this correctly for them.
*/
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
cp = locate_info->flags & MT_LOCATE_FLAG_CHANGE_PART ? 1 : 0;
immed = locate_info->flags & MT_LOCATE_FLAG_IMMED ? 1 : 0;
/*
* Determine whether we have to use LOCATE or LOCATE16. The hard
* bit is only possible with LOCATE, but the new ioctls do not
* allow setting that bit. So we can't get into the situation of
* having the hard bit set with a block address that is larger than
* 32-bits.
*/
if (hard != 0)
locate16 = 0;
else if ((locate_info->dest_type != MT_LOCATE_DEST_OBJECT)
|| (locate_info->block_address_mode != MT_LOCATE_BAM_IMPLICIT)
|| (locate_info->logical_id > SA_SPOS_MAX_BLK))
locate16 = 1;
else
locate16 = 0;
if (locate16 != 0) {
scsi_locate_16(&ccb->csio,
/*retries*/ 1,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*immed*/ immed,
/*cp*/ cp,
/*dest_type*/ locate_info->dest_type,
/*bam*/ locate_info->block_address_mode,
/*partition*/ locate_info->partition,
/*logical_id*/ locate_info->logical_id,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ SPACE_TIMEOUT);
} else {
scsi_locate_10(&ccb->csio,
/*retries*/ 1,
/*cbfcnp*/ NULL,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*immed*/ immed,
/*cp*/ cp,
/*hard*/ hard,
/*partition*/ locate_info->partition,
/*block_address*/ locate_info->logical_id,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ SPACE_TIMEOUT);
}
softc->dsreg = MTIO_DSREG_POS;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
/*
* We assume the calculated file and block numbers are unknown
* unless we have enough information to populate them.
*/
softc->fileno = softc->blkno = (daddr_t) -1;
/*
* If the user requested changing the partition and the request
* succeeded, note the partition.
*/
if ((error == 0)
&& (cp != 0))
softc->partition = locate_info->partition;
else
softc->partition = (daddr_t) -1;
if (error == 0) {
switch (locate_info->dest_type) {
case MT_LOCATE_DEST_FILE:
/*
* This is the only case where we can reliably
* calculate the file and block numbers.
*/
softc->fileno = locate_info->logical_id;
softc->blkno = 0;
break;
case MT_LOCATE_DEST_OBJECT:
case MT_LOCATE_DEST_SET:
case MT_LOCATE_DEST_EOD:
default:
break;
}
}
/*
* Ask the drive for current position information.
*/
sagetpos(periph);
return (error);
}
static int
saretension(struct cam_periph *periph)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
/* It is safe to retry this operation */
scsi_load_unload(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG, FALSE,
FALSE, TRUE, TRUE, SSD_FULL_SIZE, ERASE_TIMEOUT);
softc->dsreg = MTIO_DSREG_TEN;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
if (error == 0) {
softc->partition = softc->fileno = softc->blkno = (daddr_t) 0;
sagetpos(periph);
} else
softc->partition = softc->fileno = softc->blkno = (daddr_t) -1;
return (error);
}
static int
sareservereleaseunit(struct cam_periph *periph, int reserve)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
/* It is safe to retry this operation */
scsi_reserve_release_unit(&ccb->csio, 2, NULL, MSG_SIMPLE_Q_TAG,
FALSE, 0, SSD_FULL_SIZE, SCSIOP_TIMEOUT, reserve);
softc->dsreg = MTIO_DSREG_RBSY;
error = cam_periph_runccb(ccb, saerror, 0,
SF_RETRY_UA | SF_NO_PRINT, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
/*
* If the error was Illegal Request, then the device doesn't support
* RESERVE/RELEASE. This is not an error.
*/
if (error == EINVAL) {
error = 0;
}
return (error);
}
static int
saloadunload(struct cam_periph *periph, int load)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
ccb = cam_periph_getccb(periph, 1);
/* It is safe to retry this operation */
scsi_load_unload(&ccb->csio, 5, NULL, MSG_SIMPLE_Q_TAG, FALSE,
FALSE, FALSE, load, SSD_FULL_SIZE, REWIND_TIMEOUT);
softc->dsreg = (load)? MTIO_DSREG_LD : MTIO_DSREG_UNL;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
if (error || load == 0) {
softc->partition = softc->fileno = softc->blkno = (daddr_t) -1;
softc->rep_fileno = softc->rep_blkno = (daddr_t) -1;
} else if (error == 0) {
softc->partition = softc->fileno = softc->blkno = (daddr_t) 0;
sagetpos(periph);
}
return (error);
}
static int
saerase(struct cam_periph *periph, int longerase)
{
union ccb *ccb;
struct sa_softc *softc;
int error;
softc = (struct sa_softc *)periph->softc;
if (softc->open_rdonly)
return (EBADF);
ccb = cam_periph_getccb(periph, 1);
scsi_erase(&ccb->csio, 1, NULL, MSG_SIMPLE_Q_TAG, FALSE, longerase,
SSD_FULL_SIZE, ERASE_TIMEOUT);
softc->dsreg = MTIO_DSREG_ZER;
error = cam_periph_runccb(ccb, saerror, 0, 0, softc->device_stats);
softc->dsreg = MTIO_DSREG_REST;
xpt_release_ccb(ccb);
return (error);
}
/*
* Fill an sbuf with density data in XML format. This particular macro
* works for multi-byte integer fields.
*
* Note that 1 byte fields aren't supported here. The reason is that the
* compiler does not use the sizeof() to prune the switch; it type-checks
* every case for every field width. So if the 1 byte case contained a
* direct assignment, passing in a multi-byte (array) field would trigger
* a warning that the assignment makes an integer from a pointer without
* a cast.
*/
#define SAFILLDENSSB(dens_data, sb, indent, field, desc_remain, \
len_to_go, cur_offset, desc){ \
size_t cur_field_len; \
\
cur_field_len = sizeof(dens_data->field); \
if (desc_remain < cur_field_len) { \
len_to_go -= desc_remain; \
cur_offset += desc_remain; \
continue; \
} \
len_to_go -= cur_field_len; \
cur_offset += cur_field_len; \
desc_remain -= cur_field_len; \
\
switch (sizeof(dens_data->field)) { \
case 1: \
KASSERT(1 == 0, ("Programmer error, invalid 1 byte " \
"field width for SAFILLDENSFIELD")); \
break; \
case 2: \
SASBADDUINTDESC(sb, indent, \
scsi_2btoul(dens_data->field), %u, field, desc); \
break; \
case 3: \
SASBADDUINTDESC(sb, indent, \
scsi_3btoul(dens_data->field), %u, field, desc); \
break; \
case 4: \
SASBADDUINTDESC(sb, indent, \
scsi_4btoul(dens_data->field), %u, field, desc); \
break; \
case 8: \
SASBADDUINTDESC(sb, indent, \
(uintmax_t)scsi_8btou64(dens_data->field), %ju, \
field, desc); \
break; \
default: \
break; \
} \
};
/*
* Fill an sbuf with density data in XML format. This particular macro
* works for strings.
*/
#define SAFILLDENSSBSTR(dens_data, sb, indent, field, desc_remain, \
len_to_go, cur_offset, desc){ \
size_t cur_field_len; \
char tmpstr[32]; \
\
cur_field_len = sizeof(dens_data->field); \
if (desc_remain < cur_field_len) { \
len_to_go -= desc_remain; \
cur_offset += desc_remain; \
continue; \
} \
len_to_go -= cur_field_len; \
cur_offset += cur_field_len; \
desc_remain -= cur_field_len; \
\
cam_strvis(tmpstr, dens_data->field, \
sizeof(dens_data->field), sizeof(tmpstr)); \
SASBADDVARSTRDESC(sb, indent, tmpstr, %s, field, \
strlen(tmpstr) + 1, desc); \
};
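/*
 * Editor's sketch (hypothetical, stand-alone userland code, not part of
 * this driver; the demo_* names are invented): the SAFILLDENSSB macro
 * above dispatches on sizeof() of the byte-array field, which is a
 * compile-time constant, to pick the decoder of matching width.  Every
 * switch case is still type-checked for every field, which is why a
 * 1 byte case with a direct assignment would warn about making an
 * integer from a pointer.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_density {
	uint8_t media_width[2];		/* 16-bit big-endian field */
	uint8_t capacity[4];		/* 32-bit big-endian field */
};

static uint32_t
demo_2btoul(const uint8_t *b)
{
	return ((uint32_t)b[0] << 8 | b[1]);
}

static uint32_t
demo_4btoul(const uint8_t *b)
{
	return ((uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
	    (uint32_t)b[2] << 8 | b[3]);
}

/* Decode a field whose width is known from its declaration. */
#define DEMO_DECODE(p, field)						\
	switch (sizeof((p)->field)) {					\
	case 2:								\
		printf(#field " = %u\n", demo_2btoul((p)->field));	\
		break;							\
	case 4:								\
		printf(#field " = %u\n", demo_4btoul((p)->field));	\
		break;							\
	default:							\
		break;							\
	}

int
main(void)
{
	struct demo_density d = {
		.media_width = { 0x00, 0x7f },			/* 127 */
		.capacity = { 0x00, 0x01, 0x86, 0xa0 },		/* 100000 */
	};

	DEMO_DECODE(&d, media_width);
	DEMO_DECODE(&d, capacity);
	return (0);
}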
/*
* Fill an sbuf with density data descriptors.
*/
static void
safilldenstypesb(struct sbuf *sb, int *indent, uint8_t *buf, int buf_len,
int is_density)
{
struct scsi_density_hdr *hdr;
uint32_t hdr_len;
int len_to_go, cur_offset;
int length_offset;
int num_reports, need_close;
/*
* We need at least the header length. Note that this isn't an
* error; not all tape drives will have every data type.
*/
if (buf_len < sizeof(*hdr))
goto bailout;
hdr = (struct scsi_density_hdr *)buf;
hdr_len = scsi_2btoul(hdr->length);
len_to_go = min(buf_len - sizeof(*hdr), hdr_len);
if (is_density) {
length_offset = __offsetof(struct scsi_density_data,
bits_per_mm);
} else {
length_offset = __offsetof(struct scsi_medium_type_data,
num_density_codes);
}
cur_offset = sizeof(*hdr);
num_reports = 0;
need_close = 0;
while (len_to_go > length_offset) {
struct scsi_density_data *dens_data;
struct scsi_medium_type_data *type_data;
int desc_remain;
size_t cur_field_len;
dens_data = NULL;
type_data = NULL;
if (is_density) {
dens_data =(struct scsi_density_data *)&buf[cur_offset];
if (dens_data->byte2 & SDD_DLV)
desc_remain = scsi_2btoul(dens_data->length);
else
desc_remain = SDD_DEFAULT_LENGTH -
length_offset;
} else {
type_data = (struct scsi_medium_type_data *)
&buf[cur_offset];
desc_remain = scsi_2btoul(type_data->length);
}
len_to_go -= length_offset;
desc_remain = min(desc_remain, len_to_go);
cur_offset += length_offset;
if (need_close != 0) {
SASBENDNODE(sb, *indent, density_entry);
}
SASBADDNODENUM(sb, *indent, density_entry, num_reports);
num_reports++;
need_close = 1;
if (is_density) {
SASBADDUINTDESC(sb, *indent,
dens_data->primary_density_code, %u,
primary_density_code, "Primary Density Code");
SASBADDUINTDESC(sb, *indent,
dens_data->secondary_density_code, %u,
secondary_density_code, "Secondary Density Code");
SASBADDUINTDESC(sb, *indent,
dens_data->byte2 & ~SDD_DLV, %#x, density_flags,
"Density Flags");
SAFILLDENSSB(dens_data, sb, *indent, bits_per_mm,
desc_remain, len_to_go, cur_offset, "Bits per mm");
SAFILLDENSSB(dens_data, sb, *indent, media_width,
desc_remain, len_to_go, cur_offset, "Media width");
SAFILLDENSSB(dens_data, sb, *indent, tracks,
desc_remain, len_to_go, cur_offset,
"Number of Tracks");
SAFILLDENSSB(dens_data, sb, *indent, capacity,
desc_remain, len_to_go, cur_offset, "Capacity");
SAFILLDENSSBSTR(dens_data, sb, *indent, assigning_org,
desc_remain, len_to_go, cur_offset,
"Assigning Organization");
SAFILLDENSSBSTR(dens_data, sb, *indent, density_name,
desc_remain, len_to_go, cur_offset, "Density Name");
SAFILLDENSSBSTR(dens_data, sb, *indent, description,
desc_remain, len_to_go, cur_offset, "Description");
} else {
int i;
SASBADDUINTDESC(sb, *indent, type_data->medium_type,
%u, medium_type, "Medium Type");
cur_field_len =
__offsetof(struct scsi_medium_type_data,
media_width) -
__offsetof(struct scsi_medium_type_data,
num_density_codes);
if (desc_remain < cur_field_len) {
len_to_go -= desc_remain;
cur_offset += desc_remain;
continue;
}
len_to_go -= cur_field_len;
cur_offset += cur_field_len;
desc_remain -= cur_field_len;
SASBADDINTDESC(sb, *indent,
type_data->num_density_codes, %d,
num_density_codes, "Number of Density Codes");
SASBADDNODE(sb, *indent, density_code_list);
for (i = 0; i < type_data->num_density_codes;
i++) {
SASBADDUINTDESC(sb, *indent,
type_data->primary_density_codes[i], %u,
density_code, "Density Code");
}
SASBENDNODE(sb, *indent, density_code_list);
SAFILLDENSSB(type_data, sb, *indent, media_width,
desc_remain, len_to_go, cur_offset,
"Media width");
SAFILLDENSSB(type_data, sb, *indent, medium_length,
desc_remain, len_to_go, cur_offset,
"Medium length");
/*
* Account for the two reserved bytes.
*/
cur_field_len = sizeof(type_data->reserved2);
if (desc_remain < cur_field_len) {
len_to_go -= desc_remain;
cur_offset += desc_remain;
continue;
}
len_to_go -= cur_field_len;
cur_offset += cur_field_len;
desc_remain -= cur_field_len;
SAFILLDENSSBSTR(type_data, sb, *indent, assigning_org,
desc_remain, len_to_go, cur_offset,
"Assigning Organization");
SAFILLDENSSBSTR(type_data, sb, *indent,
medium_type_name, desc_remain, len_to_go,
cur_offset, "Medium type name");
SAFILLDENSSBSTR(type_data, sb, *indent, description,
desc_remain, len_to_go, cur_offset, "Description");
}
}
if (need_close != 0) {
SASBENDNODE(sb, *indent, density_entry);
}
bailout:
return;
}
/*
* Fill an sbuf with density data information
*/
static void
safilldensitysb(struct sa_softc *softc, int *indent, struct sbuf *sb)
{
int i, is_density;
SASBADDNODE(sb, *indent, mtdensity);
SASBADDUINTDESC(sb, *indent, softc->media_density, %u, media_density,
"Current Medium Density");
is_density = 0;
for (i = 0; i < SA_DENSITY_TYPES; i++) {
int tmpint;
if (softc->density_info_valid[i] == 0)
continue;
SASBADDNODE(sb, *indent, density_report);
if (softc->density_type_bits[i] & SRDS_MEDIUM_TYPE) {
tmpint = 1;
is_density = 0;
} else {
tmpint = 0;
is_density = 1;
}
SASBADDINTDESC(sb, *indent, tmpint, %d, medium_type_report,
"Medium type report");
if (softc->density_type_bits[i] & SRDS_MEDIA)
tmpint = 1;
else
tmpint = 0;
SASBADDINTDESC(sb, *indent, tmpint, %d, media_report,
"Media report");
safilldenstypesb(sb, indent, softc->density_info[i],
softc->density_info_valid[i], is_density);
SASBENDNODE(sb, *indent, density_report);
}
SASBENDNODE(sb, *indent, mtdensity);
}
#endif /* _KERNEL */
/*
* Read tape block limits command.
*/
void
scsi_read_block_limits(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action,
struct scsi_read_block_limits_data *rlimit_buf,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_read_block_limits *scsi_cmd;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action,
(u_int8_t *)rlimit_buf, sizeof(*rlimit_buf), sense_len,
sizeof(*scsi_cmd), timeout);
scsi_cmd = (struct scsi_read_block_limits *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = READ_BLOCK_LIMITS;
}
void
scsi_sa_read_write(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int readop, int sli,
int fixed, u_int32_t length, u_int8_t *data_ptr,
u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_sa_rw *scsi_cmd;
int read;
read = (readop & SCSI_RW_DIRMASK) == SCSI_RW_READ;
scsi_cmd = (struct scsi_sa_rw *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = read ? SA_READ : SA_WRITE;
scsi_cmd->sli_fixed = 0;
if (sli && read)
scsi_cmd->sli_fixed |= SAR_SLI;
if (fixed)
scsi_cmd->sli_fixed |= SARW_FIXED;
scsi_ulto3b(length, scsi_cmd->length);
scsi_cmd->control = 0;
cam_fill_csio(csio, retries, cbfcnp, (read ? CAM_DIR_IN : CAM_DIR_OUT) |
((readop & SCSI_RW_BIO) != 0 ? CAM_DATA_BIO : 0),
tag_action, data_ptr, dxfer_len, sense_len,
sizeof(*scsi_cmd), timeout);
}
void
scsi_load_unload(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immediate, int eot,
int reten, int load, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_load_unload *scsi_cmd;
scsi_cmd = (struct scsi_load_unload *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = LOAD_UNLOAD;
if (immediate)
scsi_cmd->immediate = SLU_IMMED;
if (eot)
scsi_cmd->eot_reten_load |= SLU_EOT;
if (reten)
scsi_cmd->eot_reten_load |= SLU_RETEN;
if (load)
scsi_cmd->eot_reten_load |= SLU_LOAD;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action,
NULL, 0, sense_len, sizeof(*scsi_cmd), timeout);
}
void
scsi_rewind(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immediate, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_rewind *scsi_cmd;
scsi_cmd = (struct scsi_rewind *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = REWIND;
if (immediate)
scsi_cmd->immediate = SREW_IMMED;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, NULL,
0, sense_len, sizeof(*scsi_cmd), timeout);
}
void
scsi_space(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, scsi_space_code code,
u_int32_t count, u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_space *scsi_cmd;
scsi_cmd = (struct scsi_space *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = SPACE;
scsi_cmd->code = code;
scsi_ulto3b(count, scsi_cmd->count);
scsi_cmd->control = 0;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, NULL,
0, sense_len, sizeof(*scsi_cmd), timeout);
}
void
scsi_write_filemarks(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immediate, int setmark,
u_int32_t num_marks, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_write_filemarks *scsi_cmd;
scsi_cmd = (struct scsi_write_filemarks *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = WRITE_FILEMARKS;
if (immediate)
scsi_cmd->byte2 |= SWFMRK_IMMED;
if (setmark)
scsi_cmd->byte2 |= SWFMRK_WSMK;
scsi_ulto3b(num_marks, scsi_cmd->num_marks);
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, NULL,
0, sense_len, sizeof(*scsi_cmd), timeout);
}
/*
* The reserve and release unit commands differ only by their opcodes.
*/
void
scsi_reserve_release_unit(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int third_party,
int third_party_id, u_int8_t sense_len,
u_int32_t timeout, int reserve)
{
struct scsi_reserve_release_unit *scsi_cmd;
scsi_cmd = (struct scsi_reserve_release_unit *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
if (reserve)
scsi_cmd->opcode = RESERVE_UNIT;
else
scsi_cmd->opcode = RELEASE_UNIT;
if (third_party) {
scsi_cmd->lun_thirdparty |= SRRU_3RD_PARTY;
scsi_cmd->lun_thirdparty |=
((third_party_id << SRRU_3RD_SHAMT) & SRRU_3RD_MASK);
}
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, NULL,
0, sense_len, sizeof(*scsi_cmd), timeout);
}
void
scsi_erase(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immediate, int long_erase,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_erase *scsi_cmd;
scsi_cmd = (struct scsi_erase *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = ERASE;
if (immediate)
scsi_cmd->lun_imm_long |= SE_IMMED;
if (long_erase)
scsi_cmd->lun_imm_long |= SE_LONG;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, NULL,
0, sense_len, sizeof(*scsi_cmd), timeout);
}
/*
* Read Tape Position command.
*/
void
scsi_read_position(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int hardsoft,
struct scsi_tape_position_data *sbp,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_tape_read_position *scmd;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action,
(u_int8_t *)sbp, sizeof (*sbp), sense_len, sizeof(*scmd), timeout);
scmd = (struct scsi_tape_read_position *)&csio->cdb_io.cdb_bytes;
bzero(scmd, sizeof(*scmd));
scmd->opcode = READ_POSITION;
scmd->byte1 = hardsoft;
}
/*
* Read Tape Position command.
*/
void
scsi_read_position_10(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int service_action,
u_int8_t *data_ptr, u_int32_t length,
u_int32_t sense_len, u_int32_t timeout)
{
struct scsi_tape_read_position *scmd;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/CAM_DIR_IN,
tag_action,
/*data_ptr*/data_ptr,
/*dxfer_len*/length,
sense_len,
sizeof(*scmd),
timeout);
scmd = (struct scsi_tape_read_position *)&csio->cdb_io.cdb_bytes;
bzero(scmd, sizeof(*scmd));
scmd->opcode = READ_POSITION;
scmd->byte1 = service_action;
/*
* The length is only currently set (as of SSC4r03) if the extended
* form is specified. The other forms have fixed lengths.
*/
if (service_action == SA_RPOS_EXTENDED_FORM)
scsi_ulto2b(length, scmd->length);
}
/*
* Set Tape Position command.
*/
void
scsi_set_position(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int hardsoft, u_int32_t blkno,
u_int8_t sense_len, u_int32_t timeout)
{
struct scsi_tape_locate *scmd;
cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action,
(u_int8_t *)NULL, 0, sense_len, sizeof(*scmd), timeout);
scmd = (struct scsi_tape_locate *)&csio->cdb_io.cdb_bytes;
bzero(scmd, sizeof(*scmd));
scmd->opcode = LOCATE;
if (hardsoft)
scmd->byte1 |= SA_SPOS_BT;
scsi_ulto4b(blkno, scmd->blkaddr);
}
/*
* XXX KDM figure out how to make a compatibility function.
*/
void
scsi_locate_10(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immed, int cp, int hard,
int64_t partition, u_int32_t block_address,
int sense_len, u_int32_t timeout)
{
struct scsi_tape_locate *scmd;
cam_fill_csio(csio,
retries,
cbfcnp,
CAM_DIR_NONE,
tag_action,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
sense_len,
sizeof(*scmd),
timeout);
scmd = (struct scsi_tape_locate *)&csio->cdb_io.cdb_bytes;
bzero(scmd, sizeof(*scmd));
scmd->opcode = LOCATE;
if (immed)
scmd->byte1 |= SA_SPOS_IMMED;
if (cp)
scmd->byte1 |= SA_SPOS_CP;
if (hard)
scmd->byte1 |= SA_SPOS_BT;
scsi_ulto4b(block_address, scmd->blkaddr);
scmd->partition = partition;
}
void
scsi_locate_16(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int immed, int cp, u_int8_t dest_type,
int bam, int64_t partition, u_int64_t logical_id,
int sense_len, u_int32_t timeout)
{
struct scsi_locate_16 *scsi_cmd;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/CAM_DIR_NONE,
tag_action,
/*data_ptr*/NULL,
/*dxfer_len*/0,
sense_len,
sizeof(*scsi_cmd),
timeout);
scsi_cmd = (struct scsi_locate_16 *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = LOCATE_16;
if (immed)
scsi_cmd->byte1 |= SA_LC_IMMEDIATE;
if (cp)
scsi_cmd->byte1 |= SA_LC_CP;
scsi_cmd->byte1 |= (dest_type << SA_LC_DEST_TYPE_SHIFT);
scsi_cmd->byte2 |= bam;
scsi_cmd->partition = partition;
scsi_u64to8b(logical_id, scsi_cmd->logical_id);
}
void
scsi_report_density_support(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int media, int medium_type,
u_int8_t *data_ptr, u_int32_t length,
u_int32_t sense_len, u_int32_t timeout)
{
struct scsi_report_density_support *scsi_cmd;
scsi_cmd =(struct scsi_report_density_support *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = REPORT_DENSITY_SUPPORT;
if (media != 0)
scsi_cmd->byte1 |= SRDS_MEDIA;
if (medium_type != 0)
scsi_cmd->byte1 |= SRDS_MEDIUM_TYPE;
scsi_ulto2b(length, scsi_cmd->length);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/CAM_DIR_IN,
tag_action,
/*data_ptr*/data_ptr,
/*dxfer_len*/length,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_set_capacity(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int byte1, u_int32_t proportion,
u_int32_t sense_len, u_int32_t timeout)
{
struct scsi_set_capacity *scsi_cmd;
scsi_cmd = (struct scsi_set_capacity *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = SET_CAPACITY;
scsi_cmd->byte1 = byte1;
scsi_ulto2b(proportion, scsi_cmd->cap_proportion);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/CAM_DIR_NONE,
tag_action,
/*data_ptr*/NULL,
/*dxfer_len*/0,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_format_medium(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int byte1, int byte2,
u_int8_t *data_ptr, u_int32_t dxfer_len,
u_int32_t sense_len, u_int32_t timeout)
{
struct scsi_format_medium *scsi_cmd;
scsi_cmd = (struct scsi_format_medium*)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = FORMAT_MEDIUM;
scsi_cmd->byte1 = byte1;
scsi_cmd->byte2 = byte2;
scsi_ulto2b(dxfer_len, scsi_cmd->length);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/(dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_allow_overwrite(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, int allow_overwrite, int partition,
u_int64_t logical_id, u_int32_t sense_len, u_int32_t timeout)
{
struct scsi_allow_overwrite *scsi_cmd;
scsi_cmd = (struct scsi_allow_overwrite *)&csio->cdb_io.cdb_bytes;
bzero(scsi_cmd, sizeof(*scsi_cmd));
scsi_cmd->opcode = ALLOW_OVERWRITE;
scsi_cmd->allow_overwrite = allow_overwrite;
scsi_cmd->partition = partition;
scsi_u64to8b(logical_id, scsi_cmd->logical_id);
cam_fill_csio(csio,
retries,
cbfcnp,
CAM_DIR_NONE,
tag_action,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
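The SPACE builder above stores the caller's count with scsi_ulto3b(), and
saspace() passes negative counts to request reverse motion, so the 24-bit
CDB field effectively carries a two's-complement value.  A minimal
stand-alone sketch of that packing, using invented demo_* helpers rather
than the real scsi_all.h routines:

#include <stdint.h>
#include <stdio.h>

/* Store the low 24 bits of val big-endian, as scsi_ulto3b() does. */
static void
demo_ulto3b(uint32_t val, uint8_t *bytes)
{
	bytes[0] = (val >> 16) & 0xff;
	bytes[1] = (val >> 8) & 0xff;
	bytes[2] = val & 0xff;
}

/* Recover a signed count by sign-extending the 24-bit field. */
static int32_t
demo_3btol(const uint8_t *bytes)
{
	int32_t v;

	v = ((uint32_t)bytes[0] << 16) | ((uint32_t)bytes[1] << 8) | bytes[2];
	if (v & 0x800000)
		v -= 0x1000000;
	return (v);
}

int
main(void)
{
	uint8_t count[3];

	demo_ulto3b((uint32_t)-2, count);	/* space two records backwards */
	printf("%02x %02x %02x -> %d\n",
	    count[0], count[1], count[2], demo_3btol(count));
	/* prints "ff ff fe -> -2" */
	return (0);
}

The same truncation to the low three bytes is why scsi_space() can take its
count as a plain u_int32_t while saspace() reasons about it as a signed
quantity.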
diff --git a/sys/cam/scsi/scsi_sg.c b/sys/cam/scsi/scsi_sg.c
index 81b964a6828e..8e3f0a27ab58 100644
--- a/sys/cam/scsi/scsi_sg.c
+++ b/sys/cam/scsi/scsi_sg.c
@@ -1,1016 +1,1016 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2007 Scott Long
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* scsi_sg peripheral driver. This driver is meant to implement the Linux
* SG passthrough interface for SCSI.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/devicestat.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_sg.h>
#include <compat/linux/linux_ioctl.h>
typedef enum {
SG_FLAG_LOCKED = 0x01,
SG_FLAG_INVALID = 0x02
} sg_flags;
typedef enum {
SG_STATE_NORMAL
} sg_state;
typedef enum {
SG_RDWR_FREE,
SG_RDWR_INPROG,
SG_RDWR_DONE
} sg_rdwr_state;
typedef enum {
SG_CCB_RDWR_IO
} sg_ccb_types;
#define ccb_type ppriv_field0
#define ccb_rdwr ppriv_ptr1
struct sg_rdwr {
TAILQ_ENTRY(sg_rdwr) rdwr_link;
int tag;
int state;
int buf_len;
char *buf;
union ccb *ccb;
union {
struct sg_header hdr;
struct sg_io_hdr io_hdr;
} hdr;
};
struct sg_softc {
sg_state state;
sg_flags flags;
int open_count;
u_int maxio;
struct devstat *device_stats;
TAILQ_HEAD(, sg_rdwr) rdwr_done;
struct cdev *dev;
int sg_timeout;
int sg_user_timeout;
uint8_t pd_type;
union ccb saved_ccb;
};
static d_open_t sgopen;
static d_close_t sgclose;
static d_ioctl_t sgioctl;
static d_write_t sgwrite;
static d_read_t sgread;
static periph_init_t sginit;
static periph_ctor_t sgregister;
static periph_oninv_t sgoninvalidate;
static periph_dtor_t sgcleanup;
static void sgasync(void *callback_arg, uint32_t code,
struct cam_path *path, void *arg);
static void sgdone(struct cam_periph *periph, union ccb *done_ccb);
static int sgsendccb(struct cam_periph *periph, union ccb *ccb);
static int sgsendrdwr(struct cam_periph *periph, union ccb *ccb);
static int sgerror(union ccb *ccb, uint32_t cam_flags,
uint32_t sense_flags);
static void sg_scsiio_status(struct ccb_scsiio *csio,
u_short *hoststat, u_short *drvstat);
static int scsi_group_len(u_char cmd);
static struct periph_driver sgdriver =
{
sginit, "sg",
TAILQ_HEAD_INITIALIZER(sgdriver.units), /* gen */ 0
};
PERIPHDRIVER_DECLARE(sg, sgdriver);
static struct cdevsw sg_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE,
.d_open = sgopen,
.d_close = sgclose,
.d_ioctl = sgioctl,
.d_write = sgwrite,
.d_read = sgread,
.d_name = "sg",
};
static int sg_version = 30125;
static void
sginit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will receive async
* callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, sgasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("sg: Failed to attach master async callbac "
"due to status 0x%x!\n", status);
}
}
static void
sgdevgonecb(void *arg)
{
struct cam_periph *periph;
struct sg_softc *softc;
struct mtx *mtx;
int i;
periph = (struct cam_periph *)arg;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = (struct sg_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
*/
cam_periph_release_locked(periph);
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(mtx);
}
static void
sgoninvalidate(struct cam_periph *periph)
{
struct sg_softc *softc;
softc = (struct sg_softc *)periph->softc;
/*
* Deregister any async callbacks.
*/
xpt_register_async(0, sgasync, periph, periph->path);
softc->flags |= SG_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, sgdevgonecb, periph);
/*
* XXX Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
}
static void
sgcleanup(struct cam_periph *periph)
{
struct sg_softc *softc;
softc = (struct sg_softc *)periph->softc;
devstat_remove_entry(softc->device_stats);
free(softc, M_DEVBUF);
}
static void
sgasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
/*
* Allocate a peripheral instance for this device and
* start the probe process.
*/
status = cam_periph_alloc(sgregister, sgoninvalidate,
sgcleanup, NULL, "sg",
CAM_PERIPH_BIO, path,
sgasync, AC_FOUND_DEVICE, cgd);
if ((status != CAM_REQ_CMP) && (status != CAM_REQ_INPROG)) {
const struct cam_status_entry *entry;
entry = cam_fetch_status_entry(status);
printf("sgasync: Unable to attach new device "
"due to status %#x: %s\n", status, entry ?
entry->status_text : "Unknown");
}
break;
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static cam_status
sgregister(struct cam_periph *periph, void *arg)
{
struct sg_softc *softc;
struct ccb_getdev *cgd;
struct ccb_pathinq cpi;
struct make_dev_args args;
int no_tags, error;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("sgregister: no getdev CCB, can't register device\n");
return (CAM_REQ_CMP_ERR);
}
softc = malloc(sizeof(*softc), M_DEVBUF, M_ZERO | M_NOWAIT);
if (softc == NULL) {
printf("sgregister: Unable to allocate softc\n");
return (CAM_REQ_CMP_ERR);
}
softc->state = SG_STATE_NORMAL;
softc->pd_type = SID_TYPE(&cgd->inq_data);
softc->sg_timeout = SG_DEFAULT_TIMEOUT / SG_DEFAULT_HZ * hz;
softc->sg_user_timeout = SG_DEFAULT_TIMEOUT;
TAILQ_INIT(&softc->rdwr_done);
periph->softc = softc;
xpt_path_inq(&cpi, periph->path);
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
- else if (cpi.maxio > MAXPHYS)
- softc->maxio = MAXPHYS; /* for safety */
+ else if (cpi.maxio > maxphys)
+ softc->maxio = maxphys; /* for safety */
else
softc->maxio = cpi.maxio; /* real value */
/*
* We pass in 0 for the blocksize, since we don't know what the
* blocksize of the device is, if it even has one.
*/
cam_periph_unlock(periph);
no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0;
softc->device_stats = devstat_new_entry("sg",
periph->unit_number, 0,
DEVSTAT_NO_BLOCKSIZE
| (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0),
softc->pd_type |
XPORT_DEVSTAT_TYPE(cpi.transport) |
DEVSTAT_TYPE_PASS,
DEVSTAT_PRIORITY_PASS);
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != 0) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */
make_dev_args_init(&args);
args.mda_devsw = &sg_cdevsw;
args.mda_unit = periph->unit_number;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0600;
args.mda_si_drv1 = periph;
error = make_dev_s(&args, &softc->dev, "%s%d",
periph->periph_name, periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
cam_periph_release_locked(periph);
return (CAM_REQ_CMP_ERR);
}
if (periph->unit_number < 26) {
(void)make_dev_alias(softc->dev, "sg%c",
periph->unit_number + 'a');
} else {
(void)make_dev_alias(softc->dev, "sg%c%c",
((periph->unit_number / 26) - 1) + 'a',
(periph->unit_number % 26) + 'a');
}
cam_periph_lock(periph);
/*
* Add as async callback so that we get
* notified if this device goes away.
*/
xpt_register_async(AC_LOST_DEVICE, sgasync, periph, periph->path);
if (bootverbose)
xpt_announce_periph(periph, NULL);
return (CAM_REQ_CMP);
}
static void
sgdone(struct cam_periph *periph, union ccb *done_ccb)
{
struct sg_softc *softc;
struct ccb_scsiio *csio;
softc = (struct sg_softc *)periph->softc;
csio = &done_ccb->csio;
switch (csio->ccb_h.ccb_type) {
case SG_CCB_RDWR_IO:
{
struct sg_rdwr *rdwr;
int state;
devstat_end_transaction(softc->device_stats,
csio->dxfer_len,
csio->tag_action & 0xf,
((csio->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(csio->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ,
NULL, NULL);
rdwr = done_ccb->ccb_h.ccb_rdwr;
state = rdwr->state;
rdwr->state = SG_RDWR_DONE;
wakeup(rdwr);
break;
}
default:
panic("unknown sg CCB type");
}
}
static int
sgopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct sg_softc *softc;
int error = 0;
periph = (struct cam_periph *)dev->si_drv1;
if (cam_periph_acquire(periph) != 0)
return (ENXIO);
/*
* Don't allow access when we're running at a high securelevel.
*/
error = securelevel_gt(td->td_ucred, 1);
if (error) {
cam_periph_release(periph);
return (error);
}
cam_periph_lock(periph);
softc = (struct sg_softc *)periph->softc;
if (softc->flags & SG_FLAG_INVALID) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return (ENXIO);
}
softc->open_count++;
cam_periph_unlock(periph);
return (error);
}
static int
sgclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct sg_softc *softc;
struct mtx *mtx;
periph = (struct cam_periph *)dev->si_drv1;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = periph->softc;
softc->open_count--;
cam_periph_release_locked(periph);
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(mtx);
return (0);
}
static int
sgioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
union ccb *ccb;
struct ccb_scsiio *csio;
struct cam_periph *periph;
struct sg_softc *softc;
struct sg_io_hdr *req;
int dir, error;
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct sg_softc *)periph->softc;
error = 0;
switch (cmd) {
case SG_GET_VERSION_NUM:
{
int *version = (int *)arg;
*version = sg_version;
break;
}
case SG_SET_TIMEOUT:
{
u_int user_timeout = *(u_int *)arg;
softc->sg_user_timeout = user_timeout;
softc->sg_timeout = user_timeout / SG_DEFAULT_HZ * hz;
break;
}
case SG_GET_TIMEOUT:
/*
* The value is returned directly to the syscall.
*/
td->td_retval[0] = softc->sg_user_timeout;
error = 0;
break;
case SG_IO:
req = (struct sg_io_hdr *)arg;
if (req->cmd_len > IOCDBLEN) {
error = EINVAL;
break;
}
if (req->iovec_count != 0) {
error = EOPNOTSUPP;
break;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
csio = &ccb->csio;
error = copyin(req->cmdp, &csio->cdb_io.cdb_bytes,
req->cmd_len);
if (error) {
xpt_release_ccb(ccb);
break;
}
switch(req->dxfer_direction) {
case SG_DXFER_TO_DEV:
dir = CAM_DIR_OUT;
break;
case SG_DXFER_FROM_DEV:
dir = CAM_DIR_IN;
break;
case SG_DXFER_TO_FROM_DEV:
dir = CAM_DIR_BOTH;
break;
case SG_DXFER_NONE:
default:
dir = CAM_DIR_NONE;
break;
}
cam_fill_csio(csio,
/*retries*/1,
/*cbfcnp*/NULL,
dir|CAM_DEV_QFRZDIS,
MSG_SIMPLE_Q_TAG,
req->dxferp,
req->dxfer_len,
req->mx_sb_len,
req->cmd_len,
req->timeout);
error = sgsendccb(periph, ccb);
if (error) {
req->host_status = DID_ERROR;
req->driver_status = DRIVER_INVALID;
xpt_release_ccb(ccb);
break;
}
req->status = csio->scsi_status;
req->masked_status = (csio->scsi_status >> 1) & 0x7f;
sg_scsiio_status(csio, &req->host_status, &req->driver_status);
req->resid = csio->resid;
req->duration = csio->ccb_h.timeout;
req->info = 0;
if ((csio->ccb_h.status & CAM_AUTOSNS_VALID)
&& (req->sbp != NULL)) {
req->sb_len_wr = req->mx_sb_len - csio->sense_resid;
error = copyout(&csio->sense_data, req->sbp,
req->sb_len_wr);
}
xpt_release_ccb(ccb);
break;
case SG_GET_RESERVED_SIZE:
{
int *size = (int *)arg;
*size = DFLTPHYS;
break;
}
case SG_GET_SCSI_ID:
{
struct sg_scsi_id *id = (struct sg_scsi_id *)arg;
id->host_no = cam_sim_path(xpt_path_sim(periph->path));
id->channel = xpt_path_path_id(periph->path);
id->scsi_id = xpt_path_target_id(periph->path);
id->lun = xpt_path_lun_id(periph->path);
id->scsi_type = softc->pd_type;
id->h_cmd_per_lun = 1;
id->d_queue_depth = 1;
id->unused[0] = 0;
id->unused[1] = 0;
break;
}
case SG_GET_SG_TABLESIZE:
{
int *size = (int *)arg;
*size = 0;
break;
}
case SG_EMULATED_HOST:
case SG_SET_TRANSFORM:
case SG_GET_TRANSFORM:
case SG_GET_NUM_WAITING:
case SG_SCSI_RESET:
case SG_GET_REQUEST_TABLE:
case SG_SET_KEEP_ORPHAN:
case SG_GET_KEEP_ORPHAN:
case SG_GET_ACCESS_COUNT:
case SG_SET_FORCE_LOW_DMA:
case SG_GET_LOW_DMA:
case SG_SET_FORCE_PACK_ID:
case SG_GET_PACK_ID:
case SG_SET_RESERVED_SIZE:
case SG_GET_COMMAND_Q:
case SG_SET_COMMAND_Q:
case SG_SET_DEBUG:
case SG_NEXT_CMD_LEN:
default:
#ifdef CAMDEBUG
printf("sgioctl: rejecting cmd 0x%lx\n", cmd);
#endif
error = ENODEV;
break;
}
cam_periph_unlock(periph);
return (error);
}
static int
sgwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
union ccb *ccb;
struct cam_periph *periph;
struct ccb_scsiio *csio;
struct sg_softc *sc;
struct sg_header *hdr;
struct sg_rdwr *rdwr;
u_char cdb_cmd;
char *buf;
int error = 0, cdb_len, buf_len, dir;
periph = dev->si_drv1;
rdwr = malloc(sizeof(*rdwr), M_DEVBUF, M_WAITOK | M_ZERO);
hdr = &rdwr->hdr.hdr;
/* Copy in the header block and sanity check it */
if (uio->uio_resid < sizeof(*hdr)) {
error = EINVAL;
goto out_hdr;
}
error = uiomove(hdr, sizeof(*hdr), uio);
if (error)
goto out_hdr;
/* XXX: We don't support SG 3.x read/write API. */
if (hdr->reply_len < 0) {
error = ENODEV;
goto out_hdr;
}
ccb = xpt_alloc_ccb();
if (ccb == NULL) {
error = ENOMEM;
goto out_hdr;
}
csio = &ccb->csio;
/*
* Copy in the CDB block. The designers of the interface didn't
* bother to provide a size for this in the header, so we have to
* figure it out ourselves.
*/
if (uio->uio_resid < 1)
goto out_ccb;
error = uiomove(&cdb_cmd, 1, uio);
if (error)
goto out_ccb;
if (hdr->twelve_byte)
cdb_len = 12;
else
cdb_len = scsi_group_len(cdb_cmd);
/*
* We've already read the first byte of the CDB and advanced the uio
* pointer. Just read the rest.
*/
csio->cdb_io.cdb_bytes[0] = cdb_cmd;
error = uiomove(&csio->cdb_io.cdb_bytes[1], cdb_len - 1, uio);
if (error)
goto out_ccb;
/*
* Now set up the data block. Again, the designers didn't bother
* to make this reliable.
*/
buf_len = uio->uio_resid;
if (buf_len != 0) {
buf = malloc(buf_len, M_DEVBUF, M_WAITOK | M_ZERO);
error = uiomove(buf, buf_len, uio);
if (error)
goto out_buf;
dir = CAM_DIR_OUT;
} else if (hdr->reply_len != 0) {
buf = malloc(hdr->reply_len, M_DEVBUF, M_WAITOK | M_ZERO);
buf_len = hdr->reply_len;
dir = CAM_DIR_IN;
} else {
buf = NULL;
buf_len = 0;
dir = CAM_DIR_NONE;
}
cam_periph_lock(periph);
sc = periph->softc;
xpt_setup_ccb(&ccb->ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cam_fill_csio(csio,
/*retries*/1,
sgdone,
dir|CAM_DEV_QFRZDIS,
MSG_SIMPLE_Q_TAG,
buf,
buf_len,
SG_MAX_SENSE,
cdb_len,
sc->sg_timeout);
/*
* Send off the command and hope that it works. This path does not
* go through sgstart because the I/O is supposed to be asynchronous.
*/
rdwr->buf = buf;
rdwr->buf_len = buf_len;
rdwr->tag = hdr->pack_id;
rdwr->ccb = ccb;
rdwr->state = SG_RDWR_INPROG;
ccb->ccb_h.ccb_rdwr = rdwr;
ccb->ccb_h.ccb_type = SG_CCB_RDWR_IO;
TAILQ_INSERT_TAIL(&sc->rdwr_done, rdwr, rdwr_link);
error = sgsendrdwr(periph, ccb);
cam_periph_unlock(periph);
return (error);
out_buf:
free(buf, M_DEVBUF);
out_ccb:
xpt_free_ccb(ccb);
out_hdr:
free(rdwr, M_DEVBUF);
return (error);
}
static int
sgread(struct cdev *dev, struct uio *uio, int ioflag)
{
struct ccb_scsiio *csio;
struct cam_periph *periph;
struct sg_softc *sc;
struct sg_header *hdr;
struct sg_rdwr *rdwr;
u_short hstat, dstat;
int error, pack_len, reply_len, pack_id;
periph = dev->si_drv1;
/* XXX The pack len field needs to be updated and written out instead
* of discarded. Not sure how to do that.
*/
uio->uio_rw = UIO_WRITE;
if ((error = uiomove(&pack_len, 4, uio)) != 0)
return (error);
if ((error = uiomove(&reply_len, 4, uio)) != 0)
return (error);
if ((error = uiomove(&pack_id, 4, uio)) != 0)
return (error);
uio->uio_rw = UIO_READ;
cam_periph_lock(periph);
sc = periph->softc;
search:
TAILQ_FOREACH(rdwr, &sc->rdwr_done, rdwr_link) {
if (rdwr->tag == pack_id)
break;
}
if ((rdwr == NULL) || (rdwr->state != SG_RDWR_DONE)) {
if (cam_periph_sleep(periph, rdwr, PCATCH, "sgread", 0) == ERESTART)
return (EAGAIN);
goto search;
}
TAILQ_REMOVE(&sc->rdwr_done, rdwr, rdwr_link);
cam_periph_unlock(periph);
hdr = &rdwr->hdr.hdr;
csio = &rdwr->ccb->csio;
sg_scsiio_status(csio, &hstat, &dstat);
hdr->host_status = hstat;
hdr->driver_status = dstat;
hdr->target_status = csio->scsi_status >> 1;
switch (hstat) {
case DID_OK:
case DID_PASSTHROUGH:
case DID_SOFT_ERROR:
hdr->result = 0;
break;
case DID_NO_CONNECT:
case DID_BUS_BUSY:
case DID_TIME_OUT:
hdr->result = EBUSY;
break;
case DID_BAD_TARGET:
case DID_ABORT:
case DID_PARITY:
case DID_RESET:
case DID_BAD_INTR:
case DID_ERROR:
default:
hdr->result = EIO;
break;
}
if (dstat == DRIVER_SENSE) {
bcopy(&csio->sense_data, hdr->sense_buffer,
min(csio->sense_len, SG_MAX_SENSE));
#ifdef CAMDEBUG
scsi_sense_print(csio);
#endif
}
error = uiomove(&hdr->result, sizeof(*hdr) -
offsetof(struct sg_header, result), uio);
if ((error == 0) && (hdr->result == 0))
error = uiomove(rdwr->buf, rdwr->buf_len, uio);
cam_periph_lock(periph);
xpt_free_ccb(rdwr->ccb);
cam_periph_unlock(periph);
free(rdwr->buf, M_DEVBUF);
free(rdwr, M_DEVBUF);
return (error);
}
static int
sgsendccb(struct cam_periph *periph, union ccb *ccb)
{
struct sg_softc *softc;
struct cam_periph_map_info mapinfo;
int error;
softc = periph->softc;
bzero(&mapinfo, sizeof(mapinfo));
/*
* cam_periph_mapmem calls into proc and vm functions that can
* sleep as well as trigger I/O, so we can't hold the lock.
* Dropping it here is reasonably safe.
* The only CCB opcode that is possible here is XPT_SCSI_IO, so there
* is no need for additional checks.
*/
cam_periph_unlock(periph);
error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio);
cam_periph_lock(periph);
if (error)
return (error);
error = cam_periph_runccb(ccb,
sgerror,
CAM_RETRY_SELTO,
SF_RETRY_UA,
softc->device_stats);
cam_periph_unlock(periph);
cam_periph_unmapmem(ccb, &mapinfo);
cam_periph_lock(periph);
return (error);
}
static int
sgsendrdwr(struct cam_periph *periph, union ccb *ccb)
{
struct sg_softc *softc;
softc = periph->softc;
devstat_start_transaction(softc->device_stats, NULL);
xpt_action(ccb);
return (0);
}
static int
sgerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags)
{
struct cam_periph *periph;
struct sg_softc *softc;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct sg_softc *)periph->softc;
return (cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
sg_scsiio_status(struct ccb_scsiio *csio, u_short *hoststat, u_short *drvstat)
{
int status;
status = csio->ccb_h.status;
switch (status & CAM_STATUS_MASK) {
case CAM_REQ_CMP:
*hoststat = DID_OK;
*drvstat = 0;
break;
case CAM_REQ_CMP_ERR:
*hoststat = DID_ERROR;
*drvstat = 0;
break;
case CAM_REQ_ABORTED:
*hoststat = DID_ABORT;
*drvstat = 0;
break;
case CAM_REQ_INVALID:
*hoststat = DID_ERROR;
*drvstat = DRIVER_INVALID;
break;
case CAM_DEV_NOT_THERE:
*hoststat = DID_BAD_TARGET;
*drvstat = 0;
break;
case CAM_SEL_TIMEOUT:
*hoststat = DID_NO_CONNECT;
*drvstat = 0;
break;
case CAM_CMD_TIMEOUT:
*hoststat = DID_TIME_OUT;
*drvstat = 0;
break;
case CAM_SCSI_STATUS_ERROR:
*hoststat = DID_ERROR;
*drvstat = 0;
break;
case CAM_SCSI_BUS_RESET:
*hoststat = DID_RESET;
*drvstat = 0;
break;
case CAM_UNCOR_PARITY:
*hoststat = DID_PARITY;
*drvstat = 0;
break;
case CAM_SCSI_BUSY:
*hoststat = DID_BUS_BUSY;
*drvstat = 0;
break;
default:
*hoststat = DID_ERROR;
*drvstat = DRIVER_ERROR;
}
if (status & CAM_AUTOSNS_VALID)
*drvstat = DRIVER_SENSE;
}
static int
scsi_group_len(u_char cmd)
{
int len[] = {6, 10, 10, 12, 12, 12, 10, 10};
int group;
group = (cmd >> 5) & 0x7;
return (len[group]);
}
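Because the legacy sg_header carries no CDB length, sgwrite() above derives
it from the opcode via scsi_group_len(): the top three bits of the opcode
select the CDB group, and each group maps to a fixed length (unless
hdr->twelve_byte forces a 12-byte CDB).  A stand-alone sketch of that
mapping, with an invented demo_* name:

#include <stdio.h>

static int
demo_group_len(unsigned char cmd)
{
	static const int len[] = {6, 10, 10, 12, 12, 12, 10, 10};

	return (len[(cmd >> 5) & 0x7]);
}

int
main(void)
{
	/* 0x00 TEST UNIT READY, 0x28 READ(10), 0xa8 READ(12) */
	printf("%d %d %d\n", demo_group_len(0x00), demo_group_len(0x28),
	    demo_group_len(0xa8));		/* prints "6 10 12" */
	return (0);
}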
diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c
index b8713bded1cb..b2874f49f13f 100644
--- a/sys/cam/scsi/scsi_target.c
+++ b/sys/cam/scsi/scsi_target.c
@@ -1,1158 +1,1158 @@
/*-
* Generic SCSI Target Kernel Mode Driver
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2002 Nate Lawson.
* Copyright (c) 1998, 1999, 2001, 2002 Justin T. Gibbs.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/devicestat.h>
#include <sys/proc.h>
/* Includes to support callout */
#include <sys/types.h>
#include <sys/systm.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_sim.h>
#include <cam/scsi/scsi_targetio.h>
/* Transaction information attached to each CCB sent by the user */
struct targ_cmd_descr {
struct cam_periph_map_info mapinfo;
TAILQ_ENTRY(targ_cmd_descr) tqe;
union ccb *user_ccb;
int priority;
int func_code;
};
/* Offset into the private CCB area for storing our descriptor */
#define targ_descr periph_priv.entries[1].ptr
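/*
 * For illustration, with the macro above an assignment such as
 * "ccb->ccb_h.targ_descr = descr" expands to
 * "ccb->ccb_h.periph_priv.entries[1].ptr = descr", i.e. the descriptor
 * rides along in the CCB header's periph-private area.
 */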
TAILQ_HEAD(descr_queue, targ_cmd_descr);
typedef enum {
TARG_STATE_RESV = 0x00, /* Invalid state */
TARG_STATE_OPENED = 0x01, /* Device opened, softc initialized */
TARG_STATE_LUN_ENABLED = 0x02 /* Device enabled for a path */
} targ_state;
/* Per-instance device software context */
struct targ_softc {
/* CCBs (CTIOs, ATIOs, INOTs) pending on the controller */
struct ccb_queue pending_ccb_queue;
/* Command descriptors awaiting CTIO resources from the XPT */
struct descr_queue work_queue;
/* Command descriptors that have been aborted back to the user. */
struct descr_queue abort_queue;
/*
* Queue of CCBs that have been copied out to userland, but our
* userland daemon has not yet seen.
*/
struct ccb_queue user_ccb_queue;
struct cam_periph *periph;
struct cam_path *path;
targ_state state;
u_int maxio;
struct selinfo read_select;
struct devstat device_stats;
};
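/*
 * Rough lifecycle of a user request through the queues above (as
 * implemented by targwrite/targstart/targdone/targread below): queued
 * CCBs from the user first wait on work_queue as descriptors, move to
 * pending_ccb_queue once sent to the SIM, land on user_ccb_queue when
 * they complete, and are handed back by targread(); aborted descriptors
 * are parked on abort_queue instead.
 */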
static d_open_t targopen;
static d_read_t targread;
static d_write_t targwrite;
static d_ioctl_t targioctl;
static d_poll_t targpoll;
static d_kqfilter_t targkqfilter;
static void targreadfiltdetach(struct knote *kn);
static int targreadfilt(struct knote *kn, long hint);
static struct filterops targread_filtops = {
.f_isfd = 1,
.f_detach = targreadfiltdetach,
.f_event = targreadfilt,
};
static struct cdevsw targ_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_open = targopen,
.d_read = targread,
.d_write = targwrite,
.d_ioctl = targioctl,
.d_poll = targpoll,
.d_name = "targ",
.d_kqfilter = targkqfilter
};
static cam_status targendislun(struct cam_path *path, int enable,
int grp6_len, int grp7_len);
static cam_status targenable(struct targ_softc *softc,
struct cam_path *path,
int grp6_len, int grp7_len);
static cam_status targdisable(struct targ_softc *softc);
static periph_ctor_t targctor;
static periph_dtor_t targdtor;
static periph_start_t targstart;
static int targusermerge(struct targ_softc *softc,
struct targ_cmd_descr *descr,
union ccb *ccb);
static int targsendccb(struct targ_softc *softc, union ccb *ccb,
struct targ_cmd_descr *descr);
static void targdone(struct cam_periph *periph,
union ccb *done_ccb);
static int targreturnccb(struct targ_softc *softc,
union ccb *ccb);
static union ccb * targgetccb(struct targ_softc *softc, xpt_opcode type,
int priority);
static void targfreeccb(struct targ_softc *softc, union ccb *ccb);
static struct targ_cmd_descr *
targgetdescr(struct targ_softc *softc);
static periph_init_t targinit;
static void targasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void abort_all_pending(struct targ_softc *softc);
static void notify_user(struct targ_softc *softc);
static int targcamstatus(cam_status status);
static size_t targccblen(xpt_opcode func_code);
static struct periph_driver targdriver =
{
targinit, "targ",
TAILQ_HEAD_INITIALIZER(targdriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(targ, targdriver);
static MALLOC_DEFINE(M_TARG, "TARG", "TARG data");
/* Disable LUN if enabled and teardown softc */
static void
targcdevdtor(void *data)
{
struct targ_softc *softc;
struct cam_periph *periph;
softc = data;
if (softc->periph == NULL) {
printf("%s: destroying non-enabled target\n", __func__);
free(softc, M_TARG);
return;
}
/*
* Acquire a hold on the periph so that it doesn't go away before
* we are ready at the end of the function.
*/
periph = softc->periph;
cam_periph_acquire(periph);
cam_periph_lock(periph);
(void)targdisable(softc);
if (softc->periph != NULL) {
cam_periph_invalidate(softc->periph);
softc->periph = NULL;
}
cam_periph_unlock(periph);
cam_periph_release(periph);
free(softc, M_TARG);
}
/*
* Create softc and initialize it. There is no locking here because a
* periph doesn't get created until an ioctl is issued to do so, and
* that can't happen until this method returns.
*/
static int
targopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct targ_softc *softc;
/* Allocate its softc, initialize it */
softc = malloc(sizeof(*softc), M_TARG,
M_WAITOK | M_ZERO);
softc->state = TARG_STATE_OPENED;
softc->periph = NULL;
softc->path = NULL;
TAILQ_INIT(&softc->pending_ccb_queue);
TAILQ_INIT(&softc->work_queue);
TAILQ_INIT(&softc->abort_queue);
TAILQ_INIT(&softc->user_ccb_queue);
knlist_init_mtx(&softc->read_select.si_note, NULL);
devfs_set_cdevpriv(softc, targcdevdtor);
return (0);
}
/* Enable/disable LUNs, set debugging level */
static int
targioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
struct targ_softc *softc;
cam_status status;
devfs_get_cdevpriv((void **)&softc);
switch (cmd) {
case TARGIOCENABLE:
{
struct ioc_enable_lun *new_lun;
struct cam_path *path;
new_lun = (struct ioc_enable_lun *)addr;
status = xpt_create_path(&path, /*periph*/NULL,
new_lun->path_id,
new_lun->target_id,
new_lun->lun_id);
if (status != CAM_REQ_CMP) {
printf("Couldn't create path, status %#x\n", status);
break;
}
xpt_path_lock(path);
status = targenable(softc, path, new_lun->grp6_len,
new_lun->grp7_len);
xpt_path_unlock(path);
xpt_free_path(path);
break;
}
case TARGIOCDISABLE:
if (softc->periph == NULL) {
status = CAM_DEV_NOT_THERE;
break;
}
cam_periph_lock(softc->periph);
status = targdisable(softc);
cam_periph_unlock(softc->periph);
break;
case TARGIOCDEBUG:
{
struct ccb_debug cdbg;
/* If no periph available, disallow debugging changes */
if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) {
status = CAM_DEV_NOT_THERE;
break;
}
bzero(&cdbg, sizeof cdbg);
if (*((int *)addr) != 0)
cdbg.flags = CAM_DEBUG_PERIPH;
else
cdbg.flags = CAM_DEBUG_NONE;
xpt_setup_ccb(&cdbg.ccb_h, softc->path, CAM_PRIORITY_NORMAL);
cdbg.ccb_h.func_code = XPT_DEBUG;
cdbg.ccb_h.cbfcnp = targdone;
xpt_action((union ccb *)&cdbg);
status = cdbg.ccb_h.status & CAM_STATUS_MASK;
break;
}
default:
status = CAM_PROVIDE_FAIL;
break;
}
return (targcamstatus(status));
}
/* Writes are always ready, reads wait for user_ccb_queue or abort_queue */
static int
targpoll(struct cdev *dev, int poll_events, struct thread *td)
{
struct targ_softc *softc;
int revents;
devfs_get_cdevpriv((void **)&softc);
/* Poll for write() is always ok. */
revents = poll_events & (POLLOUT | POLLWRNORM);
if ((poll_events & (POLLIN | POLLRDNORM)) != 0) {
/* Poll for read() depends on user and abort queues. */
cam_periph_lock(softc->periph);
if (!TAILQ_EMPTY(&softc->user_ccb_queue) ||
!TAILQ_EMPTY(&softc->abort_queue)) {
revents |= poll_events & (POLLIN | POLLRDNORM);
}
cam_periph_unlock(softc->periph);
/* Only sleep if the user didn't poll for write. */
if (revents == 0)
selrecord(td, &softc->read_select);
}
return (revents);
}
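/*
 * Informal userland sketch (file descriptor and buffer size are
 * illustrative only): a daemon typically polls for readability and then
 * reads back an array of CCB pointers, one pointer per completed or
 * aborted request:
 *
 *	struct pollfd pfd = { .fd = targ_fd, .events = POLLIN };
 *	union ccb *done[8];
 *	if (poll(&pfd, 1, INFTIM) > 0 && (pfd.revents & POLLIN) != 0)
 *		n = read(targ_fd, done, sizeof(done));
 */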
static int
targkqfilter(struct cdev *dev, struct knote *kn)
{
struct targ_softc *softc;
devfs_get_cdevpriv((void **)&softc);
kn->kn_hook = (caddr_t)softc;
kn->kn_fop = &targread_filtops;
knlist_add(&softc->read_select.si_note, kn, 0);
return (0);
}
static void
targreadfiltdetach(struct knote *kn)
{
struct targ_softc *softc;
softc = (struct targ_softc *)kn->kn_hook;
knlist_remove(&softc->read_select.si_note, kn, 0);
}
/* Notify the user's kqueue when the user queue or abort queue gets a CCB */
static int
targreadfilt(struct knote *kn, long hint)
{
struct targ_softc *softc;
int retval;
softc = (struct targ_softc *)kn->kn_hook;
cam_periph_lock(softc->periph);
retval = !TAILQ_EMPTY(&softc->user_ccb_queue) ||
!TAILQ_EMPTY(&softc->abort_queue);
cam_periph_unlock(softc->periph);
return (retval);
}
/* Send the HBA the enable/disable message */
static cam_status
targendislun(struct cam_path *path, int enable, int grp6_len, int grp7_len)
{
struct ccb_en_lun en_ccb;
cam_status status;
/* Tell the lun to begin answering selects */
xpt_setup_ccb(&en_ccb.ccb_h, path, CAM_PRIORITY_NORMAL);
en_ccb.ccb_h.func_code = XPT_EN_LUN;
/* Don't need support for any vendor specific commands */
en_ccb.grp6_len = grp6_len;
en_ccb.grp7_len = grp7_len;
en_ccb.enable = enable ? 1 : 0;
xpt_action((union ccb *)&en_ccb);
status = en_ccb.ccb_h.status & CAM_STATUS_MASK;
if (status != CAM_REQ_CMP) {
xpt_print(path, "%sable lun CCB rejected, status %#x\n",
enable ? "en" : "dis", status);
}
return (status);
}
/* Enable target mode on a LUN, given its path */
static cam_status
targenable(struct targ_softc *softc, struct cam_path *path, int grp6_len,
int grp7_len)
{
struct cam_periph *periph;
struct ccb_pathinq cpi;
cam_status status;
if ((softc->state & TARG_STATE_LUN_ENABLED) != 0)
return (CAM_LUN_ALRDY_ENA);
/* Make sure SIM supports target mode */
xpt_path_inq(&cpi, path);
status = cpi.ccb_h.status & CAM_STATUS_MASK;
if (status != CAM_REQ_CMP) {
printf("pathinq failed, status %#x\n", status);
goto enable_fail;
}
if ((cpi.target_sprt & PIT_PROCESSOR) == 0) {
printf("controller does not support target mode\n");
status = CAM_FUNC_NOTAVAIL;
goto enable_fail;
}
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
- else if (cpi.maxio > MAXPHYS)
- softc->maxio = MAXPHYS; /* for safety */
+ else if (cpi.maxio > maxphys)
+ softc->maxio = maxphys; /* for safety */
else
softc->maxio = cpi.maxio; /* real value */
/* Destroy any periph on our path if it is disabled */
periph = cam_periph_find(path, "targ");
if (periph != NULL) {
struct targ_softc *del_softc;
del_softc = (struct targ_softc *)periph->softc;
if ((del_softc->state & TARG_STATE_LUN_ENABLED) == 0) {
cam_periph_invalidate(del_softc->periph);
del_softc->periph = NULL;
} else {
printf("Requested path still in use by targ%d\n",
periph->unit_number);
status = CAM_LUN_ALRDY_ENA;
goto enable_fail;
}
}
/* Create a periph instance attached to this path */
status = cam_periph_alloc(targctor, NULL, targdtor, targstart,
"targ", CAM_PERIPH_BIO, path, targasync, 0, softc);
if (status != CAM_REQ_CMP) {
printf("cam_periph_alloc failed, status %#x\n", status);
goto enable_fail;
}
/* Ensure that the periph now exists. */
if (cam_periph_find(path, "targ") == NULL) {
panic("targenable: succeeded but no periph?");
/* NOTREACHED */
}
/* Send the enable lun message */
status = targendislun(path, /*enable*/1, grp6_len, grp7_len);
if (status != CAM_REQ_CMP) {
printf("enable lun failed, status %#x\n", status);
goto enable_fail;
}
softc->state |= TARG_STATE_LUN_ENABLED;
enable_fail:
return (status);
}
/* Disable this softc's target instance if enabled */
static cam_status
targdisable(struct targ_softc *softc)
{
cam_status status;
if ((softc->state & TARG_STATE_LUN_ENABLED) == 0)
return (CAM_REQ_CMP);
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targdisable\n"));
/* Abort any ccbs pending on the controller */
abort_all_pending(softc);
/* Disable this lun */
status = targendislun(softc->path, /*enable*/0,
/*grp6_len*/0, /*grp7_len*/0);
if (status == CAM_REQ_CMP)
softc->state &= ~TARG_STATE_LUN_ENABLED;
else
printf("Disable lun failed, status %#x\n", status);
return (status);
}
/* Initialize a periph (called from cam_periph_alloc) */
static cam_status
targctor(struct cam_periph *periph, void *arg)
{
struct targ_softc *softc;
/* Store pointer to softc for periph-driven routines */
softc = (struct targ_softc *)arg;
periph->softc = softc;
softc->periph = periph;
softc->path = periph->path;
return (CAM_REQ_CMP);
}
static void
targdtor(struct cam_periph *periph)
{
struct targ_softc *softc;
struct ccb_hdr *ccb_h;
struct targ_cmd_descr *descr;
softc = (struct targ_softc *)periph->softc;
/*
* targdisable() aborts CCBs back to the user and leaves them
* on user_ccb_queue and abort_queue in case the user is still
* interested in them. We free them now.
*/
while ((ccb_h = TAILQ_FIRST(&softc->user_ccb_queue)) != NULL) {
TAILQ_REMOVE(&softc->user_ccb_queue, ccb_h, periph_links.tqe);
targfreeccb(softc, (union ccb *)ccb_h);
}
while ((descr = TAILQ_FIRST(&softc->abort_queue)) != NULL) {
TAILQ_REMOVE(&softc->abort_queue, descr, tqe);
free(descr, M_TARG);
}
softc->periph = NULL;
softc->path = NULL;
periph->softc = NULL;
}
/* Receive CCBs from user mode proc and send them to the HBA */
static int
targwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
union ccb *user_ccb;
struct targ_softc *softc;
struct targ_cmd_descr *descr;
int write_len, error;
int func_code, priority;
devfs_get_cdevpriv((void **)&softc);
write_len = error = 0;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("write - uio_resid %zd\n", uio->uio_resid));
while (uio->uio_resid >= sizeof(user_ccb) && error == 0) {
union ccb *ccb;
error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio);
if (error != 0) {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("write - uiomove failed (%d)\n", error));
break;
}
priority = fuword32(&user_ccb->ccb_h.pinfo.priority);
if (priority == CAM_PRIORITY_NONE) {
error = EINVAL;
break;
}
func_code = fuword32(&user_ccb->ccb_h.func_code);
switch (func_code) {
case XPT_ACCEPT_TARGET_IO:
case XPT_IMMED_NOTIFY:
case XPT_IMMEDIATE_NOTIFY:
cam_periph_lock(softc->periph);
ccb = targgetccb(softc, func_code, priority);
descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr;
descr->user_ccb = user_ccb;
descr->func_code = func_code;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Sent ATIO/INOT (%p)\n", user_ccb));
xpt_action(ccb);
TAILQ_INSERT_TAIL(&softc->pending_ccb_queue,
&ccb->ccb_h,
periph_links.tqe);
cam_periph_unlock(softc->periph);
break;
default:
cam_periph_lock(softc->periph);
if ((func_code & XPT_FC_QUEUED) != 0) {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Sending queued ccb %#x (%p)\n",
func_code, user_ccb));
descr = targgetdescr(softc);
descr->user_ccb = user_ccb;
descr->priority = priority;
descr->func_code = func_code;
TAILQ_INSERT_TAIL(&softc->work_queue,
descr, tqe);
xpt_schedule(softc->periph, priority);
} else {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Sending inline ccb %#x (%p)\n",
func_code, user_ccb));
ccb = targgetccb(softc, func_code, priority);
descr = (struct targ_cmd_descr *)
ccb->ccb_h.targ_descr;
descr->user_ccb = user_ccb;
descr->priority = priority;
descr->func_code = func_code;
if (targusermerge(softc, descr, ccb) != EFAULT)
targsendccb(softc, ccb, descr);
targreturnccb(softc, ccb);
}
cam_periph_unlock(softc->periph);
break;
}
write_len += sizeof(user_ccb);
}
/*
* If we've successfully taken in some amount of
* data, return success for that data first. If
* an error is persistent, it will be reported
* on the next write.
*/
if (error != 0 && write_len == 0)
return (error);
if (write_len == 0 && uio->uio_resid != 0)
return (ENOSPC);
return (0);
}
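/*
 * Informal userland sketch of the write-side protocol above (the fd and
 * the prepare_atio() helper are hypothetical): the daemon fills in a
 * union ccb in its own memory and writes the *pointer* to it; the kernel
 * pulls the priority and func_code out with fuword32() and queues the
 * request:
 *
 *	union ccb *atio = prepare_atio();	// hypothetical helper
 *	atio->ccb_h.func_code = XPT_ACCEPT_TARGET_IO;
 *	atio->ccb_h.pinfo.priority = CAM_PRIORITY_NORMAL;
 *	(void)write(targ_fd, &atio, sizeof(atio));
 */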
/* Process requests (descrs) via the periph-supplied CCBs */
static void
targstart(struct cam_periph *periph, union ccb *start_ccb)
{
struct targ_softc *softc;
struct targ_cmd_descr *descr, *next_descr;
int error;
softc = (struct targ_softc *)periph->softc;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targstart %p\n", start_ccb));
descr = TAILQ_FIRST(&softc->work_queue);
if (descr == NULL) {
xpt_release_ccb(start_ccb);
} else {
TAILQ_REMOVE(&softc->work_queue, descr, tqe);
next_descr = TAILQ_FIRST(&softc->work_queue);
/* Initiate a transaction using the descr and supplied CCB */
error = targusermerge(softc, descr, start_ccb);
if (error == 0)
error = targsendccb(softc, start_ccb, descr);
if (error != 0) {
xpt_print(periph->path,
"targsendccb failed, err %d\n", error);
xpt_release_ccb(start_ccb);
suword(&descr->user_ccb->ccb_h.status,
CAM_REQ_CMP_ERR);
TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe);
notify_user(softc);
}
/* If we have more work to do, stay scheduled */
if (next_descr != NULL)
xpt_schedule(periph, next_descr->priority);
}
}
static int
targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr,
union ccb *ccb)
{
struct ccb_hdr *u_ccbh, *k_ccbh;
size_t ccb_len;
int error;
u_ccbh = &descr->user_ccb->ccb_h;
k_ccbh = &ccb->ccb_h;
/*
* There are some fields in the CCB header that need to be
* preserved, the rest we get from the user ccb. (See xpt_merge_ccb)
*/
xpt_setup_ccb(k_ccbh, softc->path, descr->priority);
k_ccbh->retry_count = fuword32(&u_ccbh->retry_count);
k_ccbh->func_code = descr->func_code;
k_ccbh->flags = fuword32(&u_ccbh->flags);
k_ccbh->timeout = fuword32(&u_ccbh->timeout);
ccb_len = targccblen(k_ccbh->func_code) - sizeof(struct ccb_hdr);
error = copyin(u_ccbh + 1, k_ccbh + 1, ccb_len);
if (error != 0) {
k_ccbh->status = CAM_REQ_CMP_ERR;
return (error);
}
/* Translate usermode abort_ccb pointer to its kernel counterpart */
if (k_ccbh->func_code == XPT_ABORT) {
struct ccb_abort *cab;
struct ccb_hdr *ccb_h;
cab = (struct ccb_abort *)ccb;
TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue,
periph_links.tqe) {
struct targ_cmd_descr *ab_descr;
ab_descr = (struct targ_cmd_descr *)ccb_h->targ_descr;
if (ab_descr->user_ccb == cab->abort_ccb) {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Changing abort for %p to %p\n",
cab->abort_ccb, ccb_h));
cab->abort_ccb = (union ccb *)ccb_h;
break;
}
}
/* CCB not found, set appropriate status */
if (ccb_h == NULL) {
k_ccbh->status = CAM_PATH_INVALID;
error = ESRCH;
}
}
return (error);
}
/* Build and send a kernel CCB formed from descr->user_ccb */
static int
targsendccb(struct targ_softc *softc, union ccb *ccb,
struct targ_cmd_descr *descr)
{
struct cam_periph_map_info *mapinfo;
struct ccb_hdr *ccb_h;
int error;
ccb_h = &ccb->ccb_h;
mapinfo = &descr->mapinfo;
mapinfo->num_bufs_used = 0;
/*
* There's no way for the user to have a completion
* function, so we put our own completion function in here.
* We also stash in a reference to our descriptor so targreturnccb()
* can find our mapping info.
*/
ccb_h->cbfcnp = targdone;
ccb_h->targ_descr = descr;
if ((ccb_h->func_code == XPT_CONT_TARGET_IO) ||
(ccb_h->func_code == XPT_DEV_MATCH)) {
error = cam_periph_mapmem(ccb, mapinfo, softc->maxio);
/*
* cam_periph_mapmem returned an error, we can't continue.
* Return the error to the user.
*/
if (error) {
ccb_h->status = CAM_REQ_CMP_ERR;
mapinfo->num_bufs_used = 0;
return (error);
}
}
/*
* Once queued on the pending CCB list, this CCB will be protected
* by our error recovery handler.
*/
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("sendccb %p\n", ccb));
if (XPT_FC_IS_QUEUED(ccb)) {
TAILQ_INSERT_TAIL(&softc->pending_ccb_queue, ccb_h,
periph_links.tqe);
}
xpt_action(ccb);
return (0);
}
/* Completion routine for CCBs (called at splsoftcam) */
static void
targdone(struct cam_periph *periph, union ccb *done_ccb)
{
struct targ_softc *softc;
cam_status status;
CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("targdone %p\n", done_ccb));
softc = (struct targ_softc *)periph->softc;
TAILQ_REMOVE(&softc->pending_ccb_queue, &done_ccb->ccb_h,
periph_links.tqe);
status = done_ccb->ccb_h.status & CAM_STATUS_MASK;
/* If we're no longer enabled, throw away CCB */
if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) {
targfreeccb(softc, done_ccb);
return;
}
/* abort_all_pending() waits for pending queue to be empty */
if (TAILQ_EMPTY(&softc->pending_ccb_queue))
wakeup(&softc->pending_ccb_queue);
switch (done_ccb->ccb_h.func_code) {
/* All FC_*_QUEUED CCBs go back to userland */
case XPT_IMMED_NOTIFY:
case XPT_IMMEDIATE_NOTIFY:
case XPT_ACCEPT_TARGET_IO:
case XPT_CONT_TARGET_IO:
TAILQ_INSERT_TAIL(&softc->user_ccb_queue, &done_ccb->ccb_h,
periph_links.tqe);
cam_periph_unlock(softc->periph);
notify_user(softc);
cam_periph_lock(softc->periph);
break;
default:
panic("targdone: impossible xpt opcode %#x",
done_ccb->ccb_h.func_code);
/* NOTREACHED */
}
}
/* Return CCBs to the user from the user queue and abort queue */
static int
targread(struct cdev *dev, struct uio *uio, int ioflag)
{
struct descr_queue *abort_queue;
struct targ_cmd_descr *user_descr;
struct targ_softc *softc;
struct ccb_queue *user_queue;
struct ccb_hdr *ccb_h;
union ccb *user_ccb;
int read_len, error;
error = 0;
read_len = 0;
devfs_get_cdevpriv((void **)&softc);
user_queue = &softc->user_ccb_queue;
abort_queue = &softc->abort_queue;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread\n"));
/* If no data is available, wait or return immediately */
cam_periph_lock(softc->periph);
ccb_h = TAILQ_FIRST(user_queue);
user_descr = TAILQ_FIRST(abort_queue);
while (ccb_h == NULL && user_descr == NULL) {
if ((ioflag & IO_NDELAY) == 0) {
error = cam_periph_sleep(softc->periph, user_queue,
PRIBIO | PCATCH, "targrd", 0);
ccb_h = TAILQ_FIRST(user_queue);
user_descr = TAILQ_FIRST(abort_queue);
if (error != 0) {
if (error == ERESTART) {
continue;
} else {
goto read_fail;
}
}
} else {
cam_periph_unlock(softc->periph);
return (EAGAIN);
}
}
/* Data is available so fill the user's buffer */
while (ccb_h != NULL) {
struct targ_cmd_descr *descr;
if (uio->uio_resid < sizeof(user_ccb))
break;
TAILQ_REMOVE(user_queue, ccb_h, periph_links.tqe);
descr = (struct targ_cmd_descr *)ccb_h->targ_descr;
user_ccb = descr->user_ccb;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("targread ccb %p (%p)\n", ccb_h, user_ccb));
error = targreturnccb(softc, (union ccb *)ccb_h);
if (error != 0)
goto read_fail;
cam_periph_unlock(softc->periph);
error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio);
cam_periph_lock(softc->periph);
if (error != 0)
goto read_fail;
read_len += sizeof(user_ccb);
ccb_h = TAILQ_FIRST(user_queue);
}
/* Flush out any aborted descriptors */
while (user_descr != NULL) {
if (uio->uio_resid < sizeof(user_ccb))
break;
TAILQ_REMOVE(abort_queue, user_descr, tqe);
user_ccb = user_descr->user_ccb;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("targread aborted descr %p (%p)\n",
user_descr, user_ccb));
suword(&user_ccb->ccb_h.status, CAM_REQ_ABORTED);
cam_periph_unlock(softc->periph);
error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio);
cam_periph_lock(softc->periph);
if (error != 0)
goto read_fail;
read_len += sizeof(user_ccb);
user_descr = TAILQ_FIRST(abort_queue);
}
/*
* If we've successfully read some amount of data, don't report an
* error. If the error is persistent, it will be reported on the
* next read().
*/
if (read_len == 0 && uio->uio_resid != 0)
error = ENOSPC;
read_fail:
cam_periph_unlock(softc->periph);
return (error);
}
/* Copy completed ccb back to the user */
static int
targreturnccb(struct targ_softc *softc, union ccb *ccb)
{
struct targ_cmd_descr *descr;
struct ccb_hdr *u_ccbh;
size_t ccb_len;
int error;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targreturnccb %p\n", ccb));
descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr;
u_ccbh = &descr->user_ccb->ccb_h;
/* Copy out the central portion of the ccb_hdr */
copyout(&ccb->ccb_h.retry_count, &u_ccbh->retry_count,
offsetof(struct ccb_hdr, periph_priv) -
offsetof(struct ccb_hdr, retry_count));
/* Copy out the rest of the ccb (after the ccb_hdr) */
ccb_len = targccblen(ccb->ccb_h.func_code) - sizeof(struct ccb_hdr);
if (descr->mapinfo.num_bufs_used != 0)
cam_periph_unmapmem(ccb, &descr->mapinfo);
error = copyout(&ccb->ccb_h + 1, u_ccbh + 1, ccb_len);
if (error != 0) {
xpt_print(softc->path,
"targreturnccb - CCB copyout failed (%d)\n", error);
}
/* Free CCB or send back to devq. */
targfreeccb(softc, ccb);
return (error);
}
static union ccb *
targgetccb(struct targ_softc *softc, xpt_opcode type, int priority)
{
union ccb *ccb;
int ccb_len;
ccb_len = targccblen(type);
ccb = malloc(ccb_len, M_TARG, M_NOWAIT);
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("getccb %p\n", ccb));
if (ccb == NULL) {
return (ccb);
}
xpt_setup_ccb(&ccb->ccb_h, softc->path, priority);
ccb->ccb_h.func_code = type;
ccb->ccb_h.cbfcnp = targdone;
ccb->ccb_h.targ_descr = targgetdescr(softc);
if (ccb->ccb_h.targ_descr == NULL) {
free (ccb, M_TARG);
ccb = NULL;
}
return (ccb);
}
static void
targfreeccb(struct targ_softc *softc, union ccb *ccb)
{
CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("targfreeccb descr %p and\n",
ccb->ccb_h.targ_descr));
free(ccb->ccb_h.targ_descr, M_TARG);
switch (ccb->ccb_h.func_code) {
case XPT_ACCEPT_TARGET_IO:
case XPT_IMMED_NOTIFY:
case XPT_IMMEDIATE_NOTIFY:
CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb));
free(ccb, M_TARG);
break;
default:
/* Send back CCB if we got it from the periph */
if (XPT_FC_IS_QUEUED(ccb)) {
CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH,
("returning queued ccb %p\n", ccb));
xpt_release_ccb(ccb);
} else {
CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH,
("freeing ccb %p\n", ccb));
free(ccb, M_TARG);
}
break;
}
}
static struct targ_cmd_descr *
targgetdescr(struct targ_softc *softc)
{
struct targ_cmd_descr *descr;
descr = malloc(sizeof(*descr), M_TARG,
M_NOWAIT);
if (descr) {
descr->mapinfo.num_bufs_used = 0;
}
return (descr);
}
static void
targinit(void)
{
struct cdev *dev;
/* Add symbolic link to targ0 for compatibility. */
dev = make_dev(&targ_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "targ");
make_dev_alias(dev, "targ0");
}
static void
targasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg)
{
/* All events are handled in usermode by INOTs */
panic("targasync() called, should be an INOT instead");
}
/* Cancel all pending requests and CCBs awaiting work. */
static void
abort_all_pending(struct targ_softc *softc)
{
struct targ_cmd_descr *descr;
struct ccb_abort cab;
struct ccb_hdr *ccb_h;
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("abort_all_pending\n"));
/* First abort the descriptors awaiting resources */
while ((descr = TAILQ_FIRST(&softc->work_queue)) != NULL) {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Aborting descr from workq %p\n", descr));
TAILQ_REMOVE(&softc->work_queue, descr, tqe);
TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe);
}
/*
* Then abort all pending CCBs.
* targdone() will return the aborted CCB via user_ccb_queue
*/
xpt_setup_ccb(&cab.ccb_h, softc->path, CAM_PRIORITY_NORMAL);
cab.ccb_h.func_code = XPT_ABORT;
cab.ccb_h.status = CAM_REQ_CMP_ERR;
TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue, periph_links.tqe) {
CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH,
("Aborting pending CCB %p\n", ccb_h));
cab.abort_ccb = (union ccb *)ccb_h;
xpt_action((union ccb *)&cab);
if (cab.ccb_h.status != CAM_REQ_CMP) {
xpt_print(cab.ccb_h.path,
"Unable to abort CCB, status %#x\n",
cab.ccb_h.status);
}
}
/* If we aborted at least one pending CCB ok, wait for it. */
if (cab.ccb_h.status == CAM_REQ_CMP) {
cam_periph_sleep(softc->periph, &softc->pending_ccb_queue,
PRIBIO | PCATCH, "tgabrt", 0);
}
/* If we aborted anything from the work queue, wakeup user. */
if (!TAILQ_EMPTY(&softc->user_ccb_queue)
|| !TAILQ_EMPTY(&softc->abort_queue)) {
cam_periph_unlock(softc->periph);
notify_user(softc);
cam_periph_lock(softc->periph);
}
}
/* Notify the user that data is ready */
static void
notify_user(struct targ_softc *softc)
{
/*
* Notify users sleeping via poll(), kqueue(), and
* blocking read().
*/
selwakeuppri(&softc->read_select, PRIBIO);
KNOTE_UNLOCKED(&softc->read_select.si_note, 0);
wakeup(&softc->user_ccb_queue);
}
/* Convert CAM status to errno values */
static int
targcamstatus(cam_status status)
{
switch (status & CAM_STATUS_MASK) {
case CAM_REQ_CMP: /* CCB request completed without error */
return (0);
case CAM_REQ_INPROG: /* CCB request is in progress */
return (EINPROGRESS);
case CAM_REQ_CMP_ERR: /* CCB request completed with an error */
return (EIO);
case CAM_PROVIDE_FAIL: /* Unable to provide requested capability */
return (ENOTTY);
case CAM_FUNC_NOTAVAIL: /* The requested function is not available */
return (ENOTSUP);
case CAM_LUN_ALRDY_ENA: /* LUN is already enabled for target mode */
return (EADDRINUSE);
case CAM_PATH_INVALID: /* Supplied Path ID is invalid */
case CAM_DEV_NOT_THERE: /* SCSI Device Not Installed/there */
return (ENOENT);
case CAM_REQ_ABORTED: /* CCB request aborted by the host */
return (ECANCELED);
case CAM_CMD_TIMEOUT: /* Command timeout */
return (ETIMEDOUT);
case CAM_REQUEUE_REQ: /* Requeue to preserve transaction ordering */
return (EAGAIN);
case CAM_REQ_INVALID: /* CCB request was invalid */
return (EINVAL);
case CAM_RESRC_UNAVAIL: /* Resource Unavailable */
return (ENOMEM);
case CAM_BUSY: /* CAM subsystem is busy */
case CAM_UA_ABORT: /* Unable to abort CCB request */
return (EBUSY);
default:
return (ENXIO);
}
}
static size_t
targccblen(xpt_opcode func_code)
{
int len;
/* Codes we expect to see as a target */
switch (func_code) {
case XPT_CONT_TARGET_IO:
case XPT_SCSI_IO:
len = sizeof(struct ccb_scsiio);
break;
case XPT_ACCEPT_TARGET_IO:
len = sizeof(struct ccb_accept_tio);
break;
case XPT_IMMED_NOTIFY:
len = sizeof(struct ccb_immed_notify);
break;
case XPT_IMMEDIATE_NOTIFY:
len = sizeof(struct ccb_immediate_notify);
break;
case XPT_REL_SIMQ:
len = sizeof(struct ccb_relsim);
break;
case XPT_PATH_INQ:
len = sizeof(struct ccb_pathinq);
break;
case XPT_DEBUG:
len = sizeof(struct ccb_debug);
break;
case XPT_ABORT:
len = sizeof(struct ccb_abort);
break;
case XPT_EN_LUN:
len = sizeof(struct ccb_en_lun);
break;
default:
len = sizeof(union ccb);
break;
}
return (len);
}
diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c
index 149598583159..79ffc4dfd5aa 100644
--- a/sys/compat/linprocfs/linprocfs.c
+++ b/sys/compat/linprocfs/linprocfs.c
@@ -1,2106 +1,2106 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
* Copyright (c) 1999 Pierre Beyssac
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/blist.h>
#include <sys/conf.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msg.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/resourcevar.h>
#include <sys/resource.h>
#include <sys/sbuf.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/tty.h>
#include <sys/user.h>
#include <sys/uuid.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/bus.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#include <machine/clock.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#if defined(__i386__) || defined(__amd64__)
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif /* __i386__ || __amd64__ */
#include <compat/linux/linux.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_util.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
/*
* Various conversion macros
*/
#define T2J(x) ((long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to jiffies */
#define T2CS(x) ((unsigned long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to centiseconds */
#define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */
#define B2K(x) ((x) >> 10) /* bytes to kbytes */
#define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */
#define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */
#define TV2J(x) ((x)->tv_sec * 100UL + (x)->tv_usec / 10000)
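/*
 * Worked examples for the conversions above, assuming stathz == 128 and
 * PAGE_SHIFT == 12 (typical values, but both are configuration-dependent):
 * T2J(256) == 256 * 100 / 128 == 200 jiffies, B2K(8192) == 8 kB, and
 * P2K(3) == 3 << 2 == 12 kB.
 */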
/**
* @brief Mapping of ki_stat in struct kinfo_proc to the linux state
*
* The linux procfs state field displays one of the characters RSDZTW to
* denote running, sleeping in an interruptible wait, waiting in an
* uninterruptible disk sleep, a zombie process, process is being traced
* or stopped, or process is paging respectively.
*
* Our struct kinfo_proc contains the variable ki_stat which contains a
* value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK.
*
* This character array is used with ki_stat - 1 as an index and tries to
* map our states to suitable Linux states.
*/
static char linux_state[] = "RRSTZDD";
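/*
 * Example of the mapping above: SSLEEP is the third state in the list
 * given in the comment, so ki_stat == SSLEEP indexes linux_state[2] and
 * is reported to Linux programs as 'S' (interruptible sleep).
 */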
/*
* Filler function for proc/meminfo
*/
static int
linprocfs_domeminfo(PFS_FILL_ARGS)
{
unsigned long memtotal; /* total memory in bytes */
unsigned long memfree; /* free memory in bytes */
unsigned long cached; /* page cache */
unsigned long buffers; /* buffer cache */
unsigned long long swaptotal; /* total swap space in bytes */
unsigned long long swapused; /* used swap space in bytes */
unsigned long long swapfree; /* free swap space in bytes */
size_t sz;
int error, i, j;
memtotal = physmem * PAGE_SIZE;
memfree = (unsigned long)vm_free_count() * PAGE_SIZE;
swap_pager_status(&i, &j);
swaptotal = (unsigned long long)i * PAGE_SIZE;
swapused = (unsigned long long)j * PAGE_SIZE;
swapfree = swaptotal - swapused;
/*
* This value may exclude wired pages, but we have no good way of
* accounting for that.
*/
cached =
(vm_active_count() + vm_inactive_count() + vm_laundry_count()) *
PAGE_SIZE;
sz = sizeof(buffers);
error = kernel_sysctlbyname(curthread, "vfs.bufspace", &buffers, &sz,
NULL, 0, 0, 0);
if (error != 0)
buffers = 0;
sbuf_printf(sb,
"MemTotal: %9lu kB\n"
"MemFree: %9lu kB\n"
"Buffers: %9lu kB\n"
"Cached: %9lu kB\n"
"SwapTotal:%9llu kB\n"
"SwapFree: %9llu kB\n",
B2K(memtotal), B2K(memfree), B2K(buffers),
B2K(cached), B2K(swaptotal), B2K(swapfree));
return (0);
}
#if defined(__i386__) || defined(__amd64__)
/*
* Filler function for proc/cpuinfo (i386 & amd64 version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
int hw_model[2];
char model[128];
uint64_t freq;
size_t size;
u_int cache_size[4];
int fqmhz, fqkhz;
int i, j;
/*
* We default the flags to include all non-conflicting flags,
* and the Intel versions of conflicting flags.
*/
static char *cpu_feature_names[] = {
/* 0 */ "fpu", "vme", "de", "pse",
/* 4 */ "tsc", "msr", "pae", "mce",
/* 8 */ "cx8", "apic", "", "sep",
/* 12 */ "mtrr", "pge", "mca", "cmov",
/* 16 */ "pat", "pse36", "pn", "clflush",
/* 20 */ "", "dts", "acpi", "mmx",
/* 24 */ "fxsr", "sse", "sse2", "ss",
/* 28 */ "ht", "tm", "ia64", "pbe"
};
static char *amd_feature_names[] = {
/* 0 */ "", "", "", "",
/* 4 */ "", "", "", "",
/* 8 */ "", "", "", "syscall",
/* 12 */ "", "", "", "",
/* 16 */ "", "", "", "mp",
/* 20 */ "nx", "", "mmxext", "",
/* 24 */ "", "fxsr_opt", "pdpe1gb", "rdtscp",
/* 28 */ "", "lm", "3dnowext", "3dnow"
};
static char *cpu_feature2_names[] = {
/* 0 */ "pni", "pclmulqdq", "dtes64", "monitor",
/* 4 */ "ds_cpl", "vmx", "smx", "est",
/* 8 */ "tm2", "ssse3", "cid", "sdbg",
/* 12 */ "fma", "cx16", "xtpr", "pdcm",
/* 16 */ "", "pcid", "dca", "sse4_1",
/* 20 */ "sse4_2", "x2apic", "movbe", "popcnt",
/* 24 */ "tsc_deadline_timer", "aes", "xsave", "",
/* 28 */ "avx", "f16c", "rdrand", "hypervisor"
};
static char *amd_feature2_names[] = {
/* 0 */ "lahf_lm", "cmp_legacy", "svm", "extapic",
/* 4 */ "cr8_legacy", "abm", "sse4a", "misalignsse",
/* 8 */ "3dnowprefetch", "osvw", "ibs", "xop",
/* 12 */ "skinit", "wdt", "", "lwp",
/* 16 */ "fma4", "tce", "", "nodeid_msr",
/* 20 */ "", "tbm", "topoext", "perfctr_core",
/* 24 */ "perfctr_nb", "", "bpext", "ptsc",
/* 28 */ "perfctr_llc", "mwaitx", "", ""
};
static char *cpu_stdext_feature_names[] = {
/* 0 */ "fsgsbase", "tsc_adjust", "", "bmi1",
/* 4 */ "hle", "avx2", "", "smep",
/* 8 */ "bmi2", "erms", "invpcid", "rtm",
/* 12 */ "cqm", "", "mpx", "rdt_a",
/* 16 */ "avx512f", "avx512dq", "rdseed", "adx",
/* 20 */ "smap", "avx512ifma", "", "clflushopt",
/* 24 */ "clwb", "intel_pt", "avx512pf", "avx512er",
/* 28 */ "avx512cd", "sha_ni", "avx512bw", "avx512vl"
};
static char *power_flags[] = {
"ts", "fid", "vid",
"ttp", "tm", "stc",
"100mhzsteps", "hwpstate", "",
"cpb", "eff_freq_ro", "proc_feedback",
"acc_power",
};
hw_model[0] = CTL_HW;
hw_model[1] = HW_MODEL;
model[0] = '\0';
size = sizeof(model);
if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0)
strcpy(model, "unknown");
#ifdef __i386__
switch (cpu_vendor_id) {
case CPU_VENDOR_AMD:
if (cpu_class < CPUCLASS_686)
cpu_feature_names[16] = "fcmov";
break;
case CPU_VENDOR_CYRIX:
cpu_feature_names[24] = "cxmmx";
break;
}
#endif
if (cpu_exthigh >= 0x80000006)
do_cpuid(0x80000006, cache_size);
else
memset(cache_size, 0, sizeof(cache_size));
for (i = 0; i < mp_ncpus; ++i) {
fqmhz = 0;
fqkhz = 0;
freq = atomic_load_acq_64(&tsc_freq);
if (freq != 0) {
fqmhz = (freq + 4999) / 1000000;
fqkhz = ((freq + 4999) / 10000) % 100;
}
sbuf_printf(sb,
"processor\t: %d\n"
"vendor_id\t: %.20s\n"
"cpu family\t: %u\n"
"model\t\t: %u\n"
"model name\t: %s\n"
"stepping\t: %u\n"
"cpu MHz\t\t: %d.%02d\n"
"cache size\t: %d KB\n"
"physical id\t: %d\n"
"siblings\t: %d\n"
"core id\t\t: %d\n"
"cpu cores\t: %d\n"
"apicid\t\t: %d\n"
"initial apicid\t: %d\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: %s\n",
i, cpu_vendor, CPUID_TO_FAMILY(cpu_id),
CPUID_TO_MODEL(cpu_id), model, cpu_id & CPUID_STEPPING,
fqmhz, fqkhz,
(cache_size[2] >> 16), 0, mp_ncpus, i, mp_ncpus,
i, i, /*cpu_id & CPUID_LOCAL_APIC_ID ??*/
(cpu_feature & CPUID_FPU) ? "yes" : "no", "yes",
CPUID_TO_FAMILY(cpu_id), "yes");
sbuf_cat(sb, "flags\t\t:");
for (j = 0; j < nitems(cpu_feature_names); j++)
if (cpu_feature & (1 << j) &&
cpu_feature_names[j][0] != '\0')
sbuf_printf(sb, " %s", cpu_feature_names[j]);
for (j = 0; j < nitems(amd_feature_names); j++)
if (amd_feature & (1 << j) &&
amd_feature_names[j][0] != '\0')
sbuf_printf(sb, " %s", amd_feature_names[j]);
for (j = 0; j < nitems(cpu_feature2_names); j++)
if (cpu_feature2 & (1 << j) &&
cpu_feature2_names[j][0] != '\0')
sbuf_printf(sb, " %s", cpu_feature2_names[j]);
for (j = 0; j < nitems(amd_feature2_names); j++)
if (amd_feature2 & (1 << j) &&
amd_feature2_names[j][0] != '\0')
sbuf_printf(sb, " %s", amd_feature2_names[j]);
for (j = 0; j < nitems(cpu_stdext_feature_names); j++)
if (cpu_stdext_feature & (1 << j) &&
cpu_stdext_feature_names[j][0] != '\0')
sbuf_printf(sb, " %s",
cpu_stdext_feature_names[j]);
sbuf_cat(sb, "\n");
sbuf_printf(sb,
"bugs\t\t: %s\n"
"bogomips\t: %d.%02d\n"
"clflush size\t: %d\n"
"cache_alignment\t: %d\n"
"address sizes\t: %d bits physical, %d bits virtual\n",
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
(has_f00f_bug) ? "Intel F00F" : "",
#else
"",
#endif
fqmhz * 2, fqkhz,
cpu_clflush_line_size, cpu_clflush_line_size,
cpu_maxphyaddr,
(cpu_maxphyaddr > 32) ? 48 : 0);
sbuf_cat(sb, "power management: ");
for (j = 0; j < nitems(power_flags); j++)
if (amd_pminfo & (1 << j))
sbuf_printf(sb, " %s", power_flags[j]);
sbuf_cat(sb, "\n\n");
/* XXX per-cpu vendor / class / model / id? */
}
sbuf_cat(sb, "\n");
return (0);
}
#else
/* ARM64TODO: implement non-stubbed linprocfs_docpuinfo */
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
int i;
for (i = 0; i < mp_ncpus; ++i) {
sbuf_printf(sb,
"processor\t: %d\n"
"BogoMIPS\t: %d.%02d\n",
i, 0, 0);
sbuf_cat(sb, "Features\t: ");
sbuf_cat(sb, "\n");
sbuf_printf(sb,
"CPU implementer\t: \n"
"CPU architecture: \n"
"CPU variant\t: 0x%x\n"
"CPU part\t: 0x%x\n"
"CPU revision\t: %d\n",
0, 0, 0);
sbuf_cat(sb, "\n");
}
return (0);
}
#endif /* __i386__ || __amd64__ */
static const char *path_slash_sys = "/sys";
static const char *fstype_sysfs = "sysfs";
static int
_mtab_helper(const struct pfs_node *pn, const struct statfs *sp,
const char **mntfrom, const char **mntto, const char **fstype)
{
/* determine device name */
*mntfrom = sp->f_mntfromname;
/* determine mount point */
*mntto = sp->f_mntonname;
/* determine fs type */
*fstype = sp->f_fstypename;
if (strcmp(*fstype, pn->pn_info->pi_name) == 0)
*mntfrom = *fstype = "proc";
else if (strcmp(*fstype, "procfs") == 0)
return (ECANCELED);
if (strcmp(*fstype, "autofs") == 0) {
/*
* FreeBSD uses e.g. "map -hosts", whereas Linux
* expects just "-hosts".
*/
if (strncmp(*mntfrom, "map ", 4) == 0)
*mntfrom += 4;
}
if (strcmp(*fstype, "linsysfs") == 0) {
*mntfrom = path_slash_sys;
*fstype = fstype_sysfs;
} else {
/* For Linux msdosfs is called vfat */
if (strcmp(*fstype, "msdosfs") == 0)
*fstype = "vfat";
}
return (0);
}
static void
_sbuf_mntoptions_helper(struct sbuf *sb, uint64_t f_flags)
{
sbuf_cat(sb, (f_flags & MNT_RDONLY) ? "ro" : "rw");
#define ADD_OPTION(opt, name) \
if (f_flags & (opt)) sbuf_cat(sb, "," name);
ADD_OPTION(MNT_SYNCHRONOUS, "sync");
ADD_OPTION(MNT_NOEXEC, "noexec");
ADD_OPTION(MNT_NOSUID, "nosuid");
ADD_OPTION(MNT_UNION, "union");
ADD_OPTION(MNT_ASYNC, "async");
ADD_OPTION(MNT_SUIDDIR, "suiddir");
ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow");
ADD_OPTION(MNT_NOATIME, "noatime");
#undef ADD_OPTION
}
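/*
 * For example, a mount whose f_flags has MNT_RDONLY and MNT_NOATIME set
 * comes out of the helper above as "ro,noatime"; a plain read-write
 * mount with no extra flags is just "rw".
 */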
/*
* Filler function for proc/mtab and proc/<pid>/mounts.
*
* /proc/mtab doesn't exist in Linux's procfs, but is included here so
* users can symlink /compat/linux/etc/mtab to /proc/mtab.
*/
static int
linprocfs_domtab(PFS_FILL_ARGS)
{
struct nameidata nd;
const char *lep, *mntto, *mntfrom, *fstype;
char *dlep, *flep;
size_t lep_len;
int error;
struct statfs *buf, *sp;
size_t count;
/* resolve symlinks etc. in the emulation tree prefix */
/*
* Ideally, this would use the current chroot rather than some
* hardcoded path.
*/
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td);
flep = NULL;
error = namei(&nd);
lep = linux_emul_path;
if (error == 0) {
if (vn_fullpath(nd.ni_vp, &dlep, &flep) == 0)
lep = dlep;
vrele(nd.ni_vp);
}
lep_len = strlen(lep);
buf = NULL;
error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count,
UIO_SYSSPACE, MNT_WAIT);
if (error != 0) {
free(buf, M_TEMP);
free(flep, M_TEMP);
return (error);
}
for (sp = buf; count > 0; sp++, count--) {
error = _mtab_helper(pn, sp, &mntfrom, &mntto, &fstype);
if (error != 0) {
MPASS(error == ECANCELED);
continue;
}
/* determine mount point */
if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/')
mntto += lep_len;
sbuf_printf(sb, "%s %s %s ", mntfrom, mntto, fstype);
_sbuf_mntoptions_helper(sb, sp->f_flags);
/* a real Linux mtab will also show NFS options */
sbuf_printf(sb, " 0 0\n");
}
free(buf, M_TEMP);
free(flep, M_TEMP);
return (error);
}
static int
linprocfs_doprocmountinfo(PFS_FILL_ARGS)
{
struct nameidata nd;
const char *mntfrom, *mntto, *fstype;
const char *lep;
char *dlep, *flep;
struct statfs *buf, *sp;
size_t count, lep_len;
int error;
/*
* Ideally, this would use the current chroot rather than some
* hardcoded path.
*/
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td);
flep = NULL;
error = namei(&nd);
lep = linux_emul_path;
if (error == 0) {
if (vn_fullpath(nd.ni_vp, &dlep, &flep) == 0)
lep = dlep;
vrele(nd.ni_vp);
}
lep_len = strlen(lep);
buf = NULL;
error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count,
UIO_SYSSPACE, MNT_WAIT);
if (error != 0)
goto out;
for (sp = buf; count > 0; sp++, count--) {
error = _mtab_helper(pn, sp, &mntfrom, &mntto, &fstype);
if (error != 0) {
MPASS(error == ECANCELED);
continue;
}
if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/')
mntto += lep_len;
#if 0
/*
* If the prefix is a chroot, and this mountpoint is not under
* the prefix, we should skip it. Leave it for now for
* consistency with procmtab above.
*/
else
continue;
#endif
/*
* (1) mount id
*
* (2) parent mount id -- we don't have this cheaply, so
* provide a dummy value
*
* (3) major:minor -- ditto
*
* (4) root filesystem mount -- probably a namespaces thing
*
* (5) mountto path
*/
sbuf_printf(sb, "%u 0 0:0 / %s ",
sp->f_fsid.val[0] ^ sp->f_fsid.val[1], mntto);
/* (6) mount options */
_sbuf_mntoptions_helper(sb, sp->f_flags);
/*
* (7) zero or more optional fields -- again, namespace related
*
* (8) End of variable length fields separator ("-")
*
* (9) fstype
*
* (10) mount from
*
* (11) "superblock" options -- like (6), but different
* semantics in Linux
*/
sbuf_printf(sb, " - %s %s %s\n", fstype, mntfrom,
(sp->f_flags & MNT_RDONLY) ? "ro" : "rw");
}
error = 0;
out:
free(buf, M_TEMP);
free(flep, M_TEMP);
return (error);
}
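/*
 * Putting the numbered fields above together, a single mountinfo line
 * produced by this filler looks roughly like (device name and fsid value
 * are made up for illustration):
 *
 *	173 0 0:0 / /usr rw,noatime - ufs /dev/ada0p2 rw
 */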
/*
* Filler function for proc/partitions
*/
static int
linprocfs_dopartitions(PFS_FILL_ARGS)
{
struct g_class *cp;
struct g_geom *gp;
struct g_provider *pp;
int major, minor;
g_topology_lock();
sbuf_printf(sb, "major minor #blocks name rio rmerge rsect "
"ruse wio wmerge wsect wuse running use aveq\n");
LIST_FOREACH(cp, &g_classes, class) {
if (strcmp(cp->name, "DISK") == 0 ||
strcmp(cp->name, "PART") == 0)
LIST_FOREACH(gp, &cp->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
if (linux_driver_get_major_minor(
pp->name, &major, &minor) != 0) {
major = 0;
minor = 0;
}
sbuf_printf(sb, "%d %d %lld %s "
"%d %d %d %d %d "
"%d %d %d %d %d %d\n",
major, minor,
(long long)pp->mediasize, pp->name,
0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0);
}
}
}
g_topology_unlock();
return (0);
}
/*
* Filler function for proc/stat
*
* Output depends on kernel version:
*
* v2.5.40 <=
* user nice system idle
* v2.5.41
* user nice system idle iowait
* v2.6.11
* user nice system idle iowait irq softirq steal
* v2.6.24
* user nice system idle iowait irq softirq steal guest
* v2.6.33 >=
* user nice system idle iowait irq softirq steal guest guest_nice
*/
static int
linprocfs_dostat(PFS_FILL_ARGS)
{
struct pcpu *pcpu;
long cp_time[CPUSTATES];
long *cp;
struct timeval boottime;
int i;
char *zero_pad;
bool has_intr = true;
if (linux_kernver(td) >= LINUX_KERNVER(2,6,33)) {
zero_pad = " 0 0 0 0\n";
} else if (linux_kernver(td) >= LINUX_KERNVER(2,6,24)) {
zero_pad = " 0 0 0\n";
} else if (linux_kernver(td) >= LINUX_KERNVER(2,6,11)) {
zero_pad = " 0 0\n";
} else if (linux_kernver(td) >= LINUX_KERNVER(2,5,41)) {
has_intr = false;
zero_pad = " 0\n";
} else {
has_intr = false;
zero_pad = "\n";
}
read_cpu_time(cp_time);
getboottime(&boottime);
/* Parameters common to all versions */
sbuf_printf(sb, "cpu %lu %lu %lu %lu",
T2J(cp_time[CP_USER]),
T2J(cp_time[CP_NICE]),
T2J(cp_time[CP_SYS]),
T2J(cp_time[CP_IDLE]));
/* Print interrupt stats if available */
if (has_intr) {
sbuf_printf(sb, " 0 %lu", T2J(cp_time[CP_INTR]));
}
/* Pad out remaining fields depending on version */
sbuf_printf(sb, "%s", zero_pad);
CPU_FOREACH(i) {
pcpu = pcpu_find(i);
cp = pcpu->pc_cp_time;
sbuf_printf(sb, "cpu%d %lu %lu %lu %lu", i,
T2J(cp[CP_USER]),
T2J(cp[CP_NICE]),
T2J(cp[CP_SYS]),
T2J(cp[CP_IDLE]));
if (has_intr) {
sbuf_printf(sb, " 0 %lu", T2J(cp[CP_INTR]));
}
sbuf_printf(sb, "%s", zero_pad);
}
sbuf_printf(sb,
"disk 0 0 0 0\n"
"page %ju %ju\n"
"swap %ju %ju\n"
"intr %ju\n"
"ctxt %ju\n"
"btime %lld\n",
(uintmax_t)VM_CNT_FETCH(v_vnodepgsin),
(uintmax_t)VM_CNT_FETCH(v_vnodepgsout),
(uintmax_t)VM_CNT_FETCH(v_swappgsin),
(uintmax_t)VM_CNT_FETCH(v_swappgsout),
(uintmax_t)VM_CNT_FETCH(v_intr),
(uintmax_t)VM_CNT_FETCH(v_swtch),
(long long)boottime.tv_sec);
return (0);
}
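/*
 * Example of the version-dependent padding above: for an emulated kernel
 * version >= 2.6.33 the aggregate line carries ten fields
 * (user nice system idle iowait irq softirq steal guest guest_nice),
 * e.g. "cpu 1234 0 567 89012 0 34 0 0 0 0" with made-up tick counts;
 * iowait, softirq, steal, guest and guest_nice are always reported as 0.
 */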
static int
linprocfs_doswaps(PFS_FILL_ARGS)
{
struct xswdev xsw;
uintmax_t total, used;
int n;
char devname[SPECNAMELEN + 1];
sbuf_printf(sb, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
for (n = 0; ; n++) {
if (swap_dev_info(n, &xsw, devname, sizeof(devname)) != 0)
break;
total = (uintmax_t)xsw.xsw_nblks * PAGE_SIZE / 1024;
used = (uintmax_t)xsw.xsw_used * PAGE_SIZE / 1024;
/*
* The space (rather than a tab) after the device name is
* intentional; Linux formats it the same way.
*/
sbuf_printf(sb, "/dev/%-34s unknown\t\t%jd\t%jd\t-1\n",
devname, total, used);
}
return (0);
}
/*
* Filler function for proc/uptime
*/
static int
linprocfs_douptime(PFS_FILL_ARGS)
{
long cp_time[CPUSTATES];
struct timeval tv;
getmicrouptime(&tv);
read_cpu_time(cp_time);
sbuf_printf(sb, "%lld.%02ld %ld.%02lu\n",
(long long)tv.tv_sec, tv.tv_usec / 10000,
T2S(cp_time[CP_IDLE] / mp_ncpus),
T2CS(cp_time[CP_IDLE] / mp_ncpus) % 100);
return (0);
}
/*
* Get OS build date
*/
static void
linprocfs_osbuild(struct thread *td, struct sbuf *sb)
{
#if 0
char osbuild[256];
char *cp1, *cp2;
strncpy(osbuild, version, 256);
osbuild[255] = '\0';
cp1 = strstr(osbuild, "\n");
cp2 = strstr(osbuild, ":");
if (cp1 && cp2) {
*cp1 = *cp2 = '\0';
cp1 = strstr(osbuild, "#");
} else
cp1 = NULL;
if (cp1)
sbuf_printf(sb, "%s%s", cp1, cp2 + 1);
else
#endif
sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977");
}
/*
* Get OS builder
*/
static void
linprocfs_osbuilder(struct thread *td, struct sbuf *sb)
{
#if 0
char builder[256];
char *cp;
cp = strstr(version, "\n ");
if (cp) {
strncpy(builder, cp + 5, 256);
builder[255] = '\0';
cp = strstr(builder, ":");
if (cp)
*cp = '\0';
}
if (cp)
sbuf_cat(sb, builder);
else
#endif
sbuf_cat(sb, "des@freebsd.org");
}
/*
* Filler function for proc/version
*/
static int
linprocfs_doversion(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osname(td, osname);
linux_get_osrelease(td, osrelease);
sbuf_printf(sb, "%s version %s (", osname, osrelease);
linprocfs_osbuilder(td, sb);
sbuf_cat(sb, ") (gcc version " __VERSION__ ") ");
linprocfs_osbuild(td, sb);
sbuf_cat(sb, "\n");
return (0);
}
/*
* Filler function for proc/loadavg
*/
static int
linprocfs_doloadavg(PFS_FILL_ARGS)
{
sbuf_printf(sb,
"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
(int)(averunnable.ldavg[0] / averunnable.fscale),
(int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[1] / averunnable.fscale),
(int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
nprocs, /* number of tasks */
lastpid /* the last pid */
);
return (0);
}
static int
linprocfs_get_tty_nr(struct proc *p)
{
struct session *sp;
const char *ttyname;
int error, major, minor, nr;
PROC_LOCK_ASSERT(p, MA_OWNED);
sx_assert(&proctree_lock, SX_LOCKED);
if ((p->p_flag & P_CONTROLT) == 0)
return (-1);
sp = p->p_pgrp->pg_session;
if (sp == NULL)
return (-1);
ttyname = devtoname(sp->s_ttyp->t_dev);
error = linux_driver_get_major_minor(ttyname, &major, &minor);
if (error != 0)
return (-1);
nr = makedev(major, minor);
return (nr);
}
/*
* Filler function for proc/pid/stat
*/
static int
linprocfs_doprocstat(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
struct timeval boottime;
char state;
static int ratelimit = 0;
int tty_nr;
vm_offset_t startcode, startdata;
getboottime(&boottime);
sx_slock(&proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
tty_nr = linprocfs_get_tty_nr(p);
sx_sunlock(&proctree_lock);
if (p->p_vmspace) {
startcode = (vm_offset_t)p->p_vmspace->vm_taddr;
startdata = (vm_offset_t)p->p_vmspace->vm_daddr;
} else {
startcode = 0;
startdata = 0;
}
sbuf_printf(sb, "%d", p->p_pid);
#define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg)
PS_ADD("comm", "(%s)", p->p_comm);
if (kp.ki_stat > sizeof(linux_state)) {
state = 'R';
if (ratelimit == 0) {
printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n",
kp.ki_stat, sizeof(linux_state));
++ratelimit;
}
} else
state = linux_state[kp.ki_stat - 1];
PS_ADD("state", "%c", state);
PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0);
PS_ADD("pgrp", "%d", p->p_pgid);
PS_ADD("session", "%d", p->p_session->s_sid);
PROC_UNLOCK(p);
PS_ADD("tty", "%d", tty_nr);
PS_ADD("tpgid", "%d", kp.ki_tpgid);
PS_ADD("flags", "%u", 0); /* XXX */
PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt);
PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt);
PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt);
PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt);
PS_ADD("utime", "%ld", TV2J(&kp.ki_rusage.ru_utime));
PS_ADD("stime", "%ld", TV2J(&kp.ki_rusage.ru_stime));
PS_ADD("cutime", "%ld", TV2J(&kp.ki_rusage_ch.ru_utime));
PS_ADD("cstime", "%ld", TV2J(&kp.ki_rusage_ch.ru_stime));
PS_ADD("priority", "%d", kp.ki_pri.pri_user);
PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */
PS_ADD("0", "%d", 0); /* removed field */
PS_ADD("itrealvalue", "%d", 0); /* XXX */
PS_ADD("starttime", "%lu", TV2J(&kp.ki_start) - TV2J(&boottime));
PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size));
PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize);
PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss);
PS_ADD("startcode", "%ju", (uintmax_t)startcode);
PS_ADD("endcode", "%ju", (uintmax_t)startdata);
PS_ADD("startstack", "%u", 0); /* XXX */
PS_ADD("kstkesp", "%u", 0); /* XXX */
PS_ADD("kstkeip", "%u", 0); /* XXX */
PS_ADD("signal", "%u", 0); /* XXX */
PS_ADD("blocked", "%u", 0); /* XXX */
PS_ADD("sigignore", "%u", 0); /* XXX */
PS_ADD("sigcatch", "%u", 0); /* XXX */
PS_ADD("wchan", "%u", 0); /* XXX */
PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap);
PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap);
PS_ADD("exitsignal", "%d", 0); /* XXX */
PS_ADD("processor", "%u", kp.ki_lastcpu);
PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */
PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */
#undef PS_ADD
sbuf_putc(sb, '\n');
return (0);
}
/*
* Filler function for proc/pid/statm
*/
static int
linprocfs_doprocstatm(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
segsz_t lsize;
sx_slock(&proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
/*
* See comments in linprocfs_doprocstatus() regarding the
* computation of lsize.
*/
/* size resident share trs drs lrs dt */
sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size));
sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize);
sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */
sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize);
sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "%ju ", (uintmax_t)lsize);
sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */
return (0);
}
/*
* Filler function for proc/pid/status
*/
static int
linprocfs_doprocstatus(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
char *state;
segsz_t lsize;
struct thread *td2;
struct sigacts *ps;
l_sigset_t siglist, sigignore, sigcatch;
int i;
sx_slock(&proctree_lock);
PROC_LOCK(p);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
if (P_SHOULDSTOP(p)) {
state = "T (stopped)";
} else {
switch(p->p_state) {
case PRS_NEW:
state = "I (idle)";
break;
case PRS_NORMAL:
if (p->p_flag & P_WEXIT) {
state = "X (exiting)";
break;
}
switch(td2->td_state) {
case TDS_INHIBITED:
state = "S (sleeping)";
break;
case TDS_RUNQ:
case TDS_RUNNING:
state = "R (running)";
break;
default:
state = "? (unknown)";
break;
}
break;
case PRS_ZOMBIE:
state = "Z (zombie)";
break;
default:
state = "? (unknown)";
break;
}
}
fill_kinfo_proc(p, &kp);
sx_sunlock(&proctree_lock);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
/*
* Credentials
*/
sbuf_printf(sb, "Tgid:\t%d\n", p->p_pid);
sbuf_printf(sb, "Pid:\t%d\n", p->p_pid);
sbuf_printf(sb, "PPid:\t%d\n", kp.ki_ppid );
sbuf_printf(sb, "TracerPid:\t%d\n", kp.ki_tracer );
sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid,
p->p_ucred->cr_uid,
p->p_ucred->cr_svuid,
/* FreeBSD doesn't have fsuid */
p->p_ucred->cr_uid);
sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid,
p->p_ucred->cr_gid,
p->p_ucred->cr_svgid,
/* FreeBSD doesn't have fsgid */
p->p_ucred->cr_gid);
sbuf_cat(sb, "Groups:\t");
for (i = 0; i < p->p_ucred->cr_ngroups; i++)
sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]);
PROC_UNLOCK(p);
sbuf_putc(sb, '\n');
/*
* Memory
*
* While our approximation of VmLib may not be accurate (I
* don't know of a simple way to verify it, and I'm not sure
* it has much meaning anyway), I believe it's good enough.
*
* The same code that could (I think) accurately compute VmLib
* could also compute VmLck, but I don't really care enough to
* implement it. Submissions are welcome.
*/
sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size));
sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */
sbuf_printf(sb, "VmRSS:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize));
sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize));
sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize));
sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize));
/*
* Signal masks
*/
PROC_LOCK(p);
bsd_to_linux_sigset(&p->p_siglist, &siglist);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
bsd_to_linux_sigset(&ps->ps_sigignore, &sigignore);
bsd_to_linux_sigset(&ps->ps_sigcatch, &sigcatch);
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
sbuf_printf(sb, "SigPnd:\t%016jx\n", siglist.__mask);
/*
* XXX. SigBlk - target thread's signal mask, td_sigmask.
* To implement SigBlk, pseudofs should support proc/tid dir entries.
*/
sbuf_printf(sb, "SigBlk:\t%016x\n", 0);
sbuf_printf(sb, "SigIgn:\t%016jx\n", sigignore.__mask);
sbuf_printf(sb, "SigCgt:\t%016jx\n", sigcatch.__mask);
/*
* Linux also prints the capability masks, but we don't have
* capabilities yet, and when we do get them they're likely to
* be meaningless to Linux programs, so we lie. XXX
*/
sbuf_printf(sb, "CapInh:\t%016x\n", 0);
sbuf_printf(sb, "CapPrm:\t%016x\n", 0);
sbuf_printf(sb, "CapEff:\t%016x\n", 0);
return (0);
}
/*
* Filler function for proc/pid/cwd
*/
static int
linprocfs_doproccwd(PFS_FILL_ARGS)
{
struct pwd *pwd;
char *fullpath = "unknown";
char *freepath = NULL;
pwd = pwd_hold(td);
vn_fullpath(pwd->pwd_cdir, &fullpath, &freepath);
sbuf_printf(sb, "%s", fullpath);
if (freepath)
free(freepath, M_TEMP);
pwd_drop(pwd);
return (0);
}
/*
* Filler function for proc/pid/root
*/
static int
linprocfs_doprocroot(PFS_FILL_ARGS)
{
struct pwd *pwd;
struct vnode *vp;
char *fullpath = "unknown";
char *freepath = NULL;
pwd = pwd_hold(td);
vp = jailed(p->p_ucred) ? pwd->pwd_jdir : pwd->pwd_rdir;
vn_fullpath(vp, &fullpath, &freepath);
sbuf_printf(sb, "%s", fullpath);
if (freepath)
free(freepath, M_TEMP);
pwd_drop(pwd);
return (0);
}
/*
* Filler function for proc/pid/cmdline
*/
static int
linprocfs_doproccmdline(PFS_FILL_ARGS)
{
int ret;
PROC_LOCK(p);
if ((ret = p_cansee(td, p)) != 0) {
PROC_UNLOCK(p);
return (ret);
}
/*
* Mimic linux behavior and pass only processes with usermode
* address space as valid. Return zero silently otherwise.
*/
if (p->p_vmspace == &vmspace0) {
PROC_UNLOCK(p);
return (0);
}
if (p->p_args != NULL) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
return (0);
}
if ((p->p_flag & P_SYSTEM) != 0) {
PROC_UNLOCK(p);
return (0);
}
PROC_UNLOCK(p);
ret = proc_getargv(td, p, sb);
return (ret);
}
/*
* Filler function for proc/pid/environ
*/
static int
linprocfs_doprocenviron(PFS_FILL_ARGS)
{
/*
* Mimic linux behavior and pass only processes with usermode
* address space as valid. Return zero silently otherwise.
*/
if (p->p_vmspace == &vmspace0)
return (0);
return (proc_getenvv(td, p, sb));
}
static char l32_map_str[] = "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n";
static char l64_map_str[] = "%016lx-%016lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n";
static char vdso_str[] = " [vdso]";
static char stack_str[] = " [stack]";
/*
* Filler function for proc/pid/maps
*/
static int
linprocfs_doprocmaps(PFS_FILL_ARGS)
{
struct vmspace *vm;
vm_map_t map;
vm_map_entry_t entry, tmp_entry;
vm_object_t obj, tobj, lobj;
vm_offset_t e_start, e_end;
vm_ooffset_t off;
vm_prot_t e_prot;
unsigned int last_timestamp;
char *name = "", *freename = NULL;
const char *l_map_str;
ino_t ino;
int ref_count, shadow_count, flags;
int error;
struct vnode *vp;
struct vattr vat;
bool private;
PROC_LOCK(p);
error = p_candebug(td, p);
PROC_UNLOCK(p);
if (error)
return (error);
if (uio->uio_rw != UIO_READ)
return (EOPNOTSUPP);
error = 0;
vm = vmspace_acquire_ref(p);
if (vm == NULL)
return (ESRCH);
if (SV_CURPROC_FLAG(SV_LP64))
l_map_str = l64_map_str;
else
l_map_str = l32_map_str;
map = &vm->vm_map;
vm_map_lock_read(map);
VM_MAP_ENTRY_FOREACH(entry, map) {
name = "";
freename = NULL;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
e_prot = entry->protection;
e_start = entry->start;
e_end = entry->end;
obj = entry->object.vm_object;
off = entry->offset;
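/*
 * Walk down the chain of backing objects to the bottom-most one,
 * accumulating the file offset along the way, so the backing vnode
 * (if any) can be reported for this mapping.
 */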
for (lobj = tobj = obj; tobj != NULL;
lobj = tobj, tobj = tobj->backing_object) {
VM_OBJECT_RLOCK(tobj);
off += lobj->backing_object_offset;
if (lobj != obj)
VM_OBJECT_RUNLOCK(lobj);
}
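/*
 * Report the mapping as private ("p") when it is copy-on-write or
 * anonymous, and as shared ("s") otherwise.
 */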
private = (entry->eflags & MAP_ENTRY_COW) != 0 || obj == NULL ||
(obj->flags & OBJ_ANON) != 0;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
ino = 0;
if (lobj) {
vp = vm_object_vnode(lobj);
if (vp != NULL)
vref(vp);
if (lobj != obj)
VM_OBJECT_RUNLOCK(lobj);
flags = obj->flags;
ref_count = obj->ref_count;
shadow_count = obj->shadow_count;
VM_OBJECT_RUNLOCK(obj);
if (vp != NULL) {
vn_fullpath(vp, &name, &freename);
vn_lock(vp, LK_SHARED | LK_RETRY);
VOP_GETATTR(vp, &vat, td->td_ucred);
ino = vat.va_fileid;
vput(vp);
} else if (SV_PROC_ABI(p) == SV_ABI_LINUX) {
if (e_start == p->p_sysent->sv_shared_page_base)
name = vdso_str;
if (e_end == p->p_sysent->sv_usrstack)
name = stack_str;
}
} else {
flags = 0;
ref_count = 0;
shadow_count = 0;
}
/*
* format:
* start, end, access, offset, major, minor, inode, name.
*/
error = sbuf_printf(sb, l_map_str,
(u_long)e_start, (u_long)e_end,
(e_prot & VM_PROT_READ)?"r":"-",
(e_prot & VM_PROT_WRITE)?"w":"-",
(e_prot & VM_PROT_EXECUTE)?"x":"-",
private ? "p" : "s",
(u_long)off,
0,
0,
(u_long)ino,
*name ? " " : " ",
name
);
if (freename)
free(freename, M_TEMP);
vm_map_lock_read(map);
if (error == -1) {
error = 0;
break;
}
if (last_timestamp != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked. Specifically,
* the entry may have been clipped, merged, or deleted.
*/
vm_map_lookup_entry(map, e_end - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
vmspace_free(vm);
return (error);
}
/*
* Filler function for proc/pid/mem
*/
static int
linprocfs_doprocmem(PFS_FILL_ARGS)
{
ssize_t resid;
int error;
resid = uio->uio_resid;
error = procfs_doprocmem(PFS_FILL_ARGNAMES);
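/*
 * If the read transferred any data, report success instead of a
 * partial-failure errno; otherwise map EFAULT to EIO, which is the
 * errno Linux uses for inaccessible /proc/<pid>/mem addresses.
 */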
if (uio->uio_rw == UIO_READ && resid != uio->uio_resid)
return (0);
if (error == EFAULT)
error = EIO;
return (error);
}
/*
* Criteria for interface name translation
*/
#define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER)
static int
linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen)
{
struct ifnet *ifscan;
int ethno;
IFNET_RLOCK_ASSERT();
/* Short-circuit non-ethernet interfaces */
if (!IFP_IS_ETH(ifp))
return (strlcpy(buffer, ifp->if_xname, buflen));
/* Determine the (relative) unit number for ethernet interfaces */
ethno = 0;
CK_STAILQ_FOREACH(ifscan, &V_ifnet, if_link) {
if (ifscan == ifp)
return (snprintf(buffer, buflen, "eth%d", ethno));
if (IFP_IS_ETH(ifscan))
ethno++;
}
return (0);
}
/*
* Filler function for proc/net/dev
*/
static int
linprocfs_donetdev(PFS_FILL_ARGS)
{
char ifname[16]; /* XXX LINUX_IFNAMSIZ */
struct ifnet *ifp;
sbuf_printf(sb, "%6s|%58s|%s\n"
"%6s|%58s|%58s\n",
"Inter-", " Receive", " Transmit",
" face",
"bytes packets errs drop fifo frame compressed multicast",
"bytes packets errs drop fifo colls carrier compressed");
CURVNET_SET(TD_TO_VNET(curthread));
IFNET_RLOCK();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
linux_ifname(ifp, ifname, sizeof ifname);
sbuf_printf(sb, "%6.6s: ", ifname);
sbuf_printf(sb, "%7ju %7ju %4ju %4ju %4lu %5lu %10lu %9ju ",
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IBYTES),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IERRORS),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS),
/* rx_missed_errors */
0UL, /* rx_fifo_errors */
0UL, /* rx_length_errors +
* rx_over_errors +
* rx_crc_errors +
* rx_frame_errors */
0UL, /* rx_compressed */
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS));
/* XXX-BZ rx only? */
sbuf_printf(sb, "%8ju %7ju %4ju %4ju %4lu %5ju %7lu %10lu\n",
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OBYTES),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OERRORS),
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS),
0UL, /* tx_fifo_errors */
(uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS),
0UL, /* tx_carrier_errors +
* tx_aborted_errors +
* tx_window_errors +
* tx_heartbeat_errors*/
0UL); /* tx_compressed */
}
IFNET_RUNLOCK();
CURVNET_RESTORE();
return (0);
}
/*
* Filler function for proc/sys/kernel/osrelease
*/
static int
linprocfs_doosrelease(PFS_FILL_ARGS)
{
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osrelease(td, osrelease);
sbuf_printf(sb, "%s\n", osrelease);
return (0);
}
/*
* Filler function for proc/sys/kernel/ostype
*/
static int
linprocfs_doostype(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
linux_get_osname(td, osname);
sbuf_printf(sb, "%s\n", osname);
return (0);
}
/*
* Filler function for proc/sys/kernel/version
*/
static int
linprocfs_doosbuild(PFS_FILL_ARGS)
{
linprocfs_osbuild(td, sb);
sbuf_cat(sb, "\n");
return (0);
}
/*
* Filler function for proc/sys/kernel/msgmax
*/
static int
linprocfs_domsgmax(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", msginfo.msgmax);
return (0);
}
/*
* Filler function for proc/sys/kernel/msgmni
*/
static int
linprocfs_domsgmni(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", msginfo.msgmni);
return (0);
}
/*
* Filler function for proc/sys/kernel/msgmnb
*/
static int
linprocfs_domsgmnb(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", msginfo.msgmnb);
return (0);
}
/*
* Filler function for proc/sys/kernel/ngroups_max
*
* Note that in Linux it defaults to 65536, not 1023.
*/
static int
linprocfs_dongroups_max(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", ngroups_max);
return (0);
}
/*
* Filler function for proc/sys/kernel/pid_max
*/
static int
linprocfs_dopid_max(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%i\n", PID_MAX);
return (0);
}
/*
* Filler function for proc/sys/kernel/sem
*/
static int
linprocfs_dosem(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns,
seminfo.semopm, seminfo.semmni);
return (0);
}
/*
* Filler function for proc/sys/kernel/shmall
*/
static int
linprocfs_doshmall(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%lu\n", shminfo.shmall);
return (0);
}
/*
* Filler function for proc/sys/kernel/shmmax
*/
static int
linprocfs_doshmmax(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%lu\n", shminfo.shmmax);
return (0);
}
/*
* Filler function for proc/sys/kernel/shmmni
*/
static int
linprocfs_doshmmni(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%lu\n", shminfo.shmmni);
return (0);
}
/*
* Filler function for proc/sys/kernel/tainted
*/
static int
linprocfs_dotainted(PFS_FILL_ARGS)
{
sbuf_printf(sb, "0\n");
return (0);
}
/*
* Filler function for proc/sys/vm/min_free_kbytes
*
* This mirrors the approach in illumos of returning zero for reads. Effectively,
* it says that no memory is kept in reserve for "atomic allocations". This class
* of allocation can be used at times when a thread cannot be suspended.
*/
static int
linprocfs_dominfree(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", 0);
return (0);
}
/*
* Filler function for proc/scsi/device_info
*/
static int
linprocfs_doscsidevinfo(PFS_FILL_ARGS)
{
return (0);
}
/*
* Filler function for proc/scsi/scsi
*/
static int
linprocfs_doscsiscsi(PFS_FILL_ARGS)
{
return (0);
}
/*
* Filler function for proc/devices
*/
static int
linprocfs_dodevices(PFS_FILL_ARGS)
{
char *char_devices;
sbuf_printf(sb, "Character devices:\n");
char_devices = linux_get_char_devices();
sbuf_printf(sb, "%s", char_devices);
linux_free_get_char_devices(char_devices);
sbuf_printf(sb, "\nBlock devices:\n");
return (0);
}
/*
* Filler function for proc/cmdline
*/
static int
linprocfs_docmdline(PFS_FILL_ARGS)
{
sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname);
sbuf_printf(sb, " ro root=302\n");
return (0);
}
/*
* Filler function for proc/filesystems
*/
static int
linprocfs_dofilesystems(PFS_FILL_ARGS)
{
struct vfsconf *vfsp;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (vfsp->vfc_flags & VFCF_SYNTHETIC)
sbuf_printf(sb, "nodev");
sbuf_printf(sb, "\t%s\n", vfsp->vfc_name);
}
vfsconf_sunlock();
return(0);
}
/*
* Filler function for proc/modules
*/
static int
linprocfs_domodules(PFS_FILL_ARGS)
{
#if 0
struct linker_file *lf;
TAILQ_FOREACH(lf, &linker_files, link) {
sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename,
(unsigned long)lf->size, lf->refs);
}
#endif
return (0);
}
/*
* Filler function for proc/pid/fd
*/
static int
linprocfs_dofdescfs(PFS_FILL_ARGS)
{
if (p == curproc)
sbuf_printf(sb, "/dev/fd");
else
sbuf_printf(sb, "unknown");
return (0);
}
/*
* Filler function for proc/pid/limits
*/
static const struct linux_rlimit_ident {
const char *desc;
const char *unit;
unsigned int rlim_id;
} linux_rlimits_ident[] = {
{ "Max cpu time", "seconds", RLIMIT_CPU },
{ "Max file size", "bytes", RLIMIT_FSIZE },
{ "Max data size", "bytes", RLIMIT_DATA },
{ "Max stack size", "bytes", RLIMIT_STACK },
{ "Max core file size", "bytes", RLIMIT_CORE },
{ "Max resident set", "bytes", RLIMIT_RSS },
{ "Max processes", "processes", RLIMIT_NPROC },
{ "Max open files", "files", RLIMIT_NOFILE },
{ "Max locked memory", "bytes", RLIMIT_MEMLOCK },
{ "Max address space", "bytes", RLIMIT_AS },
{ "Max file locks", "locks", LINUX_RLIMIT_LOCKS },
{ "Max pending signals", "signals", LINUX_RLIMIT_SIGPENDING },
{ "Max msgqueue size", "bytes", LINUX_RLIMIT_MSGQUEUE },
{ "Max nice priority", "", LINUX_RLIMIT_NICE },
{ "Max realtime priority", "", LINUX_RLIMIT_RTPRIO },
{ "Max realtime timeout", "us", LINUX_RLIMIT_RTTIME },
{ 0, 0, 0 }
};
static int
linprocfs_doproclimits(PFS_FILL_ARGS)
{
const struct linux_rlimit_ident *li;
struct plimit *limp;
struct rlimit rl;
ssize_t size;
int res, error;
error = 0;
PROC_LOCK(p);
limp = lim_hold(p->p_limit);
PROC_UNLOCK(p);
size = sizeof(res);
sbuf_printf(sb, "%-26s%-21s%-21s%-21s\n", "Limit", "Soft Limit",
"Hard Limit", "Units");
for (li = linux_rlimits_ident; li->desc != NULL; ++li) {
switch (li->rlim_id)
{
case LINUX_RLIMIT_LOCKS:
/* FALLTHROUGH */
case LINUX_RLIMIT_RTTIME:
rl.rlim_cur = RLIM_INFINITY;
break;
case LINUX_RLIMIT_SIGPENDING:
error = kernel_sysctlbyname(td,
"kern.sigqueue.max_pending_per_proc",
&res, &size, 0, 0, 0, 0);
if (error != 0)
goto out;
rl.rlim_cur = res;
rl.rlim_max = res;
break;
case LINUX_RLIMIT_MSGQUEUE:
error = kernel_sysctlbyname(td,
"kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0);
if (error != 0)
goto out;
rl.rlim_cur = res;
rl.rlim_max = res;
break;
case LINUX_RLIMIT_NICE:
/* FALLTHROUGH */
case LINUX_RLIMIT_RTPRIO:
rl.rlim_cur = 0;
rl.rlim_max = 0;
break;
default:
rl = limp->pl_rlimit[li->rlim_id];
break;
}
if (rl.rlim_cur == RLIM_INFINITY)
sbuf_printf(sb, "%-26s%-21s%-21s%-10s\n",
li->desc, "unlimited", "unlimited", li->unit);
else
sbuf_printf(sb, "%-26s%-21llu%-21llu%-10s\n",
li->desc, (unsigned long long)rl.rlim_cur,
(unsigned long long)rl.rlim_max, li->unit);
}
out:
lim_free(limp);
return (error);
}
/*
* The point of the following two functions is to work around
* an assertion in Chromium; see kern/240991 for details.
*/
static int
linprocfs_dotaskattr(PFS_ATTR_ARGS)
{
vap->va_nlink = 3;
return (0);
}
/*
* Filler function for proc/<pid>/task/.dummy
*/
static int
linprocfs_dotaskdummy(PFS_FILL_ARGS)
{
return (0);
}
/*
* Filler function for proc/sys/kernel/random/uuid
*/
static int
linprocfs_douuid(PFS_FILL_ARGS)
{
struct uuid uuid;
kern_uuidgen(&uuid, 1);
sbuf_printf_uuid(sb, &uuid);
sbuf_printf(sb, "\n");
return(0);
}
/*
* Filler function for proc/pid/auxv
*/
static int
linprocfs_doauxv(PFS_FILL_ARGS)
{
struct sbuf *asb;
off_t buflen, resid;
int error;
/*
* Mimic linux behavior and pass only processes with usermode
* address space as valid. Return zero silently otherwise.
*/
if (p->p_vmspace == &vmspace0)
return (0);
if (uio->uio_resid == 0)
return (0);
if (uio->uio_offset < 0 || uio->uio_resid < 0)
return (EINVAL);
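/*
 * Render the full auxiliary vector into a temporary sbuf, then copy out
 * only the window selected by the caller's offset and residual count.
 */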
asb = sbuf_new_auto();
if (asb == NULL)
return (ENOMEM);
error = proc_getauxv(td, p, asb);
if (error == 0)
error = sbuf_finish(asb);
resid = sbuf_len(asb) - uio->uio_offset;
if (resid > uio->uio_resid)
buflen = uio->uio_resid;
else
buflen = resid;
if (buflen > IOSIZE_MAX)
return (EINVAL);
- if (buflen > MAXPHYS)
- buflen = MAXPHYS;
+ if (buflen > maxphys)
+ buflen = maxphys;
if (resid <= 0)
return (0);
if (error == 0)
error = uiomove(sbuf_data(asb) + uio->uio_offset, buflen, uio);
sbuf_delete(asb);
return (error);
}
/*
* Constructor
*/
static int
linprocfs_init(PFS_INIT_ARGS)
{
struct pfs_node *root;
struct pfs_node *dir;
struct pfs_node *sys;
root = pi->pi_root;
/* /proc/... */
pfs_create_file(root, "cmdline", &linprocfs_docmdline,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "devices", &linprocfs_dodevices,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "filesystems", &linprocfs_dofilesystems,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "loadavg", &linprocfs_doloadavg,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "meminfo", &linprocfs_domeminfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "modules", &linprocfs_domodules,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "mounts", &linprocfs_domtab,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "mtab", &linprocfs_domtab,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "partitions", &linprocfs_dopartitions,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(root, "self", &procfs_docurproc,
NULL, NULL, NULL, 0);
pfs_create_file(root, "stat", &linprocfs_dostat,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "swaps", &linprocfs_doswaps,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "uptime", &linprocfs_douptime,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "version", &linprocfs_doversion,
NULL, NULL, NULL, PFS_RD);
/* /proc/bus/... */
dir = pfs_create_dir(root, "bus", NULL, NULL, NULL, 0);
dir = pfs_create_dir(dir, "pci", NULL, NULL, NULL, 0);
dir = pfs_create_dir(dir, "devices", NULL, NULL, NULL, 0);
/* /proc/net/... */
dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0);
pfs_create_file(dir, "dev", &linprocfs_donetdev,
NULL, NULL, NULL, PFS_RD);
/* /proc/<pid>/... */
dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP);
pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(dir, "cwd", &linprocfs_doproccwd,
NULL, NULL, NULL, 0);
pfs_create_file(dir, "environ", &linprocfs_doprocenviron,
NULL, &procfs_candebug, NULL, PFS_RD);
pfs_create_link(dir, "exe", &procfs_doprocfile,
NULL, &procfs_notsystem, NULL, 0);
pfs_create_file(dir, "maps", &linprocfs_doprocmaps,
NULL, NULL, NULL, PFS_RD | PFS_AUTODRAIN);
pfs_create_file(dir, "mem", &linprocfs_doprocmem,
procfs_attr_rw, &procfs_candebug, NULL, PFS_RDWR | PFS_RAW);
pfs_create_file(dir, "mountinfo", &linprocfs_doprocmountinfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "mounts", &linprocfs_domtab,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(dir, "root", &linprocfs_doprocroot,
NULL, NULL, NULL, 0);
pfs_create_file(dir, "stat", &linprocfs_doprocstat,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "statm", &linprocfs_doprocstatm,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "status", &linprocfs_doprocstatus,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(dir, "fd", &linprocfs_dofdescfs,
NULL, NULL, NULL, 0);
pfs_create_file(dir, "auxv", &linprocfs_doauxv,
NULL, &procfs_candebug, NULL, PFS_RD|PFS_RAWRD);
pfs_create_file(dir, "limits", &linprocfs_doproclimits,
NULL, NULL, NULL, PFS_RD);
/* /proc/<pid>/task/... */
dir = pfs_create_dir(dir, "task", linprocfs_dotaskattr, NULL, NULL, 0);
pfs_create_file(dir, ".dummy", &linprocfs_dotaskdummy,
NULL, NULL, NULL, PFS_RD);
/* /proc/scsi/... */
dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0);
pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi,
NULL, NULL, NULL, PFS_RD);
/* /proc/sys/... */
sys = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0);
/* /proc/sys/kernel/... */
dir = pfs_create_dir(sys, "kernel", NULL, NULL, NULL, 0);
pfs_create_file(dir, "osrelease", &linprocfs_doosrelease,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "ostype", &linprocfs_doostype,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "version", &linprocfs_doosbuild,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "msgmax", &linprocfs_domsgmax,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "msgmni", &linprocfs_domsgmni,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "msgmnb", &linprocfs_domsgmnb,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "ngroups_max", &linprocfs_dongroups_max,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "pid_max", &linprocfs_dopid_max,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "sem", &linprocfs_dosem,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "shmall", &linprocfs_doshmall,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "shmmax", &linprocfs_doshmmax,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "shmmni", &linprocfs_doshmmni,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "tainted", &linprocfs_dotainted,
NULL, NULL, NULL, PFS_RD);
/* /proc/sys/kernel/random/... */
dir = pfs_create_dir(dir, "random", NULL, NULL, NULL, 0);
pfs_create_file(dir, "uuid", &linprocfs_douuid,
NULL, NULL, NULL, PFS_RD);
/* /proc/sys/vm/.... */
dir = pfs_create_dir(sys, "vm", NULL, NULL, NULL, 0);
pfs_create_file(dir, "min_free_kbytes", &linprocfs_dominfree,
NULL, NULL, NULL, PFS_RD);
return (0);
}
/*
* Destructor
*/
static int
linprocfs_uninit(PFS_INIT_ARGS)
{
/* nothing to do, pseudofs will GC */
return (0);
}
PSEUDOFS(linprocfs, 1, VFCF_JAIL);
#if defined(__aarch64__) || defined(__amd64__)
MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1);
#else
MODULE_DEPEND(linprocfs, linux, 1, 1, 1);
#endif
MODULE_DEPEND(linprocfs, procfs, 1, 1, 1);
MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1);
MODULE_DEPEND(linprocfs, sysvshm, 1, 1, 1);
diff --git a/sys/compat/linux/linux_ioctl.c b/sys/compat/linux/linux_ioctl.c
index b454a4308bd7..62cb958aa42f 100644
--- a/sys/compat/linux/linux_ioctl.c
+++ b/sys/compat/linux/linux_ioctl.c
@@ -1,3806 +1,3806 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#ifdef COMPAT_LINUX32
#include <sys/abi_compat.h>
#endif
#include <sys/capsicum.h>
#include <sys/cdio.h>
#include <sys/dvdio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/consio.h>
#include <sys/ctype.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kbio.h>
#include <sys/kcov.h>
#include <sys/kernel.h>
#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/soundcard.h>
#include <sys/stdint.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/uio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <dev/evdev/input.h>
#include <dev/usb/usb_ioctl.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_common.h>
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>
#include <contrib/v4l/videodev.h>
#include <compat/linux/linux_videodev_compat.h>
#include <contrib/v4l/videodev2.h>
#include <compat/linux/linux_videodev2_compat.h>
#include <cam/scsi/scsi_sg.h>
CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ);
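/*
 * Declare the ioctl handler for a given command range and register it in
 * the linux_ioctl_handler_set linker set used by the Linux ioctl dispatch
 * code.
 */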
#define DEFINE_LINUX_IOCTL_SET(shortname, SHORTNAME) \
static linux_ioctl_function_t linux_ioctl_ ## shortname; \
static struct linux_ioctl_handler shortname ## _handler = { \
.func = linux_ioctl_ ## shortname, \
.low = LINUX_IOCTL_ ## SHORTNAME ## _MIN, \
.high = LINUX_IOCTL_ ## SHORTNAME ## _MAX, \
}; \
DATA_SET(linux_ioctl_handler_set, shortname ## _handler)
DEFINE_LINUX_IOCTL_SET(cdrom, CDROM);
DEFINE_LINUX_IOCTL_SET(vfat, VFAT);
DEFINE_LINUX_IOCTL_SET(console, CONSOLE);
DEFINE_LINUX_IOCTL_SET(hdio, HDIO);
DEFINE_LINUX_IOCTL_SET(disk, DISK);
DEFINE_LINUX_IOCTL_SET(socket, SOCKET);
DEFINE_LINUX_IOCTL_SET(sound, SOUND);
DEFINE_LINUX_IOCTL_SET(termio, TERMIO);
DEFINE_LINUX_IOCTL_SET(private, PRIVATE);
DEFINE_LINUX_IOCTL_SET(drm, DRM);
DEFINE_LINUX_IOCTL_SET(sg, SG);
DEFINE_LINUX_IOCTL_SET(v4l, VIDEO);
DEFINE_LINUX_IOCTL_SET(v4l2, VIDEO2);
DEFINE_LINUX_IOCTL_SET(fbsd_usb, FBSD_LUSB);
DEFINE_LINUX_IOCTL_SET(evdev, EVDEV);
DEFINE_LINUX_IOCTL_SET(kcov, KCOV);
#undef DEFINE_LINUX_IOCTL_SET
static int linux_ioctl_special(struct thread *, struct linux_ioctl_args *);
/*
* Keep sorted by low.
*/
static struct linux_ioctl_handler linux_ioctls[] = {
{ .func = linux_ioctl_termio, .low = LINUX_IOCTL_TERMIO_MIN,
.high = LINUX_IOCTL_TERMIO_MAX },
};
#ifdef __i386__
static TAILQ_HEAD(, linux_ioctl_handler_element) linux_ioctl_handlers =
TAILQ_HEAD_INITIALIZER(linux_ioctl_handlers);
static struct sx linux_ioctl_sx;
SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "Linux ioctl handlers");
#else
extern TAILQ_HEAD(, linux_ioctl_handler_element) linux_ioctl_handlers;
extern struct sx linux_ioctl_sx;
#endif
#ifdef COMPAT_LINUX32
static TAILQ_HEAD(, linux_ioctl_handler_element) linux32_ioctl_handlers =
TAILQ_HEAD_INITIALIZER(linux32_ioctl_handlers);
#endif
/*
* hdio related ioctls for VMWare support
*/
struct linux_hd_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int16_t cylinders;
u_int32_t start;
};
struct linux_hd_big_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int32_t cylinders;
u_int32_t start;
};
static int
linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize, fwcylinders, fwheads, fwsectors;
off_t mediasize, bytespercyl;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_HDIO_GET_GEO:
case LINUX_HDIO_GET_GEO_BIG:
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWHEADS,
(caddr_t)&fwheads, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWSECTORS,
(caddr_t)&fwsectors, td->td_ucred, td);
/*
* XXX: DIOCGFIRSTOFFSET is not yet implemented, so pretend
* that GEOM always says 0. This is NOT VALID
* for slices or partitions, only the per-disk raw devices.
*/
fdrop(fp, td);
if (error)
return (error);
/*
* 1. Calculate the number of bytes in a cylinder,
* given the firmware's notion of heads and sectors
* per cylinder.
* 2. Calculate the number of cylinders, given the total
* size of the media.
* All internal calculations should have 64-bit precision.
*/
bytespercyl = (off_t) sectorsize * fwheads * fwsectors;
fwcylinders = mediasize / bytespercyl;
if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) {
struct linux_hd_geometry hdg;
hdg.cylinders = fwcylinders;
hdg.heads = fwheads;
hdg.sectors = fwsectors;
hdg.start = 0;
error = copyout(&hdg, (void *)args->arg, sizeof(hdg));
} else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) {
struct linux_hd_big_geometry hdbg;
memset(&hdbg, 0, sizeof(hdbg));
hdbg.cylinders = fwcylinders;
hdbg.heads = fwheads;
hdbg.sectors = fwsectors;
hdbg.start = 0;
error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg));
}
return (error);
break;
default:
/* XXX */
linux_msg(td,
"%s fd=%d, cmd=0x%x ('%c',%d) is not implemented",
__func__, args->fd, args->cmd,
(int)(args->cmd & 0xff00) >> 8,
(int)(args->cmd & 0xff));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
static int
linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize, psectorsize;
uint64_t blksize64;
off_t mediasize, stripesize;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_BLKGETSIZE:
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
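/* Reuse "sectorsize" to return the media size expressed in device sectors. */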
sectorsize = mediasize / sectorsize;
/*
* XXX: How do we know we return the right size of integer?
*/
return (copyout(&sectorsize, (void *)args->arg,
sizeof(sectorsize)));
break;
case LINUX_BLKGETSIZE64:
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
blksize64 = mediasize;
return (copyout(&blksize64, (void *)args->arg,
sizeof(blksize64)));
case LINUX_BLKSSZGET:
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
return (copyout(&sectorsize, (void *)args->arg,
sizeof(sectorsize)));
break;
case LINUX_BLKPBSZGET:
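/*
 * Report the GEOM stripe size as the physical block size when it is
 * set and no larger than 4 KB; otherwise fall back to the logical
 * sector size.
 */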
error = fo_ioctl(fp, DIOCGSTRIPESIZE,
(caddr_t)&stripesize, td->td_ucred, td);
if (error != 0) {
fdrop(fp, td);
return (error);
}
if (stripesize > 0 && stripesize <= 4096) {
psectorsize = stripesize;
} else {
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (error != 0) {
fdrop(fp, td);
return (error);
}
psectorsize = sectorsize;
}
fdrop(fp, td);
return (copyout(&psectorsize, (void *)args->arg,
sizeof(psectorsize)));
}
fdrop(fp, td);
return (ENOIOCTL);
}
/*
* termio related ioctls
*/
struct linux_termio {
unsigned short c_iflag;
unsigned short c_oflag;
unsigned short c_cflag;
unsigned short c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCC];
};
struct linux_termios {
unsigned int c_iflag;
unsigned int c_oflag;
unsigned int c_cflag;
unsigned int c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCCS];
};
struct linux_winsize {
unsigned short ws_row, ws_col;
unsigned short ws_xpixel, ws_ypixel;
};
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ B0, LINUX_B0 }, { B50, LINUX_B50 },
{ B75, LINUX_B75 }, { B110, LINUX_B110 },
{ B134, LINUX_B134 }, { B150, LINUX_B150 },
{ B200, LINUX_B200 }, { B300, LINUX_B300 },
{ B600, LINUX_B600 }, { B1200, LINUX_B1200 },
{ B1800, LINUX_B1800 }, { B2400, LINUX_B2400 },
{ B4800, LINUX_B4800 }, { B9600, LINUX_B9600 },
{ B19200, LINUX_B19200 }, { B38400, LINUX_B38400 },
{ B57600, LINUX_B57600 }, { B115200, LINUX_B115200 },
{-1, -1 }
};
struct linux_serial_struct {
int type;
int line;
int port;
int irq;
int flags;
int xmit_fifo_size;
int custom_divisor;
int baud_base;
unsigned short close_delay;
char reserved_char[2];
int hub6;
unsigned short closing_wait;
unsigned short closing_wait2;
int reserved[4];
};
static int
linux_to_bsd_speed(int code, struct speedtab *table)
{
for ( ; table->sp_code != -1; table++)
if (table->sp_code == code)
return (table->sp_speed);
return (-1);
}
static int
bsd_to_linux_speed(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
static void
bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios)
{
int i;
lios->c_iflag = 0;
if (bios->c_iflag & IGNBRK)
lios->c_iflag |= LINUX_IGNBRK;
if (bios->c_iflag & BRKINT)
lios->c_iflag |= LINUX_BRKINT;
if (bios->c_iflag & IGNPAR)
lios->c_iflag |= LINUX_IGNPAR;
if (bios->c_iflag & PARMRK)
lios->c_iflag |= LINUX_PARMRK;
if (bios->c_iflag & INPCK)
lios->c_iflag |= LINUX_INPCK;
if (bios->c_iflag & ISTRIP)
lios->c_iflag |= LINUX_ISTRIP;
if (bios->c_iflag & INLCR)
lios->c_iflag |= LINUX_INLCR;
if (bios->c_iflag & IGNCR)
lios->c_iflag |= LINUX_IGNCR;
if (bios->c_iflag & ICRNL)
lios->c_iflag |= LINUX_ICRNL;
if (bios->c_iflag & IXON)
lios->c_iflag |= LINUX_IXON;
if (bios->c_iflag & IXANY)
lios->c_iflag |= LINUX_IXANY;
if (bios->c_iflag & IXOFF)
lios->c_iflag |= LINUX_IXOFF;
if (bios->c_iflag & IMAXBEL)
lios->c_iflag |= LINUX_IMAXBEL;
lios->c_oflag = 0;
if (bios->c_oflag & OPOST)
lios->c_oflag |= LINUX_OPOST;
if (bios->c_oflag & ONLCR)
lios->c_oflag |= LINUX_ONLCR;
if (bios->c_oflag & TAB3)
lios->c_oflag |= LINUX_XTABS;
lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab);
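/*
 * FreeBSD keeps the CSIZE (character size) bits at 0x300 while Linux
 * keeps them at 0x30, hence the four-bit shift when translating.
 */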
lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4;
if (bios->c_cflag & CSTOPB)
lios->c_cflag |= LINUX_CSTOPB;
if (bios->c_cflag & CREAD)
lios->c_cflag |= LINUX_CREAD;
if (bios->c_cflag & PARENB)
lios->c_cflag |= LINUX_PARENB;
if (bios->c_cflag & PARODD)
lios->c_cflag |= LINUX_PARODD;
if (bios->c_cflag & HUPCL)
lios->c_cflag |= LINUX_HUPCL;
if (bios->c_cflag & CLOCAL)
lios->c_cflag |= LINUX_CLOCAL;
if (bios->c_cflag & CRTSCTS)
lios->c_cflag |= LINUX_CRTSCTS;
lios->c_lflag = 0;
if (bios->c_lflag & ISIG)
lios->c_lflag |= LINUX_ISIG;
if (bios->c_lflag & ICANON)
lios->c_lflag |= LINUX_ICANON;
if (bios->c_lflag & ECHO)
lios->c_lflag |= LINUX_ECHO;
if (bios->c_lflag & ECHOE)
lios->c_lflag |= LINUX_ECHOE;
if (bios->c_lflag & ECHOK)
lios->c_lflag |= LINUX_ECHOK;
if (bios->c_lflag & ECHONL)
lios->c_lflag |= LINUX_ECHONL;
if (bios->c_lflag & NOFLSH)
lios->c_lflag |= LINUX_NOFLSH;
if (bios->c_lflag & TOSTOP)
lios->c_lflag |= LINUX_TOSTOP;
if (bios->c_lflag & ECHOCTL)
lios->c_lflag |= LINUX_ECHOCTL;
if (bios->c_lflag & ECHOPRT)
lios->c_lflag |= LINUX_ECHOPRT;
if (bios->c_lflag & ECHOKE)
lios->c_lflag |= LINUX_ECHOKE;
if (bios->c_lflag & FLUSHO)
lios->c_lflag |= LINUX_FLUSHO;
if (bios->c_lflag & PENDIN)
lios->c_lflag |= LINUX_PENDIN;
if (bios->c_lflag & IEXTEN)
lios->c_lflag |= LINUX_IEXTEN;
for (i=0; i<LINUX_NCCS; i++)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR];
lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT];
lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE];
lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL];
lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF];
lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL];
lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN];
lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME];
lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2];
lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP];
lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART];
lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP];
lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT];
lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD];
lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE];
lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT];
if (linux_preserve_vstatus)
lios->c_cc[LINUX_VSTATUS] = bios->c_cc[VSTATUS];
for (i=0; i<LINUX_NCCS; i++) {
if (i != LINUX_VMIN && i != LINUX_VTIME &&
lios->c_cc[i] == _POSIX_VDISABLE)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
}
lios->c_line = 0;
}
static void
linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios)
{
int i;
bios->c_iflag = 0;
if (lios->c_iflag & LINUX_IGNBRK)
bios->c_iflag |= IGNBRK;
if (lios->c_iflag & LINUX_BRKINT)
bios->c_iflag |= BRKINT;
if (lios->c_iflag & LINUX_IGNPAR)
bios->c_iflag |= IGNPAR;
if (lios->c_iflag & LINUX_PARMRK)
bios->c_iflag |= PARMRK;
if (lios->c_iflag & LINUX_INPCK)
bios->c_iflag |= INPCK;
if (lios->c_iflag & LINUX_ISTRIP)
bios->c_iflag |= ISTRIP;
if (lios->c_iflag & LINUX_INLCR)
bios->c_iflag |= INLCR;
if (lios->c_iflag & LINUX_IGNCR)
bios->c_iflag |= IGNCR;
if (lios->c_iflag & LINUX_ICRNL)
bios->c_iflag |= ICRNL;
if (lios->c_iflag & LINUX_IXON)
bios->c_iflag |= IXON;
if (lios->c_iflag & LINUX_IXANY)
bios->c_iflag |= IXANY;
if (lios->c_iflag & LINUX_IXOFF)
bios->c_iflag |= IXOFF;
if (lios->c_iflag & LINUX_IMAXBEL)
bios->c_iflag |= IMAXBEL;
bios->c_oflag = 0;
if (lios->c_oflag & LINUX_OPOST)
bios->c_oflag |= OPOST;
if (lios->c_oflag & LINUX_ONLCR)
bios->c_oflag |= ONLCR;
if (lios->c_oflag & LINUX_XTABS)
bios->c_oflag |= TAB3;
bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4;
if (lios->c_cflag & LINUX_CSTOPB)
bios->c_cflag |= CSTOPB;
if (lios->c_cflag & LINUX_CREAD)
bios->c_cflag |= CREAD;
if (lios->c_cflag & LINUX_PARENB)
bios->c_cflag |= PARENB;
if (lios->c_cflag & LINUX_PARODD)
bios->c_cflag |= PARODD;
if (lios->c_cflag & LINUX_HUPCL)
bios->c_cflag |= HUPCL;
if (lios->c_cflag & LINUX_CLOCAL)
bios->c_cflag |= CLOCAL;
if (lios->c_cflag & LINUX_CRTSCTS)
bios->c_cflag |= CRTSCTS;
bios->c_lflag = 0;
if (lios->c_lflag & LINUX_ISIG)
bios->c_lflag |= ISIG;
if (lios->c_lflag & LINUX_ICANON)
bios->c_lflag |= ICANON;
if (lios->c_lflag & LINUX_ECHO)
bios->c_lflag |= ECHO;
if (lios->c_lflag & LINUX_ECHOE)
bios->c_lflag |= ECHOE;
if (lios->c_lflag & LINUX_ECHOK)
bios->c_lflag |= ECHOK;
if (lios->c_lflag & LINUX_ECHONL)
bios->c_lflag |= ECHONL;
if (lios->c_lflag & LINUX_NOFLSH)
bios->c_lflag |= NOFLSH;
if (lios->c_lflag & LINUX_TOSTOP)
bios->c_lflag |= TOSTOP;
if (lios->c_lflag & LINUX_ECHOCTL)
bios->c_lflag |= ECHOCTL;
if (lios->c_lflag & LINUX_ECHOPRT)
bios->c_lflag |= ECHOPRT;
if (lios->c_lflag & LINUX_ECHOKE)
bios->c_lflag |= ECHOKE;
if (lios->c_lflag & LINUX_FLUSHO)
bios->c_lflag |= FLUSHO;
if (lios->c_lflag & LINUX_PENDIN)
bios->c_lflag |= PENDIN;
if (lios->c_lflag & LINUX_IEXTEN)
bios->c_lflag |= IEXTEN;
for (i=0; i<NCCS; i++)
bios->c_cc[i] = _POSIX_VDISABLE;
bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR];
bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT];
bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE];
bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL];
bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF];
bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL];
bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN];
bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME];
bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2];
bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP];
bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART];
bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP];
bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT];
bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD];
bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE];
bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT];
if (linux_preserve_vstatus)
bios->c_cc[VSTATUS] = lios->c_cc[LINUX_VSTATUS];
for (i=0; i<NCCS; i++) {
if (i != VMIN && i != VTIME &&
bios->c_cc[i] == LINUX_POSIX_VDISABLE)
bios->c_cc[i] = _POSIX_VDISABLE;
}
bios->c_ispeed = bios->c_ospeed =
linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab);
}
static void
bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio)
{
struct linux_termios lios;
memset(lio, 0, sizeof(*lio));
bsd_to_linux_termios(bios, &lios);
lio->c_iflag = lios.c_iflag;
lio->c_oflag = lios.c_oflag;
lio->c_cflag = lios.c_cflag;
lio->c_lflag = lios.c_lflag;
lio->c_line = lios.c_line;
memcpy(lio->c_cc, lios.c_cc, LINUX_NCC);
}
static void
linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios)
{
struct linux_termios lios;
int i;
lios.c_iflag = lio->c_iflag;
lios.c_oflag = lio->c_oflag;
lios.c_cflag = lio->c_cflag;
lios.c_lflag = lio->c_lflag;
for (i=LINUX_NCC; i<LINUX_NCCS; i++)
lios.c_cc[i] = LINUX_POSIX_VDISABLE;
memcpy(lios.c_cc, lio->c_cc, LINUX_NCC);
linux_to_bsd_termios(&lios, bios);
}
static int
linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args)
{
struct termios bios;
struct linux_termios lios;
struct linux_termio lio;
struct file *fp;
int error;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_TCGETS:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termios(&bios, &lios);
error = copyout(&lios, (void *)args->arg, sizeof(lios));
break;
case LINUX_TCSETS:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSW:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSF:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCGETA:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termio(&bios, &lio);
error = (copyout(&lio, (void *)args->arg, sizeof(lio)));
break;
case LINUX_TCSETA:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAW:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAF:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
/* LINUX_TCSBRK */
case LINUX_TCXONC: {
switch (args->arg) {
case LINUX_TCOOFF:
args->cmd = TIOCSTOP;
break;
case LINUX_TCOON:
args->cmd = TIOCSTART;
break;
case LINUX_TCIOFF:
case LINUX_TCION: {
int c;
struct write_args wr;
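/*
 * Linux TCIOFF/TCION transmit a STOP/START character on the line;
 * emulate that by writing the terminal's VSTOP/VSTART character to
 * the descriptor, unless it is disabled.
 */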
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios,
td->td_ucred, td);
if (error)
break;
fdrop(fp, td);
c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART;
c = bios.c_cc[c];
if (c != _POSIX_VDISABLE) {
wr.fd = args->fd;
wr.buf = &c;
wr.nbyte = sizeof(c);
return (sys_write(td, &wr));
} else
return (0);
}
default:
fdrop(fp, td);
return (EINVAL);
}
args->arg = 0;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_TCFLSH: {
int val;
switch (args->arg) {
case LINUX_TCIFLUSH:
val = FREAD;
break;
case LINUX_TCOFLUSH:
val = FWRITE;
break;
case LINUX_TCIOFLUSH:
val = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td));
break;
}
case LINUX_TIOCEXCL:
args->cmd = TIOCEXCL;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNXCL:
args->cmd = TIOCNXCL;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSCTTY:
args->cmd = TIOCSCTTY;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPGRP:
args->cmd = TIOCGPGRP;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSPGRP:
args->cmd = TIOCSPGRP;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCOUTQ */
/* LINUX_TIOCSTI */
case LINUX_TIOCGWINSZ:
args->cmd = TIOCGWINSZ;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSWINSZ:
args->cmd = TIOCSWINSZ;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMGET:
args->cmd = TIOCMGET;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIS:
args->cmd = TIOCMBIS;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIC:
args->cmd = TIOCMBIC;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMSET:
args->cmd = TIOCMSET;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* TIOCGSOFTCAR */
/* TIOCSSOFTCAR */
case LINUX_FIONREAD: /* LINUX_TIOCINQ */
args->cmd = FIONREAD;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCLINUX */
case LINUX_TIOCCONS:
args->cmd = TIOCCONS;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGSERIAL: {
struct linux_serial_struct lss;
bzero(&lss, sizeof(lss));
lss.type = LINUX_PORT_16550A;
lss.flags = 0;
lss.close_delay = 0;
error = copyout(&lss, (void *)args->arg, sizeof(lss));
break;
}
case LINUX_TIOCSSERIAL: {
struct linux_serial_struct lss;
error = copyin((void *)args->arg, &lss, sizeof(lss));
if (error)
break;
/* XXX - It really helps to have an implementation that
* does nothing. NOT!
*/
error = 0;
break;
}
case LINUX_TIOCPKT:
args->cmd = TIOCPKT;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIONBIO:
args->cmd = FIONBIO;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNOTTY:
args->cmd = TIOCNOTTY;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSETD: {
int line;
switch (args->arg) {
case LINUX_N_TTY:
line = TTYDISC;
break;
case LINUX_N_SLIP:
line = SLIPDISC;
break;
case LINUX_N_PPP:
line = PPPDISC;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred,
td));
break;
}
case LINUX_TIOCGETD: {
int linux_line;
int bsd_line = TTYDISC;
error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line,
td->td_ucred, td);
if (error)
break;
switch (bsd_line) {
case TTYDISC:
linux_line = LINUX_N_TTY;
break;
case SLIPDISC:
linux_line = LINUX_N_SLIP;
break;
case PPPDISC:
linux_line = LINUX_N_PPP;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (copyout(&linux_line, (void *)args->arg, sizeof(int)));
break;
}
/* LINUX_TCSBRKP */
/* LINUX_TIOCTTYGSTRUCT */
case LINUX_FIONCLEX:
args->cmd = FIONCLEX;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOCLEX:
args->cmd = FIOCLEX;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOASYNC:
args->cmd = FIOASYNC;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCSERCONFIG */
/* LINUX_TIOCSERGWILD */
/* LINUX_TIOCSERSWILD */
/* LINUX_TIOCGLCKTRMIOS */
/* LINUX_TIOCSLCKTRMIOS */
case LINUX_TIOCSBRK:
args->cmd = TIOCSBRK;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCCBRK:
args->cmd = TIOCCBRK;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPTN: {
int nb;
error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td);
if (!error)
error = copyout(&nb, (void *)args->arg,
sizeof(int));
break;
}
case LINUX_TIOCSPTLCK:
/* Our unlockpt() does nothing. */
error = 0;
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* CDROM related ioctls
*/
struct linux_cdrom_msf
{
u_char cdmsf_min0;
u_char cdmsf_sec0;
u_char cdmsf_frame0;
u_char cdmsf_min1;
u_char cdmsf_sec1;
u_char cdmsf_frame1;
};
struct linux_cdrom_tochdr
{
u_char cdth_trk0;
u_char cdth_trk1;
};
union linux_cdrom_addr
{
struct {
u_char minute;
u_char second;
u_char frame;
} msf;
int lba;
};
struct linux_cdrom_tocentry
{
u_char cdte_track;
u_char cdte_adr:4;
u_char cdte_ctrl:4;
u_char cdte_format;
union linux_cdrom_addr cdte_addr;
u_char cdte_datamode;
};
struct linux_cdrom_subchnl
{
u_char cdsc_format;
u_char cdsc_audiostatus;
u_char cdsc_adr:4;
u_char cdsc_ctrl:4;
u_char cdsc_trk;
u_char cdsc_ind;
union linux_cdrom_addr cdsc_absaddr;
union linux_cdrom_addr cdsc_reladdr;
};
struct l_cdrom_read_audio {
union linux_cdrom_addr addr;
u_char addr_format;
l_int nframes;
u_char *buf;
};
struct l_dvd_layer {
u_char book_version:4;
u_char book_type:4;
u_char min_rate:4;
u_char disc_size:4;
u_char layer_type:4;
u_char track_path:1;
u_char nlayers:2;
u_char track_density:4;
u_char linear_density:4;
u_char bca:1;
u_int32_t start_sector;
u_int32_t end_sector;
u_int32_t end_sector_l0;
};
struct l_dvd_physical {
u_char type;
u_char layer_num;
struct l_dvd_layer layer[4];
};
struct l_dvd_copyright {
u_char type;
u_char layer_num;
u_char cpst;
u_char rmi;
};
struct l_dvd_disckey {
u_char type;
l_uint agid:2;
u_char value[2048];
};
struct l_dvd_bca {
u_char type;
l_int len;
u_char value[188];
};
struct l_dvd_manufact {
u_char type;
u_char layer_num;
l_int len;
u_char value[2048];
};
typedef union {
u_char type;
struct l_dvd_physical physical;
struct l_dvd_copyright copyright;
struct l_dvd_disckey disckey;
struct l_dvd_bca bca;
struct l_dvd_manufact manufact;
} l_dvd_struct;
typedef u_char l_dvd_key[5];
typedef u_char l_dvd_challenge[10];
struct l_dvd_lu_send_agid {
u_char type;
l_uint agid:2;
};
struct l_dvd_host_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_send_key {
u_char type;
l_uint agid:2;
l_dvd_key key;
};
struct l_dvd_lu_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_lu_send_title_key {
u_char type;
l_uint agid:2;
l_dvd_key title_key;
l_int lba;
l_uint cpm:1;
l_uint cp_sec:1;
l_uint cgms:2;
};
struct l_dvd_lu_send_asf {
u_char type;
l_uint agid:2;
l_uint asf:1;
};
struct l_dvd_host_send_rpcstate {
u_char type;
u_char pdrc;
};
struct l_dvd_lu_send_rpcstate {
u_char type:2;
u_char vra:3;
u_char ucca:3;
u_char region_mask;
u_char rpc_scheme;
};
typedef union {
u_char type;
struct l_dvd_lu_send_agid lsa;
struct l_dvd_host_send_challenge hsc;
struct l_dvd_send_key lsk;
struct l_dvd_lu_send_challenge lsc;
struct l_dvd_send_key hsk;
struct l_dvd_lu_send_title_key lstk;
struct l_dvd_lu_send_asf lsasf;
struct l_dvd_host_send_rpcstate hrpcs;
struct l_dvd_lu_send_rpcstate lrpcs;
} l_dvd_authinfo;
static void
bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp)
{
if (af == CD_LBA_FORMAT)
lp->lba = bp->lba;
else {
lp->msf.minute = bp->msf.minute;
lp->msf.second = bp->msf.second;
lp->msf.frame = bp->msf.frame;
}
}
static void
set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba)
{
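/*
 * Audio CDs are addressed at 75 frames per second, and LBA 0 corresponds
 * to MSF 00:02:00 because of the two-second lead-in, which is why two
 * seconds are added below.
 */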
if (format == LINUX_CDROM_MSF) {
addr->msf.frame = lba % 75;
lba /= 75;
lba += 2;
addr->msf.second = lba % 60;
addr->msf.minute = lba / 60;
} else
addr->lba = lba;
}
static int
linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp)
{
bp->format = lp->type;
switch (bp->format) {
case DVD_STRUCT_PHYSICAL:
if (bp->layer_num >= 4)
return (EINVAL);
bp->layer_num = lp->physical.layer_num;
break;
case DVD_STRUCT_COPYRIGHT:
bp->layer_num = lp->copyright.layer_num;
break;
case DVD_STRUCT_DISCKEY:
bp->agid = lp->disckey.agid;
break;
case DVD_STRUCT_BCA:
case DVD_STRUCT_MANUFACT:
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp)
{
switch (bp->format) {
case DVD_STRUCT_PHYSICAL: {
struct dvd_layer *blp = (struct dvd_layer *)bp->data;
struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num];
memset(llp, 0, sizeof(*llp));
llp->book_version = blp->book_version;
llp->book_type = blp->book_type;
llp->min_rate = blp->max_rate;
llp->disc_size = blp->disc_size;
llp->layer_type = blp->layer_type;
llp->track_path = blp->track_path;
llp->nlayers = blp->nlayers;
llp->track_density = blp->track_density;
llp->linear_density = blp->linear_density;
llp->bca = blp->bca;
llp->start_sector = blp->start_sector;
llp->end_sector = blp->end_sector;
llp->end_sector_l0 = blp->end_sector_l0;
break;
}
case DVD_STRUCT_COPYRIGHT:
lp->copyright.cpst = bp->cpst;
lp->copyright.rmi = bp->rmi;
break;
case DVD_STRUCT_DISCKEY:
memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value));
break;
case DVD_STRUCT_BCA:
lp->bca.len = bp->length;
memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value));
break;
case DVD_STRUCT_MANUFACT:
lp->manufact.len = bp->length;
memcpy(lp->manufact.value, bp->data,
sizeof(lp->manufact.value));
/* lp->manufact.layer_num is unused in Linux (redhat 7.0). */
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode,
struct dvd_authinfo *bp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_CHALLENGE;
bp->agid = lp->hsc.agid;
memcpy(bp->keychal, lp->hsc.chal, 10);
break;
case LINUX_DVD_LU_SEND_KEY1:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_KEY1;
bp->agid = lp->lsk.agid;
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_CHALLENGE;
bp->agid = lp->lsc.agid;
break;
case LINUX_DVD_HOST_SEND_KEY2:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_KEY2;
bp->agid = lp->hsk.agid;
memcpy(bp->keychal, lp->hsk.key, 5);
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_TITLE_KEY;
bp->agid = lp->lstk.agid;
bp->lba = lp->lstk.lba;
break;
case LINUX_DVD_LU_SEND_ASF:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_ASF;
bp->agid = lp->lsasf.agid;
break;
case LINUX_DVD_INVALIDATE_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_INVALIDATE_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_RPC;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_RPC;
bp->region = lp->hrpcs.pdrc;
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
lp->lsa.agid = bp->agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
lp->type = LINUX_DVD_LU_SEND_KEY1;
break;
case LINUX_DVD_LU_SEND_KEY1:
memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key));
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal));
break;
case LINUX_DVD_HOST_SEND_KEY2:
lp->type = LINUX_DVD_AUTH_ESTABLISHED;
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
memcpy(lp->lstk.title_key, bp->keychal,
sizeof(lp->lstk.title_key));
lp->lstk.cpm = bp->cpm;
lp->lstk.cp_sec = bp->cp_sec;
lp->lstk.cgms = bp->cgms;
break;
case LINUX_DVD_LU_SEND_ASF:
lp->lsasf.asf = bp->asf;
break;
case LINUX_DVD_INVALIDATE_AGID:
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
lp->lrpcs.type = bp->reg_type;
lp->lrpcs.vra = bp->vend_rsts;
lp->lrpcs.ucca = bp->user_rsts;
lp->lrpcs.region_mask = bp->region;
lp->lrpcs.rpc_scheme = bp->rpc_scheme;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_CDROMPAUSE:
args->cmd = CDIOCPAUSE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMRESUME:
args->cmd = CDIOCRESUME;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYMSF:
args->cmd = CDIOCPLAYMSF;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYTRKIND:
args->cmd = CDIOCPLAYTRACKS;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMREADTOCHDR: {
struct ioc_toc_header th;
struct linux_cdrom_tochdr lth;
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th,
td->td_ucred, td);
if (!error) {
lth.cdth_trk0 = th.starting_track;
lth.cdth_trk1 = th.ending_track;
copyout(&lth, (void *)args->arg, sizeof(lth));
}
break;
}
case LINUX_CDROMREADTOCENTRY: {
struct linux_cdrom_tocentry lte;
struct ioc_read_toc_single_entry irtse;
error = copyin((void *)args->arg, &lte, sizeof(lte));
if (error)
break;
irtse.address_format = lte.cdte_format;
irtse.track = lte.cdte_track;
error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse,
td->td_ucred, td);
if (!error) {
lte.cdte_ctrl = irtse.entry.control;
lte.cdte_adr = irtse.entry.addr_type;
bsd_to_linux_msf_lba(irtse.address_format,
&irtse.entry.addr, &lte.cdte_addr);
error = copyout(&lte, (void *)args->arg, sizeof(lte));
}
break;
}
case LINUX_CDROMSTOP:
args->cmd = CDIOCSTOP;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMSTART:
args->cmd = CDIOCSTART;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMEJECT:
args->cmd = CDIOCEJECT;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLCTRL */
case LINUX_CDROMSUBCHNL: {
struct linux_cdrom_subchnl sc;
struct ioc_read_subchannel bsdsc;
struct cd_sub_channel_info bsdinfo;
error = copyin((void *)args->arg, &sc, sizeof(sc));
if (error)
break;
/*
* Invoke the native ioctl and bounce the returned data through
* the userspace buffer. This works because the Linux structure
* is the same size as our structures for the subchannel header
* and position data.
*/
bsdsc.address_format = CD_LBA_FORMAT;
bsdsc.data_format = CD_CURRENT_POSITION;
bsdsc.track = 0;
bsdsc.data_len = sizeof(sc);
bsdsc.data = (void *)args->arg;
error = fo_ioctl(fp, CDIOCREADSUBCHANNEL, (caddr_t)&bsdsc,
td->td_ucred, td);
if (error)
break;
error = copyin((void *)args->arg, &bsdinfo, sizeof(bsdinfo));
if (error)
break;
sc.cdsc_audiostatus = bsdinfo.header.audio_status;
sc.cdsc_adr = bsdinfo.what.position.addr_type;
sc.cdsc_ctrl = bsdinfo.what.position.control;
sc.cdsc_trk = bsdinfo.what.position.track_number;
sc.cdsc_ind = bsdinfo.what.position.index_number;
set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format,
bsdinfo.what.position.absaddr.lba);
set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format,
bsdinfo.what.position.reladdr.lba);
error = copyout(&sc, (void *)args->arg, sizeof(sc));
break;
}
/* LINUX_CDROMREADMODE2 */
/* LINUX_CDROMREADMODE1 */
/* LINUX_CDROMREADAUDIO */
/* LINUX_CDROMEJECT_SW */
/* LINUX_CDROMMULTISESSION */
/* LINUX_CDROM_GET_UPC */
case LINUX_CDROMRESET:
args->cmd = CDIOCRESET;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLREAD */
/* LINUX_CDROMREADRAW */
/* LINUX_CDROMREADCOOKED */
/* LINUX_CDROMSEEK */
/* LINUX_CDROMPLAYBLK */
/* LINUX_CDROMREADALL */
/* LINUX_CDROMCLOSETRAY */
/* LINUX_CDROMLOADFROMSLOT */
/* LINUX_CDROMGETSPINDOWN */
/* LINUX_CDROMSETSPINDOWN */
/* LINUX_CDROM_SET_OPTIONS */
/* LINUX_CDROM_CLEAR_OPTIONS */
/* LINUX_CDROM_SELECT_SPEED */
/* LINUX_CDROM_SELECT_DISC */
/* LINUX_CDROM_MEDIA_CHANGED */
/* LINUX_CDROM_DRIVE_STATUS */
/* LINUX_CDROM_DISC_STATUS */
/* LINUX_CDROM_CHANGER_NSLOTS */
/* LINUX_CDROM_LOCKDOOR */
/* LINUX_CDROM_DEBUG */
/* LINUX_CDROM_GET_CAPABILITY */
/* LINUX_CDROMAUDIOBUFSIZ */
case LINUX_DVD_READ_STRUCT: {
l_dvd_struct *lds;
struct dvd_struct *bds;
lds = malloc(sizeof(*lds), M_LINUX, M_WAITOK);
bds = malloc(sizeof(*bds), M_LINUX, M_WAITOK);
error = copyin((void *)args->arg, lds, sizeof(*lds));
if (error)
goto out;
error = linux_to_bsd_dvd_struct(lds, bds);
if (error)
goto out;
error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)bds,
td->td_ucred, td);
if (error)
goto out;
error = bsd_to_linux_dvd_struct(bds, lds);
if (error)
goto out;
error = copyout(lds, (void *)args->arg, sizeof(*lds));
out:
free(bds, M_LINUX);
free(lds, M_LINUX);
break;
}
/* LINUX_DVD_WRITE_STRUCT */
case LINUX_DVD_AUTH: {
l_dvd_authinfo lda;
struct dvd_authinfo bda;
int bcode;
error = copyin((void *)args->arg, &lda, sizeof(lda));
if (error)
break;
error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda);
if (error)
break;
error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred,
td);
if (error) {
if (lda.type == LINUX_DVD_HOST_SEND_KEY2) {
lda.type = LINUX_DVD_AUTH_FAILURE;
copyout(&lda, (void *)args->arg, sizeof(lda));
}
break;
}
error = bsd_to_linux_dvd_authinfo(&bda, &lda);
if (error)
break;
error = copyout(&lda, (void *)args->arg, sizeof(lda));
break;
}
case LINUX_SCSI_GET_BUS_NUMBER:
{
struct sg_scsi_id id;
error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id,
td->td_ucred, td);
if (error)
break;
error = copyout(&id.channel, (void *)args->arg, sizeof(int));
break;
}
case LINUX_SCSI_GET_IDLUN:
{
struct sg_scsi_id id;
struct scsi_idlun idl;
error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id,
td->td_ucred, td);
if (error)
break;
idl.dev_id = (id.scsi_id & 0xff) + ((id.lun & 0xff) << 8) +
((id.channel & 0xff) << 16) + ((id.host_no & 0xff) << 24);
idl.host_unique_id = id.host_no;
error = copyout(&idl, (void *)args->arg, sizeof(idl));
break;
}
/* LINUX_CDROM_SEND_PACKET */
/* LINUX_CDROM_NEXT_WRITABLE */
/* LINUX_CDROM_LAST_WRITTEN */
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
static int
linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args)
{
return (ENOTTY);
}
/*
* Sound related ioctls
*/
struct linux_old_mixer_info {
char id[16];
char name[32];
};
static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT };
#define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30])
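/*
 * Illustrative sketch (not part of the patched file): Linux encodes an
 * ioctl command as nr (bits 0-7), type (bits 8-15), argument size
 * (bits 16-29) and transfer direction (bits 30-31).  SETDIR() maps the
 * two Linux direction bits onto the native IOC_VOID/IOC_IN/IOC_OUT/
 * IOC_INOUT values via dirbits[], and the "key on encoded length"
 * switch further down distinguishes commands that share a number by
 * the size field alone.  The helper macros below are hypothetical and
 * only spell out the layout those lines assume.
 */
#if 0	/* example only */
#define	L_IOC_NR(cmd)	((cmd) & 0xff)		 /* command number */
#define	L_IOC_TYPE(cmd)	(((cmd) >> 8) & 0xff)	 /* 'magic' type byte */
#define	L_IOC_SIZE(cmd)	(((cmd) >> 16) & 0x3fff) /* argument size */
#define	L_IOC_DIR(cmd)	(((cmd) >> 30) & 0x3)	 /* 0=none 1=write 2=read */
#endif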
static int
linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args)
{
switch (args->cmd & 0xffff) {
case LINUX_SOUND_MIXER_WRITE_VOLUME:
args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_BASS:
args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_TREBLE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SYNTH:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_PCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SPEAKER:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_MIC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_CD:
args->cmd = SETDIR(SOUND_MIXER_WRITE_CD);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IMIX:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_ALTPCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECLEV:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_OGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE1:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE2:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE3:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_MONITOR:
args->cmd = SETDIR(SOUND_MIXER_WRITE_MONITOR);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_INFO: {
/* Key on encoded length */
switch ((args->cmd >> 16) & 0x1fff) {
case 0x005c: { /* SOUND_MIXER_INFO */
args->cmd = SOUND_MIXER_INFO;
return (sys_ioctl(td, (struct ioctl_args *)args));
}
case 0x0030: { /* SOUND_OLD_MIXER_INFO */
struct linux_old_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
copyout(&info, (void *)args->arg, sizeof(info));
return (0);
}
default:
return (ENOIOCTL);
}
break;
}
case LINUX_OSS_GETVERSION: {
int version = linux_get_oss_version(td);
return (copyout(&version, (void *)args->arg, sizeof(int)));
}
case LINUX_SOUND_MIXER_READ_STEREODEVS:
args->cmd = SOUND_MIXER_READ_STEREODEVS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_CAPS:
args->cmd = SOUND_MIXER_READ_CAPS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_RECMASK:
args->cmd = SOUND_MIXER_READ_RECMASK;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_DEVMASK:
args->cmd = SOUND_MIXER_READ_DEVMASK;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECSRC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC);
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_RESET:
args->cmd = SNDCTL_DSP_RESET;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SYNC:
args->cmd = SNDCTL_DSP_SYNC;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SPEED:
args->cmd = SNDCTL_DSP_SPEED;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_STEREO:
args->cmd = SNDCTL_DSP_STEREO;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */
args->cmd = SNDCTL_DSP_GETBLKSIZE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFMT:
args->cmd = SNDCTL_DSP_SETFMT;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_CHANNELS:
args->cmd = SOUND_PCM_WRITE_CHANNELS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_FILTER:
args->cmd = SOUND_PCM_WRITE_FILTER;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_POST:
args->cmd = SNDCTL_DSP_POST;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SUBDIVIDE:
args->cmd = SNDCTL_DSP_SUBDIVIDE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFRAGMENT:
args->cmd = SNDCTL_DSP_SETFRAGMENT;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETFMTS:
args->cmd = SNDCTL_DSP_GETFMTS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOSPACE:
args->cmd = SNDCTL_DSP_GETOSPACE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETISPACE:
args->cmd = SNDCTL_DSP_GETISPACE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_NONBLOCK:
args->cmd = SNDCTL_DSP_NONBLOCK;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETCAPS:
args->cmd = SNDCTL_DSP_GETCAPS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */
args->cmd = SNDCTL_DSP_SETTRIGGER;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETIPTR:
args->cmd = SNDCTL_DSP_GETIPTR;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOPTR:
args->cmd = SNDCTL_DSP_GETOPTR;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETDUPLEX:
args->cmd = SNDCTL_DSP_SETDUPLEX;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETODELAY:
args->cmd = SNDCTL_DSP_GETODELAY;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESET:
args->cmd = SNDCTL_SEQ_RESET;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_SYNC:
args->cmd = SNDCTL_SEQ_SYNC;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_INFO:
args->cmd = SNDCTL_SYNTH_INFO;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_CTRLRATE:
args->cmd = SNDCTL_SEQ_CTRLRATE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETOUTCOUNT:
args->cmd = SNDCTL_SEQ_GETOUTCOUNT;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETINCOUNT:
args->cmd = SNDCTL_SEQ_GETINCOUNT;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_PERCMODE:
args->cmd = SNDCTL_SEQ_PERCMODE;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_FM_LOAD_INSTR:
args->cmd = SNDCTL_FM_LOAD_INSTR;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TESTMIDI:
args->cmd = SNDCTL_SEQ_TESTMIDI;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESETSAMPLES:
args->cmd = SNDCTL_SEQ_RESETSAMPLES;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRSYNTHS:
args->cmd = SNDCTL_SEQ_NRSYNTHS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRMIDIS:
args->cmd = SNDCTL_SEQ_NRMIDIS;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_MIDI_INFO:
args->cmd = SNDCTL_MIDI_INFO;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TRESHOLD:
args->cmd = SNDCTL_SEQ_TRESHOLD;
return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_MEMAVL:
args->cmd = SNDCTL_SYNTH_MEMAVL;
return (sys_ioctl(td, (struct ioctl_args *)args));
}
return (ENOIOCTL);
}
/*
* Console related ioctls
*/
static int
linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_KIOCSOUND:
args->cmd = KIOCSOUND;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDMKTONE:
args->cmd = KDMKTONE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETLED:
args->cmd = KDGETLED;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETLED:
args->cmd = KDSETLED;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETMODE:
args->cmd = KDSETMODE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETMODE:
args->cmd = KDGETMODE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGKBMODE:
args->cmd = KDGKBMODE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSKBMODE: {
int kbdmode;
switch (args->arg) {
case LINUX_KBD_RAW:
kbdmode = K_RAW;
break;
case LINUX_KBD_XLATE:
kbdmode = K_XLATE;
break;
case LINUX_KBD_MEDIUMRAW:
kbdmode = K_RAW;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode,
td->td_ucred, td));
break;
}
case LINUX_VT_OPENQRY:
args->cmd = VT_OPENQRY;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_GETMODE:
args->cmd = VT_GETMODE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_SETMODE: {
struct vt_mode mode;
if ((error = copyin((void *)args->arg, &mode, sizeof(mode))))
break;
if (LINUX_SIG_VALID(mode.relsig))
mode.relsig = linux_to_bsd_signal(mode.relsig);
else
mode.relsig = 0;
if (LINUX_SIG_VALID(mode.acqsig))
mode.acqsig = linux_to_bsd_signal(mode.acqsig);
else
mode.acqsig = 0;
/* XXX. Linux ignores frsig and sets it to 0. */
mode.frsig = 0;
if ((error = copyout(&mode, (void *)args->arg, sizeof(mode))))
break;
args->cmd = VT_SETMODE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_VT_GETSTATE:
args->cmd = VT_GETACTIVE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_RELDISP:
args->cmd = VT_RELDISP;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_ACTIVATE:
args->cmd = VT_ACTIVATE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_WAITACTIVE:
args->cmd = VT_WAITACTIVE;
error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* Implement the SIOCGIFNAME ioctl
*/
static int
linux_ioctl_ifname(struct thread *td, struct l_ifreq *uifr)
{
struct l_ifreq ifr;
struct ifnet *ifp;
int error, ethno, index;
error = copyin(uifr, &ifr, sizeof(ifr));
if (error != 0)
return (error);
CURVNET_SET(TD_TO_VNET(curthread));
IFNET_RLOCK();
index = 1; /* ifr.ifr_ifindex starts from 1 */
ethno = 0;
error = ENODEV;
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifr.ifr_ifindex == index) {
if (IFP_IS_ETH(ifp))
snprintf(ifr.ifr_name, LINUX_IFNAMSIZ,
"eth%d", ethno);
else
strlcpy(ifr.ifr_name, ifp->if_xname,
LINUX_IFNAMSIZ);
error = 0;
break;
}
if (IFP_IS_ETH(ifp))
ethno++;
index++;
}
IFNET_RUNLOCK();
if (error == 0)
error = copyout(&ifr, uifr, sizeof(ifr));
CURVNET_RESTORE();
return (error);
}
/*
* Implement the SIOCGIFCONF ioctl
*/
static int
linux_ifconf(struct thread *td, struct ifconf *uifc)
{
#ifdef COMPAT_LINUX32
struct l_ifconf ifc;
#else
struct ifconf ifc;
#endif
struct l_ifreq ifr;
struct ifnet *ifp;
struct ifaddr *ifa;
struct sbuf *sb;
int error, ethno, full = 0, valid_len, max_len;
error = copyin(uifc, &ifc, sizeof(ifc));
if (error != 0)
return (error);
- max_len = MAXPHYS - 1;
+ max_len = maxphys - 1;
CURVNET_SET(TD_TO_VNET(td));
/* handle the 'request buffer size' case */
if ((l_uintptr_t)ifc.ifc_buf == PTROUT(NULL)) {
ifc.ifc_len = 0;
IFNET_RLOCK();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET)
ifc.ifc_len += sizeof(ifr);
}
}
IFNET_RUNLOCK();
error = copyout(&ifc, uifc, sizeof(ifc));
CURVNET_RESTORE();
return (error);
}
if (ifc.ifc_len <= 0) {
CURVNET_RESTORE();
return (EINVAL);
}
again:
/* Keep track of eth interfaces */
ethno = 0;
if (ifc.ifc_len <= max_len) {
max_len = ifc.ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
/* Return all AF_INET addresses of all interfaces */
IFNET_RLOCK();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs = 0;
bzero(&ifr, sizeof(ifr));
if (IFP_IS_ETH(ifp))
snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d",
ethno++);
else
strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ);
/* Walk the address list */
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET) {
ifr.ifr_addr.sa_family = LINUX_AF_INET;
memcpy(ifr.ifr_addr.sa_data, sa->sa_data,
sizeof(ifr.ifr_addr.sa_data));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
addrs++;
}
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc.ifc_len = valid_len;
sbuf_finish(sb);
error = copyout(sbuf_data(sb), PTRIN(ifc.ifc_buf), ifc.ifc_len);
if (error == 0)
error = copyout(&ifc, uifc, sizeof(ifc));
sbuf_delete(sb);
CURVNET_RESTORE();
return (error);
}
static int
linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr)
{
l_short flags;
linux_ifflags(ifp, &flags);
return (copyout(&flags, &ifr->ifr_flags, sizeof(flags)));
}
static int
linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr)
{
struct l_sockaddr lsa;
if (linux_ifhwaddr(ifp, &lsa) != 0)
return (ENOENT);
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
/*
* If we fault in bsd_to_linux_ifreq() then we will fault when we call
* the native ioctl(). Thus, we don't really need to check the return
* value of this function.
*/
static int
bsd_to_linux_ifreq(struct ifreq *arg)
{
struct ifreq ifr;
size_t ifr_len = sizeof(struct ifreq);
int error;
if ((error = copyin(arg, &ifr, ifr_len)))
return (error);
*(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family;
error = copyout(&ifr, arg, ifr_len);
return (error);
}
/*
* Socket related ioctls
*/
static int
linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args)
{
char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ];
struct ifnet *ifp;
struct file *fp;
int error, type;
ifp = NULL;
error = 0;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type != DTYPE_SOCKET) {
/* not a socket - probably a tap / vmnet device */
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFFLAGS:
return (linux_ioctl_special(td, args));
default:
return (ENOIOCTL);
}
}
switch (args->cmd & 0xffff) {
case LINUX_FIOGETOWN:
case LINUX_FIOSETOWN:
case LINUX_SIOCADDMULTI:
case LINUX_SIOCATMARK:
case LINUX_SIOCDELMULTI:
case LINUX_SIOCGIFNAME:
case LINUX_SIOCGIFCONF:
case LINUX_SIOCGPGRP:
case LINUX_SIOCSPGRP:
case LINUX_SIOCGIFCOUNT:
/* these ioctls don't take an interface name */
break;
case LINUX_SIOCGIFFLAGS:
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFDSTADDR:
case LINUX_SIOCGIFBRDADDR:
case LINUX_SIOCGIFNETMASK:
case LINUX_SIOCSIFNETMASK:
case LINUX_SIOCGIFMTU:
case LINUX_SIOCSIFMTU:
case LINUX_SIOCSIFNAME:
case LINUX_SIOCGIFHWADDR:
case LINUX_SIOCSIFHWADDR:
case LINUX_SIOCDEVPRIVATE:
case LINUX_SIOCDEVPRIVATE+1:
case LINUX_SIOCGIFINDEX:
/* copy in the interface name and translate it. */
error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ);
if (error != 0)
return (error);
memset(ifname, 0, sizeof(ifname));
ifp = ifname_linux_to_bsd(td, lifname, ifname);
if (ifp == NULL)
return (EINVAL);
/*
* We need to copy it back out in case we pass the
* request on to our native ioctl(), which will expect
* the ifreq to be in user space and have the correct
* interface name.
*/
error = copyout(ifname, (void *)args->arg, IFNAMSIZ);
if (error != 0)
return (error);
break;
default:
return (ENOIOCTL);
}
switch (args->cmd & 0xffff) {
case LINUX_FIOSETOWN:
args->cmd = FIOSETOWN;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSPGRP:
args->cmd = SIOCSPGRP;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_FIOGETOWN:
args->cmd = FIOGETOWN;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGPGRP:
args->cmd = SIOCGPGRP;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCATMARK:
args->cmd = SIOCATMARK;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
/* LINUX_SIOCGSTAMP */
case LINUX_SIOCGIFNAME:
error = linux_ioctl_ifname(td, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCGIFCONF:
error = linux_ifconf(td, (struct ifconf *)args->arg);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFADDR:
/* XXX probably doesn't work, included for completeness */
args->cmd = SIOCSIFADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFDSTADDR:
args->cmd = SIOCGIFDSTADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFBRDADDR:
args->cmd = SIOCGIFBRDADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFNETMASK:
args->cmd = SIOCGIFNETMASK;
error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFNETMASK:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFMTU:
args->cmd = SIOCGIFMTU;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFMTU:
args->cmd = SIOCSIFMTU;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFNAME:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFHWADDR:
error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCSIFHWADDR:
error = ENOIOCTL;
break;
case LINUX_SIOCADDMULTI:
args->cmd = SIOCADDMULTI;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDELMULTI:
args->cmd = SIOCDELMULTI;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFINDEX:
args->cmd = SIOCGIFINDEX;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFCOUNT:
error = 0;
break;
/*
* XXX This is slightly bogus, but these ioctls are currently
* XXX only used by the aironet (if_an) network driver.
*/
case LINUX_SIOCDEVPRIVATE:
args->cmd = SIOCGPRIVATE_0;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDEVPRIVATE+1:
args->cmd = SIOCGPRIVATE_1;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
}
if (ifp != NULL)
/* restore the original interface name */
copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ);
return (error);
}
/*
* Device private ioctl handler
*/
static int
linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error, type;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type == DTYPE_SOCKET)
return (linux_ioctl_socket(td, args));
return (ENOIOCTL);
}
/*
* DRM ioctl handler (sys/dev/drm)
*/
static int
linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args)
{
args->cmd = SETDIR(args->cmd);
return (sys_ioctl(td, (struct ioctl_args *)args));
}
#ifdef COMPAT_LINUX32
static int
linux_ioctl_sg_io(struct thread *td, struct linux_ioctl_args *args)
{
struct sg_io_hdr io;
struct sg_io_hdr32 io32;
struct file *fp;
int error;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0) {
printf("sg_linux_ioctl: fget returned %d\n", error);
return (error);
}
if ((error = copyin((void *)args->arg, &io32, sizeof(io32))) != 0)
goto out;
CP(io32, io, interface_id);
CP(io32, io, dxfer_direction);
CP(io32, io, cmd_len);
CP(io32, io, mx_sb_len);
CP(io32, io, iovec_count);
CP(io32, io, dxfer_len);
PTRIN_CP(io32, io, dxferp);
PTRIN_CP(io32, io, cmdp);
PTRIN_CP(io32, io, sbp);
CP(io32, io, timeout);
CP(io32, io, flags);
CP(io32, io, pack_id);
PTRIN_CP(io32, io, usr_ptr);
CP(io32, io, status);
CP(io32, io, masked_status);
CP(io32, io, msg_status);
CP(io32, io, sb_len_wr);
CP(io32, io, host_status);
CP(io32, io, driver_status);
CP(io32, io, resid);
CP(io32, io, duration);
CP(io32, io, info);
if ((error = fo_ioctl(fp, SG_IO, (caddr_t)&io, td->td_ucred, td)) != 0)
goto out;
CP(io, io32, interface_id);
CP(io, io32, dxfer_direction);
CP(io, io32, cmd_len);
CP(io, io32, mx_sb_len);
CP(io, io32, iovec_count);
CP(io, io32, dxfer_len);
PTROUT_CP(io, io32, dxferp);
PTROUT_CP(io, io32, cmdp);
PTROUT_CP(io, io32, sbp);
CP(io, io32, timeout);
CP(io, io32, flags);
CP(io, io32, pack_id);
PTROUT_CP(io, io32, usr_ptr);
CP(io, io32, status);
CP(io, io32, masked_status);
CP(io, io32, msg_status);
CP(io, io32, sb_len_wr);
CP(io, io32, host_status);
CP(io, io32, driver_status);
CP(io, io32, resid);
CP(io, io32, duration);
CP(io, io32, info);
error = copyout(&io32, (void *)args->arg, sizeof(io32));
out:
fdrop(fp, td);
return (error);
}
#endif
static int
linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args)
{
switch (args->cmd) {
case LINUX_SG_GET_VERSION_NUM:
args->cmd = SG_GET_VERSION_NUM;
break;
case LINUX_SG_SET_TIMEOUT:
args->cmd = SG_SET_TIMEOUT;
break;
case LINUX_SG_GET_TIMEOUT:
args->cmd = SG_GET_TIMEOUT;
break;
case LINUX_SG_IO:
args->cmd = SG_IO;
#ifdef COMPAT_LINUX32
return (linux_ioctl_sg_io(td, args));
#endif
break;
case LINUX_SG_GET_RESERVED_SIZE:
args->cmd = SG_GET_RESERVED_SIZE;
break;
case LINUX_SG_GET_SCSI_ID:
args->cmd = SG_GET_SCSI_ID;
break;
case LINUX_SG_GET_SG_TABLESIZE:
args->cmd = SG_GET_SG_TABLESIZE;
break;
default:
return (ENODEV);
}
return (sys_ioctl(td, (struct ioctl_args *)args));
}
/*
* Video4Linux (V4L) ioctl handler
*/
static int
linux_to_bsd_v4l_tuner(struct l_video_tuner *lvt, struct video_tuner *vt)
{
vt->tuner = lvt->tuner;
strlcpy(vt->name, lvt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
vt->rangelow = lvt->rangelow; /* possible long size conversion */
vt->rangehigh = lvt->rangehigh; /* possible long size conversion */
vt->flags = lvt->flags;
vt->mode = lvt->mode;
vt->signal = lvt->signal;
return (0);
}
static int
bsd_to_linux_v4l_tuner(struct video_tuner *vt, struct l_video_tuner *lvt)
{
lvt->tuner = vt->tuner;
strlcpy(lvt->name, vt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
lvt->rangelow = vt->rangelow; /* possible long size conversion */
lvt->rangehigh = vt->rangehigh; /* possible long size conversion */
lvt->flags = vt->flags;
lvt->mode = vt->mode;
lvt->signal = vt->signal;
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_to_bsd_v4l_clip(struct l_video_clip *lvc, struct video_clip *vc)
{
vc->x = lvc->x;
vc->y = lvc->y;
vc->width = lvc->width;
vc->height = lvc->height;
vc->next = PTRIN(lvc->next); /* possible pointer size conversion */
return (0);
}
#endif
static int
linux_to_bsd_v4l_window(struct l_video_window *lvw, struct video_window *vw)
{
vw->x = lvw->x;
vw->y = lvw->y;
vw->width = lvw->width;
vw->height = lvw->height;
vw->chromakey = lvw->chromakey;
vw->flags = lvw->flags;
vw->clips = PTRIN(lvw->clips); /* possible pointer size conversion */
vw->clipcount = lvw->clipcount;
return (0);
}
static int
bsd_to_linux_v4l_window(struct video_window *vw, struct l_video_window *lvw)
{
memset(lvw, 0, sizeof(*lvw));
lvw->x = vw->x;
lvw->y = vw->y;
lvw->width = vw->width;
lvw->height = vw->height;
lvw->chromakey = vw->chromakey;
lvw->flags = vw->flags;
lvw->clips = PTROUT(vw->clips); /* possible pointer size conversion */
lvw->clipcount = vw->clipcount;
return (0);
}
static int
linux_to_bsd_v4l_buffer(struct l_video_buffer *lvb, struct video_buffer *vb)
{
vb->base = PTRIN(lvb->base); /* possible pointer size conversion */
vb->height = lvb->height;
vb->width = lvb->width;
vb->depth = lvb->depth;
vb->bytesperline = lvb->bytesperline;
return (0);
}
static int
bsd_to_linux_v4l_buffer(struct video_buffer *vb, struct l_video_buffer *lvb)
{
lvb->base = PTROUT(vb->base); /* possible pointer size conversion */
lvb->height = vb->height;
lvb->width = vb->width;
lvb->depth = vb->depth;
lvb->bytesperline = vb->bytesperline;
return (0);
}
static int
linux_to_bsd_v4l_code(struct l_video_code *lvc, struct video_code *vc)
{
strlcpy(vc->loadwhat, lvc->loadwhat, LINUX_VIDEO_CODE_LOADWHAT_SIZE);
vc->datasize = lvc->datasize;
vc->data = PTRIN(lvc->data); /* possible pointer size conversion */
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_v4l_clip_copy(void *lvc, struct video_clip **ppvc)
{
int error;
struct video_clip vclip;
struct l_video_clip l_vclip;
error = copyin(lvc, &l_vclip, sizeof(l_vclip));
if (error) return (error);
linux_to_bsd_v4l_clip(&l_vclip, &vclip);
/* XXX: If there can be no concurrency: s/M_NOWAIT/M_WAITOK/ */
if ((*ppvc = malloc(sizeof(**ppvc), M_LINUX, M_NOWAIT)) == NULL)
return (ENOMEM); /* XXX: Linux has no ENOMEM here. */
memcpy(*ppvc, &vclip, sizeof(vclip));
(*ppvc)->next = NULL;
return (0);
}
static int
linux_v4l_cliplist_free(struct video_window *vw)
{
struct video_clip **ppvc;
struct video_clip **ppvc_next;
for (ppvc = &(vw->clips); *ppvc != NULL; ppvc = ppvc_next) {
ppvc_next = &((*ppvc)->next);
free(*ppvc, M_LINUX);
}
vw->clips = NULL;
return (0);
}
static int
linux_v4l_cliplist_copy(struct l_video_window *lvw, struct video_window *vw)
{
int error;
int clipcount;
void *plvc;
struct video_clip **ppvc;
/*
* XXX: The cliplist is used to pass in a list of clipping
* rectangles or, if clipcount == VIDEO_CLIP_BITMAP, a
* clipping bitmap. Some Linux apps, however, appear to
* leave cliplist and clips uninitialized. In any case,
* the cliplist is not used by pwc(4), at the time of
* writing, FreeBSD's only V4L driver. When a driver
* that uses the cliplist is developed, this code may
* need re-examination.
*/
error = 0;
clipcount = vw->clipcount;
if (clipcount == VIDEO_CLIP_BITMAP) {
/*
* In this case, the pointer (clips) is overloaded
* to be a "void *" to a bitmap, therefore there
* is no struct video_clip to copy now.
*/
} else if (clipcount > 0 && clipcount <= 16384) {
/*
* Clips points to list of clip rectangles, so
* copy the list.
*
* XXX: Upper limit of 16384 was used here to try to
* avoid cases when clipcount and clips pointer
* are uninitialized and therefore have high random
* values, as is the case in the Linux Skype
* application. The value 16384 was chosen as that
* is what is used in the Linux stradis(4) MPEG
* decoder driver, the only place we found an
* example of cliplist use.
*/
plvc = PTRIN(lvw->clips);
vw->clips = NULL;
ppvc = &(vw->clips);
while (clipcount-- > 0) {
if (plvc == NULL) {
error = EFAULT;
break;
} else {
error = linux_v4l_clip_copy(plvc, ppvc);
if (error) {
linux_v4l_cliplist_free(vw);
break;
}
}
ppvc = &((*ppvc)->next);
plvc = PTRIN(((struct l_video_clip *) plvc)->next);
}
} else {
/*
* clipcount == 0 or negative (but not VIDEO_CLIP_BITMAP)
* Force cliplist to null.
*/
vw->clipcount = 0;
vw->clips = NULL;
}
return (error);
}
#endif
static int
linux_ioctl_v4l(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct video_tuner vtun;
struct video_window vwin;
struct video_buffer vbuf;
struct video_code vcode;
struct l_video_tuner l_vtun;
struct l_video_window l_vwin;
struct l_video_buffer l_vbuf;
struct l_video_code l_vcode;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOCGCAP: args->cmd = VIDIOCGCAP; break;
case LINUX_VIDIOCGCHAN: args->cmd = VIDIOCGCHAN; break;
case LINUX_VIDIOCSCHAN: args->cmd = VIDIOCSCHAN; break;
case LINUX_VIDIOCGTUNER:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCGTUNER, &vtun, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_tuner(&vtun, &l_vtun);
error = copyout(&l_vtun, (void *) args->arg,
sizeof(l_vtun));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSTUNER:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCSTUNER, &vtun, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGPICT: args->cmd = VIDIOCGPICT; break;
case LINUX_VIDIOCSPICT: args->cmd = VIDIOCSPICT; break;
case LINUX_VIDIOCCAPTURE: args->cmd = VIDIOCCAPTURE; break;
case LINUX_VIDIOCGWIN:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGWIN, &vwin, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_window(&vwin, &l_vwin);
error = copyout(&l_vwin, (void *) args->arg,
sizeof(l_vwin));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSWIN:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = copyin((void *) args->arg, &l_vwin, sizeof(l_vwin));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_window(&l_vwin, &vwin);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
error = linux_v4l_cliplist_copy(&l_vwin, &vwin);
if (error) {
fdrop(fp, td);
return (error);
}
#endif
error = fo_ioctl(fp, VIDIOCSWIN, &vwin, td->td_ucred, td);
fdrop(fp, td);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
linux_v4l_cliplist_free(&vwin);
#endif
return (error);
case LINUX_VIDIOCGFBUF:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGFBUF, &vbuf, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_buffer(&vbuf, &l_vbuf);
error = copyout(&l_vbuf, (void *) args->arg,
sizeof(l_vbuf));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSFBUF:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = copyin((void *) args->arg, &l_vbuf, sizeof(l_vbuf));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_buffer(&l_vbuf, &vbuf);
error = fo_ioctl(fp, VIDIOCSFBUF, &vbuf, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCKEY: args->cmd = VIDIOCKEY; break;
case LINUX_VIDIOCGFREQ: args->cmd = VIDIOCGFREQ; break;
case LINUX_VIDIOCSFREQ: args->cmd = VIDIOCSFREQ; break;
case LINUX_VIDIOCGAUDIO: args->cmd = VIDIOCGAUDIO; break;
case LINUX_VIDIOCSAUDIO: args->cmd = VIDIOCSAUDIO; break;
case LINUX_VIDIOCSYNC: args->cmd = VIDIOCSYNC; break;
case LINUX_VIDIOCMCAPTURE: args->cmd = VIDIOCMCAPTURE; break;
case LINUX_VIDIOCGMBUF: args->cmd = VIDIOCGMBUF; break;
case LINUX_VIDIOCGUNIT: args->cmd = VIDIOCGUNIT; break;
case LINUX_VIDIOCGCAPTURE: args->cmd = VIDIOCGCAPTURE; break;
case LINUX_VIDIOCSCAPTURE: args->cmd = VIDIOCSCAPTURE; break;
case LINUX_VIDIOCSPLAYMODE: args->cmd = VIDIOCSPLAYMODE; break;
case LINUX_VIDIOCSWRITEMODE: args->cmd = VIDIOCSWRITEMODE; break;
case LINUX_VIDIOCGPLAYINFO: args->cmd = VIDIOCGPLAYINFO; break;
case LINUX_VIDIOCSMICROCODE:
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = copyin((void *) args->arg, &l_vcode, sizeof(l_vcode));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_code(&l_vcode, &vcode);
error = fo_ioctl(fp, VIDIOCSMICROCODE, &vcode, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGVBIFMT: args->cmd = VIDIOCGVBIFMT; break;
case LINUX_VIDIOCSVBIFMT: args->cmd = VIDIOCSVBIFMT; break;
default: return (ENOIOCTL);
}
error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Special ioctl handler
*/
static int
linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args)
{
int error;
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFADDR:
args->cmd = SIOCSIFADDR;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = sys_ioctl(td, (struct ioctl_args *)args);
break;
default:
error = ENOIOCTL;
}
return (error);
}
static int
linux_to_bsd_v4l2_standard(struct l_v4l2_standard *lvstd, struct v4l2_standard *vstd)
{
vstd->index = lvstd->index;
vstd->id = lvstd->id;
CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name));
memcpy(vstd->name, lvstd->name, sizeof(vstd->name));
vstd->frameperiod = lvstd->frameperiod;
vstd->framelines = lvstd->framelines;
CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved));
memcpy(vstd->reserved, lvstd->reserved, sizeof(vstd->reserved));
return (0);
}
static int
bsd_to_linux_v4l2_standard(struct v4l2_standard *vstd, struct l_v4l2_standard *lvstd)
{
lvstd->index = vstd->index;
lvstd->id = vstd->id;
CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name));
memcpy(lvstd->name, vstd->name, sizeof(lvstd->name));
lvstd->frameperiod = vstd->frameperiod;
lvstd->framelines = vstd->framelines;
CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved));
memcpy(lvstd->reserved, vstd->reserved, sizeof(lvstd->reserved));
return (0);
}
static int
linux_to_bsd_v4l2_buffer(struct l_v4l2_buffer *lvb, struct v4l2_buffer *vb)
{
vb->index = lvb->index;
vb->type = lvb->type;
vb->bytesused = lvb->bytesused;
vb->flags = lvb->flags;
vb->field = lvb->field;
vb->timestamp.tv_sec = lvb->timestamp.tv_sec;
vb->timestamp.tv_usec = lvb->timestamp.tv_usec;
memcpy(&vb->timecode, &lvb->timecode, sizeof (lvb->timecode));
vb->sequence = lvb->sequence;
vb->memory = lvb->memory;
if (lvb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
vb->m.userptr = (unsigned long)PTRIN(lvb->m.userptr);
else
vb->m.offset = lvb->m.offset;
vb->length = lvb->length;
vb->input = lvb->input;
vb->reserved = lvb->reserved;
return (0);
}
static int
bsd_to_linux_v4l2_buffer(struct v4l2_buffer *vb, struct l_v4l2_buffer *lvb)
{
lvb->index = vb->index;
lvb->type = vb->type;
lvb->bytesused = vb->bytesused;
lvb->flags = vb->flags;
lvb->field = vb->field;
lvb->timestamp.tv_sec = vb->timestamp.tv_sec;
lvb->timestamp.tv_usec = vb->timestamp.tv_usec;
memcpy(&lvb->timecode, &vb->timecode, sizeof (vb->timecode));
lvb->sequence = vb->sequence;
lvb->memory = vb->memory;
if (vb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
lvb->m.userptr = PTROUT(vb->m.userptr);
else
lvb->m.offset = vb->m.offset;
lvb->length = vb->length;
lvb->input = vb->input;
lvb->reserved = vb->reserved;
return (0);
}
static int
linux_to_bsd_v4l2_format(struct l_v4l2_format *lvf, struct v4l2_format *vf)
{
vf->type = lvf->type;
if (lvf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| lvf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return (EINVAL);
memcpy(&vf->fmt, &lvf->fmt, sizeof(vf->fmt));
return (0);
}
static int
bsd_to_linux_v4l2_format(struct v4l2_format *vf, struct l_v4l2_format *lvf)
{
lvf->type = vf->type;
if (vf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| vf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return (EINVAL);
memcpy(&lvf->fmt, &vf->fmt, sizeof(vf->fmt));
return (0);
}
static int
linux_ioctl_v4l2(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct v4l2_format vformat;
struct l_v4l2_format l_vformat;
struct v4l2_standard vstd;
struct l_v4l2_standard l_vstd;
struct l_v4l2_buffer l_vbuf;
struct v4l2_buffer vbuf;
struct v4l2_input vinp;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOC_RESERVED:
case LINUX_VIDIOC_LOG_STATUS:
if ((args->cmd & IOC_DIRMASK) != LINUX_IOC_VOID)
return (ENOIOCTL);
args->cmd = (args->cmd & 0xffff) | IOC_VOID;
break;
case LINUX_VIDIOC_OVERLAY:
case LINUX_VIDIOC_STREAMON:
case LINUX_VIDIOC_STREAMOFF:
case LINUX_VIDIOC_S_STD:
case LINUX_VIDIOC_S_TUNER:
case LINUX_VIDIOC_S_AUDIO:
case LINUX_VIDIOC_S_AUDOUT:
case LINUX_VIDIOC_S_MODULATOR:
case LINUX_VIDIOC_S_FREQUENCY:
case LINUX_VIDIOC_S_CROP:
case LINUX_VIDIOC_S_JPEGCOMP:
case LINUX_VIDIOC_S_PRIORITY:
case LINUX_VIDIOC_DBG_S_REGISTER:
case LINUX_VIDIOC_S_HW_FREQ_SEEK:
case LINUX_VIDIOC_SUBSCRIBE_EVENT:
case LINUX_VIDIOC_UNSUBSCRIBE_EVENT:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_IN;
break;
case LINUX_VIDIOC_QUERYCAP:
case LINUX_VIDIOC_G_STD:
case LINUX_VIDIOC_G_AUDIO:
case LINUX_VIDIOC_G_INPUT:
case LINUX_VIDIOC_G_OUTPUT:
case LINUX_VIDIOC_G_AUDOUT:
case LINUX_VIDIOC_G_JPEGCOMP:
case LINUX_VIDIOC_QUERYSTD:
case LINUX_VIDIOC_G_PRIORITY:
case LINUX_VIDIOC_QUERY_DV_PRESET:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_OUT;
break;
case LINUX_VIDIOC_ENUM_FMT:
case LINUX_VIDIOC_REQBUFS:
case LINUX_VIDIOC_G_PARM:
case LINUX_VIDIOC_S_PARM:
case LINUX_VIDIOC_G_CTRL:
case LINUX_VIDIOC_S_CTRL:
case LINUX_VIDIOC_G_TUNER:
case LINUX_VIDIOC_QUERYCTRL:
case LINUX_VIDIOC_QUERYMENU:
case LINUX_VIDIOC_S_INPUT:
case LINUX_VIDIOC_S_OUTPUT:
case LINUX_VIDIOC_ENUMOUTPUT:
case LINUX_VIDIOC_G_MODULATOR:
case LINUX_VIDIOC_G_FREQUENCY:
case LINUX_VIDIOC_CROPCAP:
case LINUX_VIDIOC_G_CROP:
case LINUX_VIDIOC_ENUMAUDIO:
case LINUX_VIDIOC_ENUMAUDOUT:
case LINUX_VIDIOC_G_SLICED_VBI_CAP:
#ifdef VIDIOC_ENUM_FRAMESIZES
case LINUX_VIDIOC_ENUM_FRAMESIZES:
case LINUX_VIDIOC_ENUM_FRAMEINTERVALS:
case LINUX_VIDIOC_ENCODER_CMD:
case LINUX_VIDIOC_TRY_ENCODER_CMD:
#endif
case LINUX_VIDIOC_DBG_G_REGISTER:
case LINUX_VIDIOC_DBG_G_CHIP_IDENT:
case LINUX_VIDIOC_ENUM_DV_PRESETS:
case LINUX_VIDIOC_S_DV_PRESET:
case LINUX_VIDIOC_G_DV_PRESET:
case LINUX_VIDIOC_S_DV_TIMINGS:
case LINUX_VIDIOC_G_DV_TIMINGS:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT;
break;
case LINUX_VIDIOC_G_FMT:
case LINUX_VIDIOC_S_FMT:
case LINUX_VIDIOC_TRY_FMT:
error = copyin((void *)args->arg, &l_vformat, sizeof(l_vformat));
if (error)
return (error);
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error)
return (error);
if (linux_to_bsd_v4l2_format(&l_vformat, &vformat) != 0)
error = EINVAL;
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_G_FMT)
error = fo_ioctl(fp, VIDIOC_G_FMT, &vformat,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_S_FMT)
error = fo_ioctl(fp, VIDIOC_S_FMT, &vformat,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_TRY_FMT, &vformat,
td->td_ucred, td);
bsd_to_linux_v4l2_format(&vformat, &l_vformat);
copyout(&l_vformat, (void *)args->arg, sizeof(l_vformat));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMSTD:
error = copyin((void *)args->arg, &l_vstd, sizeof(l_vstd));
if (error)
return (error);
linux_to_bsd_v4l2_standard(&l_vstd, &vstd);
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMSTD, (caddr_t)&vstd,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
bsd_to_linux_v4l2_standard(&vstd, &l_vstd);
error = copyout(&l_vstd, (void *)args->arg, sizeof(l_vstd));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMINPUT:
/*
* The Linux struct l_v4l2_input differs only in size:
* it has no padding at the end.
*/
error = copyin((void *)args->arg, &vinp,
sizeof(struct l_v4l2_input));
if (error != 0)
return (error);
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMINPUT, (caddr_t)&vinp,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
error = copyout(&vinp, (void *)args->arg,
sizeof(struct l_v4l2_input));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_QUERYBUF:
case LINUX_VIDIOC_QBUF:
case LINUX_VIDIOC_DQBUF:
error = copyin((void *)args->arg, &l_vbuf, sizeof(l_vbuf));
if (error)
return (error);
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error)
return (error);
linux_to_bsd_v4l2_buffer(&l_vbuf, &vbuf);
if ((args->cmd & 0xffff) == LINUX_VIDIOC_QUERYBUF)
error = fo_ioctl(fp, VIDIOC_QUERYBUF, &vbuf,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_QBUF)
error = fo_ioctl(fp, VIDIOC_QBUF, &vbuf,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_DQBUF, &vbuf,
td->td_ucred, td);
bsd_to_linux_v4l2_buffer(&vbuf, &l_vbuf);
copyout(&l_vbuf, (void *)args->arg, sizeof(l_vbuf));
fdrop(fp, td);
return (error);
/*
* XXX TODO - these need 32 -> 64 bit conversion:
* (are any of them needed for webcams?)
*/
case LINUX_VIDIOC_G_FBUF:
case LINUX_VIDIOC_S_FBUF:
case LINUX_VIDIOC_G_EXT_CTRLS:
case LINUX_VIDIOC_S_EXT_CTRLS:
case LINUX_VIDIOC_TRY_EXT_CTRLS:
case LINUX_VIDIOC_DQEVENT:
default: return (ENOIOCTL);
}
error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Support for emulators/linux-libusb. This port uses FBSD_LUSB* macros
* instead of USB* ones. This lets us provide correct values for cmd.
* The 0xffffffe0 -- 0xffffffff range seemed to be the least collision-prone.
*/
static int
linux_ioctl_fbsd_usb(struct thread *td, struct linux_ioctl_args *args)
{
int error;
error = 0;
switch (args->cmd) {
case FBSD_LUSB_DEVICEENUMERATE:
args->cmd = USB_DEVICEENUMERATE;
break;
case FBSD_LUSB_DEV_QUIRK_ADD:
args->cmd = USB_DEV_QUIRK_ADD;
break;
case FBSD_LUSB_DEV_QUIRK_GET:
args->cmd = USB_DEV_QUIRK_GET;
break;
case FBSD_LUSB_DEV_QUIRK_REMOVE:
args->cmd = USB_DEV_QUIRK_REMOVE;
break;
case FBSD_LUSB_DO_REQUEST:
args->cmd = USB_DO_REQUEST;
break;
case FBSD_LUSB_FS_CLEAR_STALL_SYNC:
args->cmd = USB_FS_CLEAR_STALL_SYNC;
break;
case FBSD_LUSB_FS_CLOSE:
args->cmd = USB_FS_CLOSE;
break;
case FBSD_LUSB_FS_COMPLETE:
args->cmd = USB_FS_COMPLETE;
break;
case FBSD_LUSB_FS_INIT:
args->cmd = USB_FS_INIT;
break;
case FBSD_LUSB_FS_OPEN:
args->cmd = USB_FS_OPEN;
break;
case FBSD_LUSB_FS_START:
args->cmd = USB_FS_START;
break;
case FBSD_LUSB_FS_STOP:
args->cmd = USB_FS_STOP;
break;
case FBSD_LUSB_FS_UNINIT:
args->cmd = USB_FS_UNINIT;
break;
case FBSD_LUSB_GET_CONFIG:
args->cmd = USB_GET_CONFIG;
break;
case FBSD_LUSB_GET_DEVICEINFO:
args->cmd = USB_GET_DEVICEINFO;
break;
case FBSD_LUSB_GET_DEVICE_DESC:
args->cmd = USB_GET_DEVICE_DESC;
break;
case FBSD_LUSB_GET_FULL_DESC:
args->cmd = USB_GET_FULL_DESC;
break;
case FBSD_LUSB_GET_IFACE_DRIVER:
args->cmd = USB_GET_IFACE_DRIVER;
break;
case FBSD_LUSB_GET_PLUGTIME:
args->cmd = USB_GET_PLUGTIME;
break;
case FBSD_LUSB_GET_POWER_MODE:
args->cmd = USB_GET_POWER_MODE;
break;
case FBSD_LUSB_GET_REPORT_DESC:
args->cmd = USB_GET_REPORT_DESC;
break;
case FBSD_LUSB_GET_REPORT_ID:
args->cmd = USB_GET_REPORT_ID;
break;
case FBSD_LUSB_GET_TEMPLATE:
args->cmd = USB_GET_TEMPLATE;
break;
case FBSD_LUSB_IFACE_DRIVER_ACTIVE:
args->cmd = USB_IFACE_DRIVER_ACTIVE;
break;
case FBSD_LUSB_IFACE_DRIVER_DETACH:
args->cmd = USB_IFACE_DRIVER_DETACH;
break;
case FBSD_LUSB_QUIRK_NAME_GET:
args->cmd = USB_QUIRK_NAME_GET;
break;
case FBSD_LUSB_READ_DIR:
args->cmd = USB_READ_DIR;
break;
case FBSD_LUSB_SET_ALTINTERFACE:
args->cmd = USB_SET_ALTINTERFACE;
break;
case FBSD_LUSB_SET_CONFIG:
args->cmd = USB_SET_CONFIG;
break;
case FBSD_LUSB_SET_IMMED:
args->cmd = USB_SET_IMMED;
break;
case FBSD_LUSB_SET_POWER_MODE:
args->cmd = USB_SET_POWER_MODE;
break;
case FBSD_LUSB_SET_TEMPLATE:
args->cmd = USB_SET_TEMPLATE;
break;
case FBSD_LUSB_FS_OPEN_STREAM:
args->cmd = USB_FS_OPEN_STREAM;
break;
case FBSD_LUSB_GET_DEV_PORT_PATH:
args->cmd = USB_GET_DEV_PORT_PATH;
break;
case FBSD_LUSB_GET_POWER_USAGE:
args->cmd = USB_GET_POWER_USAGE;
break;
case FBSD_LUSB_DEVICESTATS:
args->cmd = USB_DEVICESTATS;
break;
default:
error = ENOIOCTL;
}
if (error != ENOIOCTL)
error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Some evdev ioctls must be translated.
* - EVIOCGMTSLOTS is an IOC_READ ioctl on Linux although it has input data
* (must be IOC_INOUT on FreeBSD).
* - On Linux, EVIOCGRAB, EVIOCREVOKE and EVIOCRMFF are defined as _IOW with
* an int argument. You don't pass an int pointer to the ioctl(), however,
* but just the int directly. On FreeBSD, they are defined as _IOWINT for
* this to work.
*/
static int
linux_ioctl_evdev(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
clockid_t clock;
int error;
args->cmd = SETDIR(args->cmd);
switch (args->cmd) {
case (EVIOCGRAB & ~IOC_DIRMASK) | IOC_IN:
args->cmd = EVIOCGRAB;
break;
case (EVIOCREVOKE & ~IOC_DIRMASK) | IOC_IN:
args->cmd = EVIOCREVOKE;
break;
case (EVIOCRMFF & ~IOC_DIRMASK) | IOC_IN:
args->cmd = EVIOCRMFF;
break;
case EVIOCSCLOCKID: {
error = copyin(PTRIN(args->arg), &clock, sizeof(clock));
if (error != 0)
return (error);
if (clock & ~(LINUX_IOCTL_EVDEV_CLK))
return (EINVAL);
error = linux_to_native_clockid(&clock, clock);
if (error != 0)
return (error);
error = fget(td, args->fd,
&cap_ioctl_rights, &fp);
if (error != 0)
return (error);
error = fo_ioctl(fp, EVIOCSCLOCKID, &clock, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
default:
break;
}
if (IOCBASECMD(args->cmd) ==
((EVIOCGMTSLOTS(0) & ~IOC_DIRMASK) | IOC_OUT))
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT;
return (sys_ioctl(td, (struct ioctl_args *)args));
}
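/*
 * Userland view of the convention described above (a sketch, not part
 * of the patched file): on Linux EVIOCGRAB is declared with _IOW() and
 * an int argument, yet callers pass the integer by value rather than a
 * pointer.  The _IOWINT definitions on the FreeBSD side allow the
 * translated command to accept the same by-value argument.
 */
#if 0	/* example only; Linux userland code, not kernel code */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/input.h>

static int
grab_evdev(const char *path)
{
	int fd, error;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return (-1);
	error = ioctl(fd, EVIOCGRAB, 1);	/* int passed by value */
	if (error == 0)
		(void)ioctl(fd, EVIOCGRAB, 0);	/* release the grab again */
	close(fd);
	return (error);
}
#endif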
static int
linux_ioctl_kcov(struct thread *td, struct linux_ioctl_args *args)
{
int error;
error = 0;
switch (args->cmd & 0xffff) {
case LINUX_KCOV_INIT_TRACE:
args->cmd = KIOSETBUFSIZE;
break;
case LINUX_KCOV_ENABLE:
args->cmd = KIOENABLE;
if (args->arg == 0)
args->arg = KCOV_MODE_TRACE_PC;
else if (args->arg == 1)
args->arg = KCOV_MODE_TRACE_CMP;
else
error = EINVAL;
break;
case LINUX_KCOV_DISABLE:
args->cmd = KIODISABLE;
break;
default:
error = ENOTTY;
break;
}
if (error == 0)
error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* main ioctl syscall function
*/
static int
linux_ioctl_fallback(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
struct linux_ioctl_handler_element *he;
int error, cmd;
error = fget(td, args->fd, &cap_ioctl_rights, &fp);
if (error != 0)
return (error);
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
/* Iterate over the ioctl handlers */
cmd = args->cmd & 0xffff;
sx_slock(&linux_ioctl_sx);
mtx_lock(&Giant);
#ifdef COMPAT_LINUX32
TAILQ_FOREACH(he, &linux32_ioctl_handlers, list) {
if (cmd >= he->low && cmd <= he->high) {
error = (*he->func)(td, args);
if (error != ENOIOCTL) {
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
return (error);
}
}
}
#endif
TAILQ_FOREACH(he, &linux_ioctl_handlers, list) {
if (cmd >= he->low && cmd <= he->high) {
error = (*he->func)(td, args);
if (error != ENOIOCTL) {
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
return (error);
}
}
}
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
switch (args->cmd & 0xffff) {
case LINUX_BTRFS_IOC_CLONE:
case LINUX_F2FS_IOC_GET_FEATURES:
case LINUX_FS_IOC_FIEMAP:
return (ENOTSUP);
default:
linux_msg(td, "%s fd=%d, cmd=0x%x ('%c',%d) is not implemented",
__func__, args->fd, args->cmd,
(int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff));
break;
}
return (EINVAL);
}
int
linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{
struct linux_ioctl_handler *handler;
int error, cmd, i;
cmd = args->cmd & 0xffff;
/*
* Array of ioctls known at compilation time. This elides a lot of work on
* each call compared to the list variant. Everything frequently used
* should be moved here.
*
* Arguably the magic creating the list should create an array instead.
*
* For now just a linear scan.
*/
for (i = 0; i < nitems(linux_ioctls); i++) {
handler = &linux_ioctls[i];
if (cmd >= handler->low && cmd <= handler->high) {
error = (*handler->func)(td, args);
if (error != ENOIOCTL) {
return (error);
}
}
}
return (linux_ioctl_fallback(td, args));
}
int
linux_ioctl_register_handler(struct linux_ioctl_handler *h)
{
struct linux_ioctl_handler_element *he, *cur;
if (h == NULL || h->func == NULL)
return (EINVAL);
/*
* Reuse the element if the handler is already on the list, otherwise
* create a new element.
*/
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &linux_ioctl_handlers, list) {
if (he->func == h->func)
break;
}
if (he == NULL) {
he = malloc(sizeof(*he),
M_LINUX, M_WAITOK);
he->func = h->func;
} else
TAILQ_REMOVE(&linux_ioctl_handlers, he, list);
/* Initialize range information. */
he->low = h->low;
he->high = h->high;
he->span = h->high - h->low + 1;
/* Add the element to the list, sorted on span. */
TAILQ_FOREACH(cur, &linux_ioctl_handlers, list) {
if (cur->span > he->span) {
TAILQ_INSERT_BEFORE(cur, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
}
TAILQ_INSERT_TAIL(&linux_ioctl_handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
int
linux_ioctl_unregister_handler(struct linux_ioctl_handler *h)
{
struct linux_ioctl_handler_element *he;
if (h == NULL || h->func == NULL)
return (EINVAL);
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &linux_ioctl_handlers, list) {
if (he->func == h->func) {
TAILQ_REMOVE(&linux_ioctl_handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
free(he, M_LINUX);
return (0);
}
}
sx_xunlock(&linux_ioctl_sx);
return (EINVAL);
}
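/*
 * Minimal sketch of how a subsystem hooks into the list managed above
 * (the handler function and command range are hypothetical; only the
 * registration interface comes from this file).  Handlers are kept
 * sorted by span and linux_ioctl_fallback() consults them in that
 * order, so a narrower range takes precedence over a broader one.
 */
#if 0	/* example only */
static int
example_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{

	/* Decline the command so other handlers can try it. */
	return (ENOIOCTL);
}

static struct linux_ioctl_handler example_handler = {
	.func = example_linux_ioctl,
	.low = 0x4500,		/* hypothetical command range */
	.high = 0x45ff,
};

static void
example_load(void)
{

	(void)linux_ioctl_register_handler(&example_handler);
}

static void
example_unload(void)
{

	(void)linux_ioctl_unregister_handler(&example_handler);
}
#endif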
#ifdef COMPAT_LINUX32
int
linux32_ioctl_register_handler(struct linux_ioctl_handler *h)
{
struct linux_ioctl_handler_element *he, *cur;
if (h == NULL || h->func == NULL)
return (EINVAL);
/*
* Reuse the element if the handler is already on the list, otherwise
* create a new element.
*/
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &linux32_ioctl_handlers, list) {
if (he->func == h->func)
break;
}
if (he == NULL) {
he = malloc(sizeof(*he), M_LINUX, M_WAITOK);
he->func = h->func;
} else
TAILQ_REMOVE(&linux32_ioctl_handlers, he, list);
/* Initialize range information. */
he->low = h->low;
he->high = h->high;
he->span = h->high - h->low + 1;
/* Add the element to the list, sorted on span. */
TAILQ_FOREACH(cur, &linux32_ioctl_handlers, list) {
if (cur->span > he->span) {
TAILQ_INSERT_BEFORE(cur, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
}
TAILQ_INSERT_TAIL(&linux32_ioctl_handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
int
linux32_ioctl_unregister_handler(struct linux_ioctl_handler *h)
{
struct linux_ioctl_handler_element *he;
if (h == NULL || h->func == NULL)
return (EINVAL);
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &linux32_ioctl_handlers, list) {
if (he->func == h->func) {
TAILQ_REMOVE(&linux32_ioctl_handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
free(he, M_LINUX);
return (0);
}
}
sx_xunlock(&linux_ioctl_sx);
return (EINVAL);
}
#endif
diff --git a/sys/conf/options b/sys/conf/options
index c2773adee472..d773aa5e6d38 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -1,1015 +1,1015 @@
# $FreeBSD$
#
# On the handling of kernel options
#
# All kernel options should be listed in NOTES, with suitable
# descriptions. Negative options (options that make some code not
# compile) should be commented out; LINT (generated from NOTES) should
# compile as much code as possible. Try to structure option-using
# code so that a single option only switches code on, or only switches
# code off, to make it possible to have a full compile-test. If
# necessary, you can check for COMPILING_LINT to get maximum code
# coverage.
#
# All new options shall also be listed in either "conf/options" or
# "conf/options.<machine>". Options that affect a single source-file
# <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options
# that affect multiple files should either go in "opt_global.h" if
# this is a kernel-wide option (used just about everywhere), or in
# "opt_<option-name-in-lower-case>.h" if it affects only some files.
# Note that the effect of listing only an option without a
# header-file-name in conf/options (and cousins) is that the last
# convention is followed.
#
# This handling scheme is not yet fully implemented.
#
#
# Format of this file:
# Option name filename
#
# If filename is missing, the default is
# opt_<name-of-option-in-lower-case>.h
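#
# Example (hypothetical option, for illustration only): listing
# "FOO_DEBUG opt_foo.h" here and putting "options FOO_DEBUG" in a kernel
# config file makes config(8) emit "#define FOO_DEBUG 1" into opt_foo.h;
# the same option listed without a filename would instead land in
# opt_foo_debug.h.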
AAC_DEBUG opt_aac.h
AACRAID_DEBUG opt_aacraid.h
AHC_ALLOW_MEMIO opt_aic7xxx.h
AHC_TMODE_ENABLE opt_aic7xxx.h
AHC_DUMP_EEPROM opt_aic7xxx.h
AHC_DEBUG opt_aic7xxx.h
AHC_DEBUG_OPTS opt_aic7xxx.h
AHC_REG_PRETTY_PRINT opt_aic7xxx.h
AHD_DEBUG opt_aic79xx.h
AHD_DEBUG_OPTS opt_aic79xx.h
AHD_TMODE_ENABLE opt_aic79xx.h
AHD_REG_PRETTY_PRINT opt_aic79xx.h
TWA_DEBUG opt_twa.h
# Debugging options.
ALT_BREAK_TO_DEBUGGER opt_kdb.h
BREAK_TO_DEBUGGER opt_kdb.h
BUF_TRACKING opt_global.h
DDB
DDB_BUFR_SIZE opt_ddb.h
DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h
DDB_CAPTURE_MAXBUFSIZE opt_ddb.h
DDB_CTF opt_ddb.h
DDB_NUMSYM opt_ddb.h
EARLY_PRINTF opt_global.h
FULL_BUF_TRACKING opt_global.h
GDB
KDB opt_global.h
KDB_TRACE opt_kdb.h
KDB_UNATTENDED opt_kdb.h
KLD_DEBUG opt_kld.h
NUM_CORE_FILES opt_global.h
QUEUE_MACRO_DEBUG_TRACE opt_global.h
QUEUE_MACRO_DEBUG_TRASH opt_global.h
SYSCTL_DEBUG opt_sysctl.h
TEXTDUMP_PREFERRED opt_ddb.h
TEXTDUMP_VERBOSE opt_ddb.h
TSLOG opt_global.h
TSLOGSIZE opt_global.h
# Miscellaneous options.
ALQ
ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h
ATSE_CFI_HACK opt_cfi.h
AUDIT opt_global.h
BOOTHOWTO opt_global.h
BOOTVERBOSE opt_global.h
CALLOUT_PROFILING
CAPABILITIES opt_capsicum.h
CAPABILITY_MODE opt_capsicum.h
COMPAT_43 opt_global.h
COMPAT_43TTY opt_global.h
COMPAT_FREEBSD4 opt_global.h
COMPAT_FREEBSD5 opt_global.h
COMPAT_FREEBSD6 opt_global.h
COMPAT_FREEBSD7 opt_global.h
COMPAT_FREEBSD9 opt_global.h
COMPAT_FREEBSD10 opt_global.h
COMPAT_FREEBSD11 opt_global.h
COMPAT_FREEBSD12 opt_global.h
COMPAT_CLOUDABI32 opt_dontuse.h
COMPAT_CLOUDABI64 opt_dontuse.h
COMPAT_LINUXKPI opt_dontuse.h
_COMPAT_LINUX32 opt_compat.h # XXX: make sure opt_compat.h exists
COMPILING_LINT opt_global.h
CY_PCI_FASTINTR
DEADLKRES opt_watchdog.h
EXPERIMENTAL opt_global.h
EXT_RESOURCES opt_global.h
DIRECTIO
FILEMON opt_dontuse.h
FFCLOCK
FULL_PREEMPTION opt_sched.h
GZIO opt_gzio.h
IMAGACT_BINMISC opt_dontuse.h
IPI_PREEMPTION opt_sched.h
GEOM_BDE opt_geom.h
GEOM_CACHE opt_geom.h
GEOM_CONCAT opt_geom.h
GEOM_ELI opt_geom.h
GEOM_GATE opt_geom.h
GEOM_JOURNAL opt_geom.h
GEOM_LABEL opt_geom.h
GEOM_LABEL_GPT opt_geom.h
GEOM_LINUX_LVM opt_geom.h
GEOM_MAP opt_geom.h
GEOM_MIRROR opt_geom.h
GEOM_MOUNTVER opt_geom.h
GEOM_MULTIPATH opt_geom.h
GEOM_NOP opt_geom.h
GEOM_PART_APM opt_geom.h
GEOM_PART_BSD opt_geom.h
GEOM_PART_BSD64 opt_geom.h
GEOM_PART_EBR opt_geom.h
GEOM_PART_GPT opt_geom.h
GEOM_PART_LDM opt_geom.h
GEOM_PART_MBR opt_geom.h
GEOM_PART_VTOC8 opt_geom.h
GEOM_RAID opt_geom.h
GEOM_RAID3 opt_geom.h
GEOM_SHSEC opt_geom.h
GEOM_STRIPE opt_geom.h
GEOM_UZIP opt_geom.h
GEOM_UZIP_DEBUG opt_geom.h
GEOM_VINUM opt_geom.h
GEOM_VIRSTOR opt_geom.h
GEOM_ZERO opt_geom.h
IFLIB opt_iflib.h
KDTRACE_HOOKS opt_global.h
KDTRACE_FRAME opt_kdtrace.h
KN_HASHSIZE opt_kqueue.h
KSTACK_MAX_PAGES
KSTACK_PAGES
KSTACK_USAGE_PROF
KTRACE
KTRACE_REQUEST_POOL opt_ktrace.h
LIBICONV
MAC opt_global.h
MAC_BIBA opt_dontuse.h
MAC_BSDEXTENDED opt_dontuse.h
MAC_IFOFF opt_dontuse.h
MAC_LOMAC opt_dontuse.h
MAC_MLS opt_dontuse.h
MAC_NONE opt_dontuse.h
MAC_NTPD opt_dontuse.h
MAC_PARTITION opt_dontuse.h
MAC_PORTACL opt_dontuse.h
MAC_SEEOTHERUIDS opt_dontuse.h
MAC_STATIC opt_mac.h
MAC_STUB opt_dontuse.h
MAC_TEST opt_dontuse.h
MAC_VERIEXEC opt_dontuse.h
MAC_VERIEXEC_SHA1 opt_dontuse.h
MAC_VERIEXEC_SHA256 opt_dontuse.h
MAC_VERIEXEC_SHA384 opt_dontuse.h
MAC_VERIEXEC_SHA512 opt_dontuse.h
MD_ROOT opt_md.h
MD_ROOT_FSTYPE opt_md.h
MD_ROOT_READONLY opt_md.h
MD_ROOT_SIZE opt_md.h
MD_ROOT_MEM opt_md.h
MFI_DEBUG opt_mfi.h
MFI_DECODE_LOG opt_mfi.h
MPROF_BUFFERS opt_mprof.h
MPROF_HASH_SIZE opt_mprof.h
NEW_PCIB opt_global.h
NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h
NO_ADAPTIVE_RWLOCKS
NO_ADAPTIVE_SX
NO_OBSOLETE_CODE opt_global.h
NO_SYSCTL_DESCR opt_global.h
NSWBUF_MIN opt_param.h
MBUF_PACKET_ZONE_DISABLE opt_global.h
PANIC_REBOOT_WAIT_TIME opt_panic.h
PCI_HP opt_pci.h
PCI_IOV opt_global.h
PPC_DEBUG opt_ppc.h
PPC_PROBE_CHIPSET opt_ppc.h
PPS_SYNC opt_ntp.h
PREEMPTION opt_sched.h
QUOTA
SCHED_4BSD opt_sched.h
SCHED_STATS opt_sched.h
SCHED_ULE opt_sched.h
SLEEPQUEUE_PROFILING
SLHCI_DEBUG opt_slhci.h
STACK opt_stack.h
SUIDDIR
MSGMNB opt_sysvipc.h
MSGMNI opt_sysvipc.h
MSGSEG opt_sysvipc.h
MSGSSZ opt_sysvipc.h
MSGTQL opt_sysvipc.h
SEMMNI opt_sysvipc.h
SEMMNS opt_sysvipc.h
SEMMNU opt_sysvipc.h
SEMMSL opt_sysvipc.h
SEMOPM opt_sysvipc.h
SEMUME opt_sysvipc.h
SHMALL opt_sysvipc.h
SHMMAX opt_sysvipc.h
SHMMAXPGS opt_sysvipc.h
SHMMIN opt_sysvipc.h
SHMMNI opt_sysvipc.h
SHMSEG opt_sysvipc.h
SYSVMSG opt_sysvipc.h
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h
VERBOSE_SYSINIT
ZSTDIO opt_zstdio.h
# Sanitizers
COVERAGE opt_global.h
KCOV
KCSAN opt_global.h
KUBSAN opt_global.h
# POSIX kernel options
P1003_1B_MQUEUE opt_posix.h
P1003_1B_SEMAPHORES opt_posix.h
_KPOSIX_PRIORITY_SCHEDULING opt_posix.h
# Do we want the config file compiled into the kernel?
INCLUDE_CONFIG_FILE opt_config.h
# Options for static filesystems. These should only be used at config
# time, since the corresponding lkms cannot work if there are any static
# dependencies. Unusability is enforced by hiding the defines for the
# options in a never-included header.
AUTOFS opt_dontuse.h
CD9660 opt_dontuse.h
EXT2FS opt_dontuse.h
FDESCFS opt_dontuse.h
FFS opt_dontuse.h
FUSEFS opt_dontuse.h
MSDOSFS opt_dontuse.h
NULLFS opt_dontuse.h
PROCFS opt_dontuse.h
PSEUDOFS opt_dontuse.h
SMBFS opt_dontuse.h
TMPFS opt_dontuse.h
UDF opt_dontuse.h
UNIONFS opt_dontuse.h
ZFS opt_dontuse.h
# Pseudofs debugging
PSEUDOFS_TRACE opt_pseudofs.h
# In-kernel GSS-API
KGSSAPI opt_kgssapi.h
KGSSAPI_DEBUG opt_kgssapi.h
# These static filesystems have one slightly bogus static dependency in
# sys/i386/i386/autoconf.c. If any of these filesystems are
# statically compiled into the kernel, code for mounting them as root
# filesystems will be enabled - but look below.
# NFSCL - client
# NFSD - server
NFSCL opt_nfs.h
NFSD opt_nfs.h
# filesystems and libiconv bridge
CD9660_ICONV opt_dontuse.h
MSDOSFS_ICONV opt_dontuse.h
UDF_ICONV opt_dontuse.h
# If you are following the conditions in the copyright,
# you can enable soft-updates which will speed up a lot of things
# and make the system safer from crashes at the same time.
# Otherwise a STUB module will be compiled in.
SOFTUPDATES opt_ffs.h
# On small, embedded systems, it can be useful to turn off support for
# snapshots. It saves about 30-40k for a feature that would be lightly
# used, if it is used at all.
NO_FFS_SNAPSHOT opt_ffs.h
# Enabling this option turns on support for Access Control Lists in UFS,
# which can be used to support high security configurations. Depends on
# UFS_EXTATTR.
UFS_ACL opt_ufs.h
# Enabling this option turns on support for extended attributes in UFS-based
# filesystems, which can be used to support high security configurations
# as well as new filesystem features.
UFS_EXTATTR opt_ufs.h
UFS_EXTATTR_AUTOSTART opt_ufs.h
# Enable fast hash lookups for large directories on UFS-based filesystems.
UFS_DIRHASH opt_ufs.h
# Enable gjournal-based UFS journal.
UFS_GJOURNAL opt_ufs.h
# We plan to remove the static dependencies above, with a
# <filesystem>_ROOT option to control if it is usable as root. This list
# allows these options to be present in config files already (though
# they won't make any difference yet).
NFS_ROOT opt_nfsroot.h
# SMB/CIFS requester
NETSMB opt_netsmb.h
# Enable debugnet(4) networking support.
DEBUGNET opt_global.h
# Enable netdump(4) client support.
NETDUMP opt_global.h
# Enable netgdb(4) support.
NETGDB opt_global.h
# Options used only in subr_param.c.
HZ opt_param.h
MAXFILES opt_param.h
NBUF opt_param.h
NSFBUFS opt_param.h
VM_BCACHE_SIZE_MAX opt_param.h
VM_SWZONE_SIZE_MAX opt_param.h
MAXUSERS
DFLDSIZ opt_param.h
MAXDSIZ opt_param.h
MAXSSIZ opt_param.h
# Generic SCSI options.
CAM_MAX_HIGHPOWER opt_cam.h
CAMDEBUG opt_cam.h
CAM_DEBUG_COMPILE opt_cam.h
CAM_DEBUG_DELAY opt_cam.h
CAM_DEBUG_BUS opt_cam.h
CAM_DEBUG_TARGET opt_cam.h
CAM_DEBUG_LUN opt_cam.h
CAM_DEBUG_FLAGS opt_cam.h
CAM_BOOT_DELAY opt_cam.h
CAM_IOSCHED_DYNAMIC opt_cam.h
CAM_IO_STATS opt_cam.h
CAM_TEST_FAILURE opt_cam.h
SCSI_DELAY opt_scsi.h
SCSI_NO_SENSE_STRINGS opt_scsi.h
SCSI_NO_OP_STRINGS opt_scsi.h
# Options used only in cam/ata/ata_da.c
ATA_STATIC_ID opt_ada.h
# Options used only in cam/scsi/scsi_cd.c
CHANGER_MIN_BUSY_SECONDS opt_cd.h
CHANGER_MAX_BUSY_SECONDS opt_cd.h
# Options used only in cam/scsi/scsi_da.c
DA_TRACK_REFS opt_da.h
# Options used only in cam/scsi/scsi_sa.c.
SA_IO_TIMEOUT opt_sa.h
SA_SPACE_TIMEOUT opt_sa.h
SA_REWIND_TIMEOUT opt_sa.h
SA_ERASE_TIMEOUT opt_sa.h
SA_1FM_AT_EOD opt_sa.h
# Options used only in cam/scsi/scsi_pt.c
SCSI_PT_DEFAULT_TIMEOUT opt_pt.h
# Options used only in cam/scsi/scsi_ses.c
SES_ENABLE_PASSTHROUGH opt_ses.h
# Options used in dev/sym/ (Symbios SCSI driver).
SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885
# disabled:0 (default), enabled:1
SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking
# disabled:0, enabled:1 (default)
SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported
# default:8, range:[1..64]
# Options used only in dev/isp/*
ISP_TARGET_MODE opt_isp.h
ISP_FW_CRASH_DUMP opt_isp.h
ISP_DEFAULT_ROLES opt_isp.h
ISP_INTERNAL_TARGET opt_isp.h
ISP_FCTAPE_OFF opt_isp.h
# Options used only in dev/iscsi
ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h
# Net stuff.
ACCEPT_FILTER_DATA
ACCEPT_FILTER_DNS
ACCEPT_FILTER_HTTP
ALTQ opt_global.h
ALTQ_CBQ opt_altq.h
ALTQ_CDNR opt_altq.h
ALTQ_CODEL opt_altq.h
ALTQ_DEBUG opt_altq.h
ALTQ_HFSC opt_altq.h
ALTQ_FAIRQ opt_altq.h
ALTQ_NOPCC opt_altq.h
ALTQ_PRIQ opt_altq.h
ALTQ_RED opt_altq.h
ALTQ_RIO opt_altq.h
BOOTP opt_bootp.h
BOOTP_BLOCKSIZE opt_bootp.h
BOOTP_COMPAT opt_bootp.h
BOOTP_NFSROOT opt_bootp.h
BOOTP_NFSV3 opt_bootp.h
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
RATELIMIT opt_ratelimit.h
RATELIMIT_DEBUG opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
STATS opt_global.h
IPDIVERT
IPFILTER opt_ipfilter.h
IPFILTER_DEFAULT_BLOCK opt_ipfilter.h
IPFILTER_LOG opt_ipfilter.h
IPFILTER_LOOKUP opt_ipfilter.h
IPFIREWALL opt_ipfw.h
IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h
IPFIREWALL_NAT opt_ipfw.h
IPFIREWALL_NAT64 opt_ipfw.h
IPFIREWALL_NPTV6 opt_ipfw.h
IPFIREWALL_VERBOSE opt_ipfw.h
IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h
IPFIREWALL_PMOD opt_ipfw.h
IPSEC opt_ipsec.h
IPSEC_DEBUG opt_ipsec.h
IPSEC_SUPPORT opt_ipsec.h
IPSTEALTH
KERN_TLS
KRPC
LIBALIAS
LIBMCHAIN
MBUF_PROFILING
MBUF_STRESS_TEST
MROUTING opt_mrouting.h
NFSLOCKD
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
TCPDEBUG
TCPPCAP opt_global.h
SIFTR
TCP_BLACKBOX opt_global.h
TCP_HHOOK opt_inet.h
TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading
TCP_RFC7413 opt_inet.h
TCP_RFC7413_MAX_KEYS opt_inet.h
TCP_RFC7413_MAX_PSKS opt_inet.h
TCP_SIGNATURE opt_ipsec.h
VLAN_ARRAY opt_vlan.h
XDR
XBONEHACK
#
# SCTP
#
SCTP opt_sctp.h
SCTP_SUPPORT opt_sctp.h
SCTP_DEBUG opt_sctp.h # Enable debug printfs
SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity
SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf alloc/free
SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity
SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets
SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed
SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns.
SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats.
SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs
SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl
SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats
#
#
#
# Netgraph(4). Use option NETGRAPH to enable the base netgraph code.
# Each netgraph node type can either be compiled into the kernel
# or loaded dynamically. To get the former, include the corresponding
# option below. Each type has its own man page, e.g. ng_async(4).
NETGRAPH
NETGRAPH_DEBUG opt_netgraph.h
NETGRAPH_ASYNC opt_netgraph.h
NETGRAPH_ATMLLC opt_netgraph.h
NETGRAPH_ATM_ATMPIF opt_netgraph.h
NETGRAPH_BLUETOOTH opt_netgraph.h
NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h
NETGRAPH_BLUETOOTH_H4 opt_netgraph.h
NETGRAPH_BLUETOOTH_HCI opt_netgraph.h
NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h
NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h
NETGRAPH_BLUETOOTH_UBT opt_netgraph.h
NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h
NETGRAPH_BPF opt_netgraph.h
NETGRAPH_BRIDGE opt_netgraph.h
NETGRAPH_CAR opt_netgraph.h
NETGRAPH_CHECKSUM opt_netgraph.h
NETGRAPH_CISCO opt_netgraph.h
NETGRAPH_DEFLATE opt_netgraph.h
NETGRAPH_DEVICE opt_netgraph.h
NETGRAPH_ECHO opt_netgraph.h
NETGRAPH_EIFACE opt_netgraph.h
NETGRAPH_ETHER opt_netgraph.h
NETGRAPH_ETHER_ECHO opt_netgraph.h
NETGRAPH_FEC opt_netgraph.h
NETGRAPH_FRAME_RELAY opt_netgraph.h
NETGRAPH_GIF opt_netgraph.h
NETGRAPH_GIF_DEMUX opt_netgraph.h
NETGRAPH_HOLE opt_netgraph.h
NETGRAPH_IFACE opt_netgraph.h
NETGRAPH_IP_INPUT opt_netgraph.h
NETGRAPH_IPFW opt_netgraph.h
NETGRAPH_KSOCKET opt_netgraph.h
NETGRAPH_L2TP opt_netgraph.h
NETGRAPH_LMI opt_netgraph.h
NETGRAPH_MPPC_COMPRESSION opt_netgraph.h
NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h
NETGRAPH_NAT opt_netgraph.h
NETGRAPH_NETFLOW opt_netgraph.h
NETGRAPH_ONE2MANY opt_netgraph.h
NETGRAPH_PATCH opt_netgraph.h
NETGRAPH_PIPE opt_netgraph.h
NETGRAPH_PPP opt_netgraph.h
NETGRAPH_PPPOE opt_netgraph.h
NETGRAPH_PPTPGRE opt_netgraph.h
NETGRAPH_PRED1 opt_netgraph.h
NETGRAPH_RFC1490 opt_netgraph.h
NETGRAPH_SOCKET opt_netgraph.h
NETGRAPH_SPLIT opt_netgraph.h
NETGRAPH_SPPP opt_netgraph.h
NETGRAPH_TAG opt_netgraph.h
NETGRAPH_TCPMSS opt_netgraph.h
NETGRAPH_TEE opt_netgraph.h
NETGRAPH_TTY opt_netgraph.h
NETGRAPH_UI opt_netgraph.h
NETGRAPH_VJC opt_netgraph.h
NETGRAPH_VLAN opt_netgraph.h
# NgATM options
NGATM_ATM opt_netgraph.h
NGATM_ATMBASE opt_netgraph.h
NGATM_SSCOP opt_netgraph.h
NGATM_SSCFU opt_netgraph.h
NGATM_UNI opt_netgraph.h
NGATM_CCATM opt_netgraph.h
# DRM options
DRM_DEBUG opt_drm.h
TI_SF_BUF_JUMBO opt_ti.h
TI_JUMBO_HDRSPLIT opt_ti.h
# Misc debug flags. Most of these should probably be replaced with
# 'DEBUG', and then let people recompile just the interesting modules
# with 'make CC="cc -DDEBUG"'.
CLUSTERDEBUG opt_debug_cluster.h
DEBUG_1284 opt_ppb_1284.h
LPT_DEBUG opt_lpt.h
PLIP_DEBUG opt_plip.h
LOCKF_DEBUG opt_debug_lockf.h
SI_DEBUG opt_debug_si.h
IFMEDIA_DEBUG opt_ifmedia.h
# Fb options
FB_DEBUG opt_fb.h
FB_INSTALL_CDEV opt_fb.h
# ppbus related options
PERIPH_1284 opt_ppb_1284.h
DONTPROBE_1284 opt_ppb_1284.h
# smbus related options
ENABLE_ALART opt_intpm.h
# These cause changes all over the kernel
BLKDEV_IOSIZE opt_global.h
BURN_BRIDGES opt_global.h
DEBUG opt_global.h
DEBUG_LOCKS opt_global.h
DEBUG_VFS_LOCKS opt_global.h
DFLTPHYS opt_global.h
DIAGNOSTIC opt_global.h
INVARIANT_SUPPORT opt_global.h
INVARIANTS opt_global.h
KASSERT_PANIC_OPTIONAL opt_global.h
MAXCPU opt_global.h
MAXMEMDOM opt_global.h
-MAXPHYS opt_global.h
+MAXPHYS opt_maxphys.h
MCLSHIFT opt_global.h
MUTEX_NOINLINE opt_global.h
LOCK_PROFILING opt_global.h
LOCK_PROFILING_FAST opt_global.h
MSIZE opt_global.h
REGRESSION opt_global.h
RWLOCK_NOINLINE opt_global.h
SX_NOINLINE opt_global.h
VFS_BIO_DEBUG opt_global.h
# These are VM related options
VM_KMEM_SIZE opt_vm.h
VM_KMEM_SIZE_SCALE opt_vm.h
VM_KMEM_SIZE_MAX opt_vm.h
VM_NRESERVLEVEL opt_vm.h
VM_LEVEL_0_ORDER opt_vm.h
NO_SWAPPING opt_vm.h
MALLOC_MAKE_FAILURES opt_vm.h
MALLOC_PROFILE opt_vm.h
MALLOC_DEBUG_MAXZONES opt_vm.h
# The MemGuard replacement allocator used for tamper-after-free detection
DEBUG_MEMGUARD opt_vm.h
# The RedZone malloc(9) protection
DEBUG_REDZONE opt_vm.h
# Standard SMP options
EARLY_AP_STARTUP opt_global.h
SMP opt_global.h
NUMA opt_global.h
# Size of the kernel message buffer
MSGBUF_SIZE opt_msgbuf.h
# NFS options
NFS_MINATTRTIMO opt_nfs.h
NFS_MAXATTRTIMO opt_nfs.h
NFS_MINDIRATTRTIMO opt_nfs.h
NFS_MAXDIRATTRTIMO opt_nfs.h
NFS_DEBUG opt_nfs.h
# TMPFS options
TMPFS_PAGES_MINRESERVED opt_tmpfs.h
# Options for uart(4)
UART_PPS_ON_CTS opt_uart.h
UART_POLL_FREQ opt_uart.h
UART_DEV_TOLERANCE_PCT opt_uart.h
# options for bus/device framework
BUS_DEBUG opt_bus.h
# options for USB support
USB_DEBUG opt_usb.h
USB_HOST_ALIGN opt_usb.h
USB_REQ_DEBUG opt_usb.h
USB_TEMPLATE opt_usb.h
USB_VERBOSE opt_usb.h
USB_DMA_SINGLE_ALLOC opt_usb.h
USB_EHCI_BIG_ENDIAN_DESC opt_usb.h
U3G_DEBUG opt_u3g.h
UKBD_DFLT_KEYMAP opt_ukbd.h
UPLCOM_INTR_INTERVAL opt_uplcom.h
UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h
UVSCOM_INTR_INTERVAL opt_uvscom.h
# options for the Realtek rtwn driver
RTWN_DEBUG opt_rtwn.h
RTWN_WITHOUT_UCODE opt_rtwn.h
# Embedded system options
INIT_PATH
ROOTDEVNAME
FDC_DEBUG opt_fdc.h
PCFCLOCK_VERBOSE opt_pcfclock.h
PCFCLOCK_MAX_RETRIES opt_pcfclock.h
KTR opt_global.h
KTR_ALQ opt_ktr.h
KTR_MASK opt_ktr.h
KTR_CPUMASK opt_ktr.h
KTR_COMPILE opt_global.h
KTR_BOOT_ENTRIES opt_global.h
KTR_ENTRIES opt_global.h
KTR_VERBOSE opt_ktr.h
WITNESS opt_global.h
WITNESS_KDB opt_witness.h
WITNESS_NO_VNODE opt_witness.h
WITNESS_SKIPSPIN opt_witness.h
WITNESS_COUNT opt_witness.h
OPENSOLARIS_WITNESS opt_global.h
EPOCH_TRACE opt_global.h
# options for ACPI support
ACPI_DEBUG opt_acpi.h
ACPI_MAX_TASKS opt_acpi.h
ACPI_MAX_THREADS opt_acpi.h
DEV_ACPI opt_acpi.h
# options for IOMMU support
IOMMU opt_iommu.h
# ISA support
DEV_ISA opt_isa.h
ISAPNP opt_isa.h
# various 'device presence' options.
DEV_BPF opt_bpf.h
DEV_CARP opt_carp.h
DEV_NETMAP opt_global.h
DEV_PCI opt_pci.h
DEV_PF opt_pf.h
DEV_PFLOG opt_pf.h
DEV_PFSYNC opt_pf.h
DEV_SPLASH opt_splash.h
DEV_VLAN opt_vlan.h
# ed driver
ED_HPP opt_ed.h
ED_3C503 opt_ed.h
ED_SIC opt_ed.h
# bce driver
BCE_DEBUG opt_bce.h
BCE_NVRAM_WRITE_SUPPORT opt_bce.h
SOCKBUF_DEBUG opt_global.h
# options for hifn driver
HIFN_DEBUG opt_hifn.h
HIFN_RNDTEST opt_hifn.h
# options for safenet driver
SAFE_DEBUG opt_safe.h
SAFE_NO_RNG opt_safe.h
SAFE_RNDTEST opt_safe.h
# syscons/vt options
MAXCONS opt_syscons.h
SC_ALT_MOUSE_IMAGE opt_syscons.h
SC_CUT_SPACES2TABS opt_syscons.h
SC_CUT_SEPCHARS opt_syscons.h
SC_DEBUG_LEVEL opt_syscons.h
SC_DFLT_FONT opt_syscons.h
SC_DFLT_TERM opt_syscons.h
SC_DISABLE_KDBKEY opt_syscons.h
SC_DISABLE_REBOOT opt_syscons.h
SC_HISTORY_SIZE opt_syscons.h
SC_KERNEL_CONS_ATTR opt_syscons.h
SC_KERNEL_CONS_ATTRS opt_syscons.h
SC_KERNEL_CONS_REV_ATTR opt_syscons.h
SC_MOUSE_CHAR opt_syscons.h
SC_NO_CUTPASTE opt_syscons.h
SC_NO_FONT_LOADING opt_syscons.h
SC_NO_HISTORY opt_syscons.h
SC_NO_MODE_CHANGE opt_syscons.h
SC_NO_SUSPEND_VTYSWITCH opt_syscons.h
SC_NO_SYSMOUSE opt_syscons.h
SC_NO_TERM_DUMB opt_syscons.h
SC_NO_TERM_SC opt_syscons.h
SC_NO_TERM_TEKEN opt_syscons.h
SC_NORM_ATTR opt_syscons.h
SC_NORM_REV_ATTR opt_syscons.h
SC_PIXEL_MODE opt_syscons.h
SC_RENDER_DEBUG opt_syscons.h
SC_TWOBUTTON_MOUSE opt_syscons.h
VT_ALT_TO_ESC_HACK opt_syscons.h
VT_FB_MAX_WIDTH opt_syscons.h
VT_FB_MAX_HEIGHT opt_syscons.h
VT_MAXWINDOWS opt_syscons.h
VT_TWOBUTTON_MOUSE opt_syscons.h
DEV_SC opt_syscons.h
DEV_VT opt_syscons.h
# teken terminal emulator options
TEKEN_CONS25 opt_teken.h
TEKEN_UTF8 opt_teken.h
TERMINAL_KERN_ATTR opt_teken.h
TERMINAL_NORM_ATTR opt_teken.h
# options for printf
PRINTF_BUFR_SIZE opt_printf.h
BOOT_TAG opt_printf.h
BOOT_TAG_SZ opt_printf.h
# kbd options
KBD_DISABLE_KEYMAP_LOAD opt_kbd.h
KBD_INSTALL_CDEV opt_kbd.h
KBD_MAXRETRY opt_kbd.h
KBD_MAXWAIT opt_kbd.h
KBD_RESETDELAY opt_kbd.h
KBDIO_DEBUG opt_kbd.h
KBDMUX_DFLT_KEYMAP opt_kbdmux.h
# options for the Atheros driver
ATH_DEBUG opt_ath.h
ATH_TXBUF opt_ath.h
ATH_RXBUF opt_ath.h
ATH_DIAGAPI opt_ath.h
ATH_TX99_DIAG opt_ath.h
ATH_ENABLE_11N opt_ath.h
ATH_ENABLE_DFS opt_ath.h
ATH_EEPROM_FIRMWARE opt_ath.h
ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h
ATH_DEBUG_ALQ opt_ath.h
ATH_KTR_INTR_DEBUG opt_ath.h
# options for the Atheros hal
# XXX For now, this breaks non-AR9130 chipsets, so only use it
# XXX when actually targeting AR9130.
AH_SUPPORT_AR9130 opt_ah.h
# This is required for AR933x SoC support
AH_SUPPORT_AR9330 opt_ah.h
AH_SUPPORT_AR9340 opt_ah.h
AH_SUPPORT_QCA9530 opt_ah.h
AH_SUPPORT_QCA9550 opt_ah.h
AH_DEBUG opt_ah.h
AH_ASSERT opt_ah.h
AH_DEBUG_ALQ opt_ah.h
AH_REGOPS_FUNC opt_ah.h
AH_WRITE_REGDOMAIN opt_ah.h
AH_DEBUG_COUNTRY opt_ah.h
AH_WRITE_EEPROM opt_ah.h
AH_PRIVATE_DIAG opt_ah.h
AH_NEED_DESC_SWAP opt_ah.h
AH_USE_INIPDGAIN opt_ah.h
AH_MAXCHAN opt_ah.h
AH_RXCFG_SDMAMW_4BYTES opt_ah.h
AH_INTERRUPT_DEBUGGING opt_ah.h
# AR5416 and later interrupt mitigation
# XXX do not use this for AR9130
AH_AR5416_INTERRUPT_MITIGATION opt_ah.h
# options for the Altera mSGDMA driver (altera_msgdma)
ALTERA_MSGDMA_DESC_STD opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_EXT opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_PF_STD opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_PF_EXT opt_altera_msgdma.h
# options for the Broadcom BCM43xx driver (bwi)
BWI_DEBUG opt_bwi.h
BWI_DEBUG_VERBOSE opt_bwi.h
# options for the Broadcom BCM43xx driver (bwn)
BWN_DEBUG opt_bwn.h
BWN_GPL_PHY opt_bwn.h
BWN_USE_SIBA opt_bwn.h
# Options for the SIBA driver
SIBA_DEBUG opt_siba.h
# options for the Marvell 8335 wireless driver
MALO_DEBUG opt_malo.h
MALO_TXBUF opt_malo.h
MALO_RXBUF opt_malo.h
# options for the Marvell wireless driver
MWL_DEBUG opt_mwl.h
MWL_TXBUF opt_mwl.h
MWL_RXBUF opt_mwl.h
MWL_DIAGAPI opt_mwl.h
MWL_AGGR_SIZE opt_mwl.h
MWL_TX_NODROP opt_mwl.h
# Options for the Marvell NETA driver
MVNETA_MULTIQUEUE opt_mvneta.h
MVNETA_KTR opt_mvneta.h
# Options for the Intel 802.11ac wireless driver
IWM_DEBUG opt_iwm.h
# Options for the Intel 802.11n wireless driver
IWN_DEBUG opt_iwn.h
# Options for the Intel 3945ABG wireless driver
WPI_DEBUG opt_wpi.h
# dcons options
DCONS_BUF_SIZE opt_dcons.h
DCONS_POLL_HZ opt_dcons.h
DCONS_FORCE_CONSOLE opt_dcons.h
DCONS_FORCE_GDB opt_dcons.h
# HWPMC options
HWPMC_DEBUG opt_global.h
HWPMC_HOOKS
HWPMC_MIPS_BACKTRACE opt_hwpmc_hooks.h
# 802.11 support layer
IEEE80211_DEBUG opt_wlan.h
IEEE80211_DEBUG_REFCNT opt_wlan.h
IEEE80211_SUPPORT_MESH opt_wlan.h
IEEE80211_SUPPORT_SUPERG opt_wlan.h
IEEE80211_SUPPORT_TDMA opt_wlan.h
IEEE80211_ALQ opt_wlan.h
IEEE80211_DFS_DEBUG opt_wlan.h
# 802.11 TDMA support
TDMA_SLOTLEN_DEFAULT opt_tdma.h
TDMA_SLOTCNT_DEFAULT opt_tdma.h
TDMA_BINTVAL_DEFAULT opt_tdma.h
TDMA_TXRATE_11B_DEFAULT opt_tdma.h
TDMA_TXRATE_11G_DEFAULT opt_tdma.h
TDMA_TXRATE_11A_DEFAULT opt_tdma.h
TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h
TDMA_TXRATE_HALF_DEFAULT opt_tdma.h
TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h
TDMA_TXRATE_11NA_DEFAULT opt_tdma.h
TDMA_TXRATE_11NG_DEFAULT opt_tdma.h
# VideoMode
PICKMODE_DEBUG opt_videomode.h
# Network stack virtualization options
VIMAGE opt_global.h
VNET_DEBUG opt_global.h
# Common Flash Interface (CFI) options
CFI_SUPPORT_STRATAFLASH opt_cfi.h
CFI_ARMEDANDDANGEROUS opt_cfi.h
CFI_HARDWAREBYTESWAP opt_cfi.h
# Sound options
SND_DEBUG opt_snd.h
SND_DIAGNOSTIC opt_snd.h
SND_FEEDER_MULTIFORMAT opt_snd.h
SND_FEEDER_FULL_MULTIFORMAT opt_snd.h
SND_FEEDER_RATE_HP opt_snd.h
SND_PCM_64 opt_snd.h
SND_OLDSTEREO opt_snd.h
X86BIOS
# Flattened device tree options
FDT opt_platform.h
FDT_DTB_STATIC opt_platform.h
# OFED Infiniband stack
OFED opt_ofed.h
OFED_DEBUG_INIT opt_ofed.h
SDP opt_ofed.h
SDP_DEBUG opt_ofed.h
IPOIB opt_ofed.h
IPOIB_DEBUG opt_ofed.h
IPOIB_CM opt_ofed.h
# Resource Accounting
RACCT opt_global.h
RACCT_DEFAULT_TO_DISABLED opt_global.h
# Resource Limits
RCTL opt_global.h
# Random number generator(s)
# Alternative RNG algorithm.
RANDOM_FENESTRASX opt_global.h
# With this, no entropy processor is loaded, but the entropy
# harvesting infrastructure is present. This means an entropy
# processor may be loaded as a module.
RANDOM_LOADABLE opt_global.h
# This turns on high-rate and potentially expensive harvesting in
# the uma slab allocator.
RANDOM_ENABLE_UMA opt_global.h
RANDOM_ENABLE_ETHER opt_global.h
# This option turns the TPM into an entropy source.
TPM_HARVEST opt_tpm.h
# BHND(4) driver
BHND_LOGLEVEL opt_global.h
# GPIO and child devices
GPIO_SPI_DEBUG opt_gpio.h
# SPI devices
SPIGEN_LEGACY_CDEVNAME opt_spi.h
# etherswitch(4) driver
RTL8366_SOFT_RESET opt_etherswitch.h
# evdev protocol support
EVDEV_SUPPORT opt_evdev.h
EVDEV_DEBUG opt_evdev.h
UINPUT_DEBUG opt_evdev.h
# Hyper-V network driver
HN_DEBUG opt_hn.h
# CAM-based MMC stack
MMCCAM
# Encrypted kernel crash dumps
EKCD opt_ekcd.h
# NVME options
NVME_USE_NVD opt_nvme.h
# amdsbwd options
AMDSBWD_DEBUG opt_amdsbwd.h
# gcov support
GCOV opt_global.h
LINDEBUGFS
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
index bf06f69192d9..f042eff7cd2e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -1,1206 +1,1206 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
*/
#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/file.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_os.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <geom/geom_int.h>
#ifndef g_topology_locked
#define g_topology_locked() sx_xlocked(&topology_lock)
#endif
/*
* Virtual device vector for GEOM.
*/
static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
.name = "ZFS::VDEV",
.version = G_VERSION,
.attrchanged = vdev_geom_attrchanged,
};
struct consumer_vdev_elem {
SLIST_ENTRY(consumer_vdev_elem) elems;
vdev_t *vd;
};
SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
/* BEGIN CSTYLED */
_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
== sizeof (struct consumer_priv_t*),
"consumer_priv_t* can't be stored in g_consumer.private");
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
/* END CSTYLED */
/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
/*
* Thread local storage used to indicate when a thread is probing geoms
* for their guids. If NULL, this thread is not tasting geoms. If non NULL,
* it is looking for a replacement for the vdev_t* that is its value.
*/
uint_t zfs_geom_probe_vdev_key;
static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
boolean_t do_null_update)
{
boolean_t needs_update = B_FALSE;
char *physpath;
int error, physpath_len;
physpath_len = MAXPATHLEN;
physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
if (error == 0) {
char *old_physpath;
/* g_topology lock ensures that vdev has not been closed */
g_topology_assert();
old_physpath = vd->vdev_physpath;
vd->vdev_physpath = spa_strdup(physpath);
if (old_physpath != NULL) {
needs_update = (strcmp(old_physpath,
vd->vdev_physpath) != 0);
spa_strfree(old_physpath);
} else
needs_update = do_null_update;
}
g_free(physpath);
/*
* If the physical path changed, update the config.
* Only request an update for previously unset physpaths if
* requested by the caller.
*/
if (needs_update)
spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
priv = (struct consumer_priv_t *)&cp->private;
if (SLIST_EMPTY(priv))
return;
SLIST_FOREACH(elem, priv, elems) {
vdev_t *vd = elem->vd;
if (strcmp(attr, "GEOM::physpath") == 0) {
vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
return;
}
}
}
static void
vdev_geom_resize(struct g_consumer *cp)
{
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
spa_t *spa;
vdev_t *vd;
priv = (struct consumer_priv_t *)&cp->private;
if (SLIST_EMPTY(priv))
return;
SLIST_FOREACH(elem, priv, elems) {
vd = elem->vd;
if (vd->vdev_state != VDEV_STATE_HEALTHY)
continue;
spa = vd->vdev_spa;
if (!spa->spa_autoexpand)
continue;
vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
}
}
static void
vdev_geom_orphan(struct g_consumer *cp)
{
struct consumer_priv_t *priv;
// cppcheck-suppress uninitvar
struct consumer_vdev_elem *elem;
g_topology_assert();
priv = (struct consumer_priv_t *)&cp->private;
if (SLIST_EMPTY(priv))
/* Vdev close in progress. Ignore the event. */
return;
/*
* Orphan callbacks occur from the GEOM event thread.
* Concurrent with this call, new I/O requests may be
* working their way through GEOM about to find out
* (only once executed by the g_down thread) that we've
* been orphaned from our disk provider. These I/Os
* must be retired before we can detach our consumer.
* This is most easily achieved by acquiring the
* SPA ZIO configuration lock as a writer, but doing
* so with the GEOM topology lock held would cause
* a lock order reversal. Instead, rely on the SPA's
* async removal support to invoke a close on this
* vdev once it is safe to do so.
*/
// cppcheck-suppress All
SLIST_FOREACH(elem, priv, elems) {
// cppcheck-suppress uninitvar
vdev_t *vd = elem->vd;
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}
}
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
struct g_geom *gp;
struct g_consumer *cp;
int error;
g_topology_assert();
ZFS_LOG(1, "Attaching to %s.", pp->name);
if (sanity) {
if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
ZFS_LOG(1, "Failing attach of %s. "
"Incompatible sectorsize %d\n",
pp->name, pp->sectorsize);
return (NULL);
} else if (pp->mediasize < SPA_MINDEVSIZE) {
ZFS_LOG(1, "Failing attach of %s. "
"Incompatible mediasize %ju\n",
pp->name, pp->mediasize);
return (NULL);
}
}
/* Do we have geom already? No? Create one. */
LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
if (strcmp(gp->name, "zfs::vdev") != 0)
continue;
break;
}
if (gp == NULL) {
gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
gp->orphan = vdev_geom_orphan;
gp->attrchanged = vdev_geom_attrchanged;
gp->resize = vdev_geom_resize;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
__LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
__LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
} else {
/* Check if we are already connected to this provider. */
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (cp->provider == pp) {
ZFS_LOG(1, "Found consumer for %s.", pp->name);
break;
}
}
if (cp == NULL) {
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
__func__, __LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
__func__, __LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created consumer for %s.", pp->name);
} else {
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
__func__, __LINE__, error);
return (NULL);
}
ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
}
}
if (vd != NULL)
vd->vdev_tsd = cp;
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
return (cp);
}
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
struct g_geom *gp;
g_topology_assert();
ZFS_LOG(1, "Detaching from %s.",
cp->provider && cp->provider->name ? cp->provider->name : "NULL");
gp = cp->geom;
if (open_for_read)
g_access(cp, -1, 0, -1);
/* Destroy consumer on last close. */
if (cp->acr == 0 && cp->ace == 0) {
if (cp->acw > 0)
g_access(cp, 0, -cp->acw, 0);
if (cp->provider != NULL) {
ZFS_LOG(1, "Destroying consumer for %s.",
cp->provider->name ? cp->provider->name : "NULL");
g_detach(cp);
}
g_destroy_consumer(cp);
}
/* Destroy geom if there are no consumers left. */
if (LIST_EMPTY(&gp->consumer)) {
ZFS_LOG(1, "Destroyed geom %s.", gp->name);
g_wither_geom(gp, ENXIO);
}
}
static void
vdev_geom_close_locked(vdev_t *vd)
{
struct g_consumer *cp;
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem, *elem_temp;
g_topology_assert();
cp = vd->vdev_tsd;
vd->vdev_delayed_close = B_FALSE;
if (cp == NULL)
return;
ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
priv = (struct consumer_priv_t *)&cp->private;
vd->vdev_tsd = NULL;
SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
if (elem->vd == vd) {
SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
g_free(elem);
}
}
vdev_geom_detach(cp, B_TRUE);
}
/*
* Issue one or more bios to the vdev in parallel.
* cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO
* operation is described by parallel entries from each array. There may be
* more bios actually issued than entries in the arrays.
*/
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
off_t *sizes, int *errors, int ncmds)
{
struct bio **bios;
uint8_t *p;
off_t off, maxio, s, end;
int i, n_bios, j;
size_t bios_size;
- maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+ maxio = maxphys - (maxphys % cp->provider->sectorsize);
n_bios = 0;
/* How many bios are required for all commands ? */
for (i = 0; i < ncmds; i++)
n_bios += (sizes[i] + maxio - 1) / maxio;
/* Allocate memory for the bios */
bios_size = n_bios * sizeof (struct bio *);
bios = kmem_zalloc(bios_size, KM_SLEEP);
/* Prepare and issue all of the bios */
for (i = j = 0; i < ncmds; i++) {
off = offsets[i];
p = datas[i];
s = sizes[i];
end = off + s;
ASSERT((off % cp->provider->sectorsize) == 0);
ASSERT((s % cp->provider->sectorsize) == 0);
for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
bios[j] = g_alloc_bio();
bios[j]->bio_cmd = cmds[i];
bios[j]->bio_done = NULL;
bios[j]->bio_offset = off;
bios[j]->bio_length = MIN(s, maxio);
bios[j]->bio_data = (caddr_t)p;
g_io_request(bios[j], cp);
}
}
ASSERT(j == n_bios);
/* Wait for all of the bios to complete, and clean them up */
for (i = j = 0; i < ncmds; i++) {
off = offsets[i];
s = sizes[i];
end = off + s;
for (; off < end; off += maxio, s -= maxio, j++) {
errors[i] = biowait(bios[j], "vdev_geom_io") ||
errors[i];
g_destroy_bio(bios[j]);
}
}
kmem_free(bios, bios_size);
}
/*
* Read the vdev config from a device. Return the number of valid labels that
* were found. The vdev config will be returned in config if and only if at
* least one valid label was found.
*/
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
struct g_provider *pp;
nvlist_t *config;
vdev_phys_t *vdev_lists[VDEV_LABELS];
char *buf;
size_t buflen;
uint64_t psize, state, txg;
off_t offsets[VDEV_LABELS];
off_t size;
off_t sizes[VDEV_LABELS];
int cmds[VDEV_LABELS];
int errors[VDEV_LABELS];
int l, nlabels;
g_topology_assert_not();
pp = cp->provider;
ZFS_LOG(1, "Reading config from %s...", pp->name);
psize = pp->mediasize;
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
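/* Round the per-label read size up to a whole multiple of the sector size. */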
size = sizeof (*vdev_lists[0]) + pp->sectorsize -
((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
buflen = sizeof (vdev_lists[0]->vp_nvlist);
/* Create all of the IO requests */
for (l = 0; l < VDEV_LABELS; l++) {
cmds[l] = BIO_READ;
vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
sizes[l] = size;
errors[l] = 0;
ASSERT(offsets[l] % pp->sectorsize == 0);
}
/* Issue the IO requests */
vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
VDEV_LABELS);
/* Parse the labels */
config = *configp = NULL;
nlabels = 0;
for (l = 0; l < VDEV_LABELS; l++) {
if (errors[l] != 0)
continue;
buf = vdev_lists[l]->vp_nvlist;
if (nvlist_unpack(buf, buflen, &config, 0) != 0)
continue;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 || state > POOL_STATE_L2CACHE) {
nvlist_free(config);
continue;
}
if (state != POOL_STATE_SPARE &&
state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0)) {
nvlist_free(config);
continue;
}
if (*configp != NULL)
nvlist_free(*configp);
*configp = config;
nlabels++;
}
/* Free the label storage */
for (l = 0; l < VDEV_LABELS; l++)
kmem_free(vdev_lists[l], size);
return (nlabels);
}
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
nvlist_t **new_configs;
uint64_t i;
if (id < *count)
return;
new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
KM_SLEEP);
for (i = 0; i < *count; i++)
new_configs[i] = (*configs)[i];
if (*configs != NULL)
kmem_free(*configs, *count * sizeof (void *));
*configs = new_configs;
*count = id + 1;
}
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
const char *name, uint64_t *known_pool_guid)
{
nvlist_t *vdev_tree;
uint64_t pool_guid;
uint64_t vdev_guid;
uint64_t id, txg, known_txg;
char *pname;
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
strcmp(pname, name) != 0)
goto ignore;
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
goto ignore;
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
goto ignore;
if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
goto ignore;
if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
goto ignore;
VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
if (*known_pool_guid != 0) {
if (pool_guid != *known_pool_guid)
goto ignore;
} else
*known_pool_guid = pool_guid;
resize_configs(configs, count, id);
if ((*configs)[id] != NULL) {
VERIFY(nvlist_lookup_uint64((*configs)[id],
ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
if (txg <= known_txg)
goto ignore;
nvlist_free((*configs)[id]);
}
(*configs)[id] = cfg;
return;
ignore:
nvlist_free(cfg);
}
int
vdev_geom_read_pool_label(const char *name,
nvlist_t ***configs, uint64_t *count)
{
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *zcp;
nvlist_t *vdev_cfg;
uint64_t pool_guid;
int nlabels;
DROP_GIANT();
g_topology_lock();
*configs = NULL;
*count = 0;
pool_guid = 0;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
LIST_FOREACH(gp, &mp->geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
LIST_FOREACH(pp, &gp->provider, provider) {
if (pp->flags & G_PF_WITHER)
continue;
zcp = vdev_geom_attach(pp, NULL, B_TRUE);
if (zcp == NULL)
continue;
g_topology_unlock();
nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
g_topology_lock();
vdev_geom_detach(zcp, B_TRUE);
if (nlabels == 0)
continue;
ZFS_LOG(1, "successfully read vdev config");
process_vdev_config(configs, count,
vdev_cfg, name, &pool_guid);
}
}
}
g_topology_unlock();
PICKUP_GIANT();
return (*count > 0 ? 0 : ENOENT);
}
enum match {
NO_MATCH = 0, /* No matching labels found */
TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
ZERO_MATCH = 1, /* Should never be returned */
ONE_MATCH = 2, /* 1 label matching the vdev_guid */
TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
FULL_MATCH = 5 /* all labels match the vdev_guid */
};
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
nvlist_t *config;
uint64_t pool_guid, top_guid, vdev_guid;
struct g_consumer *cp;
int nlabels;
cp = vdev_geom_attach(pp, NULL, B_TRUE);
if (cp == NULL) {
ZFS_LOG(1, "Unable to attach tasting instance to %s.",
pp->name);
return (NO_MATCH);
}
g_topology_unlock();
nlabels = vdev_geom_read_config(cp, &config);
g_topology_lock();
vdev_geom_detach(cp, B_TRUE);
if (nlabels == 0) {
ZFS_LOG(1, "Unable to read config from %s.", pp->name);
return (NO_MATCH);
}
pool_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
top_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
vdev_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
nvlist_free(config);
/*
* Check that the label's pool guid matches the desired guid.
* Inactive spares and L2ARCs do not have any pool guid in the label.
*/
if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
pp->name,
(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
return (NO_MATCH);
}
/*
* Check that the label's vdev guid matches the desired guid.
* The second condition handles possible race on vdev detach, when
* remaining vdev receives GUID of destroyed top level mirror vdev.
*/
if (vdev_guid == vd->vdev_guid) {
ZFS_LOG(1, "guids match for provider %s.", pp->name);
return (ZERO_MATCH + nlabels);
} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
return (TOPGUID_MATCH);
}
ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
return (NO_MATCH);
}
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp, *best_pp;
struct g_consumer *cp;
const char *vdpath;
enum match match, best_match;
g_topology_assert();
vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
cp = NULL;
best_pp = NULL;
best_match = NO_MATCH;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
LIST_FOREACH(gp, &mp->geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
LIST_FOREACH(pp, &gp->provider, provider) {
match = vdev_attach_ok(vd, pp);
if (match > best_match) {
best_match = match;
best_pp = pp;
} else if (match == best_match) {
if (strcmp(pp->name, vdpath) == 0) {
best_pp = pp;
}
}
if (match == FULL_MATCH)
goto out;
}
}
}
out:
if (best_pp) {
cp = vdev_geom_attach(best_pp, vd, B_TRUE);
if (cp == NULL) {
printf("ZFS WARNING: Unable to attach to %s.\n",
best_pp->name);
}
}
return (cp);
}
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
struct g_consumer *cp;
char *buf;
size_t len;
g_topology_assert();
ZFS_LOG(1, "Searching by guids [%ju:%ju].",
(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
cp = vdev_geom_attach_by_guids(vd);
if (cp != NULL) {
len = strlen(cp->provider->name) + strlen("/dev/") + 1;
buf = kmem_alloc(len, KM_SLEEP);
snprintf(buf, len, "/dev/%s", cp->provider->name);
spa_strfree(vd->vdev_path);
vd->vdev_path = buf;
ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
(uintmax_t)spa_guid(vd->vdev_spa),
(uintmax_t)vd->vdev_guid, cp->provider->name);
} else {
ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
(uintmax_t)spa_guid(vd->vdev_spa),
(uintmax_t)vd->vdev_guid);
}
return (cp);
}
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
struct g_provider *pp;
struct g_consumer *cp;
g_topology_assert();
cp = NULL;
pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
cp = vdev_geom_attach(pp, vd, B_FALSE);
}
return (cp);
}
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
int error, has_trim;
uint16_t rate;
/*
* Set the TLS to indicate downstack that we
* should not access zvols
*/
VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
/*
* We must have a pathname, and it must be absolute.
*/
if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (EINVAL);
}
/*
* Reopen the device if it's not currently open. Otherwise,
* just update the physical size of the device.
*/
if ((cp = vd->vdev_tsd) != NULL) {
ASSERT(vd->vdev_reopening);
goto skip_open;
}
DROP_GIANT();
g_topology_lock();
error = 0;
if (vd->vdev_spa->spa_is_splitting ||
((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
(vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
/*
* We are dealing with a vdev that hasn't been previously
* opened (since boot), and we are not loading an
* existing pool configuration. This looks like a
* vdev add operation to a new or existing pool.
* Assume the user really wants to do this, and find
* GEOM provider by its name, ignoring GUID mismatches.
*
* XXPOLICY: It would be safer to only allow a device
* that is unlabeled or labeled but missing
* GUID information to be opened in this fashion,
* unless we are doing a split, in which case we
* should allow any guid.
*/
cp = vdev_geom_open_by_path(vd, 0);
} else {
/*
* Try using the recorded path for this device, but only
* accept it if its label data contains the expected GUIDs.
*/
cp = vdev_geom_open_by_path(vd, 1);
if (cp == NULL) {
/*
* The device at vd->vdev_path doesn't have the
* expected GUIDs. The disks might have merely
* moved around so try all other GEOM providers
* to find one with the right GUIDs.
*/
cp = vdev_geom_open_by_guids(vd);
}
}
/* Clear the TLS now that tasting is done */
VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
if (cp == NULL) {
ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
error = ENOENT;
} else {
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
int spamode;
priv = (struct consumer_priv_t *)&cp->private;
if (cp->private == NULL)
SLIST_INIT(priv);
elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
elem->vd = vd;
SLIST_INSERT_HEAD(priv, elem, elems);
spamode = spa_mode(vd->vdev_spa);
if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
!ISP2(cp->provider->sectorsize)) {
ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
cp->provider->name);
vdev_geom_close_locked(vd);
error = EINVAL;
cp = NULL;
} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
int i;
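/*
 * Retry briefly (up to 5 attempts, ~0.5 s apart) to obtain write
 * access; the provider may only be busy transiently.
 */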
for (i = 0; i < 5; i++) {
error = g_access(cp, 0, 1, 0);
if (error == 0)
break;
g_topology_unlock();
tsleep(vd, 0, "vdev", hz / 2);
g_topology_lock();
}
if (error != 0) {
printf("ZFS WARNING: Unable to open %s for "
"writing (error=%d).\n",
cp->provider->name, error);
vdev_geom_close_locked(vd);
cp = NULL;
}
}
}
/* Fetch initial physical path information for this device. */
if (cp != NULL) {
vdev_geom_attrchanged(cp, "GEOM::physpath");
/* Set other GEOM characteristics */
vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
}
g_topology_unlock();
PICKUP_GIANT();
if (cp == NULL) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
error);
return (error);
}
skip_open:
pp = cp->provider;
/*
* Determine the actual size of the device.
*/
*max_psize = *psize = pp->mediasize;
/*
* Determine the device's minimum transfer size and preferred
* transfer size.
*/
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
*physical_ashift = 0;
if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
pp->stripeoffset == 0)
*physical_ashift = highbit(pp->stripesize) - 1;
/*
* Clear the nowritecache settings, so that on a vdev_reopen()
* we will try again.
*/
vd->vdev_nowritecache = B_FALSE;
/* Inform the ZIO pipeline that we are non-rotational. */
error = g_getattr("GEOM::rotation_rate", cp, &rate);
if (error == 0 && rate == DISK_RR_NON_ROTATING)
vd->vdev_nonrot = B_TRUE;
else
vd->vdev_nonrot = B_FALSE;
/* Set when device reports it supports TRIM. */
error = g_getattr("GEOM::candelete", cp, &has_trim);
vd->vdev_has_trim = (error == 0 && has_trim);
/* Set when device reports it supports secure TRIM. */
/* unavailable on FreeBSD */
vd->vdev_has_securetrim = B_FALSE;
return (0);
}
static void
vdev_geom_close(vdev_t *vd)
{
struct g_consumer *cp;
boolean_t locked;
cp = vd->vdev_tsd;
DROP_GIANT();
locked = g_topology_locked();
if (!locked)
g_topology_lock();
if (!vd->vdev_reopening ||
(cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
(cp->provider != NULL && cp->provider->error != 0))))
vdev_geom_close_locked(vd);
if (!locked)
g_topology_unlock();
PICKUP_GIANT();
}
static void
vdev_geom_io_intr(struct bio *bp)
{
vdev_t *vd;
zio_t *zio;
zio = bp->bio_caller1;
vd = zio->io_vd;
zio->io_error = bp->bio_error;
if (zio->io_error == 0 && bp->bio_resid != 0)
zio->io_error = SET_ERROR(EIO);
switch (zio->io_error) {
case ENOTSUP:
/*
* If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
* that future attempts will never succeed. In this case
* we set a persistent flag so that we don't bother with
* requests in the future.
*/
switch (bp->bio_cmd) {
case BIO_FLUSH:
vd->vdev_nowritecache = B_TRUE;
break;
case BIO_DELETE:
break;
}
break;
case ENXIO:
if (!vd->vdev_remove_wanted) {
/*
* If provider's error is set we assume it is being
* removed.
*/
if (bp->bio_to->error != 0) {
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa,
SPA_ASYNC_REMOVE);
} else if (!vd->vdev_delayed_close) {
vd->vdev_delayed_close = B_TRUE;
}
}
break;
}
/*
* We have to split bio freeing into two parts, because the ABD code
* cannot be called in this context and vdev_op_io_done is not called
* for ZIO_TYPE_IOCTL zio-s.
*/
if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
g_destroy_bio(bp);
zio->io_bio = NULL;
}
zio_delay_interrupt(zio);
}
static void
vdev_geom_io_start(zio_t *zio)
{
vdev_t *vd;
struct g_consumer *cp;
struct bio *bp;
vd = zio->io_vd;
switch (zio->io_type) {
case ZIO_TYPE_IOCTL:
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
} else {
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zfs_nocacheflush ||
vdev_geom_bio_flush_disable)
break;
if (vd->vdev_nowritecache) {
zio->io_error = SET_ERROR(ENOTSUP);
break;
}
goto sendreq;
default:
zio->io_error = SET_ERROR(ENOTSUP);
}
}
zio_execute(zio);
return;
case ZIO_TYPE_TRIM:
if (!vdev_geom_bio_delete_disable) {
goto sendreq;
}
zio_execute(zio);
return;
default:
;
/* PASSTHROUGH --- placate compiler */
}
sendreq:
ASSERT(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM ||
zio->io_type == ZIO_TYPE_IOCTL);
cp = vd->vdev_tsd;
if (cp == NULL) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
bp = g_alloc_bio();
bp->bio_caller1 = zio;
switch (zio->io_type) {
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
if (zio->io_type == ZIO_TYPE_READ) {
bp->bio_cmd = BIO_READ;
bp->bio_data =
abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
bp->bio_cmd = BIO_WRITE;
bp->bio_data =
abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
break;
case ZIO_TYPE_TRIM:
bp->bio_cmd = BIO_DELETE;
bp->bio_data = NULL;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
case ZIO_TYPE_IOCTL:
bp->bio_cmd = BIO_FLUSH;
bp->bio_flags |= BIO_ORDERED;
bp->bio_data = NULL;
bp->bio_offset = cp->provider->mediasize;
bp->bio_length = 0;
break;
default:
panic("invalid zio->io_type: %d\n", zio->io_type);
}
bp->bio_done = vdev_geom_io_intr;
zio->io_bio = bp;
g_io_request(bp, cp);
}
static void
vdev_geom_io_done(zio_t *zio)
{
struct bio *bp = zio->io_bio;
if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
ASSERT(bp == NULL);
return;
}
if (bp == NULL) {
ASSERT3S(zio->io_error, ==, ENXIO);
return;
}
if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
else
abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
g_destroy_bio(bp);
zio->io_bio = NULL;
}
static void
vdev_geom_hold(vdev_t *vd)
{
}
static void
vdev_geom_rele(vdev_t *vd)
{
}
vdev_ops_t vdev_disk_ops = {
vdev_geom_open,
vdev_geom_close,
vdev_default_asize,
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
NULL,
vdev_geom_hold,
vdev_geom_rele,
NULL,
vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 8163fc9df2d4..092eb34eaa47 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -1,1475 +1,1475 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Portions Copyright 2010 Robert Milkowski
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
/*
* ZFS volume emulation driver.
*
* Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
* Volumes are accessed through the symbolic links named:
*
* /dev/zvol/<pool_name>/<dataset_name>
*
* Volumes are persistent through reboot. No user command needs to be
* run before opening and using a device.
*
* On FreeBSD ZVOLs are simply GEOM providers like any other storage device
* in the system. Except when they're simply character devices (volmode=dev).
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
#include <sys/filio.h>
#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include "zfs_namecheck.h"
#define ZVOL_DUMPSIZE "dumpsize"
#ifdef ZVOL_LOCK_DEBUG
#define ZVOL_RW_READER RW_WRITER
#define ZVOL_RW_READ_HELD RW_WRITE_HELD
#else
#define ZVOL_RW_READER RW_READER
#define ZVOL_RW_READ_HELD RW_READ_HELD
#endif
enum zvol_geom_state {
ZVOL_GEOM_UNINIT,
ZVOL_GEOM_STOPPED,
ZVOL_GEOM_RUNNING,
};
struct zvol_state_os {
int zso_volmode;
#define zso_dev _zso_state._zso_dev
#define zso_geom _zso_state._zso_geom
union {
/* volmode=dev */
struct zvol_state_dev {
struct cdev *zsd_cdev;
uint64_t zsd_sync_cnt;
} _zso_dev;
/* volmode=geom */
struct zvol_state_geom {
struct g_provider *zsg_provider;
struct bio_queue_head zsg_queue;
struct mtx zsg_queue_mtx;
enum zvol_geom_state zsg_state;
} _zso_geom;
} _zso_state;
};
static uint32_t zvol_minors;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
"Allow zpools to use zvols as vdevs (DANGEROUS)");
/*
* Toggle unmap functionality.
*/
boolean_t zvol_unmap_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
/*
* zvol maximum transfer in one DMU tx.
*/
int zvol_maxphys = DMU_MAX_ACCESS / 2;
static void zvol_ensure_zilog(zvol_state_t *zv);
static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;
static struct cdevsw zvol_cdevsw = {
.d_name = "zvol",
.d_version = D_VERSION,
.d_flags = D_DISK | D_TRACKCLOSE,
.d_open = zvol_cdev_open,
.d_close = zvol_cdev_close,
.d_ioctl = zvol_cdev_ioctl,
.d_read = zvol_cdev_read,
.d_write = zvol_cdev_write,
.d_strategy = zvol_geom_bio_strategy,
};
extern uint_t zfs_geom_probe_vdev_key;
struct g_class zfs_zvol_class = {
.name = "ZFS::ZVOL",
.version = G_VERSION,
};
DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
/*
* GEOM mode implementation
*/
/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
int err = 0;
boolean_t drop_suspend = B_TRUE;
boolean_t drop_namespace = B_FALSE;
if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
/*
* if zfs_geom_probe_vdev_key is set, that means that zfs is
* attempting to probe geom providers while looking for a
* replacement for a missing VDEV. In this case, the
* spa_namespace_lock will not be held, but it is still illegal
* to use a zvol as a vdev. Deadlocks can result if another
* thread has spa_namespace_lock
*/
return (SET_ERROR(EOPNOTSUPP));
}
retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = pp->private;
if (zv == NULL) {
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
rw_exit(&zvol_state_lock);
return (SET_ERROR(ENXIO));
}
if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
/*
* We need to guarantee that the namespace lock is held
* to avoid spurious failures in zvol_first_open
*/
drop_namespace = B_TRUE;
if (!mutex_tryenter(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock);
mutex_enter(&spa_namespace_lock);
goto retry;
}
}
mutex_enter(&zv->zv_state_lock);
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
/*
* make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
}
}
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
err = zvol_first_open(zv, !(flag & FWRITE));
if (err)
goto out_mutex;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
}
/*
* Check for a bad on-disk format version now since we
* lied about owning the dataset readonly before.
*/
if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
err = EROFS;
goto out_open_count;
}
if (zv->zv_flags & ZVOL_EXCL) {
err = EBUSY;
goto out_open_count;
}
#ifdef FEXCL
if (flag & FEXCL) {
if (zv->zv_open_count != 0) {
err = EBUSY;
goto out_open_count;
}
zv->zv_flags |= ZVOL_EXCL;
}
#endif
zv->zv_open_count += count;
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (0);
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
out_mutex:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(err));
}
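
Both open paths in this file follow the same locking discipline: the canonical order is zv_suspend_lock before zv_state_lock, so when the state lock is already held the code try-locks the suspend lock and, on failure, backs off, reacquires both locks in the canonical order, and revalidates the open count before deciding whether it still needs the suspend lock. A minimal userland sketch of that pattern, using pthreads and hypothetical names (suspend_lock, state_lock, open_count), not the driver's actual code:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER; /* outer lock */
static pthread_mutex_t  state_lock   = PTHREAD_MUTEX_INITIALIZER;  /* inner lock */
static int open_count;

/* First-open path: take the outer lock without violating lock order. */
void
first_open(void)
{
	bool drop_suspend = true;

	pthread_mutex_lock(&state_lock);
	if (open_count == 0) {
		/* Try the outer lock while still holding the inner one. */
		if (pthread_rwlock_tryrdlock(&suspend_lock) != 0) {
			/* Back off and reacquire in the canonical order. */
			pthread_mutex_unlock(&state_lock);
			pthread_rwlock_rdlock(&suspend_lock);
			pthread_mutex_lock(&state_lock);
			/* Revalidate: another thread may have opened meanwhile. */
			if (open_count != 0) {
				pthread_rwlock_unlock(&suspend_lock);
				drop_suspend = false;
			}
		}
	} else {
		drop_suspend = false;
	}

	open_count++;
	pthread_mutex_unlock(&state_lock);
	if (drop_suspend)
		pthread_rwlock_unlock(&suspend_lock);
}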
/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = pp->private;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
ASSERT(zv->zv_open_count == 1);
zv->zv_flags &= ~ZVOL_EXCL;
}
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
/*
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
ASSERT(zv->zv_open_count > 0);
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if ((zv->zv_open_count - count) == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
}
}
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/*
* You may get multiple opens, but only one close.
*/
zv->zv_open_count -= count;
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
zvol_last_close(zv);
}
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (0);
}
static void
zvol_geom_run(zvol_state_t *zv)
{
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
g_error_provider(pp, 0);
kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
"zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}
static void
zvol_geom_destroy(zvol_state_t *zv)
{
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
g_topology_assert();
mutex_enter(&zv->zv_state_lock);
VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
mutex_exit(&zv->zv_state_lock);
zsg->zsg_provider = NULL;
pp->private = NULL;
g_wither_geom(pp->geom, ENXIO);
}
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
int count, error, flags;
g_topology_assert();
/*
* To make it easier we expect either open or close, but not both
* at the same time.
*/
KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
(acr <= 0 && acw <= 0 && ace <= 0),
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
pp->name, acr, acw, ace));
if (pp->private == NULL) {
if (acr <= 0 && acw <= 0 && ace <= 0)
return (0);
return (pp->error);
}
/*
* We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
* ace != 0, because GEOM already handles that and handles it a bit
* differently. GEOM allows for multiple read/exclusive consumers and
* ZFS allows only one exclusive consumer, no matter if it is reader or
* writer. I like better the way GEOM works so I'll leave it for GEOM
* to decide what to do.
*/
count = acr + acw + ace;
if (count == 0)
return (0);
flags = 0;
if (acr != 0 || ace != 0)
flags |= FREAD;
if (acw != 0)
flags |= FWRITE;
g_topology_unlock();
if (count > 0)
error = zvol_geom_open(pp, flags, count);
else
error = zvol_geom_close(pp, flags, -count);
g_topology_lock();
return (error);
}
static void
zvol_geom_worker(void *arg)
{
zvol_state_t *zv = arg;
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct bio *bp;
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
for (;;) {
mtx_lock(&zsg->zsg_queue_mtx);
bp = bioq_takefirst(&zsg->zsg_queue);
if (bp == NULL) {
if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
zsg->zsg_state = ZVOL_GEOM_RUNNING;
wakeup(&zsg->zsg_state);
mtx_unlock(&zsg->zsg_queue_mtx);
kthread_exit();
}
msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
PRIBIO | PDROP, "zvol:io", 0);
continue;
}
mtx_unlock(&zsg->zsg_queue_mtx);
zvol_geom_bio_strategy(bp);
}
}
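
zvol_geom_worker() is a standard queue-consumer kthread: sleep while the queue is empty, exit when told to stop, otherwise pop one request and service it. A rough userland analogue with pthreads; struct node, worker and enqueue are hypothetical names, and unlike the driver (which uses a bioq tail queue with msleep()/wakeup_one()) this sketch keeps a simple LIFO list and a condition variable:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cv  = PTHREAD_COND_INITIALIZER;
static struct node *q_head;
static int q_stopping;

static void *
worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&q_mtx);
		while (q_head == NULL && !q_stopping)
			pthread_cond_wait(&q_cv, &q_mtx);	/* like msleep() */
		if (q_head == NULL && q_stopping) {
			pthread_mutex_unlock(&q_mtx);
			return (NULL);				/* like kthread_exit() */
		}
		struct node *n = q_head;			/* take one request */
		q_head = n->next;
		pthread_mutex_unlock(&q_mtx);
		free(n);					/* "service" the request */
	}
}

static void
enqueue(struct node *n)
{
	pthread_mutex_lock(&q_mtx);
	n->next = q_head;					/* push onto the list */
	q_head = n;
	pthread_cond_signal(&q_cv);				/* like wakeup_one() */
	pthread_mutex_unlock(&q_mtx);
}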
static void
zvol_geom_bio_start(struct bio *bp)
{
zvol_state_t *zv = bp->bio_to->private;
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
boolean_t first;
if (bp->bio_cmd == BIO_GETATTR) {
if (zvol_geom_bio_getattr(bp))
g_io_deliver(bp, EOPNOTSUPP);
return;
}
if (!THREAD_CAN_SLEEP()) {
mtx_lock(&zsg->zsg_queue_mtx);
first = (bioq_first(&zsg->zsg_queue) == NULL);
bioq_insert_tail(&zsg->zsg_queue, bp);
mtx_unlock(&zsg->zsg_queue_mtx);
if (first)
wakeup_one(&zsg->zsg_queue);
return;
}
zvol_geom_bio_strategy(bp);
}
static int
zvol_geom_bio_getattr(struct bio *bp)
{
zvol_state_t *zv;
zv = bp->bio_to->private;
ASSERT(zv != NULL);
spa_t *spa = dmu_objset_spa(zv->zv_objset);
uint64_t refd, avail, usedobjs, availobjs;
if (g_handleattr_int(bp, "GEOM::candelete", 1))
return (0);
if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
dmu_objset_space(zv->zv_objset, &refd, &avail,
&usedobjs, &availobjs);
if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
return (0);
} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
dmu_objset_space(zv->zv_objset, &refd, &avail,
&usedobjs, &availobjs);
if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
return (0);
} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
avail = metaslab_class_get_space(spa_normal_class(spa));
avail -= metaslab_class_get_alloc(spa_normal_class(spa));
if (g_handleattr_off_t(bp, "poolblocksavail",
avail / DEV_BSIZE))
return (0);
} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
refd = metaslab_class_get_alloc(spa_normal_class(spa));
if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
return (0);
}
return (1);
}
static void
zvol_geom_bio_strategy(struct bio *bp)
{
zvol_state_t *zv;
uint64_t off, volsize;
size_t resid;
char *addr;
objset_t *os;
zfs_locked_range_t *lr;
int error = 0;
boolean_t doread = B_FALSE;
boolean_t is_dumpified;
boolean_t sync;
if (bp->bio_to)
zv = bp->bio_to->private;
else
zv = bp->bio_dev->si_drv2;
if (zv == NULL) {
error = SET_ERROR(ENXIO);
goto out;
}
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
switch (bp->bio_cmd) {
case BIO_READ:
doread = B_TRUE;
break;
case BIO_WRITE:
case BIO_FLUSH:
case BIO_DELETE:
if (zv->zv_flags & ZVOL_RDONLY) {
error = SET_ERROR(EROFS);
goto resume;
}
zvol_ensure_zilog(zv);
if (bp->bio_cmd == BIO_FLUSH)
goto sync;
break;
default:
error = EOPNOTSUPP;
goto resume;
}
off = bp->bio_offset;
volsize = zv->zv_volsize;
os = zv->zv_objset;
ASSERT(os != NULL);
addr = bp->bio_data;
resid = bp->bio_length;
if (resid > 0 && off >= volsize) {
error = SET_ERROR(EIO);
goto resume;
}
is_dumpified = B_FALSE;
sync = !doread && !is_dumpified &&
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
*/
lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
doread ? RL_READER : RL_WRITER);
if (bp->bio_cmd == BIO_DELETE) {
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
dmu_tx_abort(tx);
} else {
zvol_log_truncate(zv, tx, off, resid, sync);
dmu_tx_commit(tx);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
off, resid);
resid = 0;
}
goto unlock;
}
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
if (doread) {
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
DMU_READ_PREFETCH);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
zvol_log_write(zv, tx, off, size, sync);
dmu_tx_commit(tx);
}
}
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
}
off += size;
addr += size;
resid -= size;
}
unlock:
zfs_rangelock_exit(lr);
bp->bio_completed = bp->bio_length - resid;
if (bp->bio_completed < bp->bio_length && off > volsize)
error = EINVAL;
switch (bp->bio_cmd) {
case BIO_FLUSH:
break;
case BIO_READ:
dataset_kstats_update_read_kstats(&zv->zv_kstat,
bp->bio_completed);
break;
case BIO_WRITE:
dataset_kstats_update_write_kstats(&zv->zv_kstat,
bp->bio_completed);
break;
case BIO_DELETE:
break;
default:
break;
}
if (sync) {
sync:
zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
resume:
rw_exit(&zv->zv_suspend_lock);
out:
if (bp->bio_to)
g_io_deliver(bp, error);
else
biofinish(bp, NULL, error);
}
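
The heart of the strategy routine is the chunking loop: each pass clamps the transfer to zvol_maxphys, stops at the end of the volume, and advances the offset, buffer pointer and residual so the completed byte count can be reported afterwards. A self-contained sketch of just that loop, with hypothetical names and an in-memory "volume" standing in for the DMU read/write calls:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MAX_XFER 131072				/* plays the role of zvol_maxphys */

static size_t
copy_chunked(uint8_t *dst_vol, uint64_t volsize, uint64_t off,
    const uint8_t *src, size_t resid)
{
	size_t done = 0;

	while (resid != 0 && off < volsize) {
		size_t size = resid < MAX_XFER ? resid : MAX_XFER;

		if (size > volsize - off)	/* do not run past the end */
			size = volsize - off;
		memcpy(dst_vol + off, src, size);
		off += size;
		src += size;
		resid -= size;
		done += size;
	}
	return (done);				/* analogous to bio_completed */
}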
/*
* Character device mode implementation
*/
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
zvol_state_t *zv;
uint64_t volsize;
zfs_locked_range_t *lr;
int error = 0;
zv = dev->si_drv2;
volsize = zv->zv_volsize;
/*
* uio_loffset == volsize isn't an error as
* it's required for EOF processing.
*/
if (uio->uio_resid > 0 &&
(uio->uio_loffset < 0 || uio->uio_loffset > volsize))
return (SET_ERROR(EIO));
lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
uio->uio_resid, RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
/* don't read past the end */
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
}
}
zfs_rangelock_exit(lr);
return (error);
}
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
zvol_state_t *zv;
uint64_t volsize;
zfs_locked_range_t *lr;
int error = 0;
boolean_t sync;
zv = dev->si_drv2;
volsize = zv->zv_volsize;
if (uio->uio_resid > 0 &&
(uio->uio_loffset < 0 || uio->uio_loffset > volsize))
return (SET_ERROR(EIO));
sync = (ioflag & IO_SYNC) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
zvol_ensure_zilog(zv);
lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
uio->uio_resid, RL_WRITER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset;
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
if (bytes > volsize - off) /* don't write past the end */
bytes = volsize - off;
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
if (error)
break;
}
zfs_rangelock_exit(lr);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
return (error);
}
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
zvol_state_t *zv;
struct zvol_state_dev *zsd;
int err = 0;
boolean_t drop_suspend = B_TRUE;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
/*
* make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
}
}
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
err = zvol_first_open(zv, !(flags & FWRITE));
if (err)
goto out_locked;
}
if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
err = EROFS;
goto out_opened;
}
if (zv->zv_flags & ZVOL_EXCL) {
err = EBUSY;
goto out_opened;
}
#ifdef FEXCL
if (flags & FEXCL) {
if (zv->zv_open_count != 0) {
err = EBUSY;
goto out_opened;
}
zv->zv_flags |= ZVOL_EXCL;
}
#endif
zv->zv_open_count++;
if (flags & (FSYNC | FDSYNC)) {
zsd = &zv->zv_zso->zso_dev;
zsd->zsd_sync_cnt++;
if (zsd->zsd_sync_cnt == 1 &&
(zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
}
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (0);
out_opened:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
out_locked:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(err));
}
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
zvol_state_t *zv;
struct zvol_state_dev *zsd;
boolean_t drop_suspend = B_TRUE;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
ASSERT(zv->zv_open_count == 1);
zv->zv_flags &= ~ZVOL_EXCL;
}
ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
/*
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
ASSERT(zv->zv_open_count > 0);
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 1) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
}
}
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/*
* You may get multiple opens, but only one close.
*/
zv->zv_open_count--;
if (flags & (FSYNC | FDSYNC)) {
zsd = &zv->zv_zso->zso_dev;
zsd->zsd_sync_cnt--;
}
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
zvol_last_close(zv);
}
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (0);
}
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
int fflag, struct thread *td)
{
zvol_state_t *zv;
zfs_locked_range_t *lr;
off_t offset, length;
int i, error;
boolean_t sync;
zv = dev->si_drv2;
error = 0;
KASSERT(zv->zv_open_count > 0,
("Device with zero access count in %s", __func__));
i = IOCPARM_LEN(cmd);
switch (cmd) {
case DIOCGSECTORSIZE:
*(uint32_t *)data = DEV_BSIZE;
break;
case DIOCGMEDIASIZE:
*(off_t *)data = zv->zv_volsize;
break;
case DIOCGFLUSH:
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
if (zv->zv_zilog != NULL)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
break;
case DIOCGDELETE:
if (!zvol_unmap_enabled)
break;
offset = ((off_t *)data)[0];
length = ((off_t *)data)[1];
if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
offset < 0 || offset >= zv->zv_volsize ||
length <= 0) {
printf("%s: offset=%jd length=%jd\n", __func__, offset,
length);
error = EINVAL;
break;
}
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
zvol_ensure_zilog(zv);
lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
RL_WRITER);
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
sync = FALSE;
dmu_tx_abort(tx);
} else {
sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
zvol_log_truncate(zv, tx, offset, length, sync);
dmu_tx_commit(tx);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
offset, length);
}
zfs_rangelock_exit(lr);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
break;
case DIOCGSTRIPESIZE:
*(off_t *)data = zv->zv_volblocksize;
break;
case DIOCGSTRIPEOFFSET:
*(off_t *)data = 0;
break;
case DIOCGATTR: {
spa_t *spa = dmu_objset_spa(zv->zv_objset);
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
uint64_t refd, avail, usedobjs, availobjs;
if (strcmp(arg->name, "GEOM::candelete") == 0)
arg->value.i = 1;
else if (strcmp(arg->name, "blocksavail") == 0) {
dmu_objset_space(zv->zv_objset, &refd, &avail,
&usedobjs, &availobjs);
arg->value.off = avail / DEV_BSIZE;
} else if (strcmp(arg->name, "blocksused") == 0) {
dmu_objset_space(zv->zv_objset, &refd, &avail,
&usedobjs, &availobjs);
arg->value.off = refd / DEV_BSIZE;
} else if (strcmp(arg->name, "poolblocksavail") == 0) {
avail = metaslab_class_get_space(spa_normal_class(spa));
avail -= metaslab_class_get_alloc(
spa_normal_class(spa));
arg->value.off = avail / DEV_BSIZE;
} else if (strcmp(arg->name, "poolblocksused") == 0) {
refd = metaslab_class_get_alloc(spa_normal_class(spa));
arg->value.off = refd / DEV_BSIZE;
} else
error = ENOIOCTL;
break;
}
case FIOSEEKHOLE:
case FIOSEEKDATA: {
off_t *off = (off_t *)data;
uint64_t noff;
boolean_t hole;
hole = (cmd == FIOSEEKHOLE);
noff = *off;
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
*off = noff;
break;
}
default:
error = ENOIOCTL;
}
return (error);
}
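
The geometry ioctls handled above (DIOCGSECTORSIZE, DIOCGMEDIASIZE, DIOCGSTRIPESIZE) can be exercised from userland against a zvol exposed as a character device. A small example program; the device path is an assumption and presumes a zvol named tank/vol0 created with volmode=dev:

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/dev/zvol/tank/vol0";	/* example path */
	off_t mediasize, stripesize;
	unsigned int sectorsize;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror(path);
		return (1);
	}
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == 0 &&
	    ioctl(fd, DIOCGSECTORSIZE, &sectorsize) == 0 &&
	    ioctl(fd, DIOCGSTRIPESIZE, &stripesize) == 0)
		printf("%s: %jd bytes, %u-byte sectors, %jd-byte stripes\n",
		    path, (intmax_t)mediasize, sectorsize,
		    (intmax_t)stripesize);
	close(fd);
	return (0);
}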
/*
* Misc. helpers
*/
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
/*
* Open a ZIL if this is the first time we have written to this
* zvol. We protect zv->zv_zilog with zv_suspend_lock rather
* than zv_state_lock so that we don't need to acquire an
* additional lock in this path.
*/
if (zv->zv_zilog == NULL) {
if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
rw_exit(&zv->zv_suspend_lock);
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
}
if (zv->zv_zilog == NULL) {
zv->zv_zilog = zil_open(zv->zv_objset,
zvol_get_data);
zv->zv_flags |= ZVOL_WRITTEN_TO;
}
rw_downgrade(&zv->zv_suspend_lock);
}
}
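
zvol_ensure_zilog() is lazy, once-only initialization guarded by a reader/writer lock: readers check the pointer under the shared lock, and the first writer upgrades to exclusive, re-checks, initializes, and downgrades back. A rough userland sketch of the same idea with a pthread rwlock, which has no upgrade or downgrade primitives, so the sketch always drops and reacquires instead; handle and ensure_handle are hypothetical names:

#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;
static void *handle;			/* stands in for zv_zilog */

void
ensure_handle(void)
{
	/* Caller is assumed to hold lk as reader. */
	if (handle == NULL) {
		/* "Upgrade": drop the read lock and take the write lock. */
		pthread_rwlock_unlock(&lk);
		pthread_rwlock_wrlock(&lk);
		/* Re-check after the unlocked gap; someone may have won. */
		if (handle == NULL)
			handle = malloc(64);
		/*
		 * "Downgrade" back to reader for the caller.  Unlike
		 * rw_downgrade(), this is not atomic in pthreads.
		 */
		pthread_rwlock_unlock(&lk);
		pthread_rwlock_rdlock(&lk);
	}
}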
static boolean_t
zvol_is_zvol_impl(const char *device)
{
return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/* move to new hashtable entry */
zv->zv_hash = zvol_name_hash(zv->zv_name);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
struct g_geom *gp;
g_topology_lock();
gp = pp->geom;
ASSERT(gp != NULL);
zsg->zsg_provider = NULL;
g_wither_provider(pp, ENXIO);
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
pp->sectorsize = DEV_BSIZE;
pp->mediasize = zv->zv_volsize;
pp->private = zv;
zsg->zsg_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev;
struct make_dev_args args;
dev = zsd->zsd_cdev;
if (dev != NULL) {
destroy_dev(dev);
dev = zsd->zsd_cdev = NULL;
if (zv->zv_open_count > 0) {
zv->zv_flags &= ~ZVOL_EXCL;
zv->zv_open_count = 0;
/* XXX need suspend lock but lock order */
zvol_last_close(zv);
}
}
make_dev_args_init(&args);
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
args.mda_devsw = &zvol_cdevsw;
args.mda_cr = NULL;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
== 0) {
- dev->si_iosize_max = MAXPHYS;
+ dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
}
}
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}
/*
* Remove minor node for the specified volume.
*/
static void
zvol_free(zvol_state_t *zv)
{
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
g_topology_lock();
zvol_geom_destroy(zv);
g_topology_unlock();
mtx_destroy(&zsg->zsg_queue_mtx);
} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
if (dev != NULL)
destroy_dev(dev);
}
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
zvol_minors--;
}
/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
static int
zvol_create_minor_impl(const char *name)
{
zvol_state_t *zv;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
uint64_t volmode, hash;
int error;
ZFS_LOG(1, "Creating ZVOL %s...", name);
hash = zvol_name_hash(name);
if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
mutex_exit(&zv->zv_state_lock);
return (SET_ERROR(EEXIST));
}
DROP_GIANT();
/* lie and say we're read-only */
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
if (error)
goto out_doi;
error = dmu_object_info(os, ZVOL_OBJ, doi);
if (error)
goto out_dmu_objset_disown;
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error)
goto out_dmu_objset_disown;
error = dsl_prop_get_integer(name,
zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
/*
* zvol_alloc equivalent ...
*/
zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
zv->zv_hash = hash;
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso->zso_volmode = volmode;
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp;
struct g_geom *gp;
zsg->zsg_state = ZVOL_GEOM_UNINIT;
mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
g_topology_lock();
gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
gp->start = zvol_geom_bio_start;
gp->access = zvol_geom_access;
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
/* TODO: NULL check? */
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
pp->sectorsize = DEV_BSIZE;
pp->mediasize = 0;
pp->private = zv;
zsg->zsg_provider = pp;
bioq_init(&zsg->zsg_queue);
} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev;
struct make_dev_args args;
make_dev_args_init(&args);
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
args.mda_devsw = &zvol_cdevsw;
args.mda_cr = NULL;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
if (error != 0) {
mutex_destroy(&zv->zv_state_lock);
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (*zv));
dmu_objset_disown(os, B_TRUE, FTAG);
goto out_giant;
}
- dev->si_iosize_max = MAXPHYS;
+ dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
}
(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
zv->zv_flags |= ZVOL_RDONLY;
zv->zv_volblocksize = doi->doi_data_block_size;
zv->zv_volsize = volsize;
zv->zv_objset = os;
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
zil_destroy(dmu_objset_zil(os), B_FALSE);
else
zil_replay(os, zv, zvol_replay_vector);
}
ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
/* XXX do prefetch */
zv->zv_objset = NULL;
out_dmu_objset_disown:
dmu_objset_disown(os, B_TRUE, FTAG);
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
if (error == 0)
zvol_geom_run(zv);
g_topology_unlock();
}
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
if (error == 0) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
zvol_minors++;
rw_exit(&zvol_state_lock);
}
ZFS_LOG(1, "ZVOL %s created.", name);
out_giant:
PICKUP_GIANT();
return (error);
}
static void
zvol_clear_private(zvol_state_t *zv)
{
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
if (pp == NULL) /* XXX when? */
return;
mtx_lock(&zsg->zsg_queue_mtx);
zsg->zsg_state = ZVOL_GEOM_STOPPED;
pp->private = NULL;
wakeup_one(&zsg->zsg_queue);
while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
msleep(&zsg->zsg_state,
&zsg->zsg_queue_mtx,
0, "zvol:w", 0);
mtx_unlock(&zsg->zsg_queue_mtx);
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
}
}
static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
zv->zv_volsize = volsize;
if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
if (pp == NULL) /* XXX when? */
return (0);
g_topology_lock();
/*
* Do not invoke resize event when initial size was zero.
* ZVOL initializes the size on first open, this is not
* real resizing.
*/
if (pp->mediasize == 0)
pp->mediasize = zv->zv_volsize;
else
g_resize_provider(pp, zv->zv_volsize);
g_topology_unlock();
}
return (0);
}
static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}
static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}
static const zvol_platform_ops_t zvol_freebsd_ops = {
.zv_free = zvol_free,
.zv_rename_minor = zvol_rename_minor,
.zv_create_minor = zvol_create_minor_impl,
.zv_update_volsize = zvol_update_volsize,
.zv_clear_private = zvol_clear_private,
.zv_is_zvol = zvol_is_zvol_impl,
.zv_set_disk_ro = zvol_set_disk_ro_impl,
.zv_set_capacity = zvol_set_capacity_impl,
};
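
zvol_freebsd_ops is a plain C operations table: the platform-independent zvol core calls the FreeBSD-specific routines only through these function pointers, registered once at init time via zvol_register_ops(). A minimal, self-contained sketch of the same pattern with hypothetical names (backend_ops_t, register_ops, file_backend):

#include <stdio.h>

typedef struct backend_ops {
	int  (*bo_open)(const char *name);
	void (*bo_close)(int handle);
} backend_ops_t;

static int
file_open(const char *name)
{
	printf("open %s\n", name);
	return (3);
}

static void
file_close(int handle)
{
	printf("close %d\n", handle);
}

static const backend_ops_t file_backend = {
	.bo_open  = file_open,
	.bo_close = file_close,
};

static const backend_ops_t *registered_ops;

static void
register_ops(const backend_ops_t *ops)		/* like zvol_register_ops() */
{
	registered_ops = ops;
}

int
main(void)
{
	register_ops(&file_backend);
	int h = registered_ops->bo_open("demo");	/* common code calls */
	registered_ops->bo_close(h);			/* through the table  */
	return (0);
}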
/*
* Public interfaces
*/
int
zvol_busy(void)
{
return (zvol_minors != 0);
}
int
zvol_init(void)
{
zvol_init_impl();
zvol_register_ops(&zvol_freebsd_ops);
return (0);
}
void
zvol_fini(void)
{
zvol_fini_impl();
}
diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c
index 5c194e905cdf..8991d9c23fbc 100644
--- a/sys/dev/ahci/ahci.c
+++ b/sys/dev/ahci/ahci.c
@@ -1,2909 +1,2906 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <machine/stdarg.h>
#include <machine/resource.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include "ahci.h"
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
/* local prototypes */
static void ahci_intr(void *data);
static void ahci_intr_one(void *data);
static void ahci_intr_one_edge(void *data);
static int ahci_ch_init(device_t dev);
static int ahci_ch_deinit(device_t dev);
static int ahci_ch_suspend(device_t dev);
static int ahci_ch_resume(device_t dev);
static void ahci_ch_pm(void *arg);
static void ahci_ch_intr(void *arg);
static void ahci_ch_intr_direct(void *arg);
static void ahci_ch_intr_main(struct ahci_channel *ch, uint32_t istatus);
static void ahci_begin_transaction(struct ahci_channel *ch, union ccb *ccb);
static void ahci_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
static void ahci_execute_transaction(struct ahci_slot *slot);
static void ahci_timeout(void *arg);
static void ahci_end_transaction(struct ahci_slot *slot, enum ahci_err_type et);
static int ahci_setup_fis(struct ahci_channel *ch, struct ahci_cmd_tab *ctp, union ccb *ccb, int tag);
static void ahci_dmainit(device_t dev);
static void ahci_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
static void ahci_dmafini(device_t dev);
static void ahci_slotsalloc(device_t dev);
static void ahci_slotsfree(device_t dev);
static void ahci_reset(struct ahci_channel *ch);
static void ahci_start(struct ahci_channel *ch, int fbs);
static void ahci_stop(struct ahci_channel *ch);
static void ahci_clo(struct ahci_channel *ch);
static void ahci_start_fr(struct ahci_channel *ch);
static void ahci_stop_fr(struct ahci_channel *ch);
static int ahci_phy_check_events(struct ahci_channel *ch, u_int32_t serr);
static uint32_t ahci_ch_detval(struct ahci_channel *ch, uint32_t val);
static int ahci_sata_connect(struct ahci_channel *ch);
static int ahci_sata_phy_reset(struct ahci_channel *ch);
static int ahci_wait_ready(struct ahci_channel *ch, int t, int t0);
static void ahci_issue_recovery(struct ahci_channel *ch);
static void ahci_process_read_log(struct ahci_channel *ch, union ccb *ccb);
static void ahci_process_request_sense(struct ahci_channel *ch, union ccb *ccb);
static void ahciaction(struct cam_sim *sim, union ccb *ccb);
static void ahcipoll(struct cam_sim *sim);
static MALLOC_DEFINE(M_AHCI, "AHCI driver", "AHCI driver data buffers");
#define recovery_type spriv_field0
#define RECOVERY_NONE 0
#define RECOVERY_READ_LOG 1
#define RECOVERY_REQUEST_SENSE 2
#define recovery_slot spriv_field1
static uint32_t
ahci_ch_detval(struct ahci_channel *ch, uint32_t val)
{
return (ch->disablephy ? ATA_SC_DET_DISABLE : val);
}
int
ahci_ctlr_setup(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
/* Clear interrupts */
ATA_OUTL(ctlr->r_mem, AHCI_IS, ATA_INL(ctlr->r_mem, AHCI_IS));
/* Configure CCC */
if (ctlr->ccc) {
ATA_OUTL(ctlr->r_mem, AHCI_CCCP, ATA_INL(ctlr->r_mem, AHCI_PI));
ATA_OUTL(ctlr->r_mem, AHCI_CCCC,
(ctlr->ccc << AHCI_CCCC_TV_SHIFT) |
(4 << AHCI_CCCC_CC_SHIFT) |
AHCI_CCCC_EN);
ctlr->cccv = (ATA_INL(ctlr->r_mem, AHCI_CCCC) &
AHCI_CCCC_INT_MASK) >> AHCI_CCCC_INT_SHIFT;
if (bootverbose) {
device_printf(dev,
"CCC with %dms/4cmd enabled on vector %d\n",
ctlr->ccc, ctlr->cccv);
}
}
/* Enable AHCI interrupts */
ATA_OUTL(ctlr->r_mem, AHCI_GHC,
ATA_INL(ctlr->r_mem, AHCI_GHC) | AHCI_GHC_IE);
return (0);
}
int
ahci_ctlr_reset(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
uint32_t v;
int timeout;
/* BIOS/OS Handoff */
if ((ATA_INL(ctlr->r_mem, AHCI_VS) >= 0x00010200) &&
(ATA_INL(ctlr->r_mem, AHCI_CAP2) & AHCI_CAP2_BOH) &&
((v = ATA_INL(ctlr->r_mem, AHCI_BOHC)) & AHCI_BOHC_OOS) == 0) {
/* Request OS ownership. */
ATA_OUTL(ctlr->r_mem, AHCI_BOHC, v | AHCI_BOHC_OOS);
/* Wait up to 2s for BIOS ownership release. */
for (timeout = 0; timeout < 80; timeout++) {
DELAY(25000);
v = ATA_INL(ctlr->r_mem, AHCI_BOHC);
if ((v & AHCI_BOHC_BOS) == 0)
break;
if ((v & AHCI_BOHC_BB) == 0)
break;
}
}
/* Enable AHCI mode */
ATA_OUTL(ctlr->r_mem, AHCI_GHC, AHCI_GHC_AE);
/* Reset AHCI controller */
ATA_OUTL(ctlr->r_mem, AHCI_GHC, AHCI_GHC_AE|AHCI_GHC_HR);
for (timeout = 1000; timeout > 0; timeout--) {
DELAY(1000);
if ((ATA_INL(ctlr->r_mem, AHCI_GHC) & AHCI_GHC_HR) == 0)
break;
}
if (timeout == 0) {
device_printf(dev, "AHCI controller reset failure\n");
return (ENXIO);
}
/* Reenable AHCI mode */
ATA_OUTL(ctlr->r_mem, AHCI_GHC, AHCI_GHC_AE);
if (ctlr->quirks & AHCI_Q_RESTORE_CAP) {
/*
* Restore capability field.
* This is write to a read-only register to restore its state.
* On fully standard-compliant hardware this is not needed and
* this operation shall not take place. See ahci_pci.c for
* platforms using this quirk.
*/
ATA_OUTL(ctlr->r_mem, AHCI_CAP, ctlr->caps);
}
return (0);
}
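
ahci_ctlr_reset() uses the usual request-then-poll idiom: write the reset bit, then poll it with a fixed delay and a bounded number of tries, failing with ENXIO on timeout. A self-contained userland sketch of that idiom with a simulated status register; read_reg(), wait_bit_clear() and the bit values are hypothetical stand-ins for ATA_INL()/DELAY() and the AHCI_GHC_HR check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Simulated status register: the busy bit clears after a few polls. */
static uint32_t status_reg = 0x1;
static int polls;

static uint32_t
read_reg(void)
{
	if (++polls > 3)
		status_reg &= ~0x1u;
	return (status_reg);
}

static bool
wait_bit_clear(uint32_t mask, int tries, useconds_t step_us)
{
	while (tries-- > 0) {
		usleep(step_us);		/* stands in for DELAY(1000) */
		if ((read_reg() & mask) == 0)
			return (true);		/* bit cleared: reset complete */
	}
	return (false);				/* timed out; driver returns ENXIO */
}

int
main(void)
{
	printf("reset %s\n",
	    wait_bit_clear(0x1, 1000, 1000) ? "ok" : "timed out");
	return (0);
}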
int
ahci_attach(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
int error, i, speed, unit;
uint32_t u, version;
device_t child;
ctlr->dev = dev;
ctlr->ccc = 0;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "ccc", &ctlr->ccc);
mtx_init(&ctlr->ch_mtx, "AHCI channels lock", NULL, MTX_DEF);
/* Setup our own memory management for channels. */
ctlr->sc_iomem.rm_start = rman_get_start(ctlr->r_mem);
ctlr->sc_iomem.rm_end = rman_get_end(ctlr->r_mem);
ctlr->sc_iomem.rm_type = RMAN_ARRAY;
ctlr->sc_iomem.rm_descr = "I/O memory addresses";
if ((error = rman_init(&ctlr->sc_iomem)) != 0) {
ahci_free_mem(dev);
return (error);
}
if ((error = rman_manage_region(&ctlr->sc_iomem,
rman_get_start(ctlr->r_mem), rman_get_end(ctlr->r_mem))) != 0) {
ahci_free_mem(dev);
rman_fini(&ctlr->sc_iomem);
return (error);
}
/* Get the HW capabilities */
version = ATA_INL(ctlr->r_mem, AHCI_VS);
ctlr->caps = ATA_INL(ctlr->r_mem, AHCI_CAP);
if (version >= 0x00010200)
ctlr->caps2 = ATA_INL(ctlr->r_mem, AHCI_CAP2);
if (ctlr->caps & AHCI_CAP_EMS)
ctlr->capsem = ATA_INL(ctlr->r_mem, AHCI_EM_CTL);
if (ctlr->quirks & AHCI_Q_FORCE_PI) {
/*
* Enable ports.
* The spec says that BIOS sets up bits corresponding to
* available ports. On platforms where this information
* is missing, the driver can define available ports on its own.
*/
int nports = (ctlr->caps & AHCI_CAP_NPMASK) + 1;
int nmask = (1 << nports) - 1;
ATA_OUTL(ctlr->r_mem, AHCI_PI, nmask);
device_printf(dev, "Forcing PI to %d ports (mask = %x)\n",
nports, nmask);
}
ctlr->ichannels = ATA_INL(ctlr->r_mem, AHCI_PI);
/* Identify and set separate quirks for HBA and RAID f/w Marvells. */
if ((ctlr->quirks & AHCI_Q_ALTSIG) &&
(ctlr->caps & AHCI_CAP_SPM) == 0)
ctlr->quirks |= AHCI_Q_NOBSYRES;
if (ctlr->quirks & AHCI_Q_1CH) {
ctlr->caps &= ~AHCI_CAP_NPMASK;
ctlr->ichannels &= 0x01;
}
if (ctlr->quirks & AHCI_Q_2CH) {
ctlr->caps &= ~AHCI_CAP_NPMASK;
ctlr->caps |= 1;
ctlr->ichannels &= 0x03;
}
if (ctlr->quirks & AHCI_Q_4CH) {
ctlr->caps &= ~AHCI_CAP_NPMASK;
ctlr->caps |= 3;
ctlr->ichannels &= 0x0f;
}
ctlr->channels = MAX(flsl(ctlr->ichannels),
(ctlr->caps & AHCI_CAP_NPMASK) + 1);
if (ctlr->quirks & AHCI_Q_NOPMP)
ctlr->caps &= ~AHCI_CAP_SPM;
if (ctlr->quirks & AHCI_Q_NONCQ)
ctlr->caps &= ~AHCI_CAP_SNCQ;
if ((ctlr->caps & AHCI_CAP_CCCS) == 0)
ctlr->ccc = 0;
ctlr->emloc = ATA_INL(ctlr->r_mem, AHCI_EM_LOC);
/* Create controller-wide DMA tag. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
(ctlr->caps & AHCI_CAP_64BIT) ? BUS_SPACE_MAXADDR :
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE,
ctlr->dma_coherent ? BUS_DMA_COHERENT : 0, NULL, NULL,
&ctlr->dma_tag)) {
ahci_free_mem(dev);
rman_fini(&ctlr->sc_iomem);
return (ENXIO);
}
ahci_ctlr_setup(dev);
/* Setup interrupts. */
if ((error = ahci_setup_interrupt(dev)) != 0) {
bus_dma_tag_destroy(ctlr->dma_tag);
ahci_free_mem(dev);
rman_fini(&ctlr->sc_iomem);
return (error);
}
i = 0;
for (u = ctlr->ichannels; u != 0; u >>= 1)
i += (u & 1);
ctlr->direct = (ctlr->msi && (ctlr->numirqs > 1 || i <= 3));
resource_int_value(device_get_name(dev), device_get_unit(dev),
"direct", &ctlr->direct);
/* Announce HW capabilities. */
speed = (ctlr->caps & AHCI_CAP_ISS) >> AHCI_CAP_ISS_SHIFT;
device_printf(dev,
"AHCI v%x.%02x with %d %sGbps ports, Port Multiplier %s%s\n",
((version >> 20) & 0xf0) + ((version >> 16) & 0x0f),
((version >> 4) & 0xf0) + (version & 0x0f),
(ctlr->caps & AHCI_CAP_NPMASK) + 1,
((speed == 1) ? "1.5":((speed == 2) ? "3":
((speed == 3) ? "6":"?"))),
(ctlr->caps & AHCI_CAP_SPM) ?
"supported" : "not supported",
(ctlr->caps & AHCI_CAP_FBSS) ?
" with FBS" : "");
if (ctlr->quirks != 0) {
device_printf(dev, "quirks=0x%b\n", ctlr->quirks,
AHCI_Q_BIT_STRING);
}
if (bootverbose) {
device_printf(dev, "Caps:%s%s%s%s%s%s%s%s %sGbps",
(ctlr->caps & AHCI_CAP_64BIT) ? " 64bit":"",
(ctlr->caps & AHCI_CAP_SNCQ) ? " NCQ":"",
(ctlr->caps & AHCI_CAP_SSNTF) ? " SNTF":"",
(ctlr->caps & AHCI_CAP_SMPS) ? " MPS":"",
(ctlr->caps & AHCI_CAP_SSS) ? " SS":"",
(ctlr->caps & AHCI_CAP_SALP) ? " ALP":"",
(ctlr->caps & AHCI_CAP_SAL) ? " AL":"",
(ctlr->caps & AHCI_CAP_SCLO) ? " CLO":"",
((speed == 1) ? "1.5":((speed == 2) ? "3":
((speed == 3) ? "6":"?"))));
printf("%s%s%s%s%s%s %dcmd%s%s%s %dports\n",
(ctlr->caps & AHCI_CAP_SAM) ? " AM":"",
(ctlr->caps & AHCI_CAP_SPM) ? " PM":"",
(ctlr->caps & AHCI_CAP_FBSS) ? " FBS":"",
(ctlr->caps & AHCI_CAP_PMD) ? " PMD":"",
(ctlr->caps & AHCI_CAP_SSC) ? " SSC":"",
(ctlr->caps & AHCI_CAP_PSC) ? " PSC":"",
((ctlr->caps & AHCI_CAP_NCS) >> AHCI_CAP_NCS_SHIFT) + 1,
(ctlr->caps & AHCI_CAP_CCCS) ? " CCC":"",
(ctlr->caps & AHCI_CAP_EMS) ? " EM":"",
(ctlr->caps & AHCI_CAP_SXS) ? " eSATA":"",
(ctlr->caps & AHCI_CAP_NPMASK) + 1);
}
if (bootverbose && version >= 0x00010200) {
device_printf(dev, "Caps2:%s%s%s%s%s%s\n",
(ctlr->caps2 & AHCI_CAP2_DESO) ? " DESO":"",
(ctlr->caps2 & AHCI_CAP2_SADM) ? " SADM":"",
(ctlr->caps2 & AHCI_CAP2_SDS) ? " SDS":"",
(ctlr->caps2 & AHCI_CAP2_APST) ? " APST":"",
(ctlr->caps2 & AHCI_CAP2_NVMP) ? " NVMP":"",
(ctlr->caps2 & AHCI_CAP2_BOH) ? " BOH":"");
}
/* Attach all channels on this controller */
for (unit = 0; unit < ctlr->channels; unit++) {
child = device_add_child(dev, "ahcich", -1);
if (child == NULL) {
device_printf(dev, "failed to add channel device\n");
continue;
}
device_set_ivars(child, (void *)(intptr_t)unit);
if ((ctlr->ichannels & (1 << unit)) == 0)
device_disable(child);
}
/* Attach any remapped NVME device */
for (; unit < ctlr->channels + ctlr->remapped_devices; unit++) {
child = device_add_child(dev, "nvme", -1);
if (child == NULL) {
device_printf(dev, "failed to add remapped NVMe device");
continue;
}
device_set_ivars(child, (void *)(intptr_t)(unit | AHCI_REMAPPED_UNIT));
}
if (ctlr->caps & AHCI_CAP_EMS) {
child = device_add_child(dev, "ahciem", -1);
if (child == NULL)
device_printf(dev, "failed to add enclosure device\n");
else
device_set_ivars(child, (void *)(intptr_t)AHCI_EM_UNIT);
}
bus_generic_attach(dev);
return (0);
}
int
ahci_detach(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
int i;
/* Detach & delete all children */
device_delete_children(dev);
/* Free interrupts. */
for (i = 0; i < ctlr->numirqs; i++) {
if (ctlr->irqs[i].r_irq) {
bus_teardown_intr(dev, ctlr->irqs[i].r_irq,
ctlr->irqs[i].handle);
bus_release_resource(dev, SYS_RES_IRQ,
ctlr->irqs[i].r_irq_rid, ctlr->irqs[i].r_irq);
}
}
bus_dma_tag_destroy(ctlr->dma_tag);
/* Free memory. */
rman_fini(&ctlr->sc_iomem);
ahci_free_mem(dev);
mtx_destroy(&ctlr->ch_mtx);
return (0);
}
void
ahci_free_mem(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
/* Release memory resources */
if (ctlr->r_mem)
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_rid, ctlr->r_mem);
if (ctlr->r_msix_table)
bus_release_resource(dev, SYS_RES_MEMORY,
ctlr->r_msix_tab_rid, ctlr->r_msix_table);
if (ctlr->r_msix_pba)
bus_release_resource(dev, SYS_RES_MEMORY,
ctlr->r_msix_pba_rid, ctlr->r_msix_pba);
ctlr->r_msix_pba = ctlr->r_mem = ctlr->r_msix_table = NULL;
}
int
ahci_setup_interrupt(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
int i;
/* Check for single MSI vector fallback. */
if (ctlr->numirqs > 1 &&
(ATA_INL(ctlr->r_mem, AHCI_GHC) & AHCI_GHC_MRSM) != 0) {
device_printf(dev, "Falling back to one MSI\n");
ctlr->numirqs = 1;
}
/* Ensure we don't overrun irqs. */
if (ctlr->numirqs > AHCI_MAX_IRQS) {
device_printf(dev, "Too many irqs %d > %d (clamping)\n",
ctlr->numirqs, AHCI_MAX_IRQS);
ctlr->numirqs = AHCI_MAX_IRQS;
}
/* Allocate all IRQs. */
for (i = 0; i < ctlr->numirqs; i++) {
ctlr->irqs[i].ctlr = ctlr;
ctlr->irqs[i].r_irq_rid = i + (ctlr->msi ? 1 : 0);
if (ctlr->channels == 1 && !ctlr->ccc && ctlr->msi)
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ONE;
else if (ctlr->numirqs == 1 || i >= ctlr->channels ||
(ctlr->ccc && i == ctlr->cccv))
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ALL;
else if (ctlr->channels > ctlr->numirqs &&
i == ctlr->numirqs - 1)
ctlr->irqs[i].mode = AHCI_IRQ_MODE_AFTER;
else
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ONE;
if (!(ctlr->irqs[i].r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&ctlr->irqs[i].r_irq_rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "unable to map interrupt\n");
return (ENXIO);
}
if ((bus_setup_intr(dev, ctlr->irqs[i].r_irq, ATA_INTR_FLAGS, NULL,
(ctlr->irqs[i].mode != AHCI_IRQ_MODE_ONE) ? ahci_intr :
((ctlr->quirks & AHCI_Q_EDGEIS) ? ahci_intr_one_edge :
ahci_intr_one),
&ctlr->irqs[i], &ctlr->irqs[i].handle))) {
/* SOS XXX release r_irq */
device_printf(dev, "unable to setup interrupt\n");
return (ENXIO);
}
if (ctlr->numirqs > 1) {
bus_describe_intr(dev, ctlr->irqs[i].r_irq,
ctlr->irqs[i].handle,
ctlr->irqs[i].mode == AHCI_IRQ_MODE_ONE ?
"ch%d" : "%d", i);
}
}
return (0);
}
/*
* Common case interrupt handler.
*/
static void
ahci_intr(void *data)
{
struct ahci_controller_irq *irq = data;
struct ahci_controller *ctlr = irq->ctlr;
u_int32_t is, ise = 0;
void *arg;
int unit;
if (irq->mode == AHCI_IRQ_MODE_ALL) {
unit = 0;
if (ctlr->ccc)
is = ctlr->ichannels;
else
is = ATA_INL(ctlr->r_mem, AHCI_IS);
} else { /* AHCI_IRQ_MODE_AFTER */
unit = irq->r_irq_rid - 1;
is = ATA_INL(ctlr->r_mem, AHCI_IS);
is &= (0xffffffff << unit);
}
/* CCC interrupt is edge triggered. */
if (ctlr->ccc)
ise = 1 << ctlr->cccv;
/* Some controllers have edge triggered IS. */
if (ctlr->quirks & AHCI_Q_EDGEIS)
ise |= is;
if (ise != 0)
ATA_OUTL(ctlr->r_mem, AHCI_IS, ise);
for (; unit < ctlr->channels; unit++) {
if ((is & (1 << unit)) != 0 &&
(arg = ctlr->interrupt[unit].argument)) {
ctlr->interrupt[unit].function(arg);
}
}
for (; unit < ctlr->channels + ctlr->remapped_devices; unit++) {
if ((arg = ctlr->interrupt[unit].argument)) {
ctlr->interrupt[unit].function(arg);
}
}
/* AHCI declares level triggered IS. */
if (!(ctlr->quirks & AHCI_Q_EDGEIS))
ATA_OUTL(ctlr->r_mem, AHCI_IS, is);
ATA_RBL(ctlr->r_mem, AHCI_IS);
}
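
For the AHCI_IRQ_MODE_ALL and AHCI_IRQ_MODE_AFTER cases, ahci_intr() walks a bitmask of pending ports and calls each registered per-channel handler. A compact, runnable sketch of that dispatch step with a hypothetical handler table:

#include <stdint.h>
#include <stdio.h>

#define NCHAN 32

static void (*handler[NCHAN])(int unit);

/* Call every registered handler whose bit is set in the pending mask. */
static void
dispatch(uint32_t pending)
{
	for (int unit = 0; unit < NCHAN; unit++) {
		if ((pending & (1u << unit)) != 0 && handler[unit] != NULL)
			handler[unit](unit);
	}
}

static void
chan_intr(int unit)
{
	printf("interrupt on channel %d\n", unit);
}

int
main(void)
{
	handler[0] = handler[5] = chan_intr;
	dispatch(0x21);			/* bits 0 and 5 pending */
	return (0);
}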
/*
* Simplified interrupt handler for multivector MSI mode.
*/
static void
ahci_intr_one(void *data)
{
struct ahci_controller_irq *irq = data;
struct ahci_controller *ctlr = irq->ctlr;
void *arg;
int unit;
unit = irq->r_irq_rid - 1;
if ((arg = ctlr->interrupt[unit].argument))
ctlr->interrupt[unit].function(arg);
/* AHCI declares level triggered IS. */
ATA_OUTL(ctlr->r_mem, AHCI_IS, 1 << unit);
ATA_RBL(ctlr->r_mem, AHCI_IS);
}
static void
ahci_intr_one_edge(void *data)
{
struct ahci_controller_irq *irq = data;
struct ahci_controller *ctlr = irq->ctlr;
void *arg;
int unit;
unit = irq->r_irq_rid - 1;
/* Some controllers have edge triggered IS. */
ATA_OUTL(ctlr->r_mem, AHCI_IS, 1 << unit);
if ((arg = ctlr->interrupt[unit].argument))
ctlr->interrupt[unit].function(arg);
ATA_RBL(ctlr->r_mem, AHCI_IS);
}
struct resource *
ahci_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct ahci_controller *ctlr = device_get_softc(dev);
struct resource *res;
rman_res_t st;
int offset, size, unit;
bool is_em, is_remapped;
unit = (intptr_t)device_get_ivars(child);
is_em = is_remapped = false;
if (unit & AHCI_REMAPPED_UNIT) {
unit &= AHCI_UNIT;
unit -= ctlr->channels;
is_remapped = true;
} else if (unit & AHCI_EM_UNIT) {
unit &= AHCI_UNIT;
is_em = true;
}
res = NULL;
switch (type) {
case SYS_RES_MEMORY:
if (is_remapped) {
offset = ctlr->remap_offset + unit * ctlr->remap_size;
size = ctlr->remap_size;
} else if (!is_em) {
offset = AHCI_OFFSET + (unit << 7);
size = 128;
} else if (*rid == 0) {
offset = AHCI_EM_CTL;
size = 4;
} else {
offset = (ctlr->emloc & 0xffff0000) >> 14;
size = (ctlr->emloc & 0x0000ffff) << 2;
if (*rid != 1) {
if (*rid == 2 && (ctlr->capsem &
(AHCI_EM_XMT | AHCI_EM_SMB)) == 0)
offset += size;
else
break;
}
}
st = rman_get_start(ctlr->r_mem);
res = rman_reserve_resource(&ctlr->sc_iomem, st + offset,
st + offset + size - 1, size, RF_ACTIVE, child);
if (res) {
bus_space_handle_t bsh;
bus_space_tag_t bst;
bsh = rman_get_bushandle(ctlr->r_mem);
bst = rman_get_bustag(ctlr->r_mem);
bus_space_subregion(bst, bsh, offset, 128, &bsh);
rman_set_bushandle(res, bsh);
rman_set_bustag(res, bst);
}
break;
case SYS_RES_IRQ:
if (*rid == ATA_IRQ_RID)
res = ctlr->irqs[0].r_irq;
break;
}
return (res);
}
int
ahci_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
switch (type) {
case SYS_RES_MEMORY:
rman_release_resource(r);
return (0);
case SYS_RES_IRQ:
if (rid != ATA_IRQ_RID)
return (ENOENT);
return (0);
}
return (EINVAL);
}
int
ahci_setup_intr(device_t dev, device_t child, struct resource *irq,
int flags, driver_filter_t *filter, driver_intr_t *function,
void *argument, void **cookiep)
{
struct ahci_controller *ctlr = device_get_softc(dev);
int unit = (intptr_t)device_get_ivars(child) & AHCI_UNIT;
if (filter != NULL) {
printf("ahci.c: we cannot use a filter here\n");
return (EINVAL);
}
ctlr->interrupt[unit].function = function;
ctlr->interrupt[unit].argument = argument;
return (0);
}
int
ahci_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
struct ahci_controller *ctlr = device_get_softc(dev);
int unit = (intptr_t)device_get_ivars(child) & AHCI_UNIT;
ctlr->interrupt[unit].function = NULL;
ctlr->interrupt[unit].argument = NULL;
return (0);
}
int
ahci_print_child(device_t dev, device_t child)
{
intptr_t ivars;
int retval;
retval = bus_print_child_header(dev, child);
ivars = (intptr_t)device_get_ivars(child);
if ((ivars & AHCI_EM_UNIT) == 0)
retval += printf(" at channel %d", (int)ivars & AHCI_UNIT);
retval += bus_print_child_footer(dev, child);
return (retval);
}
int
ahci_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen)
{
intptr_t ivars;
ivars = (intptr_t)device_get_ivars(child);
if ((ivars & AHCI_EM_UNIT) == 0)
snprintf(buf, buflen, "channel=%d", (int)ivars & AHCI_UNIT);
return (0);
}
bus_dma_tag_t
ahci_get_dma_tag(device_t dev, device_t child)
{
struct ahci_controller *ctlr = device_get_softc(dev);
return (ctlr->dma_tag);
}
void
ahci_attached(device_t dev, struct ahci_channel *ch)
{
struct ahci_controller *ctlr = device_get_softc(dev);
mtx_lock(&ctlr->ch_mtx);
ctlr->ch[ch->unit] = ch;
mtx_unlock(&ctlr->ch_mtx);
}
void
ahci_detached(device_t dev, struct ahci_channel *ch)
{
struct ahci_controller *ctlr = device_get_softc(dev);
mtx_lock(&ctlr->ch_mtx);
mtx_lock(&ch->mtx);
ctlr->ch[ch->unit] = NULL;
mtx_unlock(&ch->mtx);
mtx_unlock(&ctlr->ch_mtx);
}
struct ahci_channel *
ahci_getch(device_t dev, int n)
{
struct ahci_controller *ctlr = device_get_softc(dev);
struct ahci_channel *ch;
KASSERT(n >= 0 && n < AHCI_MAX_PORTS, ("Bad channel number %d", n));
mtx_lock(&ctlr->ch_mtx);
ch = ctlr->ch[n];
if (ch != NULL)
mtx_lock(&ch->mtx);
mtx_unlock(&ctlr->ch_mtx);
return (ch);
}
void
ahci_putch(struct ahci_channel *ch)
{
mtx_unlock(&ch->mtx);
}
static int
ahci_ch_probe(device_t dev)
{
device_set_desc_copy(dev, "AHCI channel");
return (BUS_PROBE_DEFAULT);
}
static int
ahci_ch_disablephy_proc(SYSCTL_HANDLER_ARGS)
{
struct ahci_channel *ch;
int error, value;
ch = arg1;
value = ch->disablephy;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL || (value != 0 && value != 1))
return (error);
mtx_lock(&ch->mtx);
ch->disablephy = value;
if (value) {
ahci_ch_deinit(ch->dev);
} else {
ahci_ch_init(ch->dev);
ahci_phy_check_events(ch, ATA_SE_PHY_CHANGED | ATA_SE_EXCHANGED);
}
mtx_unlock(&ch->mtx);
return (0);
}
static int
ahci_ch_attach(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(device_get_parent(dev));
struct ahci_channel *ch = device_get_softc(dev);
struct cam_devq *devq;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
int rid, error, i, sata_rev = 0;
u_int32_t version;
ch->dev = dev;
ch->unit = (intptr_t)device_get_ivars(dev);
ch->caps = ctlr->caps;
ch->caps2 = ctlr->caps2;
ch->start = ctlr->ch_start;
ch->quirks = ctlr->quirks;
ch->vendorid = ctlr->vendorid;
ch->deviceid = ctlr->deviceid;
ch->subvendorid = ctlr->subvendorid;
ch->subdeviceid = ctlr->subdeviceid;
ch->numslots = ((ch->caps & AHCI_CAP_NCS) >> AHCI_CAP_NCS_SHIFT) + 1;
mtx_init(&ch->mtx, "AHCI channel lock", NULL, MTX_DEF);
ch->pm_level = 0;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "pm_level", &ch->pm_level);
STAILQ_INIT(&ch->doneq);
if (ch->pm_level > 3)
callout_init_mtx(&ch->pm_timer, &ch->mtx, 0);
callout_init_mtx(&ch->reset_timer, &ch->mtx, 0);
/* JMicron external ports (0) sometimes limited */
if ((ctlr->quirks & AHCI_Q_SATA1_UNIT0) && ch->unit == 0)
sata_rev = 1;
if (ch->quirks & AHCI_Q_SATA2)
sata_rev = 2;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "sata_rev", &sata_rev);
for (i = 0; i < 16; i++) {
ch->user[i].revision = sata_rev;
ch->user[i].mode = 0;
ch->user[i].bytecount = 8192;
ch->user[i].tags = ch->numslots;
ch->user[i].caps = 0;
ch->curr[i] = ch->user[i];
if (ch->pm_level) {
ch->user[i].caps = CTS_SATA_CAPS_H_PMREQ |
CTS_SATA_CAPS_H_APST |
CTS_SATA_CAPS_D_PMREQ | CTS_SATA_CAPS_D_APST;
}
ch->user[i].caps |= CTS_SATA_CAPS_H_DMAAA |
CTS_SATA_CAPS_H_AN;
}
rid = 0;
if (!(ch->r_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE)))
return (ENXIO);
ch->chcaps = ATA_INL(ch->r_mem, AHCI_P_CMD);
version = ATA_INL(ctlr->r_mem, AHCI_VS);
if (version < 0x00010200 && (ctlr->caps & AHCI_CAP_FBSS))
ch->chcaps |= AHCI_P_CMD_FBSCP;
if (ch->caps2 & AHCI_CAP2_SDS)
ch->chscaps = ATA_INL(ch->r_mem, AHCI_P_DEVSLP);
if (bootverbose) {
device_printf(dev, "Caps:%s%s%s%s%s%s\n",
(ch->chcaps & AHCI_P_CMD_HPCP) ? " HPCP":"",
(ch->chcaps & AHCI_P_CMD_MPSP) ? " MPSP":"",
(ch->chcaps & AHCI_P_CMD_CPD) ? " CPD":"",
(ch->chcaps & AHCI_P_CMD_ESP) ? " ESP":"",
(ch->chcaps & AHCI_P_CMD_FBSCP) ? " FBSCP":"",
(ch->chscaps & AHCI_P_DEVSLP_DSP) ? " DSP":"");
}
ahci_dmainit(dev);
ahci_slotsalloc(dev);
mtx_lock(&ch->mtx);
ahci_ch_init(dev);
rid = ATA_IRQ_RID;
if (!(ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "Unable to map interrupt\n");
error = ENXIO;
goto err0;
}
if ((bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
ctlr->direct ? ahci_ch_intr_direct : ahci_ch_intr,
ch, &ch->ih))) {
device_printf(dev, "Unable to setup interrupt\n");
error = ENXIO;
goto err1;
}
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(ch->numslots);
if (devq == NULL) {
device_printf(dev, "Unable to allocate simq\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
ch->sim = cam_sim_alloc(ahciaction, ahcipoll, "ahcich", ch,
device_get_unit(dev), (struct mtx *)&ch->mtx,
(ch->quirks & AHCI_Q_NOCCS) ? 1 : min(2, ch->numslots),
(ch->caps & AHCI_CAP_SNCQ) ? ch->numslots : 0,
devq);
if (ch->sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "unable to allocate sim\n");
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "unable to create path\n");
error = ENXIO;
goto err3;
}
if (ch->pm_level > 3) {
callout_reset(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8,
ahci_ch_pm, ch);
}
mtx_unlock(&ch->mtx);
ahci_attached(device_get_parent(dev), ch);
ctx = device_get_sysctl_ctx(dev);
tree = device_get_sysctl_tree(dev);
SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "disable_phy",
CTLFLAG_RW | CTLTYPE_UINT | CTLFLAG_NEEDGIANT, ch,
0, ahci_ch_disablephy_proc, "IU", "Disable PHY");
return (0);
err3:
xpt_bus_deregister(cam_sim_path(ch->sim));
err2:
cam_sim_free(ch->sim, /*free_devq*/TRUE);
err1:
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
err0:
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_unlock(&ch->mtx);
mtx_destroy(&ch->mtx);
return (error);
}
static int
ahci_ch_detach(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
ahci_detached(device_get_parent(dev), ch);
mtx_lock(&ch->mtx);
xpt_async(AC_LOST_DEVICE, ch->path, NULL);
/* Forget about reset. */
if (ch->resetting) {
ch->resetting = 0;
xpt_release_simq(ch->sim, TRUE);
}
xpt_free_path(ch->path);
xpt_bus_deregister(cam_sim_path(ch->sim));
cam_sim_free(ch->sim, /*free_devq*/TRUE);
mtx_unlock(&ch->mtx);
if (ch->pm_level > 3)
callout_drain(&ch->pm_timer);
callout_drain(&ch->reset_timer);
bus_teardown_intr(dev, ch->r_irq, ch->ih);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
ahci_ch_deinit(dev);
ahci_slotsfree(dev);
ahci_dmafini(dev);
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_destroy(&ch->mtx);
return (0);
}
static int
ahci_ch_init(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
uint64_t work;
/* Disable port interrupts */
ATA_OUTL(ch->r_mem, AHCI_P_IE, 0);
/* Setup work areas */
work = ch->dma.work_bus + AHCI_CL_OFFSET;
ATA_OUTL(ch->r_mem, AHCI_P_CLB, work & 0xffffffff);
ATA_OUTL(ch->r_mem, AHCI_P_CLBU, work >> 32);
work = ch->dma.rfis_bus;
ATA_OUTL(ch->r_mem, AHCI_P_FB, work & 0xffffffff);
ATA_OUTL(ch->r_mem, AHCI_P_FBU, work >> 32);
/* Activate the channel and power/spin up device */
ATA_OUTL(ch->r_mem, AHCI_P_CMD,
(AHCI_P_CMD_ACTIVE | AHCI_P_CMD_POD | AHCI_P_CMD_SUD |
((ch->pm_level == 2 || ch->pm_level == 3) ? AHCI_P_CMD_ALPE : 0) |
((ch->pm_level > 2) ? AHCI_P_CMD_ASP : 0 )));
ahci_start_fr(ch);
ahci_start(ch, 1);
return (0);
}
static int
ahci_ch_deinit(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
/* Disable port interrupts. */
ATA_OUTL(ch->r_mem, AHCI_P_IE, 0);
/* Reset command register. */
ahci_stop(ch);
ahci_stop_fr(ch);
ATA_OUTL(ch->r_mem, AHCI_P_CMD, 0);
/* Allow everything, including partial and slumber modes. */
ATA_OUTL(ch->r_mem, AHCI_P_SCTL, 0);
/* Request slumber mode transition and give some time to get there. */
ATA_OUTL(ch->r_mem, AHCI_P_CMD, AHCI_P_CMD_SLUMBER);
DELAY(100);
/* Disable PHY. */
ATA_OUTL(ch->r_mem, AHCI_P_SCTL, ATA_SC_DET_DISABLE);
return (0);
}
static int
ahci_ch_suspend(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_freeze_simq(ch->sim, 1);
/* Forget about reset. */
if (ch->resetting) {
ch->resetting = 0;
callout_stop(&ch->reset_timer);
xpt_release_simq(ch->sim, TRUE);
}
while (ch->oslots)
msleep(ch, &ch->mtx, PRIBIO, "ahcisusp", hz/100);
ahci_ch_deinit(dev);
mtx_unlock(&ch->mtx);
return (0);
}
static int
ahci_ch_resume(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
ahci_ch_init(dev);
ahci_reset(ch);
xpt_release_simq(ch->sim, TRUE);
mtx_unlock(&ch->mtx);
return (0);
}
devclass_t ahcich_devclass;
static device_method_t ahcich_methods[] = {
DEVMETHOD(device_probe, ahci_ch_probe),
DEVMETHOD(device_attach, ahci_ch_attach),
DEVMETHOD(device_detach, ahci_ch_detach),
DEVMETHOD(device_suspend, ahci_ch_suspend),
DEVMETHOD(device_resume, ahci_ch_resume),
DEVMETHOD_END
};
static driver_t ahcich_driver = {
"ahcich",
ahcich_methods,
sizeof(struct ahci_channel)
};
DRIVER_MODULE(ahcich, ahci, ahcich_driver, ahcich_devclass, NULL, NULL);
struct ahci_dc_cb_args {
bus_addr_t maddr;
int error;
};
static void
ahci_dmainit(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
struct ahci_dc_cb_args dcba;
size_t rfsize;
int error;
/* Command area. */
error = bus_dma_tag_create(bus_get_dma_tag(dev), 1024, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, AHCI_WORK_SIZE, 1, AHCI_WORK_SIZE,
0, NULL, NULL, &ch->dma.work_tag);
if (error != 0)
goto error;
error = bus_dmamem_alloc(ch->dma.work_tag, (void **)&ch->dma.work,
BUS_DMA_ZERO, &ch->dma.work_map);
if (error != 0)
goto error;
error = bus_dmamap_load(ch->dma.work_tag, ch->dma.work_map, ch->dma.work,
AHCI_WORK_SIZE, ahci_dmasetupc_cb, &dcba, BUS_DMA_NOWAIT);
if (error != 0 || (error = dcba.error) != 0) {
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
goto error;
}
ch->dma.work_bus = dcba.maddr;
/* FIS receive area. */
if (ch->chcaps & AHCI_P_CMD_FBSCP)
rfsize = 4096;
else
rfsize = 256;
error = bus_dma_tag_create(bus_get_dma_tag(dev), rfsize, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, rfsize, 1, rfsize,
0, NULL, NULL, &ch->dma.rfis_tag);
if (error != 0)
goto error;
error = bus_dmamem_alloc(ch->dma.rfis_tag, (void **)&ch->dma.rfis, 0,
&ch->dma.rfis_map);
if (error != 0)
goto error;
error = bus_dmamap_load(ch->dma.rfis_tag, ch->dma.rfis_map, ch->dma.rfis,
rfsize, ahci_dmasetupc_cb, &dcba, BUS_DMA_NOWAIT);
if (error != 0 || (error = dcba.error) != 0) {
bus_dmamem_free(ch->dma.rfis_tag, ch->dma.rfis, ch->dma.rfis_map);
goto error;
}
ch->dma.rfis_bus = dcba.maddr;
/* Data area. */
error = bus_dma_tag_create(bus_get_dma_tag(dev), 2, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL,
- AHCI_SG_ENTRIES * PAGE_SIZE * ch->numslots,
- AHCI_SG_ENTRIES, AHCI_PRD_MAX,
+ AHCI_SG_ENTRIES * PAGE_SIZE, AHCI_SG_ENTRIES, AHCI_PRD_MAX,
0, busdma_lock_mutex, &ch->mtx, &ch->dma.data_tag);
if (error != 0)
goto error;
return;
error:
device_printf(dev, "WARNING - DMA initialization failed, error %d\n",
error);
ahci_dmafini(dev);
}
static void
ahci_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct ahci_dc_cb_args *dcba = (struct ahci_dc_cb_args *)xsc;
if (!(dcba->error = error))
dcba->maddr = segs[0].ds_addr;
}
static void
ahci_dmafini(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
if (ch->dma.data_tag) {
bus_dma_tag_destroy(ch->dma.data_tag);
ch->dma.data_tag = NULL;
}
if (ch->dma.rfis_bus) {
bus_dmamap_unload(ch->dma.rfis_tag, ch->dma.rfis_map);
bus_dmamem_free(ch->dma.rfis_tag, ch->dma.rfis, ch->dma.rfis_map);
ch->dma.rfis_bus = 0;
ch->dma.rfis = NULL;
}
if (ch->dma.work_bus) {
bus_dmamap_unload(ch->dma.work_tag, ch->dma.work_map);
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
ch->dma.work_bus = 0;
ch->dma.work = NULL;
}
if (ch->dma.work_tag) {
bus_dma_tag_destroy(ch->dma.work_tag);
ch->dma.work_tag = NULL;
}
}
static void
ahci_slotsalloc(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
int i;
/* Alloc and setup command/dma slots */
bzero(ch->slot, sizeof(ch->slot));
for (i = 0; i < ch->numslots; i++) {
struct ahci_slot *slot = &ch->slot[i];
slot->ch = ch;
slot->slot = i;
slot->state = AHCI_SLOT_EMPTY;
+ slot->ct_offset = AHCI_CT_OFFSET + AHCI_CT_SIZE * i;
slot->ccb = NULL;
callout_init_mtx(&slot->timeout, &ch->mtx, 0);
if (bus_dmamap_create(ch->dma.data_tag, 0, &slot->dma.data_map))
device_printf(ch->dev, "FAILURE - create data_map\n");
}
}
static void
ahci_slotsfree(device_t dev)
{
struct ahci_channel *ch = device_get_softc(dev);
int i;
/* Free all dma slots */
for (i = 0; i < ch->numslots; i++) {
struct ahci_slot *slot = &ch->slot[i];
callout_drain(&slot->timeout);
if (slot->dma.data_map) {
bus_dmamap_destroy(ch->dma.data_tag, slot->dma.data_map);
slot->dma.data_map = NULL;
}
}
}
static int
ahci_phy_check_events(struct ahci_channel *ch, u_int32_t serr)
{
if (((ch->pm_level == 0) && (serr & ATA_SE_PHY_CHANGED)) ||
((ch->pm_level != 0 || ch->listening) && (serr & ATA_SE_EXCHANGED))) {
u_int32_t status = ATA_INL(ch->r_mem, AHCI_P_SSTS);
union ccb *ccb;
if (bootverbose) {
if ((status & ATA_SS_DET_MASK) != ATA_SS_DET_NO_DEVICE)
device_printf(ch->dev, "CONNECT requested\n");
else
device_printf(ch->dev, "DISCONNECT requested\n");
}
ahci_reset(ch);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return (0);
if (xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return (0);
}
xpt_rescan(ccb);
return (1);
}
return (0);
}
static void
ahci_cpd_check_events(struct ahci_channel *ch)
{
u_int32_t status;
union ccb *ccb;
device_t dev;
if (ch->pm_level == 0)
return;
status = ATA_INL(ch->r_mem, AHCI_P_CMD);
if ((status & AHCI_P_CMD_CPD) == 0)
return;
if (bootverbose) {
dev = ch->dev;
if (status & AHCI_P_CMD_CPS) {
device_printf(dev, "COLD CONNECT requested\n");
} else
device_printf(dev, "COLD DISCONNECT requested\n");
}
ahci_reset(ch);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return;
if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
}
static void
ahci_notify_events(struct ahci_channel *ch, u_int32_t status)
{
struct cam_path *dpath;
int i;
if (ch->caps & AHCI_CAP_SSNTF)
ATA_OUTL(ch->r_mem, AHCI_P_SNTF, status);
if (bootverbose)
device_printf(ch->dev, "SNTF 0x%04x\n", status);
for (i = 0; i < 16; i++) {
if ((status & (1 << i)) == 0)
continue;
if (xpt_create_path(&dpath, NULL,
xpt_path_path_id(ch->path), i, 0) == CAM_REQ_CMP) {
xpt_async(AC_SCSI_AEN, dpath, NULL);
xpt_free_path(dpath);
}
}
}
static void
ahci_done(struct ahci_channel *ch, union ccb *ccb)
{
mtx_assert(&ch->mtx, MA_OWNED);
if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
ch->batch == 0) {
xpt_done(ccb);
return;
}
STAILQ_INSERT_TAIL(&ch->doneq, &ccb->ccb_h, sim_links.stqe);
}
static void
ahci_ch_intr(void *arg)
{
struct ahci_channel *ch = (struct ahci_channel *)arg;
uint32_t istatus;
/* Read interrupt statuses. */
istatus = ATA_INL(ch->r_mem, AHCI_P_IS);
mtx_lock(&ch->mtx);
ahci_ch_intr_main(ch, istatus);
mtx_unlock(&ch->mtx);
}
static void
ahci_ch_intr_direct(void *arg)
{
struct ahci_channel *ch = (struct ahci_channel *)arg;
struct ccb_hdr *ccb_h;
uint32_t istatus;
STAILQ_HEAD(, ccb_hdr) tmp_doneq = STAILQ_HEAD_INITIALIZER(tmp_doneq);
/* Read interrupt statuses. */
istatus = ATA_INL(ch->r_mem, AHCI_P_IS);
mtx_lock(&ch->mtx);
ch->batch = 1;
ahci_ch_intr_main(ch, istatus);
ch->batch = 0;
/*
* Prevent the possibility of issues caused by processing the queue
* while unlocked below by moving the contents to a local queue.
*/
STAILQ_CONCAT(&tmp_doneq, &ch->doneq);
mtx_unlock(&ch->mtx);
while ((ccb_h = STAILQ_FIRST(&tmp_doneq)) != NULL) {
STAILQ_REMOVE_HEAD(&tmp_doneq, sim_links.stqe);
xpt_done_direct((union ccb *)ccb_h);
}
}
static void
ahci_ch_pm(void *arg)
{
struct ahci_channel *ch = (struct ahci_channel *)arg;
uint32_t work;
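/* With the port idle, request a PARTIAL (pm_level 4) or SLUMBER link power state. */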
if (ch->numrslots != 0)
return;
work = ATA_INL(ch->r_mem, AHCI_P_CMD);
if (ch->pm_level == 4)
work |= AHCI_P_CMD_PARTIAL;
else
work |= AHCI_P_CMD_SLUMBER;
ATA_OUTL(ch->r_mem, AHCI_P_CMD, work);
}
static void
ahci_ch_intr_main(struct ahci_channel *ch, uint32_t istatus)
{
uint32_t cstatus, serr = 0, sntf = 0, ok, err;
enum ahci_err_type et;
int i, ccs, port, reset = 0;
/* Clear interrupt statuses. */
ATA_OUTL(ch->r_mem, AHCI_P_IS, istatus);
/* Read command statuses. */
if (ch->numtslots != 0)
cstatus = ATA_INL(ch->r_mem, AHCI_P_SACT);
else
cstatus = 0;
if (ch->numrslots != ch->numtslots)
cstatus |= ATA_INL(ch->r_mem, AHCI_P_CI);
/* Read SNTF in one of possible ways. */
if ((istatus & AHCI_P_IX_SDB) &&
(ch->pm_present || ch->curr[0].atapi != 0)) {
if (ch->caps & AHCI_CAP_SSNTF)
sntf = ATA_INL(ch->r_mem, AHCI_P_SNTF);
else if (ch->fbs_enabled) {
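/*
* With FBS each PM port has its own 256-byte receive FIS area;
* bit 7 of the second byte of the Set Device Bits FIS (offset 0x58)
* carries the notification flag.
*/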
u_int8_t *fis = ch->dma.rfis + 0x58;
for (i = 0; i < 16; i++) {
if (fis[1] & 0x80) {
fis[1] &= 0x7f;
sntf |= 1 << i;
}
fis += 256;
}
} else {
u_int8_t *fis = ch->dma.rfis + 0x58;
if (fis[1] & 0x80)
sntf = (1 << (fis[1] & 0x0f));
}
}
/* Process PHY events */
if (istatus & (AHCI_P_IX_PC | AHCI_P_IX_PRC | AHCI_P_IX_OF |
AHCI_P_IX_IF | AHCI_P_IX_HBD | AHCI_P_IX_HBF | AHCI_P_IX_TFE)) {
serr = ATA_INL(ch->r_mem, AHCI_P_SERR);
if (serr) {
ATA_OUTL(ch->r_mem, AHCI_P_SERR, serr);
reset = ahci_phy_check_events(ch, serr);
}
}
/* Process cold presence detection events */
if ((istatus & AHCI_P_IX_CPD) && !reset)
ahci_cpd_check_events(ch);
/* Process command errors */
if (istatus & (AHCI_P_IX_OF | AHCI_P_IX_IF |
AHCI_P_IX_HBD | AHCI_P_IX_HBF | AHCI_P_IX_TFE)) {
if (ch->quirks & AHCI_Q_NOCCS) {
/*
* ASMedia chips sometimes report failed commands as
* completed. Count all running commands as failed.
*/
cstatus |= ch->rslots;
/* They also report wrong CCS, so try to guess one. */
ccs = powerof2(cstatus) ? ffs(cstatus) - 1 : -1;
} else {
ccs = (ATA_INL(ch->r_mem, AHCI_P_CMD) &
AHCI_P_CMD_CCS_MASK) >> AHCI_P_CMD_CCS_SHIFT;
}
//device_printf(dev, "%s ERROR is %08x cs %08x ss %08x rs %08x tfd %02x serr %08x fbs %08x ccs %d\n",
// __func__, istatus, cstatus, sstatus, ch->rslots, ATA_INL(ch->r_mem, AHCI_P_TFD),
// serr, ATA_INL(ch->r_mem, AHCI_P_FBS), ccs);
port = -1;
if (ch->fbs_enabled) {
uint32_t fbs = ATA_INL(ch->r_mem, AHCI_P_FBS);
if (fbs & AHCI_P_FBS_SDE) {
port = (fbs & AHCI_P_FBS_DWE)
>> AHCI_P_FBS_DWE_SHIFT;
} else {
for (i = 0; i < 16; i++) {
if (ch->numrslotspd[i] == 0)
continue;
if (port == -1)
port = i;
else if (port != i) {
port = -2;
break;
}
}
}
}
err = ch->rslots & cstatus;
} else {
ccs = 0;
err = 0;
port = -1;
}
/* Complete all successful commands. */
ok = ch->rslots & ~cstatus;
for (i = 0; i < ch->numslots; i++) {
if ((ok >> i) & 1)
ahci_end_transaction(&ch->slot[i], AHCI_ERR_NONE);
}
/* On error, complete the rest of commands with error statuses. */
if (err) {
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
ahci_done(ch, fccb);
}
for (i = 0; i < ch->numslots; i++) {
/* XXX: requests in loading state. */
if (((err >> i) & 1) == 0)
continue;
if (port >= 0 &&
ch->slot[i].ccb->ccb_h.target_id != port)
continue;
if (istatus & AHCI_P_IX_TFE) {
if (port != -2) {
/* Task File Error */
if (ch->numtslotspd[
ch->slot[i].ccb->ccb_h.target_id] == 0) {
/* Untagged operation. */
if (i == ccs)
et = AHCI_ERR_TFE;
else
et = AHCI_ERR_INNOCENT;
} else {
/* Tagged operation. */
et = AHCI_ERR_NCQ;
}
} else {
et = AHCI_ERR_TFE;
ch->fatalerr = 1;
}
} else if (istatus & AHCI_P_IX_IF) {
if (ch->numtslots == 0 && i != ccs && port != -2)
et = AHCI_ERR_INNOCENT;
else
et = AHCI_ERR_SATA;
} else
et = AHCI_ERR_INVALID;
ahci_end_transaction(&ch->slot[i], et);
}
/*
* We can't reinit the port while some other
* commands are active; use resume to complete them.
*/
if (ch->rslots != 0 && !ch->recoverycmd)
ATA_OUTL(ch->r_mem, AHCI_P_FBS, AHCI_P_FBS_EN | AHCI_P_FBS_DEC);
}
/* Process NOTIFY events */
if (sntf)
ahci_notify_events(ch, sntf);
}
/* Must be called with channel locked. */
static int
ahci_check_collision(struct ahci_channel *ch, union ccb *ccb)
{
int t = ccb->ccb_h.target_id;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
/* Tagged command while we have no supported tag free. */
if (((~ch->oslots) & (0xffffffff >> (32 -
ch->curr[t].tags))) == 0)
return (1);
/* If we have FBS */
if (ch->fbs_enabled) {
/* Tagged command while untagged are active. */
if (ch->numrslotspd[t] != 0 && ch->numtslotspd[t] == 0)
return (1);
} else {
/* Tagged command while untagged are active. */
if (ch->numrslots != 0 && ch->numtslots == 0)
return (1);
/* Tagged command while tagged to other target is active. */
if (ch->numtslots != 0 &&
ch->taggedtarget != ccb->ccb_h.target_id)
return (1);
}
} else {
/* If we have FBS */
if (ch->fbs_enabled) {
/* Untagged command while tagged are active. */
if (ch->numrslotspd[t] != 0 && ch->numtslotspd[t] != 0)
return (1);
} else {
/* Untagged command while tagged are active. */
if (ch->numrslots != 0 && ch->numtslots != 0)
return (1);
}
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT))) {
/* Atomic command while anything active. */
if (ch->numrslots != 0)
return (1);
}
/* We have some atomic command running. */
if (ch->aslots != 0)
return (1);
return (0);
}
/* Must be called with channel locked. */
static void
ahci_begin_transaction(struct ahci_channel *ch, union ccb *ccb)
{
struct ahci_slot *slot;
int tag, tags;
/* Choose empty slot. */
tags = ch->numslots;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA))
tags = ch->curr[ccb->ccb_h.target_id].tags;
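/* Round-robin: prefer a free slot above lastslot, otherwise fall back to the lowest free one. */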
if (ch->lastslot + 1 < tags)
tag = ffs(~(ch->oslots >> (ch->lastslot + 1)));
else
tag = 0;
if (tag == 0 || tag + ch->lastslot >= tags)
tag = ffs(~ch->oslots) - 1;
else
tag += ch->lastslot;
ch->lastslot = tag;
/* Occupy chosen slot. */
slot = &ch->slot[tag];
slot->ccb = ccb;
/* Stop PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3)
callout_stop(&ch->pm_timer);
/* Update channel stats. */
ch->oslots |= (1 << tag);
ch->numrslots++;
ch->numrslotspd[ccb->ccb_h.target_id]++;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots++;
ch->numtslotspd[ccb->ccb_h.target_id]++;
ch->taggedtarget = ccb->ccb_h.target_id;
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT)))
ch->aslots |= (1 << tag);
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
slot->state = AHCI_SLOT_LOADING;
bus_dmamap_load_ccb(ch->dma.data_tag, slot->dma.data_map, ccb,
ahci_dmasetprd, slot, 0);
} else {
slot->dma.nsegs = 0;
ahci_execute_transaction(slot);
}
}
/* Locked by busdma engine. */
static void
ahci_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct ahci_slot *slot = arg;
struct ahci_channel *ch = slot->ch;
struct ahci_cmd_tab *ctp;
struct ahci_dma_prd *prd;
int i;
if (error) {
device_printf(ch->dev, "DMA load error\n");
ahci_end_transaction(slot, AHCI_ERR_INVALID);
return;
}
KASSERT(nsegs <= AHCI_SG_ENTRIES, ("too many DMA segment entries\n"));
/* Get a piece of the workspace for this request */
- ctp = (struct ahci_cmd_tab *)
- (ch->dma.work + AHCI_CT_OFFSET + (AHCI_CT_SIZE * slot->slot));
+ ctp = (struct ahci_cmd_tab *)(ch->dma.work + slot->ct_offset);
/* Fill S/G table */
prd = &ctp->prd_tab[0];
for (i = 0; i < nsegs; i++) {
prd[i].dba = htole64(segs[i].ds_addr);
prd[i].dbc = htole32((segs[i].ds_len - 1) & AHCI_PRD_MASK);
}
slot->dma.nsegs = nsegs;
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
((slot->ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE));
ahci_execute_transaction(slot);
}
/* Must be called with channel locked. */
static void
ahci_execute_transaction(struct ahci_slot *slot)
{
struct ahci_channel *ch = slot->ch;
struct ahci_cmd_tab *ctp;
struct ahci_cmd_list *clp;
union ccb *ccb = slot->ccb;
int port = ccb->ccb_h.target_id & 0x0f;
int fis_size, i, softreset;
uint8_t *fis = ch->dma.rfis + 0x40;
uint8_t val;
uint16_t cmd_flags;
/* Get a piece of the workspace for this request */
- ctp = (struct ahci_cmd_tab *)
- (ch->dma.work + AHCI_CT_OFFSET + (AHCI_CT_SIZE * slot->slot));
+ ctp = (struct ahci_cmd_tab *)(ch->dma.work + slot->ct_offset);
/* Setup the FIS for this request */
if (!(fis_size = ahci_setup_fis(ch, ctp, ccb, slot->slot))) {
device_printf(ch->dev, "Setting up SATA FIS failed\n");
ahci_end_transaction(slot, AHCI_ERR_INVALID);
return;
}
/* Setup the command list entry */
clp = (struct ahci_cmd_list *)
(ch->dma.work + AHCI_CL_OFFSET + (AHCI_CL_SIZE * slot->slot));
cmd_flags =
(ccb->ccb_h.flags & CAM_DIR_OUT ? AHCI_CMD_WRITE : 0) |
(ccb->ccb_h.func_code == XPT_SCSI_IO ?
(AHCI_CMD_ATAPI | AHCI_CMD_PREFETCH) : 0) |
(fis_size / sizeof(u_int32_t)) |
(port << 12);
clp->prd_length = htole16(slot->dma.nsegs);
/* Special handling for Soft Reset command. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL)) {
if (ccb->ataio.cmd.control & ATA_A_RESET) {
softreset = 1;
/* Kick controller into sane state */
ahci_stop(ch);
ahci_clo(ch);
ahci_start(ch, 0);
cmd_flags |= AHCI_CMD_RESET | AHCI_CMD_CLR_BUSY;
} else {
softreset = 2;
/* Prepare FIS receive area for check. */
for (i = 0; i < 20; i++)
fis[i] = 0xff;
}
} else
softreset = 0;
clp->bytecount = 0;
clp->cmd_flags = htole16(cmd_flags);
- clp->cmd_table_phys = htole64(ch->dma.work_bus + AHCI_CT_OFFSET +
- (AHCI_CT_SIZE * slot->slot));
+ clp->cmd_table_phys = htole64(ch->dma.work_bus + slot->ct_offset);
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
bus_dmamap_sync(ch->dma.rfis_tag, ch->dma.rfis_map,
BUS_DMASYNC_PREREAD);
/* Set ACTIVE bit for NCQ commands. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ATA_OUTL(ch->r_mem, AHCI_P_SACT, 1 << slot->slot);
}
/* If FBS is enabled, set PMP port. */
if (ch->fbs_enabled) {
ATA_OUTL(ch->r_mem, AHCI_P_FBS, AHCI_P_FBS_EN |
(port << AHCI_P_FBS_DEV_SHIFT));
}
/* Issue command to the controller. */
slot->state = AHCI_SLOT_RUNNING;
ch->rslots |= (1 << slot->slot);
ATA_OUTL(ch->r_mem, AHCI_P_CI, (1 << slot->slot));
/* Device reset commands don't interrupt. Poll them. */
if (ccb->ccb_h.func_code == XPT_ATA_IO &&
(ccb->ataio.cmd.command == ATA_DEVICE_RESET || softreset)) {
int count, timeout = ccb->ccb_h.timeout * 100;
enum ahci_err_type et = AHCI_ERR_NONE;
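/* Poll in 10us steps; the CCB timeout is in milliseconds, hence the 100x multiplier. */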
for (count = 0; count < timeout; count++) {
DELAY(10);
if (!(ATA_INL(ch->r_mem, AHCI_P_CI) & (1 << slot->slot)))
break;
if ((ATA_INL(ch->r_mem, AHCI_P_TFD) & ATA_S_ERROR) &&
softreset != 1) {
#if 0
device_printf(ch->dev,
"Poll error on slot %d, TFD: %04x\n",
slot->slot, ATA_INL(ch->r_mem, AHCI_P_TFD));
#endif
et = AHCI_ERR_TFE;
break;
}
/* Workaround for ATI SB600/SB700 chipsets. */
if (ccb->ccb_h.target_id == 15 &&
(ch->quirks & AHCI_Q_ATI_PMP_BUG) &&
(ATA_INL(ch->r_mem, AHCI_P_IS) & AHCI_P_IX_IPM)) {
et = AHCI_ERR_TIMEOUT;
break;
}
}
/*
* Some Marvell controllers require additional time
* after soft reset to work properly. Set a 50ms
* delay after soft reset.
*/
if (ch->quirks & AHCI_Q_MRVL_SR_DEL)
DELAY(50000);
/*
* Marvell HBAs with non-RAID firmware do not wait for
* readiness after soft reset, so we have to wait here.
* Marvell RAIDs do not have this problem, but instead
* sometimes forget to update FIS receive area, breaking
* this wait.
*/
if ((ch->quirks & AHCI_Q_NOBSYRES) == 0 &&
(ch->quirks & AHCI_Q_ATI_PMP_BUG) == 0 &&
softreset == 2 && et == AHCI_ERR_NONE) {
for ( ; count < timeout; count++) {
bus_dmamap_sync(ch->dma.rfis_tag,
ch->dma.rfis_map, BUS_DMASYNC_POSTREAD);
val = fis[2];
bus_dmamap_sync(ch->dma.rfis_tag,
ch->dma.rfis_map, BUS_DMASYNC_PREREAD);
if ((val & ATA_S_BUSY) == 0)
break;
DELAY(10);
}
}
if (timeout && (count >= timeout)) {
device_printf(ch->dev, "Poll timeout on slot %d port %d\n",
slot->slot, port);
device_printf(ch->dev, "is %08x cs %08x ss %08x "
"rs %08x tfd %02x serr %08x cmd %08x\n",
ATA_INL(ch->r_mem, AHCI_P_IS),
ATA_INL(ch->r_mem, AHCI_P_CI),
ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
ATA_INL(ch->r_mem, AHCI_P_TFD),
ATA_INL(ch->r_mem, AHCI_P_SERR),
ATA_INL(ch->r_mem, AHCI_P_CMD));
et = AHCI_ERR_TIMEOUT;
}
/* Kick controller into sane state and enable FBS. */
if (softreset == 2)
ch->eslots |= (1 << slot->slot);
ahci_end_transaction(slot, et);
return;
}
/* Start command execution timeout */
callout_reset_sbt(&slot->timeout, SBT_1MS * ccb->ccb_h.timeout / 2,
0, ahci_timeout, slot, 0);
return;
}
/* Must be called with channel locked. */
static void
ahci_process_timeout(struct ahci_channel *ch)
{
int i;
mtx_assert(&ch->mtx, MA_OWNED);
/* Handle the rest of commands. */
for (i = 0; i < ch->numslots; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < AHCI_SLOT_RUNNING)
continue;
ahci_end_transaction(&ch->slot[i], AHCI_ERR_TIMEOUT);
}
}
/* Must be called with channel locked. */
static void
ahci_rearm_timeout(struct ahci_channel *ch)
{
int i;
mtx_assert(&ch->mtx, MA_OWNED);
for (i = 0; i < ch->numslots; i++) {
struct ahci_slot *slot = &ch->slot[i];
/* Do we have a running request on slot? */
if (slot->state < AHCI_SLOT_RUNNING)
continue;
if ((ch->toslots & (1 << i)) == 0)
continue;
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout / 2, 0,
ahci_timeout, slot, 0);
}
}
/* Locked by callout mechanism. */
static void
ahci_timeout(void *arg)
{
struct ahci_slot *slot = arg;
struct ahci_channel *ch = slot->ch;
device_t dev = ch->dev;
uint32_t sstatus;
int ccs;
int i;
/* Check for stale timeout. */
if (slot->state < AHCI_SLOT_RUNNING)
return;
/* Check if slot was not being executed last time we checked. */
if (slot->state < AHCI_SLOT_EXECUTING) {
/* Check if slot started executing. */
sstatus = ATA_INL(ch->r_mem, AHCI_P_SACT);
ccs = (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_CCS_MASK)
>> AHCI_P_CMD_CCS_SHIFT;
if ((sstatus & (1 << slot->slot)) != 0 || ccs == slot->slot ||
ch->fbs_enabled || ch->wrongccs)
slot->state = AHCI_SLOT_EXECUTING;
else if ((ch->rslots & (1 << ccs)) == 0) {
ch->wrongccs = 1;
slot->state = AHCI_SLOT_EXECUTING;
}
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout / 2, 0,
ahci_timeout, slot, 0);
return;
}
device_printf(dev, "Timeout on slot %d port %d\n",
slot->slot, slot->ccb->ccb_h.target_id & 0x0f);
device_printf(dev, "is %08x cs %08x ss %08x rs %08x tfd %02x "
"serr %08x cmd %08x\n",
ATA_INL(ch->r_mem, AHCI_P_IS), ATA_INL(ch->r_mem, AHCI_P_CI),
ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
ATA_INL(ch->r_mem, AHCI_P_TFD), ATA_INL(ch->r_mem, AHCI_P_SERR),
ATA_INL(ch->r_mem, AHCI_P_CMD));
/* Handle frozen command. */
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
ahci_done(ch, fccb);
}
if (!ch->fbs_enabled && !ch->wrongccs) {
/* Without FBS we know the real timeout source. */
ch->fatalerr = 1;
/* Handle command with timeout. */
ahci_end_transaction(&ch->slot[slot->slot], AHCI_ERR_TIMEOUT);
/* Handle the rest of commands. */
for (i = 0; i < ch->numslots; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < AHCI_SLOT_RUNNING)
continue;
ahci_end_transaction(&ch->slot[i], AHCI_ERR_INNOCENT);
}
} else {
/* With FBS we wait for the other commands to time out and pray. */
if (ch->toslots == 0)
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
if ((ch->rslots & ~ch->toslots) == 0)
ahci_process_timeout(ch);
else
device_printf(dev, " ... waiting for slots %08x\n",
ch->rslots & ~ch->toslots);
}
}
/* Must be called with channel locked. */
static void
ahci_end_transaction(struct ahci_slot *slot, enum ahci_err_type et)
{
struct ahci_channel *ch = slot->ch;
union ccb *ccb = slot->ccb;
struct ahci_cmd_list *clp;
int lastto;
uint32_t sig;
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
clp = (struct ahci_cmd_list *)
(ch->dma.work + AHCI_CL_OFFSET + (AHCI_CL_SIZE * slot->slot));
/* Read result registers into the result struct.
* May be incorrect if several commands finished at the same time,
* so read only when sure or when we have to.
*/
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
struct ata_res *res = &ccb->ataio.res;
if ((et == AHCI_ERR_TFE) ||
(ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)) {
u_int8_t *fis = ch->dma.rfis + 0x40;
bus_dmamap_sync(ch->dma.rfis_tag, ch->dma.rfis_map,
BUS_DMASYNC_POSTREAD);
if (ch->fbs_enabled) {
fis += ccb->ccb_h.target_id * 256;
res->status = fis[2];
res->error = fis[3];
} else {
uint16_t tfd = ATA_INL(ch->r_mem, AHCI_P_TFD);
res->status = tfd;
res->error = tfd >> 8;
}
res->lba_low = fis[4];
res->lba_mid = fis[5];
res->lba_high = fis[6];
res->device = fis[7];
res->lba_low_exp = fis[8];
res->lba_mid_exp = fis[9];
res->lba_high_exp = fis[10];
res->sector_count = fis[12];
res->sector_count_exp = fis[13];
/*
* Some weird controllers do not return signature in
* FIS receive area. Read it from PxSIG register.
*/
if ((ch->quirks & AHCI_Q_ALTSIG) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET) == 0) {
sig = ATA_INL(ch->r_mem, AHCI_P_SIG);
res->lba_high = sig >> 24;
res->lba_mid = sig >> 16;
res->lba_low = sig >> 8;
res->sector_count = sig;
}
} else
bzero(res, sizeof(*res));
if ((ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) == 0 &&
(ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
(ch->quirks & AHCI_Q_NOCOUNT) == 0) {
ccb->ataio.resid =
ccb->ataio.dxfer_len - le32toh(clp->bytecount);
}
} else {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
(ch->quirks & AHCI_Q_NOCOUNT) == 0) {
ccb->csio.resid =
ccb->csio.dxfer_len - le32toh(clp->bytecount);
}
}
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
(ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(ch->dma.data_tag, slot->dma.data_map);
}
if (et != AHCI_ERR_NONE)
ch->eslots |= (1 << slot->slot);
/* In case of error, freeze device for proper recovery. */
if ((et != AHCI_ERR_NONE) && (!ch->recoverycmd) &&
!(ccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status |= CAM_DEV_QFRZN;
}
/* Set proper result status. */
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
switch (et) {
case AHCI_ERR_NONE:
ccb->ccb_h.status |= CAM_REQ_CMP;
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.scsi_status = SCSI_STATUS_OK;
break;
case AHCI_ERR_INVALID:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_INVALID;
break;
case AHCI_ERR_INNOCENT:
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
break;
case AHCI_ERR_TFE:
case AHCI_ERR_NCQ:
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else {
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
}
break;
case AHCI_ERR_SATA:
ch->fatalerr = 1;
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_UNCOR_PARITY;
break;
case AHCI_ERR_TIMEOUT:
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
break;
default:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
}
/* Free slot. */
ch->oslots &= ~(1 << slot->slot);
ch->rslots &= ~(1 << slot->slot);
ch->aslots &= ~(1 << slot->slot);
slot->state = AHCI_SLOT_EMPTY;
slot->ccb = NULL;
/* Update channel stats. */
ch->numrslots--;
ch->numrslotspd[ccb->ccb_h.target_id]--;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots--;
ch->numtslotspd[ccb->ccb_h.target_id]--;
}
/* Cancel timeout state if request completed normally. */
if (et != AHCI_ERR_TIMEOUT) {
lastto = (ch->toslots == (1 << slot->slot));
ch->toslots &= ~(1 << slot->slot);
if (lastto)
xpt_release_simq(ch->sim, TRUE);
}
/* If it was the first request of a reset sequence and there was no error,
* proceed to the second request. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET) &&
et == AHCI_ERR_NONE) {
ccb->ataio.cmd.control &= ~ATA_A_RESET;
ahci_begin_transaction(ch, ccb);
return;
}
/* If it was our READ LOG command - process it. */
if (ccb->ccb_h.recovery_type == RECOVERY_READ_LOG) {
ahci_process_read_log(ch, ccb);
/* If it was our REQUEST SENSE command - process it. */
} else if (ccb->ccb_h.recovery_type == RECOVERY_REQUEST_SENSE) {
ahci_process_request_sense(ch, ccb);
/* If it was NCQ or ATAPI command error, put result on hold. */
} else if (et == AHCI_ERR_NCQ ||
((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)) {
ch->hold[slot->slot] = ccb;
ch->numhslots++;
} else
ahci_done(ch, ccb);
/* If we have no other active commands, ... */
if (ch->rslots == 0) {
/* if there was a fatal error, reset the port. */
if (ch->toslots != 0 || ch->fatalerr) {
ahci_reset(ch);
} else {
/* if we have slots in error, we can reinit port. */
if (ch->eslots != 0) {
ahci_stop(ch);
ahci_clo(ch);
ahci_start(ch, 1);
}
/* if there are commands on hold, we can do READ LOG. */
if (!ch->recoverycmd && ch->numhslots)
ahci_issue_recovery(ch);
}
/* If all the remaining commands are in timeout, give them a chance. */
} else if ((ch->rslots & ~ch->toslots) == 0 &&
et != AHCI_ERR_TIMEOUT)
ahci_rearm_timeout(ch);
/* Unfreeze frozen command. */
if (ch->frozen && !ahci_check_collision(ch, ch->frozen)) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
ahci_begin_transaction(ch, fccb);
xpt_release_simq(ch->sim, TRUE);
}
/* Start PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3 &&
(ch->curr[ch->pm_present ? 15 : 0].caps & CTS_SATA_CAPS_D_PMREQ)) {
callout_schedule(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8);
}
}
static void
ahci_issue_recovery(struct ahci_channel *ch)
{
union ccb *ccb;
struct ccb_ataio *ataio;
struct ccb_scsiio *csio;
int i;
/* Find some held command. */
for (i = 0; i < ch->numslots; i++) {
if (ch->hold[i])
break;
}
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
device_printf(ch->dev, "Unable to allocate recovery command\n");
completeall:
/* We can't do anything -- complete held commands. */
for (i = 0; i < ch->numslots; i++) {
if (ch->hold[i] == NULL)
continue;
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_RESRC_UNAVAIL;
ahci_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
ahci_reset(ch);
return;
}
ccb->ccb_h = ch->hold[i]->ccb_h; /* Reuse old header. */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
/* READ LOG */
ccb->ccb_h.recovery_type = RECOVERY_READ_LOG;
ccb->ccb_h.func_code = XPT_ATA_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
ataio = &ccb->ataio;
ataio->data_ptr = malloc(512, M_AHCI, M_NOWAIT);
if (ataio->data_ptr == NULL) {
xpt_free_ccb(ccb);
device_printf(ch->dev,
"Unable to allocate memory for READ LOG command\n");
goto completeall;
}
ataio->dxfer_len = 512;
bzero(&ataio->cmd, sizeof(ataio->cmd));
ataio->cmd.flags = CAM_ATAIO_48BIT;
ataio->cmd.command = 0x2F; /* READ LOG EXT */
ataio->cmd.sector_count = 1;
ataio->cmd.sector_count_exp = 0;
ataio->cmd.lba_low = 0x10;
ataio->cmd.lba_mid = 0;
ataio->cmd.lba_mid_exp = 0;
} else {
/* REQUEST SENSE */
ccb->ccb_h.recovery_type = RECOVERY_REQUEST_SENSE;
ccb->ccb_h.recovery_slot = i;
ccb->ccb_h.func_code = XPT_SCSI_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.status = 0;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
csio = &ccb->csio;
csio->data_ptr = (void *)&ch->hold[i]->csio.sense_data;
csio->dxfer_len = ch->hold[i]->csio.sense_len;
csio->cdb_len = 6;
bzero(&csio->cdb_io, sizeof(csio->cdb_io));
csio->cdb_io.cdb_bytes[0] = 0x03;
csio->cdb_io.cdb_bytes[4] = csio->dxfer_len;
}
/* Freeze SIM while doing recovery. */
ch->recoverycmd = 1;
xpt_freeze_simq(ch->sim, 1);
ahci_begin_transaction(ch, ccb);
}
static void
ahci_process_read_log(struct ahci_channel *ch, union ccb *ccb)
{
uint8_t *data;
struct ata_res *res;
int i;
ch->recoverycmd = 0;
data = ccb->ataio.data_ptr;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP &&
(data[0] & 0x80) == 0) {
for (i = 0; i < ch->numslots; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.func_code != XPT_ATA_IO)
continue;
if ((data[0] & 0x1F) == i) {
res = &ch->hold[i]->ataio.res;
res->status = data[2];
res->error = data[3];
res->lba_low = data[4];
res->lba_mid = data[5];
res->lba_high = data[6];
res->device = data[7];
res->lba_low_exp = data[8];
res->lba_mid_exp = data[9];
res->lba_high_exp = data[10];
res->sector_count = data[12];
res->sector_count_exp = data[13];
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_REQUEUE_REQ;
}
ahci_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
} else {
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
device_printf(ch->dev, "Error while READ LOG EXT\n");
else if ((data[0] & 0x80) == 0) {
device_printf(ch->dev, "Non-queued command error in READ LOG EXT\n");
}
for (i = 0; i < ch->numslots; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.func_code != XPT_ATA_IO)
continue;
ahci_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
}
free(ccb->ataio.data_ptr, M_AHCI);
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
static void
ahci_process_request_sense(struct ahci_channel *ch, union ccb *ccb)
{
int i;
ch->recoverycmd = 0;
i = ccb->ccb_h.recovery_slot;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
ch->hold[i]->ccb_h.status |= CAM_AUTOSNS_VALID;
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
ahci_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
static void
ahci_start(struct ahci_channel *ch, int fbs)
{
u_int32_t cmd;
/* Run the channel start callback, if any. */
if (ch->start)
ch->start(ch);
/* Clear SATA error register */
ATA_OUTL(ch->r_mem, AHCI_P_SERR, 0xFFFFFFFF);
/* Clear any interrupts pending on this channel */
ATA_OUTL(ch->r_mem, AHCI_P_IS, 0xFFFFFFFF);
/* Configure FIS-based switching if supported. */
if (ch->chcaps & AHCI_P_CMD_FBSCP) {
ch->fbs_enabled = (fbs && ch->pm_present) ? 1 : 0;
ATA_OUTL(ch->r_mem, AHCI_P_FBS,
ch->fbs_enabled ? AHCI_P_FBS_EN : 0);
}
/* Start operations on this channel */
cmd = ATA_INL(ch->r_mem, AHCI_P_CMD);
cmd &= ~AHCI_P_CMD_PMA;
ATA_OUTL(ch->r_mem, AHCI_P_CMD, cmd | AHCI_P_CMD_ST |
(ch->pm_present ? AHCI_P_CMD_PMA : 0));
}
static void
ahci_stop(struct ahci_channel *ch)
{
u_int32_t cmd;
int timeout;
/* Kill all activity on this channel */
cmd = ATA_INL(ch->r_mem, AHCI_P_CMD);
ATA_OUTL(ch->r_mem, AHCI_P_CMD, cmd & ~AHCI_P_CMD_ST);
/* Wait for activity stop. */
timeout = 0;
do {
DELAY(10);
if (timeout++ > 50000) {
device_printf(ch->dev, "stopping AHCI engine failed\n");
break;
}
} while (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_CR);
ch->eslots = 0;
}
static void
ahci_clo(struct ahci_channel *ch)
{
u_int32_t cmd;
int timeout;
/* Issue Command List Override if supported */
if (ch->caps & AHCI_CAP_SCLO) {
cmd = ATA_INL(ch->r_mem, AHCI_P_CMD);
cmd |= AHCI_P_CMD_CLO;
ATA_OUTL(ch->r_mem, AHCI_P_CMD, cmd);
timeout = 0;
do {
DELAY(10);
if (timeout++ > 50000) {
device_printf(ch->dev, "executing CLO failed\n");
break;
}
} while (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_CLO);
}
}
static void
ahci_stop_fr(struct ahci_channel *ch)
{
u_int32_t cmd;
int timeout;
/* Kill all FIS reception on this channel */
cmd = ATA_INL(ch->r_mem, AHCI_P_CMD);
ATA_OUTL(ch->r_mem, AHCI_P_CMD, cmd & ~AHCI_P_CMD_FRE);
/* Wait for FIS reception stop. */
timeout = 0;
do {
DELAY(10);
if (timeout++ > 50000) {
device_printf(ch->dev, "stopping AHCI FR engine failed\n");
break;
}
} while (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_FR);
}
static void
ahci_start_fr(struct ahci_channel *ch)
{
u_int32_t cmd;
/* Start FIS reception on this channel */
cmd = ATA_INL(ch->r_mem, AHCI_P_CMD);
ATA_OUTL(ch->r_mem, AHCI_P_CMD, cmd | AHCI_P_CMD_FRE);
}
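/*
* Wait up to t milliseconds for the device to clear BSY and DRQ.
* t0 is the time already spent before this call, used only for reporting.
*/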
static int
ahci_wait_ready(struct ahci_channel *ch, int t, int t0)
{
int timeout = 0;
uint32_t val;
while ((val = ATA_INL(ch->r_mem, AHCI_P_TFD)) &
(ATA_S_BUSY | ATA_S_DRQ)) {
if (timeout > t) {
if (t != 0) {
device_printf(ch->dev,
"AHCI reset: device not ready after %dms "
"(tfd = %08x)\n",
MAX(t, 0) + t0, val);
}
return (EBUSY);
}
DELAY(1000);
timeout++;
}
if (bootverbose)
device_printf(ch->dev, "AHCI reset: device ready after %dms\n",
timeout + t0);
return (0);
}
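/*
* Reset timeout handler: runs every 100ms while ch->resetting counts
* down from 310 (~31s), waiting for the device to become ready; on
* expiration it falls back to Command List Override.
*/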
static void
ahci_reset_to(void *arg)
{
struct ahci_channel *ch = arg;
if (ch->resetting == 0)
return;
ch->resetting--;
if (ahci_wait_ready(ch, ch->resetting == 0 ? -1 : 0,
(310 - ch->resetting) * 100) == 0) {
ch->resetting = 0;
ahci_start(ch, 1);
xpt_release_simq(ch->sim, TRUE);
return;
}
if (ch->resetting == 0) {
ahci_clo(ch);
ahci_start(ch, 1);
xpt_release_simq(ch->sim, TRUE);
return;
}
callout_schedule(&ch->reset_timer, hz / 10);
}
static void
ahci_reset(struct ahci_channel *ch)
{
struct ahci_controller *ctlr = device_get_softc(device_get_parent(ch->dev));
int i;
xpt_freeze_simq(ch->sim, 1);
if (bootverbose)
device_printf(ch->dev, "AHCI reset...\n");
/* Forget about previous reset. */
if (ch->resetting) {
ch->resetting = 0;
callout_stop(&ch->reset_timer);
xpt_release_simq(ch->sim, TRUE);
}
/* Requeue frozen command. */
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
ahci_done(ch, fccb);
}
/* Kill the engine and requeue all running commands. */
ahci_stop(ch);
for (i = 0; i < ch->numslots; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < AHCI_SLOT_RUNNING)
continue;
/* XXX: Commands in loading state. */
ahci_end_transaction(&ch->slot[i], AHCI_ERR_INNOCENT);
}
for (i = 0; i < ch->numslots; i++) {
if (!ch->hold[i])
continue;
ahci_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
if (ch->toslots != 0)
xpt_release_simq(ch->sim, TRUE);
ch->eslots = 0;
ch->toslots = 0;
ch->wrongccs = 0;
ch->fatalerr = 0;
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
/* Disable port interrupts */
ATA_OUTL(ch->r_mem, AHCI_P_IE, 0);
/* Reset and reconnect PHY. */
if (!ahci_sata_phy_reset(ch)) {
if (bootverbose)
device_printf(ch->dev,
"AHCI reset: device not found\n");
ch->devices = 0;
/* Enable wanted port interrupts */
ATA_OUTL(ch->r_mem, AHCI_P_IE,
(((ch->pm_level != 0) ? AHCI_P_IX_CPD | AHCI_P_IX_MP : 0) |
AHCI_P_IX_PRC | AHCI_P_IX_PC));
xpt_release_simq(ch->sim, TRUE);
return;
}
if (bootverbose)
device_printf(ch->dev, "AHCI reset: device found\n");
/* Wait for busy status to clear. */
if (ahci_wait_ready(ch, dumping ? 31000 : 0, 0)) {
if (dumping)
ahci_clo(ch);
else
ch->resetting = 310;
}
ch->devices = 1;
/* Enable wanted port interrupts */
ATA_OUTL(ch->r_mem, AHCI_P_IE,
(((ch->pm_level != 0) ? AHCI_P_IX_CPD | AHCI_P_IX_MP : 0) |
AHCI_P_IX_TFE | AHCI_P_IX_HBF |
AHCI_P_IX_HBD | AHCI_P_IX_IF | AHCI_P_IX_OF |
((ch->pm_level == 0) ? AHCI_P_IX_PRC : 0) | AHCI_P_IX_PC |
AHCI_P_IX_DP | AHCI_P_IX_UF | (ctlr->ccc ? 0 : AHCI_P_IX_SDB) |
AHCI_P_IX_DS | AHCI_P_IX_PS | (ctlr->ccc ? 0 : AHCI_P_IX_DHR)));
if (ch->resetting)
callout_reset(&ch->reset_timer, hz / 10, ahci_reset_to, ch);
else {
ahci_start(ch, 1);
xpt_release_simq(ch->sim, TRUE);
}
}
static int
ahci_setup_fis(struct ahci_channel *ch, struct ahci_cmd_tab *ctp, union ccb *ccb, int tag)
{
u_int8_t *fis = &ctp->cfis[0];
bzero(fis, 20);
fis[0] = 0x27; /* host to device */
fis[1] = (ccb->ccb_h.target_id & 0x0f);
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
fis[1] |= 0x80;
fis[2] = ATA_PACKET_CMD;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
fis[3] = ATA_F_DMA;
else {
fis[5] = ccb->csio.dxfer_len;
fis[6] = ccb->csio.dxfer_len >> 8;
}
fis[7] = ATA_D_LBA;
fis[15] = ATA_A_4BIT;
bcopy((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes,
ctp->acmd, ccb->csio.cdb_len);
bzero(ctp->acmd + ccb->csio.cdb_len, 32 - ccb->csio.cdb_len);
} else if ((ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) == 0) {
fis[1] |= 0x80;
fis[2] = ccb->ataio.cmd.command;
fis[3] = ccb->ataio.cmd.features;
fis[4] = ccb->ataio.cmd.lba_low;
fis[5] = ccb->ataio.cmd.lba_mid;
fis[6] = ccb->ataio.cmd.lba_high;
fis[7] = ccb->ataio.cmd.device;
fis[8] = ccb->ataio.cmd.lba_low_exp;
fis[9] = ccb->ataio.cmd.lba_mid_exp;
fis[10] = ccb->ataio.cmd.lba_high_exp;
fis[11] = ccb->ataio.cmd.features_exp;
fis[12] = ccb->ataio.cmd.sector_count;
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
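/* NCQ commands carry the tag in bits 7:3 of the sector count field. */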
fis[12] &= 0x07;
fis[12] |= tag << 3;
}
fis[13] = ccb->ataio.cmd.sector_count_exp;
if (ccb->ataio.ata_flags & ATA_FLAG_ICC)
fis[14] = ccb->ataio.icc;
fis[15] = ATA_A_4BIT;
if (ccb->ataio.ata_flags & ATA_FLAG_AUX) {
fis[16] = ccb->ataio.aux & 0xff;
fis[17] = (ccb->ataio.aux >> 8) & 0xff;
fis[18] = (ccb->ataio.aux >> 16) & 0xff;
fis[19] = (ccb->ataio.aux >> 24) & 0xff;
}
} else {
fis[15] = ccb->ataio.cmd.control;
}
return (20);
}
static int
ahci_sata_connect(struct ahci_channel *ch)
{
u_int32_t status;
int timeout, found = 0;
/* Wait up to 100ms for the connection to come up cleanly. */
for (timeout = 0; timeout < 1000 ; timeout++) {
status = ATA_INL(ch->r_mem, AHCI_P_SSTS);
if ((status & ATA_SS_DET_MASK) != ATA_SS_DET_NO_DEVICE)
found = 1;
if (((status & ATA_SS_DET_MASK) == ATA_SS_DET_PHY_ONLINE) &&
((status & ATA_SS_SPD_MASK) != ATA_SS_SPD_NO_SPEED) &&
((status & ATA_SS_IPM_MASK) == ATA_SS_IPM_ACTIVE))
break;
if ((status & ATA_SS_DET_MASK) == ATA_SS_DET_PHY_OFFLINE) {
if (bootverbose) {
device_printf(ch->dev, "SATA offline status=%08x\n",
status);
}
return (0);
}
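/* Give up after 10ms if no device presence has been detected at all. */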
if (found == 0 && timeout >= 100)
break;
DELAY(100);
}
if (timeout >= 1000 || !found) {
if (bootverbose) {
device_printf(ch->dev,
"SATA connect timeout time=%dus status=%08x\n",
timeout * 100, status);
}
return (0);
}
if (bootverbose) {
device_printf(ch->dev, "SATA connect time=%dus status=%08x\n",
timeout * 100, status);
}
/* Clear SATA error register */
ATA_OUTL(ch->r_mem, AHCI_P_SERR, 0xffffffff);
return (1);
}
static int
ahci_sata_phy_reset(struct ahci_channel *ch)
{
int sata_rev;
uint32_t val, detval;
if (ch->listening) {
val = ATA_INL(ch->r_mem, AHCI_P_CMD);
val |= AHCI_P_CMD_SUD;
ATA_OUTL(ch->r_mem, AHCI_P_CMD, val);
ch->listening = 0;
}
sata_rev = ch->user[ch->pm_present ? 15 : 0].revision;
if (sata_rev == 1)
val = ATA_SC_SPD_SPEED_GEN1;
else if (sata_rev == 2)
val = ATA_SC_SPD_SPEED_GEN2;
else if (sata_rev == 3)
val = ATA_SC_SPD_SPEED_GEN3;
else
val = 0;
detval = ahci_ch_detval(ch, ATA_SC_DET_RESET);
ATA_OUTL(ch->r_mem, AHCI_P_SCTL,
detval | val |
ATA_SC_IPM_DIS_PARTIAL | ATA_SC_IPM_DIS_SLUMBER);
DELAY(1000);
detval = ahci_ch_detval(ch, ATA_SC_DET_IDLE);
ATA_OUTL(ch->r_mem, AHCI_P_SCTL,
detval | val | ((ch->pm_level > 0) ? 0 :
(ATA_SC_IPM_DIS_PARTIAL | ATA_SC_IPM_DIS_SLUMBER)));
if (!ahci_sata_connect(ch)) {
if (ch->caps & AHCI_CAP_SSS) {
val = ATA_INL(ch->r_mem, AHCI_P_CMD);
val &= ~AHCI_P_CMD_SUD;
ATA_OUTL(ch->r_mem, AHCI_P_CMD, val);
ch->listening = 1;
} else if (ch->pm_level > 0)
ATA_OUTL(ch->r_mem, AHCI_P_SCTL, ATA_SC_DET_DISABLE);
return (0);
}
return (1);
}
static int
ahci_check_ids(struct ahci_channel *ch, union ccb *ccb)
{
if (ccb->ccb_h.target_id > ((ch->caps & AHCI_CAP_SPM) ? 15 : 0)) {
ccb->ccb_h.status = CAM_TID_INVALID;
ahci_done(ch, ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
ahci_done(ch, ccb);
return (-1);
}
return (0);
}
static void
ahciaction(struct cam_sim *sim, union ccb *ccb)
{
struct ahci_channel *ch;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("ahciaction func_code=%x\n",
ccb->ccb_h.func_code));
ch = (struct ahci_channel *)cam_sim_softc(sim);
switch (ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_ATA_IO: /* Execute the requested I/O operation */
case XPT_SCSI_IO:
if (ahci_check_ids(ch, ccb))
return;
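/* Without a port multiplier only target 0 and the PMP control port (15) are addressable. */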
if (ch->devices == 0 ||
(ch->pm_present == 0 &&
ccb->ccb_h.target_id > 0 && ccb->ccb_h.target_id < 15)) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
ccb->ccb_h.recovery_type = RECOVERY_NONE;
/* Check for command collision. */
if (ahci_check_collision(ch, ccb)) {
/* Freeze command. */
ch->frozen = ccb;
/* We have only one frozen slot, so freeze simq also. */
xpt_freeze_simq(ch->sim, 1);
return;
}
ahci_begin_transaction(ch, ccb);
return;
case XPT_ABORT: /* Abort the specified CCB */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_SET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ahci_device *d;
if (ahci_check_ids(ch, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
d->revision = cts->xport_specific.sata.revision;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE)
d->mode = cts->xport_specific.sata.mode;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT)
d->bytecount = min(8192, cts->xport_specific.sata.bytecount);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_TAGS)
d->tags = min(ch->numslots, cts->xport_specific.sata.tags);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_PM)
ch->pm_present = cts->xport_specific.sata.pm_present;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_ATAPI)
d->atapi = cts->xport_specific.sata.atapi;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
d->caps = cts->xport_specific.sata.caps;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
/* Get default/user set transfer settings for the target */
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ahci_device *d;
uint32_t status;
if (ahci_check_ids(ch, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
cts->protocol = PROTO_UNSPECIFIED;
cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
cts->transport = XPORT_SATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->proto_specific.valid = 0;
cts->xport_specific.sata.valid = 0;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS &&
(ccb->ccb_h.target_id == 15 ||
(ccb->ccb_h.target_id == 0 && !ch->pm_present))) {
status = ATA_INL(ch->r_mem, AHCI_P_SSTS) & ATA_SS_SPD_MASK;
if (status & 0x0f0) {
cts->xport_specific.sata.revision =
(status & 0x0f0) >> 4;
cts->xport_specific.sata.valid |=
CTS_SATA_VALID_REVISION;
}
cts->xport_specific.sata.caps = d->caps & CTS_SATA_CAPS_D;
if (ch->pm_level) {
if (ch->caps & (AHCI_CAP_PSC | AHCI_CAP_SSC))
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_PMREQ;
if (ch->caps2 & AHCI_CAP2_APST)
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_APST;
}
if ((ch->caps & AHCI_CAP_SNCQ) &&
(ch->quirks & AHCI_Q_NOAA) == 0)
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_DMAAA;
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
} else {
cts->xport_specific.sata.revision = d->revision;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
cts->xport_specific.sata.caps = d->caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
}
cts->xport_specific.sata.mode = d->mode;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
cts->xport_specific.sata.bytecount = d->bytecount;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
cts->xport_specific.sata.pm_present = ch->pm_present;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_PM;
cts->xport_specific.sata.tags = d->tags;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_TAGS;
cts->xport_specific.sata.atapi = d->atapi;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified SCSI device */
ahci_reset(ch);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_TERM_IO: /* Terminate the I/O process */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE;
if (ch->caps & AHCI_CAP_SNCQ)
cpi->hba_inquiry |= PI_TAG_ABLE;
if (ch->caps & AHCI_CAP_SPM)
cpi->hba_inquiry |= PI_SATAPM;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED;
if ((ch->quirks & AHCI_Q_NOAUX) == 0)
cpi->hba_misc |= PIM_ATA_EXT;
cpi->hba_eng_cnt = 0;
if (ch->caps & AHCI_CAP_SPM)
cpi->max_target = 15;
else
cpi->max_target = 0;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "AHCI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->transport = XPORT_SATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = ctob(AHCI_SG_ENTRIES - 1);
/* ATI SB600 can't handle 256 sectors with FPDMA (NCQ). */
if (ch->quirks & AHCI_Q_MAXIO_64K)
cpi->maxio = min(cpi->maxio, 128 * 512);
cpi->hba_vendor = ch->vendorid;
cpi->hba_device = ch->deviceid;
cpi->hba_subvendor = ch->subvendorid;
cpi->hba_subdevice = ch->subdeviceid;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
ahci_done(ch, ccb);
}
static void
ahcipoll(struct cam_sim *sim)
{
struct ahci_channel *ch = (struct ahci_channel *)cam_sim_softc(sim);
uint32_t istatus;
/* Read interrupt statuses and process if any. */
istatus = ATA_INL(ch->r_mem, AHCI_P_IS);
if (istatus != 0)
ahci_ch_intr_main(ch, istatus);
if (ch->resetting != 0 &&
(--ch->resetpolldiv <= 0 || !callout_pending(&ch->reset_timer))) {
ch->resetpolldiv = 1000;
ahci_reset_to(ch);
}
}
devclass_t ahci_devclass;
MODULE_VERSION(ahci, 1);
MODULE_DEPEND(ahci, cam, 1, 1, 1);
diff --git a/sys/dev/ahci/ahci.h b/sys/dev/ahci/ahci.h
index 9d3e072cfea6..472f9845117d 100644
--- a/sys/dev/ahci/ahci.h
+++ b/sys/dev/ahci/ahci.h
@@ -1,683 +1,679 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
/* ATA register defines */
#define ATA_DATA 0 /* (RW) data */
#define ATA_FEATURE 1 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_COUNT 2 /* (W) sector count */
#define ATA_SECTOR 3 /* (RW) sector # */
#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 7 /* (W) command */
#define ATA_ERROR 8 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_IREASON 9 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_STATUS 10 /* (R) status */
#define ATA_ALTSTAT 11 /* (R) alternate status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 12 /* (W) control */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#define ATA_A_4BIT 0x08 /* 4 head bits */
#define ATA_A_HOB 0x80 /* High Order Byte enable */
/* SATA register defines */
#define ATA_SSTATUS 13
#define ATA_SS_DET_MASK 0x0000000f
#define ATA_SS_DET_NO_DEVICE 0x00000000
#define ATA_SS_DET_DEV_PRESENT 0x00000001
#define ATA_SS_DET_PHY_ONLINE 0x00000003
#define ATA_SS_DET_PHY_OFFLINE 0x00000004
#define ATA_SS_SPD_MASK 0x000000f0
#define ATA_SS_SPD_NO_SPEED 0x00000000
#define ATA_SS_SPD_GEN1 0x00000010
#define ATA_SS_SPD_GEN2 0x00000020
#define ATA_SS_SPD_GEN3 0x00000030
#define ATA_SS_IPM_MASK 0x00000f00
#define ATA_SS_IPM_NO_DEVICE 0x00000000
#define ATA_SS_IPM_ACTIVE 0x00000100
#define ATA_SS_IPM_PARTIAL 0x00000200
#define ATA_SS_IPM_SLUMBER 0x00000600
#define ATA_SS_IPM_DEVSLEEP 0x00000800
#define ATA_SERROR 14
#define ATA_SE_DATA_CORRECTED 0x00000001
#define ATA_SE_COMM_CORRECTED 0x00000002
#define ATA_SE_DATA_ERR 0x00000100
#define ATA_SE_COMM_ERR 0x00000200
#define ATA_SE_PROT_ERR 0x00000400
#define ATA_SE_HOST_ERR 0x00000800
#define ATA_SE_PHY_CHANGED 0x00010000
#define ATA_SE_PHY_IERROR 0x00020000
#define ATA_SE_COMM_WAKE 0x00040000
#define ATA_SE_DECODE_ERR 0x00080000
#define ATA_SE_PARITY_ERR 0x00100000
#define ATA_SE_CRC_ERR 0x00200000
#define ATA_SE_HANDSHAKE_ERR 0x00400000
#define ATA_SE_LINKSEQ_ERR 0x00800000
#define ATA_SE_TRANSPORT_ERR 0x01000000
#define ATA_SE_UNKNOWN_FIS 0x02000000
#define ATA_SE_EXCHANGED 0x04000000
#define ATA_SCONTROL 15
#define ATA_SC_DET_MASK 0x0000000f
#define ATA_SC_DET_IDLE 0x00000000
#define ATA_SC_DET_RESET 0x00000001
#define ATA_SC_DET_DISABLE 0x00000004
#define ATA_SC_SPD_MASK 0x000000f0
#define ATA_SC_SPD_NO_SPEED 0x00000000
#define ATA_SC_SPD_SPEED_GEN1 0x00000010
#define ATA_SC_SPD_SPEED_GEN2 0x00000020
#define ATA_SC_SPD_SPEED_GEN3 0x00000030
#define ATA_SC_IPM_MASK 0x00000f00
#define ATA_SC_IPM_NONE 0x00000000
#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400
#define ATA_SACTIVE 16
#define AHCI_MAX_PORTS 32
#define AHCI_MAX_SLOTS 32
#define AHCI_MAX_IRQS 16
/* SATA AHCI v1.0 register defines */
#define AHCI_CAP 0x00
#define AHCI_CAP_NPMASK 0x0000001f
#define AHCI_CAP_SXS 0x00000020
#define AHCI_CAP_EMS 0x00000040
#define AHCI_CAP_CCCS 0x00000080
#define AHCI_CAP_NCS 0x00001F00
#define AHCI_CAP_NCS_SHIFT 8
#define AHCI_CAP_PSC 0x00002000
#define AHCI_CAP_SSC 0x00004000
#define AHCI_CAP_PMD 0x00008000
#define AHCI_CAP_FBSS 0x00010000
#define AHCI_CAP_SPM 0x00020000
#define AHCI_CAP_SAM 0x00080000
#define AHCI_CAP_ISS 0x00F00000
#define AHCI_CAP_ISS_SHIFT 20
#define AHCI_CAP_SCLO 0x01000000
#define AHCI_CAP_SAL 0x02000000
#define AHCI_CAP_SALP 0x04000000
#define AHCI_CAP_SSS 0x08000000
#define AHCI_CAP_SMPS 0x10000000
#define AHCI_CAP_SSNTF 0x20000000
#define AHCI_CAP_SNCQ 0x40000000
#define AHCI_CAP_64BIT 0x80000000
#define AHCI_GHC 0x04
#define AHCI_GHC_AE 0x80000000
#define AHCI_GHC_MRSM 0x00000004
#define AHCI_GHC_IE 0x00000002
#define AHCI_GHC_HR 0x00000001
#define AHCI_IS 0x08
#define AHCI_PI 0x0c
#define AHCI_VS 0x10
#define AHCI_CCCC 0x14
#define AHCI_CCCC_TV_MASK 0xffff0000
#define AHCI_CCCC_TV_SHIFT 16
#define AHCI_CCCC_CC_MASK 0x0000ff00
#define AHCI_CCCC_CC_SHIFT 8
#define AHCI_CCCC_INT_MASK 0x000000f8
#define AHCI_CCCC_INT_SHIFT 3
#define AHCI_CCCC_EN 0x00000001
#define AHCI_CCCP 0x18
#define AHCI_EM_LOC 0x1C
#define AHCI_EM_CTL 0x20
#define AHCI_EM_MR 0x00000001
#define AHCI_EM_TM 0x00000100
#define AHCI_EM_RST 0x00000200
#define AHCI_EM_LED 0x00010000
#define AHCI_EM_SAFTE 0x00020000
#define AHCI_EM_SES2 0x00040000
#define AHCI_EM_SGPIO 0x00080000
#define AHCI_EM_SMB 0x01000000
#define AHCI_EM_XMT 0x02000000
#define AHCI_EM_ALHD 0x04000000
#define AHCI_EM_PM 0x08000000
#define AHCI_CAP2 0x24
#define AHCI_CAP2_BOH 0x00000001
#define AHCI_CAP2_NVMP 0x00000002
#define AHCI_CAP2_APST 0x00000004
#define AHCI_CAP2_SDS 0x00000008
#define AHCI_CAP2_SADM 0x00000010
#define AHCI_CAP2_DESO 0x00000020
#define AHCI_BOHC 0x28
#define AHCI_BOHC_BOS 0x00000001
#define AHCI_BOHC_OOS 0x00000002
#define AHCI_BOHC_SOOE 0x00000004
#define AHCI_BOHC_OOC 0x00000008
#define AHCI_BOHC_BB 0x00000010
#define AHCI_VSCAP 0xa4
#define AHCI_OFFSET 0x100
#define AHCI_STEP 0x80
#define AHCI_P_CLB 0x00
#define AHCI_P_CLBU 0x04
#define AHCI_P_FB 0x08
#define AHCI_P_FBU 0x0c
#define AHCI_P_IS 0x10
#define AHCI_P_IE 0x14
#define AHCI_P_IX_DHR 0x00000001
#define AHCI_P_IX_PS 0x00000002
#define AHCI_P_IX_DS 0x00000004
#define AHCI_P_IX_SDB 0x00000008
#define AHCI_P_IX_UF 0x00000010
#define AHCI_P_IX_DP 0x00000020
#define AHCI_P_IX_PC 0x00000040
#define AHCI_P_IX_MP 0x00000080
#define AHCI_P_IX_PRC 0x00400000
#define AHCI_P_IX_IPM 0x00800000
#define AHCI_P_IX_OF 0x01000000
#define AHCI_P_IX_INF 0x04000000
#define AHCI_P_IX_IF 0x08000000
#define AHCI_P_IX_HBD 0x10000000
#define AHCI_P_IX_HBF 0x20000000
#define AHCI_P_IX_TFE 0x40000000
#define AHCI_P_IX_CPD 0x80000000
#define AHCI_P_CMD 0x18
#define AHCI_P_CMD_ST 0x00000001
#define AHCI_P_CMD_SUD 0x00000002
#define AHCI_P_CMD_POD 0x00000004
#define AHCI_P_CMD_CLO 0x00000008
#define AHCI_P_CMD_FRE 0x00000010
#define AHCI_P_CMD_CCS_MASK 0x00001f00
#define AHCI_P_CMD_CCS_SHIFT 8
#define AHCI_P_CMD_ISS 0x00002000
#define AHCI_P_CMD_FR 0x00004000
#define AHCI_P_CMD_CR 0x00008000
#define AHCI_P_CMD_CPS 0x00010000
#define AHCI_P_CMD_PMA 0x00020000
#define AHCI_P_CMD_HPCP 0x00040000
#define AHCI_P_CMD_MPSP 0x00080000
#define AHCI_P_CMD_CPD 0x00100000
#define AHCI_P_CMD_ESP 0x00200000
#define AHCI_P_CMD_FBSCP 0x00400000
#define AHCI_P_CMD_APSTE 0x00800000
#define AHCI_P_CMD_ATAPI 0x01000000
#define AHCI_P_CMD_DLAE 0x02000000
#define AHCI_P_CMD_ALPE 0x04000000
#define AHCI_P_CMD_ASP 0x08000000
#define AHCI_P_CMD_ICC_MASK 0xf0000000
#define AHCI_P_CMD_NOOP 0x00000000
#define AHCI_P_CMD_ACTIVE 0x10000000
#define AHCI_P_CMD_PARTIAL 0x20000000
#define AHCI_P_CMD_SLUMBER 0x60000000
#define AHCI_P_CMD_DEVSLEEP 0x80000000
#define AHCI_P_TFD 0x20
#define AHCI_P_SIG 0x24
#define AHCI_P_SSTS 0x28
#define AHCI_P_SCTL 0x2c
#define AHCI_P_SERR 0x30
#define AHCI_P_SACT 0x34
#define AHCI_P_CI 0x38
#define AHCI_P_SNTF 0x3C
#define AHCI_P_FBS 0x40
#define AHCI_P_FBS_EN 0x00000001
#define AHCI_P_FBS_DEC 0x00000002
#define AHCI_P_FBS_SDE 0x00000004
#define AHCI_P_FBS_DEV 0x00000f00
#define AHCI_P_FBS_DEV_SHIFT 8
#define AHCI_P_FBS_ADO 0x0000f000
#define AHCI_P_FBS_ADO_SHIFT 12
#define AHCI_P_FBS_DWE 0x000f0000
#define AHCI_P_FBS_DWE_SHIFT 16
#define AHCI_P_DEVSLP 0x44
#define AHCI_P_DEVSLP_ADSE 0x00000001
#define AHCI_P_DEVSLP_DSP 0x00000002
#define AHCI_P_DEVSLP_DETO 0x000003fc
#define AHCI_P_DEVSLP_DETO_SHIFT 2
#define AHCI_P_DEVSLP_MDAT 0x00007c00
#define AHCI_P_DEVSLP_MDAT_SHIFT 10
#define AHCI_P_DEVSLP_DITO 0x01ff8000
#define AHCI_P_DEVSLP_DITO_SHIFT 15
#define AHCI_P_DEVSLP_DM 0x0e000000
#define AHCI_P_DEVSLP_DM_SHIFT 25
-/* Just to be sure, if building as module. */
-#if MAXPHYS < 512 * 1024
-#undef MAXPHYS
-#define MAXPHYS 512 * 1024
-#endif
/* Pessimistic estimate of the number of required S/G entries */
-#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
+#define AHCI_SG_ENTRIES MIN(roundup(btoc(maxphys) + 1, 8), 65528)
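/*
 * Note: maxphys is a runtime tunable rather than a compile-time constant,
 * so the worst-case S/G entry count is computed at run time.  The MIN()
 * cap of 65528 is presumably the largest multiple of 8 that still fits
 * the 16-bit AHCI PRDTL field.
 */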
/* Command list. 32 commands. First, 1Kbyte aligned. */
#define AHCI_CL_OFFSET 0
#define AHCI_CL_SIZE 32
/* Command tables. Up to 32 commands, each 128-byte aligned. */
#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS)
#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16)
/* Total main work area. */
#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots)
/* ivars value fields */
#define AHCI_REMAPPED_UNIT (1 << 31) /* NVMe remapped device. */
#define AHCI_EM_UNIT (1 << 30) /* Enclosure Mgmt device. */
#define AHCI_UNIT 0xff /* Channel number. */
struct ahci_dma_prd {
u_int64_t dba;
u_int32_t reserved;
u_int32_t dbc; /* 0 based */
#define AHCI_PRD_MASK 0x003fffff /* max 4MB */
#define AHCI_PRD_MAX (AHCI_PRD_MASK + 1)
#define AHCI_PRD_IPC (1U << 31)
} __packed;
struct ahci_cmd_tab {
u_int8_t cfis[64];
u_int8_t acmd[32];
u_int8_t reserved[32];
- struct ahci_dma_prd prd_tab[AHCI_SG_ENTRIES];
+ struct ahci_dma_prd prd_tab[];
} __packed;
struct ahci_cmd_list {
u_int16_t cmd_flags;
#define AHCI_CMD_ATAPI 0x0020
#define AHCI_CMD_WRITE 0x0040
#define AHCI_CMD_PREFETCH 0x0080
#define AHCI_CMD_RESET 0x0100
#define AHCI_CMD_BIST 0x0200
#define AHCI_CMD_CLR_BUSY 0x0400
u_int16_t prd_length; /* PRD entries */
u_int32_t bytecount;
u_int64_t cmd_table_phys; /* 128byte aligned */
} __packed;
/* misc defines */
#define ATA_IRQ_RID 0
#define ATA_INTR_FLAGS (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
struct ata_dmaslot {
bus_dmamap_t data_map; /* data DMA map */
int nsegs; /* Number of segs loaded */
};
/* structure holding DMA related information */
struct ata_dma {
bus_dma_tag_t work_tag; /* workspace DMA tag */
bus_dmamap_t work_map; /* workspace DMA map */
uint8_t *work; /* workspace */
bus_addr_t work_bus; /* bus address of work */
bus_dma_tag_t rfis_tag; /* RFIS list DMA tag */
bus_dmamap_t rfis_map; /* RFIS list DMA map */
uint8_t *rfis; /* FIS receive area */
bus_addr_t rfis_bus; /* bus address of rfis */
bus_dma_tag_t data_tag; /* data DMA tag */
};
enum ahci_slot_states {
AHCI_SLOT_EMPTY,
AHCI_SLOT_LOADING,
AHCI_SLOT_RUNNING,
AHCI_SLOT_EXECUTING
};
struct ahci_slot {
struct ahci_channel *ch; /* Channel */
u_int8_t slot; /* Number of this slot */
enum ahci_slot_states state; /* Slot state */
+ u_int ct_offset; /* cmd_tab offset */
union ccb *ccb; /* CCB occupying slot */
struct ata_dmaslot dma; /* DMA data of this slot */
struct callout timeout; /* Execution timeout */
};
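/*
 * ct_offset caches where this slot's command table lives within the DMA
 * work area; with prd_tab[] a flexible array member, that offset depends
 * on the runtime AHCI_SG_ENTRIES value rather than on a fixed
 * sizeof(struct ahci_cmd_tab).
 */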
struct ahci_device {
int revision;
int mode;
u_int bytecount;
u_int atapi;
u_int tags;
u_int caps;
};
struct ahci_led {
device_t dev; /* Device handle */
struct cdev *led;
uint8_t num; /* Number of this led */
uint8_t state; /* State of this led */
};
#define AHCI_NUM_LEDS 3
/* structure describing an ATA channel */
struct ahci_channel {
device_t dev; /* Device handle */
int unit; /* Physical channel */
struct resource *r_mem; /* Memory of this channel */
struct resource *r_irq; /* Interrupt of this channel */
void *ih; /* Interrupt handle */
struct ata_dma dma; /* DMA data */
struct cam_sim *sim;
struct cam_path *path;
uint32_t caps; /* Controller capabilities */
uint32_t caps2; /* Controller capabilities */
uint32_t chcaps; /* Channel capabilities */
uint32_t chscaps; /* Channel sleep capabilities */
uint16_t vendorid; /* Vendor ID from the bus */
uint16_t deviceid; /* Device ID from the bus */
uint16_t subvendorid; /* Subvendor ID from the bus */
uint16_t subdeviceid; /* Subdevice ID from the bus */
int quirks;
int numslots; /* Number of present slots */
int pm_level; /* power management level */
int devices; /* What is present */
int pm_present; /* PM presence reported */
int fbs_enabled; /* FIS-based switching enabled */
void (*start)(struct ahci_channel *);
union ccb *hold[AHCI_MAX_SLOTS];
struct ahci_slot slot[AHCI_MAX_SLOTS];
uint32_t oslots; /* Occupied slots */
uint32_t rslots; /* Running slots */
uint32_t aslots; /* Slots with atomic commands */
uint32_t eslots; /* Slots in error */
uint32_t toslots; /* Slots in timeout */
int lastslot; /* Last used slot */
int taggedtarget; /* Last tagged target */
int numrslots; /* Number of running slots */
int numrslotspd[16];/* Number of running slots per dev */
int numtslots; /* Number of tagged slots */
int numtslotspd[16];/* Number of tagged slots per dev */
int numhslots; /* Number of held slots */
int recoverycmd; /* Our READ LOG active */
int fatalerr; /* Fatal error happened */
int resetting; /* Hard-reset in progress. */
int resetpolldiv; /* Hard-reset poll divider. */
int listening; /* SUD bit is cleared. */
int wrongccs; /* CCS field in CMD was wrong */
union ccb *frozen; /* Frozen command */
struct callout pm_timer; /* Power management events */
struct callout reset_timer; /* Hard-reset timeout */
struct ahci_device user[16]; /* User-specified settings */
struct ahci_device curr[16]; /* Current settings */
struct mtx_padalign mtx; /* state lock */
STAILQ_HEAD(, ccb_hdr) doneq; /* queue of completed CCBs */
int batch; /* doneq is in use */
int disablephy; /* keep PHY disabled */
};
struct ahci_enclosure {
device_t dev; /* Device handle */
struct resource *r_memc; /* Control register */
struct resource *r_memt; /* Transmit buffer */
struct resource *r_memr; /* Receive buffer */
struct cam_sim *sim;
struct cam_path *path;
struct mtx mtx; /* state lock */
struct ahci_led leds[AHCI_MAX_PORTS * 3];
uint32_t capsem; /* Controller capabilities */
uint8_t status[AHCI_MAX_PORTS][4]; /* ArrayDev statuses */
int quirks;
int channels;
uint32_t ichannels;
};
/* structure describing an AHCI controller */
struct ahci_controller {
device_t dev;
bus_dma_tag_t dma_tag;
int r_rid;
int r_msix_tab_rid;
int r_msix_pba_rid;
uint16_t vendorid; /* Vendor ID from the bus */
uint16_t deviceid; /* Device ID from the bus */
uint16_t subvendorid; /* Subvendor ID from the bus */
uint16_t subdeviceid; /* Subdevice ID from the bus */
struct resource *r_mem;
struct resource *r_msix_table;
struct resource *r_msix_pba;
struct rman sc_iomem;
struct ahci_controller_irq {
struct ahci_controller *ctlr;
struct resource *r_irq;
void *handle;
int r_irq_rid;
int mode;
#define AHCI_IRQ_MODE_ALL 0
#define AHCI_IRQ_MODE_AFTER 1
#define AHCI_IRQ_MODE_ONE 2
} irqs[AHCI_MAX_IRQS];
uint32_t caps; /* Controller capabilities */
uint32_t caps2; /* Controller capabilities */
uint32_t capsem; /* Controller capabilities */
uint32_t emloc; /* EM buffer location */
int quirks;
int numirqs;
int channels;
uint32_t ichannels;
int ccc; /* CCC timeout */
int cccv; /* CCC vector */
int direct; /* Direct command completion */
int msi; /* MSI interrupts */
int remapped_devices; /* Remapped NVMe devices */
uint32_t remap_offset;
uint32_t remap_size;
struct {
void (*function)(void *);
void *argument;
} interrupt[AHCI_MAX_PORTS];
void (*ch_start)(struct ahci_channel *);
int dma_coherent; /* DMA is cache-coherent */
struct mtx ch_mtx; /* Lock for attached channels */
struct ahci_channel *ch[AHCI_MAX_PORTS]; /* Attached channels */
};
enum ahci_err_type {
AHCI_ERR_NONE, /* No error */
AHCI_ERR_INVALID, /* Error detected by us before submitting. */
AHCI_ERR_INNOCENT, /* Innocent victim. */
AHCI_ERR_TFE, /* Task File Error. */
AHCI_ERR_SATA, /* SATA error. */
AHCI_ERR_TIMEOUT, /* Command execution timeout. */
AHCI_ERR_NCQ, /* NCQ command error. CCB should be put on hold
* until READ LOG executed to reveal error. */
};
/* macros to hide busspace uglyness */
#define ATA_INB(res, offset) \
bus_read_1((res), (offset))
#define ATA_INW(res, offset) \
bus_read_2((res), (offset))
#define ATA_INL(res, offset) \
bus_read_4((res), (offset))
#define ATA_INSW(res, offset, addr, count) \
bus_read_multi_2((res), (offset), (addr), (count))
#define ATA_INSW_STRM(res, offset, addr, count) \
bus_read_multi_stream_2((res), (offset), (addr), (count))
#define ATA_INSL(res, offset, addr, count) \
bus_read_multi_4((res), (offset), (addr), (count))
#define ATA_INSL_STRM(res, offset, addr, count) \
bus_read_multi_stream_4((res), (offset), (addr), (count))
#define ATA_OUTB(res, offset, value) \
bus_write_1((res), (offset), (value))
#define ATA_OUTW(res, offset, value) \
bus_write_2((res), (offset), (value))
#define ATA_OUTL(res, offset, value) \
bus_write_4((res), (offset), (value))
#define ATA_OUTSW(res, offset, addr, count) \
bus_write_multi_2((res), (offset), (addr), (count))
#define ATA_OUTSW_STRM(res, offset, addr, count) \
bus_write_multi_stream_2((res), (offset), (addr), (count))
#define ATA_OUTSL(res, offset, addr, count) \
bus_write_multi_4((res), (offset), (addr), (count))
#define ATA_OUTSL_STRM(res, offset, addr, count) \
bus_write_multi_stream_4((res), (offset), (addr), (count))
/*
* On some platforms we must ensure proper interdevice write ordering:
* the AHCI interrupt status register must be updated in hardware before
* any registers in the interrupt controller are written.
* Unfortunately, the only way we can guarantee that is a readback.
*
* Currently, only ARM is known to have this issue.
*/
#if defined(__arm__)
#define ATA_RBL(res, offset) \
bus_read_4((res), (offset))
#else
#define ATA_RBL(res, offset)
#endif
#define AHCI_Q_NOFORCE 0x00000001
#define AHCI_Q_NOPMP 0x00000002
#define AHCI_Q_NONCQ 0x00000004
#define AHCI_Q_1CH 0x00000008
#define AHCI_Q_2CH 0x00000010
#define AHCI_Q_4CH 0x00000020
#define AHCI_Q_EDGEIS 0x00000040
#define AHCI_Q_SATA2 0x00000080
#define AHCI_Q_NOBSYRES 0x00000100
#define AHCI_Q_NOAA 0x00000200
#define AHCI_Q_NOCOUNT 0x00000400
#define AHCI_Q_ALTSIG 0x00000800
#define AHCI_Q_NOMSI 0x00001000
#define AHCI_Q_ATI_PMP_BUG 0x00002000
#define AHCI_Q_MAXIO_64K 0x00004000
#define AHCI_Q_SATA1_UNIT0 0x00008000 /* need better method for this */
#define AHCI_Q_ABAR0 0x00010000
#define AHCI_Q_1MSI 0x00020000
#define AHCI_Q_FORCE_PI 0x00040000
#define AHCI_Q_RESTORE_CAP 0x00080000
#define AHCI_Q_NOMSIX 0x00100000
#define AHCI_Q_MRVL_SR_DEL 0x00200000
#define AHCI_Q_NOCCS 0x00400000
#define AHCI_Q_NOAUX 0x00800000
#define AHCI_Q_IOMMU_BUSWIDE 0x01000000
#define AHCI_Q_BIT_STRING \
"\020" \
"\001NOFORCE" \
"\002NOPMP" \
"\003NONCQ" \
"\0041CH" \
"\0052CH" \
"\0064CH" \
"\007EDGEIS" \
"\010SATA2" \
"\011NOBSYRES" \
"\012NOAA" \
"\013NOCOUNT" \
"\014ALTSIG" \
"\015NOMSI" \
"\016ATI_PMP_BUG" \
"\017MAXIO_64K" \
"\020SATA1_UNIT0" \
"\021ABAR0" \
"\0221MSI" \
"\023FORCE_PI" \
"\024RESTORE_CAP" \
"\025NOMSIX" \
"\026MRVL_SR_DEL" \
"\027NOCCS" \
"\030NOAUX" \
"\031IOMMU_BUSWIDE"
int ahci_attach(device_t dev);
int ahci_detach(device_t dev);
int ahci_setup_interrupt(device_t dev);
int ahci_print_child(device_t dev, device_t child);
struct resource *ahci_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags);
int ahci_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r);
int ahci_setup_intr(device_t dev, device_t child, struct resource *irq,
int flags, driver_filter_t *filter, driver_intr_t *function,
void *argument, void **cookiep);
int ahci_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie);
int ahci_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen);
bus_dma_tag_t ahci_get_dma_tag(device_t dev, device_t child);
int ahci_ctlr_reset(device_t dev);
int ahci_ctlr_setup(device_t dev);
void ahci_free_mem(device_t dev);
/* Functions to allow AHCI EM to access other channels. */
void ahci_attached(device_t dev, struct ahci_channel *ch);
void ahci_detached(device_t dev, struct ahci_channel *ch);
struct ahci_channel * ahci_getch(device_t dev, int n);
void ahci_putch(struct ahci_channel *ch);
extern devclass_t ahci_devclass;
diff --git a/sys/dev/ahci/ahciem.c b/sys/dev/ahci/ahciem.c
index a6f22fac5639..b1c3fbf1fdd4 100644
--- a/sys/dev/ahci/ahciem.c
+++ b/sys/dev/ahci/ahciem.c
@@ -1,663 +1,663 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <machine/stdarg.h>
#include <machine/resource.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <dev/led/led.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include "ahci.h"
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
#include <cam/scsi/scsi_ses.h>
/* local prototypes */
static void ahciemaction(struct cam_sim *sim, union ccb *ccb);
static void ahciempoll(struct cam_sim *sim);
static int ahci_em_reset(device_t dev);
static void ahci_em_led(void *priv, int onoff);
static void ahci_em_setleds(device_t dev, int c);
static int
ahci_em_probe(device_t dev)
{
device_set_desc_copy(dev, "AHCI enclosure management bridge");
return (BUS_PROBE_DEFAULT);
}
static int
ahci_em_attach(device_t dev)
{
device_t parent = device_get_parent(dev);
struct ahci_controller *ctlr = device_get_softc(parent);
struct ahci_enclosure *enc = device_get_softc(dev);
struct cam_devq *devq;
int i, c, rid, error;
char buf[32];
enc->dev = dev;
enc->quirks = ctlr->quirks;
enc->channels = ctlr->channels;
enc->ichannels = ctlr->ichannels;
mtx_init(&enc->mtx, "AHCI enclosure lock", NULL, MTX_DEF);
rid = 0;
if (!(enc->r_memc = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE))) {
mtx_destroy(&enc->mtx);
return (ENXIO);
}
enc->capsem = ATA_INL(enc->r_memc, 0);
rid = 1;
if (!(enc->r_memt = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE))) {
error = ENXIO;
goto err0;
}
if ((enc->capsem & (AHCI_EM_XMT | AHCI_EM_SMB)) == 0) {
rid = 2;
if (!(enc->r_memr = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE))) {
error = ENXIO;
goto err0;
}
} else
enc->r_memr = NULL;
mtx_lock(&enc->mtx);
if (ahci_em_reset(dev) != 0) {
error = ENXIO;
goto err1;
}
rid = ATA_IRQ_RID;
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(1);
if (devq == NULL) {
device_printf(dev, "Unable to allocate SIM queue\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
enc->sim = cam_sim_alloc(ahciemaction, ahciempoll, "ahciem", enc,
device_get_unit(dev), &enc->mtx,
1, 0, devq);
if (enc->sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "Unable to allocate SIM\n");
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(enc->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&enc->path, /*periph*/NULL, cam_sim_path(enc->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "Unable to create path\n");
error = ENXIO;
goto err3;
}
mtx_unlock(&enc->mtx);
if (bootverbose) {
device_printf(dev, "Caps:%s%s%s%s%s%s%s%s\n",
(enc->capsem & AHCI_EM_PM) ? " PM":"",
(enc->capsem & AHCI_EM_ALHD) ? " ALHD":"",
(enc->capsem & AHCI_EM_XMT) ? " XMT":"",
(enc->capsem & AHCI_EM_SMB) ? " SMB":"",
(enc->capsem & AHCI_EM_SGPIO) ? " SGPIO":"",
(enc->capsem & AHCI_EM_SES2) ? " SES-2":"",
(enc->capsem & AHCI_EM_SAFTE) ? " SAF-TE":"",
(enc->capsem & AHCI_EM_LED) ? " LED":"");
}
if ((enc->capsem & AHCI_EM_LED)) {
for (c = 0; c < enc->channels; c++) {
if ((enc->ichannels & (1 << c)) == 0)
continue;
for (i = 0; i < AHCI_NUM_LEDS; i++) {
enc->leds[c * AHCI_NUM_LEDS + i].dev = dev;
enc->leds[c * AHCI_NUM_LEDS + i].num =
c * AHCI_NUM_LEDS + i;
}
if ((enc->capsem & AHCI_EM_ALHD) == 0) {
snprintf(buf, sizeof(buf), "%s.%d.act",
device_get_nameunit(parent), c);
enc->leds[c * AHCI_NUM_LEDS + 0].led =
led_create(ahci_em_led,
&enc->leds[c * AHCI_NUM_LEDS + 0], buf);
}
snprintf(buf, sizeof(buf), "%s.%d.locate",
device_get_nameunit(parent), c);
enc->leds[c * AHCI_NUM_LEDS + 1].led =
led_create(ahci_em_led,
&enc->leds[c * AHCI_NUM_LEDS + 1], buf);
snprintf(buf, sizeof(buf), "%s.%d.fault",
device_get_nameunit(parent), c);
enc->leds[c * AHCI_NUM_LEDS + 2].led =
led_create(ahci_em_led,
&enc->leds[c * AHCI_NUM_LEDS + 2], buf);
}
}
return (0);
err3:
xpt_bus_deregister(cam_sim_path(enc->sim));
err2:
cam_sim_free(enc->sim, /*free_devq*/TRUE);
err1:
mtx_unlock(&enc->mtx);
if (enc->r_memr)
bus_release_resource(dev, SYS_RES_MEMORY, 2, enc->r_memr);
err0:
if (enc->r_memt)
bus_release_resource(dev, SYS_RES_MEMORY, 1, enc->r_memt);
bus_release_resource(dev, SYS_RES_MEMORY, 0, enc->r_memc);
mtx_destroy(&enc->mtx);
return (error);
}
static int
ahci_em_detach(device_t dev)
{
struct ahci_enclosure *enc = device_get_softc(dev);
int i;
for (i = 0; i < enc->channels * AHCI_NUM_LEDS; i++) {
if (enc->leds[i].led)
led_destroy(enc->leds[i].led);
}
mtx_lock(&enc->mtx);
xpt_async(AC_LOST_DEVICE, enc->path, NULL);
xpt_free_path(enc->path);
xpt_bus_deregister(cam_sim_path(enc->sim));
cam_sim_free(enc->sim, /*free_devq*/TRUE);
mtx_unlock(&enc->mtx);
bus_release_resource(dev, SYS_RES_MEMORY, 0, enc->r_memc);
bus_release_resource(dev, SYS_RES_MEMORY, 1, enc->r_memt);
if (enc->r_memr)
bus_release_resource(dev, SYS_RES_MEMORY, 2, enc->r_memr);
mtx_destroy(&enc->mtx);
return (0);
}
static int
ahci_em_reset(device_t dev)
{
struct ahci_enclosure *enc;
int i, timeout;
enc = device_get_softc(dev);
ATA_OUTL(enc->r_memc, 0, AHCI_EM_RST);
timeout = 1000;
while ((ATA_INL(enc->r_memc, 0) & AHCI_EM_RST) &&
--timeout > 0)
DELAY(1000);
if (timeout == 0) {
device_printf(dev, "EM timeout\n");
return (1);
}
for (i = 0; i < enc->channels; i++)
ahci_em_setleds(dev, i);
return (0);
}
static int
ahci_em_suspend(device_t dev)
{
struct ahci_enclosure *enc = device_get_softc(dev);
mtx_lock(&enc->mtx);
xpt_freeze_simq(enc->sim, 1);
mtx_unlock(&enc->mtx);
return (0);
}
static int
ahci_em_resume(device_t dev)
{
struct ahci_enclosure *enc = device_get_softc(dev);
mtx_lock(&enc->mtx);
ahci_em_reset(dev);
xpt_release_simq(enc->sim, TRUE);
mtx_unlock(&enc->mtx);
return (0);
}
devclass_t ahciem_devclass;
static device_method_t ahciem_methods[] = {
DEVMETHOD(device_probe, ahci_em_probe),
DEVMETHOD(device_attach, ahci_em_attach),
DEVMETHOD(device_detach, ahci_em_detach),
DEVMETHOD(device_suspend, ahci_em_suspend),
DEVMETHOD(device_resume, ahci_em_resume),
DEVMETHOD_END
};
static driver_t ahciem_driver = {
"ahciem",
ahciem_methods,
sizeof(struct ahci_enclosure)
};
DRIVER_MODULE(ahciem, ahci, ahciem_driver, ahciem_devclass, NULL, NULL);
static void
ahci_em_setleds(device_t dev, int c)
{
struct ahci_enclosure *enc;
int timeout;
int16_t val;
enc = device_get_softc(dev);
val = 0;
if (enc->status[c][2] & SESCTL_RQSACT) /* Activity */
val |= (1 << 0);
if (enc->status[c][1] & SESCTL_RQSRR) /* Rebuild */
val |= (1 << 6) | (1 << 3);
else if (enc->status[c][2] & SESCTL_RQSID) /* Identification */
val |= (1 << 3);
else if (enc->status[c][3] & SESCTL_RQSFLT) /* Fault */
val |= (1 << 6);
timeout = 10000;
while (ATA_INL(enc->r_memc, 0) & (AHCI_EM_TM | AHCI_EM_RST) &&
--timeout > 0)
DELAY(100);
if (timeout == 0)
device_printf(dev, "Transmit timeout\n");
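/*
 * Queue the LED message for port c: val carries the activity, locate
 * and fault bits assembled above, and setting AHCI_EM_TM in the control
 * register asks the HBA to transmit it.
 */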
ATA_OUTL(enc->r_memt, 0, (1 << 8) | (0 << 16) | (0 << 24));
ATA_OUTL(enc->r_memt, 4, c | (0 << 8) | (val << 16));
ATA_OUTL(enc->r_memc, 0, AHCI_EM_TM);
}
static void
ahci_em_led(void *priv, int onoff)
{
struct ahci_led *led;
struct ahci_enclosure *enc;
int c, l;
led = (struct ahci_led *)priv;
enc = device_get_softc(led->dev);
c = led->num / AHCI_NUM_LEDS;
l = led->num % AHCI_NUM_LEDS;
if (l == 0) {
if (onoff)
enc->status[c][2] |= 0x80;
else
enc->status[c][2] &= ~0x80;
} else if (l == 1) {
if (onoff)
enc->status[c][2] |= SESCTL_RQSID;
else
enc->status[c][2] &= ~SESCTL_RQSID;
} else if (l == 2) {
if (onoff)
enc->status[c][3] |= SESCTL_RQSFLT;
else
enc->status[c][3] &= ~SESCTL_RQSFLT;
}
ahci_em_setleds(led->dev, c);
}
static int
ahci_check_ids(union ccb *ccb)
{
if (ccb->ccb_h.target_id != 0) {
ccb->ccb_h.status = CAM_TID_INVALID;
xpt_done(ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
xpt_done(ccb);
return (-1);
}
return (0);
}
static void
ahci_em_emulate_ses_on_led(device_t dev, union ccb *ccb)
{
struct ahci_enclosure *enc;
struct ahci_channel *ch;
struct ses_status_page *page;
struct ses_status_array_dev_slot *ads, *ads0;
struct ses_elm_desc_hdr *elmd;
struct ses_elm_addlstatus_eip_hdr *elma;
struct ses_elm_ata_hdr *elmb;
uint8_t *buf;
int i;
enc = device_get_softc(dev);
buf = ccb->ataio.data_ptr;
/* General request validation. */
if (ccb->ataio.cmd.command != ATA_SEP_ATTN ||
ccb->ataio.dxfer_len < ccb->ataio.cmd.sector_count * 4) {
ccb->ccb_h.status = CAM_REQ_INVALID;
goto out;
}
/* SEMB IDENTIFY */
if (ccb->ataio.cmd.features == 0xEC &&
ccb->ataio.cmd.sector_count >= 16) {
bzero(buf, ccb->ataio.dxfer_len);
buf[0] = 64; /* Valid bytes. */
buf[2] = 0x30; /* NAA Locally Assigned. */
strncpy(&buf[3], device_get_nameunit(dev), 7);
strncpy(&buf[10], "AHCI ", SID_VENDOR_SIZE);
strncpy(&buf[18], "SGPIO Enclosure ", SID_PRODUCT_SIZE);
strncpy(&buf[34], "2.00", SID_REVISION_SIZE);
strncpy(&buf[39], "0001", 4);
strncpy(&buf[43], "S-E-S ", 6);
strncpy(&buf[49], "2.00", 4);
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB RECEIVE DIAGNOSTIC RESULT (0) */
page = (struct ses_status_page *)buf;
if (ccb->ataio.cmd.lba_low == 0x02 &&
ccb->ataio.cmd.features == 0x00 &&
ccb->ataio.cmd.sector_count >= 3) {
bzero(buf, ccb->ataio.dxfer_len);
page->hdr.page_code = 0;
scsi_ulto2b(5, page->hdr.length);
buf[4] = 0x00;
buf[5] = 0x01;
buf[6] = 0x02;
buf[7] = 0x07;
buf[8] = 0x0a;
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB RECEIVE DIAGNOSTIC RESULT (1) */
if (ccb->ataio.cmd.lba_low == 0x02 &&
ccb->ataio.cmd.features == 0x01 &&
ccb->ataio.cmd.sector_count >= 16) {
struct ses_enc_desc *ed;
struct ses_elm_type_desc *td;
bzero(buf, ccb->ataio.dxfer_len);
page->hdr.page_code = 0x01;
scsi_ulto2b(4 + sizeof(*ed) + sizeof(*td) + 11,
page->hdr.length);
ed = (struct ses_enc_desc *)&buf[8];
ed->byte0 = 0x11;
ed->subenc_id = 0;
ed->num_types = 1;
ed->length = 36;
ed->logical_id[0] = 0x30; /* NAA Locally Assigned. */
strncpy(&ed->logical_id[1], device_get_nameunit(dev), 7);
strncpy(ed->vendor_id, "AHCI ", SID_VENDOR_SIZE);
strncpy(ed->product_id, "SGPIO Enclosure ", SID_PRODUCT_SIZE);
strncpy(ed->product_rev, "2.00", SID_REVISION_SIZE);
td = (struct ses_elm_type_desc *)ses_enc_desc_next(ed);
td->etype_elm_type = 0x17;
td->etype_maxelt = enc->channels;
td->etype_subenc = 0;
td->etype_txt_len = 11;
snprintf((char *)(td + 1), 12, "Drive Slots");
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB RECEIVE DIAGNOSTIC RESULT (2) */
if (ccb->ataio.cmd.lba_low == 0x02 &&
ccb->ataio.cmd.features == 0x02 &&
ccb->ataio.cmd.sector_count >= (3 + enc->channels)) {
bzero(buf, ccb->ataio.dxfer_len);
page->hdr.page_code = 0x02;
scsi_ulto2b(4 + 4 * (1 + enc->channels),
page->hdr.length);
for (i = 0; i < enc->channels; i++) {
ads = &page->elements[i + 1].array_dev_slot;
memcpy(ads, enc->status[i], 4);
ch = ahci_getch(device_get_parent(dev), i);
if (ch == NULL) {
ads->common.bytes[0] |= SES_OBJSTAT_UNKNOWN;
continue;
}
if (ch->pm_present)
ads->common.bytes[0] |= SES_OBJSTAT_UNKNOWN;
else if (ch->devices)
ads->common.bytes[0] |= SES_OBJSTAT_OK;
else if (ch->disablephy)
ads->common.bytes[0] |= SES_OBJSTAT_NOTAVAIL;
else
ads->common.bytes[0] |= SES_OBJSTAT_NOTINSTALLED;
if (ch->disablephy)
ads->common.bytes[3] |= SESCTL_DEVOFF;
ahci_putch(ch);
}
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB SEND DIAGNOSTIC (2) */
if (ccb->ataio.cmd.lba_low == 0x82 &&
ccb->ataio.cmd.features == 0x02 &&
ccb->ataio.cmd.sector_count >= (3 + enc->channels)) {
ads0 = &page->elements[0].array_dev_slot;
for (i = 0; i < enc->channels; i++) {
ads = &page->elements[i + 1].array_dev_slot;
if (ads->common.bytes[0] & SESCTL_CSEL) {
enc->status[i][0] = 0;
enc->status[i][1] = ads->bytes[0] &
SESCTL_RQSRR;
enc->status[i][2] = ads->bytes[1] &
(SESCTL_RQSACT | SESCTL_RQSID);
enc->status[i][3] = ads->bytes[2] &
SESCTL_RQSFLT;
ahci_em_setleds(dev, i);
} else if (ads0->common.bytes[0] & SESCTL_CSEL) {
enc->status[i][0] = 0;
enc->status[i][1] = ads0->bytes[0] &
SESCTL_RQSRR;
enc->status[i][2] = ads0->bytes[1] &
(SESCTL_RQSACT | SESCTL_RQSID);
enc->status[i][3] = ads0->bytes[2] &
SESCTL_RQSFLT;
ahci_em_setleds(dev, i);
}
}
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB RECEIVE DIAGNOSTIC RESULT (7) */
if (ccb->ataio.cmd.lba_low == 0x02 &&
ccb->ataio.cmd.features == 0x07 &&
ccb->ataio.cmd.sector_count >= (6 + 3 * enc->channels)) {
bzero(buf, ccb->ataio.dxfer_len);
page->hdr.page_code = 0x07;
scsi_ulto2b(4 + 15 + 11 * enc->channels, page->hdr.length);
elmd = (struct ses_elm_desc_hdr *)&buf[8];
scsi_ulto2b(11, elmd->length);
snprintf((char *)(elmd + 1), 12, "Drive Slots");
for (i = 0; i < enc->channels; i++) {
elmd = (struct ses_elm_desc_hdr *)&buf[8 + 15 + 11 * i];
scsi_ulto2b(7, elmd->length);
snprintf((char *)(elmd + 1), 8, "Slot %02d", i);
}
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
/* SEMB RECEIVE DIAGNOSTIC RESULT (a) */
if (ccb->ataio.cmd.lba_low == 0x02 &&
ccb->ataio.cmd.features == 0x0a &&
ccb->ataio.cmd.sector_count >= (2 + 3 * enc->channels)) {
bzero(buf, ccb->ataio.dxfer_len);
page->hdr.page_code = 0x0a;
scsi_ulto2b(4 + (sizeof(*elma) + sizeof(*elmb)) * enc->channels,
page->hdr.length);
for (i = 0; i < enc->channels; i++) {
elma = (struct ses_elm_addlstatus_eip_hdr *)&buf[
8 + (sizeof(*elma) + sizeof(*elmb)) * i];
elma->base.byte0 = 0x10 | SPSP_PROTO_ATA;
elma->base.length = 2 + sizeof(*elmb);
elma->byte2 = 0x01;
elma->element_index = 1 + i;
ch = ahci_getch(device_get_parent(dev), i);
if (ch == NULL) {
elma->base.byte0 |= 0x80;
continue;
}
if (ch->devices == 0 || ch->pm_present)
elma->base.byte0 |= 0x80;
elmb = (struct ses_elm_ata_hdr *)(elma + 1);
scsi_ulto4b(cam_sim_path(ch->sim), elmb->bus);
scsi_ulto4b(0, elmb->target);
ahci_putch(ch);
}
ccb->ccb_h.status = CAM_REQ_CMP;
goto out;
}
ccb->ccb_h.status = CAM_REQ_INVALID;
out:
xpt_done(ccb);
}
static void
ahci_em_begin_transaction(device_t dev, union ccb *ccb)
{
struct ahci_enclosure *enc;
struct ata_res *res;
enc = device_get_softc(dev);
res = &ccb->ataio.res;
bzero(res, sizeof(*res));
if ((ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET)) {
res->lba_high = 0xc3;
res->lba_mid = 0x3c;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return;
}
if (enc->capsem & AHCI_EM_LED) {
ahci_em_emulate_ses_on_led(dev, ccb);
return;
} else
device_printf(dev, "Unsupported enclosure interface\n");
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
}
static void
ahciemaction(struct cam_sim *sim, union ccb *ccb)
{
device_t dev, parent;
struct ahci_enclosure *enc;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("ahciemaction func_code=%x\n", ccb->ccb_h.func_code));
enc = cam_sim_softc(sim);
dev = enc->dev;
switch (ccb->ccb_h.func_code) {
case XPT_ATA_IO: /* Execute the requested I/O operation */
if (ahci_check_ids(ccb))
return;
ahci_em_begin_transaction(dev, ccb);
return;
case XPT_RESET_BUS: /* Reset the specified bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified device */
ahci_em_reset(dev);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
parent = device_get_parent(dev);
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN;
cpi->hba_eng_cnt = 0;
cpi->max_target = 0;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "AHCI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->transport = XPORT_SATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
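/* The maximum I/O size advertised to CAM tracks the runtime maxphys limit. */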
cpi->hba_vendor = pci_get_vendor(parent);
cpi->hba_device = pci_get_device(parent);
cpi->hba_subvendor = pci_get_subvendor(parent);
cpi->hba_subdevice = pci_get_subdevice(parent);
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
}
static void
ahciempoll(struct cam_sim *sim)
{
}
diff --git a/sys/dev/ata/ata-all.c b/sys/dev/ata/ata-all.c
index e456017fd4d5..220b3a614906 100644
--- a/sys/dev/ata/ata-all.c
+++ b/sys/dev/ata/ata-all.c
@@ -1,1230 +1,1230 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ata.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/endian.h>
#include <sys/ctype.h>
#include <sys/conf.h>
#include <sys/bus.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/sema.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <machine/stdarg.h>
#include <machine/resource.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <dev/ata/ata-all.h>
#include <dev/pci/pcivar.h>
#include <ata_if.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
/* prototypes */
static void ataaction(struct cam_sim *sim, union ccb *ccb);
static void atapoll(struct cam_sim *sim);
static void ata_cam_begin_transaction(device_t dev, union ccb *ccb);
static void ata_cam_end_transaction(device_t dev, struct ata_request *request);
static void ata_cam_request_sense(device_t dev, struct ata_request *request);
static int ata_check_ids(device_t dev, union ccb *ccb);
static void ata_conn_event(void *context, int dummy);
static void ata_interrupt_locked(void *data);
static int ata_module_event_handler(module_t mod, int what, void *arg);
static void ata_periodic_poll(void *data);
static int ata_str2mode(const char *str);
/* global vars */
MALLOC_DEFINE(M_ATA, "ata_generic", "ATA driver generic layer");
int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data) = NULL;
devclass_t ata_devclass;
int ata_dma_check_80pin = 1;
/* sysctl vars */
static SYSCTL_NODE(_hw, OID_AUTO, ata, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"ATA driver parameters");
SYSCTL_INT(_hw_ata, OID_AUTO, ata_dma_check_80pin,
CTLFLAG_RWTUN, &ata_dma_check_80pin, 0,
"Check for 80pin cable before setting ATA DMA mode");
FEATURE(ata_cam, "ATA devices are accessed through the cam(4) driver");
/*
* newbus device interface related functions
*/
int
ata_probe(device_t dev)
{
return (BUS_PROBE_LOW_PRIORITY);
}
int
ata_attach(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
int error, rid;
struct cam_devq *devq;
const char *res;
char buf[64];
int i, mode;
/* check that we have a virgin channel to attach */
if (ch->r_irq)
return EEXIST;
/* initialize the softc basics */
ch->dev = dev;
ch->state = ATA_IDLE;
bzero(&ch->state_mtx, sizeof(struct mtx));
mtx_init(&ch->state_mtx, "ATA state lock", NULL, MTX_DEF);
TASK_INIT(&ch->conntask, 0, ata_conn_event, dev);
for (i = 0; i < 16; i++) {
ch->user[i].revision = 0;
snprintf(buf, sizeof(buf), "dev%d.sata_rev", i);
if (resource_int_value(device_get_name(dev),
device_get_unit(dev), buf, &mode) != 0 &&
resource_int_value(device_get_name(dev),
device_get_unit(dev), "sata_rev", &mode) != 0)
mode = -1;
if (mode >= 0)
ch->user[i].revision = mode;
ch->user[i].mode = 0;
snprintf(buf, sizeof(buf), "dev%d.mode", i);
if (resource_string_value(device_get_name(dev),
device_get_unit(dev), buf, &res) == 0)
mode = ata_str2mode(res);
else if (resource_string_value(device_get_name(dev),
device_get_unit(dev), "mode", &res) == 0)
mode = ata_str2mode(res);
else
mode = -1;
if (mode >= 0)
ch->user[i].mode = mode;
if (ch->flags & ATA_SATA)
ch->user[i].bytecount = 8192;
else
- ch->user[i].bytecount = MAXPHYS;
+ ch->user[i].bytecount = 65536;
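/*
 * For legacy (non-SATA) channels the default transfer chunk is a fixed
 * 64 KiB, presumably the conventional ATA DMA limit, independent of
 * maxphys.
 */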
ch->user[i].caps = 0;
ch->curr[i] = ch->user[i];
if (ch->flags & ATA_SATA) {
if (ch->pm_level > 0)
ch->user[i].caps |= CTS_SATA_CAPS_H_PMREQ;
if (ch->pm_level > 1)
ch->user[i].caps |= CTS_SATA_CAPS_D_PMREQ;
} else {
if (!(ch->flags & ATA_NO_48BIT_DMA))
ch->user[i].caps |= CTS_ATA_CAPS_H_DMA48;
}
}
callout_init(&ch->poll_callout, 1);
/* allocate DMA resources if DMA HW is present */
if (ch->dma.alloc)
ch->dma.alloc(dev);
/* setup interrupt delivery */
rid = ATA_IRQ_RID;
ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
RF_SHAREABLE | RF_ACTIVE);
if (!ch->r_irq) {
device_printf(dev, "unable to allocate interrupt\n");
return ENXIO;
}
if ((error = bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
ata_interrupt, ch, &ch->ih))) {
bus_release_resource(dev, SYS_RES_IRQ, rid, ch->r_irq);
device_printf(dev, "unable to setup interrupt\n");
return error;
}
if (ch->flags & ATA_PERIODIC_POLL)
callout_reset(&ch->poll_callout, hz, ata_periodic_poll, ch);
mtx_lock(&ch->state_mtx);
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(1);
if (devq == NULL) {
device_printf(dev, "Unable to allocate simq\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
ch->sim = cam_sim_alloc(ataaction, atapoll, "ata", ch,
device_get_unit(dev), &ch->state_mtx, 1, 0, devq);
if (ch->sim == NULL) {
device_printf(dev, "unable to allocate sim\n");
cam_simq_free(devq);
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "unable to create path\n");
error = ENXIO;
goto err3;
}
mtx_unlock(&ch->state_mtx);
return (0);
err3:
xpt_bus_deregister(cam_sim_path(ch->sim));
err2:
cam_sim_free(ch->sim, /*free_devq*/TRUE);
ch->sim = NULL;
err1:
bus_release_resource(dev, SYS_RES_IRQ, rid, ch->r_irq);
mtx_unlock(&ch->state_mtx);
if (ch->flags & ATA_PERIODIC_POLL)
callout_drain(&ch->poll_callout);
return (error);
}
int
ata_detach(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
/* check that we have a valid channel to detach */
if (!ch->r_irq)
return ENXIO;
/* grab the channel lock so no new requests get launched */
mtx_lock(&ch->state_mtx);
ch->state |= ATA_STALL_QUEUE;
mtx_unlock(&ch->state_mtx);
if (ch->flags & ATA_PERIODIC_POLL)
callout_drain(&ch->poll_callout);
taskqueue_drain(taskqueue_thread, &ch->conntask);
mtx_lock(&ch->state_mtx);
xpt_async(AC_LOST_DEVICE, ch->path, NULL);
xpt_free_path(ch->path);
xpt_bus_deregister(cam_sim_path(ch->sim));
cam_sim_free(ch->sim, /*free_devq*/TRUE);
ch->sim = NULL;
mtx_unlock(&ch->state_mtx);
/* release resources */
bus_teardown_intr(dev, ch->r_irq, ch->ih);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
ch->r_irq = NULL;
/* free DMA resources if DMA HW is present */
if (ch->dma.free)
ch->dma.free(dev);
mtx_destroy(&ch->state_mtx);
return 0;
}
static void
ata_conn_event(void *context, int dummy)
{
device_t dev = (device_t)context;
struct ata_channel *ch = device_get_softc(dev);
union ccb *ccb;
mtx_lock(&ch->state_mtx);
if (ch->sim == NULL) {
mtx_unlock(&ch->state_mtx);
return;
}
ata_reinit(dev);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return;
if (xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
mtx_unlock(&ch->state_mtx);
}
int
ata_reinit(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
struct ata_request *request;
xpt_freeze_simq(ch->sim, 1);
if ((request = ch->running)) {
ch->running = NULL;
if (ch->state == ATA_ACTIVE)
ch->state = ATA_IDLE;
callout_stop(&request->callout);
if (ch->dma.unload)
ch->dma.unload(request);
request->result = ERESTART;
ata_cam_end_transaction(dev, request);
}
/* reset the controller HW, the channel and device(s) */
ATA_RESET(dev);
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
xpt_release_simq(ch->sim, TRUE);
return(0);
}
int
ata_suspend(device_t dev)
{
struct ata_channel *ch;
/* check for valid device */
if (!dev || !(ch = device_get_softc(dev)))
return ENXIO;
if (ch->flags & ATA_PERIODIC_POLL)
callout_drain(&ch->poll_callout);
mtx_lock(&ch->state_mtx);
xpt_freeze_simq(ch->sim, 1);
while (ch->state != ATA_IDLE)
msleep(ch, &ch->state_mtx, PRIBIO, "atasusp", hz/100);
mtx_unlock(&ch->state_mtx);
return(0);
}
int
ata_resume(device_t dev)
{
struct ata_channel *ch;
int error;
/* check for valid device */
if (!dev || !(ch = device_get_softc(dev)))
return ENXIO;
mtx_lock(&ch->state_mtx);
error = ata_reinit(dev);
xpt_release_simq(ch->sim, TRUE);
mtx_unlock(&ch->state_mtx);
if (ch->flags & ATA_PERIODIC_POLL)
callout_reset(&ch->poll_callout, hz, ata_periodic_poll, ch);
return error;
}
void
ata_interrupt(void *data)
{
struct ata_channel *ch = (struct ata_channel *)data;
mtx_lock(&ch->state_mtx);
ata_interrupt_locked(data);
mtx_unlock(&ch->state_mtx);
}
static void
ata_interrupt_locked(void *data)
{
struct ata_channel *ch = (struct ata_channel *)data;
struct ata_request *request;
/* ignore the interrupt if it's not for us */
if (ch->hw.status && !ch->hw.status(ch->dev))
return;
/* do we have a running request */
if (!(request = ch->running))
return;
ATA_DEBUG_RQ(request, "interrupt");
/* safety check for the right state */
if (ch->state == ATA_IDLE) {
device_printf(request->dev, "interrupt on idle channel ignored\n");
return;
}
/*
* We have the HW locks, so end the transaction for this request
* if it finishes immediately; otherwise wait for the next interrupt.
*/
if (ch->hw.end_transaction(request) == ATA_OP_FINISHED) {
ch->running = NULL;
if (ch->state == ATA_ACTIVE)
ch->state = ATA_IDLE;
ata_cam_end_transaction(ch->dev, request);
return;
}
}
static void
ata_periodic_poll(void *data)
{
struct ata_channel *ch = (struct ata_channel *)data;
callout_reset(&ch->poll_callout, hz, ata_periodic_poll, ch);
ata_interrupt(ch);
}
void
ata_print_cable(device_t dev, u_int8_t *who)
{
device_printf(dev,
"DMA limited to UDMA33, %s found non-ATA66 cable\n", who);
}
/*
* misc support functions
*/
void
ata_default_registers(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
/* fill in the defaults from what's already set up */
ch->r_io[ATA_ERROR].res = ch->r_io[ATA_FEATURE].res;
ch->r_io[ATA_ERROR].offset = ch->r_io[ATA_FEATURE].offset;
ch->r_io[ATA_IREASON].res = ch->r_io[ATA_COUNT].res;
ch->r_io[ATA_IREASON].offset = ch->r_io[ATA_COUNT].offset;
ch->r_io[ATA_STATUS].res = ch->r_io[ATA_COMMAND].res;
ch->r_io[ATA_STATUS].offset = ch->r_io[ATA_COMMAND].offset;
ch->r_io[ATA_ALTSTAT].res = ch->r_io[ATA_CONTROL].res;
ch->r_io[ATA_ALTSTAT].offset = ch->r_io[ATA_CONTROL].offset;
}
void
ata_udelay(int interval)
{
/* for now just use DELAY, the timer/sleep subsystems are not there yet */
if (1 || interval < (1000000/hz) || ata_delayed_attach)
DELAY(interval);
else
pause("ataslp", interval/(1000000/hz));
}
const char *
ata_cmd2str(struct ata_request *request)
{
static char buffer[20];
if (request->flags & ATA_R_ATAPI) {
switch (request->u.atapi.sense.key ?
request->u.atapi.saved_cmd : request->u.atapi.ccb[0]) {
case 0x00: return ("TEST_UNIT_READY");
case 0x01: return ("REZERO");
case 0x03: return ("REQUEST_SENSE");
case 0x04: return ("FORMAT");
case 0x08: return ("READ");
case 0x0a: return ("WRITE");
case 0x10: return ("WEOF");
case 0x11: return ("SPACE");
case 0x12: return ("INQUIRY");
case 0x15: return ("MODE_SELECT");
case 0x19: return ("ERASE");
case 0x1a: return ("MODE_SENSE");
case 0x1b: return ("START_STOP");
case 0x1e: return ("PREVENT_ALLOW");
case 0x23: return ("ATAPI_READ_FORMAT_CAPACITIES");
case 0x25: return ("READ_CAPACITY");
case 0x28: return ("READ_BIG");
case 0x2a: return ("WRITE_BIG");
case 0x2b: return ("LOCATE");
case 0x34: return ("READ_POSITION");
case 0x35: return ("SYNCHRONIZE_CACHE");
case 0x3b: return ("WRITE_BUFFER");
case 0x3c: return ("READ_BUFFER");
case 0x42: return ("READ_SUBCHANNEL");
case 0x43: return ("READ_TOC");
case 0x45: return ("PLAY_10");
case 0x47: return ("PLAY_MSF");
case 0x48: return ("PLAY_TRACK");
case 0x4b: return ("PAUSE");
case 0x51: return ("READ_DISK_INFO");
case 0x52: return ("READ_TRACK_INFO");
case 0x53: return ("RESERVE_TRACK");
case 0x54: return ("SEND_OPC_INFO");
case 0x55: return ("MODE_SELECT_BIG");
case 0x58: return ("REPAIR_TRACK");
case 0x59: return ("READ_MASTER_CUE");
case 0x5a: return ("MODE_SENSE_BIG");
case 0x5b: return ("CLOSE_TRACK/SESSION");
case 0x5c: return ("READ_BUFFER_CAPACITY");
case 0x5d: return ("SEND_CUE_SHEET");
case 0x96: return ("SERVICE_ACTION_IN");
case 0xa1: return ("BLANK_CMD");
case 0xa3: return ("SEND_KEY");
case 0xa4: return ("REPORT_KEY");
case 0xa5: return ("PLAY_12");
case 0xa6: return ("LOAD_UNLOAD");
case 0xad: return ("READ_DVD_STRUCTURE");
case 0xb4: return ("PLAY_CD");
case 0xbb: return ("SET_SPEED");
case 0xbd: return ("MECH_STATUS");
case 0xbe: return ("READ_CD");
case 0xff: return ("POLL_DSC");
}
} else {
switch (request->u.ata.command) {
case 0x00:
switch (request->u.ata.feature) {
case 0x00: return ("NOP FLUSHQUEUE");
case 0x01: return ("NOP AUTOPOLL");
}
return ("NOP");
case 0x03: return ("CFA_REQUEST_EXTENDED_ERROR");
case 0x06:
switch (request->u.ata.feature) {
case 0x01: return ("DSM TRIM");
}
return "DSM";
case 0x08: return ("DEVICE_RESET");
case 0x20: return ("READ");
case 0x24: return ("READ48");
case 0x25: return ("READ_DMA48");
case 0x26: return ("READ_DMA_QUEUED48");
case 0x27: return ("READ_NATIVE_MAX_ADDRESS48");
case 0x29: return ("READ_MUL48");
case 0x2a: return ("READ_STREAM_DMA48");
case 0x2b: return ("READ_STREAM48");
case 0x2f: return ("READ_LOG_EXT");
case 0x30: return ("WRITE");
case 0x34: return ("WRITE48");
case 0x35: return ("WRITE_DMA48");
case 0x36: return ("WRITE_DMA_QUEUED48");
case 0x37: return ("SET_MAX_ADDRESS48");
case 0x39: return ("WRITE_MUL48");
case 0x3a: return ("WRITE_STREAM_DMA48");
case 0x3b: return ("WRITE_STREAM48");
case 0x3d: return ("WRITE_DMA_FUA48");
case 0x3e: return ("WRITE_DMA_QUEUED_FUA48");
case 0x3f: return ("WRITE_LOG_EXT");
case 0x40: return ("READ_VERIFY");
case 0x42: return ("READ_VERIFY48");
case 0x45:
switch (request->u.ata.feature) {
case 0x55: return ("WRITE_UNCORRECTABLE48 PSEUDO");
case 0xaa: return ("WRITE_UNCORRECTABLE48 FLAGGED");
}
return "WRITE_UNCORRECTABLE48";
case 0x51: return ("CONFIGURE_STREAM");
case 0x60: return ("READ_FPDMA_QUEUED");
case 0x61: return ("WRITE_FPDMA_QUEUED");
case 0x63: return ("NCQ_NON_DATA");
case 0x64: return ("SEND_FPDMA_QUEUED");
case 0x65: return ("RECEIVE_FPDMA_QUEUED");
case 0x67:
if (request->u.ata.feature == 0xec)
return ("SEP_ATTN IDENTIFY");
switch (request->u.ata.lba) {
case 0x00: return ("SEP_ATTN READ BUFFER");
case 0x02: return ("SEP_ATTN RECEIVE DIAGNOSTIC RESULTS");
case 0x80: return ("SEP_ATTN WRITE BUFFER");
case 0x82: return ("SEP_ATTN SEND DIAGNOSTIC");
}
return ("SEP_ATTN");
case 0x70: return ("SEEK");
case 0x87: return ("CFA_TRANSLATE_SECTOR");
case 0x90: return ("EXECUTE_DEVICE_DIAGNOSTIC");
case 0x92: return ("DOWNLOAD_MICROCODE");
case 0xa0: return ("PACKET");
case 0xa1: return ("ATAPI_IDENTIFY");
case 0xa2: return ("SERVICE");
case 0xb0:
switch(request->u.ata.feature) {
case 0xd0: return ("SMART READ ATTR VALUES");
case 0xd1: return ("SMART READ ATTR THRESHOLDS");
case 0xd3: return ("SMART SAVE ATTR VALUES");
case 0xd4: return ("SMART EXECUTE OFFLINE IMMEDIATE");
case 0xd5: return ("SMART READ LOG DATA");
case 0xd8: return ("SMART ENABLE OPERATION");
case 0xd9: return ("SMART DISABLE OPERATION");
case 0xda: return ("SMART RETURN STATUS");
}
return ("SMART");
case 0xb1: return ("DEVICE CONFIGURATION");
case 0xc0: return ("CFA_ERASE");
case 0xc4: return ("READ_MUL");
case 0xc5: return ("WRITE_MUL");
case 0xc6: return ("SET_MULTI");
case 0xc7: return ("READ_DMA_QUEUED");
case 0xc8: return ("READ_DMA");
case 0xca: return ("WRITE_DMA");
case 0xcc: return ("WRITE_DMA_QUEUED");
case 0xcd: return ("CFA_WRITE_MULTIPLE_WITHOUT_ERASE");
case 0xce: return ("WRITE_MUL_FUA48");
case 0xd1: return ("CHECK_MEDIA_CARD_TYPE");
case 0xda: return ("GET_MEDIA_STATUS");
case 0xde: return ("MEDIA_LOCK");
case 0xdf: return ("MEDIA_UNLOCK");
case 0xe0: return ("STANDBY_IMMEDIATE");
case 0xe1: return ("IDLE_IMMEDIATE");
case 0xe2: return ("STANDBY");
case 0xe3: return ("IDLE");
case 0xe4: return ("READ_BUFFER/PM");
case 0xe5: return ("CHECK_POWER_MODE");
case 0xe6: return ("SLEEP");
case 0xe7: return ("FLUSHCACHE");
case 0xe8: return ("WRITE_PM");
case 0xea: return ("FLUSHCACHE48");
case 0xec: return ("ATA_IDENTIFY");
case 0xed: return ("MEDIA_EJECT");
case 0xef:
switch (request->u.ata.feature) {
case 0x03: return ("SETFEATURES SET TRANSFER MODE");
case 0x02: return ("SETFEATURES ENABLE WCACHE");
case 0x82: return ("SETFEATURES DISABLE WCACHE");
case 0x06: return ("SETFEATURES ENABLE PUIS");
case 0x86: return ("SETFEATURES DISABLE PUIS");
case 0x07: return ("SETFEATURES SPIN-UP");
case 0x10: return ("SETFEATURES ENABLE SATA FEATURE");
case 0x90: return ("SETFEATURES DISABLE SATA FEATURE");
case 0xaa: return ("SETFEATURES ENABLE RCACHE");
case 0x55: return ("SETFEATURES DISABLE RCACHE");
case 0x5d: return ("SETFEATURES ENABLE RELIRQ");
case 0xdd: return ("SETFEATURES DISABLE RELIRQ");
case 0x5e: return ("SETFEATURES ENABLE SRVIRQ");
case 0xde: return ("SETFEATURES DISABLE SRVIRQ");
}
return "SETFEATURES";
case 0xf1: return ("SECURITY_SET_PASSWORD");
case 0xf2: return ("SECURITY_UNLOCK");
case 0xf3: return ("SECURITY_ERASE_PREPARE");
case 0xf4: return ("SECURITY_ERASE_UNIT");
case 0xf5: return ("SECURITY_FREEZE_LOCK");
case 0xf6: return ("SECURITY_DISABLE_PASSWORD");
case 0xf8: return ("READ_NATIVE_MAX_ADDRESS");
case 0xf9: return ("SET_MAX_ADDRESS");
}
}
sprintf(buffer, "unknown CMD (0x%02x)", request->u.ata.command);
return (buffer);
}
const char *
ata_mode2str(int mode)
{
switch (mode) {
case -1: return "UNSUPPORTED";
case ATA_PIO0: return "PIO0";
case ATA_PIO1: return "PIO1";
case ATA_PIO2: return "PIO2";
case ATA_PIO3: return "PIO3";
case ATA_PIO4: return "PIO4";
case ATA_WDMA0: return "WDMA0";
case ATA_WDMA1: return "WDMA1";
case ATA_WDMA2: return "WDMA2";
case ATA_UDMA0: return "UDMA16";
case ATA_UDMA1: return "UDMA25";
case ATA_UDMA2: return "UDMA33";
case ATA_UDMA3: return "UDMA40";
case ATA_UDMA4: return "UDMA66";
case ATA_UDMA5: return "UDMA100";
case ATA_UDMA6: return "UDMA133";
case ATA_SA150: return "SATA150";
case ATA_SA300: return "SATA300";
case ATA_SA600: return "SATA600";
default:
if (mode & ATA_DMA_MASK)
return "BIOSDMA";
else
return "BIOSPIO";
}
}
static int
ata_str2mode(const char *str)
{
if (!strcasecmp(str, "PIO0")) return (ATA_PIO0);
if (!strcasecmp(str, "PIO1")) return (ATA_PIO1);
if (!strcasecmp(str, "PIO2")) return (ATA_PIO2);
if (!strcasecmp(str, "PIO3")) return (ATA_PIO3);
if (!strcasecmp(str, "PIO4")) return (ATA_PIO4);
if (!strcasecmp(str, "WDMA0")) return (ATA_WDMA0);
if (!strcasecmp(str, "WDMA1")) return (ATA_WDMA1);
if (!strcasecmp(str, "WDMA2")) return (ATA_WDMA2);
if (!strcasecmp(str, "UDMA0")) return (ATA_UDMA0);
if (!strcasecmp(str, "UDMA16")) return (ATA_UDMA0);
if (!strcasecmp(str, "UDMA1")) return (ATA_UDMA1);
if (!strcasecmp(str, "UDMA25")) return (ATA_UDMA1);
if (!strcasecmp(str, "UDMA2")) return (ATA_UDMA2);
if (!strcasecmp(str, "UDMA33")) return (ATA_UDMA2);
if (!strcasecmp(str, "UDMA3")) return (ATA_UDMA3);
if (!strcasecmp(str, "UDMA44")) return (ATA_UDMA3);
if (!strcasecmp(str, "UDMA4")) return (ATA_UDMA4);
if (!strcasecmp(str, "UDMA66")) return (ATA_UDMA4);
if (!strcasecmp(str, "UDMA5")) return (ATA_UDMA5);
if (!strcasecmp(str, "UDMA100")) return (ATA_UDMA5);
if (!strcasecmp(str, "UDMA6")) return (ATA_UDMA6);
if (!strcasecmp(str, "UDMA133")) return (ATA_UDMA6);
return (-1);
}
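/*
* Illustrative sketch (a hypothetical self-check, not in the driver):
* ata_str2mode() accepts both the bare ("UDMA5") and the speed-rated
* ("UDMA100") spellings, so the PIO/WDMA/UDMA names printed by
* ata_mode2str() parse back to the same mode value.
*/
#if 0
static void
ata_modestr_selftest(void)
{
KASSERT(ata_str2mode(ata_mode2str(ATA_UDMA5)) == ATA_UDMA5,
("mode name round-trip broken"));
KASSERT(ata_str2mode("UDMA100") == ATA_UDMA5,
("alias spelling must map to the same mode"));
}
#endif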
int
ata_atapi(device_t dev, int target)
{
struct ata_channel *ch = device_get_softc(dev);
return (ch->devices & (ATA_ATAPI_MASTER << target));
}
void
ata_timeout(void *arg)
{
struct ata_request *request;
struct ata_channel *ch;
request = arg;
ch = device_get_softc(request->parent);
//request->flags |= ATA_R_DEBUG;
ATA_DEBUG_RQ(request, "timeout");
/*
* If we have an ATA_ACTIVE request running, we flag the request
* ATA_R_TIMEOUT so ata_cam_end_transaction() will handle it correctly.
* Also, NULL out the running request so we won't lose the race with
* an eventual interrupt arriving late.
*/
if (ch->state == ATA_ACTIVE) {
request->flags |= ATA_R_TIMEOUT;
if (ch->dma.unload)
ch->dma.unload(request);
ch->running = NULL;
ch->state = ATA_IDLE;
ata_cam_end_transaction(ch->dev, request);
}
mtx_unlock(&ch->state_mtx);
}
static void
ata_cam_begin_transaction(device_t dev, union ccb *ccb)
{
struct ata_channel *ch = device_get_softc(dev);
struct ata_request *request;
request = &ch->request;
bzero(request, sizeof(*request));
/* setup request */
request->dev = NULL;
request->parent = dev;
request->unit = ccb->ccb_h.target_id;
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
request->data = ccb->ataio.data_ptr;
request->bytecount = ccb->ataio.dxfer_len;
request->u.ata.command = ccb->ataio.cmd.command;
request->u.ata.feature = ((uint16_t)ccb->ataio.cmd.features_exp << 8) |
(uint16_t)ccb->ataio.cmd.features;
request->u.ata.count = ((uint16_t)ccb->ataio.cmd.sector_count_exp << 8) |
(uint16_t)ccb->ataio.cmd.sector_count;
if (ccb->ataio.cmd.flags & CAM_ATAIO_48BIT) {
request->flags |= ATA_R_48BIT;
request->u.ata.lba =
((uint64_t)ccb->ataio.cmd.lba_high_exp << 40) |
((uint64_t)ccb->ataio.cmd.lba_mid_exp << 32) |
((uint64_t)ccb->ataio.cmd.lba_low_exp << 24);
} else {
request->u.ata.lba =
((uint64_t)(ccb->ataio.cmd.device & 0x0f) << 24);
}
request->u.ata.lba |= ((uint64_t)ccb->ataio.cmd.lba_high << 16) |
((uint64_t)ccb->ataio.cmd.lba_mid << 8) |
(uint64_t)ccb->ataio.cmd.lba_low;
if (ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)
request->flags |= ATA_R_NEEDRESULT;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ccb->ataio.cmd.flags & CAM_ATAIO_DMA)
request->flags |= ATA_R_DMA;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
request->flags |= ATA_R_READ;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
request->flags |= ATA_R_WRITE;
if (ccb->ataio.cmd.command == ATA_READ_MUL ||
ccb->ataio.cmd.command == ATA_READ_MUL48 ||
ccb->ataio.cmd.command == ATA_WRITE_MUL ||
ccb->ataio.cmd.command == ATA_WRITE_MUL48) {
request->transfersize = min(request->bytecount,
ch->curr[ccb->ccb_h.target_id].bytecount);
} else
request->transfersize = min(request->bytecount, 512);
} else {
request->data = ccb->csio.data_ptr;
request->bytecount = ccb->csio.dxfer_len;
bcopy((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes,
request->u.atapi.ccb, ccb->csio.cdb_len);
request->flags |= ATA_R_ATAPI;
if (ch->curr[ccb->ccb_h.target_id].atapi == 16)
request->flags |= ATA_R_ATAPI16;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
request->flags |= ATA_R_DMA;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
request->flags |= ATA_R_READ;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
request->flags |= ATA_R_WRITE;
request->transfersize = min(request->bytecount,
ch->curr[ccb->ccb_h.target_id].bytecount);
}
request->retries = 0;
request->timeout = (ccb->ccb_h.timeout + 999) / 1000;
callout_init_mtx(&request->callout, &ch->state_mtx, CALLOUT_RETURNUNLOCKED);
request->ccb = ccb;
request->flags |= ATA_R_DATA_IN_CCB;
ch->running = request;
ch->state = ATA_ACTIVE;
if (ch->hw.begin_transaction(request) == ATA_OP_FINISHED) {
ch->running = NULL;
ch->state = ATA_IDLE;
ata_cam_end_transaction(dev, request);
return;
}
}
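/*
* Illustrative sketch (a hypothetical helper, not in the driver): for a
* 48-bit command the six LBA bytes are split across the current and
* previous ("_exp") register contents, e.g. LBA 0x876543210fed travels as
* lba_low=0xed lba_mid=0x0f lba_high=0x21 lba_low_exp=0x43 lba_mid_exp=0x65
* lba_high_exp=0x87.  The assembly below mirrors the code above.
*/
#if 0
static uint64_t
ata_cmd_to_lba48(const struct ata_cmd *cmd)
{
uint64_t lba;

/* bits 24..47 come from the *_exp registers */
lba = ((uint64_t)cmd->lba_high_exp << 40) |
((uint64_t)cmd->lba_mid_exp << 32) |
((uint64_t)cmd->lba_low_exp << 24);
/* bits 0..23 come from the regular registers */
lba |= ((uint64_t)cmd->lba_high << 16) |
((uint64_t)cmd->lba_mid << 8) |
(uint64_t)cmd->lba_low;
return (lba);
}
#endif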
static void
ata_cam_request_sense(device_t dev, struct ata_request *request)
{
struct ata_channel *ch = device_get_softc(dev);
union ccb *ccb = request->ccb;
ch->requestsense = 1;
bzero(request, sizeof(*request));
request->dev = NULL;
request->parent = dev;
request->unit = ccb->ccb_h.target_id;
request->data = (void *)&ccb->csio.sense_data;
request->bytecount = ccb->csio.sense_len;
request->u.atapi.ccb[0] = ATAPI_REQUEST_SENSE;
request->u.atapi.ccb[4] = ccb->csio.sense_len;
request->flags |= ATA_R_ATAPI;
if (ch->curr[ccb->ccb_h.target_id].atapi == 16)
request->flags |= ATA_R_ATAPI16;
if (ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
request->flags |= ATA_R_DMA;
request->flags |= ATA_R_READ;
request->transfersize = min(request->bytecount,
ch->curr[ccb->ccb_h.target_id].bytecount);
request->retries = 0;
request->timeout = (ccb->ccb_h.timeout + 999) / 1000;
callout_init_mtx(&request->callout, &ch->state_mtx, CALLOUT_RETURNUNLOCKED);
request->ccb = ccb;
ch->running = request;
ch->state = ATA_ACTIVE;
if (ch->hw.begin_transaction(request) == ATA_OP_FINISHED) {
ch->running = NULL;
ch->state = ATA_IDLE;
ata_cam_end_transaction(dev, request);
return;
}
}
static void
ata_cam_process_sense(device_t dev, struct ata_request *request)
{
struct ata_channel *ch = device_get_softc(dev);
union ccb *ccb = request->ccb;
int fatalerr = 0;
ch->requestsense = 0;
if (request->flags & ATA_R_TIMEOUT)
fatalerr = 1;
if ((request->flags & ATA_R_TIMEOUT) == 0 &&
(request->status & ATA_S_ERROR) == 0 &&
request->result == 0) {
ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
} else {
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
xpt_done(ccb);
/* Do error recovery if needed. */
if (fatalerr)
ata_reinit(dev);
}
static void
ata_cam_end_transaction(device_t dev, struct ata_request *request)
{
struct ata_channel *ch = device_get_softc(dev);
union ccb *ccb = request->ccb;
int fatalerr = 0;
if (ch->requestsense) {
ata_cam_process_sense(dev, request);
return;
}
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
if (request->flags & ATA_R_TIMEOUT) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_CMD_TIMEOUT | CAM_RELEASE_SIMQ;
fatalerr = 1;
} else if (request->status & ATA_S_ERROR) {
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
} else {
ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
}
} else if (request->result == ERESTART)
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
else if (request->result != 0)
ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
else
ccb->ccb_h.status |= CAM_REQ_CMP;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP &&
!(ccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status |= CAM_DEV_QFRZN;
}
if (ccb->ccb_h.func_code == XPT_ATA_IO &&
((request->status & ATA_S_ERROR) ||
(ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT))) {
struct ata_res *res = &ccb->ataio.res;
res->status = request->status;
res->error = request->error;
res->lba_low = request->u.ata.lba;
res->lba_mid = request->u.ata.lba >> 8;
res->lba_high = request->u.ata.lba >> 16;
res->device = request->u.ata.lba >> 24;
res->lba_low_exp = request->u.ata.lba >> 24;
res->lba_mid_exp = request->u.ata.lba >> 32;
res->lba_high_exp = request->u.ata.lba >> 40;
res->sector_count = request->u.ata.count;
res->sector_count_exp = request->u.ata.count >> 8;
}
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
ccb->ataio.resid =
ccb->ataio.dxfer_len - request->donecount;
} else {
ccb->csio.resid =
ccb->csio.dxfer_len - request->donecount;
}
}
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)
ata_cam_request_sense(dev, request);
else
xpt_done(ccb);
/* Do error recovery if needed. */
if (fatalerr)
ata_reinit(dev);
}
static int
ata_check_ids(device_t dev, union ccb *ccb)
{
struct ata_channel *ch = device_get_softc(dev);
if (ccb->ccb_h.target_id > ((ch->flags & ATA_NO_SLAVE) ? 0 : 1)) {
ccb->ccb_h.status = CAM_TID_INVALID;
xpt_done(ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
xpt_done(ccb);
return (-1);
}
/*
* It's a programming error to see AUXILIARY register requests.
*/
KASSERT(ccb->ccb_h.func_code != XPT_ATA_IO ||
((ccb->ataio.ata_flags & ATA_FLAG_AUX) == 0),
("AUX register unsupported"));
return (0);
}
static void
ataaction(struct cam_sim *sim, union ccb *ccb)
{
device_t dev, parent;
struct ata_channel *ch;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("ataaction func_code=%x\n",
ccb->ccb_h.func_code));
ch = (struct ata_channel *)cam_sim_softc(sim);
dev = ch->dev;
switch (ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_ATA_IO: /* Execute the requested I/O operation */
case XPT_SCSI_IO:
if (ata_check_ids(dev, ccb))
return;
if ((ch->devices & ((ATA_ATA_MASTER | ATA_ATAPI_MASTER)
<< ccb->ccb_h.target_id)) == 0) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
if (ch->running)
device_printf(dev, "already running!\n");
if (ccb->ccb_h.func_code == XPT_ATA_IO &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET)) {
struct ata_res *res = &ccb->ataio.res;
bzero(res, sizeof(*res));
if (ch->devices & (ATA_ATA_MASTER << ccb->ccb_h.target_id)) {
res->lba_high = 0;
res->lba_mid = 0;
} else {
res->lba_high = 0xeb;
res->lba_mid = 0x14;
}
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
ata_cam_begin_transaction(dev, ccb);
return;
case XPT_ABORT: /* Abort the specified CCB */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_SET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ata_cam_device *d;
if (ata_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
if (ch->flags & ATA_SATA) {
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
d->revision = cts->xport_specific.sata.revision;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE) {
if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
d->mode = ATA_SETMODE(ch->dev,
ccb->ccb_h.target_id,
cts->xport_specific.sata.mode);
} else
d->mode = cts->xport_specific.sata.mode;
}
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT)
d->bytecount = min(8192, cts->xport_specific.sata.bytecount);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_ATAPI)
d->atapi = cts->xport_specific.sata.atapi;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
d->caps = cts->xport_specific.sata.caps;
} else {
if (cts->xport_specific.ata.valid & CTS_ATA_VALID_MODE) {
if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
d->mode = ATA_SETMODE(ch->dev,
ccb->ccb_h.target_id,
cts->xport_specific.ata.mode);
} else
d->mode = cts->xport_specific.ata.mode;
}
if (cts->xport_specific.ata.valid & CTS_ATA_VALID_BYTECOUNT)
d->bytecount = cts->xport_specific.ata.bytecount;
if (cts->xport_specific.ata.valid & CTS_ATA_VALID_ATAPI)
d->atapi = cts->xport_specific.ata.atapi;
if (cts->xport_specific.ata.valid & CTS_ATA_VALID_CAPS)
d->caps = cts->xport_specific.ata.caps;
}
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ata_cam_device *d;
if (ata_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
cts->protocol = PROTO_UNSPECIFIED;
cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
if (ch->flags & ATA_SATA) {
cts->transport = XPORT_SATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->xport_specific.sata.valid = 0;
cts->xport_specific.sata.mode = d->mode;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
cts->xport_specific.sata.bytecount = d->bytecount;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
cts->xport_specific.sata.revision =
ATA_GETREV(dev, ccb->ccb_h.target_id);
if (cts->xport_specific.sata.revision != 0xff) {
cts->xport_specific.sata.valid |=
CTS_SATA_VALID_REVISION;
}
cts->xport_specific.sata.caps =
d->caps & CTS_SATA_CAPS_D;
if (ch->pm_level) {
cts->xport_specific.sata.caps |=
CTS_SATA_CAPS_H_PMREQ;
}
cts->xport_specific.sata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
} else {
cts->xport_specific.sata.revision = d->revision;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
cts->xport_specific.sata.caps = d->caps;
}
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
cts->xport_specific.sata.atapi = d->atapi;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
} else {
cts->transport = XPORT_ATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->xport_specific.ata.valid = 0;
cts->xport_specific.ata.mode = d->mode;
cts->xport_specific.ata.valid |= CTS_ATA_VALID_MODE;
cts->xport_specific.ata.bytecount = d->bytecount;
cts->xport_specific.ata.valid |= CTS_ATA_VALID_BYTECOUNT;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
cts->xport_specific.ata.caps =
d->caps & CTS_ATA_CAPS_D;
if (!(ch->flags & ATA_NO_48BIT_DMA))
cts->xport_specific.ata.caps |=
CTS_ATA_CAPS_H_DMA48;
cts->xport_specific.ata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
} else
cts->xport_specific.ata.caps = d->caps;
cts->xport_specific.ata.valid |= CTS_ATA_VALID_CAPS;
cts->xport_specific.ata.atapi = d->atapi;
cts->xport_specific.ata.valid |= CTS_ATA_VALID_ATAPI;
}
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified SCSI device */
ata_reinit(dev);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_TERM_IO: /* Terminate the I/O process */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
parent = device_get_parent(dev);
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED;
cpi->hba_eng_cnt = 0;
if (ch->flags & ATA_NO_SLAVE)
cpi->max_target = 0;
else
cpi->max_target = 1;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
if (ch->flags & ATA_SATA)
cpi->base_transfer_speed = 150000;
else
cpi->base_transfer_speed = 3300;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "ATA", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
if (ch->flags & ATA_SATA)
cpi->transport = XPORT_SATA;
else
cpi->transport = XPORT_ATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
cpi->maxio = ch->dma.max_iosize ? ch->dma.max_iosize : DFLTPHYS;
if (device_get_devclass(device_get_parent(parent)) ==
devclass_find("pci")) {
cpi->hba_vendor = pci_get_vendor(parent);
cpi->hba_device = pci_get_device(parent);
cpi->hba_subvendor = pci_get_subvendor(parent);
cpi->hba_subdevice = pci_get_subdevice(parent);
}
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
}
static void
atapoll(struct cam_sim *sim)
{
struct ata_channel *ch = (struct ata_channel *)cam_sim_softc(sim);
ata_interrupt_locked(ch);
}
/*
* module handling
*/
static int
ata_module_event_handler(module_t mod, int what, void *arg)
{
switch (what) {
case MOD_LOAD:
return 0;
case MOD_UNLOAD:
return 0;
default:
return EOPNOTSUPP;
}
}
static moduledata_t ata_moduledata = { "ata", ata_module_event_handler, NULL };
DECLARE_MODULE(ata, ata_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
MODULE_VERSION(ata, 1);
MODULE_DEPEND(ata, cam, 1, 1, 1);
diff --git a/sys/dev/ata/ata-all.h b/sys/dev/ata/ata-all.h
index d3a381e594fd..da9419b2ed32 100644
--- a/sys/dev/ata/ata-all.h
+++ b/sys/dev/ata/ata-all.h
@@ -1,588 +1,588 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#if 0
#define ATA_LEGACY_SUPPORT /* Enable obsolete features that break
* some modern devices */
#endif
/* ATA register defines */
#define ATA_DATA 0 /* (RW) data */
#define ATA_FEATURE 1 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_COUNT 2 /* (W) sector count */
#define ATA_SECTOR 3 /* (RW) sector # */
#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 7 /* (W) command */
#define ATA_ERROR 8 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_IREASON 9 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_STATUS 10 /* (R) status */
#define ATA_ALTSTAT 11 /* (R) alternate status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 12 /* (W) control */
#define ATA_CTLOFFSET 0x206 /* control register offset */
#define ATA_PCCARD_CTLOFFSET 0x0e /* do for PCCARD devices */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#ifdef ATA_LEGACY_SUPPORT
#define ATA_A_4BIT 0x08 /* 4 head bits: obsolete 1996 */
#else
#define ATA_A_4BIT 0x00
#endif
#define ATA_A_HOB 0x80 /* High Order Byte enable */
/* SATA register defines */
#define ATA_SSTATUS 13
#define ATA_SS_DET_MASK 0x0000000f
#define ATA_SS_DET_NO_DEVICE 0x00000000
#define ATA_SS_DET_DEV_PRESENT 0x00000001
#define ATA_SS_DET_PHY_ONLINE 0x00000003
#define ATA_SS_DET_PHY_OFFLINE 0x00000004
#define ATA_SS_SPD_MASK 0x000000f0
#define ATA_SS_SPD_NO_SPEED 0x00000000
#define ATA_SS_SPD_GEN1 0x00000010
#define ATA_SS_SPD_GEN2 0x00000020
#define ATA_SS_SPD_GEN3 0x00000030
#define ATA_SS_IPM_MASK 0x00000f00
#define ATA_SS_IPM_NO_DEVICE 0x00000000
#define ATA_SS_IPM_ACTIVE 0x00000100
#define ATA_SS_IPM_PARTIAL 0x00000200
#define ATA_SS_IPM_SLUMBER 0x00000600
#define ATA_SERROR 14
#define ATA_SE_DATA_CORRECTED 0x00000001
#define ATA_SE_COMM_CORRECTED 0x00000002
#define ATA_SE_DATA_ERR 0x00000100
#define ATA_SE_COMM_ERR 0x00000200
#define ATA_SE_PROT_ERR 0x00000400
#define ATA_SE_HOST_ERR 0x00000800
#define ATA_SE_PHY_CHANGED 0x00010000
#define ATA_SE_PHY_IERROR 0x00020000
#define ATA_SE_COMM_WAKE 0x00040000
#define ATA_SE_DECODE_ERR 0x00080000
#define ATA_SE_PARITY_ERR 0x00100000
#define ATA_SE_CRC_ERR 0x00200000
#define ATA_SE_HANDSHAKE_ERR 0x00400000
#define ATA_SE_LINKSEQ_ERR 0x00800000
#define ATA_SE_TRANSPORT_ERR 0x01000000
#define ATA_SE_UNKNOWN_FIS 0x02000000
#define ATA_SCONTROL 15
#define ATA_SC_DET_MASK 0x0000000f
#define ATA_SC_DET_IDLE 0x00000000
#define ATA_SC_DET_RESET 0x00000001
#define ATA_SC_DET_DISABLE 0x00000004
#define ATA_SC_SPD_MASK 0x000000f0
#define ATA_SC_SPD_NO_SPEED 0x00000000
#define ATA_SC_SPD_SPEED_GEN1 0x00000010
#define ATA_SC_SPD_SPEED_GEN2 0x00000020
#define ATA_SC_SPD_SPEED_GEN3 0x00000030
#define ATA_SC_IPM_MASK 0x00000f00
#define ATA_SC_IPM_NONE 0x00000000
#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
#define ATA_SACTIVE 16
/* DMA register defines */
-#define ATA_DMA_ENTRIES 256
+#define ATA_DMA_ENTRIES MAX(17, btoc(maxphys) + 1)
#define ATA_DMA_EOT 0x80000000
#define ATA_BMCMD_PORT 17
#define ATA_BMCMD_START_STOP 0x01
#define ATA_BMCMD_WRITE_READ 0x08
#define ATA_BMDEVSPEC_0 18
#define ATA_BMSTAT_PORT 19
#define ATA_BMSTAT_ACTIVE 0x01
#define ATA_BMSTAT_ERROR 0x02
#define ATA_BMSTAT_INTERRUPT 0x04
#define ATA_BMSTAT_MASK 0x07
#define ATA_BMSTAT_DMA_MASTER 0x20
#define ATA_BMSTAT_DMA_SLAVE 0x40
#define ATA_BMSTAT_DMA_SIMPLEX 0x80
#define ATA_BMDEVSPEC_1 20
#define ATA_BMDTP_PORT 21
#define ATA_IDX_ADDR 22
#define ATA_IDX_DATA 23
#define ATA_MAX_RES 24
/* misc defines */
#define ATA_PRIMARY 0x1f0
#define ATA_SECONDARY 0x170
#define ATA_IOSIZE 0x08
#define ATA_CTLIOSIZE 0x01
#define ATA_BMIOSIZE 0x08
#define ATA_IOADDR_RID 0
#define ATA_CTLADDR_RID 1
#define ATA_BMADDR_RID 0x20
#define ATA_IRQ_RID 0
#define ATA_DEV(unit) ((unit > 0) ? 0x10 : 0)
#define ATA_CFA_MAGIC1 0x844A
#define ATA_CFA_MAGIC2 0x848A
#define ATA_CFA_MAGIC3 0x8400
#define ATAPI_MAGIC_LSB 0x14
#define ATAPI_MAGIC_MSB 0xeb
#define ATAPI_P_READ (ATA_S_DRQ | ATA_I_IN)
#define ATAPI_P_WRITE (ATA_S_DRQ)
#define ATAPI_P_CMDOUT (ATA_S_DRQ | ATA_I_CMD)
#define ATAPI_P_DONEDRQ (ATA_S_DRQ | ATA_I_CMD | ATA_I_IN)
#define ATAPI_P_DONE (ATA_I_CMD | ATA_I_IN)
#define ATAPI_P_ABORT 0
#define ATA_INTR_FLAGS (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
#define ATA_OP_CONTINUES 0
#define ATA_OP_FINISHED 1
#define ATA_MAX_28BIT_LBA 268435455UL
/* structure used for composite atomic operations */
#define MAX_COMPOSITES 32 /* u_int32_t bits */
struct ata_composite {
struct mtx lock; /* control lock */
u_int32_t rd_needed; /* needed read subdisks */
u_int32_t rd_done; /* done read subdisks */
u_int32_t wr_needed; /* needed write subdisks */
u_int32_t wr_depend; /* write depends on subdisks */
u_int32_t wr_done; /* done write subdisks */
struct ata_request *request[MAX_COMPOSITES];
u_int32_t residual; /* bytes still to transfer */
caddr_t data_1;
caddr_t data_2;
};
/* structure used to queue an ATA/ATAPI request */
struct ata_request {
device_t dev; /* device handle */
device_t parent; /* channel handle */
int unit; /* physical unit */
union {
struct {
u_int8_t command; /* command reg */
u_int16_t feature; /* feature reg */
u_int16_t count; /* count reg */
u_int64_t lba; /* lba reg */
} ata;
struct {
u_int8_t ccb[16]; /* ATAPI command block */
struct atapi_sense sense; /* ATAPI request sense data */
u_int8_t saved_cmd; /* ATAPI saved command */
} atapi;
} u;
u_int32_t bytecount; /* bytes to transfer */
u_int32_t transfersize; /* bytes per transfer */
caddr_t data; /* pointer to data buf */
u_int32_t tag; /* HW tag of this request */
int flags;
#define ATA_R_CONTROL 0x00000001
#define ATA_R_READ 0x00000002
#define ATA_R_WRITE 0x00000004
#define ATA_R_ATAPI 0x00000008
#define ATA_R_DMA 0x00000010
#define ATA_R_QUIET 0x00000020
#define ATA_R_TIMEOUT 0x00000040
#define ATA_R_48BIT 0x00000080
#define ATA_R_ORDERED 0x00000100
#define ATA_R_AT_HEAD 0x00000200
#define ATA_R_REQUEUE 0x00000400
#define ATA_R_THREAD 0x00000800
#define ATA_R_DIRECT 0x00001000
#define ATA_R_NEEDRESULT 0x00002000
#define ATA_R_DATA_IN_CCB 0x00004000
#define ATA_R_ATAPI16 0x00010000
#define ATA_R_ATAPI_INTR 0x00020000
#define ATA_R_DEBUG 0x10000000
#define ATA_R_DANGER1 0x20000000
#define ATA_R_DANGER2 0x40000000
struct ata_dmaslot *dma; /* DMA slot of this request */
u_int8_t status; /* ATA status */
u_int8_t error; /* ATA error */
u_int32_t donecount; /* bytes transferred */
int result; /* result error code */
void (*callback)(struct ata_request *request);
struct sema done; /* request done sema */
int retries; /* retry count */
int timeout; /* timeout for this cmd */
struct callout callout; /* callout management */
struct task task; /* task management */
struct bio *bio; /* bio for this request */
int this; /* this request ID */
struct ata_composite *composite; /* for composite atomic ops */
void *driver; /* driver specific */
TAILQ_ENTRY(ata_request) chain; /* list management */
union ccb *ccb;
};
/* define this for debugging request processing */
#if 0
#define ATA_DEBUG_RQ(request, string) \
{ \
if (request->flags & ATA_R_DEBUG) \
device_printf(request->parent, "req=%p %s " string "\n", \
request, ata_cmd2str(request)); \
}
#else
#define ATA_DEBUG_RQ(request, string)
#endif
/* structure describing an ATA/ATAPI device */
struct ata_device {
device_t dev; /* device handle */
int unit; /* physical unit */
#define ATA_MASTER 0x00
#define ATA_SLAVE 0x01
#define ATA_PM 0x0f
struct ata_params param; /* ata param structure */
int mode; /* current transfermode */
u_int32_t max_iosize; /* max IO size */
int spindown; /* idle spindown timeout */
struct callout spindown_timer;
int spindown_state;
int flags;
#define ATA_D_USE_CHS 0x0001
#define ATA_D_MEDIA_CHANGED 0x0002
#define ATA_D_ENC_PRESENT 0x0004
};
/* structure for holding DMA Physical Region Descriptors (PRD) entries */
struct ata_dma_prdentry {
u_int32_t addr;
u_int32_t count;
};
/* structure used by the setprd function */
struct ata_dmasetprd_args {
void *dmatab;
int nsegs;
int error;
};
struct ata_dmaslot {
u_int8_t status; /* DMA status */
bus_dma_tag_t sg_tag; /* SG list DMA tag */
bus_dmamap_t sg_map; /* SG list DMA map */
void *sg; /* DMA transfer table */
bus_addr_t sg_bus; /* bus address of dmatab */
bus_dma_tag_t data_tag; /* data DMA tag */
bus_dmamap_t data_map; /* data DMA map */
};
/* structure holding DMA related information */
struct ata_dma {
bus_dma_tag_t dmatag; /* parent DMA tag */
bus_dma_tag_t work_tag; /* workspace DMA tag */
bus_dmamap_t work_map; /* workspace DMA map */
u_int8_t *work; /* workspace */
bus_addr_t work_bus; /* bus address of dmatab */
#define ATA_DMA_SLOTS 1
int dma_slots; /* DMA slots allocated */
struct ata_dmaslot slot[ATA_DMA_SLOTS];
u_int32_t alignment; /* DMA SG list alignment */
u_int32_t boundary; /* DMA SG list boundary */
u_int32_t segsize; /* DMA SG list segment size */
u_int32_t max_iosize; /* DMA data max IO size */
u_int64_t max_address; /* highest DMA'able address */
int flags;
#define ATA_DMA_ACTIVE 0x01 /* DMA transfer in progress */
void (*alloc)(device_t dev);
void (*free)(device_t dev);
void (*setprd)(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
int (*load)(struct ata_request *request, void *addr, int *nsegs);
int (*unload)(struct ata_request *request);
int (*start)(struct ata_request *request);
int (*stop)(struct ata_request *request);
void (*reset)(device_t dev);
};
/* structure holding lowlevel functions */
struct ata_lowlevel {
u_int32_t (*softreset)(device_t dev, int pmport);
int (*pm_read)(device_t dev, int port, int reg, u_int32_t *result);
int (*pm_write)(device_t dev, int port, int reg, u_int32_t value);
int (*status)(device_t dev);
int (*begin_transaction)(struct ata_request *request);
int (*end_transaction)(struct ata_request *request);
int (*command)(struct ata_request *request);
void (*tf_read)(struct ata_request *request);
void (*tf_write)(struct ata_request *request);
};
/* structure holding resources for an ATA channel */
struct ata_resource {
struct resource *res;
int offset;
};
struct ata_cam_device {
u_int revision;
int mode;
u_int bytecount;
u_int atapi;
u_int caps;
};
/* structure describing an ATA channel */
struct ata_channel {
device_t dev; /* device handle */
int unit; /* physical channel */
int attached; /* channel is attached */
struct ata_resource r_io[ATA_MAX_RES];/* I/O resources */
struct resource *r_irq; /* interrupt of this channel */
void *ih; /* interrupt handle */
struct ata_lowlevel hw; /* lowlevel HW functions */
struct ata_dma dma; /* DMA data / functions */
int flags; /* channel flags */
#define ATA_NO_SLAVE 0x01
#define ATA_USE_16BIT 0x02
#define ATA_ATAPI_DMA_RO 0x04
#define ATA_NO_48BIT_DMA 0x08
#define ATA_ALWAYS_DMASTAT 0x10
#define ATA_CHECKS_CABLE 0x20
#define ATA_NO_ATAPI_DMA 0x40
#define ATA_SATA 0x80
#define ATA_DMA_BEFORE_CMD 0x100
#define ATA_KNOWN_PRESENCE 0x200
#define ATA_STATUS_IS_LONG 0x400
#define ATA_PERIODIC_POLL 0x800
int pm_level; /* power management level */
int devices; /* what is present */
#define ATA_ATA_MASTER 0x00000001
#define ATA_ATA_SLAVE 0x00000002
#define ATA_PORTMULTIPLIER 0x00008000
#define ATA_ATAPI_MASTER 0x00010000
#define ATA_ATAPI_SLAVE 0x00020000
struct mtx state_mtx; /* state lock */
int state; /* ATA channel state */
#define ATA_IDLE 0x0000
#define ATA_ACTIVE 0x0001
#define ATA_STALL_QUEUE 0x0002
struct ata_request *running; /* currently running request */
struct task conntask; /* PHY events handling task */
struct cam_sim *sim;
struct cam_path *path;
struct ata_cam_device user[16]; /* User-specified settings */
struct ata_cam_device curr[16]; /* Current settings */
int requestsense; /* CCB waiting for SENSE. */
struct callout poll_callout; /* Periodic status poll. */
struct ata_request request;
};
/* disk bay/enclosure related */
#define ATA_LED_OFF 0x00
#define ATA_LED_RED 0x01
#define ATA_LED_GREEN 0x02
#define ATA_LED_ORANGE 0x03
#define ATA_LED_MASK 0x03
/* externs */
extern int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data);
extern struct intr_config_hook *ata_delayed_attach;
extern devclass_t ata_devclass;
extern int ata_wc;
extern int ata_setmax;
extern int ata_dma_check_80pin;
/* public prototypes */
/* ata-all.c: */
int ata_probe(device_t dev);
int ata_attach(device_t dev);
int ata_detach(device_t dev);
int ata_reinit(device_t dev);
int ata_suspend(device_t dev);
int ata_resume(device_t dev);
void ata_interrupt(void *data);
int ata_getparam(struct ata_device *atadev, int init);
void ata_default_registers(device_t dev);
void ata_udelay(int interval);
const char *ata_cmd2str(struct ata_request *request);
const char *ata_mode2str(int mode);
void ata_setmode(device_t dev);
void ata_print_cable(device_t dev, u_int8_t *who);
int ata_atapi(device_t dev, int target);
void ata_timeout(void *);
/* ata-lowlevel.c: */
void ata_generic_hw(device_t dev);
int ata_begin_transaction(struct ata_request *);
int ata_end_transaction(struct ata_request *);
void ata_generic_reset(device_t dev);
int ata_generic_command(struct ata_request *request);
/* ata-dma.c: */
void ata_dmainit(device_t);
void ata_dmafini(device_t dev);
/* ata-sata.c: */
void ata_sata_phy_check_events(device_t dev, int port);
int ata_sata_scr_read(struct ata_channel *ch, int port, int reg, uint32_t *val);
int ata_sata_scr_write(struct ata_channel *ch, int port, int reg, uint32_t val);
int ata_sata_phy_reset(device_t dev, int port, int quick);
int ata_sata_setmode(device_t dev, int target, int mode);
int ata_sata_getrev(device_t dev, int target);
int ata_request2fis_h2d(struct ata_request *request, u_int8_t *fis);
void ata_pm_identify(device_t dev);
MALLOC_DECLARE(M_ATA);
/* misc newbus defines */
#define GRANDPARENT(dev) device_get_parent(device_get_parent(dev))
/* macros to hide busspace ugliness */
#define ATA_INB(res, offset) \
bus_read_1((res), (offset))
#define ATA_INW(res, offset) \
bus_read_2((res), (offset))
#define ATA_INW_STRM(res, offset) \
bus_read_stream_2((res), (offset))
#define ATA_INL(res, offset) \
bus_read_4((res), (offset))
#define ATA_INSW(res, offset, addr, count) \
bus_read_multi_2((res), (offset), (addr), (count))
#define ATA_INSW_STRM(res, offset, addr, count) \
bus_read_multi_stream_2((res), (offset), (addr), (count))
#define ATA_INSL(res, offset, addr, count) \
bus_read_multi_4((res), (offset), (addr), (count))
#define ATA_INSL_STRM(res, offset, addr, count) \
bus_read_multi_stream_4((res), (offset), (addr), (count))
#define ATA_OUTB(res, offset, value) \
bus_write_1((res), (offset), (value))
#define ATA_OUTW(res, offset, value) \
bus_write_2((res), (offset), (value))
#define ATA_OUTW_STRM(res, offset, value) \
bus_write_stream_2((res), (offset), (value))
#define ATA_OUTL(res, offset, value) \
bus_write_4((res), (offset), (value))
#define ATA_OUTSW(res, offset, addr, count) \
bus_write_multi_2((res), (offset), (addr), (count))
#define ATA_OUTSW_STRM(res, offset, addr, count) \
bus_write_multi_stream_2((res), (offset), (addr), (count))
#define ATA_OUTSL(res, offset, addr, count) \
bus_write_multi_4((res), (offset), (addr), (count))
#define ATA_OUTSL_STRM(res, offset, addr, count) \
bus_write_multi_stream_4((res), (offset), (addr), (count))
#define ATA_IDX_INB(ch, idx) \
ATA_INB(ch->r_io[idx].res, ch->r_io[idx].offset)
#define ATA_IDX_INW(ch, idx) \
ATA_INW(ch->r_io[idx].res, ch->r_io[idx].offset)
#define ATA_IDX_INW_STRM(ch, idx) \
ATA_INW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset)
#define ATA_IDX_INL(ch, idx) \
ATA_INL(ch->r_io[idx].res, ch->r_io[idx].offset)
#define ATA_IDX_INSW(ch, idx, addr, count) \
ATA_INSW(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_INSW_STRM(ch, idx, addr, count) \
ATA_INSW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_INSL(ch, idx, addr, count) \
ATA_INSL(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_INSL_STRM(ch, idx, addr, count) \
ATA_INSL_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_OUTB(ch, idx, value) \
ATA_OUTB(ch->r_io[idx].res, ch->r_io[idx].offset, value)
#define ATA_IDX_OUTW(ch, idx, value) \
ATA_OUTW(ch->r_io[idx].res, ch->r_io[idx].offset, value)
#define ATA_IDX_OUTW_STRM(ch, idx, value) \
ATA_OUTW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, value)
#define ATA_IDX_OUTL(ch, idx, value) \
ATA_OUTL(ch->r_io[idx].res, ch->r_io[idx].offset, value)
#define ATA_IDX_OUTSW(ch, idx, addr, count) \
ATA_OUTSW(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_OUTSW_STRM(ch, idx, addr, count) \
ATA_OUTSW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_OUTSL(ch, idx, addr, count) \
ATA_OUTSL(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
#define ATA_IDX_OUTSL_STRM(ch, idx, addr, count) \
ATA_OUTSL_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
diff --git a/sys/dev/ata/ata-dma.c b/sys/dev/ata/ata-dma.c
index f51d993bf26a..bd6ac72b7936 100644
--- a/sys/dev/ata/ata-dma.c
+++ b/sys/dev/ata/ata-dma.c
@@ -1,350 +1,350 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ata.h>
#include <sys/kernel.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/sema.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <dev/ata/ata-all.h>
/* prototypes */
static void ata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
static void ata_dmaalloc(device_t dev);
static void ata_dmafree(device_t dev);
static void ata_dmasetprd(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
static int ata_dmaload(struct ata_request *request, void *addr, int *nsegs);
static int ata_dmaunload(struct ata_request *request);
/* local vars */
static MALLOC_DEFINE(M_ATADMA, "ata_dma", "ATA driver DMA");
/* misc defines */
#define MAXTABSZ PAGE_SIZE
#define MAXWSPCSZ PAGE_SIZE*2
struct ata_dc_cb_args {
bus_addr_t maddr;
int error;
};
void
ata_dmainit(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
struct ata_dc_cb_args dcba;
if (ch->dma.alloc == NULL)
ch->dma.alloc = ata_dmaalloc;
if (ch->dma.free == NULL)
ch->dma.free = ata_dmafree;
if (ch->dma.setprd == NULL)
ch->dma.setprd = ata_dmasetprd;
if (ch->dma.load == NULL)
ch->dma.load = ata_dmaload;
if (ch->dma.unload == NULL)
ch->dma.unload = ata_dmaunload;
if (ch->dma.alignment == 0)
ch->dma.alignment = 2;
if (ch->dma.boundary == 0)
ch->dma.boundary = 65536;
if (ch->dma.segsize == 0)
ch->dma.segsize = 65536;
if (ch->dma.max_iosize == 0)
- ch->dma.max_iosize = MIN((ATA_DMA_ENTRIES - 1) * PAGE_SIZE, MAXPHYS);
+ ch->dma.max_iosize = (ATA_DMA_ENTRIES - 1) * PAGE_SIZE;
if (ch->dma.max_address == 0)
ch->dma.max_address = BUS_SPACE_MAXADDR_32BIT;
if (ch->dma.dma_slots == 0)
ch->dma.dma_slots = 1;
if (bus_dma_tag_create(bus_get_dma_tag(dev), ch->dma.alignment, 0,
ch->dma.max_address, BUS_SPACE_MAXADDR,
NULL, NULL, ch->dma.max_iosize,
ATA_DMA_ENTRIES, ch->dma.segsize,
0, NULL, NULL, &ch->dma.dmatag))
goto error;
if (bus_dma_tag_create(ch->dma.dmatag, PAGE_SIZE, 64 * 1024,
ch->dma.max_address, BUS_SPACE_MAXADDR,
NULL, NULL, MAXWSPCSZ, 1, MAXWSPCSZ,
0, NULL, NULL, &ch->dma.work_tag))
goto error;
if (bus_dmamem_alloc(ch->dma.work_tag, (void **)&ch->dma.work,
BUS_DMA_WAITOK | BUS_DMA_COHERENT,
&ch->dma.work_map))
goto error;
if (bus_dmamap_load(ch->dma.work_tag, ch->dma.work_map, ch->dma.work,
MAXWSPCSZ, ata_dmasetupc_cb, &dcba, 0) ||
dcba.error) {
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
goto error;
}
ch->dma.work_bus = dcba.maddr;
return;
error:
device_printf(dev, "WARNING - DMA initialization failed, disabling DMA\n");
ata_dmafini(dev);
}
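/*
* Illustrative sketch (a hypothetical self-check, not in the driver),
* assuming 4 KiB pages and the default maxphys of 1 MiB: btoc(maxphys) == 256,
* so ATA_DMA_ENTRIES becomes MAX(17, 257) == 257 PRD entries and the default
* max_iosize computed above is (257 - 1) * PAGE_SIZE == 1 MiB.  The extra
* entry leaves room for a buffer that is not page-aligned and therefore
* touches one more page.
*/
#if 0
static void
ata_dma_sizes_selftest(struct ata_channel *ch)
{
/* holds whenever no controller-specific max_iosize override was set */
KASSERT(ch->dma.max_iosize == (ATA_DMA_ENTRIES - 1) * PAGE_SIZE,
("PRD table cannot cover max_iosize"));
}
#endif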
void
ata_dmafini(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
if (ch->dma.work_bus) {
bus_dmamap_unload(ch->dma.work_tag, ch->dma.work_map);
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
ch->dma.work_bus = 0;
ch->dma.work = NULL;
}
if (ch->dma.work_tag) {
bus_dma_tag_destroy(ch->dma.work_tag);
ch->dma.work_tag = NULL;
}
if (ch->dma.dmatag) {
bus_dma_tag_destroy(ch->dma.dmatag);
ch->dma.dmatag = NULL;
}
}
static void
ata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct ata_dc_cb_args *dcba = (struct ata_dc_cb_args *)xsc;
if (!(dcba->error = error))
dcba->maddr = segs[0].ds_addr;
}
static void
ata_dmaalloc(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
struct ata_dc_cb_args dcba;
int i;
/* alloc and setup needed dma slots */
bzero(ch->dma.slot, sizeof(struct ata_dmaslot) * ATA_DMA_SLOTS);
for (i = 0; i < ch->dma.dma_slots; i++) {
struct ata_dmaslot *slot = &ch->dma.slot[i];
if (bus_dma_tag_create(ch->dma.dmatag, PAGE_SIZE, PAGE_SIZE,
ch->dma.max_address, BUS_SPACE_MAXADDR,
NULL, NULL, PAGE_SIZE, 1, PAGE_SIZE,
0, NULL, NULL, &slot->sg_tag)) {
device_printf(ch->dev, "FAILURE - create sg_tag\n");
goto error;
}
if (bus_dmamem_alloc(slot->sg_tag, (void **)&slot->sg, BUS_DMA_WAITOK,
&slot->sg_map)) {
device_printf(ch->dev, "FAILURE - alloc sg_map\n");
goto error;
}
if (bus_dmamap_load(slot->sg_tag, slot->sg_map, slot->sg, MAXTABSZ,
ata_dmasetupc_cb, &dcba, 0) || dcba.error) {
device_printf(ch->dev, "FAILURE - load sg\n");
goto error;
}
slot->sg_bus = dcba.maddr;
if (bus_dma_tag_create(ch->dma.dmatag,
ch->dma.alignment, ch->dma.boundary,
ch->dma.max_address, BUS_SPACE_MAXADDR,
NULL, NULL, ch->dma.max_iosize,
ATA_DMA_ENTRIES, ch->dma.segsize,
BUS_DMA_ALLOCNOW, NULL, NULL, &slot->data_tag)) {
device_printf(ch->dev, "FAILURE - create data_tag\n");
goto error;
}
if (bus_dmamap_create(slot->data_tag, 0, &slot->data_map)) {
device_printf(ch->dev, "FAILURE - create data_map\n");
goto error;
}
}
return;
error:
device_printf(dev, "WARNING - DMA allocation failed, disabling DMA\n");
ata_dmafree(dev);
}
static void
ata_dmafree(device_t dev)
{
struct ata_channel *ch = device_get_softc(dev);
int i;
/* free all dma slots */
for (i = 0; i < ATA_DMA_SLOTS; i++) {
struct ata_dmaslot *slot = &ch->dma.slot[i];
if (slot->sg_bus) {
bus_dmamap_unload(slot->sg_tag, slot->sg_map);
slot->sg_bus = 0;
}
if (slot->sg) {
bus_dmamem_free(slot->sg_tag, slot->sg, slot->sg_map);
slot->sg = NULL;
}
if (slot->data_map) {
bus_dmamap_destroy(slot->data_tag, slot->data_map);
slot->data_map = NULL;
}
if (slot->sg_tag) {
bus_dma_tag_destroy(slot->sg_tag);
slot->sg_tag = NULL;
}
if (slot->data_tag) {
bus_dma_tag_destroy(slot->data_tag);
slot->data_tag = NULL;
}
}
}
static void
ata_dmasetprd(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct ata_dmasetprd_args *args = xsc;
struct ata_dma_prdentry *prd = args->dmatab;
int i;
if ((args->error = error))
return;
for (i = 0; i < nsegs; i++) {
prd[i].addr = htole32(segs[i].ds_addr);
prd[i].count = htole32(segs[i].ds_len);
}
prd[i - 1].count |= htole32(ATA_DMA_EOT);
KASSERT(nsegs <= ATA_DMA_ENTRIES, ("too many DMA segment entries\n"));
args->nsegs = nsegs;
}
static int
ata_dmaload(struct ata_request *request, void *addr, int *entries)
{
struct ata_channel *ch = device_get_softc(request->parent);
struct ata_dmasetprd_args dspa;
int error;
ATA_DEBUG_RQ(request, "dmaload");
if (request->dma) {
device_printf(request->parent,
"FAILURE - already active DMA on this device\n");
return EIO;
}
if (!request->bytecount) {
device_printf(request->parent,
"FAILURE - zero length DMA transfer attempted\n");
return EIO;
}
if (request->bytecount & (ch->dma.alignment - 1)) {
device_printf(request->parent,
"FAILURE - odd-sized DMA transfer attempt %d %% %d\n",
request->bytecount, ch->dma.alignment);
return EIO;
}
if (request->bytecount > ch->dma.max_iosize) {
device_printf(request->parent,
"FAILURE - oversized DMA transfer attempt %d > %d\n",
request->bytecount, ch->dma.max_iosize);
return EIO;
}
/* set our slot. XXX SOS NCQ will change that */
request->dma = &ch->dma.slot[0];
if (addr)
dspa.dmatab = addr;
else
dspa.dmatab = request->dma->sg;
if (request->flags & ATA_R_DATA_IN_CCB)
error = bus_dmamap_load_ccb(request->dma->data_tag,
request->dma->data_map, request->ccb,
ch->dma.setprd, &dspa, BUS_DMA_NOWAIT);
else
error = bus_dmamap_load(request->dma->data_tag, request->dma->data_map,
request->data, request->bytecount,
ch->dma.setprd, &dspa, BUS_DMA_NOWAIT);
if (error || (error = dspa.error)) {
device_printf(request->parent, "FAILURE - load data\n");
goto error;
}
if (entries)
*entries = dspa.nsegs;
bus_dmamap_sync(request->dma->sg_tag, request->dma->sg_map,
BUS_DMASYNC_PREWRITE);
bus_dmamap_sync(request->dma->data_tag, request->dma->data_map,
(request->flags & ATA_R_READ) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
return 0;
error:
ata_dmaunload(request);
return EIO;
}
int
ata_dmaunload(struct ata_request *request)
{
ATA_DEBUG_RQ(request, "dmaunload");
if (request->dma) {
bus_dmamap_sync(request->dma->sg_tag, request->dma->sg_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_sync(request->dma->data_tag, request->dma->data_map,
(request->flags & ATA_R_READ) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(request->dma->data_tag, request->dma->data_map);
request->dma = NULL;
}
return 0;
}
diff --git a/sys/dev/firewire/sbp.c b/sys/dev/firewire/sbp.c
index 72fe6d9b4454..58e2086399d2 100644
--- a/sys/dev/firewire/sbp.c
+++ b/sys/dev/firewire/sbp.c
@@ -1,2855 +1,2855 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2003 Hidetoshi Shimokawa
* Copyright (c) 1998-2002 Katsushi Kobayashi and Hidetoshi Shimokawa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the acknowledgement as bellow:
*
* This product includes software developed by K. Kobayashi and H. Shimokawa
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
#include <cam/cam_periph.h>
#include <cam/scsi/scsi_all.h>
#include <dev/firewire/firewire.h>
#include <dev/firewire/firewirereg.h>
#include <dev/firewire/fwdma.h>
#include <dev/firewire/iec13213.h>
#include <dev/firewire/sbp.h>
#define ccb_sdev_ptr spriv_ptr0
#define ccb_sbp_ptr spriv_ptr1
#define SBP_NUM_TARGETS 8 /* MAX 64 */
/*
* Scan_bus doesn't work for more than 8 LUNs
* because of CAM_SCSI2_MAXLUN in cam_xpt.c
*/
#define SBP_NUM_LUNS 64
-#define SBP_MAXPHYS MIN(MAXPHYS, (512*1024) /* 512KB */)
+#define SBP_MAXPHYS (128 * 1024)
#define SBP_DMA_SIZE PAGE_SIZE
#define SBP_LOGIN_SIZE sizeof(struct sbp_login_res)
#define SBP_QUEUE_LEN ((SBP_DMA_SIZE - SBP_LOGIN_SIZE) / sizeof(struct sbp_ocb))
#define SBP_NUM_OCB (SBP_QUEUE_LEN * SBP_NUM_TARGETS)
/*
* STATUS FIFO addressing
* bit
*-----------------------
* 0- 1( 2): 0 (alignment)
* 2- 7( 6): target
* 8-15( 8): lun
* 16-31(16): reserved
* 32-47(16): SBP_BIND_HI
* 48-63(16): bus_id, node_id
*/
#define SBP_BIND_HI 0x1
#define SBP_DEV2ADDR(t, l) \
(((u_int64_t)SBP_BIND_HI << 32) \
| (((l) & 0xff) << 8) \
| (((t) & 0x3f) << 2))
#define SBP_ADDR2TRG(a) (((a) >> 2) & 0x3f)
#define SBP_ADDR2LUN(a) (((a) >> 8) & 0xff)
#define SBP_INITIATOR 7
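/*
* Illustrative sketch (a hypothetical self-check, not in the driver): the
* macros above pack the target into bits 2-7 and the lun into bits 8-15 of
* the status-FIFO offset, with SBP_BIND_HI in bits 32-47, and unpack them
* again, e.g. SBP_DEV2ADDR(3, 2) == 0x10000020c.
*/
#if 0
static void
sbp_addr_selftest(void)
{
uint64_t addr = SBP_DEV2ADDR(3, 2);	/* 0x1 0000 020c */

KASSERT(SBP_ADDR2TRG(addr) == 3, ("target does not round-trip"));
KASSERT(SBP_ADDR2LUN(addr) == 2, ("lun does not round-trip"));
}
#endif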
static char *orb_fun_name[] = {
ORB_FUN_NAMES
};
static int debug = 0;
static int auto_login = 1;
static int max_speed = -1;
static int sbp_cold = 1;
static int ex_login = 1;
static int login_delay = 1000; /* msec */
static int scan_delay = 500; /* msec */
static int use_doorbell = 0;
static int sbp_tags = 0;
SYSCTL_DECL(_hw_firewire);
static SYSCTL_NODE(_hw_firewire, OID_AUTO, sbp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"SBP-II Subsystem");
SYSCTL_INT(_debug, OID_AUTO, sbp_debug, CTLFLAG_RWTUN, &debug, 0,
"SBP debug flag");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, auto_login, CTLFLAG_RWTUN, &auto_login, 0,
"SBP perform login automatically");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, max_speed, CTLFLAG_RWTUN, &max_speed, 0,
"SBP transfer max speed");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, exclusive_login, CTLFLAG_RWTUN,
&ex_login, 0, "SBP enable exclusive login");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, login_delay, CTLFLAG_RWTUN,
&login_delay, 0, "SBP login delay in msec");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, scan_delay, CTLFLAG_RWTUN,
&scan_delay, 0, "SBP scan delay in msec");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, use_doorbell, CTLFLAG_RWTUN,
&use_doorbell, 0, "SBP use doorbell request");
SYSCTL_INT(_hw_firewire_sbp, OID_AUTO, tags, CTLFLAG_RWTUN, &sbp_tags, 0,
"SBP tagged queuing support");
#define NEED_RESPONSE 0
#define SBP_SEG_MAX rounddown(0xffff, PAGE_SIZE)
#define SBP_IND_MAX howmany(SBP_MAXPHYS, PAGE_SIZE)
struct sbp_ocb {
STAILQ_ENTRY(sbp_ocb) ocb;
union ccb *ccb;
bus_addr_t bus_addr;
uint32_t orb[8];
#define IND_PTR_OFFSET (8*sizeof(uint32_t))
struct ind_ptr ind_ptr[SBP_IND_MAX];
struct sbp_dev *sdev;
int flags; /* XXX should be removed */
bus_dmamap_t dmamap;
struct callout timer;
};
#define OCB_ACT_MGM 0
#define OCB_ACT_CMD 1
#define OCB_MATCH(o,s) ((o)->bus_addr == ntohl((s)->orb_lo))
struct sbp_dev {
#define SBP_DEV_RESET 0 /* accept login */
#define SBP_DEV_LOGIN 1 /* to login */
#if 0
#define SBP_DEV_RECONN 2 /* to reconnect */
#endif
#define SBP_DEV_TOATTACH 3 /* to attach */
#define SBP_DEV_PROBE 4 /* scan lun */
#define SBP_DEV_ATTACHED 5 /* in operation */
#define SBP_DEV_DEAD 6 /* unavailable unit */
#define SBP_DEV_RETRY 7 /* unavailable unit */
uint8_t status:4,
timeout:4;
uint8_t type;
uint16_t lun_id;
uint16_t freeze;
#define ORB_LINK_DEAD (1 << 0)
#define VALID_LUN (1 << 1)
#define ORB_POINTER_ACTIVE (1 << 2)
#define ORB_POINTER_NEED (1 << 3)
#define ORB_DOORBELL_ACTIVE (1 << 4)
#define ORB_DOORBELL_NEED (1 << 5)
#define ORB_SHORTAGE (1 << 6)
uint16_t flags;
struct cam_path *path;
struct sbp_target *target;
struct fwdma_alloc dma;
struct sbp_login_res *login;
struct callout login_callout;
struct sbp_ocb *ocb;
STAILQ_HEAD(, sbp_ocb) ocbs;
STAILQ_HEAD(, sbp_ocb) free_ocbs;
struct sbp_ocb *last_ocb;
char vendor[32];
char product[32];
char revision[10];
char bustgtlun[32];
};
struct sbp_target {
int target_id;
int num_lun;
struct sbp_dev **luns;
struct sbp_softc *sbp;
struct fw_device *fwdev;
uint32_t mgm_hi, mgm_lo;
struct sbp_ocb *mgm_ocb_cur;
STAILQ_HEAD(, sbp_ocb) mgm_ocb_queue;
struct callout mgm_ocb_timeout;
struct callout scan_callout;
STAILQ_HEAD(, fw_xfer) xferlist;
int n_xfer;
};
struct sbp_softc {
struct firewire_dev_comm fd;
struct cam_sim *sim;
struct cam_path *path;
struct sbp_target targets[SBP_NUM_TARGETS];
struct fw_bind fwb;
bus_dma_tag_t dmat;
struct timeval last_busreset;
#define SIMQ_FREEZED 1
int flags;
struct mtx mtx;
};
#define SBP_LOCK(sbp) mtx_lock(&(sbp)->mtx)
#define SBP_UNLOCK(sbp) mtx_unlock(&(sbp)->mtx)
#define SBP_LOCK_ASSERT(sbp) mtx_assert(&(sbp)->mtx, MA_OWNED)
static void sbp_post_explore (void *);
static void sbp_recv (struct fw_xfer *);
static void sbp_mgm_callback (struct fw_xfer *);
#if 0
static void sbp_cmd_callback (struct fw_xfer *);
#endif
static void sbp_orb_pointer (struct sbp_dev *, struct sbp_ocb *);
static void sbp_doorbell(struct sbp_dev *);
static void sbp_execute_ocb (void *, bus_dma_segment_t *, int, int);
static void sbp_free_ocb (struct sbp_dev *, struct sbp_ocb *);
static void sbp_abort_ocb (struct sbp_ocb *, int);
static void sbp_abort_all_ocbs (struct sbp_dev *, int);
static struct fw_xfer * sbp_write_cmd (struct sbp_dev *, int, int);
static struct sbp_ocb * sbp_get_ocb (struct sbp_dev *);
static struct sbp_ocb * sbp_enqueue_ocb (struct sbp_dev *, struct sbp_ocb *);
static struct sbp_ocb * sbp_dequeue_ocb (struct sbp_dev *, struct sbp_status *);
static void sbp_cam_detach_sdev(struct sbp_dev *);
static void sbp_free_sdev(struct sbp_dev *);
static void sbp_cam_detach_target (struct sbp_target *);
static void sbp_free_target (struct sbp_target *);
static void sbp_mgm_timeout (void *arg);
static void sbp_timeout (void *arg);
static void sbp_mgm_orb (struct sbp_dev *, int, struct sbp_ocb *);
static MALLOC_DEFINE(M_SBP, "sbp", "SBP-II/FireWire");
/* cam related functions */
static void sbp_action(struct cam_sim *sim, union ccb *ccb);
static void sbp_poll(struct cam_sim *sim);
static void sbp_cam_scan_lun(struct cam_periph *, union ccb *);
static void sbp_cam_scan_target(void *arg);
static char *orb_status0[] = {
/* 0 */ "No additional information to report",
/* 1 */ "Request type not supported",
/* 2 */ "Speed not supported",
/* 3 */ "Page size not supported",
/* 4 */ "Access denied",
/* 5 */ "Logical unit not supported",
/* 6 */ "Maximum payload too small",
/* 7 */ "Reserved for future standardization",
/* 8 */ "Resources unavailable",
/* 9 */ "Function rejected",
/* A */ "Login ID not recognized",
/* B */ "Dummy ORB completed",
/* C */ "Request aborted",
/* FF */ "Unspecified error"
#define MAX_ORB_STATUS0 0xd
};
static char *orb_status1_object[] = {
/* 0 */ "Operation request block (ORB)",
/* 1 */ "Data buffer",
/* 2 */ "Page table",
/* 3 */ "Unable to specify"
};
static char *orb_status1_serial_bus_error[] = {
/* 0 */ "Missing acknowledge",
/* 1 */ "Reserved; not to be used",
/* 2 */ "Time-out error",
/* 3 */ "Reserved; not to be used",
/* 4 */ "Busy retry limit exceeded(X)",
/* 5 */ "Busy retry limit exceeded(A)",
/* 6 */ "Busy retry limit exceeded(B)",
/* 7 */ "Reserved for future standardization",
/* 8 */ "Reserved for future standardization",
/* 9 */ "Reserved for future standardization",
/* A */ "Reserved for future standardization",
/* B */ "Tardy retry limit exceeded",
/* C */ "Conflict error",
/* D */ "Data error",
/* E */ "Type error",
/* F */ "Address error"
};
static void
sbp_identify(driver_t *driver, device_t parent)
{
SBP_DEBUG(0)
printf("sbp_identify\n");
END_DEBUG
if (device_find_child(parent, "sbp", -1) == NULL)
BUS_ADD_CHILD(parent, 0, "sbp", -1);
}
/*
* sbp_probe()
*/
static int
sbp_probe(device_t dev)
{
SBP_DEBUG(0)
printf("sbp_probe\n");
END_DEBUG
device_set_desc(dev, "SBP-2/SCSI over FireWire");
#if 0
if (bootverbose)
debug = bootverbose;
#endif
return (0);
}
/*
* Display device characteristics on the console
*/
static void
sbp_show_sdev_info(struct sbp_dev *sdev)
{
struct fw_device *fwdev;
fwdev = sdev->target->fwdev;
device_printf(sdev->target->sbp->fd.dev,
"%s: %s: ordered:%d type:%d EUI:%08x%08x node:%d "
"speed:%d maxrec:%d\n",
__func__,
sdev->bustgtlun,
(sdev->type & 0x40) >> 6,
(sdev->type & 0x1f),
fwdev->eui.hi,
fwdev->eui.lo,
fwdev->dst,
fwdev->speed,
fwdev->maxrec);
device_printf(sdev->target->sbp->fd.dev,
"%s: %s '%s' '%s' '%s'\n",
__func__,
sdev->bustgtlun,
sdev->vendor,
sdev->product,
sdev->revision);
}
static struct {
int bus;
int target;
struct fw_eui64 eui;
} wired[] = {
/* Bus Target EUI64 */
#if 0
{0, 2, {0x00018ea0, 0x01fd0154}}, /* Logitec HDD */
{0, 0, {0x00018ea6, 0x00100682}}, /* Logitec DVD */
{0, 1, {0x00d03200, 0xa412006a}}, /* Yano HDD */
#endif
{-1, -1, {0,0}}
};
static int
sbp_new_target(struct sbp_softc *sbp, struct fw_device *fwdev)
{
int bus, i, target=-1;
char w[SBP_NUM_TARGETS];
bzero(w, sizeof(w));
bus = device_get_unit(sbp->fd.dev);
/* XXX wired-down configuration should be obtained from a
tunable or device hint */
for (i = 0; wired[i].bus >= 0; i++) {
if (wired[i].bus == bus) {
w[wired[i].target] = 1;
if (wired[i].eui.hi == fwdev->eui.hi &&
wired[i].eui.lo == fwdev->eui.lo)
target = wired[i].target;
}
}
if (target >= 0) {
if (target < SBP_NUM_TARGETS &&
sbp->targets[target].fwdev == NULL)
return (target);
device_printf(sbp->fd.dev,
"target %d is not free for %08x:%08x\n",
target, fwdev->eui.hi, fwdev->eui.lo);
target = -1;
}
/* non-wired target */
for (i = 0; i < SBP_NUM_TARGETS; i++)
if (sbp->targets[i].fwdev == NULL && w[i] == 0) {
target = i;
break;
}
return target;
}
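/*
 * Parse the device's config ROM for logical unit entries, (re)allocate
 * the per-target LUN array, set up DMA space and OCBs for new LUNs, and
 * free devices for LUNs that have disappeared.
 */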
static void
sbp_alloc_lun(struct sbp_target *target)
{
struct crom_context cc;
struct csrreg *reg;
struct sbp_dev *sdev, **newluns;
struct sbp_softc *sbp;
int maxlun, lun, i;
sbp = target->sbp;
crom_init_context(&cc, target->fwdev->csrrom);
/* XXX should parse only the appropriate unit directories */
maxlun = -1;
while (cc.depth >= 0) {
reg = crom_search_key(&cc, CROM_LUN);
if (reg == NULL)
break;
lun = reg->val & 0xffff;
SBP_DEBUG(0)
printf("target %d lun %d found\n", target->target_id, lun);
END_DEBUG
if (maxlun < lun)
maxlun = lun;
crom_next(&cc);
}
if (maxlun < 0)
device_printf(target->sbp->fd.dev, "no LUN found for target %d\n",
target->target_id);
maxlun++;
if (maxlun >= SBP_NUM_LUNS)
maxlun = SBP_NUM_LUNS;
/* Invalidate stale devices */
for (lun = 0; lun < target->num_lun; lun++) {
sdev = target->luns[lun];
if (sdev == NULL)
continue;
sdev->flags &= ~VALID_LUN;
if (lun >= maxlun) {
/* lost device */
sbp_cam_detach_sdev(sdev);
sbp_free_sdev(sdev);
target->luns[lun] = NULL;
}
}
/* Reallocate */
if (maxlun != target->num_lun) {
newluns = (struct sbp_dev **) realloc(target->luns,
sizeof(struct sbp_dev *) * maxlun,
M_SBP, M_NOWAIT | M_ZERO);
if (newluns == NULL) {
printf("%s: realloc failed\n", __func__);
newluns = target->luns;
maxlun = target->num_lun;
}
/*
 * We must zero the extended region in case realloc()
 * did not allocate a new buffer.
 */
if (maxlun > target->num_lun)
bzero(&newluns[target->num_lun],
sizeof(struct sbp_dev *) *
(maxlun - target->num_lun));
target->luns = newluns;
target->num_lun = maxlun;
}
crom_init_context(&cc, target->fwdev->csrrom);
while (cc.depth >= 0) {
int new = 0;
reg = crom_search_key(&cc, CROM_LUN);
if (reg == NULL)
break;
lun = reg->val & 0xffff;
if (lun >= SBP_NUM_LUNS) {
printf("lun %d is too large\n", lun);
goto next;
}
sdev = target->luns[lun];
if (sdev == NULL) {
sdev = malloc(sizeof(struct sbp_dev),
M_SBP, M_NOWAIT | M_ZERO);
if (sdev == NULL) {
printf("%s: malloc failed\n", __func__);
goto next;
}
target->luns[lun] = sdev;
sdev->lun_id = lun;
sdev->target = target;
STAILQ_INIT(&sdev->ocbs);
callout_init_mtx(&sdev->login_callout, &sbp->mtx, 0);
sdev->status = SBP_DEV_RESET;
new = 1;
snprintf(sdev->bustgtlun, 32, "%s:%d:%d",
device_get_nameunit(sdev->target->sbp->fd.dev),
sdev->target->target_id,
sdev->lun_id);
}
sdev->flags |= VALID_LUN;
sdev->type = (reg->val & 0xff0000) >> 16;
if (new == 0)
goto next;
fwdma_malloc(sbp->fd.fc,
/* alignment */ sizeof(uint32_t),
SBP_DMA_SIZE, &sdev->dma, BUS_DMA_NOWAIT |
BUS_DMA_COHERENT);
if (sdev->dma.v_addr == NULL) {
printf("%s: dma space allocation failed\n",
__func__);
free(sdev, M_SBP);
target->luns[lun] = NULL;
goto next;
}
sdev->login = (struct sbp_login_res *) sdev->dma.v_addr;
sdev->ocb = (struct sbp_ocb *)
((char *)sdev->dma.v_addr + SBP_LOGIN_SIZE);
bzero((char *)sdev->ocb,
sizeof(struct sbp_ocb) * SBP_QUEUE_LEN);
STAILQ_INIT(&sdev->free_ocbs);
for (i = 0; i < SBP_QUEUE_LEN; i++) {
struct sbp_ocb *ocb;
ocb = &sdev->ocb[i];
ocb->bus_addr = sdev->dma.bus_addr
+ SBP_LOGIN_SIZE
+ sizeof(struct sbp_ocb) * i
+ offsetof(struct sbp_ocb, orb[0]);
if (bus_dmamap_create(sbp->dmat, 0, &ocb->dmamap)) {
printf("sbp_attach: cannot create dmamap\n");
/* XXX */
goto next;
}
callout_init_mtx(&ocb->timer, &sbp->mtx, 0);
SBP_LOCK(sbp);
sbp_free_ocb(sdev, ocb);
SBP_UNLOCK(sbp);
}
next:
crom_next(&cc);
}
for (lun = 0; lun < target->num_lun; lun++) {
sdev = target->luns[lun];
if (sdev != NULL && (sdev->flags & VALID_LUN) == 0) {
sbp_cam_detach_sdev(sdev);
sbp_free_sdev(sdev);
target->luns[lun] = NULL;
}
}
}
static struct sbp_target *
sbp_alloc_target(struct sbp_softc *sbp, struct fw_device *fwdev)
{
int i;
struct sbp_target *target;
struct crom_context cc;
struct csrreg *reg;
SBP_DEBUG(1)
printf("sbp_alloc_target\n");
END_DEBUG
i = sbp_new_target(sbp, fwdev);
if (i < 0) {
device_printf(sbp->fd.dev, "increase SBP_NUM_TARGETS!\n");
return NULL;
}
/* new target */
target = &sbp->targets[i];
target->fwdev = fwdev;
target->target_id = i;
/* XXX we may want to reload mgm port after each bus reset */
/* XXX there might be multiple management agents */
crom_init_context(&cc, target->fwdev->csrrom);
reg = crom_search_key(&cc, CROM_MGM);
if (reg == NULL || reg->val == 0) {
printf("NULL management address\n");
target->fwdev = NULL;
return NULL;
}
target->mgm_hi = 0xffff;
target->mgm_lo = 0xf0000000 | (reg->val << 2);
target->mgm_ocb_cur = NULL;
SBP_DEBUG(1)
printf("target:%d mgm_port: %x\n", i, target->mgm_lo);
END_DEBUG
STAILQ_INIT(&target->xferlist);
target->n_xfer = 0;
STAILQ_INIT(&target->mgm_ocb_queue);
callout_init_mtx(&target->mgm_ocb_timeout, &sbp->mtx, 0);
callout_init_mtx(&target->scan_callout, &sbp->mtx, 0);
target->luns = NULL;
target->num_lun = 0;
return target;
}
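/*
 * Read the vendor, product, and firmware revision strings for this
 * LUN from the device's config ROM (SBP-2 unit directory).
 */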
static void
sbp_probe_lun(struct sbp_dev *sdev)
{
struct fw_device *fwdev;
struct crom_context c, *cc = &c;
struct csrreg *reg;
bzero(sdev->vendor, sizeof(sdev->vendor));
bzero(sdev->product, sizeof(sdev->product));
fwdev = sdev->target->fwdev;
crom_init_context(cc, fwdev->csrrom);
/* get vendor string */
crom_search_key(cc, CSRKEY_VENDOR);
crom_next(cc);
crom_parse_text(cc, sdev->vendor, sizeof(sdev->vendor));
/* skip to the unit directory for SBP-2 */
while ((reg = crom_search_key(cc, CSRKEY_VER)) != NULL) {
if (reg->val == CSRVAL_T10SBP2)
break;
crom_next(cc);
}
/* get firmware revision */
reg = crom_search_key(cc, CSRKEY_FIRM_VER);
if (reg != NULL)
snprintf(sdev->revision, sizeof(sdev->revision),
"%06x", reg->val);
/* get product string */
crom_search_key(cc, CSRKEY_MODEL);
crom_next(cc);
crom_parse_text(cc, sdev->product, sizeof(sdev->product));
}
static void
sbp_login_callout(void *arg)
{
struct sbp_dev *sdev = (struct sbp_dev *)arg;
SBP_LOCK_ASSERT(sdev->target->sbp);
sbp_mgm_orb(sdev, ORB_FUN_LGI, NULL);
}
static void
sbp_login(struct sbp_dev *sdev)
{
struct timeval delta;
struct timeval t;
int ticks = 0;
microtime(&delta);
timevalsub(&delta, &sdev->target->sbp->last_busreset);
t.tv_sec = login_delay / 1000;
t.tv_usec = (login_delay % 1000) * 1000;
timevalsub(&t, &delta);
if (t.tv_sec >= 0 && t.tv_usec > 0)
ticks = (t.tv_sec * 1000 + t.tv_usec / 1000) * hz / 1000;
SBP_DEBUG(0)
printf("%s: sec = %jd usec = %ld ticks = %d\n", __func__,
(intmax_t)t.tv_sec, t.tv_usec, ticks);
END_DEBUG
callout_reset(&sdev->login_callout, ticks,
sbp_login_callout, (void *)(sdev));
}
#define SBP_FWDEV_ALIVE(fwdev) (((fwdev)->status == FWDEVATTACHED) \
&& crom_has_specver((fwdev)->csrrom, CSRVAL_ANSIT10, CSRVAL_T10SBP2))
static void
sbp_probe_target(struct sbp_target *target)
{
struct sbp_softc *sbp = target->sbp;
struct sbp_dev *sdev;
int i, alive;
alive = SBP_FWDEV_ALIVE(target->fwdev);
SBP_DEBUG(1)
device_printf(sbp->fd.dev, "%s %d%salive\n",
__func__, target->target_id,
(!alive) ? " not " : "");
END_DEBUG
sbp_alloc_lun(target);
/* XXX untimeout mgm_ocb and dequeue */
for (i=0; i < target->num_lun; i++) {
sdev = target->luns[i];
if (sdev == NULL)
continue;
if (alive && (sdev->status != SBP_DEV_DEAD)) {
if (sdev->path != NULL) {
xpt_freeze_devq(sdev->path, 1);
sdev->freeze++;
}
sbp_probe_lun(sdev);
sbp_show_sdev_info(sdev);
SBP_LOCK(sbp);
sbp_abort_all_ocbs(sdev, CAM_SCSI_BUS_RESET);
SBP_UNLOCK(sbp);
switch (sdev->status) {
case SBP_DEV_RESET:
/* new or revived target */
if (auto_login)
sbp_login(sdev);
break;
case SBP_DEV_TOATTACH:
case SBP_DEV_PROBE:
case SBP_DEV_ATTACHED:
case SBP_DEV_RETRY:
default:
sbp_mgm_orb(sdev, ORB_FUN_RCN, NULL);
break;
}
} else {
switch (sdev->status) {
case SBP_DEV_ATTACHED:
SBP_DEBUG(0)
/* the device has gone */
device_printf(sbp->fd.dev, "%s: lost target\n",
__func__);
END_DEBUG
if (sdev->path) {
xpt_freeze_devq(sdev->path, 1);
sdev->freeze++;
}
sdev->status = SBP_DEV_RETRY;
sbp_cam_detach_sdev(sdev);
sbp_free_sdev(sdev);
target->luns[i] = NULL;
break;
case SBP_DEV_PROBE:
case SBP_DEV_TOATTACH:
sdev->status = SBP_DEV_RESET;
break;
case SBP_DEV_RETRY:
case SBP_DEV_RESET:
case SBP_DEV_DEAD:
break;
}
}
}
}
static void
sbp_post_busreset(void *arg)
{
struct sbp_softc *sbp;
sbp = (struct sbp_softc *)arg;
SBP_DEBUG(0)
printf("sbp_post_busreset\n");
END_DEBUG
SBP_LOCK(sbp);
if ((sbp->flags & SIMQ_FREEZED) == 0) {
xpt_freeze_simq(sbp->sim, /*count*/1);
sbp->flags |= SIMQ_FREEZED;
}
microtime(&sbp->last_busreset);
SBP_UNLOCK(sbp);
}
static void
sbp_post_explore(void *arg)
{
struct sbp_softc *sbp = (struct sbp_softc *)arg;
struct sbp_target *target;
struct fw_device *fwdev;
int i, alive;
SBP_DEBUG(0)
printf("sbp_post_explore (sbp_cold=%d)\n", sbp_cold);
END_DEBUG
/* We need physical access */
if (!firewire_phydma_enable)
return;
if (sbp_cold > 0)
sbp_cold--;
SBP_LOCK(sbp);
/* Garbage Collection */
for (i = 0; i < SBP_NUM_TARGETS; i++) {
target = &sbp->targets[i];
if (target->fwdev == NULL)
continue;
STAILQ_FOREACH(fwdev, &sbp->fd.fc->devices, link)
if (target->fwdev == fwdev)
break;
if (fwdev == NULL) {
/* device has been removed by the lower driver */
sbp_cam_detach_target(target);
sbp_free_target(target);
}
}
/* traverse device list */
STAILQ_FOREACH(fwdev, &sbp->fd.fc->devices, link) {
SBP_DEBUG(0)
device_printf(sbp->fd.dev,"%s:: EUI:%08x%08x %s attached, state=%d\n",
__func__, fwdev->eui.hi, fwdev->eui.lo,
(fwdev->status != FWDEVATTACHED) ? "not" : "",
fwdev->status);
END_DEBUG
alive = SBP_FWDEV_ALIVE(fwdev);
for (i = 0; i < SBP_NUM_TARGETS; i++) {
target = &sbp->targets[i];
if (target->fwdev == fwdev) {
/* known target */
break;
}
}
if (i == SBP_NUM_TARGETS) {
if (alive) {
/* new target */
target = sbp_alloc_target(sbp, fwdev);
if (target == NULL)
continue;
} else {
continue;
}
}
/*
* It is safe to drop the lock here as the target is already
* reserved, so there should be no contenders for it.
* And the target is not yet exposed, so there should not be
* any other accesses to it.
* Finally, the list being iterated is protected somewhere else.
*/
SBP_UNLOCK(sbp);
sbp_probe_target(target);
SBP_LOCK(sbp);
if (target->num_lun == 0)
sbp_free_target(target);
}
if ((sbp->flags & SIMQ_FREEZED) != 0) {
xpt_release_simq(sbp->sim, /*run queue*/TRUE);
sbp->flags &= ~SIMQ_FREEZED;
}
SBP_UNLOCK(sbp);
}
#if NEED_RESPONSE
static void
sbp_loginres_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,"%s\n", __func__);
END_DEBUG
/* recycle */
SBP_LOCK(sdev->target->sbp);
STAILQ_INSERT_TAIL(&sdev->target->sbp->fwb.xferlist, xfer, link);
SBP_UNLOCK(sdev->target->sbp);
return;
}
#endif
static __inline void
sbp_xfer_free(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
fw_xfer_unload(xfer);
SBP_LOCK_ASSERT(sdev->target->sbp);
STAILQ_INSERT_TAIL(&sdev->target->xferlist, xfer, link);
}
static void
sbp_reset_start_callback(struct fw_xfer *xfer)
{
struct sbp_dev *tsdev, *sdev = (struct sbp_dev *)xfer->sc;
struct sbp_target *target = sdev->target;
int i;
if (xfer->resp != 0) {
device_printf(sdev->target->sbp->fd.dev,
"%s: %s failed: resp=%d\n", __func__, sdev->bustgtlun, xfer->resp);
}
SBP_LOCK(target->sbp);
for (i = 0; i < target->num_lun; i++) {
tsdev = target->luns[i];
if (tsdev != NULL && tsdev->status == SBP_DEV_LOGIN)
sbp_login(tsdev);
}
SBP_UNLOCK(target->sbp);
}
static void
sbp_reset_start(struct sbp_dev *sdev)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__,sdev->bustgtlun);
END_DEBUG
xfer = sbp_write_cmd(sdev, FWTCODE_WREQQ, 0);
xfer->hand = sbp_reset_start_callback;
fp = &xfer->send.hdr;
fp->mode.wreqq.dest_hi = 0xffff;
fp->mode.wreqq.dest_lo = 0xf0000000 | RESET_START;
fp->mode.wreqq.data = htonl(0xf);
fw_asyreq(xfer->fc, -1, xfer);
}
static void
sbp_mgm_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
int resp;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
resp = xfer->resp;
SBP_LOCK(sdev->target->sbp);
sbp_xfer_free(xfer);
SBP_UNLOCK(sdev->target->sbp);
}
static struct sbp_dev *
sbp_next_dev(struct sbp_target *target, int lun)
{
struct sbp_dev **sdevp;
int i;
for (i = lun, sdevp = &target->luns[lun]; i < target->num_lun;
i++, sdevp++)
if (*sdevp != NULL && (*sdevp)->status == SBP_DEV_PROBE)
return (*sdevp);
return (NULL);
}
#define SCAN_PRI 1
static void
sbp_cam_scan_lun(struct cam_periph *periph, union ccb *ccb)
{
struct sbp_softc *sbp;
struct sbp_target *target;
struct sbp_dev *sdev;
sdev = (struct sbp_dev *) ccb->ccb_h.ccb_sdev_ptr;
target = sdev->target;
sbp = target->sbp;
SBP_LOCK(sbp);
SBP_DEBUG(0)
device_printf(sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
sdev->status = SBP_DEV_ATTACHED;
} else {
device_printf(sbp->fd.dev,
"%s:%s failed\n", __func__, sdev->bustgtlun);
}
sdev = sbp_next_dev(target, sdev->lun_id + 1);
if (sdev == NULL) {
SBP_UNLOCK(sbp);
xpt_free_ccb(ccb);
return;
}
/* reuse ccb */
xpt_setup_ccb(&ccb->ccb_h, sdev->path, SCAN_PRI);
ccb->ccb_h.ccb_sdev_ptr = sdev;
ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
SBP_UNLOCK(sbp);
xpt_action(ccb);
xpt_release_devq(sdev->path, sdev->freeze, TRUE);
sdev->freeze = 1;
}
static void
sbp_cam_scan_target(void *arg)
{
struct sbp_target *target = (struct sbp_target *)arg;
struct sbp_dev *sdev;
union ccb *ccb;
SBP_LOCK_ASSERT(target->sbp);
sdev = sbp_next_dev(target, 0);
if (sdev == NULL) {
printf("sbp_cam_scan_target: nothing to do for target %d\n",
target->target_id);
return;
}
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
printf("sbp_cam_scan_target: xpt_alloc_ccb_nowait() failed\n");
return;
}
SBP_UNLOCK(target->sbp);
xpt_setup_ccb(&ccb->ccb_h, sdev->path, SCAN_PRI);
ccb->ccb_h.func_code = XPT_SCAN_LUN;
ccb->ccb_h.cbfcnp = sbp_cam_scan_lun;
ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
ccb->crcn.flags = CAM_FLAG_NONE;
ccb->ccb_h.ccb_sdev_ptr = sdev;
/* The scan is in progress now. */
xpt_action(ccb);
SBP_LOCK(target->sbp);
xpt_release_devq(sdev->path, sdev->freeze, TRUE);
sdev->freeze = 1;
}
static __inline void
sbp_scan_dev(struct sbp_dev *sdev)
{
sdev->status = SBP_DEV_PROBE;
callout_reset_sbt(&sdev->target->scan_callout, SBT_1MS * scan_delay, 0,
sbp_cam_scan_target, (void *)sdev->target, 0);
}
static void
sbp_do_attach(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
struct sbp_target *target;
struct sbp_softc *sbp;
sdev = (struct sbp_dev *)xfer->sc;
target = sdev->target;
sbp = target->sbp;
SBP_LOCK(sbp);
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
sbp_xfer_free(xfer);
if (sdev->path == NULL)
xpt_create_path(&sdev->path, NULL,
cam_sim_path(target->sbp->sim),
target->target_id, sdev->lun_id);
/*
* Let CAM scan the bus if we are in the boot process.
* XXX xpt_scan_bus cannot detect LUN larger than 0
* if LUN 0 doesn't exist.
*/
if (sbp_cold > 0) {
sdev->status = SBP_DEV_ATTACHED;
SBP_UNLOCK(sbp);
return;
}
sbp_scan_dev(sdev);
SBP_UNLOCK(sbp);
}
static void
sbp_agent_reset_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
if (xfer->resp != 0) {
device_printf(sdev->target->sbp->fd.dev,
"%s:%s resp=%d\n", __func__, sdev->bustgtlun, xfer->resp);
}
SBP_LOCK(sdev->target->sbp);
sbp_xfer_free(xfer);
if (sdev->path) {
xpt_release_devq(sdev->path, sdev->freeze, TRUE);
sdev->freeze = 0;
}
SBP_UNLOCK(sdev->target->sbp);
}
static void
sbp_agent_reset(struct sbp_dev *sdev)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
SBP_LOCK_ASSERT(sdev->target->sbp);
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
xfer = sbp_write_cmd(sdev, FWTCODE_WREQQ, 0x04);
if (xfer == NULL)
return;
if (sdev->status == SBP_DEV_ATTACHED || sdev->status == SBP_DEV_PROBE)
xfer->hand = sbp_agent_reset_callback;
else
xfer->hand = sbp_do_attach;
fp = &xfer->send.hdr;
fp->mode.wreqq.data = htonl(0xf);
fw_asyreq(xfer->fc, -1, xfer);
sbp_abort_all_ocbs(sdev, CAM_BDR_SENT);
}
static void
sbp_busy_timeout_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
SBP_LOCK(sdev->target->sbp);
sbp_xfer_free(xfer);
sbp_agent_reset(sdev);
SBP_UNLOCK(sdev->target->sbp);
}
static void
sbp_busy_timeout(struct sbp_dev *sdev)
{
struct fw_pkt *fp;
struct fw_xfer *xfer;
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
xfer = sbp_write_cmd(sdev, FWTCODE_WREQQ, 0);
xfer->hand = sbp_busy_timeout_callback;
fp = &xfer->send.hdr;
fp->mode.wreqq.dest_hi = 0xffff;
fp->mode.wreqq.dest_lo = 0xf0000000 | BUSY_TIMEOUT;
fp->mode.wreqq.data = htonl((1 << (13 + 12)) | 0xf);
fw_asyreq(xfer->fc, -1, xfer);
}
static void
sbp_orb_pointer_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(2)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
if (xfer->resp != 0) {
/* XXX */
printf("%s: xfer->resp = %d\n", __func__, xfer->resp);
}
SBP_LOCK(sdev->target->sbp);
sbp_xfer_free(xfer);
sdev->flags &= ~ORB_POINTER_ACTIVE;
if ((sdev->flags & ORB_POINTER_NEED) != 0) {
struct sbp_ocb *ocb;
sdev->flags &= ~ORB_POINTER_NEED;
ocb = STAILQ_FIRST(&sdev->ocbs);
if (ocb != NULL)
sbp_orb_pointer(sdev, ocb);
}
SBP_UNLOCK(sdev->target->sbp);
return;
}
static void
sbp_orb_pointer(struct sbp_dev *sdev, struct sbp_ocb *ocb)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s 0x%08x\n",
__func__, sdev->bustgtlun,
(uint32_t)ocb->bus_addr);
END_DEBUG
SBP_LOCK_ASSERT(sdev->target->sbp);
if ((sdev->flags & ORB_POINTER_ACTIVE) != 0) {
SBP_DEBUG(0)
printf("%s: orb pointer active\n", __func__);
END_DEBUG
sdev->flags |= ORB_POINTER_NEED;
return;
}
sdev->flags |= ORB_POINTER_ACTIVE;
xfer = sbp_write_cmd(sdev, FWTCODE_WREQB, 0x08);
if (xfer == NULL)
return;
xfer->hand = sbp_orb_pointer_callback;
fp = &xfer->send.hdr;
fp->mode.wreqb.len = 8;
fp->mode.wreqb.extcode = 0;
xfer->send.payload[0] =
htonl(((sdev->target->sbp->fd.fc->nodeid | FWLOCALBUS) << 16));
xfer->send.payload[1] = htonl((uint32_t)ocb->bus_addr);
if (fw_asyreq(xfer->fc, -1, xfer) != 0) {
sbp_xfer_free(xfer);
ocb->ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ocb->ccb);
}
}
static void
sbp_doorbell_callback(struct fw_xfer *xfer)
{
struct sbp_dev *sdev;
sdev = (struct sbp_dev *)xfer->sc;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
if (xfer->resp != 0) {
/* XXX */
device_printf(sdev->target->sbp->fd.dev,
"%s: xfer->resp = %d\n", __func__, xfer->resp);
}
SBP_LOCK(sdev->target->sbp);
sbp_xfer_free(xfer);
sdev->flags &= ~ORB_DOORBELL_ACTIVE;
if ((sdev->flags & ORB_DOORBELL_NEED) != 0) {
sdev->flags &= ~ORB_DOORBELL_NEED;
sbp_doorbell(sdev);
}
SBP_UNLOCK(sdev->target->sbp);
}
static void
sbp_doorbell(struct sbp_dev *sdev)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
if ((sdev->flags & ORB_DOORBELL_ACTIVE) != 0) {
sdev->flags |= ORB_DOORBELL_NEED;
return;
}
sdev->flags |= ORB_DOORBELL_ACTIVE;
xfer = sbp_write_cmd(sdev, FWTCODE_WREQQ, 0x10);
if (xfer == NULL)
return;
xfer->hand = sbp_doorbell_callback;
fp = &xfer->send.hdr;
fp->mode.wreqq.data = htonl(0xf);
fw_asyreq(xfer->fc, -1, xfer);
}
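/*
 * Obtain an fw_xfer (recycled from the target's list or newly
 * allocated) set up as a quadlet or block write to the LUN's command
 * block agent registers at the given offset.
 */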
static struct fw_xfer *
sbp_write_cmd(struct sbp_dev *sdev, int tcode, int offset)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
struct sbp_target *target;
int new = 0;
SBP_LOCK_ASSERT(sdev->target->sbp);
target = sdev->target;
xfer = STAILQ_FIRST(&target->xferlist);
if (xfer == NULL) {
if (target->n_xfer > 5 /* XXX */) {
printf("sbp: no more xfer for this target\n");
return (NULL);
}
xfer = fw_xfer_alloc_buf(M_SBP, 8, 0);
if (xfer == NULL) {
printf("sbp: fw_xfer_alloc_buf failed\n");
return NULL;
}
target->n_xfer++;
if (debug)
printf("sbp: alloc %d xfer\n", target->n_xfer);
new = 1;
} else {
STAILQ_REMOVE_HEAD(&target->xferlist, link);
}
if (new) {
xfer->recv.pay_len = 0;
xfer->send.spd = min(sdev->target->fwdev->speed, max_speed);
xfer->fc = sdev->target->sbp->fd.fc;
}
if (tcode == FWTCODE_WREQB)
xfer->send.pay_len = 8;
else
xfer->send.pay_len = 0;
xfer->sc = (caddr_t)sdev;
fp = &xfer->send.hdr;
fp->mode.wreqq.dest_hi = sdev->login->cmd_hi;
fp->mode.wreqq.dest_lo = sdev->login->cmd_lo + offset;
fp->mode.wreqq.tlrt = 0;
fp->mode.wreqq.tcode = tcode;
fp->mode.wreqq.pri = 0;
fp->mode.wreqq.dst = FWLOCALBUS | sdev->target->fwdev->dst;
return xfer;
}
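/*
 * Build a management ORB (login, reconnect, logout, task management,
 * ...) and write its address to the target's management agent; queue it
 * if another management ORB is still outstanding.
 */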
static void
sbp_mgm_orb(struct sbp_dev *sdev, int func, struct sbp_ocb *aocb)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
struct sbp_ocb *ocb;
struct sbp_target *target;
int nid;
target = sdev->target;
nid = target->sbp->fd.fc->nodeid | FWLOCALBUS;
SBP_LOCK_ASSERT(target->sbp);
if (func == ORB_FUN_RUNQUEUE) {
ocb = STAILQ_FIRST(&target->mgm_ocb_queue);
if (target->mgm_ocb_cur != NULL || ocb == NULL) {
return;
}
STAILQ_REMOVE_HEAD(&target->mgm_ocb_queue, ocb);
goto start;
}
if ((ocb = sbp_get_ocb(sdev)) == NULL) {
/* XXX */
return;
}
ocb->flags = OCB_ACT_MGM;
ocb->sdev = sdev;
bzero((void *)ocb->orb, sizeof(ocb->orb));
ocb->orb[6] = htonl((nid << 16) | SBP_BIND_HI);
ocb->orb[7] = htonl(SBP_DEV2ADDR(target->target_id, sdev->lun_id));
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s %s\n",
__func__,sdev->bustgtlun,
orb_fun_name[(func >> 16) & 0xf]);
END_DEBUG
switch (func) {
case ORB_FUN_LGI:
ocb->orb[0] = ocb->orb[1] = 0; /* password */
ocb->orb[2] = htonl(nid << 16);
ocb->orb[3] = htonl(sdev->dma.bus_addr);
ocb->orb[4] = htonl(ORB_NOTIFY | sdev->lun_id);
if (ex_login)
ocb->orb[4] |= htonl(ORB_EXV);
ocb->orb[5] = htonl(SBP_LOGIN_SIZE);
fwdma_sync(&sdev->dma, BUS_DMASYNC_PREREAD);
break;
case ORB_FUN_ATA:
ocb->orb[0] = htonl((0 << 16) | 0);
ocb->orb[1] = htonl(aocb->bus_addr & 0xffffffff);
/* fall through */
case ORB_FUN_RCN:
case ORB_FUN_LGO:
case ORB_FUN_LUR:
case ORB_FUN_RST:
case ORB_FUN_ATS:
ocb->orb[4] = htonl(ORB_NOTIFY | func | sdev->login->id);
break;
}
if (target->mgm_ocb_cur != NULL) {
/* there is a standing ORB */
STAILQ_INSERT_TAIL(&sdev->target->mgm_ocb_queue, ocb, ocb);
return;
}
start:
target->mgm_ocb_cur = ocb;
callout_reset(&target->mgm_ocb_timeout, 5 * hz,
sbp_mgm_timeout, (caddr_t)ocb);
xfer = sbp_write_cmd(sdev, FWTCODE_WREQB, 0);
if (xfer == NULL) {
return;
}
xfer->hand = sbp_mgm_callback;
fp = &xfer->send.hdr;
fp->mode.wreqb.dest_hi = sdev->target->mgm_hi;
fp->mode.wreqb.dest_lo = sdev->target->mgm_lo;
fp->mode.wreqb.len = 8;
fp->mode.wreqb.extcode = 0;
xfer->send.payload[0] = htonl(nid << 16);
xfer->send.payload[1] = htonl(ocb->bus_addr & 0xffffffff);
fw_asyreq(xfer->fc, -1, xfer);
}
static void
sbp_print_scsi_cmd(struct sbp_ocb *ocb)
{
struct ccb_scsiio *csio;
csio = &ocb->ccb->csio;
printf("%s:%d:%jx XPT_SCSI_IO: "
"cmd: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x"
", flags: 0x%02x, "
"%db cmd/%db data/%db sense\n",
device_get_nameunit(ocb->sdev->target->sbp->fd.dev),
ocb->ccb->ccb_h.target_id,
(uintmax_t)ocb->ccb->ccb_h.target_lun,
csio->cdb_io.cdb_bytes[0],
csio->cdb_io.cdb_bytes[1],
csio->cdb_io.cdb_bytes[2],
csio->cdb_io.cdb_bytes[3],
csio->cdb_io.cdb_bytes[4],
csio->cdb_io.cdb_bytes[5],
csio->cdb_io.cdb_bytes[6],
csio->cdb_io.cdb_bytes[7],
csio->cdb_io.cdb_bytes[8],
csio->cdb_io.cdb_bytes[9],
ocb->ccb->ccb_h.flags & CAM_DIR_MASK,
csio->cdb_len, csio->dxfer_len,
csio->sense_len);
}
static void
sbp_scsi_status(struct sbp_status *sbp_status, struct sbp_ocb *ocb)
{
struct sbp_cmd_status *sbp_cmd_status;
struct scsi_sense_data_fixed *sense;
sbp_cmd_status = (struct sbp_cmd_status *)sbp_status->data;
sense = (struct scsi_sense_data_fixed *)&ocb->ccb->csio.sense_data;
SBP_DEBUG(0)
sbp_print_scsi_cmd(ocb);
/* XXX need decode status */
printf("%s: SCSI status %x sfmt %x valid %x key %x code %x qlfr %x len %d\n",
ocb->sdev->bustgtlun,
sbp_cmd_status->status,
sbp_cmd_status->sfmt,
sbp_cmd_status->valid,
sbp_cmd_status->s_key,
sbp_cmd_status->s_code,
sbp_cmd_status->s_qlfr,
sbp_status->len);
END_DEBUG
switch (sbp_cmd_status->status) {
case SCSI_STATUS_CHECK_COND:
case SCSI_STATUS_BUSY:
case SCSI_STATUS_CMD_TERMINATED:
if (sbp_cmd_status->sfmt == SBP_SFMT_CURR) {
sense->error_code = SSD_CURRENT_ERROR;
} else {
sense->error_code = SSD_DEFERRED_ERROR;
}
if (sbp_cmd_status->valid)
sense->error_code |= SSD_ERRCODE_VALID;
sense->flags = sbp_cmd_status->s_key;
if (sbp_cmd_status->mark)
sense->flags |= SSD_FILEMARK;
if (sbp_cmd_status->eom)
sense->flags |= SSD_EOM;
if (sbp_cmd_status->ill_len)
sense->flags |= SSD_ILI;
bcopy(&sbp_cmd_status->info, &sense->info[0], 4);
if (sbp_status->len <= 1)
/* XXX not a SCSI status; this shouldn't happen */
sense->extra_len = 0;
else if (sbp_status->len <= 4)
/* add_sense_code(_qual), info, cmd_spec_info */
sense->extra_len = 6;
else
/* fru, sense_key_spec */
sense->extra_len = 10;
bcopy(&sbp_cmd_status->cdb, &sense->cmd_spec_info[0], 4);
sense->add_sense_code = sbp_cmd_status->s_code;
sense->add_sense_code_qual = sbp_cmd_status->s_qlfr;
sense->fru = sbp_cmd_status->fru;
bcopy(&sbp_cmd_status->s_keydep[0],
&sense->sense_key_spec[0], 3);
ocb->ccb->csio.scsi_status = sbp_cmd_status->status;
ocb->ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID;
/*
{
uint8_t j, *tmp;
tmp = sense;
for (j = 0; j < 32; j += 8) {
printf("sense %02x%02x %02x%02x %02x%02x %02x%02x\n",
tmp[j], tmp[j + 1], tmp[j + 2], tmp[j + 3],
tmp[j + 4], tmp[j + 5], tmp[j + 6], tmp[j + 7]);
}
}
*/
break;
default:
device_printf(ocb->sdev->target->sbp->fd.dev,
"%s:%s unknown scsi status 0x%x\n",
__func__, ocb->sdev->bustgtlun,
sbp_cmd_status->status);
}
}
static void
sbp_fix_inq_data(struct sbp_ocb *ocb)
{
union ccb *ccb;
struct sbp_dev *sdev;
struct scsi_inquiry_data *inq;
ccb = ocb->ccb;
sdev = ocb->sdev;
if (ccb->csio.cdb_io.cdb_bytes[1] & SI_EVPD)
return;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s\n", __func__, sdev->bustgtlun);
END_DEBUG
inq = (struct scsi_inquiry_data *) ccb->csio.data_ptr;
switch (SID_TYPE(inq)) {
case T_DIRECT:
#if 0
/*
* XXX Convert Direct Access device to RBC.
* I've never seen FireWire DA devices which support READ_6.
*/
if (SID_TYPE(inq) == T_DIRECT)
inq->device |= T_RBC; /* T_DIRECT == 0 */
#endif
/* fall through */
case T_RBC:
/*
* Override vendor/product/revision information.
* Some devices sometimes return strange strings.
*/
#if 1
bcopy(sdev->vendor, inq->vendor, sizeof(inq->vendor));
bcopy(sdev->product, inq->product, sizeof(inq->product));
bcopy(sdev->revision + 2, inq->revision, sizeof(inq->revision));
#endif
break;
}
/*
* Force to enable/disable tagged queuing.
* XXX CAM also checks SCP_QUEUE_DQUE flag in the control mode page.
*/
if (sbp_tags > 0)
inq->flags |= SID_CmdQue;
else if (sbp_tags < 0)
inq->flags &= ~SID_CmdQue;
}
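/*
 * Handle an incoming write to the status FIFO: match the status block
 * to the outstanding management or command ORB, process login/reconnect
 * results or complete the SCSI CCB, and reset the fetch agent if the
 * target reported it dead.
 */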
static void
sbp_recv1(struct fw_xfer *xfer)
{
struct fw_pkt *rfp;
#if NEED_RESPONSE
struct fw_pkt *sfp;
#endif
struct sbp_softc *sbp;
struct sbp_dev *sdev;
struct sbp_ocb *ocb;
struct sbp_login_res *login_res = NULL;
struct sbp_status *sbp_status;
struct sbp_target *target;
int orb_fun, status_valid0, status_valid, t, l, reset_agent = 0;
uint32_t addr;
/*
uint32_t *ld;
ld = xfer->recv.buf;
printf("sbp %x %d %d %08x %08x %08x %08x\n",
xfer->resp, xfer->recv.len, xfer->recv.off, ntohl(ld[0]), ntohl(ld[1]), ntohl(ld[2]), ntohl(ld[3]));
printf("sbp %08x %08x %08x %08x\n", ntohl(ld[4]), ntohl(ld[5]), ntohl(ld[6]), ntohl(ld[7]));
printf("sbp %08x %08x %08x %08x\n", ntohl(ld[8]), ntohl(ld[9]), ntohl(ld[10]), ntohl(ld[11]));
*/
sbp = (struct sbp_softc *)xfer->sc;
SBP_LOCK_ASSERT(sbp);
if (xfer->resp != 0) {
printf("sbp_recv: xfer->resp = %d\n", xfer->resp);
goto done0;
}
if (xfer->recv.payload == NULL) {
printf("sbp_recv: xfer->recv.payload == NULL\n");
goto done0;
}
rfp = &xfer->recv.hdr;
if (rfp->mode.wreqb.tcode != FWTCODE_WREQB) {
printf("sbp_recv: tcode = %d\n", rfp->mode.wreqb.tcode);
goto done0;
}
sbp_status = (struct sbp_status *)xfer->recv.payload;
addr = rfp->mode.wreqb.dest_lo;
SBP_DEBUG(2)
printf("received address 0x%x\n", addr);
END_DEBUG
t = SBP_ADDR2TRG(addr);
if (t >= SBP_NUM_TARGETS) {
device_printf(sbp->fd.dev,
"sbp_recv1: invalid target %d\n", t);
goto done0;
}
target = &sbp->targets[t];
l = SBP_ADDR2LUN(addr);
if (l >= target->num_lun || target->luns[l] == NULL) {
device_printf(sbp->fd.dev,
"sbp_recv1: invalid lun %d (target=%d)\n", l, t);
goto done0;
}
sdev = target->luns[l];
ocb = NULL;
switch (sbp_status->src) {
case 0:
case 1:
/* check mgm_ocb_cur first */
ocb = target->mgm_ocb_cur;
if (ocb != NULL) {
if (OCB_MATCH(ocb, sbp_status)) {
callout_stop(&target->mgm_ocb_timeout);
target->mgm_ocb_cur = NULL;
break;
}
}
ocb = sbp_dequeue_ocb(sdev, sbp_status);
if (ocb == NULL) {
device_printf(sdev->target->sbp->fd.dev,
"%s:%s No ocb(%x) on the queue\n",
__func__,sdev->bustgtlun,
ntohl(sbp_status->orb_lo));
}
break;
case 2:
/* unsolicited status */
device_printf(sdev->target->sbp->fd.dev,
"%s:%s unsolicited status received\n",
__func__, sdev->bustgtlun);
break;
default:
device_printf(sdev->target->sbp->fd.dev,
"%s:%s unknown sbp_status->src\n",
__func__, sdev->bustgtlun);
}
status_valid0 = (sbp_status->src < 2
&& sbp_status->resp == ORB_RES_CMPL
&& sbp_status->dead == 0);
status_valid = (status_valid0 && sbp_status->status == 0);
if (!status_valid0 || debug > 2) {
int status;
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s ORB status src:%x resp:%x dead:%x"
" len:%x stat:%x orb:%x%08x\n",
__func__, sdev->bustgtlun,
sbp_status->src, sbp_status->resp, sbp_status->dead,
sbp_status->len, sbp_status->status,
ntohs(sbp_status->orb_hi), ntohl(sbp_status->orb_lo));
END_DEBUG
device_printf(sdev->target->sbp->fd.dev,
"%s\n", sdev->bustgtlun);
status = sbp_status->status;
switch (sbp_status->resp) {
case 0:
if (status > MAX_ORB_STATUS0)
printf("%s\n", orb_status0[MAX_ORB_STATUS0]);
else
printf("%s\n", orb_status0[status]);
break;
case 1:
printf("Obj: %s, Error: %s\n",
orb_status1_object[(status >> 6) & 3],
orb_status1_serial_bus_error[status & 0xf]);
break;
case 2:
printf("Illegal request\n");
break;
case 3:
printf("Vendor dependent\n");
break;
default:
printf("unknown response code %d\n", sbp_status->resp);
}
}
/* we have to reset the fetch agent if it's dead */
if (sbp_status->dead) {
if (sdev->path) {
xpt_freeze_devq(sdev->path, 1);
sdev->freeze++;
}
reset_agent = 1;
}
if (ocb == NULL)
goto done;
switch (ntohl(ocb->orb[4]) & ORB_FMT_MSK) {
case ORB_FMT_NOP:
break;
case ORB_FMT_VED:
break;
case ORB_FMT_STD:
switch (ocb->flags) {
case OCB_ACT_MGM:
orb_fun = ntohl(ocb->orb[4]) & ORB_FUN_MSK;
reset_agent = 0;
switch (orb_fun) {
case ORB_FUN_LGI:
fwdma_sync(&sdev->dma, BUS_DMASYNC_POSTREAD);
login_res = sdev->login;
login_res->len = ntohs(login_res->len);
login_res->id = ntohs(login_res->id);
login_res->cmd_hi = ntohs(login_res->cmd_hi);
login_res->cmd_lo = ntohl(login_res->cmd_lo);
if (status_valid) {
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s login: len %d, ID %d, cmd %08x%08x, recon_hold %d\n",
__func__, sdev->bustgtlun,
login_res->len, login_res->id,
login_res->cmd_hi, login_res->cmd_lo,
ntohs(login_res->recon_hold));
END_DEBUG
sbp_busy_timeout(sdev);
} else {
/* forgot logout? */
device_printf(sdev->target->sbp->fd.dev,
"%s:%s login failed\n",
__func__, sdev->bustgtlun);
sdev->status = SBP_DEV_RESET;
}
break;
case ORB_FUN_RCN:
login_res = sdev->login;
if (status_valid) {
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s reconnect: len %d, ID %d, cmd %08x%08x\n",
__func__, sdev->bustgtlun,
login_res->len, login_res->id,
login_res->cmd_hi, login_res->cmd_lo);
END_DEBUG
if (sdev->status == SBP_DEV_ATTACHED)
sbp_scan_dev(sdev);
else
sbp_agent_reset(sdev);
} else {
/* reconnection hold time exceeded? */
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s reconnect failed\n",
__func__, sdev->bustgtlun);
END_DEBUG
sbp_login(sdev);
}
break;
case ORB_FUN_LGO:
sdev->status = SBP_DEV_RESET;
break;
case ORB_FUN_RST:
sbp_busy_timeout(sdev);
break;
case ORB_FUN_LUR:
case ORB_FUN_ATA:
case ORB_FUN_ATS:
sbp_agent_reset(sdev);
break;
default:
device_printf(sdev->target->sbp->fd.dev,
"%s:%s unknown function %d\n",
__func__, sdev->bustgtlun, orb_fun);
break;
}
sbp_mgm_orb(sdev, ORB_FUN_RUNQUEUE, NULL);
break;
case OCB_ACT_CMD:
sdev->timeout = 0;
if (ocb->ccb != NULL) {
union ccb *ccb;
ccb = ocb->ccb;
if (sbp_status->len > 1) {
sbp_scsi_status(sbp_status, ocb);
} else {
if (sbp_status->resp != ORB_RES_CMPL) {
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
} else {
ccb->ccb_h.status = CAM_REQ_CMP;
}
}
/* fix up inq data */
if (ccb->csio.cdb_io.cdb_bytes[0] == INQUIRY)
sbp_fix_inq_data(ocb);
xpt_done(ccb);
}
break;
default:
break;
}
}
if (!use_doorbell)
sbp_free_ocb(sdev, ocb);
done:
if (reset_agent)
sbp_agent_reset(sdev);
done0:
xfer->recv.pay_len = SBP_RECV_LEN;
/* The received packet is usually small enough to be stored within
 * the buffer.  In that case the controller returns ack_complete and
 * no response is necessary.
 *
 * XXX fwohci.c and firewire.c should pass the event code
 * (ack_complete or ack_pending) up to this driver.
 */
#if NEED_RESPONSE
xfer->send.off = 0;
sfp = (struct fw_pkt *)xfer->send.buf;
sfp->mode.wres.dst = rfp->mode.wreqb.src;
xfer->dst = sfp->mode.wres.dst;
xfer->spd = min(sdev->target->fwdev->speed, max_speed);
xfer->hand = sbp_loginres_callback;
sfp->mode.wres.tlrt = rfp->mode.wreqb.tlrt;
sfp->mode.wres.tcode = FWTCODE_WRES;
sfp->mode.wres.rtcode = 0;
sfp->mode.wres.pri = 0;
fw_asyreq(xfer->fc, -1, xfer);
#else
/* recycle */
STAILQ_INSERT_TAIL(&sbp->fwb.xferlist, xfer, link);
#endif
}
static void
sbp_recv(struct fw_xfer *xfer)
{
struct sbp_softc *sbp;
sbp = (struct sbp_softc *)xfer->sc;
SBP_LOCK(sbp);
sbp_recv1(xfer);
SBP_UNLOCK(sbp);
}
/*
* sbp_attach()
*/
static int
sbp_attach(device_t dev)
{
struct sbp_softc *sbp;
struct cam_devq *devq;
struct firewire_comm *fc;
int i, error;
if (DFLTPHYS > SBP_MAXPHYS)
device_printf(dev, "Warning, DFLTPHYS(%dKB) is larger than "
"SBP_MAXPHYS(%dKB).\n", DFLTPHYS / 1024,
SBP_MAXPHYS / 1024);
if (!firewire_phydma_enable)
device_printf(dev, "Warning, hw.firewire.phydma_enable must be 1 "
"for SBP over FireWire.\n");
SBP_DEBUG(0)
printf("sbp_attach (cold=%d)\n", cold);
END_DEBUG
if (cold)
sbp_cold++;
sbp = device_get_softc(dev);
sbp->fd.dev = dev;
sbp->fd.fc = fc = device_get_ivars(dev);
mtx_init(&sbp->mtx, "sbp", NULL, MTX_DEF);
if (max_speed < 0)
max_speed = fc->speed;
error = bus_dma_tag_create(/*parent*/fc->dmat,
/* XXX should be 4 for a sane backend? */
/*alignment*/1,
/*boundary*/0,
/*lowaddr*/BUS_SPACE_MAXADDR_32BIT,
/*highaddr*/BUS_SPACE_MAXADDR,
/*filter*/NULL, /*filterarg*/NULL,
/*maxsize*/0x100000, /*nsegments*/SBP_IND_MAX,
/*maxsegsz*/SBP_SEG_MAX,
/*flags*/BUS_DMA_ALLOCNOW,
/*lockfunc*/busdma_lock_mutex,
/*lockarg*/&sbp->mtx,
&sbp->dmat);
if (error != 0) {
printf("sbp_attach: Could not allocate DMA tag "
"- error %d\n", error);
return (ENOMEM);
}
devq = cam_simq_alloc(/*maxopenings*/SBP_NUM_OCB);
if (devq == NULL)
return (ENXIO);
for (i = 0; i < SBP_NUM_TARGETS; i++) {
sbp->targets[i].fwdev = NULL;
sbp->targets[i].luns = NULL;
sbp->targets[i].sbp = sbp;
}
sbp->sim = cam_sim_alloc(sbp_action, sbp_poll, "sbp", sbp,
device_get_unit(dev),
&sbp->mtx,
/*untagged*/ 1,
/*tagged*/ SBP_QUEUE_LEN - 1,
devq);
if (sbp->sim == NULL) {
cam_simq_free(devq);
return (ENXIO);
}
SBP_LOCK(sbp);
if (xpt_bus_register(sbp->sim, dev, /*bus*/0) != CAM_SUCCESS)
goto fail;
if (xpt_create_path(&sbp->path, NULL, cam_sim_path(sbp->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_bus_deregister(cam_sim_path(sbp->sim));
goto fail;
}
SBP_UNLOCK(sbp);
/* We reserve a 16-bit address space (4 bytes x 64 targets x 256 LUNs) */
sbp->fwb.start = ((u_int64_t)SBP_BIND_HI << 32) | SBP_DEV2ADDR(0, 0);
sbp->fwb.end = sbp->fwb.start + 0xffff;
/* pre-allocate xfer */
STAILQ_INIT(&sbp->fwb.xferlist);
fw_xferlist_add(&sbp->fwb.xferlist, M_SBP,
/*send*/ 0, /*recv*/ SBP_RECV_LEN, SBP_NUM_OCB/2,
fc, (void *)sbp, sbp_recv);
fw_bindadd(fc, &sbp->fwb);
sbp->fd.post_busreset = sbp_post_busreset;
sbp->fd.post_explore = sbp_post_explore;
if (fc->status != -1) {
sbp_post_busreset(sbp);
sbp_post_explore(sbp);
}
SBP_LOCK(sbp);
xpt_async(AC_BUS_RESET, sbp->path, /*arg*/ NULL);
SBP_UNLOCK(sbp);
return (0);
fail:
SBP_UNLOCK(sbp);
cam_sim_free(sbp->sim, /*free_devq*/TRUE);
return (ENXIO);
}
static int
sbp_logout_all(struct sbp_softc *sbp)
{
struct sbp_target *target;
struct sbp_dev *sdev;
int i, j;
SBP_DEBUG(0)
printf("sbp_logout_all\n");
END_DEBUG
SBP_LOCK_ASSERT(sbp);
for (i = 0; i < SBP_NUM_TARGETS; i++) {
target = &sbp->targets[i];
if (target->luns == NULL)
continue;
for (j = 0; j < target->num_lun; j++) {
sdev = target->luns[j];
if (sdev == NULL)
continue;
callout_stop(&sdev->login_callout);
if (sdev->status >= SBP_DEV_TOATTACH &&
sdev->status <= SBP_DEV_ATTACHED)
sbp_mgm_orb(sdev, ORB_FUN_LGO, NULL);
}
}
return 0;
}
static int
sbp_shutdown(device_t dev)
{
struct sbp_softc *sbp = ((struct sbp_softc *)device_get_softc(dev));
SBP_LOCK(sbp);
sbp_logout_all(sbp);
SBP_UNLOCK(sbp);
return (0);
}
static void
sbp_free_sdev(struct sbp_dev *sdev)
{
struct sbp_softc *sbp;
int i;
if (sdev == NULL)
return;
sbp = sdev->target->sbp;
SBP_UNLOCK(sbp);
callout_drain(&sdev->login_callout);
for (i = 0; i < SBP_QUEUE_LEN; i++) {
callout_drain(&sdev->ocb[i].timer);
bus_dmamap_destroy(sbp->dmat, sdev->ocb[i].dmamap);
}
fwdma_free(sbp->fd.fc, &sdev->dma);
free(sdev, M_SBP);
SBP_LOCK(sbp);
}
static void
sbp_free_target(struct sbp_target *target)
{
struct sbp_softc *sbp;
struct fw_xfer *xfer, *next;
int i;
if (target->luns == NULL)
return;
sbp = target->sbp;
SBP_LOCK_ASSERT(sbp);
SBP_UNLOCK(sbp);
callout_drain(&target->mgm_ocb_timeout);
callout_drain(&target->scan_callout);
SBP_LOCK(sbp);
for (i = 0; i < target->num_lun; i++)
sbp_free_sdev(target->luns[i]);
STAILQ_FOREACH_SAFE(xfer, &target->xferlist, link, next) {
fw_xfer_free_buf(xfer);
}
STAILQ_INIT(&target->xferlist);
free(target->luns, M_SBP);
target->num_lun = 0;
target->luns = NULL;
target->fwdev = NULL;
}
static int
sbp_detach(device_t dev)
{
struct sbp_softc *sbp = ((struct sbp_softc *)device_get_softc(dev));
struct firewire_comm *fc = sbp->fd.fc;
int i;
SBP_DEBUG(0)
printf("sbp_detach\n");
END_DEBUG
SBP_LOCK(sbp);
for (i = 0; i < SBP_NUM_TARGETS; i++)
sbp_cam_detach_target(&sbp->targets[i]);
xpt_async(AC_LOST_DEVICE, sbp->path, NULL);
xpt_free_path(sbp->path);
xpt_bus_deregister(cam_sim_path(sbp->sim));
cam_sim_free(sbp->sim, /*free_devq*/ TRUE);
sbp_logout_all(sbp);
SBP_UNLOCK(sbp);
/* XXX wait for logout completion */
pause("sbpdtc", hz/2);
SBP_LOCK(sbp);
for (i = 0; i < SBP_NUM_TARGETS; i++)
sbp_free_target(&sbp->targets[i]);
SBP_UNLOCK(sbp);
fw_bindremove(fc, &sbp->fwb);
fw_xferlist_remove(&sbp->fwb.xferlist);
bus_dma_tag_destroy(sbp->dmat);
mtx_destroy(&sbp->mtx);
return (0);
}
static void
sbp_cam_detach_sdev(struct sbp_dev *sdev)
{
if (sdev == NULL)
return;
if (sdev->status == SBP_DEV_DEAD)
return;
if (sdev->status == SBP_DEV_RESET)
return;
SBP_LOCK_ASSERT(sdev->target->sbp);
sbp_abort_all_ocbs(sdev, CAM_DEV_NOT_THERE);
if (sdev->path) {
xpt_release_devq(sdev->path,
sdev->freeze, TRUE);
sdev->freeze = 0;
xpt_async(AC_LOST_DEVICE, sdev->path, NULL);
xpt_free_path(sdev->path);
sdev->path = NULL;
}
}
static void
sbp_cam_detach_target(struct sbp_target *target)
{
int i;
SBP_LOCK_ASSERT(target->sbp);
if (target->luns != NULL) {
SBP_DEBUG(0)
printf("sbp_detach_target %d\n", target->target_id);
END_DEBUG
callout_stop(&target->scan_callout);
for (i = 0; i < target->num_lun; i++)
sbp_cam_detach_sdev(target->luns[i]);
}
}
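/*
 * Freeze and abort outstanding commands on every LUN of the target,
 * then issue either a TARGET RESET management ORB (method 1) or a
 * RESET_START register write (method 2).
 */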
static void
sbp_target_reset(struct sbp_dev *sdev, int method)
{
int i;
struct sbp_target *target = sdev->target;
struct sbp_dev *tsdev;
SBP_LOCK_ASSERT(target->sbp);
for (i = 0; i < target->num_lun; i++) {
tsdev = target->luns[i];
if (tsdev == NULL)
continue;
if (tsdev->status == SBP_DEV_DEAD)
continue;
if (tsdev->status == SBP_DEV_RESET)
continue;
xpt_freeze_devq(tsdev->path, 1);
tsdev->freeze++;
sbp_abort_all_ocbs(tsdev, CAM_CMD_TIMEOUT);
if (method == 2)
tsdev->status = SBP_DEV_LOGIN;
}
switch (method) {
case 1:
printf("target reset\n");
sbp_mgm_orb(sdev, ORB_FUN_RST, NULL);
break;
case 2:
printf("reset start\n");
sbp_reset_start(sdev);
break;
}
}
static void
sbp_mgm_timeout(void *arg)
{
struct sbp_ocb *ocb = (struct sbp_ocb *)arg;
struct sbp_dev *sdev = ocb->sdev;
struct sbp_target *target = sdev->target;
SBP_LOCK_ASSERT(target->sbp);
device_printf(sdev->target->sbp->fd.dev,
"%s:%s request timeout(mgm orb:0x%08x)\n",
__func__, sdev->bustgtlun, (uint32_t)ocb->bus_addr);
target->mgm_ocb_cur = NULL;
sbp_free_ocb(sdev, ocb);
#if 0
/* XXX */
printf("run next request\n");
sbp_mgm_orb(sdev, ORB_FUN_RUNQUEUE, NULL);
#endif
device_printf(sdev->target->sbp->fd.dev,
"%s:%s reset start\n",
__func__, sdev->bustgtlun);
sbp_reset_start(sdev);
}
static void
sbp_timeout(void *arg)
{
struct sbp_ocb *ocb = (struct sbp_ocb *)arg;
struct sbp_dev *sdev = ocb->sdev;
device_printf(sdev->target->sbp->fd.dev,
"%s:%s request timeout(cmd orb:0x%08x) ... ",
__func__, sdev->bustgtlun, (uint32_t)ocb->bus_addr);
SBP_LOCK_ASSERT(sdev->target->sbp);
sdev->timeout++;
switch (sdev->timeout) {
case 1:
printf("agent reset\n");
xpt_freeze_devq(sdev->path, 1);
sdev->freeze++;
sbp_abort_all_ocbs(sdev, CAM_CMD_TIMEOUT);
sbp_agent_reset(sdev);
break;
case 2:
case 3:
sbp_target_reset(sdev, sdev->timeout - 1);
break;
#if 0
default:
/* XXX give up */
sbp_cam_detach_target(target);
if (target->luns != NULL)
free(target->luns, M_SBP);
target->num_lun = 0;
target->luns = NULL;
target->fwdev = NULL;
#endif
}
}
static void
sbp_action(struct cam_sim *sim, union ccb *ccb)
{
struct sbp_softc *sbp = (struct sbp_softc *)sim->softc;
struct sbp_target *target = NULL;
struct sbp_dev *sdev = NULL;
if (sbp != NULL)
SBP_LOCK_ASSERT(sbp);
/* target:lun -> sdev mapping */
if (sbp != NULL
&& ccb->ccb_h.target_id != CAM_TARGET_WILDCARD
&& ccb->ccb_h.target_id < SBP_NUM_TARGETS) {
target = &sbp->targets[ccb->ccb_h.target_id];
if (target->fwdev != NULL
&& ccb->ccb_h.target_lun != CAM_LUN_WILDCARD
&& ccb->ccb_h.target_lun < target->num_lun) {
sdev = target->luns[ccb->ccb_h.target_lun];
if (sdev != NULL && sdev->status != SBP_DEV_ATTACHED &&
sdev->status != SBP_DEV_PROBE)
sdev = NULL;
}
}
SBP_DEBUG(1)
if (sdev == NULL)
printf("invalid target %d lun %jx\n",
ccb->ccb_h.target_id, (uintmax_t)ccb->ccb_h.target_lun);
END_DEBUG
switch (ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
case XPT_RESET_DEV:
case XPT_GET_TRAN_SETTINGS:
case XPT_SET_TRAN_SETTINGS:
case XPT_CALC_GEOMETRY:
if (sdev == NULL) {
SBP_DEBUG(1)
printf("%s:%d:%jx:func_code 0x%04x: "
"Invalid target (target needed)\n",
device_get_nameunit(sbp->fd.dev),
ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun,
ccb->ccb_h.func_code);
END_DEBUG
ccb->ccb_h.status = CAM_DEV_NOT_THERE;
xpt_done(ccb);
return;
}
break;
case XPT_PATH_INQ:
case XPT_NOOP:
/* These opcodes are sometimes aimed at a target (sc is valid),
 * sometimes at the SIM itself (sc is invalid and the target is
 * CAM_TARGET_WILDCARD).
 */
if (sbp == NULL &&
ccb->ccb_h.target_id != CAM_TARGET_WILDCARD) {
SBP_DEBUG(0)
printf("%s:%d:%jx func_code 0x%04x: "
"Invalid target (no wildcard)\n",
device_get_nameunit(sbp->fd.dev),
ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun,
ccb->ccb_h.func_code);
END_DEBUG
ccb->ccb_h.status = CAM_DEV_NOT_THERE;
xpt_done(ccb);
return;
}
break;
default:
/* XXX Hm, we should check the input parameters */
break;
}
switch (ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
{
struct ccb_scsiio *csio;
struct sbp_ocb *ocb;
int speed;
void *cdb;
csio = &ccb->csio;
mtx_assert(sim->mtx, MA_OWNED);
SBP_DEBUG(2)
printf("%s:%d:%jx XPT_SCSI_IO: "
"cmd: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x"
", flags: 0x%02x, "
"%db cmd/%db data/%db sense\n",
device_get_nameunit(sbp->fd.dev),
ccb->ccb_h.target_id, (uintmax_t)ccb->ccb_h.target_lun,
csio->cdb_io.cdb_bytes[0],
csio->cdb_io.cdb_bytes[1],
csio->cdb_io.cdb_bytes[2],
csio->cdb_io.cdb_bytes[3],
csio->cdb_io.cdb_bytes[4],
csio->cdb_io.cdb_bytes[5],
csio->cdb_io.cdb_bytes[6],
csio->cdb_io.cdb_bytes[7],
csio->cdb_io.cdb_bytes[8],
csio->cdb_io.cdb_bytes[9],
ccb->ccb_h.flags & CAM_DIR_MASK,
csio->cdb_len, csio->dxfer_len,
csio->sense_len);
END_DEBUG
if (sdev == NULL) {
ccb->ccb_h.status = CAM_DEV_NOT_THERE;
xpt_done(ccb);
return;
}
if (csio->cdb_len > sizeof(ocb->orb) - 5 * sizeof(uint32_t)) {
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
return;
}
#if 0
/* if we are in probe stage, pass only probe commands */
if (sdev->status == SBP_DEV_PROBE) {
char *name;
name = xpt_path_periph(ccb->ccb_h.path)->periph_name;
printf("probe stage, periph name: %s\n", name);
if (strcmp(name, "probe") != 0) {
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
return;
}
}
#endif
if ((ocb = sbp_get_ocb(sdev)) == NULL) {
ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
if (sdev->freeze == 0) {
xpt_freeze_devq(sdev->path, 1);
sdev->freeze++;
}
xpt_done(ccb);
return;
}
ocb->flags = OCB_ACT_CMD;
ocb->sdev = sdev;
ocb->ccb = ccb;
ccb->ccb_h.ccb_sdev_ptr = sdev;
ocb->orb[0] = htonl(1U << 31);
ocb->orb[1] = 0;
ocb->orb[2] = htonl(((sbp->fd.fc->nodeid | FWLOCALBUS) << 16));
ocb->orb[3] = htonl(ocb->bus_addr + IND_PTR_OFFSET);
speed = min(target->fwdev->speed, max_speed);
ocb->orb[4] = htonl(ORB_NOTIFY | ORB_CMD_SPD(speed)
| ORB_CMD_MAXP(speed + 7));
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
ocb->orb[4] |= htonl(ORB_CMD_IN);
}
if (csio->ccb_h.flags & CAM_CDB_POINTER)
cdb = (void *)csio->cdb_io.cdb_ptr;
else
cdb = (void *)&csio->cdb_io.cdb_bytes;
bcopy(cdb, (void *)&ocb->orb[5], csio->cdb_len);
/*
printf("ORB %08x %08x %08x %08x\n", ntohl(ocb->orb[0]), ntohl(ocb->orb[1]), ntohl(ocb->orb[2]), ntohl(ocb->orb[3]));
printf("ORB %08x %08x %08x %08x\n", ntohl(ocb->orb[4]), ntohl(ocb->orb[5]), ntohl(ocb->orb[6]), ntohl(ocb->orb[7]));
*/
if (ccb->csio.dxfer_len > 0) {
int error;
error = bus_dmamap_load_ccb(/*dma tag*/sbp->dmat,
/*dma map*/ocb->dmamap,
ccb,
sbp_execute_ocb,
ocb,
/*flags*/0);
if (error)
printf("sbp: bus_dmamap_load error %d\n", error);
} else
sbp_execute_ocb(ocb, NULL, 0, 0);
break;
}
case XPT_CALC_GEOMETRY:
{
struct ccb_calc_geometry *ccg;
ccg = &ccb->ccg;
if (ccg->block_size == 0) {
printf("sbp_action: block_size is 0.\n");
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
break;
}
SBP_DEBUG(1)
printf("%s:%d:%d:%jx:XPT_CALC_GEOMETRY: "
"Volume size = %jd\n",
device_get_nameunit(sbp->fd.dev),
cam_sim_path(sbp->sim),
ccb->ccb_h.target_id, (uintmax_t)ccb->ccb_h.target_lun,
(uintmax_t)ccg->volume_size);
END_DEBUG
cam_calc_geometry(ccg, /*extended*/1);
xpt_done(ccb);
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
{
SBP_DEBUG(1)
printf("%s:%d:XPT_RESET_BUS: \n",
device_get_nameunit(sbp->fd.dev), cam_sim_path(sbp->sim));
END_DEBUG
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
break;
}
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
SBP_DEBUG(1)
printf("%s:%d:%jx XPT_PATH_INQ:.\n",
device_get_nameunit(sbp->fd.dev),
ccb->ccb_h.target_id, (uintmax_t)ccb->ccb_h.target_lun);
END_DEBUG
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_TAG_ABLE;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_NOBUSRESET | PIM_NO_6_BYTE;
cpi->hba_eng_cnt = 0;
cpi->max_target = SBP_NUM_TARGETS - 1;
cpi->max_lun = SBP_NUM_LUNS - 1;
cpi->initiator_id = SBP_INITIATOR;
cpi->bus_id = sim->bus_id;
cpi->base_transfer_speed = 400 * 1000 / 8;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "SBP", HBA_IDLEN);
strlcpy(cpi->dev_name, sim->sim_name, DEV_IDLEN);
cpi->unit_number = sim->unit_number;
cpi->transport = XPORT_SPI; /* XXX should be a FireWire transport */
cpi->transport_version = 2;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_2;
cpi->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ccb_trans_settings_scsi *scsi =
&cts->proto_specific.scsi;
struct ccb_trans_settings_spi *spi =
&cts->xport_specific.spi;
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_2;
cts->transport = XPORT_SPI; /* XXX should be a FireWire transport */
cts->transport_version = 2;
spi->valid = CTS_SPI_VALID_DISC;
spi->flags = CTS_SPI_FLAGS_DISC_ENB;
scsi->valid = CTS_SCSI_VALID_TQ;
scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
SBP_DEBUG(1)
printf("%s:%d:%jx XPT_GET_TRAN_SETTINGS:.\n",
device_get_nameunit(sbp->fd.dev),
ccb->ccb_h.target_id, (uintmax_t)ccb->ccb_h.target_lun);
END_DEBUG
cts->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_ABORT:
ccb->ccb_h.status = CAM_UA_ABORT;
xpt_done(ccb);
break;
case XPT_SET_TRAN_SETTINGS:
/* XXX */
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
break;
}
return;
}
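/*
 * bus_dmamap_load_ccb() callback: fill the data descriptor of the ORB
 * with either a direct pointer (single segment) or a page table, then
 * enqueue the OCB and ring the doorbell or write the ORB pointer.
 */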
static void
sbp_execute_ocb(void *arg, bus_dma_segment_t *segments, int seg, int error)
{
int i;
struct sbp_ocb *ocb;
struct sbp_ocb *prev;
bus_dma_segment_t *s;
if (error)
printf("sbp_execute_ocb: error=%d\n", error);
ocb = (struct sbp_ocb *)arg;
SBP_DEBUG(2)
printf("sbp_execute_ocb: seg %d", seg);
for (i = 0; i < seg; i++)
printf(", %jx:%jd", (uintmax_t)segments[i].ds_addr,
(uintmax_t)segments[i].ds_len);
printf("\n");
END_DEBUG
if (seg == 1) {
/* direct pointer */
s = &segments[0];
if (s->ds_len > SBP_SEG_MAX)
panic("ds_len > SBP_SEG_MAX, fix busdma code");
ocb->orb[3] = htonl(s->ds_addr);
ocb->orb[4] |= htonl(s->ds_len);
} else if (seg > 1) {
/* page table */
for (i = 0; i < seg; i++) {
s = &segments[i];
SBP_DEBUG(0)
/* XXX LSI Logic "< 16 byte" bug might be hit */
if (s->ds_len < 16)
printf("sbp_execute_ocb: warning, "
"segment length(%zd) is less than 16."
"(seg=%d/%d)\n", (size_t)s->ds_len, i + 1, seg);
END_DEBUG
if (s->ds_len > SBP_SEG_MAX)
panic("ds_len > SBP_SEG_MAX, fix busdma code");
ocb->ind_ptr[i].hi = htonl(s->ds_len << 16);
ocb->ind_ptr[i].lo = htonl(s->ds_addr);
}
ocb->orb[4] |= htonl(ORB_CMD_PTBL | seg);
}
if (seg > 0)
bus_dmamap_sync(ocb->sdev->target->sbp->dmat, ocb->dmamap,
(ntohl(ocb->orb[4]) & ORB_CMD_IN) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
prev = sbp_enqueue_ocb(ocb->sdev, ocb);
fwdma_sync(&ocb->sdev->dma, BUS_DMASYNC_PREWRITE);
if (use_doorbell) {
if (prev == NULL) {
if (ocb->sdev->last_ocb != NULL)
sbp_doorbell(ocb->sdev);
else
sbp_orb_pointer(ocb->sdev, ocb);
}
} else {
if (prev == NULL || (ocb->sdev->flags & ORB_LINK_DEAD) != 0) {
ocb->sdev->flags &= ~ORB_LINK_DEAD;
sbp_orb_pointer(ocb->sdev, ocb);
}
}
}
static void
sbp_poll(struct cam_sim *sim)
{
struct sbp_softc *sbp;
struct firewire_comm *fc;
sbp = (struct sbp_softc *)sim->softc;
fc = sbp->fd.fc;
fc->poll(fc, 0, -1);
return;
}
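/*
 * Find and remove the OCB matching the received status block; stop its
 * timeout, sync and unload its DMA map, and keep the fetch agent fed
 * (or mark the ORB link dead) depending on the doorbell mode.
 */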
static struct sbp_ocb *
sbp_dequeue_ocb(struct sbp_dev *sdev, struct sbp_status *sbp_status)
{
struct sbp_ocb *ocb;
struct sbp_ocb *next;
int order = 0;
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s 0x%08x src %d\n",
__func__, sdev->bustgtlun, ntohl(sbp_status->orb_lo), sbp_status->src);
END_DEBUG
SBP_LOCK_ASSERT(sdev->target->sbp);
STAILQ_FOREACH_SAFE(ocb, &sdev->ocbs, ocb, next) {
if (OCB_MATCH(ocb, sbp_status)) {
/* found */
STAILQ_REMOVE(&sdev->ocbs, ocb, sbp_ocb, ocb);
if (ocb->ccb != NULL)
callout_stop(&ocb->timer);
if (ntohl(ocb->orb[4]) & 0xffff) {
bus_dmamap_sync(sdev->target->sbp->dmat,
ocb->dmamap,
(ntohl(ocb->orb[4]) & ORB_CMD_IN) ?
BUS_DMASYNC_POSTREAD :
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sdev->target->sbp->dmat,
ocb->dmamap);
}
if (!use_doorbell) {
if (sbp_status->src == SRC_NO_NEXT) {
if (next != NULL)
sbp_orb_pointer(sdev, next);
else if (order > 0) {
/*
 * Unordered execution:
 * we need to send the pointer for
 * the next ORB.
 */
sdev->flags |= ORB_LINK_DEAD;
}
}
} else {
/*
* XXX this is not correct for unordered
* execution.
*/
if (sdev->last_ocb != NULL) {
sbp_free_ocb(sdev, sdev->last_ocb);
}
sdev->last_ocb = ocb;
if (next != NULL &&
sbp_status->src == SRC_NO_NEXT)
sbp_doorbell(sdev);
}
break;
} else
order++;
}
SBP_DEBUG(0)
if (ocb && order > 0) {
device_printf(sdev->target->sbp->fd.dev,
"%s:%s unordered execution order:%d\n",
__func__, sdev->bustgtlun, order);
}
END_DEBUG
return (ocb);
}
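/*
 * Append the OCB to the device's queue, arm its timeout, and link it
 * from the previous ORB's next_ORB pointer; returns the previous tail
 * so the caller knows whether the fetch agent must be (re)started.
 */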
static struct sbp_ocb *
sbp_enqueue_ocb(struct sbp_dev *sdev, struct sbp_ocb *ocb)
{
struct sbp_ocb *prev, *prev2;
SBP_LOCK_ASSERT(sdev->target->sbp);
SBP_DEBUG(1)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s 0x%08jx\n", __func__, sdev->bustgtlun, (uintmax_t)ocb->bus_addr);
END_DEBUG
prev2 = prev = STAILQ_LAST(&sdev->ocbs, sbp_ocb, ocb);
STAILQ_INSERT_TAIL(&sdev->ocbs, ocb, ocb);
if (ocb->ccb != NULL) {
callout_reset_sbt(&ocb->timer,
SBT_1MS * ocb->ccb->ccb_h.timeout, 0, sbp_timeout,
ocb, 0);
}
if (use_doorbell && prev == NULL)
prev2 = sdev->last_ocb;
if (prev2 != NULL && (ocb->sdev->flags & ORB_LINK_DEAD) == 0) {
SBP_DEBUG(1)
printf("linking chain 0x%jx -> 0x%jx\n",
(uintmax_t)prev2->bus_addr, (uintmax_t)ocb->bus_addr);
END_DEBUG
/*
 * Suppress compiler reordering so that orb[1] is written before orb[0].
 * XXX We may need an explicit memory barrier on architectures
 * other than i386/amd64.
 */
*(volatile uint32_t *)&prev2->orb[1] = htonl(ocb->bus_addr);
*(volatile uint32_t *)&prev2->orb[0] = 0;
}
return prev;
}
static struct sbp_ocb *
sbp_get_ocb(struct sbp_dev *sdev)
{
struct sbp_ocb *ocb;
SBP_LOCK_ASSERT(sdev->target->sbp);
ocb = STAILQ_FIRST(&sdev->free_ocbs);
if (ocb == NULL) {
sdev->flags |= ORB_SHORTAGE;
printf("ocb shortage!!!\n");
return NULL;
}
STAILQ_REMOVE_HEAD(&sdev->free_ocbs, ocb);
ocb->ccb = NULL;
return (ocb);
}
static void
sbp_free_ocb(struct sbp_dev *sdev, struct sbp_ocb *ocb)
{
ocb->flags = 0;
ocb->ccb = NULL;
SBP_LOCK_ASSERT(sdev->target->sbp);
STAILQ_INSERT_TAIL(&sdev->free_ocbs, ocb, ocb);
if ((sdev->flags & ORB_SHORTAGE) != 0) {
int count;
sdev->flags &= ~ORB_SHORTAGE;
count = sdev->freeze;
sdev->freeze = 0;
xpt_release_devq(sdev->path, count, TRUE);
}
}
static void
sbp_abort_ocb(struct sbp_ocb *ocb, int status)
{
struct sbp_dev *sdev;
sdev = ocb->sdev;
SBP_LOCK_ASSERT(sdev->target->sbp);
SBP_DEBUG(0)
device_printf(sdev->target->sbp->fd.dev,
"%s:%s 0x%jx\n", __func__, sdev->bustgtlun, (uintmax_t)ocb->bus_addr);
END_DEBUG
SBP_DEBUG(1)
if (ocb->ccb != NULL)
sbp_print_scsi_cmd(ocb);
END_DEBUG
if (ntohl(ocb->orb[4]) & 0xffff) {
bus_dmamap_sync(sdev->target->sbp->dmat, ocb->dmamap,
(ntohl(ocb->orb[4]) & ORB_CMD_IN) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sdev->target->sbp->dmat, ocb->dmamap);
}
if (ocb->ccb != NULL) {
callout_stop(&ocb->timer);
ocb->ccb->ccb_h.status = status;
xpt_done(ocb->ccb);
}
sbp_free_ocb(sdev, ocb);
}
static void
sbp_abort_all_ocbs(struct sbp_dev *sdev, int status)
{
struct sbp_ocb *ocb, *next;
STAILQ_HEAD(, sbp_ocb) temp;
STAILQ_INIT(&temp);
SBP_LOCK_ASSERT(sdev->target->sbp);
STAILQ_CONCAT(&temp, &sdev->ocbs);
STAILQ_INIT(&sdev->ocbs);
STAILQ_FOREACH_SAFE(ocb, &temp, ocb, next) {
sbp_abort_ocb(ocb, status);
}
if (sdev->last_ocb != NULL) {
sbp_free_ocb(sdev, sdev->last_ocb);
sdev->last_ocb = NULL;
}
}
static devclass_t sbp_devclass;
static device_method_t sbp_methods[] = {
/* device interface */
DEVMETHOD(device_identify, sbp_identify),
DEVMETHOD(device_probe, sbp_probe),
DEVMETHOD(device_attach, sbp_attach),
DEVMETHOD(device_detach, sbp_detach),
DEVMETHOD(device_shutdown, sbp_shutdown),
{ 0, 0 }
};
static driver_t sbp_driver = {
"sbp",
sbp_methods,
sizeof(struct sbp_softc),
};
DRIVER_MODULE(sbp, firewire, sbp_driver, sbp_devclass, 0, 0);
MODULE_VERSION(sbp, 1);
MODULE_DEPEND(sbp, firewire, 1, 1, 1);
MODULE_DEPEND(sbp, cam, 1, 1, 1);
diff --git a/sys/dev/flash/cqspi.c b/sys/dev/flash/cqspi.c
index 4c2bc1a75bc9..54752792798d 100644
--- a/sys/dev/flash/cqspi.c
+++ b/sys/dev/flash/cqspi.c
@@ -1,769 +1,769 @@
/*-
* Copyright (c) 2017-2018 Ruslan Bukin <br@bsdpad.com>
* All rights reserved.
*
* This software was developed by SRI International and the University of
* Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237
* ("CTSRD"), as part of the DARPA CRASH research programme.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Cadence Quad SPI Flash Controller driver.
* Only 4B-addressing mode is supported.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <geom/geom_disk.h>
#include <machine/bus.h>
#include <dev/fdt/simplebus.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/openfirm.h>
#include <dev/flash/cqspi.h>
#include <dev/flash/mx25lreg.h>
#include <dev/xdma/xdma.h>
#include "qspi_if.h"
#define CQSPI_DEBUG
#undef CQSPI_DEBUG
#ifdef CQSPI_DEBUG
#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif
#define CQSPI_SECTORSIZE 512
#define TX_QUEUE_SIZE 16
#define RX_QUEUE_SIZE 16
#define READ4(_sc, _reg) bus_read_4((_sc)->res[0], _reg)
#define READ2(_sc, _reg) bus_read_2((_sc)->res[0], _reg)
#define READ1(_sc, _reg) bus_read_1((_sc)->res[0], _reg)
#define WRITE4(_sc, _reg, _val) bus_write_4((_sc)->res[0], _reg, _val)
#define WRITE2(_sc, _reg, _val) bus_write_2((_sc)->res[0], _reg, _val)
#define WRITE1(_sc, _reg, _val) bus_write_1((_sc)->res[0], _reg, _val)
#define READ_DATA_4(_sc, _reg) bus_read_4((_sc)->res[1], _reg)
#define READ_DATA_1(_sc, _reg) bus_read_1((_sc)->res[1], _reg)
#define WRITE_DATA_4(_sc, _reg, _val) bus_write_4((_sc)->res[1], _reg, _val)
#define WRITE_DATA_1(_sc, _reg, _val) bus_write_1((_sc)->res[1], _reg, _val)
struct cqspi_softc {
device_t dev;
struct resource *res[3];
bus_space_tag_t bst;
bus_space_handle_t bsh;
void *ih;
uint8_t read_op_done;
uint8_t write_op_done;
uint32_t fifo_depth;
uint32_t fifo_width;
uint32_t trigger_address;
uint32_t sram_phys;
/* xDMA */
xdma_controller_t *xdma_tx;
xdma_channel_t *xchan_tx;
void *ih_tx;
xdma_controller_t *xdma_rx;
xdma_channel_t *xchan_rx;
void *ih_rx;
struct intr_config_hook config_intrhook;
struct mtx sc_mtx;
};
#define CQSPI_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define CQSPI_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define CQSPI_LOCK_INIT(_sc) \
mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->dev), \
"cqspi", MTX_DEF)
#define CQSPI_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx);
#define CQSPI_ASSERT_LOCKED(_sc) \
mtx_assert(&_sc->sc_mtx, MA_OWNED);
#define CQSPI_ASSERT_UNLOCKED(_sc) \
mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
static struct resource_spec cqspi_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_MEMORY, 1, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ -1, 0 }
};
static struct ofw_compat_data compat_data[] = {
{ "cdns,qspi-nor", 1 },
{ NULL, 0 },
};
static void
cqspi_intr(void *arg)
{
struct cqspi_softc *sc;
uint32_t pending;
sc = arg;
pending = READ4(sc, CQSPI_IRQSTAT);
dprintf("%s: IRQSTAT %x\n", __func__, pending);
if (pending & (IRQMASK_INDOPDONE | IRQMASK_INDXFRLVL |
IRQMASK_INDSRAMFULL)) {
/* TODO: PIO operation done */
}
WRITE4(sc, CQSPI_IRQSTAT, pending);
}
static int
cqspi_xdma_tx_intr(void *arg, xdma_transfer_status_t *status)
{
struct xdma_transfer_status st;
struct cqspi_softc *sc;
struct bio *bp;
int ret;
int deq;
sc = arg;
dprintf("%s\n", __func__);
deq = 0;
while (1) {
ret = xdma_dequeue_bio(sc->xchan_tx, &bp, &st);
if (ret != 0) {
break;
}
sc->write_op_done = 1;
deq++;
}
if (deq > 1)
device_printf(sc->dev,
"Warning: more than 1 tx bio dequeued\n");
wakeup(&sc->xdma_tx);
return (0);
}
static int
cqspi_xdma_rx_intr(void *arg, xdma_transfer_status_t *status)
{
struct xdma_transfer_status st;
struct cqspi_softc *sc;
struct bio *bp;
int ret;
int deq;
sc = arg;
dprintf("%s\n", __func__);
deq = 0;
while (1) {
ret = xdma_dequeue_bio(sc->xchan_rx, &bp, &st);
if (ret != 0) {
break;
}
sc->read_op_done = 1;
deq++;
}
if (deq > 1)
device_printf(sc->dev,
"Warning: more than 1 rx bio dequeued\n");
wakeup(&sc->xdma_rx);
return (0);
}
static int
cqspi_wait_for_completion(struct cqspi_softc *sc)
{
int timeout;
int i;
timeout = 10000;
for (i = timeout; i > 0; i--) {
if ((READ4(sc, CQSPI_FLASHCMD) & FLASHCMD_CMDEXECSTAT) == 0) {
break;
}
}
if (i == 0) {
device_printf(sc->dev, "%s: cmd timed out: %x\n",
__func__, READ4(sc, CQSPI_FLASHCMD));
return (-1);
}
return (0);
}
static int
cqspi_cmd_write_addr(struct cqspi_softc *sc, uint8_t cmd,
uint32_t addr, uint32_t len)
{
uint32_t reg;
int ret;
dprintf("%s: %x\n", __func__, cmd);
WRITE4(sc, CQSPI_FLASHCMDADDR, addr);
reg = (cmd << FLASHCMD_CMDOPCODE_S);
reg |= (FLASHCMD_ENCMDADDR);
reg |= ((len - 1) << FLASHCMD_NUMADDRBYTES_S);
WRITE4(sc, CQSPI_FLASHCMD, reg);
reg |= FLASHCMD_EXECCMD;
WRITE4(sc, CQSPI_FLASHCMD, reg);
ret = cqspi_wait_for_completion(sc);
return (ret);
}
static int
cqspi_cmd_write(struct cqspi_softc *sc, uint8_t cmd,
uint8_t *addr, uint32_t len)
{
uint32_t reg;
int ret;
reg = (cmd << FLASHCMD_CMDOPCODE_S);
WRITE4(sc, CQSPI_FLASHCMD, reg);
reg |= FLASHCMD_EXECCMD;
WRITE4(sc, CQSPI_FLASHCMD, reg);
ret = cqspi_wait_for_completion(sc);
return (ret);
}
static int
cqspi_cmd_read(struct cqspi_softc *sc, uint8_t cmd,
uint8_t *addr, uint32_t len)
{
uint32_t data;
uint32_t reg;
uint8_t *buf;
int ret;
int i;
if (len > 8) {
device_printf(sc->dev, "Failed to read data\n");
return (-1);
}
dprintf("%s: %x\n", __func__, cmd);
buf = (uint8_t *)addr;
reg = (cmd << FLASHCMD_CMDOPCODE_S);
reg |= ((len - 1) << FLASHCMD_NUMRDDATABYTES_S);
reg |= FLASHCMD_ENRDDATA;
WRITE4(sc, CQSPI_FLASHCMD, reg);
reg |= FLASHCMD_EXECCMD;
WRITE4(sc, CQSPI_FLASHCMD, reg);
ret = cqspi_wait_for_completion(sc);
if (ret != 0) {
device_printf(sc->dev, "%s: cmd failed: %x\n",
__func__, cmd);
return (ret);
}
data = READ4(sc, CQSPI_FLASHCMDRDDATALO);
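/*
 * Unpack the result from the low 32-bit read-data register, least
 * significant byte first. Only this register is consumed here, so
 * reads longer than four bytes (which the length check above still
 * allows) would also need the upper read-data register.
 */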
for (i = 0; i < len; i++)
buf[i] = (data >> (i * 8)) & 0xff;
return (0);
}
static int
cqspi_wait_ready(struct cqspi_softc *sc)
{
uint8_t data;
int ret;
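/*
 * Poll the flash status register until the write-in-progress (WIP)
 * bit clears, indicating that the previous program or erase operation
 * has finished.
 */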
do {
ret = cqspi_cmd_read(sc, CMD_READ_STATUS, &data, 1);
} while (data & STATUS_WIP);
return (0);
}
static int
cqspi_write_reg(device_t dev, device_t child,
uint8_t opcode, uint8_t *addr, uint32_t len)
{
struct cqspi_softc *sc;
int ret;
sc = device_get_softc(dev);
ret = cqspi_cmd_write(sc, opcode, addr, len);
return (ret);
}
static int
cqspi_read_reg(device_t dev, device_t child,
uint8_t opcode, uint8_t *addr, uint32_t len)
{
struct cqspi_softc *sc;
int ret;
sc = device_get_softc(dev);
ret = cqspi_cmd_read(sc, opcode, addr, len);
return (ret);
}
static int
cqspi_wait_idle(struct cqspi_softc *sc)
{
uint32_t reg;
do {
reg = READ4(sc, CQSPI_CFG);
if (reg & CFG_IDLE) {
break;
}
} while (1);
return (0);
}
static int
cqspi_erase(device_t dev, device_t child, off_t offset)
{
struct cqspi_softc *sc;
int ret;
sc = device_get_softc(dev);
cqspi_wait_idle(sc);
cqspi_wait_ready(sc);
ret = cqspi_cmd_write(sc, CMD_WRITE_ENABLE, 0, 0);
cqspi_wait_idle(sc);
cqspi_wait_ready(sc);
ret = cqspi_cmd_write_addr(sc, CMD_QUAD_SECTOR_ERASE, offset, 4);
cqspi_wait_idle(sc);
return (0);
}
static int
cqspi_write(device_t dev, device_t child, struct bio *bp,
off_t offset, caddr_t data, off_t count)
{
struct cqspi_softc *sc;
uint32_t reg;
dprintf("%s: offset 0x%llx count %lld bytes\n",
__func__, offset, count);
sc = device_get_softc(dev);
cqspi_wait_ready(sc);
reg = cqspi_cmd_write(sc, CMD_WRITE_ENABLE, 0, 0);
cqspi_wait_idle(sc);
cqspi_wait_ready(sc);
cqspi_wait_idle(sc);
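/*
 * Indirect write: program the transfer length and flash start address,
 * point the TX xDMA channel at the controller SRAM window (sram_phys),
 * then start the operation and sleep until the xDMA completion handler
 * sets write_op_done.
 */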
reg = DMAPER_NUMSGLREQBYTES_4;
reg |= DMAPER_NUMBURSTREQBYTES_4;
WRITE4(sc, CQSPI_DMAPER, reg);
WRITE4(sc, CQSPI_INDWRWATER, 64);
WRITE4(sc, CQSPI_INDWR, INDRD_IND_OPS_DONE_STATUS);
WRITE4(sc, CQSPI_INDWR, 0);
WRITE4(sc, CQSPI_INDWRCNT, count);
WRITE4(sc, CQSPI_INDWRSTADDR, offset);
reg = (0 << DEVWR_DUMMYWRCLKS_S);
reg |= DEVWR_DATA_WIDTH_QUAD;
reg |= DEVWR_ADDR_WIDTH_SINGLE;
reg |= (CMD_QUAD_PAGE_PROGRAM << DEVWR_WROPCODE_S);
WRITE4(sc, CQSPI_DEVWR, reg);
reg = DEVRD_DATA_WIDTH_QUAD;
reg |= DEVRD_ADDR_WIDTH_SINGLE;
reg |= DEVRD_INST_WIDTH_SINGLE;
WRITE4(sc, CQSPI_DEVRD, reg);
xdma_enqueue_bio(sc->xchan_tx, &bp,
sc->sram_phys, 4, 4, XDMA_MEM_TO_DEV);
xdma_queue_submit(sc->xchan_tx);
sc->write_op_done = 0;
WRITE4(sc, CQSPI_INDWR, INDRD_START);
while (sc->write_op_done == 0)
tsleep(&sc->xdma_tx, PCATCH | PZERO, "spi", hz/2);
cqspi_wait_idle(sc);
return (0);
}
static int
cqspi_read(device_t dev, device_t child, struct bio *bp,
off_t offset, caddr_t data, off_t count)
{
struct cqspi_softc *sc;
uint32_t reg;
sc = device_get_softc(dev);
dprintf("%s: offset 0x%llx count %lld bytes\n",
__func__, offset, count);
cqspi_wait_idle(sc);
reg = DMAPER_NUMSGLREQBYTES_4;
reg |= DMAPER_NUMBURSTREQBYTES_4;
WRITE4(sc, CQSPI_DMAPER, reg);
WRITE4(sc, CQSPI_INDRDWATER, 64);
WRITE4(sc, CQSPI_INDRD, INDRD_IND_OPS_DONE_STATUS);
WRITE4(sc, CQSPI_INDRD, 0);
WRITE4(sc, CQSPI_INDRDCNT, count);
WRITE4(sc, CQSPI_INDRDSTADDR, offset);
reg = (0 << DEVRD_DUMMYRDCLKS_S);
reg |= DEVRD_DATA_WIDTH_QUAD;
reg |= DEVRD_ADDR_WIDTH_SINGLE;
reg |= DEVRD_INST_WIDTH_SINGLE;
reg |= DEVRD_ENMODEBITS;
reg |= (CMD_READ_4B_QUAD_OUTPUT << DEVRD_RDOPCODE_S);
WRITE4(sc, CQSPI_DEVRD, reg);
WRITE4(sc, CQSPI_MODEBIT, 0xff);
WRITE4(sc, CQSPI_IRQMASK, 0);
xdma_enqueue_bio(sc->xchan_rx, &bp, sc->sram_phys, 4, 4,
XDMA_DEV_TO_MEM);
xdma_queue_submit(sc->xchan_rx);
sc->read_op_done = 0;
WRITE4(sc, CQSPI_INDRD, INDRD_START);
while (sc->read_op_done == 0)
tsleep(&sc->xdma_rx, PCATCH | PZERO, "spi", hz/2);
cqspi_wait_idle(sc);
return (0);
}
static int
cqspi_init(struct cqspi_softc *sc)
{
pcell_t dts_value[1];
phandle_t node;
uint32_t reg;
int len;
device_printf(sc->dev, "Module ID %x\n",
READ4(sc, CQSPI_MODULEID));
if ((node = ofw_bus_get_node(sc->dev)) == -1) {
return (ENXIO);
}
if ((len = OF_getproplen(node, "cdns,fifo-depth")) <= 0) {
return (ENXIO);
}
OF_getencprop(node, "cdns,fifo-depth", dts_value, len);
sc->fifo_depth = dts_value[0];
if ((len = OF_getproplen(node, "cdns,fifo-width")) <= 0) {
return (ENXIO);
}
OF_getencprop(node, "cdns,fifo-width", dts_value, len);
sc->fifo_width = dts_value[0];
if ((len = OF_getproplen(node, "cdns,trigger-address")) <= 0) {
return (ENXIO);
}
OF_getencprop(node, "cdns,trigger-address", dts_value, len);
sc->trigger_address = dts_value[0];
/* Disable controller */
reg = READ4(sc, CQSPI_CFG);
reg &= ~(CFG_EN);
WRITE4(sc, CQSPI_CFG, reg);
reg = READ4(sc, CQSPI_DEVSZ);
reg &= ~(DEVSZ_NUMADDRBYTES_M);
reg |= ((4 - 1) - DEVSZ_NUMADDRBYTES_S);
WRITE4(sc, CQSPI_DEVSZ, reg);
WRITE4(sc, CQSPI_SRAMPART, sc->fifo_depth/2);
/* TODO: calculate baud rate and delay values. */
reg = READ4(sc, CQSPI_CFG);
/* Configure baud rate */
reg &= ~(CFG_BAUD_M);
reg |= CFG_BAUD12;
reg |= CFG_ENDMA;
WRITE4(sc, CQSPI_CFG, reg);
reg = (3 << DELAY_NSS_S);
reg |= (3 << DELAY_BTWN_S);
reg |= (1 << DELAY_AFTER_S);
reg |= (1 << DELAY_INIT_S);
WRITE4(sc, CQSPI_DELAY, reg);
reg = READ4(sc, CQSPI_RDDATACAP);
reg &= ~(RDDATACAP_DELAY_M);
reg |= (1 << RDDATACAP_DELAY_S);
WRITE4(sc, CQSPI_RDDATACAP, reg);
/* Enable controller */
reg = READ4(sc, CQSPI_CFG);
reg |= (CFG_EN);
WRITE4(sc, CQSPI_CFG, reg);
return (0);
}
static int
cqspi_add_devices(device_t dev)
{
phandle_t child, node;
device_t child_dev;
int error;
node = ofw_bus_get_node(dev);
for (child = OF_child(node); child != 0; child = OF_peer(child)) {
child_dev =
simplebus_add_device(dev, child, 0, NULL, -1, NULL);
if (child_dev == NULL) {
return (ENXIO);
}
error = device_probe_and_attach(child_dev);
if (error != 0) {
printf("can't probe and attach: %d\n", error);
}
}
return (0);
}
static void
cqspi_delayed_attach(void *arg)
{
struct cqspi_softc *sc;
sc = arg;
cqspi_add_devices(sc->dev);
bus_generic_attach(sc->dev);
config_intrhook_disestablish(&sc->config_intrhook);
}
static int
cqspi_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev)) {
return (ENXIO);
}
if (!ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
return (ENXIO);
}
device_set_desc(dev, "Cadence Quad SPI controller");
return (0);
}
static int
cqspi_attach(device_t dev)
{
struct cqspi_softc *sc;
uint32_t caps;
int error;
sc = device_get_softc(dev);
sc->dev = dev;
if (bus_alloc_resources(dev, cqspi_spec, sc->res)) {
device_printf(dev, "could not allocate resources\n");
return (ENXIO);
}
/* Memory interface */
sc->bst = rman_get_bustag(sc->res[0]);
sc->bsh = rman_get_bushandle(sc->res[0]);
sc->sram_phys = rman_get_start(sc->res[1]);
/* Setup interrupt handlers */
if (bus_setup_intr(sc->dev, sc->res[2], INTR_TYPE_BIO | INTR_MPSAFE,
NULL, cqspi_intr, sc, &sc->ih)) {
device_printf(sc->dev, "Unable to setup intr\n");
return (ENXIO);
}
CQSPI_LOCK_INIT(sc);
caps = 0;
/* Get xDMA controller. */
sc->xdma_tx = xdma_ofw_get(sc->dev, "tx");
if (sc->xdma_tx == NULL) {
device_printf(dev, "Can't find DMA controller.\n");
return (ENXIO);
}
sc->xdma_rx = xdma_ofw_get(sc->dev, "rx");
if (sc->xdma_rx == NULL) {
device_printf(dev, "Can't find DMA controller.\n");
return (ENXIO);
}
/* Alloc xDMA virtual channels. */
sc->xchan_tx = xdma_channel_alloc(sc->xdma_tx, caps);
if (sc->xchan_tx == NULL) {
device_printf(dev, "Can't alloc virtual DMA channel.\n");
return (ENXIO);
}
sc->xchan_rx = xdma_channel_alloc(sc->xdma_rx, caps);
if (sc->xchan_rx == NULL) {
device_printf(dev, "Can't alloc virtual DMA channel.\n");
return (ENXIO);
}
/* Setup xDMA interrupt handlers. */
error = xdma_setup_intr(sc->xchan_tx, 0, cqspi_xdma_tx_intr,
sc, &sc->ih_tx);
if (error) {
device_printf(sc->dev,
"Can't setup xDMA interrupt handler.\n");
return (ENXIO);
}
error = xdma_setup_intr(sc->xchan_rx, 0, cqspi_xdma_rx_intr,
sc, &sc->ih_rx);
if (error) {
device_printf(sc->dev,
"Can't setup xDMA interrupt handler.\n");
return (ENXIO);
}
- xdma_prep_sg(sc->xchan_tx, TX_QUEUE_SIZE, MAXPHYS, 8, 16, 0,
+ xdma_prep_sg(sc->xchan_tx, TX_QUEUE_SIZE, maxphys, 8, 16, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR);
- xdma_prep_sg(sc->xchan_rx, TX_QUEUE_SIZE, MAXPHYS, 8, 16, 0,
+ xdma_prep_sg(sc->xchan_rx, TX_QUEUE_SIZE, maxphys, 8, 16, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR);
cqspi_init(sc);
sc->config_intrhook.ich_func = cqspi_delayed_attach;
sc->config_intrhook.ich_arg = sc;
if (config_intrhook_establish(&sc->config_intrhook) != 0) {
device_printf(dev, "config_intrhook_establish failed\n");
return (ENOMEM);
}
return (0);
}
static int
cqspi_detach(device_t dev)
{
return (ENXIO);
}
static device_method_t cqspi_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, cqspi_probe),
DEVMETHOD(device_attach, cqspi_attach),
DEVMETHOD(device_detach, cqspi_detach),
/* Quad SPI Flash Interface */
DEVMETHOD(qspi_read_reg, cqspi_read_reg),
DEVMETHOD(qspi_write_reg, cqspi_write_reg),
DEVMETHOD(qspi_read, cqspi_read),
DEVMETHOD(qspi_write, cqspi_write),
DEVMETHOD(qspi_erase, cqspi_erase),
{ 0, 0 }
};
static devclass_t cqspi_devclass;
DEFINE_CLASS_1(cqspi, cqspi_driver, cqspi_methods,
sizeof(struct cqspi_softc), simplebus_driver);
DRIVER_MODULE(cqspi, simplebus, cqspi_driver, cqspi_devclass, 0, 0);
diff --git a/sys/dev/isci/scil/sci_controller_constants.h b/sys/dev/isci/scil/sci_controller_constants.h
index 04a47856de51..40f6b983601d 100644
--- a/sys/dev/isci/scil/sci_controller_constants.h
+++ b/sys/dev/isci/scil/sci_controller_constants.h
@@ -1,226 +1,226 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
* The full GNU General Public License is included in this distribution
* in the file called LICENSE.GPL.
*
* BSD LICENSE
*
* Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SCI_CONTROLLER_CONSTANTS_H_
#define _SCI_CONTROLLER_CONSTANTS_H_
#include <sys/param.h>
/**
* @file
*
* @brief This file contains constant values that change based on the type
* of core or framework being managed. These constants are exported
* in order to provide the user with information as to the bounds
* (i.e. how many) of specific objects.
*/
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#ifdef SCIC_SDS_4_ENABLED
#ifndef SCI_MAX_PHYS
/**
* This constant defines the maximum number of phy objects that can be
* supported for the SCU Driver Standard (SDS) library. This is tied
* directly to silicon capabilities.
*/
#define SCI_MAX_PHYS (4)
#endif
#ifndef SCI_MAX_PORTS
/**
* This constant defines the maximum number of port objects that can be
* supported for the SCU Driver Standard (SDS) library. This is tied
* directly to silicon capabilities.
*/
#define SCI_MAX_PORTS SCI_MAX_PHYS
#endif
#ifndef SCI_MIN_SMP_PHYS
/**
* This constant defines the minimum number of SMP phy objects that
* can be supported for a single expander level.
* This was determined by using 36 physical phys and room for 2 virtual
* phys.
*/
#define SCI_MIN_SMP_PHYS (38)
#endif
#ifndef SCI_MAX_SMP_PHYS
/**
* This constant defines the maximum number of SMP phy objects that
* can be supported for the SCU Driver Standard (SDS) library.
* This number can be increased if required.
*/
#define SCI_MAX_SMP_PHYS (384)
#endif
#ifndef SCI_MAX_REMOTE_DEVICES
/**
* This constant defines the maximum number of remote device objects that
* can be supported for the SCU Driver Standard (SDS) library. This is tied
* directly to silicon capabilities.
*/
#define SCI_MAX_REMOTE_DEVICES (256)
#endif
#ifndef SCI_MIN_REMOTE_DEVICES
/**
* This constant defines the minimum number of remote device objects that
* can be supported for the SCU Driver Standard (SDS) library. This # can
* be configured for minimum memory environments to any value less than
* SCI_MAX_REMOTE_DEVICES
*/
#define SCI_MIN_REMOTE_DEVICES (16)
#endif
#ifndef SCI_MAX_IO_REQUESTS
/**
* This constant defines the maximum number of IO request objects that
* can be supported for the SCU Driver Standard (SDS) library. This is tied
* directly to silicon capabilities.
*/
#define SCI_MAX_IO_REQUESTS (256)
#endif
#ifndef SCI_MIN_IO_REQUESTS
/**
* This constant defines the minimum number of IO request objects that
* can be supported for the SCU Driver Standard (SDS) library. This #
* can be configured for minimum memory environments to any value less
* than SCI_MAX_IO_REQUESTS.
*/
#define SCI_MIN_IO_REQUESTS (1)
#endif
#ifndef SCI_MAX_SCATTER_GATHER_ELEMENTS
/**
* This constant defines the maximum number of Scatter-Gather Elements
* to be used by any SCI component.
*
* Note: number of elements must be an even number, since descriptors
* posted to hardware always contain pairs of elements (with second
* element set to zeroes if not needed).
*/
-#define __MAXPHYS_ELEMENTS ((MAXPHYS / PAGE_SIZE) + 1)
+#define __MAXPHYS_ELEMENTS ((128 * 1024 / PAGE_SIZE) + 1)
#define SCI_MAX_SCATTER_GATHER_ELEMENTS ((__MAXPHYS_ELEMENTS + 1) & ~0x1)
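/*
 * Worked example, assuming a 4 KiB PAGE_SIZE: (128 * 1024 / 4096) + 1
 * yields 33 elements, and the rounding above gives 34, keeping the
 * element count even as required by the hardware descriptor pairing.
 */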
#endif
#ifndef SCI_MIN_SCATTER_GATHER_ELEMENTS
/**
* This constant defines the minimum number of Scatter-Gather Elements
* to be used by any SCI component.
*/
#define SCI_MIN_SCATTER_GATHER_ELEMENTS 1
#endif
#else // SCIC_SDS_4_ENABLED
#error "SCI Core configuration left unspecified (e.g. SCIC_SDS_4_ENABLED)"
#endif // SCIC_SDS_4_ENABLED
/**
* This constant defines the maximum number of PCI devices that can be supported
* by the driver.
*/
#define SCI_MAX_PCI_DEVICES (2)
/**
* This constant defines the maximum number of controllers that can
* occur in a single silicon package.
*/
#define SCI_MAX_CONTROLLERS_PER_PCI_DEVICE (2)
/**
* This constant defines the maximum number of controllers that can
* be supported by a library object. The user specified maximum controller
* count must be less than or equal to this number. This is a driver
* specific constant that is not tied to silicon capabilities.
*/
#if !defined(SCI_MAX_CONTROLLERS)
#define SCI_MAX_CONTROLLERS (2)
#endif
#ifndef SCI_MAX_MSIX_MESSAGES_PER_CONTROLLER
/**
* This constant defines the maximum number of MSI-X interrupt vectors/messages
* supported for an SCU hardware controller instance.
*/
#define SCI_MAX_MSIX_MESSAGES_PER_CONTROLLER (2)
#endif
/**
* This constant defines the maximum number of MSI-X interrupt vectors/messages
* supported for an SCU device.
*/
#define SCI_MAX_MSIX_MESSAGES \
(SCI_MAX_MSIX_MESSAGES_PER_CONTROLLER * SCI_MAX_CONTROLLERS)
/**
* The maximum number of supported domain objects is currently tied to the
* maximum number of support port objects.
*/
#define SCI_MAX_DOMAINS SCI_MAX_PORTS
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // _SCI_CONTROLLER_CONSTANTS_H_
diff --git a/sys/dev/iscsi/iscsi.c b/sys/dev/iscsi/iscsi.c
index f8701a89efe7..eaaf84de9594 100644
--- a/sys/dev/iscsi/iscsi.c
+++ b/sys/dev/iscsi/iscsi.c
@@ -1,2629 +1,2629 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 The FreeBSD Foundation
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <vm/uma.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <dev/iscsi/icl.h>
#include <dev/iscsi/icl_wrappers.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi.h>
#ifdef ICL_KERNEL_PROXY
#include <sys/socketvar.h>
#endif
#ifdef ICL_KERNEL_PROXY
FEATURE(iscsi_kernel_proxy, "iSCSI initiator built with ICL_KERNEL_PROXY");
#endif
/*
* XXX: This is global so the iscsi_unload() can access it.
* Think about how to do this properly.
*/
static struct iscsi_softc *sc;
SYSCTL_NODE(_kern, OID_AUTO, iscsi, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"iSCSI initiator");
static int debug = 1;
SYSCTL_INT(_kern_iscsi, OID_AUTO, debug, CTLFLAG_RWTUN,
&debug, 0, "Enable debug messages");
static int ping_timeout = 5;
SYSCTL_INT(_kern_iscsi, OID_AUTO, ping_timeout, CTLFLAG_RWTUN, &ping_timeout,
0, "Timeout for ping (NOP-Out) requests, in seconds");
static int iscsid_timeout = 60;
SYSCTL_INT(_kern_iscsi, OID_AUTO, iscsid_timeout, CTLFLAG_RWTUN, &iscsid_timeout,
0, "Time to wait for iscsid(8) to handle reconnection, in seconds");
static int login_timeout = 60;
SYSCTL_INT(_kern_iscsi, OID_AUTO, login_timeout, CTLFLAG_RWTUN, &login_timeout,
0, "Time to wait for iscsid(8) to finish Login Phase, in seconds");
static int maxtags = 255;
SYSCTL_INT(_kern_iscsi, OID_AUTO, maxtags, CTLFLAG_RWTUN, &maxtags,
0, "Max number of IO requests queued");
static int fail_on_disconnection = 0;
SYSCTL_INT(_kern_iscsi, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
&fail_on_disconnection, 0, "Destroy CAM SIM on connection failure");
static int fail_on_shutdown = 1;
SYSCTL_INT(_kern_iscsi, OID_AUTO, fail_on_shutdown, CTLFLAG_RWTUN,
&fail_on_shutdown, 0, "Fail disconnected sessions on shutdown");
static MALLOC_DEFINE(M_ISCSI, "iSCSI", "iSCSI initiator");
static uma_zone_t iscsi_outstanding_zone;
#define CONN_SESSION(X) ((struct iscsi_session *)X->ic_prv0)
#define PDU_SESSION(X) (CONN_SESSION(X->ip_conn))
#define ISCSI_DEBUG(X, ...) \
do { \
if (debug > 1) \
printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
} while (0)
#define ISCSI_WARN(X, ...) \
do { \
if (debug > 0) { \
printf("WARNING: %s: " X "\n", \
__func__, ## __VA_ARGS__); \
} \
} while (0)
#define ISCSI_SESSION_DEBUG(S, X, ...) \
do { \
if (debug > 1) { \
printf("%s: %s (%s): " X "\n", \
__func__, S->is_conf.isc_target_addr, \
S->is_conf.isc_target, ## __VA_ARGS__); \
} \
} while (0)
#define ISCSI_SESSION_WARN(S, X, ...) \
do { \
if (debug > 0) { \
printf("WARNING: %s (%s): " X "\n", \
S->is_conf.isc_target_addr, \
S->is_conf.isc_target, ## __VA_ARGS__); \
} \
} while (0)
#define ISCSI_SESSION_LOCK(X) mtx_lock(&X->is_lock)
#define ISCSI_SESSION_UNLOCK(X) mtx_unlock(&X->is_lock)
#define ISCSI_SESSION_LOCK_ASSERT(X) mtx_assert(&X->is_lock, MA_OWNED)
#define ISCSI_SESSION_LOCK_ASSERT_NOT(X) mtx_assert(&X->is_lock, MA_NOTOWNED)
static int iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg,
int mode, struct thread *td);
static struct cdevsw iscsi_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = iscsi_ioctl,
.d_name = "iscsi",
};
static void iscsi_pdu_queue_locked(struct icl_pdu *request);
static void iscsi_pdu_queue(struct icl_pdu *request);
static void iscsi_pdu_update_statsn(const struct icl_pdu *response);
static void iscsi_pdu_handle_nop_in(struct icl_pdu *response);
static void iscsi_pdu_handle_scsi_response(struct icl_pdu *response);
static void iscsi_pdu_handle_task_response(struct icl_pdu *response);
static void iscsi_pdu_handle_data_in(struct icl_pdu *response);
static void iscsi_pdu_handle_logout_response(struct icl_pdu *response);
static void iscsi_pdu_handle_r2t(struct icl_pdu *response);
static void iscsi_pdu_handle_async_message(struct icl_pdu *response);
static void iscsi_pdu_handle_reject(struct icl_pdu *response);
static void iscsi_session_reconnect(struct iscsi_session *is);
static void iscsi_session_terminate(struct iscsi_session *is);
static void iscsi_action(struct cam_sim *sim, union ccb *ccb);
static void iscsi_poll(struct cam_sim *sim);
static struct iscsi_outstanding *iscsi_outstanding_find(struct iscsi_session *is,
uint32_t initiator_task_tag);
static struct iscsi_outstanding *iscsi_outstanding_add(struct iscsi_session *is,
struct icl_pdu *request, union ccb *ccb,
uint32_t *initiator_task_tagp);
static void iscsi_outstanding_remove(struct iscsi_session *is,
struct iscsi_outstanding *io);
static bool
iscsi_pdu_prepare(struct icl_pdu *request)
{
struct iscsi_session *is;
struct iscsi_bhs_scsi_command *bhssc;
is = PDU_SESSION(request);
ISCSI_SESSION_LOCK_ASSERT(is);
/*
 * We're only using fields that are common to all request
 * (initiator -> target) PDUs.
*/
bhssc = (struct iscsi_bhs_scsi_command *)request->ip_bhs;
/*
* Data-Out PDU does not contain CmdSN.
*/
if (bhssc->bhssc_opcode != ISCSI_BHS_OPCODE_SCSI_DATA_OUT) {
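/*
 * Command window check, using serial number comparison: the target
 * accepts non-immediate commands with CmdSN in [ExpCmdSN, MaxCmdSN].
 * For example, with ExpCmdSN 10 and MaxCmdSN 12 the window holds three
 * commands; once is_cmdsn advances past 12, new non-immediate PDUs are
 * postponed until a response raises MaxCmdSN.
 */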
if (ISCSI_SNGT(is->is_cmdsn, is->is_maxcmdsn) &&
(bhssc->bhssc_opcode & ISCSI_BHS_OPCODE_IMMEDIATE) == 0) {
/*
* Current MaxCmdSN prevents us from sending any more
* SCSI Command PDUs to the target; postpone the PDU.
* It will get resent by either iscsi_pdu_queue(),
* or by maintenance thread.
*/
#if 0
ISCSI_SESSION_DEBUG(is, "postponing send, CmdSN %u, "
"ExpCmdSN %u, MaxCmdSN %u, opcode 0x%x",
is->is_cmdsn, is->is_expcmdsn, is->is_maxcmdsn,
bhssc->bhssc_opcode);
#endif
return (true);
}
bhssc->bhssc_cmdsn = htonl(is->is_cmdsn);
if ((bhssc->bhssc_opcode & ISCSI_BHS_OPCODE_IMMEDIATE) == 0)
is->is_cmdsn++;
}
bhssc->bhssc_expstatsn = htonl(is->is_statsn + 1);
return (false);
}
static void
iscsi_session_send_postponed(struct iscsi_session *is)
{
struct icl_pdu *request;
bool postpone;
ISCSI_SESSION_LOCK_ASSERT(is);
if (STAILQ_EMPTY(&is->is_postponed))
return;
while ((request = STAILQ_FIRST(&is->is_postponed)) != NULL) {
postpone = iscsi_pdu_prepare(request);
if (postpone)
return;
STAILQ_REMOVE_HEAD(&is->is_postponed, ip_next);
icl_pdu_queue(request);
}
xpt_release_simq(is->is_sim, 1);
}
static void
iscsi_pdu_queue_locked(struct icl_pdu *request)
{
struct iscsi_session *is;
bool postpone;
is = PDU_SESSION(request);
ISCSI_SESSION_LOCK_ASSERT(is);
iscsi_session_send_postponed(is);
postpone = iscsi_pdu_prepare(request);
if (postpone) {
if (STAILQ_EMPTY(&is->is_postponed))
xpt_freeze_simq(is->is_sim, 1);
STAILQ_INSERT_TAIL(&is->is_postponed, request, ip_next);
return;
}
icl_pdu_queue(request);
}
static void
iscsi_pdu_queue(struct icl_pdu *request)
{
struct iscsi_session *is;
is = PDU_SESSION(request);
ISCSI_SESSION_LOCK(is);
iscsi_pdu_queue_locked(request);
ISCSI_SESSION_UNLOCK(is);
}
static void
iscsi_session_logout(struct iscsi_session *is)
{
struct icl_pdu *request;
struct iscsi_bhs_logout_request *bhslr;
request = icl_pdu_new(is->is_conn, M_NOWAIT);
if (request == NULL)
return;
bhslr = (struct iscsi_bhs_logout_request *)request->ip_bhs;
bhslr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGOUT_REQUEST;
bhslr->bhslr_reason = BHSLR_REASON_CLOSE_SESSION;
iscsi_pdu_queue_locked(request);
}
static void
iscsi_session_terminate_task(struct iscsi_session *is,
struct iscsi_outstanding *io, cam_status status)
{
ISCSI_SESSION_LOCK_ASSERT(is);
if (io->io_ccb != NULL) {
io->io_ccb->ccb_h.status &= ~(CAM_SIM_QUEUED | CAM_STATUS_MASK);
io->io_ccb->ccb_h.status |= status;
if ((io->io_ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
io->io_ccb->ccb_h.status |= CAM_DEV_QFRZN;
xpt_freeze_devq(io->io_ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
xpt_done(io->io_ccb);
}
iscsi_outstanding_remove(is, io);
}
static void
iscsi_session_terminate_tasks(struct iscsi_session *is, cam_status status)
{
struct iscsi_outstanding *io, *tmp;
ISCSI_SESSION_LOCK_ASSERT(is);
TAILQ_FOREACH_SAFE(io, &is->is_outstanding, io_next, tmp) {
iscsi_session_terminate_task(is, io, status);
}
}
static void
iscsi_session_cleanup(struct iscsi_session *is, bool destroy_sim)
{
struct icl_pdu *pdu;
ISCSI_SESSION_LOCK_ASSERT(is);
/*
* Don't queue any new PDUs.
*/
if (is->is_sim != NULL && is->is_simq_frozen == false) {
ISCSI_SESSION_DEBUG(is, "freezing");
xpt_freeze_simq(is->is_sim, 1);
is->is_simq_frozen = true;
}
/*
* Remove postponed PDUs.
*/
if (!STAILQ_EMPTY(&is->is_postponed))
xpt_release_simq(is->is_sim, 1);
while ((pdu = STAILQ_FIRST(&is->is_postponed)) != NULL) {
STAILQ_REMOVE_HEAD(&is->is_postponed, ip_next);
icl_pdu_free(pdu);
}
if (destroy_sim == false) {
/*
* Terminate SCSI tasks, asking CAM to requeue them.
*/
iscsi_session_terminate_tasks(is, CAM_REQUEUE_REQ);
return;
}
iscsi_session_terminate_tasks(is, CAM_DEV_NOT_THERE);
if (is->is_sim == NULL)
return;
ISCSI_SESSION_DEBUG(is, "deregistering SIM");
xpt_async(AC_LOST_DEVICE, is->is_path, NULL);
if (is->is_simq_frozen) {
is->is_simq_frozen = false;
xpt_release_simq(is->is_sim, 1);
}
xpt_free_path(is->is_path);
is->is_path = NULL;
xpt_bus_deregister(cam_sim_path(is->is_sim));
cam_sim_free(is->is_sim, TRUE /*free_devq*/);
is->is_sim = NULL;
is->is_devq = NULL;
}
static void
iscsi_maintenance_thread_reconnect(struct iscsi_session *is)
{
icl_conn_close(is->is_conn);
ISCSI_SESSION_LOCK(is);
is->is_connected = false;
is->is_reconnecting = false;
is->is_login_phase = false;
#ifdef ICL_KERNEL_PROXY
if (is->is_login_pdu != NULL) {
icl_pdu_free(is->is_login_pdu);
is->is_login_pdu = NULL;
}
cv_signal(&is->is_login_cv);
#endif
if (fail_on_disconnection) {
ISCSI_SESSION_DEBUG(is, "connection failed, destroying devices");
iscsi_session_cleanup(is, true);
} else {
iscsi_session_cleanup(is, false);
}
KASSERT(TAILQ_EMPTY(&is->is_outstanding),
("destroying session with active tasks"));
KASSERT(STAILQ_EMPTY(&is->is_postponed),
("destroying session with postponed PDUs"));
if (is->is_conf.isc_enable == 0 && is->is_conf.isc_discovery == 0) {
ISCSI_SESSION_UNLOCK(is);
return;
}
/*
* Request immediate reconnection from iscsid(8).
*/
//ISCSI_SESSION_DEBUG(is, "waking up iscsid(8)");
is->is_waiting_for_iscsid = true;
strlcpy(is->is_reason, "Waiting for iscsid(8)", sizeof(is->is_reason));
is->is_timeout = 0;
ISCSI_SESSION_UNLOCK(is);
cv_signal(&is->is_softc->sc_cv);
}
static void
iscsi_maintenance_thread_terminate(struct iscsi_session *is)
{
struct iscsi_softc *sc;
sc = is->is_softc;
sx_xlock(&sc->sc_lock);
TAILQ_REMOVE(&sc->sc_sessions, is, is_next);
sx_xunlock(&sc->sc_lock);
icl_conn_close(is->is_conn);
callout_drain(&is->is_callout);
ISCSI_SESSION_LOCK(is);
KASSERT(is->is_terminating, ("is_terminating == false"));
#ifdef ICL_KERNEL_PROXY
if (is->is_login_pdu != NULL) {
icl_pdu_free(is->is_login_pdu);
is->is_login_pdu = NULL;
}
cv_signal(&is->is_login_cv);
#endif
iscsi_session_cleanup(is, true);
KASSERT(TAILQ_EMPTY(&is->is_outstanding),
("destroying session with active tasks"));
KASSERT(STAILQ_EMPTY(&is->is_postponed),
("destroying session with postponed PDUs"));
ISCSI_SESSION_UNLOCK(is);
icl_conn_free(is->is_conn);
mtx_destroy(&is->is_lock);
cv_destroy(&is->is_maintenance_cv);
#ifdef ICL_KERNEL_PROXY
cv_destroy(&is->is_login_cv);
#endif
ISCSI_SESSION_DEBUG(is, "terminated");
free(is, M_ISCSI);
/*
* The iscsi_unload() routine might be waiting.
*/
cv_signal(&sc->sc_cv);
}
static void
iscsi_maintenance_thread(void *arg)
{
struct iscsi_session *is = arg;
ISCSI_SESSION_LOCK(is);
for (;;) {
if (is->is_reconnecting == false &&
is->is_terminating == false &&
(STAILQ_EMPTY(&is->is_postponed) ||
ISCSI_SNGT(is->is_cmdsn, is->is_maxcmdsn)))
cv_wait(&is->is_maintenance_cv, &is->is_lock);
/* Terminate supersedes reconnect. */
if (is->is_terminating) {
ISCSI_SESSION_UNLOCK(is);
iscsi_maintenance_thread_terminate(is);
kthread_exit();
return;
}
if (is->is_reconnecting) {
ISCSI_SESSION_UNLOCK(is);
iscsi_maintenance_thread_reconnect(is);
ISCSI_SESSION_LOCK(is);
continue;
}
iscsi_session_send_postponed(is);
}
ISCSI_SESSION_UNLOCK(is);
}
static void
iscsi_session_reconnect(struct iscsi_session *is)
{
/*
* XXX: We can't use locking here, because
* it's being called from various contexts.
* Hope it doesn't break anything.
*/
if (is->is_reconnecting)
return;
is->is_reconnecting = true;
cv_signal(&is->is_maintenance_cv);
}
static void
iscsi_session_terminate(struct iscsi_session *is)
{
if (is->is_terminating)
return;
is->is_terminating = true;
#if 0
iscsi_session_logout(is);
#endif
cv_signal(&is->is_maintenance_cv);
}
static void
iscsi_callout(void *context)
{
struct icl_pdu *request;
struct iscsi_bhs_nop_out *bhsno;
struct iscsi_session *is;
bool reconnect_needed = false;
is = context;
ISCSI_SESSION_LOCK(is);
if (is->is_terminating) {
ISCSI_SESSION_UNLOCK(is);
return;
}
callout_schedule(&is->is_callout, 1 * hz);
if (is->is_conf.isc_enable == 0)
goto out;
is->is_timeout++;
if (is->is_waiting_for_iscsid) {
if (iscsid_timeout > 0 && is->is_timeout > iscsid_timeout) {
ISCSI_SESSION_WARN(is, "timed out waiting for iscsid(8) "
"for %d seconds; reconnecting",
is->is_timeout);
reconnect_needed = true;
}
goto out;
}
if (is->is_login_phase) {
if (login_timeout > 0 && is->is_timeout > login_timeout) {
ISCSI_SESSION_WARN(is, "login timed out after %d seconds; "
"reconnecting", is->is_timeout);
reconnect_needed = true;
}
goto out;
}
if (ping_timeout <= 0) {
/*
* Pings are disabled. Don't send NOP-Out in this case.
* Reset the timeout, to avoid triggering reconnection,
* should the user decide to reenable them.
*/
is->is_timeout = 0;
goto out;
}
if (is->is_timeout >= ping_timeout) {
ISCSI_SESSION_WARN(is, "no ping reply (NOP-In) after %d seconds; "
"reconnecting", ping_timeout);
reconnect_needed = true;
goto out;
}
ISCSI_SESSION_UNLOCK(is);
/*
* If the ping was reset less than one second ago - which means
* that we've received some PDU during the last second - assume
* the traffic flows correctly and don't bother sending a NOP-Out.
*
 * (The threshold below is 2: one for the one-second grace period, and
 * one for the is_timeout increment earlier in this routine.)
*/
if (is->is_timeout < 2)
return;
request = icl_pdu_new(is->is_conn, M_NOWAIT);
if (request == NULL) {
ISCSI_SESSION_WARN(is, "failed to allocate PDU");
return;
}
bhsno = (struct iscsi_bhs_nop_out *)request->ip_bhs;
bhsno->bhsno_opcode = ISCSI_BHS_OPCODE_NOP_OUT |
ISCSI_BHS_OPCODE_IMMEDIATE;
bhsno->bhsno_flags = 0x80;
bhsno->bhsno_target_transfer_tag = 0xffffffff;
iscsi_pdu_queue(request);
return;
out:
if (is->is_terminating) {
ISCSI_SESSION_UNLOCK(is);
return;
}
ISCSI_SESSION_UNLOCK(is);
if (reconnect_needed)
iscsi_session_reconnect(is);
}
static void
iscsi_pdu_update_statsn(const struct icl_pdu *response)
{
const struct iscsi_bhs_data_in *bhsdi;
struct iscsi_session *is;
uint32_t expcmdsn, maxcmdsn, statsn;
is = PDU_SESSION(response);
ISCSI_SESSION_LOCK_ASSERT(is);
/*
 * We're only using fields that are common to all response
 * (target -> initiator) PDUs.
*/
bhsdi = (const struct iscsi_bhs_data_in *)response->ip_bhs;
/*
* Ok, I lied. In case of Data-In, "The fields StatSN, Status,
* and Residual Count only have meaningful content if the S bit
* is set to 1", so we also need to check the bit specific for
* Data-In PDU.
*/
if (bhsdi->bhsdi_opcode != ISCSI_BHS_OPCODE_SCSI_DATA_IN ||
(bhsdi->bhsdi_flags & BHSDI_FLAGS_S) != 0) {
statsn = ntohl(bhsdi->bhsdi_statsn);
if (statsn != is->is_statsn && statsn != (is->is_statsn + 1)) {
/* XXX: This is normal situation for MCS */
ISCSI_SESSION_WARN(is, "PDU 0x%x StatSN %u != "
"session ExpStatSN %u (or + 1); reconnecting",
bhsdi->bhsdi_opcode, statsn, is->is_statsn);
iscsi_session_reconnect(is);
}
if (ISCSI_SNGT(statsn, is->is_statsn))
is->is_statsn = statsn;
}
expcmdsn = ntohl(bhsdi->bhsdi_expcmdsn);
maxcmdsn = ntohl(bhsdi->bhsdi_maxcmdsn);
if (ISCSI_SNLT(maxcmdsn + 1, expcmdsn)) {
ISCSI_SESSION_DEBUG(is,
"PDU MaxCmdSN %u + 1 < PDU ExpCmdSN %u; ignoring",
maxcmdsn, expcmdsn);
} else {
if (ISCSI_SNGT(maxcmdsn, is->is_maxcmdsn)) {
is->is_maxcmdsn = maxcmdsn;
/*
 * Command window increased; kick the maintenance thread
* to send out postponed commands.
*/
if (!STAILQ_EMPTY(&is->is_postponed))
cv_signal(&is->is_maintenance_cv);
} else if (ISCSI_SNLT(maxcmdsn, is->is_maxcmdsn)) {
/* XXX: This is normal situation for MCS */
ISCSI_SESSION_DEBUG(is,
"PDU MaxCmdSN %u < session MaxCmdSN %u; ignoring",
maxcmdsn, is->is_maxcmdsn);
}
if (ISCSI_SNGT(expcmdsn, is->is_expcmdsn)) {
is->is_expcmdsn = expcmdsn;
} else if (ISCSI_SNLT(expcmdsn, is->is_expcmdsn)) {
/* XXX: This is normal situation for MCS */
ISCSI_SESSION_DEBUG(is,
"PDU ExpCmdSN %u < session ExpCmdSN %u; ignoring",
expcmdsn, is->is_expcmdsn);
}
}
/*
* Every incoming PDU - not just NOP-In - resets the ping timer.
* The purpose of the timeout is to reset the connection when it stalls;
* we don't want this to happen when NOP-In or NOP-Out ends up delayed
* in some queue.
*/
is->is_timeout = 0;
}
static void
iscsi_receive_callback(struct icl_pdu *response)
{
struct iscsi_session *is;
is = PDU_SESSION(response);
ISCSI_SESSION_LOCK(is);
iscsi_pdu_update_statsn(response);
#ifdef ICL_KERNEL_PROXY
if (is->is_login_phase) {
if (is->is_login_pdu == NULL)
is->is_login_pdu = response;
else
icl_pdu_free(response);
ISCSI_SESSION_UNLOCK(is);
cv_signal(&is->is_login_cv);
return;
}
#endif
/*
* The handling routine is responsible for freeing the PDU
* when it's no longer needed.
*/
switch (response->ip_bhs->bhs_opcode) {
case ISCSI_BHS_OPCODE_NOP_IN:
iscsi_pdu_handle_nop_in(response);
ISCSI_SESSION_UNLOCK(is);
break;
case ISCSI_BHS_OPCODE_SCSI_RESPONSE:
iscsi_pdu_handle_scsi_response(response);
/* Session lock dropped inside. */
ISCSI_SESSION_LOCK_ASSERT_NOT(is);
break;
case ISCSI_BHS_OPCODE_TASK_RESPONSE:
iscsi_pdu_handle_task_response(response);
ISCSI_SESSION_UNLOCK(is);
break;
case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
iscsi_pdu_handle_data_in(response);
/* Session lock dropped inside. */
ISCSI_SESSION_LOCK_ASSERT_NOT(is);
break;
case ISCSI_BHS_OPCODE_LOGOUT_RESPONSE:
iscsi_pdu_handle_logout_response(response);
ISCSI_SESSION_UNLOCK(is);
break;
case ISCSI_BHS_OPCODE_R2T:
iscsi_pdu_handle_r2t(response);
ISCSI_SESSION_UNLOCK(is);
break;
case ISCSI_BHS_OPCODE_ASYNC_MESSAGE:
iscsi_pdu_handle_async_message(response);
ISCSI_SESSION_UNLOCK(is);
break;
case ISCSI_BHS_OPCODE_REJECT:
iscsi_pdu_handle_reject(response);
ISCSI_SESSION_UNLOCK(is);
break;
default:
ISCSI_SESSION_WARN(is, "received PDU with unsupported "
"opcode 0x%x; reconnecting",
response->ip_bhs->bhs_opcode);
iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
icl_pdu_free(response);
}
}
static void
iscsi_error_callback(struct icl_conn *ic)
{
struct iscsi_session *is;
is = CONN_SESSION(ic);
ISCSI_SESSION_WARN(is, "connection error; reconnecting");
iscsi_session_reconnect(is);
}
static void
iscsi_pdu_handle_nop_in(struct icl_pdu *response)
{
struct iscsi_session *is;
struct iscsi_bhs_nop_out *bhsno;
struct iscsi_bhs_nop_in *bhsni;
struct icl_pdu *request;
void *data = NULL;
size_t datasize;
int error;
is = PDU_SESSION(response);
bhsni = (struct iscsi_bhs_nop_in *)response->ip_bhs;
if (bhsni->bhsni_target_transfer_tag == 0xffffffff) {
/*
* Nothing to do; iscsi_pdu_update_statsn() already
* zeroed the timeout.
*/
icl_pdu_free(response);
return;
}
datasize = icl_pdu_data_segment_length(response);
if (datasize > 0) {
data = malloc(datasize, M_ISCSI, M_NOWAIT | M_ZERO);
if (data == NULL) {
ISCSI_SESSION_WARN(is, "failed to allocate memory; "
"reconnecting");
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
icl_pdu_get_data(response, 0, data, datasize);
}
request = icl_pdu_new(response->ip_conn, M_NOWAIT);
if (request == NULL) {
ISCSI_SESSION_WARN(is, "failed to allocate memory; "
"reconnecting");
free(data, M_ISCSI);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
bhsno = (struct iscsi_bhs_nop_out *)request->ip_bhs;
bhsno->bhsno_opcode = ISCSI_BHS_OPCODE_NOP_OUT |
ISCSI_BHS_OPCODE_IMMEDIATE;
bhsno->bhsno_flags = 0x80;
bhsno->bhsno_initiator_task_tag = 0xffffffff;
bhsno->bhsno_target_transfer_tag = bhsni->bhsni_target_transfer_tag;
if (datasize > 0) {
error = icl_pdu_append_data(request, data, datasize, M_NOWAIT);
if (error != 0) {
ISCSI_SESSION_WARN(is, "failed to allocate memory; "
"reconnecting");
free(data, M_ISCSI);
icl_pdu_free(request);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
free(data, M_ISCSI);
}
icl_pdu_free(response);
iscsi_pdu_queue_locked(request);
}
static void
iscsi_pdu_handle_scsi_response(struct icl_pdu *response)
{
struct iscsi_bhs_scsi_response *bhssr;
struct iscsi_outstanding *io;
struct iscsi_session *is;
union ccb *ccb;
struct ccb_scsiio *csio;
size_t data_segment_len, received;
uint16_t sense_len;
uint32_t resid;
is = PDU_SESSION(response);
bhssr = (struct iscsi_bhs_scsi_response *)response->ip_bhs;
io = iscsi_outstanding_find(is, bhssr->bhssr_initiator_task_tag);
if (io == NULL || io->io_ccb == NULL) {
ISCSI_SESSION_WARN(is, "bad itt 0x%x", bhssr->bhssr_initiator_task_tag);
icl_pdu_free(response);
iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
return;
}
ccb = io->io_ccb;
/*
* With iSER, after getting good response we can be sure
* that all the data has been successfully transferred.
*/
if (is->is_conn->ic_iser) {
resid = ntohl(bhssr->bhssr_residual_count);
if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_UNDERFLOW) {
io->io_received = ccb->csio.dxfer_len - resid;
} else if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_OVERFLOW) {
ISCSI_SESSION_WARN(is, "overflow: target indicates %d", resid);
} else {
io->io_received = ccb->csio.dxfer_len;
}
}
received = io->io_received;
iscsi_outstanding_remove(is, io);
ISCSI_SESSION_UNLOCK(is);
if (bhssr->bhssr_response != BHSSR_RESPONSE_COMMAND_COMPLETED) {
ISCSI_SESSION_WARN(is, "service response 0x%x", bhssr->bhssr_response);
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
} else if (bhssr->bhssr_status == 0) {
ccb->ccb_h.status = CAM_REQ_CMP;
} else {
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR | CAM_DEV_QFRZN;
ccb->csio.scsi_status = bhssr->bhssr_status;
}
csio = &ccb->csio;
data_segment_len = icl_pdu_data_segment_length(response);
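/*
 * If autosense data is present, the data segment begins with a two-byte
 * SenseLength field followed by the sense data itself; that is the
 * layout parsed below.
 */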
if (data_segment_len > 0) {
if (data_segment_len < sizeof(sense_len)) {
ISCSI_SESSION_WARN(is, "truncated data segment (%zd bytes)",
data_segment_len);
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
goto out;
}
icl_pdu_get_data(response, 0, &sense_len, sizeof(sense_len));
sense_len = ntohs(sense_len);
#if 0
ISCSI_SESSION_DEBUG(is, "sense_len %d, data len %zd",
sense_len, data_segment_len);
#endif
if (sizeof(sense_len) + sense_len > data_segment_len) {
ISCSI_SESSION_WARN(is, "truncated data segment "
"(%zd bytes, should be %zd)",
data_segment_len, sizeof(sense_len) + sense_len);
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
goto out;
} else if (sizeof(sense_len) + sense_len < data_segment_len)
ISCSI_SESSION_WARN(is, "oversize data segment "
"(%zd bytes, should be %zd)",
data_segment_len, sizeof(sense_len) + sense_len);
if (sense_len > csio->sense_len) {
ISCSI_SESSION_DEBUG(is, "truncating sense from %d to %d",
sense_len, csio->sense_len);
sense_len = csio->sense_len;
}
icl_pdu_get_data(response, sizeof(sense_len), &csio->sense_data, sense_len);
csio->sense_resid = csio->sense_len - sense_len;
ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
}
out:
if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_UNDERFLOW)
csio->resid = ntohl(bhssr->bhssr_residual_count);
if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
KASSERT(received <= csio->dxfer_len,
("received > csio->dxfer_len"));
if (received < csio->dxfer_len) {
if (csio->resid != csio->dxfer_len - received) {
ISCSI_SESSION_WARN(is, "underflow mismatch: "
"target indicates %d, we calculated %zd",
csio->resid, csio->dxfer_len - received);
}
csio->resid = csio->dxfer_len - received;
}
}
xpt_done(ccb);
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_task_response(struct icl_pdu *response)
{
struct iscsi_bhs_task_management_response *bhstmr;
struct iscsi_outstanding *io, *aio;
struct iscsi_session *is;
is = PDU_SESSION(response);
bhstmr = (struct iscsi_bhs_task_management_response *)response->ip_bhs;
io = iscsi_outstanding_find(is, bhstmr->bhstmr_initiator_task_tag);
if (io == NULL || io->io_ccb != NULL) {
ISCSI_SESSION_WARN(is, "bad itt 0x%x",
bhstmr->bhstmr_initiator_task_tag);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
if (bhstmr->bhstmr_response != BHSTMR_RESPONSE_FUNCTION_COMPLETE) {
ISCSI_SESSION_WARN(is, "task response 0x%x",
bhstmr->bhstmr_response);
} else {
aio = iscsi_outstanding_find(is, io->io_datasn);
if (aio != NULL && aio->io_ccb != NULL)
iscsi_session_terminate_task(is, aio, CAM_REQ_ABORTED);
}
iscsi_outstanding_remove(is, io);
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_data_in(struct icl_pdu *response)
{
struct iscsi_bhs_data_in *bhsdi;
struct iscsi_outstanding *io;
struct iscsi_session *is;
union ccb *ccb;
struct ccb_scsiio *csio;
size_t data_segment_len, received, oreceived;
is = PDU_SESSION(response);
bhsdi = (struct iscsi_bhs_data_in *)response->ip_bhs;
io = iscsi_outstanding_find(is, bhsdi->bhsdi_initiator_task_tag);
if (io == NULL || io->io_ccb == NULL) {
ISCSI_SESSION_WARN(is, "bad itt 0x%x", bhsdi->bhsdi_initiator_task_tag);
icl_pdu_free(response);
iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
return;
}
data_segment_len = icl_pdu_data_segment_length(response);
if (data_segment_len == 0) {
/*
* "The sending of 0 length data segments should be avoided,
* but initiators and targets MUST be able to properly receive
* 0 length data segments."
*/
ISCSI_SESSION_UNLOCK(is);
icl_pdu_free(response);
return;
}
/*
 * We need to track this for security reasons - without it, a malicious target
 * could respond to a SCSI READ without sending Data-In PDUs, which would
 * result in the read operation on the initiator side returning random
 * kernel data.
*/
if (ntohl(bhsdi->bhsdi_buffer_offset) != io->io_received) {
ISCSI_SESSION_WARN(is, "data out of order; expected offset %zd, got %zd",
io->io_received, (size_t)ntohl(bhsdi->bhsdi_buffer_offset));
icl_pdu_free(response);
iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
return;
}
ccb = io->io_ccb;
csio = &ccb->csio;
if (io->io_received + data_segment_len > csio->dxfer_len) {
ISCSI_SESSION_WARN(is, "oversize data segment (%zd bytes "
"at offset %zd, buffer is %d)",
data_segment_len, io->io_received, csio->dxfer_len);
icl_pdu_free(response);
iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
return;
}
oreceived = io->io_received;
io->io_received += data_segment_len;
received = io->io_received;
if ((bhsdi->bhsdi_flags & BHSDI_FLAGS_S) != 0)
iscsi_outstanding_remove(is, io);
ISCSI_SESSION_UNLOCK(is);
icl_pdu_get_data(response, 0, csio->data_ptr + oreceived, data_segment_len);
/*
* XXX: Check DataSN.
* XXX: Check F.
*/
if ((bhsdi->bhsdi_flags & BHSDI_FLAGS_S) == 0) {
/*
* Nothing more to do.
*/
icl_pdu_free(response);
return;
}
//ISCSI_SESSION_DEBUG(is, "got S flag; status 0x%x", bhsdi->bhsdi_status);
if (bhsdi->bhsdi_status == 0) {
ccb->ccb_h.status = CAM_REQ_CMP;
} else {
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR | CAM_DEV_QFRZN;
csio->scsi_status = bhsdi->bhsdi_status;
}
if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
KASSERT(received <= csio->dxfer_len,
("received > csio->dxfer_len"));
if (received < csio->dxfer_len) {
csio->resid = ntohl(bhsdi->bhsdi_residual_count);
if (csio->resid != csio->dxfer_len - received) {
ISCSI_SESSION_WARN(is, "underflow mismatch: "
"target indicates %d, we calculated %zd",
csio->resid, csio->dxfer_len - received);
}
csio->resid = csio->dxfer_len - received;
}
}
xpt_done(ccb);
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_logout_response(struct icl_pdu *response)
{
ISCSI_SESSION_DEBUG(PDU_SESSION(response), "logout response");
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_r2t(struct icl_pdu *response)
{
struct icl_pdu *request;
struct iscsi_session *is;
struct iscsi_bhs_r2t *bhsr2t;
struct iscsi_bhs_data_out *bhsdo;
struct iscsi_outstanding *io;
struct ccb_scsiio *csio;
size_t off, len, total_len;
int error;
is = PDU_SESSION(response);
bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs;
io = iscsi_outstanding_find(is, bhsr2t->bhsr2t_initiator_task_tag);
if (io == NULL || io->io_ccb == NULL) {
ISCSI_SESSION_WARN(is, "bad itt 0x%x; reconnecting",
bhsr2t->bhsr2t_initiator_task_tag);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
csio = &io->io_ccb->csio;
if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_OUT) {
ISCSI_SESSION_WARN(is, "received R2T for read command; reconnecting");
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
/*
* XXX: Verify R2TSN.
*/
io->io_datasn = 0;
off = ntohl(bhsr2t->bhsr2t_buffer_offset);
if (off > csio->dxfer_len) {
ISCSI_SESSION_WARN(is, "target requested invalid offset "
"%zd, buffer is is %d; reconnecting", off, csio->dxfer_len);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
total_len = ntohl(bhsr2t->bhsr2t_desired_data_transfer_length);
if (total_len == 0 || total_len > csio->dxfer_len) {
ISCSI_SESSION_WARN(is, "target requested invalid length "
"%zd, buffer is %d; reconnecting", total_len, csio->dxfer_len);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
//ISCSI_SESSION_DEBUG(is, "r2t; off %zd, len %zd", off, total_len);
for (;;) {
len = total_len;
if (len > is->is_max_send_data_segment_length)
len = is->is_max_send_data_segment_length;
if (off + len > csio->dxfer_len) {
ISCSI_SESSION_WARN(is, "target requested invalid "
"length/offset %zd, buffer is %d; reconnecting",
off + len, csio->dxfer_len);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
request = icl_pdu_new(response->ip_conn, M_NOWAIT);
if (request == NULL) {
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
bhsdo = (struct iscsi_bhs_data_out *)request->ip_bhs;
bhsdo->bhsdo_opcode = ISCSI_BHS_OPCODE_SCSI_DATA_OUT;
bhsdo->bhsdo_lun = bhsr2t->bhsr2t_lun;
bhsdo->bhsdo_initiator_task_tag =
bhsr2t->bhsr2t_initiator_task_tag;
bhsdo->bhsdo_target_transfer_tag =
bhsr2t->bhsr2t_target_transfer_tag;
bhsdo->bhsdo_datasn = htonl(io->io_datasn++);
bhsdo->bhsdo_buffer_offset = htonl(off);
error = icl_pdu_append_data(request, csio->data_ptr + off, len,
M_NOWAIT);
if (error != 0) {
ISCSI_SESSION_WARN(is, "failed to allocate memory; "
"reconnecting");
icl_pdu_free(request);
icl_pdu_free(response);
iscsi_session_reconnect(is);
return;
}
off += len;
total_len -= len;
if (total_len == 0) {
bhsdo->bhsdo_flags |= BHSDO_FLAGS_F;
//ISCSI_SESSION_DEBUG(is, "setting F, off %zd", off);
} else {
//ISCSI_SESSION_DEBUG(is, "not finished, off %zd", off);
}
iscsi_pdu_queue_locked(request);
if (total_len == 0)
break;
}
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_async_message(struct icl_pdu *response)
{
struct iscsi_bhs_asynchronous_message *bhsam;
struct iscsi_session *is;
is = PDU_SESSION(response);
bhsam = (struct iscsi_bhs_asynchronous_message *)response->ip_bhs;
switch (bhsam->bhsam_async_event) {
case BHSAM_EVENT_TARGET_REQUESTS_LOGOUT:
ISCSI_SESSION_WARN(is, "target requests logout; removing session");
iscsi_session_logout(is);
iscsi_session_terminate(is);
break;
case BHSAM_EVENT_TARGET_TERMINATES_CONNECTION:
ISCSI_SESSION_WARN(is, "target indicates it will drop the connection");
break;
case BHSAM_EVENT_TARGET_TERMINATES_SESSION:
ISCSI_SESSION_WARN(is, "target indicates it will drop the session");
break;
default:
/*
* XXX: Technically, we're obligated to also handle
* parameter renegotiation.
*/
ISCSI_SESSION_WARN(is, "ignoring AsyncEvent %d", bhsam->bhsam_async_event);
break;
}
icl_pdu_free(response);
}
static void
iscsi_pdu_handle_reject(struct icl_pdu *response)
{
struct iscsi_bhs_reject *bhsr;
struct iscsi_session *is;
is = PDU_SESSION(response);
bhsr = (struct iscsi_bhs_reject *)response->ip_bhs;
ISCSI_SESSION_WARN(is, "received Reject PDU, reason 0x%x; protocol error?",
bhsr->bhsr_reason);
icl_pdu_free(response);
}
static int
iscsi_ioctl_daemon_wait(struct iscsi_softc *sc,
struct iscsi_daemon_request *request)
{
struct iscsi_session *is;
struct icl_drv_limits idl;
int error;
sx_slock(&sc->sc_lock);
for (;;) {
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
ISCSI_SESSION_LOCK(is);
if (is->is_conf.isc_enable == 0 &&
is->is_conf.isc_discovery == 0) {
ISCSI_SESSION_UNLOCK(is);
continue;
}
if (is->is_waiting_for_iscsid)
break;
ISCSI_SESSION_UNLOCK(is);
}
if (is == NULL) {
/*
* No session requires attention from iscsid(8); wait.
*/
error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
if (error != 0) {
sx_sunlock(&sc->sc_lock);
return (error);
}
continue;
}
is->is_waiting_for_iscsid = false;
is->is_login_phase = true;
is->is_reason[0] = '\0';
ISCSI_SESSION_UNLOCK(is);
request->idr_session_id = is->is_id;
memcpy(&request->idr_isid, &is->is_isid,
sizeof(request->idr_isid));
request->idr_tsih = 0; /* New or reinstated session. */
memcpy(&request->idr_conf, &is->is_conf,
sizeof(request->idr_conf));
error = icl_limits(is->is_conf.isc_offload,
is->is_conf.isc_iser, &idl);
if (error != 0) {
ISCSI_SESSION_WARN(is, "icl_limits for offload \"%s\" "
"failed with error %d", is->is_conf.isc_offload,
error);
sx_sunlock(&sc->sc_lock);
return (error);
}
request->idr_limits.isl_max_recv_data_segment_length =
idl.idl_max_recv_data_segment_length;
request->idr_limits.isl_max_send_data_segment_length =
idl.idl_max_send_data_segment_length;
request->idr_limits.isl_max_burst_length =
idl.idl_max_burst_length;
request->idr_limits.isl_first_burst_length =
idl.idl_first_burst_length;
sx_sunlock(&sc->sc_lock);
return (0);
}
}
static int
iscsi_ioctl_daemon_handoff(struct iscsi_softc *sc,
struct iscsi_daemon_handoff *handoff)
{
struct iscsi_session *is;
struct icl_conn *ic;
int error;
sx_slock(&sc->sc_lock);
/*
* Find the session to hand off socket to.
*/
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (is->is_id == handoff->idh_session_id)
break;
}
if (is == NULL) {
sx_sunlock(&sc->sc_lock);
return (ESRCH);
}
ISCSI_SESSION_LOCK(is);
ic = is->is_conn;
if (is->is_conf.isc_discovery || is->is_terminating) {
ISCSI_SESSION_UNLOCK(is);
sx_sunlock(&sc->sc_lock);
return (EINVAL);
}
if (is->is_connected) {
/*
* This might have happened because another iscsid(8)
* instance handed off the connection in the meantime.
* Just return.
*/
ISCSI_SESSION_WARN(is, "handoff on already connected "
"session");
ISCSI_SESSION_UNLOCK(is);
sx_sunlock(&sc->sc_lock);
return (EBUSY);
}
strlcpy(is->is_target_alias, handoff->idh_target_alias,
sizeof(is->is_target_alias));
is->is_tsih = handoff->idh_tsih;
is->is_statsn = handoff->idh_statsn;
is->is_protocol_level = handoff->idh_protocol_level;
is->is_initial_r2t = handoff->idh_initial_r2t;
is->is_immediate_data = handoff->idh_immediate_data;
is->is_max_recv_data_segment_length =
handoff->idh_max_recv_data_segment_length;
is->is_max_send_data_segment_length =
handoff->idh_max_send_data_segment_length;
is->is_max_burst_length = handoff->idh_max_burst_length;
is->is_first_burst_length = handoff->idh_first_burst_length;
if (handoff->idh_header_digest == ISCSI_DIGEST_CRC32C)
ic->ic_header_crc32c = true;
else
ic->ic_header_crc32c = false;
if (handoff->idh_data_digest == ISCSI_DIGEST_CRC32C)
ic->ic_data_crc32c = true;
else
ic->ic_data_crc32c = false;
ic->ic_maxtags = maxtags;
is->is_cmdsn = 0;
is->is_expcmdsn = 0;
is->is_maxcmdsn = 0;
is->is_waiting_for_iscsid = false;
is->is_login_phase = false;
is->is_timeout = 0;
is->is_connected = true;
is->is_reason[0] = '\0';
ISCSI_SESSION_UNLOCK(is);
/*
* If we're going through the proxy, the idh_socket will be 0,
* and the ICL module can simply ignore this call. It can also
* use it to determine it's no longer in the Login phase.
*/
error = icl_conn_handoff(ic, handoff->idh_socket);
if (error != 0) {
sx_sunlock(&sc->sc_lock);
iscsi_session_terminate(is);
return (error);
}
sx_sunlock(&sc->sc_lock);
if (is->is_sim != NULL) {
/*
* When reconnecting, there already is SIM allocated for the session.
*/
KASSERT(is->is_simq_frozen, ("reconnect without frozen simq"));
ISCSI_SESSION_LOCK(is);
ISCSI_SESSION_DEBUG(is, "releasing");
is->is_simq_frozen = false;
xpt_release_simq(is->is_sim, 1);
ISCSI_SESSION_UNLOCK(is);
} else {
ISCSI_SESSION_LOCK(is);
is->is_devq = cam_simq_alloc(ic->ic_maxtags);
if (is->is_devq == NULL) {
ISCSI_SESSION_WARN(is, "failed to allocate simq");
iscsi_session_terminate(is);
return (ENOMEM);
}
is->is_sim = cam_sim_alloc(iscsi_action, iscsi_poll, "iscsi",
is, is->is_id /* unit */, &is->is_lock,
1, ic->ic_maxtags, is->is_devq);
if (is->is_sim == NULL) {
ISCSI_SESSION_UNLOCK(is);
ISCSI_SESSION_WARN(is, "failed to allocate SIM");
cam_simq_free(is->is_devq);
iscsi_session_terminate(is);
return (ENOMEM);
}
error = xpt_bus_register(is->is_sim, NULL, 0);
if (error != 0) {
ISCSI_SESSION_UNLOCK(is);
ISCSI_SESSION_WARN(is, "failed to register bus");
iscsi_session_terminate(is);
return (ENOMEM);
}
error = xpt_create_path(&is->is_path, /*periph*/NULL,
cam_sim_path(is->is_sim), CAM_TARGET_WILDCARD,
CAM_LUN_WILDCARD);
if (error != CAM_REQ_CMP) {
ISCSI_SESSION_UNLOCK(is);
ISCSI_SESSION_WARN(is, "failed to create path");
iscsi_session_terminate(is);
return (ENOMEM);
}
ISCSI_SESSION_UNLOCK(is);
}
return (0);
}
static int
iscsi_ioctl_daemon_fail(struct iscsi_softc *sc,
struct iscsi_daemon_fail *fail)
{
struct iscsi_session *is;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (is->is_id == fail->idf_session_id)
break;
}
if (is == NULL) {
sx_sunlock(&sc->sc_lock);
return (ESRCH);
}
ISCSI_SESSION_LOCK(is);
ISCSI_SESSION_DEBUG(is, "iscsid(8) failed: %s",
fail->idf_reason);
strlcpy(is->is_reason, fail->idf_reason, sizeof(is->is_reason));
//is->is_waiting_for_iscsid = false;
//is->is_login_phase = true;
//iscsi_session_reconnect(is);
ISCSI_SESSION_UNLOCK(is);
sx_sunlock(&sc->sc_lock);
return (0);
}
#ifdef ICL_KERNEL_PROXY
static int
iscsi_ioctl_daemon_connect(struct iscsi_softc *sc,
struct iscsi_daemon_connect *idc)
{
struct iscsi_session *is;
struct sockaddr *from_sa, *to_sa;
int error;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (is->is_id == idc->idc_session_id)
break;
}
if (is == NULL) {
sx_sunlock(&sc->sc_lock);
return (ESRCH);
}
sx_sunlock(&sc->sc_lock);
if (idc->idc_from_addrlen > 0) {
error = getsockaddr(&from_sa, (void *)idc->idc_from_addr, idc->idc_from_addrlen);
if (error != 0) {
ISCSI_SESSION_WARN(is,
"getsockaddr failed with error %d", error);
return (error);
}
} else {
from_sa = NULL;
}
error = getsockaddr(&to_sa, (void *)idc->idc_to_addr, idc->idc_to_addrlen);
if (error != 0) {
ISCSI_SESSION_WARN(is, "getsockaddr failed with error %d",
error);
free(from_sa, M_SONAME);
return (error);
}
ISCSI_SESSION_LOCK(is);
is->is_statsn = 0;
is->is_cmdsn = 0;
is->is_expcmdsn = 0;
is->is_maxcmdsn = 0;
is->is_waiting_for_iscsid = false;
is->is_login_phase = true;
is->is_timeout = 0;
ISCSI_SESSION_UNLOCK(is);
error = icl_conn_connect(is->is_conn, idc->idc_domain,
idc->idc_socktype, idc->idc_protocol, from_sa, to_sa);
free(from_sa, M_SONAME);
free(to_sa, M_SONAME);
/*
* Digests are always disabled during login phase.
*/
is->is_conn->ic_header_crc32c = false;
is->is_conn->ic_data_crc32c = false;
return (error);
}
static int
iscsi_ioctl_daemon_send(struct iscsi_softc *sc,
struct iscsi_daemon_send *ids)
{
struct iscsi_session *is;
struct icl_pdu *ip;
size_t datalen;
void *data;
int error;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (is->is_id == ids->ids_session_id)
break;
}
if (is == NULL) {
sx_sunlock(&sc->sc_lock);
return (ESRCH);
}
sx_sunlock(&sc->sc_lock);
if (is->is_login_phase == false)
return (EBUSY);
if (is->is_terminating || is->is_reconnecting)
return (EIO);
datalen = ids->ids_data_segment_len;
if (datalen > is->is_max_send_data_segment_length)
return (EINVAL);
if (datalen > 0) {
data = malloc(datalen, M_ISCSI, M_WAITOK);
error = copyin(ids->ids_data_segment, data, datalen);
if (error != 0) {
free(data, M_ISCSI);
return (error);
}
}
ip = icl_pdu_new(is->is_conn, M_WAITOK);
memcpy(ip->ip_bhs, ids->ids_bhs, sizeof(*ip->ip_bhs));
if (datalen > 0) {
error = icl_pdu_append_data(ip, data, datalen, M_WAITOK);
KASSERT(error == 0, ("icl_pdu_append_data(..., M_WAITOK) failed"));
free(data, M_ISCSI);
}
iscsi_pdu_queue(ip);
return (0);
}
static int
iscsi_ioctl_daemon_receive(struct iscsi_softc *sc,
struct iscsi_daemon_receive *idr)
{
struct iscsi_session *is;
struct icl_pdu *ip;
void *data;
int error;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (is->is_id == idr->idr_session_id)
break;
}
if (is == NULL) {
sx_sunlock(&sc->sc_lock);
return (ESRCH);
}
sx_sunlock(&sc->sc_lock);
if (is->is_login_phase == false)
return (EBUSY);
ISCSI_SESSION_LOCK(is);
while (is->is_login_pdu == NULL &&
is->is_terminating == false &&
is->is_reconnecting == false) {
error = cv_wait_sig(&is->is_login_cv, &is->is_lock);
if (error != 0) {
ISCSI_SESSION_UNLOCK(is);
return (error);
}
}
if (is->is_terminating || is->is_reconnecting) {
ISCSI_SESSION_UNLOCK(is);
return (EIO);
}
ip = is->is_login_pdu;
is->is_login_pdu = NULL;
ISCSI_SESSION_UNLOCK(is);
if (ip->ip_data_len > idr->idr_data_segment_len) {
icl_pdu_free(ip);
return (EMSGSIZE);
}
copyout(ip->ip_bhs, idr->idr_bhs, sizeof(*ip->ip_bhs));
if (ip->ip_data_len > 0) {
data = malloc(ip->ip_data_len, M_ISCSI, M_WAITOK);
icl_pdu_get_data(ip, 0, data, ip->ip_data_len);
copyout(data, idr->idr_data_segment, ip->ip_data_len);
free(data, M_ISCSI);
}
icl_pdu_free(ip);
return (0);
}
#endif /* ICL_KERNEL_PROXY */
static void
iscsi_sanitize_session_conf(struct iscsi_session_conf *isc)
{
/*
* Just make sure all the fields are null-terminated.
*
* XXX: This is not particularly secure. We should
* create our own conf and then copy in relevant
* fields.
*/
isc->isc_initiator[ISCSI_NAME_LEN - 1] = '\0';
isc->isc_initiator_addr[ISCSI_ADDR_LEN - 1] = '\0';
isc->isc_initiator_alias[ISCSI_ALIAS_LEN - 1] = '\0';
isc->isc_target[ISCSI_NAME_LEN - 1] = '\0';
isc->isc_target_addr[ISCSI_ADDR_LEN - 1] = '\0';
isc->isc_user[ISCSI_NAME_LEN - 1] = '\0';
isc->isc_secret[ISCSI_SECRET_LEN - 1] = '\0';
isc->isc_mutual_user[ISCSI_NAME_LEN - 1] = '\0';
isc->isc_mutual_secret[ISCSI_SECRET_LEN - 1] = '\0';
}
static bool
iscsi_valid_session_conf(const struct iscsi_session_conf *isc)
{
if (isc->isc_initiator[0] == '\0') {
ISCSI_DEBUG("empty isc_initiator");
return (false);
}
if (isc->isc_target_addr[0] == '\0') {
ISCSI_DEBUG("empty isc_target_addr");
return (false);
}
if (isc->isc_discovery != 0 && isc->isc_target[0] != 0) {
ISCSI_DEBUG("non-empty isc_target for discovery session");
return (false);
}
if (isc->isc_discovery == 0 && isc->isc_target[0] == 0) {
ISCSI_DEBUG("empty isc_target for non-discovery session");
return (false);
}
return (true);
}
static int
iscsi_ioctl_session_add(struct iscsi_softc *sc, struct iscsi_session_add *isa)
{
struct iscsi_session *is;
const struct iscsi_session *is2;
int error;
iscsi_sanitize_session_conf(&isa->isa_conf);
if (iscsi_valid_session_conf(&isa->isa_conf) == false)
return (EINVAL);
is = malloc(sizeof(*is), M_ISCSI, M_ZERO | M_WAITOK);
memcpy(&is->is_conf, &isa->isa_conf, sizeof(is->is_conf));
/*
* Set some default values, from RFC 3720, section 12.
*
* These values are updated by the handoff IOCTL, but are
* needed prior to the handoff to support sending the ISER
* login PDU.
*/
is->is_max_recv_data_segment_length = 8192;
is->is_max_send_data_segment_length = 8192;
is->is_max_burst_length = 262144;
is->is_first_burst_length = 65536;
sx_xlock(&sc->sc_lock);
/*
* Prevent duplicates.
*/
TAILQ_FOREACH(is2, &sc->sc_sessions, is_next) {
if (!!is->is_conf.isc_discovery !=
!!is2->is_conf.isc_discovery)
continue;
if (strcmp(is->is_conf.isc_target_addr,
is2->is_conf.isc_target_addr) != 0)
continue;
if (is->is_conf.isc_discovery == 0 &&
strcmp(is->is_conf.isc_target,
is2->is_conf.isc_target) != 0)
continue;
sx_xunlock(&sc->sc_lock);
free(is, M_ISCSI);
return (EBUSY);
}
is->is_conn = icl_new_conn(is->is_conf.isc_offload,
is->is_conf.isc_iser, "iscsi", &is->is_lock);
if (is->is_conn == NULL) {
sx_xunlock(&sc->sc_lock);
free(is, M_ISCSI);
return (EINVAL);
}
is->is_conn->ic_receive = iscsi_receive_callback;
is->is_conn->ic_error = iscsi_error_callback;
is->is_conn->ic_prv0 = is;
TAILQ_INIT(&is->is_outstanding);
STAILQ_INIT(&is->is_postponed);
mtx_init(&is->is_lock, "iscsi_lock", NULL, MTX_DEF);
cv_init(&is->is_maintenance_cv, "iscsi_mt");
#ifdef ICL_KERNEL_PROXY
cv_init(&is->is_login_cv, "iscsi_login");
#endif
is->is_softc = sc;
sc->sc_last_session_id++;
is->is_id = sc->sc_last_session_id;
is->is_isid[0] = 0x80; /* RFC 3720, 10.12.5: 10b, "Random" ISID. */
arc4rand(&is->is_isid[1], 5, 0);
is->is_tsih = 0;
callout_init(&is->is_callout, 1);
error = kthread_add(iscsi_maintenance_thread, is, NULL, NULL, 0, 0, "iscsimt");
if (error != 0) {
ISCSI_SESSION_WARN(is, "kthread_add(9) failed with error %d", error);
sx_xunlock(&sc->sc_lock);
return (error);
}
callout_reset(&is->is_callout, 1 * hz, iscsi_callout, is);
TAILQ_INSERT_TAIL(&sc->sc_sessions, is, is_next);
ISCSI_SESSION_LOCK(is);
/*
* Don't notify iscsid(8) if the session is disabled and it's not
* a discovery session.
*/
if (is->is_conf.isc_enable == 0 && is->is_conf.isc_discovery == 0) {
ISCSI_SESSION_UNLOCK(is);
sx_xunlock(&sc->sc_lock);
return (0);
}
is->is_waiting_for_iscsid = true;
strlcpy(is->is_reason, "Waiting for iscsid(8)", sizeof(is->is_reason));
ISCSI_SESSION_UNLOCK(is);
cv_signal(&sc->sc_cv);
sx_xunlock(&sc->sc_lock);
return (0);
}
static bool
iscsi_session_conf_matches(unsigned int id1, const struct iscsi_session_conf *c1,
unsigned int id2, const struct iscsi_session_conf *c2)
{
if (id2 != 0 && id2 != id1)
return (false);
if (c2->isc_target[0] != '\0' &&
strcmp(c1->isc_target, c2->isc_target) != 0)
return (false);
if (c2->isc_target_addr[0] != '\0' &&
strcmp(c1->isc_target_addr, c2->isc_target_addr) != 0)
return (false);
return (true);
}
static int
iscsi_ioctl_session_remove(struct iscsi_softc *sc,
struct iscsi_session_remove *isr)
{
struct iscsi_session *is, *tmp;
bool found = false;
iscsi_sanitize_session_conf(&isr->isr_conf);
sx_xlock(&sc->sc_lock);
TAILQ_FOREACH_SAFE(is, &sc->sc_sessions, is_next, tmp) {
ISCSI_SESSION_LOCK(is);
if (iscsi_session_conf_matches(is->is_id, &is->is_conf,
isr->isr_session_id, &isr->isr_conf)) {
found = true;
iscsi_session_logout(is);
iscsi_session_terminate(is);
}
ISCSI_SESSION_UNLOCK(is);
}
sx_xunlock(&sc->sc_lock);
if (!found)
return (ESRCH);
return (0);
}
static int
iscsi_ioctl_session_list(struct iscsi_softc *sc, struct iscsi_session_list *isl)
{
int error;
unsigned int i = 0;
struct iscsi_session *is;
struct iscsi_session_state iss;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
if (i >= isl->isl_nentries) {
sx_sunlock(&sc->sc_lock);
return (EMSGSIZE);
}
memset(&iss, 0, sizeof(iss));
memcpy(&iss.iss_conf, &is->is_conf, sizeof(iss.iss_conf));
iss.iss_id = is->is_id;
strlcpy(iss.iss_target_alias, is->is_target_alias, sizeof(iss.iss_target_alias));
strlcpy(iss.iss_reason, is->is_reason, sizeof(iss.iss_reason));
strlcpy(iss.iss_offload, is->is_conn->ic_offload, sizeof(iss.iss_offload));
if (is->is_conn->ic_header_crc32c)
iss.iss_header_digest = ISCSI_DIGEST_CRC32C;
else
iss.iss_header_digest = ISCSI_DIGEST_NONE;
if (is->is_conn->ic_data_crc32c)
iss.iss_data_digest = ISCSI_DIGEST_CRC32C;
else
iss.iss_data_digest = ISCSI_DIGEST_NONE;
iss.iss_max_send_data_segment_length =
is->is_max_send_data_segment_length;
iss.iss_max_recv_data_segment_length =
is->is_max_recv_data_segment_length;
iss.iss_max_burst_length = is->is_max_burst_length;
iss.iss_first_burst_length = is->is_first_burst_length;
iss.iss_immediate_data = is->is_immediate_data;
iss.iss_connected = is->is_connected;
error = copyout(&iss, isl->isl_pstates + i, sizeof(iss));
if (error != 0) {
sx_sunlock(&sc->sc_lock);
return (error);
}
i++;
}
sx_sunlock(&sc->sc_lock);
isl->isl_nentries = i;
return (0);
}
static int
iscsi_ioctl_session_modify(struct iscsi_softc *sc,
struct iscsi_session_modify *ism)
{
struct iscsi_session *is;
const struct iscsi_session *is2;
iscsi_sanitize_session_conf(&ism->ism_conf);
if (iscsi_valid_session_conf(&ism->ism_conf) == false)
return (EINVAL);
sx_xlock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
ISCSI_SESSION_LOCK(is);
if (is->is_id == ism->ism_session_id) {
/* Note that the session remains locked. */
break;
}
ISCSI_SESSION_UNLOCK(is);
}
if (is == NULL) {
sx_xunlock(&sc->sc_lock);
return (ESRCH);
}
/*
* Prevent duplicates.
*/
TAILQ_FOREACH(is2, &sc->sc_sessions, is_next) {
if (is == is2)
continue;
if (!!ism->ism_conf.isc_discovery !=
!!is2->is_conf.isc_discovery)
continue;
if (strcmp(ism->ism_conf.isc_target_addr,
is2->is_conf.isc_target_addr) != 0)
continue;
if (ism->ism_conf.isc_discovery == 0 &&
strcmp(ism->ism_conf.isc_target,
is2->is_conf.isc_target) != 0)
continue;
ISCSI_SESSION_UNLOCK(is);
sx_xunlock(&sc->sc_lock);
return (EBUSY);
}
sx_xunlock(&sc->sc_lock);
memcpy(&is->is_conf, &ism->ism_conf, sizeof(is->is_conf));
ISCSI_SESSION_UNLOCK(is);
iscsi_session_reconnect(is);
return (0);
}
static int
iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode,
struct thread *td)
{
struct iscsi_softc *sc;
sc = dev->si_drv1;
switch (cmd) {
case ISCSIDWAIT:
return (iscsi_ioctl_daemon_wait(sc,
(struct iscsi_daemon_request *)arg));
case ISCSIDHANDOFF:
return (iscsi_ioctl_daemon_handoff(sc,
(struct iscsi_daemon_handoff *)arg));
case ISCSIDFAIL:
return (iscsi_ioctl_daemon_fail(sc,
(struct iscsi_daemon_fail *)arg));
#ifdef ICL_KERNEL_PROXY
case ISCSIDCONNECT:
return (iscsi_ioctl_daemon_connect(sc,
(struct iscsi_daemon_connect *)arg));
case ISCSIDSEND:
return (iscsi_ioctl_daemon_send(sc,
(struct iscsi_daemon_send *)arg));
case ISCSIDRECEIVE:
return (iscsi_ioctl_daemon_receive(sc,
(struct iscsi_daemon_receive *)arg));
#endif /* ICL_KERNEL_PROXY */
case ISCSISADD:
return (iscsi_ioctl_session_add(sc,
(struct iscsi_session_add *)arg));
case ISCSISREMOVE:
return (iscsi_ioctl_session_remove(sc,
(struct iscsi_session_remove *)arg));
case ISCSISLIST:
return (iscsi_ioctl_session_list(sc,
(struct iscsi_session_list *)arg));
case ISCSISMODIFY:
return (iscsi_ioctl_session_modify(sc,
(struct iscsi_session_modify *)arg));
default:
return (EINVAL);
}
}
static struct iscsi_outstanding *
iscsi_outstanding_find(struct iscsi_session *is, uint32_t initiator_task_tag)
{
struct iscsi_outstanding *io;
ISCSI_SESSION_LOCK_ASSERT(is);
TAILQ_FOREACH(io, &is->is_outstanding, io_next) {
if (io->io_initiator_task_tag == initiator_task_tag)
return (io);
}
return (NULL);
}
static struct iscsi_outstanding *
iscsi_outstanding_find_ccb(struct iscsi_session *is, union ccb *ccb)
{
struct iscsi_outstanding *io;
ISCSI_SESSION_LOCK_ASSERT(is);
TAILQ_FOREACH(io, &is->is_outstanding, io_next) {
if (io->io_ccb == ccb)
return (io);
}
return (NULL);
}
static struct iscsi_outstanding *
iscsi_outstanding_add(struct iscsi_session *is, struct icl_pdu *request,
union ccb *ccb, uint32_t *initiator_task_tagp)
{
struct iscsi_outstanding *io;
int error;
ISCSI_SESSION_LOCK_ASSERT(is);
io = uma_zalloc(iscsi_outstanding_zone, M_NOWAIT | M_ZERO);
if (io == NULL) {
ISCSI_SESSION_WARN(is, "failed to allocate %zd bytes",
sizeof(*io));
return (NULL);
}
error = icl_conn_task_setup(is->is_conn, request, &ccb->csio,
initiator_task_tagp, &io->io_icl_prv);
if (error != 0) {
ISCSI_SESSION_WARN(is,
"icl_conn_task_setup() failed with error %d", error);
uma_zfree(iscsi_outstanding_zone, io);
return (NULL);
}
KASSERT(iscsi_outstanding_find(is, *initiator_task_tagp) == NULL,
("initiator_task_tag 0x%x already added", *initiator_task_tagp));
io->io_initiator_task_tag = *initiator_task_tagp;
io->io_ccb = ccb;
TAILQ_INSERT_TAIL(&is->is_outstanding, io, io_next);
return (io);
}
static void
iscsi_outstanding_remove(struct iscsi_session *is, struct iscsi_outstanding *io)
{
ISCSI_SESSION_LOCK_ASSERT(is);
icl_conn_task_done(is->is_conn, io->io_icl_prv);
TAILQ_REMOVE(&is->is_outstanding, io, io_next);
uma_zfree(iscsi_outstanding_zone, io);
}
static void
iscsi_action_abort(struct iscsi_session *is, union ccb *ccb)
{
struct icl_pdu *request;
struct iscsi_bhs_task_management_request *bhstmr;
struct ccb_abort *cab = &ccb->cab;
struct iscsi_outstanding *io, *aio;
uint32_t initiator_task_tag;
ISCSI_SESSION_LOCK_ASSERT(is);
#if 0
KASSERT(is->is_login_phase == false, ("%s called during Login Phase", __func__));
#else
if (is->is_login_phase) {
ccb->ccb_h.status = CAM_REQ_ABORTED;
xpt_done(ccb);
return;
}
#endif
aio = iscsi_outstanding_find_ccb(is, cab->abort_ccb);
if (aio == NULL) {
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return;
}
request = icl_pdu_new(is->is_conn, M_NOWAIT);
if (request == NULL) {
ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
return;
}
initiator_task_tag = is->is_initiator_task_tag++;
io = iscsi_outstanding_add(is, request, NULL, &initiator_task_tag);
if (io == NULL) {
icl_pdu_free(request);
ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
return;
}
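/*
 * Reuse io_datasn to remember the task tag of the command being aborted.
 */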
io->io_datasn = aio->io_initiator_task_tag;
bhstmr = (struct iscsi_bhs_task_management_request *)request->ip_bhs;
bhstmr->bhstmr_opcode = ISCSI_BHS_OPCODE_TASK_REQUEST;
bhstmr->bhstmr_function = 0x80 | BHSTMR_FUNCTION_ABORT_TASK;
bhstmr->bhstmr_lun = htobe64(CAM_EXTLUN_BYTE_SWIZZLE(ccb->ccb_h.target_lun));
bhstmr->bhstmr_initiator_task_tag = initiator_task_tag;
bhstmr->bhstmr_referenced_task_tag = aio->io_initiator_task_tag;
iscsi_pdu_queue_locked(request);
}
static void
iscsi_action_scsiio(struct iscsi_session *is, union ccb *ccb)
{
struct icl_pdu *request;
struct iscsi_bhs_scsi_command *bhssc;
struct ccb_scsiio *csio;
struct iscsi_outstanding *io;
size_t len;
uint32_t initiator_task_tag;
int error;
ISCSI_SESSION_LOCK_ASSERT(is);
#if 0
KASSERT(is->is_login_phase == false, ("%s called during Login Phase", __func__));
#else
if (is->is_login_phase) {
ISCSI_SESSION_DEBUG(is, "called during login phase");
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_REQ_ABORTED | CAM_DEV_QFRZN;
xpt_done(ccb);
return;
}
#endif
request = icl_pdu_new(is->is_conn, M_NOWAIT);
if (request == NULL) {
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
return;
}
initiator_task_tag = is->is_initiator_task_tag++;
io = iscsi_outstanding_add(is, request, ccb, &initiator_task_tag);
if (io == NULL) {
icl_pdu_free(request);
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
return;
}
csio = &ccb->csio;
bhssc = (struct iscsi_bhs_scsi_command *)request->ip_bhs;
bhssc->bhssc_opcode = ISCSI_BHS_OPCODE_SCSI_COMMAND;
bhssc->bhssc_flags |= BHSSC_FLAGS_F;
switch (csio->ccb_h.flags & CAM_DIR_MASK) {
case CAM_DIR_IN:
bhssc->bhssc_flags |= BHSSC_FLAGS_R;
break;
case CAM_DIR_OUT:
bhssc->bhssc_flags |= BHSSC_FLAGS_W;
break;
}
if ((ccb->ccb_h.flags & CAM_TAG_ACTION_VALID) != 0) {
switch (csio->tag_action) {
case MSG_HEAD_OF_Q_TAG:
bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_HOQ;
break;
case MSG_ORDERED_Q_TAG:
bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_ORDERED;
break;
case MSG_ACA_TASK:
bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_ACA;
break;
case MSG_SIMPLE_Q_TAG:
default:
bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_SIMPLE;
break;
}
} else
bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_UNTAGGED;
if (is->is_protocol_level >= 2) {
bhssc->bhssc_pri = (csio->priority << BHSSC_PRI_SHIFT) &
BHSSC_PRI_MASK;
}
bhssc->bhssc_lun = htobe64(CAM_EXTLUN_BYTE_SWIZZLE(ccb->ccb_h.target_lun));
bhssc->bhssc_initiator_task_tag = initiator_task_tag;
bhssc->bhssc_expected_data_transfer_length = htonl(csio->dxfer_len);
KASSERT(csio->cdb_len <= sizeof(bhssc->bhssc_cdb),
("unsupported CDB size %zd", (size_t)csio->cdb_len));
if (csio->ccb_h.flags & CAM_CDB_POINTER)
memcpy(&bhssc->bhssc_cdb, csio->cdb_io.cdb_ptr, csio->cdb_len);
else
memcpy(&bhssc->bhssc_cdb, csio->cdb_io.cdb_bytes, csio->cdb_len);
if (is->is_immediate_data &&
(csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
len = csio->dxfer_len;
//ISCSI_SESSION_DEBUG(is, "adding %zd of immediate data", len);
if (len > is->is_first_burst_length) {
ISCSI_SESSION_DEBUG(is, "len %zd -> %d", len, is->is_first_burst_length);
len = is->is_first_burst_length;
}
if (len > is->is_max_send_data_segment_length) {
ISCSI_SESSION_DEBUG(is, "len %zd -> %d", len,
is->is_max_send_data_segment_length);
len = is->is_max_send_data_segment_length;
}
error = icl_pdu_append_data(request, csio->data_ptr, len, M_NOWAIT);
if (error != 0) {
iscsi_outstanding_remove(is, io);
icl_pdu_free(request);
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ISCSI_SESSION_DEBUG(is, "freezing devq");
}
ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
return;
}
}
iscsi_pdu_queue_locked(request);
}
static void
iscsi_action(struct cam_sim *sim, union ccb *ccb)
{
struct iscsi_session *is;
is = cam_sim_softc(sim);
ISCSI_SESSION_LOCK_ASSERT(is);
if (is->is_terminating ||
(is->is_connected == false && fail_on_disconnection)) {
ccb->ccb_h.status = CAM_DEV_NOT_THERE;
xpt_done(ccb);
return;
}
/*
* Make sure CAM doesn't sneak in a CCB just after freezing the queue.
*/
if (is->is_simq_frozen == true) {
ccb->ccb_h.status &= ~(CAM_SIM_QUEUED | CAM_STATUS_MASK);
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
/* Don't freeze the devq - the SIM queue is already frozen. */
xpt_done(ccb);
return;
}
switch (ccb->ccb_h.func_code) {
case XPT_PATH_INQ:
{
struct ccb_pathinq *cpi = &ccb->cpi;
cpi->version_num = 1;
cpi->hba_inquiry = PI_TAG_ABLE;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_EXTLUNS;
/*
* XXX: It shouldn't ever be NULL; this could be turned
* into a KASSERT eventually.
*/
if (is->is_conn == NULL)
ISCSI_WARN("NULL conn");
else if (is->is_conn->ic_unmapped)
cpi->hba_misc |= PIM_UNMAPPED;
cpi->hba_eng_cnt = 0;
cpi->max_target = 0;
/*
* Note that the variable below is only relevant for targets
* that don't claim compliance with anything above SPC2, which
* means they don't support REPORT_LUNS.
*/
cpi->max_lun = 255;
cpi->initiator_id = ~0;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "iSCSI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000; /* XXX */
cpi->transport = XPORT_ISCSI;
cpi->transport_version = 0;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_SPC3;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts;
struct ccb_trans_settings_scsi *scsi;
cts = &ccb->cts;
scsi = &cts->proto_specific.scsi;
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_SPC3;
cts->transport = XPORT_ISCSI;
cts->transport_version = 0;
scsi->valid = CTS_SCSI_VALID_TQ;
scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
cts->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_CALC_GEOMETRY:
cam_calc_geometry(&ccb->ccg, /*extended*/1);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
#if 0
/*
* XXX: What's the point?
*/
case XPT_RESET_BUS:
case XPT_TERM_IO:
ISCSI_SESSION_DEBUG(is, "faking success for reset, abort, or term_io");
ccb->ccb_h.status = CAM_REQ_CMP;
break;
#endif
case XPT_ABORT:
iscsi_action_abort(is, ccb);
return;
case XPT_SCSI_IO:
iscsi_action_scsiio(is, ccb);
return;
default:
#if 0
ISCSI_SESSION_DEBUG(is, "got unsupported code 0x%x", ccb->ccb_h.func_code);
#endif
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
break;
}
xpt_done(ccb);
}
static void
iscsi_poll(struct cam_sim *sim)
{
KASSERT(0, ("%s: you're not supposed to be here", __func__));
}
static void
iscsi_terminate_sessions(struct iscsi_softc *sc)
{
struct iscsi_session *is;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next)
iscsi_session_terminate(is);
while (!TAILQ_EMPTY(&sc->sc_sessions)) {
ISCSI_DEBUG("waiting for sessions to terminate");
cv_wait(&sc->sc_cv, &sc->sc_lock);
}
ISCSI_DEBUG("all sessions terminated");
sx_sunlock(&sc->sc_lock);
}
static void
iscsi_shutdown_pre(struct iscsi_softc *sc)
{
struct iscsi_session *is;
if (!fail_on_shutdown)
return;
/*
* If we have any sessions waiting for reconnection, request
* maintenance thread to fail them immediately instead of waiting
* for reconnect timeout.
*
* This prevents LUNs with mounted filesystems that are backed
* by disconnected iSCSI sessions from hanging; however, it will
* fail all queued BIOs.
*/
ISCSI_DEBUG("forcing failing all disconnected sessions due to shutdown");
fail_on_disconnection = 1;
sx_slock(&sc->sc_lock);
TAILQ_FOREACH(is, &sc->sc_sessions, is_next) {
ISCSI_SESSION_LOCK(is);
if (!is->is_connected) {
ISCSI_SESSION_DEBUG(is, "force failing disconnected session early");
iscsi_session_reconnect(is);
}
ISCSI_SESSION_UNLOCK(is);
}
sx_sunlock(&sc->sc_lock);
}
static void
iscsi_shutdown_post(struct iscsi_softc *sc)
{
if (!KERNEL_PANICKED()) {
ISCSI_DEBUG("removing all sessions due to shutdown");
iscsi_terminate_sessions(sc);
}
}
static int
iscsi_load(void)
{
int error;
sc = malloc(sizeof(*sc), M_ISCSI, M_ZERO | M_WAITOK);
sx_init(&sc->sc_lock, "iscsi");
TAILQ_INIT(&sc->sc_sessions);
cv_init(&sc->sc_cv, "iscsi_cv");
iscsi_outstanding_zone = uma_zcreate("iscsi_outstanding",
sizeof(struct iscsi_outstanding), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
error = make_dev_p(MAKEDEV_CHECKNAME, &sc->sc_cdev, &iscsi_cdevsw,
NULL, UID_ROOT, GID_WHEEL, 0600, "iscsi");
if (error != 0) {
ISCSI_WARN("failed to create device node, error %d", error);
return (error);
}
sc->sc_cdev->si_drv1 = sc;
sc->sc_shutdown_pre_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
iscsi_shutdown_pre, sc, SHUTDOWN_PRI_FIRST);
/*
* shutdown_post_sync needs to run after filesystem shutdown and before
* CAM shutdown - otherwise when rebooting with an iSCSI session that is
* disconnected but has outstanding requests, dashutdown() will hang on
* cam_periph_runccb().
*/
sc->sc_shutdown_post_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
iscsi_shutdown_post, sc, SHUTDOWN_PRI_DEFAULT - 1);
return (0);
}
static int
iscsi_unload(void)
{
if (sc->sc_cdev != NULL) {
ISCSI_DEBUG("removing device node");
destroy_dev(sc->sc_cdev);
ISCSI_DEBUG("device node removed");
}
if (sc->sc_shutdown_pre_eh != NULL)
EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->sc_shutdown_pre_eh);
if (sc->sc_shutdown_post_eh != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->sc_shutdown_post_eh);
iscsi_terminate_sessions(sc);
uma_zdestroy(iscsi_outstanding_zone);
sx_destroy(&sc->sc_lock);
cv_destroy(&sc->sc_cv);
free(sc, M_ISCSI);
return (0);
}
static int
iscsi_quiesce(void)
{
sx_slock(&sc->sc_lock);
if (!TAILQ_EMPTY(&sc->sc_sessions)) {
sx_sunlock(&sc->sc_lock);
return (EBUSY);
}
sx_sunlock(&sc->sc_lock);
return (0);
}
static int
iscsi_modevent(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
error = iscsi_load();
break;
case MOD_UNLOAD:
error = iscsi_unload();
break;
case MOD_QUIESCE:
error = iscsi_quiesce();
break;
default:
error = EINVAL;
break;
}
return (error);
}
moduledata_t iscsi_data = {
"iscsi",
iscsi_modevent,
0
};
DECLARE_MODULE(iscsi, iscsi_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(iscsi, cam, 1, 1, 1);
MODULE_DEPEND(iscsi, icl, 1, 1, 1);
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 6334e043d0e6..1d17603ffdfe 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -1,2169 +1,2170 @@
/*-
* SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*
* $FreeBSD$
*
*/
/*-
* The following functions are based on the vn(4) driver: mdstart_swap(),
* mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
* and as such under the following copyright:
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah Hdr: vn.c 1.13 94/04/02
*
* from: @(#)vn.c 8.6 (Berkeley) 4/1/94
* From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
*/
#include "opt_rootdevname.h"
#include "opt_geom.h"
#include "opt_md.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/disk.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>
#include <machine/bus.h>
#define MD_MODVER 1
#define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */
#define MD_EXITING 0x20000 /* Worker thread is exiting. */
#define MD_PROVIDERGONE 0x40000 /* Safe to free the softc */
#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif
struct md_req {
unsigned md_unit; /* unit number */
enum md_types md_type; /* type of disk */
off_t md_mediasize; /* size of disk in bytes */
unsigned md_sectorsize; /* sectorsize */
unsigned md_options; /* options */
int md_fwheads; /* firmware heads */
int md_fwsectors; /* firmware sectors */
char *md_file; /* pathname of file to mount */
enum uio_seg md_file_seg; /* location of md_file */
char *md_label; /* label of the device (userspace) */
int *md_units; /* pointer to units array (kernel) */
size_t md_units_nitems; /* items in md_units array */
};
#ifdef COMPAT_FREEBSD32
struct md_ioctl32 {
unsigned md_version;
unsigned md_unit;
enum md_types md_type;
uint32_t md_file;
off_t md_mediasize;
unsigned md_sectorsize;
unsigned md_options;
uint64_t md_base;
int md_fwheads;
int md_fwsectors;
uint32_t md_label;
int md_pad[MDNPAD];
} __attribute__((__packed__));
CTASSERT((sizeof(struct md_ioctl32)) == 436);
#define MDIOCATTACH_32 _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
#define MDIOCDETACH_32 _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
#define MDIOCQUERY_32 _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
#define MDIOCRESIZE_32 _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
#endif /* COMPAT_FREEBSD32 */
static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
"Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
"Allow malloc to wait for memory allocations");
#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define MD_ROOT_FSTYPE "ufs"
#endif
#if defined(MD_ROOT)
/*
* Preloaded image gets put here.
*/
#if defined(MD_ROOT_SIZE)
/*
* We put the mfs_root symbol into the oldmfs section of the kernel object file.
* Applications that patch the object with the image can determine
* the size by looking at the oldmfs section size within the kernel.
*/
u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
const int mfs_root_size = sizeof(mfs_root);
#elif defined(MD_ROOT_MEM)
/* MD region already mapped in memory */
u_char *mfs_root;
int mfs_root_size;
#else
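/*
 * The preloaded image, if any, sits between the weak symbols mfs_root
 * and mfs_root_end; its size is the distance between them.
 */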
extern volatile u_char __weak_symbol mfs_root;
extern volatile u_char __weak_symbol mfs_root_end;
__GLOBL(mfs_root);
__GLOBL(mfs_root_end);
#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
#endif
#endif
static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
static g_provgone_t g_md_providergone;
static struct cdev *status_dev = NULL;
static struct sx md_sx;
static struct unrhdr *md_uh;
static d_ioctl_t mdctlioctl;
static struct cdevsw mdctl_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = mdctlioctl,
.d_name = MD_NAME,
};
struct g_class g_md_class = {
.name = "MD",
.version = G_VERSION,
.init = g_md_init,
.fini = g_md_fini,
.start = g_md_start,
.access = g_md_access,
.dumpconf = g_md_dumpconf,
.providergone = g_md_providergone,
};
DECLARE_GEOM_CLASS(g_md_class, g_md);
static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
#define NINDIR (PAGE_SIZE / sizeof(uintptr_t))
#define NMASK (NINDIR-1)
static int nshift;
static uma_zone_t md_pbuf_zone;
struct indir {
uintptr_t *array;
u_int total;
u_int used;
u_int shift;
};
struct md_s {
int unit;
LIST_ENTRY(md_s) list;
struct bio_queue_head bio_queue;
struct mtx queue_mtx;
struct cdev *dev;
enum md_types type;
off_t mediasize;
unsigned sectorsize;
unsigned opencount;
unsigned fwheads;
unsigned fwsectors;
char ident[32];
unsigned flags;
char name[20];
struct proc *procp;
struct g_geom *gp;
struct g_provider *pp;
int (*start)(struct md_s *sc, struct bio *bp);
struct devstat *devstat;
/* MD_MALLOC related fields */
struct indir *indir;
uma_zone_t uma;
/* MD_PRELOAD related fields */
u_char *pl_ptr;
size_t pl_len;
/* MD_VNODE related fields */
struct vnode *vnode;
char file[PATH_MAX];
char label[PATH_MAX];
struct ucred *cred;
/* MD_SWAP related fields */
vm_object_t object;
};
static struct indir *
new_indir(u_int shift)
{
struct indir *ip;
ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
| M_ZERO);
if (ip == NULL)
return (NULL);
ip->array = malloc(sizeof(uintptr_t) * NINDIR,
M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
if (ip->array == NULL) {
free(ip, M_MD);
return (NULL);
}
ip->total = NINDIR;
ip->shift = shift;
return (ip);
}
static void
del_indir(struct indir *ip)
{
free(ip->array, M_MDSECT);
free(ip, M_MD);
}
static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
int i;
for (i = 0; i < NINDIR; i++) {
if (!ip->array[i])
continue;
if (ip->shift)
destroy_indir(sc, (struct indir*)(ip->array[i]));
else if (ip->array[i] > 255)
uma_zfree(sc->uma, (void *)(ip->array[i]));
}
del_indir(ip);
}
/*
* This function does the math and allocates the top level "indir" structure
* for a device of "size" sectors.
*/
static struct indir *
dimension(off_t size)
{
off_t rcnt;
struct indir *ip;
int layer;
rcnt = size;
layer = 0;
while (rcnt > NINDIR) {
rcnt /= NINDIR;
layer++;
}
/*
* XXX: the top layer is probably not fully populated, so we allocate
* too much space for ip->array in here.
*/
ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
ip->array = malloc(sizeof(uintptr_t) * NINDIR,
M_MDSECT, M_WAITOK | M_ZERO);
ip->total = NINDIR;
ip->shift = layer * nshift;
return (ip);
}
/*
* Read a given sector
*/
static uintptr_t
s_read(struct indir *ip, off_t offset)
{
struct indir *cip;
int idx;
uintptr_t up;
if (md_debug > 1)
printf("s_read(%jd)\n", (intmax_t)offset);
up = 0;
for (cip = ip; cip != NULL;) {
if (cip->shift) {
idx = (offset >> cip->shift) & NMASK;
up = cip->array[idx];
cip = (struct indir *)up;
continue;
}
idx = offset & NMASK;
return (cip->array[idx]);
}
return (0);
}
/*
* Write a given sector, prune the tree if the value is 0
*/
static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
struct indir *cip, *lip[10];
int idx, li;
uintptr_t up;
if (md_debug > 1)
printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
up = 0;
li = 0;
cip = ip;
for (;;) {
lip[li++] = cip;
if (cip->shift) {
idx = (offset >> cip->shift) & NMASK;
up = cip->array[idx];
if (up != 0) {
cip = (struct indir *)up;
continue;
}
/* Allocate branch */
cip->array[idx] =
(uintptr_t)new_indir(cip->shift - nshift);
if (cip->array[idx] == 0)
return (ENOSPC);
cip->used++;
up = cip->array[idx];
cip = (struct indir *)up;
continue;
}
/* leafnode */
idx = offset & NMASK;
up = cip->array[idx];
if (up != 0)
cip->used--;
cip->array[idx] = ptr;
if (ptr != 0)
cip->used++;
break;
}
if (cip->used != 0 || li == 1)
return (0);
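/*
 * The leaf indirect block is now empty; walk back up the tree and free
 * indirect blocks whose entries are all unused.
 */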
li--;
while (cip->used == 0 && cip != ip) {
li--;
idx = (offset >> lip[li]->shift) & NMASK;
up = lip[li]->array[idx];
KASSERT(up == (uintptr_t)cip, ("md screwed up"));
del_indir(cip);
lip[li]->array[idx] = 0;
lip[li]->used--;
cip = lip[li];
}
return (0);
}
static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
struct md_s *sc;
sc = pp->geom->softc;
if (sc == NULL) {
if (r <= 0 && w <= 0 && e <= 0)
return (0);
return (ENXIO);
}
r += pp->acr;
w += pp->acw;
e += pp->ace;
if ((sc->flags & MD_READONLY) != 0 && w > 0)
return (EROFS);
if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
sc->opencount = 1;
} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
sc->opencount = 0;
}
return (0);
}
static void
g_md_start(struct bio *bp)
{
struct md_s *sc;
sc = bp->bio_to->geom->softc;
if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
devstat_start_transaction_bio(sc->devstat, bp);
}
mtx_lock(&sc->queue_mtx);
bioq_disksort(&sc->bio_queue, bp);
wakeup(sc);
mtx_unlock(&sc->queue_mtx);
}
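/* Operations implemented by md_malloc_move_ma() and md_malloc_move_vlist(). */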
#define MD_MALLOC_MOVE_ZERO 1
#define MD_MALLOC_MOVE_FILL 2
#define MD_MALLOC_MOVE_READ 3
#define MD_MALLOC_MOVE_WRITE 4
#define MD_MALLOC_MOVE_CMP 5
static int
md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
void *ptr, u_char fill, int op)
{
struct sf_buf *sf;
vm_page_t m, *mp1;
char *p, first;
off_t *uc;
unsigned n;
int error, i, ma_offs1, sz, first_read;
m = NULL;
error = 0;
sf = NULL;
/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
first = 0;
first_read = 0;
uc = ptr;
mp1 = *mp;
ma_offs1 = *ma_offs;
/* } */
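/* Pin to the current CPU; the sf_buf mappings below are CPU-private. */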
sched_pin();
for (n = sectorsize; n != 0; n -= sz) {
sz = imin(PAGE_SIZE - *ma_offs, n);
if (m != **mp) {
if (sf != NULL)
sf_buf_free(sf);
m = **mp;
sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
(md_malloc_wait ? 0 : SFB_NOWAIT));
if (sf == NULL) {
error = ENOMEM;
break;
}
}
p = (char *)sf_buf_kva(sf) + *ma_offs;
switch (op) {
case MD_MALLOC_MOVE_ZERO:
bzero(p, sz);
break;
case MD_MALLOC_MOVE_FILL:
memset(p, fill, sz);
break;
case MD_MALLOC_MOVE_READ:
bcopy(ptr, p, sz);
cpu_flush_dcache(p, sz);
break;
case MD_MALLOC_MOVE_WRITE:
bcopy(p, ptr, sz);
break;
case MD_MALLOC_MOVE_CMP:
for (i = 0; i < sz; i++, p++) {
if (!first_read) {
*uc = (u_char)*p;
first = *p;
first_read = 1;
} else if (*p != first) {
error = EDOOFUS;
break;
}
}
break;
default:
KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
break;
}
if (error != 0)
break;
*ma_offs += sz;
*ma_offs %= PAGE_SIZE;
if (*ma_offs == 0)
(*mp)++;
ptr = (char *)ptr + sz;
}
if (sf != NULL)
sf_buf_free(sf);
sched_unpin();
if (op == MD_MALLOC_MOVE_CMP && error != 0) {
*mp = mp1;
*ma_offs = ma_offs1;
}
return (error);
}
static int
md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
unsigned len, void *ptr, u_char fill, int op)
{
bus_dma_segment_t *vlist;
uint8_t *p, *end, first;
off_t *uc;
int ma_offs, seg_len;
vlist = *pvlist;
ma_offs = *pma_offs;
uc = ptr;
for (; len != 0; len -= seg_len) {
seg_len = imin(vlist->ds_len - ma_offs, len);
p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
switch (op) {
case MD_MALLOC_MOVE_ZERO:
bzero(p, seg_len);
break;
case MD_MALLOC_MOVE_FILL:
memset(p, fill, seg_len);
break;
case MD_MALLOC_MOVE_READ:
bcopy(ptr, p, seg_len);
cpu_flush_dcache(p, seg_len);
break;
case MD_MALLOC_MOVE_WRITE:
bcopy(p, ptr, seg_len);
break;
case MD_MALLOC_MOVE_CMP:
end = p + seg_len;
first = *uc = *p;
/* Confirm all following bytes match the first */
while (++p < end) {
if (*p != first)
return (EDOOFUS);
}
break;
default:
KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
break;
}
ma_offs += seg_len;
if (ma_offs == vlist->ds_len) {
ma_offs = 0;
vlist++;
}
ptr = (uint8_t *)ptr + seg_len;
}
*pvlist = vlist;
*pma_offs = ma_offs;
return (0);
}
static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
u_char *dst;
vm_page_t *m;
bus_dma_segment_t *vlist;
int i, error, error1, ma_offs, notmapped;
off_t secno, nsec, uc;
uintptr_t sp, osp;
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
break;
default:
return (EOPNOTSUPP);
}
notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
(bus_dma_segment_t *)bp->bio_data : NULL;
if (notmapped) {
m = bp->bio_ma;
ma_offs = bp->bio_ma_offset;
dst = NULL;
KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
} else if (vlist != NULL) {
ma_offs = bp->bio_ma_offset;
dst = NULL;
} else {
dst = bp->bio_data;
}
nsec = bp->bio_length / sc->sectorsize;
secno = bp->bio_offset / sc->sectorsize;
error = 0;
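/*
 * Each entry in the indirect tree is either 0 (unallocated; reads return
 * zeroes), a value in the range 1..255 (a sector filled entirely with that
 * byte, used when MD_COMPRESS is set), or a pointer to a uma-allocated
 * sector buffer.
 */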
while (nsec--) {
osp = s_read(sc->indir, secno);
if (bp->bio_cmd == BIO_DELETE) {
if (osp != 0)
error = s_write(sc->indir, secno, 0);
} else if (bp->bio_cmd == BIO_READ) {
if (osp == 0) {
if (notmapped) {
error = md_malloc_move_ma(&m, &ma_offs,
sc->sectorsize, NULL, 0,
MD_MALLOC_MOVE_ZERO);
} else if (vlist != NULL) {
error = md_malloc_move_vlist(&vlist,
&ma_offs, sc->sectorsize, NULL, 0,
MD_MALLOC_MOVE_ZERO);
} else
bzero(dst, sc->sectorsize);
} else if (osp <= 255) {
if (notmapped) {
error = md_malloc_move_ma(&m, &ma_offs,
sc->sectorsize, NULL, osp,
MD_MALLOC_MOVE_FILL);
} else if (vlist != NULL) {
error = md_malloc_move_vlist(&vlist,
&ma_offs, sc->sectorsize, NULL, osp,
MD_MALLOC_MOVE_FILL);
} else
memset(dst, osp, sc->sectorsize);
} else {
if (notmapped) {
error = md_malloc_move_ma(&m, &ma_offs,
sc->sectorsize, (void *)osp, 0,
MD_MALLOC_MOVE_READ);
} else if (vlist != NULL) {
error = md_malloc_move_vlist(&vlist,
&ma_offs, sc->sectorsize,
(void *)osp, 0,
MD_MALLOC_MOVE_READ);
} else {
bcopy((void *)osp, dst, sc->sectorsize);
cpu_flush_dcache(dst, sc->sectorsize);
}
}
osp = 0;
} else if (bp->bio_cmd == BIO_WRITE) {
if (sc->flags & MD_COMPRESS) {
if (notmapped) {
error1 = md_malloc_move_ma(&m, &ma_offs,
sc->sectorsize, &uc, 0,
MD_MALLOC_MOVE_CMP);
i = error1 == 0 ? sc->sectorsize : 0;
} else if (vlist != NULL) {
error1 = md_malloc_move_vlist(&vlist,
&ma_offs, sc->sectorsize, &uc, 0,
MD_MALLOC_MOVE_CMP);
i = error1 == 0 ? sc->sectorsize : 0;
} else {
uc = dst[0];
for (i = 1; i < sc->sectorsize; i++) {
if (dst[i] != uc)
break;
}
}
} else {
i = 0;
uc = 0;
}
if (i == sc->sectorsize) {
if (osp != uc)
error = s_write(sc->indir, secno, uc);
} else {
if (osp <= 255) {
sp = (uintptr_t)uma_zalloc(sc->uma,
md_malloc_wait ? M_WAITOK :
M_NOWAIT);
if (sp == 0) {
error = ENOSPC;
break;
}
if (notmapped) {
error = md_malloc_move_ma(&m,
&ma_offs, sc->sectorsize,
(void *)sp, 0,
MD_MALLOC_MOVE_WRITE);
} else if (vlist != NULL) {
error = md_malloc_move_vlist(
&vlist, &ma_offs,
sc->sectorsize, (void *)sp,
0, MD_MALLOC_MOVE_WRITE);
} else {
bcopy(dst, (void *)sp,
sc->sectorsize);
}
error = s_write(sc->indir, secno, sp);
} else {
if (notmapped) {
error = md_malloc_move_ma(&m,
&ma_offs, sc->sectorsize,
(void *)osp, 0,
MD_MALLOC_MOVE_WRITE);
} else if (vlist != NULL) {
error = md_malloc_move_vlist(
&vlist, &ma_offs,
sc->sectorsize, (void *)osp,
0, MD_MALLOC_MOVE_WRITE);
} else {
bcopy(dst, (void *)osp,
sc->sectorsize);
}
osp = 0;
}
}
} else {
error = EOPNOTSUPP;
}
if (osp > 255)
uma_zfree(sc->uma, (void*)osp);
if (error != 0)
break;
secno++;
if (!notmapped && vlist == NULL)
dst += sc->sectorsize;
}
bp->bio_resid = 0;
return (error);
}
static void
mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
{
off_t seg_len;
while (offset >= vlist->ds_len) {
offset -= vlist->ds_len;
vlist++;
}
while (len != 0) {
seg_len = omin(len, vlist->ds_len - offset);
bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
seg_len);
offset = 0;
src = (uint8_t *)src + seg_len;
len -= seg_len;
vlist++;
}
}
static void
mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
{
off_t seg_len;
while (offset >= vlist->ds_len) {
offset -= vlist->ds_len;
vlist++;
}
while (len != 0) {
seg_len = omin(len, vlist->ds_len - offset);
bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
seg_len);
offset = 0;
dst = (uint8_t *)dst + seg_len;
len -= seg_len;
vlist++;
}
}
static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{
uint8_t *p;
p = sc->pl_ptr + bp->bio_offset;
switch (bp->bio_cmd) {
case BIO_READ:
if ((bp->bio_flags & BIO_VLIST) != 0) {
mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
bp->bio_ma_offset, bp->bio_length);
} else {
bcopy(p, bp->bio_data, bp->bio_length);
}
cpu_flush_dcache(bp->bio_data, bp->bio_length);
break;
case BIO_WRITE:
if ((bp->bio_flags & BIO_VLIST) != 0) {
mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
bp->bio_ma_offset, p, bp->bio_length);
} else {
bcopy(bp->bio_data, p, bp->bio_length);
}
break;
}
bp->bio_resid = 0;
return (0);
}
static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
int error;
struct uio auio;
struct iovec aiov;
struct iovec *piov;
struct mount *mp;
struct vnode *vp;
struct buf *pb;
bus_dma_segment_t *vlist;
struct thread *td;
off_t iolen, iostart, len, zerosize;
int ma_offs, npages;
switch (bp->bio_cmd) {
case BIO_READ:
auio.uio_rw = UIO_READ;
break;
case BIO_WRITE:
case BIO_DELETE:
auio.uio_rw = UIO_WRITE;
break;
case BIO_FLUSH:
break;
default:
return (EOPNOTSUPP);
}
td = curthread;
vp = sc->vnode;
pb = NULL;
piov = NULL;
ma_offs = bp->bio_ma_offset;
len = bp->bio_length;
/*
* VNODE I/O
*
* If an error occurs, we set BIO_ERROR but we do not set
* B_INVAL because (for a write anyway), the buffer is
* still valid.
*/
if (bp->bio_cmd == BIO_FLUSH) {
(void) vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp);
vn_finished_write(mp);
return (error);
}
auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
auio.uio_resid = bp->bio_length;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
if (bp->bio_cmd == BIO_DELETE) {
/*
* Emulate BIO_DELETE by writing zeros.
*/
zerosize = ZERO_REGION_SIZE -
(ZERO_REGION_SIZE % sc->sectorsize);
auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
auio.uio_iov = piov;
while (len > 0) {
piov->iov_base = __DECONST(void *, zero_region);
piov->iov_len = len;
if (len > zerosize)
piov->iov_len = zerosize;
len -= piov->iov_len;
piov++;
}
piov = auio.uio_iov;
} else if ((bp->bio_flags & BIO_VLIST) != 0) {
piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
auio.uio_iov = piov;
vlist = (bus_dma_segment_t *)bp->bio_data;
while (len > 0) {
piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
ma_offs);
piov->iov_len = vlist->ds_len - ma_offs;
if (piov->iov_len > len)
piov->iov_len = len;
len -= piov->iov_len;
ma_offs = 0;
vlist++;
piov++;
}
auio.uio_iovcnt = piov - auio.uio_iov;
piov = auio.uio_iov;
} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
+ MPASS((pb->b_flags & B_MAXPHYS) != 0);
bp->bio_resid = len;
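/*
 * Process the unmapped bio in chunks of at most maxphys bytes, mapping
 * the pages of each chunk into the pbuf's KVA.
 */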
unmapped_step:
- npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
+ npages = atop(min(maxphys, round_page(len + (ma_offs &
PAGE_MASK))));
iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
KASSERT(iolen > 0, ("zero iolen"));
pmap_qenter((vm_offset_t)pb->b_data,
&bp->bio_ma[atop(ma_offs)], npages);
aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
(ma_offs & PAGE_MASK));
aiov.iov_len = iolen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = iolen;
} else {
aiov.iov_base = bp->bio_data;
aiov.iov_len = bp->bio_length;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
}
iostart = auio.uio_offset;
if (auio.uio_rw == UIO_READ) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_READ(vp, &auio, 0, sc->cred);
VOP_UNLOCK(vp);
} else {
(void) vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
sc->cred);
VOP_UNLOCK(vp);
vn_finished_write(mp);
if (error == 0)
sc->flags &= ~MD_VERIFY;
}
/* When MD_CACHE is set, try to avoid double-caching the data. */
if (error == 0 && (sc->flags & MD_CACHE) == 0)
VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
POSIX_FADV_DONTNEED);
if (pb != NULL) {
pmap_qremove((vm_offset_t)pb->b_data, npages);
if (error == 0) {
len -= iolen;
bp->bio_resid -= iolen;
ma_offs += iolen;
if (len > 0)
goto unmapped_step;
}
uma_zfree(md_pbuf_zone, pb);
}
free(piov, M_MD);
if (pb == NULL)
bp->bio_resid = auio.uio_resid;
return (error);
}
static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
vm_page_t m;
u_char *p;
vm_pindex_t i, lastp;
bus_dma_segment_t *vlist;
int rv, ma_offs, offs, len, lastend;
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
break;
default:
return (EOPNOTSUPP);
}
p = bp->bio_data;
ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
bp->bio_ma_offset : 0;
vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
(bus_dma_segment_t *)bp->bio_data : NULL;
/*
* offs is the offset at which to start operating on the
* next (ie, first) page. lastp is the last page on
* which we're going to operate. lastend is the ending
* position within that last page (ie, PAGE_SIZE if
* we're operating on complete aligned pages).
*/
offs = bp->bio_offset % PAGE_SIZE;
lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
rv = VM_PAGER_OK;
vm_object_pip_add(sc->object, 1);
for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
if (bp->bio_cmd == BIO_READ) {
if (vm_page_all_valid(m))
rv = VM_PAGER_OK;
else
rv = vm_pager_get_pages(sc->object, &m, 1,
NULL, NULL);
if (rv == VM_PAGER_ERROR) {
VM_OBJECT_WLOCK(sc->object);
vm_page_free(m);
VM_OBJECT_WUNLOCK(sc->object);
break;
} else if (rv == VM_PAGER_FAIL) {
/*
* Pager does not have the page. Zero
* the allocated page, and mark it as
* valid. Do not set dirty; the page
* can be recreated if thrown out.
*/
pmap_zero_page(m);
vm_page_valid(m);
}
if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
pmap_copy_pages(&m, offs, bp->bio_ma,
ma_offs, len);
} else if ((bp->bio_flags & BIO_VLIST) != 0) {
physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
vlist, ma_offs, len);
cpu_flush_dcache(p, len);
} else {
physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
cpu_flush_dcache(p, len);
}
} else if (bp->bio_cmd == BIO_WRITE) {
if (len == PAGE_SIZE || vm_page_all_valid(m))
rv = VM_PAGER_OK;
else
rv = vm_pager_get_pages(sc->object, &m, 1,
NULL, NULL);
if (rv == VM_PAGER_ERROR) {
VM_OBJECT_WLOCK(sc->object);
vm_page_free(m);
VM_OBJECT_WUNLOCK(sc->object);
break;
} else if (rv == VM_PAGER_FAIL)
pmap_zero_page(m);
if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
pmap_copy_pages(bp->bio_ma, ma_offs, &m,
offs, len);
} else if ((bp->bio_flags & BIO_VLIST) != 0) {
physcopyin_vlist(vlist, ma_offs,
VM_PAGE_TO_PHYS(m) + offs, len);
} else {
physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
}
vm_page_valid(m);
vm_page_set_dirty(m);
} else if (bp->bio_cmd == BIO_DELETE) {
if (len == PAGE_SIZE || vm_page_all_valid(m))
rv = VM_PAGER_OK;
else
rv = vm_pager_get_pages(sc->object, &m, 1,
NULL, NULL);
VM_OBJECT_WLOCK(sc->object);
if (rv == VM_PAGER_ERROR) {
vm_page_free(m);
VM_OBJECT_WUNLOCK(sc->object);
break;
} else if (rv == VM_PAGER_FAIL) {
vm_page_free(m);
m = NULL;
} else {
/* Page is valid. */
if (len != PAGE_SIZE) {
pmap_zero_page_area(m, offs, len);
vm_page_set_dirty(m);
} else {
vm_pager_page_unswapped(m);
vm_page_free(m);
m = NULL;
}
}
VM_OBJECT_WUNLOCK(sc->object);
}
if (m != NULL) {
vm_page_xunbusy(m);
/*
* The page may be deactivated prior to setting
* PGA_REFERENCED, but in this case it will be
* reactivated by the page daemon.
*/
if (vm_page_active(m))
vm_page_reference(m);
else
vm_page_activate(m);
}
/* Actions on further pages start at offset 0 */
p += PAGE_SIZE - offs;
offs = 0;
ma_offs += len;
}
vm_object_pip_wakeup(sc->object);
return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}
static int
mdstart_null(struct md_s *sc, struct bio *bp)
{
switch (bp->bio_cmd) {
case BIO_READ:
bzero(bp->bio_data, bp->bio_length);
cpu_flush_dcache(bp->bio_data, bp->bio_length);
break;
case BIO_WRITE:
break;
}
bp->bio_resid = 0;
return (0);
}
static void
md_kthread(void *arg)
{
struct md_s *sc;
struct bio *bp;
int error;
sc = arg;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
if (sc->type == MD_VNODE)
curthread->td_pflags |= TDP_NORUNNINGBUF;
for (;;) {
mtx_lock(&sc->queue_mtx);
if (sc->flags & MD_SHUTDOWN) {
sc->flags |= MD_EXITING;
mtx_unlock(&sc->queue_mtx);
kproc_exit(0);
}
bp = bioq_takefirst(&sc->bio_queue);
if (!bp) {
msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
continue;
}
mtx_unlock(&sc->queue_mtx);
if (bp->bio_cmd == BIO_GETATTR) {
int isv = ((sc->flags & MD_VERIFY) != 0);
if ((sc->fwsectors && sc->fwheads &&
(g_handleattr_int(bp, "GEOM::fwsectors",
sc->fwsectors) ||
g_handleattr_int(bp, "GEOM::fwheads",
sc->fwheads))) ||
g_handleattr_int(bp, "GEOM::candelete", 1))
error = -1;
else if (sc->ident[0] != '\0' &&
g_handleattr_str(bp, "GEOM::ident", sc->ident))
error = -1;
else if (g_handleattr_int(bp, "MNT::verified", isv))
error = -1;
else
error = EOPNOTSUPP;
} else {
error = sc->start(sc, bp);
}
if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
/*
* Devstat uses (bio_bcount, bio_resid) for
* determining the length of the completed part of
* the i/o. g_io_deliver() will translate from
* bio_completed to that, but it also destroys the
* bio so we must do our own translation.
*/
bp->bio_bcount = bp->bio_length;
bp->bio_resid = (error == -1 ? bp->bio_bcount : 0);
devstat_end_transaction_bio(sc->devstat, bp);
}
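/*
 * error == -1 means one of the g_handleattr_*() calls above handled
 * and already delivered the bio, so it must not be delivered again.
 */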
if (error != -1) {
bp->bio_completed = bp->bio_length;
g_io_deliver(bp, error);
}
}
}
static struct md_s *
mdfind(int unit)
{
struct md_s *sc;
LIST_FOREACH(sc, &md_softc_list, list) {
if (sc->unit == unit)
break;
}
return (sc);
}
static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
struct md_s *sc;
int error;
*errp = 0;
if (unit == -1)
unit = alloc_unr(md_uh);
else
unit = alloc_unr_specific(md_uh, unit);
if (unit == -1) {
*errp = EBUSY;
return (NULL);
}
sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
sc->type = type;
bioq_init(&sc->bio_queue);
mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
sc->unit = unit;
sprintf(sc->name, "md%d", unit);
LIST_INSERT_HEAD(&md_softc_list, sc, list);
error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
if (error == 0)
return (sc);
LIST_REMOVE(sc, list);
mtx_destroy(&sc->queue_mtx);
free_unr(md_uh, sc->unit);
free(sc, M_MD);
*errp = error;
return (NULL);
}
static void
mdinit(struct md_s *sc)
{
struct g_geom *gp;
struct g_provider *pp;
g_topology_lock();
gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
gp->softc = sc;
pp = g_new_providerf(gp, "md%d", sc->unit);
devstat_remove_entry(pp->stat);
pp->stat = NULL;
pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
pp->mediasize = sc->mediasize;
pp->sectorsize = sc->sectorsize;
switch (sc->type) {
case MD_MALLOC:
case MD_VNODE:
case MD_SWAP:
pp->flags |= G_PF_ACCEPT_UNMAPPED;
break;
case MD_PRELOAD:
case MD_NULL:
break;
}
sc->gp = gp;
sc->pp = pp;
sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
sc->devstat->id = pp;
g_error_provider(pp, 0);
g_topology_unlock();
}
static int
mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
{
uintptr_t sp;
int error;
off_t u;
error = 0;
if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
return (EINVAL);
if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
return (EINVAL);
/* Compression doesn't make sense if we have reserved space */
if (mdr->md_options & MD_RESERVE)
mdr->md_options &= ~MD_COMPRESS;
if (mdr->md_fwsectors != 0)
sc->fwsectors = mdr->md_fwsectors;
if (mdr->md_fwheads != 0)
sc->fwheads = mdr->md_fwheads;
sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
sc->indir = dimension(sc->mediasize / sc->sectorsize);
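/*
 * Per-sector backing storage comes from a private UMA zone; the 0x1ff
 * align mask requests 512-byte alignment for each item.
 */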
sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
0x1ff, 0);
if (mdr->md_options & MD_RESERVE) {
off_t nsectors;
nsectors = sc->mediasize / sc->sectorsize;
for (u = 0; u < nsectors; u++) {
sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
M_WAITOK : M_NOWAIT) | M_ZERO);
if (sp != 0)
error = s_write(sc->indir, u, sp);
else
error = ENOMEM;
if (error != 0)
break;
}
}
return (error);
}
static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
char *tmpbuf;
int error = 0;
/*
* Set credentials in our softc
*/
if (sc->cred)
crfree(sc->cred);
sc->cred = crhold(cred);
/*
* Horrible kludge to establish credentials for NFS XXX.
*/
if (sc->vnode) {
struct uio auio;
struct iovec aiov;
tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
bzero(&auio, sizeof(auio));
aiov.iov_base = tmpbuf;
aiov.iov_len = sc->sectorsize;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_resid = aiov.iov_len;
vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
VOP_UNLOCK(sc->vnode);
free(tmpbuf, M_TEMP);
}
return (error);
}
static int
mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
struct vattr vattr;
struct nameidata nd;
char *fname;
int error, flags;
fname = mdr->md_file;
if (mdr->md_file_seg == UIO_USERSPACE) {
error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
if (error != 0)
return (error);
} else if (mdr->md_file_seg == UIO_SYSSPACE)
strlcpy(sc->file, fname, sizeof(sc->file));
else
return (EDOOFUS);
/*
* If the user specified that this is a read only device, don't
* set the FWRITE mask before trying to open the backing store.
*/
flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \
| ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
error = vn_open(&nd, &flags, 0, NULL);
if (error != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG) {
error = EINVAL;
goto bad;
}
error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
if (error != 0)
goto bad;
if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
if (VN_IS_DOOMED(nd.ni_vp)) {
/* Forced unmount. */
error = EBADF;
goto bad;
}
}
nd.ni_vp->v_vflag |= VV_MD;
VOP_UNLOCK(nd.ni_vp);
if (mdr->md_fwsectors != 0)
sc->fwsectors = mdr->md_fwsectors;
if (mdr->md_fwheads != 0)
sc->fwheads = mdr->md_fwheads;
snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
(uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
MD_VERIFY);
if (!(flags & FWRITE))
sc->flags |= MD_READONLY;
sc->vnode = nd.ni_vp;
error = mdsetcred(sc, td->td_ucred);
if (error != 0) {
sc->vnode = NULL;
vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
nd.ni_vp->v_vflag &= ~VV_MD;
goto bad;
}
return (0);
bad:
VOP_UNLOCK(nd.ni_vp);
(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
return (error);
}
static void
g_md_providergone(struct g_provider *pp)
{
struct md_s *sc = pp->geom->softc;
mtx_lock(&sc->queue_mtx);
sc->flags |= MD_PROVIDERGONE;
wakeup(&sc->flags);
mtx_unlock(&sc->queue_mtx);
}
static int
mddestroy(struct md_s *sc, struct thread *td)
{
if (sc->gp) {
g_topology_lock();
g_wither_geom(sc->gp, ENXIO);
g_topology_unlock();
mtx_lock(&sc->queue_mtx);
while (!(sc->flags & MD_PROVIDERGONE))
msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
mtx_unlock(&sc->queue_mtx);
}
if (sc->devstat) {
devstat_remove_entry(sc->devstat);
sc->devstat = NULL;
}
mtx_lock(&sc->queue_mtx);
sc->flags |= MD_SHUTDOWN;
wakeup(sc);
while (!(sc->flags & MD_EXITING))
msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
mtx_unlock(&sc->queue_mtx);
mtx_destroy(&sc->queue_mtx);
if (sc->vnode != NULL) {
vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
sc->vnode->v_vflag &= ~VV_MD;
VOP_UNLOCK(sc->vnode);
(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
FREAD : (FREAD|FWRITE), sc->cred, td);
}
if (sc->cred != NULL)
crfree(sc->cred);
if (sc->object != NULL)
vm_object_deallocate(sc->object);
if (sc->indir)
destroy_indir(sc, sc->indir);
if (sc->uma)
uma_zdestroy(sc->uma);
LIST_REMOVE(sc, list);
free_unr(md_uh, sc->unit);
free(sc, M_MD);
return (0);
}
static int
mdresize(struct md_s *sc, struct md_req *mdr)
{
int error, res;
vm_pindex_t oldpages, newpages;
switch (sc->type) {
case MD_VNODE:
case MD_NULL:
break;
case MD_SWAP:
if (mdr->md_mediasize <= 0 ||
(mdr->md_mediasize % PAGE_SIZE) != 0)
return (EDOM);
oldpages = OFF_TO_IDX(sc->mediasize);
newpages = OFF_TO_IDX(mdr->md_mediasize);
if (newpages < oldpages) {
VM_OBJECT_WLOCK(sc->object);
vm_object_page_remove(sc->object, newpages, 0, 0);
swap_release_by_cred(IDX_TO_OFF(oldpages -
newpages), sc->cred);
sc->object->charge = IDX_TO_OFF(newpages);
sc->object->size = newpages;
VM_OBJECT_WUNLOCK(sc->object);
} else if (newpages > oldpages) {
res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
oldpages), sc->cred);
if (!res)
return (ENOMEM);
if ((mdr->md_options & MD_RESERVE) ||
(sc->flags & MD_RESERVE)) {
error = swap_pager_reserve(sc->object,
oldpages, newpages - oldpages);
if (error < 0) {
swap_release_by_cred(
IDX_TO_OFF(newpages - oldpages),
sc->cred);
return (EDOM);
}
}
VM_OBJECT_WLOCK(sc->object);
sc->object->charge = IDX_TO_OFF(newpages);
sc->object->size = newpages;
VM_OBJECT_WUNLOCK(sc->object);
}
break;
default:
return (EOPNOTSUPP);
}
sc->mediasize = mdr->md_mediasize;
g_topology_lock();
g_resize_provider(sc->pp, sc->mediasize);
g_topology_unlock();
return (0);
}
static int
mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
vm_ooffset_t npage;
int error;
/*
* Range check. Disallow negative sizes and sizes that are not a
* multiple of the page size.
*/
if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
return (EDOM);
/*
* Allocate an OBJT_SWAP object.
*
* Note the truncation.
*/
if ((mdr->md_options & MD_VERIFY) != 0)
return (EINVAL);
npage = mdr->md_mediasize / PAGE_SIZE;
if (mdr->md_fwsectors != 0)
sc->fwsectors = mdr->md_fwsectors;
if (mdr->md_fwheads != 0)
sc->fwheads = mdr->md_fwheads;
sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
VM_PROT_DEFAULT, 0, td->td_ucred);
if (sc->object == NULL)
return (ENOMEM);
sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
if (mdr->md_options & MD_RESERVE) {
if (swap_pager_reserve(sc->object, 0, npage) < 0) {
error = EDOM;
goto finish;
}
}
error = mdsetcred(sc, td->td_ucred);
finish:
if (error != 0) {
vm_object_deallocate(sc->object);
sc->object = NULL;
}
return (error);
}
static int
mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
/*
* Range check. Disallow negative sizes and sizes that are not a
* multiple of the page size.
*/
if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
return (EDOM);
return (0);
}
static int
kern_mdattach_locked(struct thread *td, struct md_req *mdr)
{
struct md_s *sc;
unsigned sectsize;
int error, i;
sx_assert(&md_sx, SA_XLOCKED);
switch (mdr->md_type) {
case MD_MALLOC:
case MD_PRELOAD:
case MD_VNODE:
case MD_SWAP:
case MD_NULL:
break;
default:
return (EINVAL);
}
if (mdr->md_sectorsize == 0)
sectsize = DEV_BSIZE;
else
sectsize = mdr->md_sectorsize;
- if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize)
+ if (sectsize > maxphys || mdr->md_mediasize < sectsize)
return (EINVAL);
if (mdr->md_options & MD_AUTOUNIT)
sc = mdnew(-1, &error, mdr->md_type);
else {
if (mdr->md_unit > INT_MAX)
return (EINVAL);
sc = mdnew(mdr->md_unit, &error, mdr->md_type);
}
if (sc == NULL)
return (error);
if (mdr->md_label != NULL)
error = copyinstr(mdr->md_label, sc->label,
sizeof(sc->label), NULL);
if (error != 0)
goto err_after_new;
if (mdr->md_options & MD_AUTOUNIT)
mdr->md_unit = sc->unit;
sc->mediasize = mdr->md_mediasize;
sc->sectorsize = sectsize;
error = EDOOFUS;
switch (sc->type) {
case MD_MALLOC:
sc->start = mdstart_malloc;
error = mdcreate_malloc(sc, mdr);
break;
case MD_PRELOAD:
/*
* We disallow attaching preloaded memory disks via
* ioctl. Preloaded memory disks are automatically
* attached in g_md_init().
*/
error = EOPNOTSUPP;
break;
case MD_VNODE:
sc->start = mdstart_vnode;
error = mdcreate_vnode(sc, mdr, td);
break;
case MD_SWAP:
sc->start = mdstart_swap;
error = mdcreate_swap(sc, mdr, td);
break;
case MD_NULL:
sc->start = mdstart_null;
error = mdcreate_null(sc, mdr, td);
break;
}
err_after_new:
if (error != 0) {
mddestroy(sc, td);
return (error);
}
/* Prune off any residual fractional sector */
i = sc->mediasize % sc->sectorsize;
sc->mediasize -= i;
mdinit(sc);
return (0);
}
static int
kern_mdattach(struct thread *td, struct md_req *mdr)
{
int error;
sx_xlock(&md_sx);
error = kern_mdattach_locked(td, mdr);
sx_xunlock(&md_sx);
return (error);
}
static int
kern_mddetach_locked(struct thread *td, struct md_req *mdr)
{
struct md_s *sc;
sx_assert(&md_sx, SA_XLOCKED);
if (mdr->md_mediasize != 0 ||
(mdr->md_options & ~MD_FORCE) != 0)
return (EINVAL);
sc = mdfind(mdr->md_unit);
if (sc == NULL)
return (ENOENT);
if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
!(mdr->md_options & MD_FORCE))
return (EBUSY);
return (mddestroy(sc, td));
}
static int
kern_mddetach(struct thread *td, struct md_req *mdr)
{
int error;
sx_xlock(&md_sx);
error = kern_mddetach_locked(td, mdr);
sx_xunlock(&md_sx);
return (error);
}
static int
kern_mdresize_locked(struct md_req *mdr)
{
struct md_s *sc;
sx_assert(&md_sx, SA_XLOCKED);
if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
return (EINVAL);
sc = mdfind(mdr->md_unit);
if (sc == NULL)
return (ENOENT);
if (mdr->md_mediasize < sc->sectorsize)
return (EINVAL);
if (mdr->md_mediasize < sc->mediasize &&
!(sc->flags & MD_FORCE) &&
!(mdr->md_options & MD_FORCE))
return (EBUSY);
return (mdresize(sc, mdr));
}
static int
kern_mdresize(struct md_req *mdr)
{
int error;
sx_xlock(&md_sx);
error = kern_mdresize_locked(mdr);
sx_xunlock(&md_sx);
return (error);
}
static int
kern_mdquery_locked(struct md_req *mdr)
{
struct md_s *sc;
int error;
sx_assert(&md_sx, SA_XLOCKED);
sc = mdfind(mdr->md_unit);
if (sc == NULL)
return (ENOENT);
mdr->md_type = sc->type;
mdr->md_options = sc->flags;
mdr->md_mediasize = sc->mediasize;
mdr->md_sectorsize = sc->sectorsize;
error = 0;
if (mdr->md_label != NULL) {
error = copyout(sc->label, mdr->md_label,
strlen(sc->label) + 1);
if (error != 0)
return (error);
}
if (sc->type == MD_VNODE ||
(sc->type == MD_PRELOAD && mdr->md_file != NULL))
error = copyout(sc->file, mdr->md_file,
strlen(sc->file) + 1);
return (error);
}
static int
kern_mdquery(struct md_req *mdr)
{
int error;
sx_xlock(&md_sx);
error = kern_mdquery_locked(mdr);
sx_xunlock(&md_sx);
return (error);
}
/* Copy members that are not userspace pointers. */
#define MD_IOCTL2REQ(mdio, mdr) do { \
(mdr)->md_unit = (mdio)->md_unit; \
(mdr)->md_type = (mdio)->md_type; \
(mdr)->md_mediasize = (mdio)->md_mediasize; \
(mdr)->md_sectorsize = (mdio)->md_sectorsize; \
(mdr)->md_options = (mdio)->md_options; \
(mdr)->md_fwheads = (mdio)->md_fwheads; \
(mdr)->md_fwsectors = (mdio)->md_fwsectors; \
(mdr)->md_units = &(mdio)->md_pad[0]; \
(mdr)->md_units_nitems = nitems((mdio)->md_pad); \
} while(0)
/* Copy members that might have been updated */
#define MD_REQ2IOCTL(mdr, mdio) do { \
(mdio)->md_unit = (mdr)->md_unit; \
(mdio)->md_type = (mdr)->md_type; \
(mdio)->md_mediasize = (mdr)->md_mediasize; \
(mdio)->md_sectorsize = (mdr)->md_sectorsize; \
(mdio)->md_options = (mdr)->md_options; \
(mdio)->md_fwheads = (mdr)->md_fwheads; \
(mdio)->md_fwsectors = (mdr)->md_fwsectors; \
} while(0)
static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
struct thread *td)
{
struct md_req mdr;
int error;
if (md_debug)
printf("mdctlioctl(%s %lx %p %x %p)\n",
devtoname(dev), cmd, addr, flags, td);
bzero(&mdr, sizeof(mdr));
switch (cmd) {
case MDIOCATTACH:
case MDIOCDETACH:
case MDIOCRESIZE:
case MDIOCQUERY: {
struct md_ioctl *mdio = (struct md_ioctl *)addr;
if (mdio->md_version != MDIOVERSION)
return (EINVAL);
MD_IOCTL2REQ(mdio, &mdr);
mdr.md_file = mdio->md_file;
mdr.md_file_seg = UIO_USERSPACE;
/* If the file is adjacent to the md_ioctl it's in kernel. */
if ((void *)mdio->md_file == (void *)(mdio + 1))
mdr.md_file_seg = UIO_SYSSPACE;
mdr.md_label = mdio->md_label;
break;
}
#ifdef COMPAT_FREEBSD32
case MDIOCATTACH_32:
case MDIOCDETACH_32:
case MDIOCRESIZE_32:
case MDIOCQUERY_32: {
struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
if (mdio->md_version != MDIOVERSION)
return (EINVAL);
MD_IOCTL2REQ(mdio, &mdr);
mdr.md_file = (void *)(uintptr_t)mdio->md_file;
mdr.md_file_seg = UIO_USERSPACE;
mdr.md_label = (void *)(uintptr_t)mdio->md_label;
break;
}
#endif
default:
/* Fall through to handler switch. */
break;
}
error = 0;
switch (cmd) {
case MDIOCATTACH:
#ifdef COMPAT_FREEBSD32
case MDIOCATTACH_32:
#endif
error = kern_mdattach(td, &mdr);
break;
case MDIOCDETACH:
#ifdef COMPAT_FREEBSD32
case MDIOCDETACH_32:
#endif
error = kern_mddetach(td, &mdr);
break;
case MDIOCRESIZE:
#ifdef COMPAT_FREEBSD32
case MDIOCRESIZE_32:
#endif
error = kern_mdresize(&mdr);
break;
case MDIOCQUERY:
#ifdef COMPAT_FREEBSD32
case MDIOCQUERY_32:
#endif
error = kern_mdquery(&mdr);
break;
default:
error = ENOIOCTL;
}
switch (cmd) {
case MDIOCATTACH:
case MDIOCQUERY: {
struct md_ioctl *mdio = (struct md_ioctl *)addr;
MD_REQ2IOCTL(&mdr, mdio);
break;
}
#ifdef COMPAT_FREEBSD32
case MDIOCATTACH_32:
case MDIOCQUERY_32: {
struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
MD_REQ2IOCTL(&mdr, mdio);
break;
}
#endif
default:
/* Other commands do not alter mdr. */
break;
}
return (error);
}
static void
md_preloaded(u_char *image, size_t length, const char *name)
{
struct md_s *sc;
int error;
sc = mdnew(-1, &error, MD_PRELOAD);
if (sc == NULL)
return;
sc->mediasize = length;
sc->sectorsize = DEV_BSIZE;
sc->pl_ptr = image;
sc->pl_len = length;
sc->start = mdstart_preload;
if (name != NULL)
strlcpy(sc->file, name, sizeof(sc->file));
#ifdef MD_ROOT
if (sc->unit == 0) {
#ifndef ROOTDEVNAME
rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
#endif
#ifdef MD_ROOT_READONLY
sc->flags |= MD_READONLY;
#endif
}
#endif
mdinit(sc);
if (name != NULL) {
printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
MD_NAME, sc->unit, name, length, image);
} else {
printf("%s%d: Embedded image %zd bytes at %p\n",
MD_NAME, sc->unit, length, image);
}
}
static void
g_md_init(struct g_class *mp __unused)
{
caddr_t mod;
u_char *ptr, *name, *type;
unsigned len;
int i;
/* figure out log2(NINDIR) */
for (i = NINDIR, nshift = -1; i; nshift++)
i >>= 1;
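/* E.g. a hypothetical NINDIR of 512 leaves nshift == 9 after this loop. */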
mod = NULL;
sx_init(&md_sx, "MD config lock");
g_topology_unlock();
md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT
if (mfs_root_size != 0) {
sx_xlock(&md_sx);
#ifdef MD_ROOT_MEM
md_preloaded(mfs_root, mfs_root_size, NULL);
#else
md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
NULL);
#endif
sx_xunlock(&md_sx);
}
#endif
/* XXX: are preload_* static or do they need Giant ? */
while ((mod = preload_search_next_name(mod)) != NULL) {
name = (char *)preload_search_info(mod, MODINFO_NAME);
if (name == NULL)
continue;
type = (char *)preload_search_info(mod, MODINFO_TYPE);
if (type == NULL)
continue;
if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
continue;
ptr = preload_fetch_addr(mod);
len = preload_fetch_size(mod);
if (ptr != NULL && len != 0) {
sx_xlock(&md_sx);
md_preloaded(ptr, len, name);
sx_xunlock(&md_sx);
}
}
md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
0600, MDCTL_NAME);
g_topology_lock();
}
static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp __unused, struct g_provider *pp)
{
struct md_s *mp;
char *type;
mp = gp->softc;
if (mp == NULL)
return;
switch (mp->type) {
case MD_MALLOC:
type = "malloc";
break;
case MD_PRELOAD:
type = "preload";
break;
case MD_VNODE:
type = "vnode";
break;
case MD_SWAP:
type = "swap";
break;
case MD_NULL:
type = "null";
break;
default:
type = "unknown";
break;
}
if (pp != NULL) {
if (indent == NULL) {
sbuf_printf(sb, " u %d", mp->unit);
sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
sbuf_printf(sb, " t %s", type);
if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
(mp->type == MD_PRELOAD && mp->file[0] != '\0'))
sbuf_printf(sb, " file %s", mp->file);
sbuf_printf(sb, " label %s", mp->label);
} else {
sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
mp->unit);
sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
indent, (uintmax_t) mp->sectorsize);
sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
indent, (uintmax_t) mp->fwheads);
sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
indent, (uintmax_t) mp->fwsectors);
if (mp->ident[0] != '\0') {
sbuf_printf(sb, "%s<ident>", indent);
g_conf_printf_escaped(sb, "%s", mp->ident);
sbuf_printf(sb, "</ident>\n");
}
sbuf_printf(sb, "%s<length>%ju</length>\n",
indent, (uintmax_t) mp->mediasize);
sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
(mp->flags & MD_COMPRESS) == 0 ? "off": "on");
sbuf_printf(sb, "%s<access>%s</access>\n", indent,
(mp->flags & MD_READONLY) == 0 ? "read-write":
"read-only");
sbuf_printf(sb, "%s<type>%s</type>\n", indent,
type);
if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
(mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
sbuf_printf(sb, "%s<file>", indent);
g_conf_printf_escaped(sb, "%s", mp->file);
sbuf_printf(sb, "</file>\n");
}
if (mp->type == MD_VNODE)
sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
(mp->flags & MD_CACHE) == 0 ? "off": "on");
sbuf_printf(sb, "%s<label>", indent);
g_conf_printf_escaped(sb, "%s", mp->label);
sbuf_printf(sb, "</label>\n");
}
}
}
static void
g_md_fini(struct g_class *mp __unused)
{
sx_destroy(&md_sx);
if (status_dev != NULL)
destroy_dev(status_dev);
uma_zdestroy(md_pbuf_zone);
delete_unrhdr(md_uh);
}
diff --git a/sys/dev/mfi/mfi.c b/sys/dev/mfi/mfi.c
index 0bc24724cdf7..981f5a2673e5 100644
--- a/sys/dev/mfi/mfi.c
+++ b/sys/dev/mfi/mfi.c
@@ -1,3796 +1,3796 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-2-Clause
*
* Copyright (c) 2006 IronPort Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 2007 LSI Corp.
* Copyright (c) 2007 Rajesh Prabhakaran.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mfi.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/rman.h>
#include <sys/bio.h>
#include <sys/ioccom.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysent.h>
#include <sys/taskqueue.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <dev/mfi/mfireg.h>
#include <dev/mfi/mfi_ioctl.h>
#include <dev/mfi/mfivar.h>
#include <sys/interrupt.h>
#include <sys/priority.h>
static int mfi_alloc_commands(struct mfi_softc *);
static int mfi_comms_init(struct mfi_softc *);
static int mfi_get_controller_info(struct mfi_softc *);
static int mfi_get_log_state(struct mfi_softc *,
struct mfi_evt_log_state **);
static int mfi_parse_entries(struct mfi_softc *, int, int);
static void mfi_data_cb(void *, bus_dma_segment_t *, int, int);
static void mfi_startup(void *arg);
static void mfi_intr(void *arg);
static void mfi_ldprobe(struct mfi_softc *sc);
static void mfi_syspdprobe(struct mfi_softc *sc);
static void mfi_handle_evt(void *context, int pending);
static int mfi_aen_register(struct mfi_softc *sc, int seq, int locale);
static void mfi_aen_complete(struct mfi_command *);
static int mfi_add_ld(struct mfi_softc *sc, int);
static void mfi_add_ld_complete(struct mfi_command *);
static int mfi_add_sys_pd(struct mfi_softc *sc, int);
static void mfi_add_sys_pd_complete(struct mfi_command *);
static struct mfi_command * mfi_bio_command(struct mfi_softc *);
static void mfi_bio_complete(struct mfi_command *);
static struct mfi_command *mfi_build_ldio(struct mfi_softc *,struct bio*);
static struct mfi_command *mfi_build_syspdio(struct mfi_softc *,struct bio*);
static int mfi_send_frame(struct mfi_softc *, struct mfi_command *);
static int mfi_std_send_frame(struct mfi_softc *, struct mfi_command *);
static int mfi_abort(struct mfi_softc *, struct mfi_command **);
static int mfi_linux_ioctl_int(struct cdev *, u_long, caddr_t, int, struct thread *);
static void mfi_timeout(void *);
static int mfi_user_command(struct mfi_softc *,
struct mfi_ioc_passthru *);
static void mfi_enable_intr_xscale(struct mfi_softc *sc);
static void mfi_enable_intr_ppc(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_xscale(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_ppc(struct mfi_softc *sc);
static int mfi_check_clear_intr_xscale(struct mfi_softc *sc);
static int mfi_check_clear_intr_ppc(struct mfi_softc *sc);
static void mfi_issue_cmd_xscale(struct mfi_softc *sc, bus_addr_t bus_add,
uint32_t frame_cnt);
static void mfi_issue_cmd_ppc(struct mfi_softc *sc, bus_addr_t bus_add,
uint32_t frame_cnt);
static int mfi_config_lock(struct mfi_softc *sc, uint32_t opcode);
static void mfi_config_unlock(struct mfi_softc *sc, int locked);
static int mfi_check_command_pre(struct mfi_softc *sc, struct mfi_command *cm);
static void mfi_check_command_post(struct mfi_softc *sc, struct mfi_command *cm);
static int mfi_check_for_sscd(struct mfi_softc *sc, struct mfi_command *cm);
SYSCTL_NODE(_hw, OID_AUTO, mfi, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"MFI driver parameters");
static int mfi_event_locale = MFI_EVT_LOCALE_ALL;
SYSCTL_INT(_hw_mfi, OID_AUTO, event_locale, CTLFLAG_RWTUN, &mfi_event_locale,
0, "event message locale");
static int mfi_event_class = MFI_EVT_CLASS_INFO;
SYSCTL_INT(_hw_mfi, OID_AUTO, event_class, CTLFLAG_RWTUN, &mfi_event_class,
0, "event message class");
static int mfi_max_cmds = 128;
SYSCTL_INT(_hw_mfi, OID_AUTO, max_cmds, CTLFLAG_RDTUN, &mfi_max_cmds,
0, "Max commands limit (-1 = controller limit)");
static int mfi_detect_jbod_change = 1;
SYSCTL_INT(_hw_mfi, OID_AUTO, detect_jbod_change, CTLFLAG_RWTUN,
&mfi_detect_jbod_change, 0, "Detect a change to a JBOD");
int mfi_polled_cmd_timeout = MFI_POLL_TIMEOUT_SECS;
SYSCTL_INT(_hw_mfi, OID_AUTO, polled_cmd_timeout, CTLFLAG_RWTUN,
&mfi_polled_cmd_timeout, 0,
"Polled command timeout - used for firmware flash etc (in seconds)");
static int mfi_cmd_timeout = MFI_CMD_TIMEOUT;
SYSCTL_INT(_hw_mfi, OID_AUTO, cmd_timeout, CTLFLAG_RWTUN, &mfi_cmd_timeout,
0, "Command timeout (in seconds)");
/* Management interface */
static d_open_t mfi_open;
static d_close_t mfi_close;
static d_ioctl_t mfi_ioctl;
static d_poll_t mfi_poll;
static struct cdevsw mfi_cdevsw = {
.d_version = D_VERSION,
.d_flags = 0,
.d_open = mfi_open,
.d_close = mfi_close,
.d_ioctl = mfi_ioctl,
.d_poll = mfi_poll,
.d_name = "mfi",
};
MALLOC_DEFINE(M_MFIBUF, "mfibuf", "Buffers for the MFI driver");
#define MFI_INQ_LENGTH SHORT_INQUIRY_LENGTH
struct mfi_skinny_dma_info mfi_skinny;
static void
mfi_enable_intr_xscale(struct mfi_softc *sc)
{
MFI_WRITE4(sc, MFI_OMSK, 0x01);
}
static void
mfi_enable_intr_ppc(struct mfi_softc *sc)
{
if (sc->mfi_flags & MFI_FLAGS_1078) {
MFI_WRITE4(sc, MFI_ODCR0, 0xFFFFFFFF);
MFI_WRITE4(sc, MFI_OMSK, ~MFI_1078_EIM);
}
else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
MFI_WRITE4(sc, MFI_ODCR0, 0xFFFFFFFF);
MFI_WRITE4(sc, MFI_OMSK, ~MFI_GEN2_EIM);
}
else if (sc->mfi_flags & MFI_FLAGS_SKINNY) {
MFI_WRITE4(sc, MFI_OMSK, ~0x00000001);
}
}
static int32_t
mfi_read_fw_status_xscale(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OMSG0);
}
static int32_t
mfi_read_fw_status_ppc(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OSP0);
}
static int
mfi_check_clear_intr_xscale(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if ((status & MFI_OSTS_INTR_VALID) == 0)
return 1;
MFI_WRITE4(sc, MFI_OSTS, status);
return 0;
}
static int
mfi_check_clear_intr_ppc(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if (sc->mfi_flags & MFI_FLAGS_1078) {
if (!(status & MFI_1078_RM)) {
return 1;
}
}
else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
if (!(status & MFI_GEN2_RM)) {
return 1;
}
}
else if (sc->mfi_flags & MFI_FLAGS_SKINNY) {
if (!(status & MFI_SKINNY_RM)) {
return 1;
}
}
if (sc->mfi_flags & MFI_FLAGS_SKINNY)
MFI_WRITE4(sc, MFI_OSTS, status);
else
MFI_WRITE4(sc, MFI_ODCR0, status);
return 0;
}
static void
mfi_issue_cmd_xscale(struct mfi_softc *sc, bus_addr_t bus_add, uint32_t frame_cnt)
{
MFI_WRITE4(sc, MFI_IQP,(bus_add >>3)|frame_cnt);
}
static void
mfi_issue_cmd_ppc(struct mfi_softc *sc, bus_addr_t bus_add, uint32_t frame_cnt)
{
if (sc->mfi_flags & MFI_FLAGS_SKINNY) {
MFI_WRITE4(sc, MFI_IQPL, (bus_add | frame_cnt <<1)|1 );
MFI_WRITE4(sc, MFI_IQPH, 0x00000000);
} else {
MFI_WRITE4(sc, MFI_IQP, (bus_add | frame_cnt <<1)|1 );
}
}
int
mfi_transition_firmware(struct mfi_softc *sc)
{
uint32_t fw_state, cur_state;
int max_wait, i;
uint32_t cur_abs_reg_val = 0;
uint32_t prev_abs_reg_val = 0;
cur_abs_reg_val = sc->mfi_read_fw_status(sc);
fw_state = cur_abs_reg_val & MFI_FWSTATE_MASK;
while (fw_state != MFI_FWSTATE_READY) {
if (bootverbose)
device_printf(sc->mfi_dev, "Waiting for firmware to "
"become ready\n");
cur_state = fw_state;
switch (fw_state) {
case MFI_FWSTATE_FAULT:
device_printf(sc->mfi_dev, "Firmware fault\n");
return (ENXIO);
case MFI_FWSTATE_WAIT_HANDSHAKE:
if (sc->mfi_flags & MFI_FLAGS_SKINNY || sc->mfi_flags & MFI_FLAGS_TBOLT)
MFI_WRITE4(sc, MFI_SKINNY_IDB, MFI_FWINIT_CLEAR_HANDSHAKE);
else
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_CLEAR_HANDSHAKE);
max_wait = MFI_RESET_WAIT_TIME;
break;
case MFI_FWSTATE_OPERATIONAL:
if (sc->mfi_flags & MFI_FLAGS_SKINNY || sc->mfi_flags & MFI_FLAGS_TBOLT)
MFI_WRITE4(sc, MFI_SKINNY_IDB, 7);
else
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_READY);
max_wait = MFI_RESET_WAIT_TIME;
break;
case MFI_FWSTATE_UNDEFINED:
case MFI_FWSTATE_BB_INIT:
max_wait = MFI_RESET_WAIT_TIME;
break;
case MFI_FWSTATE_FW_INIT_2:
max_wait = MFI_RESET_WAIT_TIME;
break;
case MFI_FWSTATE_FW_INIT:
case MFI_FWSTATE_FLUSH_CACHE:
max_wait = MFI_RESET_WAIT_TIME;
break;
case MFI_FWSTATE_DEVICE_SCAN:
max_wait = MFI_RESET_WAIT_TIME; /* wait for 180 seconds */
prev_abs_reg_val = cur_abs_reg_val;
break;
case MFI_FWSTATE_BOOT_MESSAGE_PENDING:
if (sc->mfi_flags & MFI_FLAGS_SKINNY || sc->mfi_flags & MFI_FLAGS_TBOLT)
MFI_WRITE4(sc, MFI_SKINNY_IDB, MFI_FWINIT_HOTPLUG);
else
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_HOTPLUG);
max_wait = MFI_RESET_WAIT_TIME;
break;
default:
device_printf(sc->mfi_dev, "Unknown firmware state %#x\n",
fw_state);
return (ENXIO);
}
for (i = 0; i < (max_wait * 10); i++) {
cur_abs_reg_val = sc->mfi_read_fw_status(sc);
fw_state = cur_abs_reg_val & MFI_FWSTATE_MASK;
if (fw_state == cur_state)
DELAY(100000);
else
break;
}
if (fw_state == MFI_FWSTATE_DEVICE_SCAN) {
/* Check the device scanning progress */
if (prev_abs_reg_val != cur_abs_reg_val) {
continue;
}
}
if (fw_state == cur_state) {
device_printf(sc->mfi_dev, "Firmware stuck in state "
"%#x\n", fw_state);
return (ENXIO);
}
}
return (0);
}
static void
mfi_addr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
bus_addr_t *addr;
addr = arg;
*addr = segs[0].ds_addr;
}
int
mfi_attach(struct mfi_softc *sc)
{
uint32_t status;
int error, commsz, framessz, sensesz;
int frames, unit, max_fw_sge, max_fw_cmds;
uint32_t tb_mem_size = 0;
struct cdev *dev_t;
if (sc == NULL)
return EINVAL;
device_printf(sc->mfi_dev, "Megaraid SAS driver Ver %s \n",
MEGASAS_VERSION);
mtx_init(&sc->mfi_io_lock, "MFI I/O lock", NULL, MTX_DEF);
sx_init(&sc->mfi_config_lock, "MFI config");
TAILQ_INIT(&sc->mfi_ld_tqh);
TAILQ_INIT(&sc->mfi_syspd_tqh);
TAILQ_INIT(&sc->mfi_ld_pend_tqh);
TAILQ_INIT(&sc->mfi_syspd_pend_tqh);
TAILQ_INIT(&sc->mfi_evt_queue);
TASK_INIT(&sc->mfi_evt_task, 0, mfi_handle_evt, sc);
TASK_INIT(&sc->mfi_map_sync_task, 0, mfi_handle_map_sync, sc);
TAILQ_INIT(&sc->mfi_aen_pids);
TAILQ_INIT(&sc->mfi_cam_ccbq);
mfi_initq_free(sc);
mfi_initq_ready(sc);
mfi_initq_busy(sc);
mfi_initq_bio(sc);
sc->adpreset = 0;
sc->last_seq_num = 0;
sc->disableOnlineCtrlReset = 1;
sc->issuepend_done = 1;
sc->hw_crit_error = 0;
if (sc->mfi_flags & MFI_FLAGS_1064R) {
sc->mfi_enable_intr = mfi_enable_intr_xscale;
sc->mfi_read_fw_status = mfi_read_fw_status_xscale;
sc->mfi_check_clear_intr = mfi_check_clear_intr_xscale;
sc->mfi_issue_cmd = mfi_issue_cmd_xscale;
} else if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
sc->mfi_enable_intr = mfi_tbolt_enable_intr_ppc;
sc->mfi_disable_intr = mfi_tbolt_disable_intr_ppc;
sc->mfi_read_fw_status = mfi_tbolt_read_fw_status_ppc;
sc->mfi_check_clear_intr = mfi_tbolt_check_clear_intr_ppc;
sc->mfi_issue_cmd = mfi_tbolt_issue_cmd_ppc;
sc->mfi_adp_reset = mfi_tbolt_adp_reset;
sc->mfi_tbolt = 1;
TAILQ_INIT(&sc->mfi_cmd_tbolt_tqh);
} else {
sc->mfi_enable_intr = mfi_enable_intr_ppc;
sc->mfi_read_fw_status = mfi_read_fw_status_ppc;
sc->mfi_check_clear_intr = mfi_check_clear_intr_ppc;
sc->mfi_issue_cmd = mfi_issue_cmd_ppc;
}
/* Before we get too far, see if the firmware is working */
if ((error = mfi_transition_firmware(sc)) != 0) {
device_printf(sc->mfi_dev, "Firmware not in READY state, "
"error %d\n", error);
return (ENXIO);
}
/* Start: LSIP200113393 */
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MEGASAS_MAX_NAME*sizeof(bus_addr_t), /* maxsize */
1, /* nsegments */
MEGASAS_MAX_NAME*sizeof(bus_addr_t), /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->verbuf_h_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate verbuf_h_dmat DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->verbuf_h_dmat, (void **)&sc->verbuf,
BUS_DMA_NOWAIT, &sc->verbuf_h_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate verbuf_h_dmamap memory\n");
return (ENOMEM);
}
bzero(sc->verbuf, MEGASAS_MAX_NAME*sizeof(bus_addr_t));
bus_dmamap_load(sc->verbuf_h_dmat, sc->verbuf_h_dmamap,
sc->verbuf, MEGASAS_MAX_NAME*sizeof(bus_addr_t),
mfi_addr_cb, &sc->verbuf_h_busaddr, 0);
/* End: LSIP200113393 */
/*
* Get information needed for sizing the contiguous memory for the
* frame pool. Size down the sgl parameter since we know that
- * we will never need more than what's required for MAXPHYS.
+ * we will never need more than what's required for MFI_MAXPHYS.
* It would be nice if these constants were available at runtime
* instead of compile time.
*/
status = sc->mfi_read_fw_status(sc);
max_fw_cmds = status & MFI_FWSTATE_MAXCMD_MASK;
if (mfi_max_cmds > 0 && mfi_max_cmds < max_fw_cmds) {
device_printf(sc->mfi_dev, "FW MaxCmds = %d, limiting to %d\n",
max_fw_cmds, mfi_max_cmds);
sc->mfi_max_fw_cmds = mfi_max_cmds;
} else {
sc->mfi_max_fw_cmds = max_fw_cmds;
}
max_fw_sge = (status & MFI_FWSTATE_MAXSGL_MASK) >> 16;
sc->mfi_max_sge = min(max_fw_sge, ((MFI_MAXPHYS / PAGE_SIZE) + 1));
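/*
 * E.g. assuming a 128 KiB MFI_MAXPHYS and 4 KiB pages, this caps the SG
 * list at 33 entries: 32 pages plus one extra for a non-page-aligned
 * buffer.
 */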
/* ThunderBolt Support get the contiguous memory */
if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
mfi_tbolt_init_globals(sc);
device_printf(sc->mfi_dev, "MaxCmd = %d, Drv MaxCmd = %d, "
"MaxSgl = %d, state = %#x\n", max_fw_cmds,
sc->mfi_max_fw_cmds, sc->mfi_max_sge, status);
tb_mem_size = mfi_tbolt_get_memory_requirement(sc);
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
tb_mem_size, /* maxsize */
1, /* nsegments */
tb_mem_size, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_tb_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate comms DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_tb_dmat, (void **)&sc->request_message_pool,
BUS_DMA_NOWAIT, &sc->mfi_tb_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate comms memory\n");
return (ENOMEM);
}
bzero(sc->request_message_pool, tb_mem_size);
bus_dmamap_load(sc->mfi_tb_dmat, sc->mfi_tb_dmamap,
sc->request_message_pool, tb_mem_size, mfi_addr_cb, &sc->mfi_tb_busaddr, 0);
/* For ThunderBolt memory init */
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
0x100, 0, /* alignmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MFI_FRAME_SIZE, /* maxsize */
1, /* nsegments */
MFI_FRAME_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_tb_init_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate init DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_tb_init_dmat, (void **)&sc->mfi_tb_init,
BUS_DMA_NOWAIT, &sc->mfi_tb_init_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate init memory\n");
return (ENOMEM);
}
bzero(sc->mfi_tb_init, MFI_FRAME_SIZE);
bus_dmamap_load(sc->mfi_tb_init_dmat, sc->mfi_tb_init_dmamap,
sc->mfi_tb_init, MFI_FRAME_SIZE, mfi_addr_cb,
&sc->mfi_tb_init_busaddr, 0);
if (mfi_tbolt_init_desc_pool(sc, sc->request_message_pool,
tb_mem_size)) {
device_printf(sc->mfi_dev,
"Thunderbolt pool preparation error\n");
return 0;
}
/*
Allocate a DMA memory mapping for the MPI2 IOC Init descriptor;
we keep it separate from what we have allocated for the request
and reply descriptors to avoid confusion later.
*/
tb_mem_size = sizeof(struct MPI2_IOC_INIT_REQUEST);
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
tb_mem_size, /* maxsize */
1, /* nsegments */
tb_mem_size, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_tb_ioc_init_dmat)) {
device_printf(sc->mfi_dev,
"Cannot allocate comms DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_tb_ioc_init_dmat,
(void **)&sc->mfi_tb_ioc_init_desc,
BUS_DMA_NOWAIT, &sc->mfi_tb_ioc_init_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate comms memory\n");
return (ENOMEM);
}
bzero(sc->mfi_tb_ioc_init_desc, tb_mem_size);
bus_dmamap_load(sc->mfi_tb_ioc_init_dmat, sc->mfi_tb_ioc_init_dmamap,
sc->mfi_tb_ioc_init_desc, tb_mem_size, mfi_addr_cb,
&sc->mfi_tb_ioc_init_busaddr, 0);
}
/*
* Create the dma tag for data buffers. Used both for block I/O
* and for various internal data queries.
*/
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
sc->mfi_max_sge, /* nsegments */
BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
busdma_lock_mutex, /* lockfunc */
&sc->mfi_io_lock, /* lockfuncarg */
&sc->mfi_buffer_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate buffer DMA tag\n");
return (ENOMEM);
}
/*
* Allocate DMA memory for the comms queues. Keep it under 4GB for
* efficiency. The mfi_hwcomms struct includes space for 1 reply queue
* entry, so the calculated size here will be 1 more than
* mfi_max_fw_cmds. This is apparently a requirement of the hardware.
*/
commsz = (sizeof(uint32_t) * sc->mfi_max_fw_cmds) +
sizeof(struct mfi_hwcomms);
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
commsz, /* maxsize */
1, /* nsegments */
commsz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_comms_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate comms DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_comms_dmat, (void **)&sc->mfi_comms,
BUS_DMA_NOWAIT, &sc->mfi_comms_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate comms memory\n");
return (ENOMEM);
}
bzero(sc->mfi_comms, commsz);
bus_dmamap_load(sc->mfi_comms_dmat, sc->mfi_comms_dmamap,
sc->mfi_comms, commsz, mfi_addr_cb, &sc->mfi_comms_busaddr, 0);
/*
* Allocate DMA memory for the command frames. Keep them in the
* lower 4GB for efficiency. Calculate the size of the commands at
* the same time; each command is one 64 byte frame plus a set of
* additional frames for holding sg lists or other data.
* The assumption here is that the SG list will start at the second
* frame and not use the unused bytes in the first frame. While this
* isn't technically correct, it simplifies the calculation and allows
* for command frames that might be larger than an mfi_io_frame.
*/
if (sizeof(bus_addr_t) == 8) {
sc->mfi_sge_size = sizeof(struct mfi_sg64);
sc->mfi_flags |= MFI_FLAGS_SG64;
} else {
sc->mfi_sge_size = sizeof(struct mfi_sg32);
}
if (sc->mfi_flags & MFI_FLAGS_SKINNY)
sc->mfi_sge_size = sizeof(struct mfi_sg_skinny);
frames = (sc->mfi_sge_size * sc->mfi_max_sge - 1) / MFI_FRAME_SIZE + 2;
sc->mfi_cmd_size = frames * MFI_FRAME_SIZE;
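/*
 * Worked example, assuming 8-byte mfi_sg32 entries, 33 SG elements and
 * a 64-byte MFI_FRAME_SIZE: (8 * 33 - 1) / 64 + 2 = 6 frames, i.e. a
 * 384-byte command slot.
 */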
framessz = sc->mfi_cmd_size * sc->mfi_max_fw_cmds;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
64, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
framessz, /* maxsize */
1, /* nsegments */
framessz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_frames_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate frame DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_frames_dmat, (void **)&sc->mfi_frames,
BUS_DMA_NOWAIT, &sc->mfi_frames_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate frames memory\n");
return (ENOMEM);
}
bzero(sc->mfi_frames, framessz);
bus_dmamap_load(sc->mfi_frames_dmat, sc->mfi_frames_dmamap,
sc->mfi_frames, framessz, mfi_addr_cb, &sc->mfi_frames_busaddr,0);
/*
* Allocate DMA memory for the frame sense data. Keep them in the
* lower 4GB for efficiency
*/
sensesz = sc->mfi_max_fw_cmds * MFI_SENSE_LEN;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
4, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sensesz, /* maxsize */
1, /* nsegments */
sensesz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_sense_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate sense DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_sense_dmat, (void **)&sc->mfi_sense,
BUS_DMA_NOWAIT, &sc->mfi_sense_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate sense memory\n");
return (ENOMEM);
}
bus_dmamap_load(sc->mfi_sense_dmat, sc->mfi_sense_dmamap,
sc->mfi_sense, sensesz, mfi_addr_cb, &sc->mfi_sense_busaddr, 0);
if ((error = mfi_alloc_commands(sc)) != 0)
return (error);
/* Before moving the FW to operational state, check whether
* host memory is required by the FW or not
*/
/* ThunderBolt MFI_IOC2 INIT */
if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
sc->mfi_disable_intr(sc);
mtx_lock(&sc->mfi_io_lock);
if ((error = mfi_tbolt_init_MFI_queue(sc)) != 0) {
device_printf(sc->mfi_dev,
"TB Init has failed with error %d\n",error);
mtx_unlock(&sc->mfi_io_lock);
return error;
}
mtx_unlock(&sc->mfi_io_lock);
if ((error = mfi_tbolt_alloc_cmd(sc)) != 0)
return error;
if (bus_setup_intr(sc->mfi_dev, sc->mfi_irq,
INTR_MPSAFE|INTR_TYPE_BIO, NULL, mfi_intr_tbolt, sc,
&sc->mfi_intr)) {
device_printf(sc->mfi_dev, "Cannot set up interrupt\n");
return (EINVAL);
}
sc->mfi_intr_ptr = mfi_intr_tbolt;
sc->mfi_enable_intr(sc);
} else {
if ((error = mfi_comms_init(sc)) != 0)
return (error);
if (bus_setup_intr(sc->mfi_dev, sc->mfi_irq,
INTR_MPSAFE|INTR_TYPE_BIO, NULL, mfi_intr, sc, &sc->mfi_intr)) {
device_printf(sc->mfi_dev, "Cannot set up interrupt\n");
return (EINVAL);
}
sc->mfi_intr_ptr = mfi_intr;
sc->mfi_enable_intr(sc);
}
if ((error = mfi_get_controller_info(sc)) != 0)
return (error);
sc->disableOnlineCtrlReset = 0;
/* Register a config hook to probe the bus for arrays */
sc->mfi_ich.ich_func = mfi_startup;
sc->mfi_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mfi_ich) != 0) {
device_printf(sc->mfi_dev, "Cannot establish configuration "
"hook\n");
return (EINVAL);
}
mtx_lock(&sc->mfi_io_lock);
if ((error = mfi_aen_setup(sc, 0), 0) != 0) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
mtx_unlock(&sc->mfi_io_lock);
/*
* Register a shutdown handler.
*/
if ((sc->mfi_eh = EVENTHANDLER_REGISTER(shutdown_final, mfi_shutdown,
sc, SHUTDOWN_PRI_DEFAULT)) == NULL) {
device_printf(sc->mfi_dev, "Warning: shutdown event "
"registration failed\n");
}
/*
* Create the control device for doing management
*/
unit = device_get_unit(sc->mfi_dev);
sc->mfi_cdev = make_dev(&mfi_cdevsw, unit, UID_ROOT, GID_OPERATOR,
0640, "mfi%d", unit);
if (unit == 0)
make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev_t,
sc->mfi_cdev, "%s", "megaraid_sas_ioctl_node");
if (sc->mfi_cdev != NULL)
sc->mfi_cdev->si_drv1 = sc;
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "delete_busy_volumes", CTLFLAG_RW,
&sc->mfi_delete_busy_volumes, 0, "Allow removal of busy volumes");
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "keep_deleted_volumes", CTLFLAG_RW,
&sc->mfi_keep_deleted_volumes, 0,
"Don't detach the mfid device for a busy volume that is deleted");
device_add_child(sc->mfi_dev, "mfip", -1);
bus_generic_attach(sc->mfi_dev);
/* Start the timeout watchdog */
callout_init(&sc->mfi_watchdog_callout, 1);
callout_reset(&sc->mfi_watchdog_callout, mfi_cmd_timeout * hz,
mfi_timeout, sc);
if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
mtx_lock(&sc->mfi_io_lock);
mfi_tbolt_sync_map_info(sc);
mtx_unlock(&sc->mfi_io_lock);
}
return (0);
}
static int
mfi_alloc_commands(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i, j;
/*
* XXX Should we allocate all the commands up front, or allocate on
* demand later like 'aac' does?
*/
sc->mfi_commands = malloc(sizeof(sc->mfi_commands[0]) *
sc->mfi_max_fw_cmds, M_MFIBUF, M_WAITOK | M_ZERO);
for (i = 0; i < sc->mfi_max_fw_cmds; i++) {
cm = &sc->mfi_commands[i];
cm->cm_frame = (union mfi_frame *)((uintptr_t)sc->mfi_frames +
sc->mfi_cmd_size * i);
cm->cm_frame_busaddr = sc->mfi_frames_busaddr +
sc->mfi_cmd_size * i;
cm->cm_frame->header.context = i;
cm->cm_sense = &sc->mfi_sense[i];
cm->cm_sense_busaddr= sc->mfi_sense_busaddr + MFI_SENSE_LEN * i;
cm->cm_sc = sc;
cm->cm_index = i;
if (bus_dmamap_create(sc->mfi_buffer_dmat, 0,
&cm->cm_dmamap) == 0) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
} else {
device_printf(sc->mfi_dev, "Failed to allocate %d "
"command blocks, only allocated %d\n",
sc->mfi_max_fw_cmds, i - 1);
for (j = 0; j < i; j++) {
cm = &sc->mfi_commands[i];
bus_dmamap_destroy(sc->mfi_buffer_dmat,
cm->cm_dmamap);
}
free(sc->mfi_commands, M_MFIBUF);
sc->mfi_commands = NULL;
return (ENOMEM);
}
}
return (0);
}
void
mfi_release_command(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
uint32_t *hdr_data;
mtx_assert(&cm->cm_sc->mfi_io_lock, MA_OWNED);
/*
* Zero out the important fields of the frame, but make sure the
* context field is preserved. For efficiency, handle the fields
* as 32 bit words. Clear out the first S/G entry too for safety.
*/
hdr = &cm->cm_frame->header;
if (cm->cm_data != NULL && hdr->sg_count) {
cm->cm_sg->sg32[0].len = 0;
cm->cm_sg->sg32[0].addr = 0;
}
/*
* Command may be on other queues e.g. busy queue depending on the
* flow of a previous call to mfi_mapcmd, so ensure it is dequeued
* properly.
*/
if ((cm->cm_flags & MFI_ON_MFIQ_BUSY) != 0)
mfi_remove_busy(cm);
if ((cm->cm_flags & MFI_ON_MFIQ_READY) != 0)
mfi_remove_ready(cm);
/* We're not expecting it to be on any other queue but check */
if ((cm->cm_flags & MFI_ON_MFIQ_MASK) != 0) {
panic("Command %p is still on another queue, flags = %#x",
cm, cm->cm_flags);
}
/* tbolt cleanup */
if ((cm->cm_flags & MFI_CMD_TBOLT) != 0) {
mfi_tbolt_return_cmd(cm->cm_sc,
cm->cm_sc->mfi_cmd_pool_tbolt[cm->cm_extra_frames - 1],
cm);
}
hdr_data = (uint32_t *)cm->cm_frame;
hdr_data[0] = 0; /* cmd, sense_len, cmd_status, scsi_status */
hdr_data[1] = 0; /* target_id, lun_id, cdb_len, sg_count */
hdr_data[4] = 0; /* flags, timeout */
hdr_data[5] = 0; /* data_len */
cm->cm_extra_frames = 0;
cm->cm_flags = 0;
cm->cm_complete = NULL;
cm->cm_private = NULL;
cm->cm_data = NULL;
cm->cm_sg = 0;
cm->cm_total_frame_size = 0;
cm->retry_for_fw_reset = 0;
mfi_enqueue_free(cm);
}
int
mfi_dcmd_command(struct mfi_softc *sc, struct mfi_command **cmp,
uint32_t opcode, void **bufp, size_t bufsize)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *buf = NULL;
uint32_t context = 0;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm = mfi_dequeue_free(sc);
if (cm == NULL)
return (EBUSY);
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
if ((bufsize > 0) && (bufp != NULL)) {
if (*bufp == NULL) {
buf = malloc(bufsize, M_MFIBUF, M_NOWAIT|M_ZERO);
if (buf == NULL) {
mfi_release_command(cm);
return (ENOMEM);
}
*bufp = buf;
} else {
buf = *bufp;
}
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.flags = 0;
dcmd->header.data_len = bufsize;
dcmd->header.scsi_status = 0;
dcmd->opcode = opcode;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = 0;
cm->cm_data = buf;
cm->cm_private = buf;
cm->cm_len = bufsize;
*cmp = cm;
if ((bufp != NULL) && (*bufp == NULL) && (buf != NULL))
*bufp = buf;
return (0);
}
static int
mfi_comms_init(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct mfi_init_frame *init;
struct mfi_init_qinfo *qinfo;
int error;
uint32_t context = 0;
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
/*
* Abuse the SG list area of the frame to hold the init_qinfo
* object.
*/
init = &cm->cm_frame->init;
qinfo = (struct mfi_init_qinfo *)((uintptr_t)init + MFI_FRAME_SIZE);
bzero(qinfo, sizeof(struct mfi_init_qinfo));
qinfo->rq_entries = sc->mfi_max_fw_cmds + 1;
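/* One extra reply-queue slot, matching the comms area sizing above. */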
qinfo->rq_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_reply_q);
qinfo->pi_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_pi);
qinfo->ci_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_ci);
init->header.cmd = MFI_CMD_INIT;
init->header.data_len = sizeof(struct mfi_init_qinfo);
init->qinfo_new_addr_lo = cm->cm_frame_busaddr + MFI_FRAME_SIZE;
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0)
device_printf(sc->mfi_dev, "failed to send init command\n");
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_get_controller_info(struct mfi_softc *sc)
{
struct mfi_command *cm = NULL;
struct mfi_ctrl_info *ci = NULL;
uint32_t max_sectors_1, max_sectors_2;
int error;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_GETINFO,
(void **)&ci, sizeof(*ci));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get controller info\n");
sc->mfi_max_io = (sc->mfi_max_sge - 1) * PAGE_SIZE /
MFI_SECTOR_LEN;
error = 0;
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
max_sectors_1 = (1 << ci->stripe_sz_ops.max) * ci->max_strips_per_io;
max_sectors_2 = ci->max_request_size;
sc->mfi_max_io = min(max_sectors_1, max_sectors_2);
sc->disableOnlineCtrlReset =
ci->properties.OnOffProperties.disableOnlineCtrlReset;
out:
if (ci)
free(ci, M_MFIBUF);
if (cm)
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_get_log_state(struct mfi_softc *sc, struct mfi_evt_log_state **log_state)
{
struct mfi_command *cm = NULL;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_GETINFO,
(void **)log_state, sizeof(**log_state));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get log state\n");
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
out:
if (cm)
mfi_release_command(cm);
return (error);
}
int
mfi_aen_setup(struct mfi_softc *sc, uint32_t seq_start)
{
struct mfi_evt_log_state *log_state = NULL;
union mfi_evt class_locale;
int error = 0;
uint32_t seq;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
if (seq_start == 0) {
if ((error = mfi_get_log_state(sc, &log_state)) != 0)
goto out;
sc->mfi_boot_seq_num = log_state->boot_seq_num;
/*
* Walk through any events that fired since the last
* shutdown.
*/
if ((error = mfi_parse_entries(sc, log_state->shutdown_seq_num,
log_state->newest_seq_num)) != 0)
goto out;
seq = log_state->newest_seq_num;
} else
seq = seq_start;
error = mfi_aen_register(sc, seq, class_locale.word);
out:
free(log_state, M_MFIBUF);
return (error);
}
int
mfi_wait_command(struct mfi_softc *sc, struct mfi_command *cm)
{
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm->cm_complete = NULL;
/*
* MegaCli can issue a DCMD of 0. In this case do nothing
* and return 0 to it as status
*/
if (cm->cm_frame->dcmd.opcode == 0) {
cm->cm_frame->header.cmd_status = MFI_STAT_OK;
cm->cm_error = 0;
return (cm->cm_error);
}
mfi_enqueue_ready(cm);
mfi_startio(sc);
if ((cm->cm_flags & MFI_CMD_COMPLETED) == 0)
msleep(cm, &sc->mfi_io_lock, PRIBIO, "mfiwait", 0);
return (cm->cm_error);
}
void
mfi_free(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i;
callout_drain(&sc->mfi_watchdog_callout);
if (sc->mfi_cdev != NULL)
destroy_dev(sc->mfi_cdev);
if (sc->mfi_commands != NULL) {
for (i = 0; i < sc->mfi_max_fw_cmds; i++) {
cm = &sc->mfi_commands[i];
bus_dmamap_destroy(sc->mfi_buffer_dmat, cm->cm_dmamap);
}
free(sc->mfi_commands, M_MFIBUF);
sc->mfi_commands = NULL;
}
if (sc->mfi_intr)
bus_teardown_intr(sc->mfi_dev, sc->mfi_irq, sc->mfi_intr);
if (sc->mfi_irq != NULL)
bus_release_resource(sc->mfi_dev, SYS_RES_IRQ, sc->mfi_irq_rid,
sc->mfi_irq);
if (sc->mfi_sense_busaddr != 0)
bus_dmamap_unload(sc->mfi_sense_dmat, sc->mfi_sense_dmamap);
if (sc->mfi_sense != NULL)
bus_dmamem_free(sc->mfi_sense_dmat, sc->mfi_sense,
sc->mfi_sense_dmamap);
if (sc->mfi_sense_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_sense_dmat);
if (sc->mfi_frames_busaddr != 0)
bus_dmamap_unload(sc->mfi_frames_dmat, sc->mfi_frames_dmamap);
if (sc->mfi_frames != NULL)
bus_dmamem_free(sc->mfi_frames_dmat, sc->mfi_frames,
sc->mfi_frames_dmamap);
if (sc->mfi_frames_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_frames_dmat);
if (sc->mfi_comms_busaddr != 0)
bus_dmamap_unload(sc->mfi_comms_dmat, sc->mfi_comms_dmamap);
if (sc->mfi_comms != NULL)
bus_dmamem_free(sc->mfi_comms_dmat, sc->mfi_comms,
sc->mfi_comms_dmamap);
if (sc->mfi_comms_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_comms_dmat);
/* ThunderBolt contiguous memory free here */
if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
if (sc->mfi_tb_busaddr != 0)
bus_dmamap_unload(sc->mfi_tb_dmat, sc->mfi_tb_dmamap);
if (sc->request_message_pool != NULL)
bus_dmamem_free(sc->mfi_tb_dmat, sc->request_message_pool,
sc->mfi_tb_dmamap);
if (sc->mfi_tb_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_tb_dmat);
/* Version buffer memory free */
/* Start LSIP200113393 */
if (sc->verbuf_h_busaddr != 0)
bus_dmamap_unload(sc->verbuf_h_dmat, sc->verbuf_h_dmamap);
if (sc->verbuf != NULL)
bus_dmamem_free(sc->verbuf_h_dmat, sc->verbuf,
sc->verbuf_h_dmamap);
if (sc->verbuf_h_dmat != NULL)
bus_dma_tag_destroy(sc->verbuf_h_dmat);
/* End LSIP200113393 */
/* ThunderBolt INIT packet memory Free */
if (sc->mfi_tb_init_busaddr != 0)
bus_dmamap_unload(sc->mfi_tb_init_dmat,
sc->mfi_tb_init_dmamap);
if (sc->mfi_tb_init != NULL)
bus_dmamem_free(sc->mfi_tb_init_dmat, sc->mfi_tb_init,
sc->mfi_tb_init_dmamap);
if (sc->mfi_tb_init_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_tb_init_dmat);
/* ThunderBolt IOC Init Desc memory free here */
if (sc->mfi_tb_ioc_init_busaddr != 0)
bus_dmamap_unload(sc->mfi_tb_ioc_init_dmat,
sc->mfi_tb_ioc_init_dmamap);
if (sc->mfi_tb_ioc_init_desc != NULL)
bus_dmamem_free(sc->mfi_tb_ioc_init_dmat,
sc->mfi_tb_ioc_init_desc,
sc->mfi_tb_ioc_init_dmamap);
if (sc->mfi_tb_ioc_init_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_tb_ioc_init_dmat);
if (sc->mfi_cmd_pool_tbolt != NULL) {
for (int i = 0; i < sc->mfi_max_fw_cmds; i++) {
if (sc->mfi_cmd_pool_tbolt[i] != NULL) {
free(sc->mfi_cmd_pool_tbolt[i],
M_MFIBUF);
sc->mfi_cmd_pool_tbolt[i] = NULL;
}
}
free(sc->mfi_cmd_pool_tbolt, M_MFIBUF);
sc->mfi_cmd_pool_tbolt = NULL;
}
if (sc->request_desc_pool != NULL) {
free(sc->request_desc_pool, M_MFIBUF);
sc->request_desc_pool = NULL;
}
}
if (sc->mfi_buffer_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_buffer_dmat);
if (sc->mfi_parent_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_parent_dmat);
if (mtx_initialized(&sc->mfi_io_lock)) {
mtx_destroy(&sc->mfi_io_lock);
sx_destroy(&sc->mfi_config_lock);
}
return;
}
static void
mfi_startup(void *arg)
{
struct mfi_softc *sc;
sc = (struct mfi_softc *)arg;
sc->mfi_enable_intr(sc);
sx_xlock(&sc->mfi_config_lock);
mtx_lock(&sc->mfi_io_lock);
mfi_ldprobe(sc);
if (sc->mfi_flags & MFI_FLAGS_SKINNY)
mfi_syspdprobe(sc);
mtx_unlock(&sc->mfi_io_lock);
sx_xunlock(&sc->mfi_config_lock);
config_intrhook_disestablish(&sc->mfi_ich);
}
static void
mfi_intr(void *arg)
{
struct mfi_softc *sc;
struct mfi_command *cm;
uint32_t pi, ci, context;
sc = (struct mfi_softc *)arg;
if (sc->mfi_check_clear_intr(sc))
return;
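/*
 * Drain the reply queue: hw_pi is the producer index advanced by the
 * firmware, hw_ci is the consumer index owned by the driver.  Each
 * entry holds a command context, and the ring wraps after
 * mfi_max_fw_cmds + 1 entries.
 */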
restart:
pi = sc->mfi_comms->hw_pi;
ci = sc->mfi_comms->hw_ci;
mtx_lock(&sc->mfi_io_lock);
while (ci != pi) {
context = sc->mfi_comms->hw_reply_q[ci];
if (context < sc->mfi_max_fw_cmds) {
cm = &sc->mfi_commands[context];
mfi_remove_busy(cm);
cm->cm_error = 0;
mfi_complete(sc, cm);
}
if (++ci == (sc->mfi_max_fw_cmds + 1))
ci = 0;
}
sc->mfi_comms->hw_ci = ci;
/* Give deferred I/O a chance to run */
sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
mfi_startio(sc);
mtx_unlock(&sc->mfi_io_lock);
/*
* Dummy read to flush the bus; this ensures that the indexes are up
* to date. Restart processing if more commands have come in.
*/
(void)sc->mfi_read_fw_status(sc);
if (pi != sc->mfi_comms->hw_pi)
goto restart;
return;
}
int
mfi_shutdown(struct mfi_softc *sc)
{
struct mfi_dcmd_frame *dcmd;
struct mfi_command *cm;
int error;
if (sc->mfi_aen_cm != NULL) {
sc->cm_aen_abort = 1;
mfi_abort(sc, &sc->mfi_aen_cm);
}
if (sc->mfi_map_sync_cm != NULL) {
sc->cm_map_abort = 1;
mfi_abort(sc, &sc->mfi_map_sync_cm);
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_SHUTDOWN, NULL, 0);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
dcmd = &cm->cm_frame->dcmd;
dcmd->header.flags = MFI_FRAME_DIR_NONE;
cm->cm_flags = MFI_CMD_POLLED;
cm->cm_data = NULL;
if ((error = mfi_mapcmd(sc, cm)) != 0)
device_printf(sc->mfi_dev, "Failed to shutdown controller\n");
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static void
mfi_syspdprobe(struct mfi_softc *sc)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm = NULL;
struct mfi_pd_list *pdlist = NULL;
struct mfi_system_pd *syspd, *tmp;
struct mfi_system_pending *syspd_pend;
int error, i, found;
sx_assert(&sc->mfi_config_lock, SA_XLOCKED);
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
/* Add SYSTEM PD's */
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_PD_LIST_QUERY,
(void **)&pdlist, sizeof(*pdlist));
if (error) {
device_printf(sc->mfi_dev,
"Error while forming SYSTEM PD list\n");
goto out;
}
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
cm->cm_frame->dcmd.mbox[0] = MR_PD_QUERY_TYPE_EXPOSED_TO_HOST;
cm->cm_frame->dcmd.mbox[1] = 0;
if (mfi_mapcmd(sc, cm) != 0) {
device_printf(sc->mfi_dev,
"Failed to get syspd device listing\n");
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
hdr = &cm->cm_frame->header;
if (hdr->cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev,
"MFI_DCMD_PD_LIST_QUERY failed %x\n", hdr->cmd_status);
goto out;
}
/* Get each PD and add it to the system */
for (i = 0; i < pdlist->count; i++) {
if (pdlist->addr[i].device_id ==
pdlist->addr[i].encl_device_id)
continue;
found = 0;
TAILQ_FOREACH(syspd, &sc->mfi_syspd_tqh, pd_link) {
if (syspd->pd_id == pdlist->addr[i].device_id)
found = 1;
}
TAILQ_FOREACH(syspd_pend, &sc->mfi_syspd_pend_tqh, pd_link) {
if (syspd_pend->pd_id == pdlist->addr[i].device_id)
found = 1;
}
if (found == 0)
mfi_add_sys_pd(sc, pdlist->addr[i].device_id);
}
/* Delete SYSPD's whose state has been changed */
TAILQ_FOREACH_SAFE(syspd, &sc->mfi_syspd_tqh, pd_link, tmp) {
found = 0;
for (i = 0; i < pdlist->count; i++) {
if (syspd->pd_id == pdlist->addr[i].device_id) {
found = 1;
break;
}
}
if (found == 0) {
printf("DELETE\n");
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
device_delete_child(sc->mfi_dev, syspd->pd_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
}
}
out:
if (pdlist)
free(pdlist, M_MFIBUF);
if (cm)
mfi_release_command(cm);
return;
}
static void
mfi_ldprobe(struct mfi_softc *sc)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm = NULL;
struct mfi_ld_list *list = NULL;
struct mfi_disk *ld;
struct mfi_disk_pending *ld_pend;
int error, i;
sx_assert(&sc->mfi_config_lock, SA_XLOCKED);
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_LIST,
(void **)&list, sizeof(*list));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev, "Failed to get device listing\n");
goto out;
}
hdr = &cm->cm_frame->header;
if (hdr->cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev, "MFI_DCMD_LD_GET_LIST failed %x\n",
hdr->cmd_status);
goto out;
}
for (i = 0; i < list->ld_count; i++) {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == list->ld_list[i].ld.v.target_id)
goto skip_add;
}
TAILQ_FOREACH(ld_pend, &sc->mfi_ld_pend_tqh, ld_link) {
if (ld_pend->ld_id == list->ld_list[i].ld.v.target_id)
goto skip_add;
}
mfi_add_ld(sc, list->ld_list[i].ld.v.target_id);
skip_add:;
}
out:
if (list)
free(list, M_MFIBUF);
if (cm)
mfi_release_command(cm);
return;
}
/*
* The timestamp is the number of seconds since 00:00 Jan 1, 2000. If
* bits 24-31 are all set, then it is the number of seconds since
* boot.
*/
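/* For example, 0xff00012c is reported as "boot + 300s" and 0x0000012c as "300s". */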
static const char *
format_timestamp(uint32_t timestamp)
{
static char buffer[32];
if ((timestamp & 0xff000000) == 0xff000000)
snprintf(buffer, sizeof(buffer), "boot + %us", timestamp &
0x00ffffff);
else
snprintf(buffer, sizeof(buffer), "%us", timestamp);
return (buffer);
}
static const char *
format_class(int8_t class)
{
static char buffer[6];
switch (class) {
case MFI_EVT_CLASS_DEBUG:
return ("debug");
case MFI_EVT_CLASS_PROGRESS:
return ("progress");
case MFI_EVT_CLASS_INFO:
return ("info");
case MFI_EVT_CLASS_WARNING:
return ("WARN");
case MFI_EVT_CLASS_CRITICAL:
return ("CRIT");
case MFI_EVT_CLASS_FATAL:
return ("FATAL");
case MFI_EVT_CLASS_DEAD:
return ("DEAD");
default:
snprintf(buffer, sizeof(buffer), "%d", class);
return (buffer);
}
}
static void
mfi_decode_evt(struct mfi_softc *sc, struct mfi_evt_detail *detail)
{
struct mfi_system_pd *syspd = NULL;
device_printf(sc->mfi_dev, "%d (%s/0x%04x/%s) - %s\n", detail->seq,
format_timestamp(detail->time), detail->evt_class.members.locale,
format_class(detail->evt_class.members.evt_class),
detail->description);
/* Don't act on old AEN's or while shutting down */
if (detail->seq < sc->mfi_boot_seq_num || sc->mfi_detaching)
return;
switch (detail->arg_type) {
case MR_EVT_ARGS_NONE:
if (detail->code == MR_EVT_CTRL_HOST_BUS_SCAN_REQUESTED) {
device_printf(sc->mfi_dev, "HostBus scan raised\n");
if (mfi_detect_jbod_change) {
/*
* Probe for new SYSPD's and Delete
* invalid SYSPD's
*/
sx_xlock(&sc->mfi_config_lock);
mtx_lock(&sc->mfi_io_lock);
mfi_syspdprobe(sc);
mtx_unlock(&sc->mfi_io_lock);
sx_xunlock(&sc->mfi_config_lock);
}
}
break;
case MR_EVT_ARGS_LD_STATE:
/*
 * At load time the driver reads all events logged since the last
 * shutdown; old events are already filtered out above by the boot
 * sequence number check.
 */
if (detail->args.ld_state.new_state == MFI_LD_STATE_OFFLINE ) {
/* Remove the LD */
struct mfi_disk *ld;
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id ==
detail->args.ld_state.ld.target_id)
break;
}
/*
 * Fix for kernel panics when an SSCD is removed:
 * KASSERT(ld != NULL, ("volume disappeared"));
 */
if (ld != NULL) {
mtx_lock(&Giant);
device_delete_child(sc->mfi_dev, ld->ld_dev);
mtx_unlock(&Giant);
}
}
break;
case MR_EVT_ARGS_PD:
if (detail->code == MR_EVT_PD_REMOVED) {
if (mfi_detect_jbod_change) {
/*
* If the removed device is a SYSPD then
* delete it
*/
TAILQ_FOREACH(syspd, &sc->mfi_syspd_tqh,
pd_link) {
if (syspd->pd_id ==
detail->args.pd.device_id) {
mtx_lock(&Giant);
device_delete_child(
sc->mfi_dev,
syspd->pd_dev);
mtx_unlock(&Giant);
break;
}
}
}
}
if (detail->code == MR_EVT_PD_INSERTED) {
if (mfi_detect_jbod_change) {
/* Probe for new SYSPD's */
sx_xlock(&sc->mfi_config_lock);
mtx_lock(&sc->mfi_io_lock);
mfi_syspdprobe(sc);
mtx_unlock(&sc->mfi_io_lock);
sx_xunlock(&sc->mfi_config_lock);
}
}
if (sc->mfi_cam_rescan_cb != NULL &&
(detail->code == MR_EVT_PD_INSERTED ||
detail->code == MR_EVT_PD_REMOVED)) {
sc->mfi_cam_rescan_cb(sc, detail->args.pd.device_id);
}
break;
}
}
static void
mfi_queue_evt(struct mfi_softc *sc, struct mfi_evt_detail *detail)
{
struct mfi_evt_queue_elm *elm;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
elm = malloc(sizeof(*elm), M_MFIBUF, M_NOWAIT|M_ZERO);
if (elm == NULL)
return;
memcpy(&elm->detail, detail, sizeof(*detail));
TAILQ_INSERT_TAIL(&sc->mfi_evt_queue, elm, link);
taskqueue_enqueue(taskqueue_swi, &sc->mfi_evt_task);
}
static void
mfi_handle_evt(void *context, int pending)
{
TAILQ_HEAD(,mfi_evt_queue_elm) queue;
struct mfi_softc *sc;
struct mfi_evt_queue_elm *elm;
sc = context;
TAILQ_INIT(&queue);
mtx_lock(&sc->mfi_io_lock);
TAILQ_CONCAT(&queue, &sc->mfi_evt_queue, link);
mtx_unlock(&sc->mfi_io_lock);
while ((elm = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, elm, link);
mfi_decode_evt(sc, &elm->detail);
free(elm, M_MFIBUF);
}
}
static int
mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
union mfi_evt current_aen, prior_aen;
struct mfi_evt_detail *ed = NULL;
int error = 0;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
current_aen.word = locale;
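/*
 * If an AEN wait command is already outstanding and it already covers
 * the requested class and locale, there is nothing to do.  Otherwise
 * abort the old command and register a new one below.
 */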
if (sc->mfi_aen_cm != NULL) {
prior_aen.word =
((uint32_t *)&sc->mfi_aen_cm->cm_frame->dcmd.mbox)[1];
if (prior_aen.members.evt_class <= current_aen.members.evt_class &&
!((prior_aen.members.locale & current_aen.members.locale)
^current_aen.members.locale)) {
return (0);
} else {
prior_aen.members.locale |= current_aen.members.locale;
if (prior_aen.members.evt_class
< current_aen.members.evt_class)
current_aen.members.evt_class =
prior_aen.members.evt_class;
mfi_abort(sc, &sc->mfi_aen_cm);
}
}
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_WAIT,
(void **)&ed, sizeof(*ed));
if (error)
goto out;
dcmd = &cm->cm_frame->dcmd;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = locale;
cm->cm_flags = MFI_CMD_DATAIN;
cm->cm_complete = mfi_aen_complete;
sc->last_seq_num = seq;
sc->mfi_aen_cm = cm;
mfi_enqueue_ready(cm);
mfi_startio(sc);
out:
return (error);
}
static void
mfi_aen_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
struct mfi_evt_detail *detail;
struct mfi_aen *mfi_aen_entry, *tmp;
int seq = 0, aborted = 0;
sc = cm->cm_sc;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if (sc->mfi_aen_cm == NULL)
return;
hdr = &cm->cm_frame->header;
if (sc->cm_aen_abort ||
hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
sc->cm_aen_abort = 0;
aborted = 1;
} else {
sc->mfi_aen_triggered = 1;
if (sc->mfi_poll_waiting) {
sc->mfi_poll_waiting = 0;
selwakeup(&sc->mfi_select);
}
detail = cm->cm_data;
mfi_queue_evt(sc, detail);
seq = detail->seq + 1;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link,
tmp) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
PROC_LOCK(mfi_aen_entry->p);
kern_psignal(mfi_aen_entry->p, SIGIO);
PROC_UNLOCK(mfi_aen_entry->p);
free(mfi_aen_entry, M_MFIBUF);
}
}
free(cm->cm_data, M_MFIBUF);
wakeup(&sc->mfi_aen_cm);
sc->mfi_aen_cm = NULL;
mfi_release_command(cm);
/* set it up again so the driver can catch more events */
if (!aborted)
mfi_aen_setup(sc, seq);
}
#define MAX_EVENTS 15
static int
mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
struct mfi_evt_list *el;
union mfi_evt class_locale;
int error, i, seq, size;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
size = sizeof(struct mfi_evt_list) + sizeof(struct mfi_evt_detail)
* (MAX_EVENTS - 1);
el = malloc(size, M_MFIBUF, M_NOWAIT | M_ZERO);
if (el == NULL)
return (ENOMEM);
for (seq = start_seq;;) {
if ((cm = mfi_dequeue_free(sc)) == NULL) {
free(el, M_MFIBUF);
return (EBUSY);
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.data_len = size;
dcmd->opcode = MFI_DCMD_CTRL_EVENT_GET;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = class_locale.word;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
cm->cm_data = el;
cm->cm_len = size;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Failed to get controller entries\n");
mfi_release_command(cm);
break;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
if (dcmd->header.cmd_status == MFI_STAT_NOT_FOUND) {
mfi_release_command(cm);
break;
}
if (dcmd->header.cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev,
"Error %d fetching controller entries\n",
dcmd->header.cmd_status);
mfi_release_command(cm);
error = EIO;
break;
}
mfi_release_command(cm);
for (i = 0; i < el->count; i++) {
/*
* If this event is newer than 'stop_seq' then
* break out of the loop. Note that the log
* is a circular buffer so we have to handle
* the case that our stop point is earlier in
* the buffer than our start point.
*/
if (el->event[i].seq >= stop_seq) {
if (start_seq <= stop_seq)
break;
else if (el->event[i].seq < start_seq)
break;
}
mfi_queue_evt(sc, &el->event[i]);
}
seq = el->event[el->count - 1].seq + 1;
}
free(el, M_MFIBUF);
return (error);
}
static int
mfi_add_ld(struct mfi_softc *sc, int id)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd = NULL;
struct mfi_ld_info *ld_info = NULL;
struct mfi_disk_pending *ld_pend;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
ld_pend = malloc(sizeof(*ld_pend), M_MFIBUF, M_NOWAIT | M_ZERO);
if (ld_pend != NULL) {
ld_pend->ld_id = id;
TAILQ_INSERT_TAIL(&sc->mfi_ld_pend_tqh, ld_pend, ld_link);
}
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_INFO,
(void **)&ld_info, sizeof(*ld_info));
if (error) {
device_printf(sc->mfi_dev,
"Failed to allocate for MFI_DCMD_LD_GET_INFO %d\n", error);
if (ld_info)
free(ld_info, M_MFIBUF);
return (error);
}
cm->cm_flags = MFI_CMD_DATAIN;
dcmd = &cm->cm_frame->dcmd;
dcmd->mbox[0] = id;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev,
"Failed to get logical drive: %d\n", id);
free(ld_info, M_MFIBUF);
return (0);
}
if (ld_info->ld_config.params.isSSCD != 1)
mfi_add_ld_complete(cm);
else {
mfi_release_command(cm);
if (ld_info) /* For SSCD volumes, free ld_info here */
free(ld_info, M_MFIBUF);
}
return (0);
}
static void
mfi_add_ld_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_ld_info *ld_info;
struct mfi_softc *sc;
device_t child;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
ld_info = cm->cm_private;
if (sc->cm_map_abort || hdr->cmd_status != MFI_STAT_OK) {
free(ld_info, M_MFIBUF);
wakeup(&sc->mfi_map_sync_cm);
mfi_release_command(cm);
return;
}
wakeup(&sc->mfi_map_sync_cm);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
if ((child = device_add_child(sc->mfi_dev, "mfid", -1)) == NULL) {
device_printf(sc->mfi_dev, "Failed to add logical disk\n");
free(ld_info, M_MFIBUF);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
return;
}
device_set_ivars(child, ld_info);
device_set_desc(child, "MFI Logical Disk");
bus_generic_attach(sc->mfi_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
}
static int
mfi_add_sys_pd(struct mfi_softc *sc, int id)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd = NULL;
struct mfi_pd_info *pd_info = NULL;
struct mfi_system_pending *syspd_pend;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
syspd_pend = malloc(sizeof(*syspd_pend), M_MFIBUF, M_NOWAIT | M_ZERO);
if (syspd_pend != NULL) {
syspd_pend->pd_id = id;
TAILQ_INSERT_TAIL(&sc->mfi_syspd_pend_tqh, syspd_pend, pd_link);
}
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_PD_GET_INFO,
(void **)&pd_info, sizeof(*pd_info));
if (error) {
device_printf(sc->mfi_dev,
"Failed to allocated for MFI_DCMD_PD_GET_INFO %d\n",
error);
if (pd_info)
free(pd_info, M_MFIBUF);
return (error);
}
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
dcmd = &cm->cm_frame->dcmd;
dcmd->mbox[0] = id;
dcmd->header.scsi_status = 0;
dcmd->header.pad0 = 0;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Failed to get physical drive info %d\n", id);
free(pd_info, M_MFIBUF);
mfi_release_command(cm);
return (error);
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
mfi_add_sys_pd_complete(cm);
return (0);
}
static void
mfi_add_sys_pd_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_pd_info *pd_info;
struct mfi_softc *sc;
device_t child;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
pd_info = cm->cm_private;
if (hdr->cmd_status != MFI_STAT_OK) {
free(pd_info, M_MFIBUF);
mfi_release_command(cm);
return;
}
if (pd_info->fw_state != MFI_PD_STATE_SYSTEM) {
device_printf(sc->mfi_dev, "PD=%x is not SYSTEM PD\n",
pd_info->ref.v.device_id);
free(pd_info, M_MFIBUF);
mfi_release_command(cm);
return;
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
if ((child = device_add_child(sc->mfi_dev, "mfisyspd", -1)) == NULL) {
device_printf(sc->mfi_dev, "Failed to add system pd\n");
free(pd_info, M_MFIBUF);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
return;
}
device_set_ivars(child, pd_info);
device_set_desc(child, "MFI System PD");
bus_generic_attach(sc->mfi_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
}
static struct mfi_command *
mfi_bio_command(struct mfi_softc *sc)
{
struct bio *bio;
struct mfi_command *cm = NULL;
/* Reserve two commands to avoid starving ioctl requests. */
if (sc->mfi_qstat[MFIQ_FREE].q_length < 2) {
return (NULL);
}
if ((bio = mfi_dequeue_bio(sc)) == NULL) {
return (NULL);
}
if ((uintptr_t)bio->bio_driver2 == MFI_LD_IO) {
cm = mfi_build_ldio(sc, bio);
} else if ((uintptr_t) bio->bio_driver2 == MFI_SYS_PD_IO) {
cm = mfi_build_syspdio(sc, bio);
}
if (!cm)
mfi_enqueue_bio(sc, bio);
return cm;
}
/*
* mostly copied from cam/scsi/scsi_all.c:scsi_read_write
*/
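/*
 * Build the smallest read/write CDB that can encode the request:
 * 6 byte for LBAs below 2^21 with a block count up to 255 (and
 * byte2 == 0), 10 byte for 32-bit LBAs with a 16-bit count, 12 byte
 * for 32-bit LBAs with a 32-bit count, and 16 byte otherwise.
 * Returns the length of the CDB that was built.
 */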
int
mfi_build_cdb(int readop, uint8_t byte2, u_int64_t lba, u_int32_t block_count, uint8_t *cdb)
{
int cdb_len;
if (((lba & 0x1fffff) == lba)
&& ((block_count & 0xff) == block_count)
&& (byte2 == 0)) {
/* We can fit in a 6 byte cdb */
struct scsi_rw_6 *scsi_cmd;
scsi_cmd = (struct scsi_rw_6 *)cdb;
scsi_cmd->opcode = readop ? READ_6 : WRITE_6;
scsi_ulto3b(lba, scsi_cmd->addr);
scsi_cmd->length = block_count & 0xff;
scsi_cmd->control = 0;
cdb_len = sizeof(*scsi_cmd);
} else if (((block_count & 0xffff) == block_count) && ((lba & 0xffffffff) == lba)) {
/* Need a 10 byte CDB */
struct scsi_rw_10 *scsi_cmd;
scsi_cmd = (struct scsi_rw_10 *)cdb;
scsi_cmd->opcode = readop ? READ_10 : WRITE_10;
scsi_cmd->byte2 = byte2;
scsi_ulto4b(lba, scsi_cmd->addr);
scsi_cmd->reserved = 0;
scsi_ulto2b(block_count, scsi_cmd->length);
scsi_cmd->control = 0;
cdb_len = sizeof(*scsi_cmd);
} else if (((block_count & 0xffffffff) == block_count) &&
((lba & 0xffffffff) == lba)) {
/* Block count is too big for a 10 byte CDB; use a 12 byte CDB */
struct scsi_rw_12 *scsi_cmd;
scsi_cmd = (struct scsi_rw_12 *)cdb;
scsi_cmd->opcode = readop ? READ_12 : WRITE_12;
scsi_cmd->byte2 = byte2;
scsi_ulto4b(lba, scsi_cmd->addr);
scsi_cmd->reserved = 0;
scsi_ulto4b(block_count, scsi_cmd->length);
scsi_cmd->control = 0;
cdb_len = sizeof(*scsi_cmd);
} else {
/*
 * 16 byte CDB. We'll only get here if the LBA doesn't fit
 * in 32 bits.
 */
struct scsi_rw_16 *scsi_cmd;
scsi_cmd = (struct scsi_rw_16 *)cdb;
scsi_cmd->opcode = readop ? READ_16 : WRITE_16;
scsi_cmd->byte2 = byte2;
scsi_u64to8b(lba, scsi_cmd->addr);
scsi_cmd->reserved = 0;
scsi_ulto4b(block_count, scsi_cmd->length);
scsi_cmd->control = 0;
cdb_len = sizeof(*scsi_cmd);
}
return cdb_len;
}
extern char *unmapped_buf;
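/*
 * System PD I/O is sent as a SCSI pass-through frame carrying the CDB
 * built by mfi_build_cdb(); logical drive I/O (mfi_build_ldio() below)
 * uses the native LD read/write frame instead.
 */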
static struct mfi_command *
mfi_build_syspdio(struct mfi_softc *sc, struct bio *bio)
{
struct mfi_command *cm;
struct mfi_pass_frame *pass;
uint32_t context = 0;
int flags = 0, blkcount = 0, readop;
uint8_t cdb_len;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (NULL);
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
pass = &cm->cm_frame->pass;
bzero(pass->cdb, 16);
pass->header.cmd = MFI_CMD_PD_SCSI_IO;
switch (bio->bio_cmd) {
case BIO_READ:
flags = MFI_CMD_DATAIN | MFI_CMD_BIO;
readop = 1;
break;
case BIO_WRITE:
flags = MFI_CMD_DATAOUT | MFI_CMD_BIO;
readop = 0;
break;
default:
/* TODO: what about BIO_DELETE??? */
biofinish(bio, NULL, EOPNOTSUPP);
mfi_enqueue_free(cm);
return (NULL);
}
/* Cheat with the sector length to avoid a non-constant division */
blkcount = howmany(bio->bio_bcount, MFI_SECTOR_LEN);
/* Fill the LBA and Transfer length in CDB */
cdb_len = mfi_build_cdb(readop, 0, bio->bio_pblkno, blkcount,
pass->cdb);
pass->header.target_id = (uintptr_t)bio->bio_driver1;
pass->header.lun_id = 0;
pass->header.timeout = 0;
pass->header.flags = 0;
pass->header.scsi_status = 0;
pass->header.sense_len = MFI_SENSE_LEN;
pass->header.data_len = bio->bio_bcount;
pass->header.cdb_len = cdb_len;
pass->sense_addr_lo = (uint32_t)cm->cm_sense_busaddr;
pass->sense_addr_hi = (uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
cm->cm_complete = mfi_bio_complete;
cm->cm_private = bio;
cm->cm_data = unmapped_buf;
cm->cm_len = bio->bio_bcount;
cm->cm_sg = &pass->sgl;
cm->cm_total_frame_size = MFI_PASS_FRAME_SIZE;
cm->cm_flags = flags;
return (cm);
}
static struct mfi_command *
mfi_build_ldio(struct mfi_softc *sc, struct bio *bio)
{
struct mfi_io_frame *io;
struct mfi_command *cm;
int flags;
uint32_t blkcount;
uint32_t context = 0;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (NULL);
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
io = &cm->cm_frame->io;
switch (bio->bio_cmd) {
case BIO_READ:
io->header.cmd = MFI_CMD_LD_READ;
flags = MFI_CMD_DATAIN | MFI_CMD_BIO;
break;
case BIO_WRITE:
io->header.cmd = MFI_CMD_LD_WRITE;
flags = MFI_CMD_DATAOUT | MFI_CMD_BIO;
break;
default:
/* TODO: what about BIO_DELETE??? */
biofinish(bio, NULL, EOPNOTSUPP);
mfi_enqueue_free(cm);
return (NULL);
}
/* Cheat with the sector length to avoid a non-constant division */
blkcount = howmany(bio->bio_bcount, MFI_SECTOR_LEN);
io->header.target_id = (uintptr_t)bio->bio_driver1;
io->header.timeout = 0;
io->header.flags = 0;
io->header.scsi_status = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = blkcount;
io->sense_addr_lo = (uint32_t)cm->cm_sense_busaddr;
io->sense_addr_hi = (uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
io->lba_hi = (bio->bio_pblkno & 0xffffffff00000000) >> 32;
io->lba_lo = bio->bio_pblkno & 0xffffffff;
cm->cm_complete = mfi_bio_complete;
cm->cm_private = bio;
cm->cm_data = unmapped_buf;
cm->cm_len = bio->bio_bcount;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = flags;
return (cm);
}
static void
mfi_bio_complete(struct mfi_command *cm)
{
struct bio *bio;
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
bio = cm->cm_private;
hdr = &cm->cm_frame->header;
sc = cm->cm_sc;
if ((hdr->cmd_status != MFI_STAT_OK) || (hdr->scsi_status != 0)) {
bio->bio_flags |= BIO_ERROR;
bio->bio_error = EIO;
device_printf(sc->mfi_dev, "I/O error, cmd=%p, status=%#x, "
"scsi_status=%#x\n", cm, hdr->cmd_status, hdr->scsi_status);
mfi_print_sense(cm->cm_sc, cm->cm_sense);
} else if (cm->cm_error != 0) {
bio->bio_flags |= BIO_ERROR;
bio->bio_error = cm->cm_error;
device_printf(sc->mfi_dev, "I/O error, cmd=%p, error=%#x\n",
cm, cm->cm_error);
}
mfi_release_command(cm);
mfi_disk_complete(bio);
}
void
mfi_startio(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct ccb_hdr *ccbh;
for (;;) {
/* Don't bother if we're short on resources */
if (sc->mfi_flags & MFI_FLAGS_QFRZN)
break;
/* Try a command that has already been prepared */
cm = mfi_dequeue_ready(sc);
if (cm == NULL) {
if ((ccbh = TAILQ_FIRST(&sc->mfi_cam_ccbq)) != NULL)
cm = sc->mfi_cam_start(ccbh);
}
/* Nope, so look for work on the bioq */
if (cm == NULL)
cm = mfi_bio_command(sc);
/* No work available, so exit */
if (cm == NULL)
break;
/* Send the command to the controller */
if (mfi_mapcmd(sc, cm) != 0) {
device_printf(sc->mfi_dev, "Failed to startio\n");
mfi_requeue_ready(cm);
break;
}
}
}
int
mfi_mapcmd(struct mfi_softc *sc, struct mfi_command *cm)
{
int error, polled;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm->cm_data != NULL) && (cm->cm_frame->header.cmd != MFI_CMD_STP )) {
polled = (cm->cm_flags & MFI_CMD_POLLED) ? BUS_DMA_NOWAIT : 0;
if (cm->cm_flags & MFI_CMD_CCB)
error = bus_dmamap_load_ccb(sc->mfi_buffer_dmat,
cm->cm_dmamap, cm->cm_data, mfi_data_cb, cm,
polled);
else if (cm->cm_flags & MFI_CMD_BIO)
error = bus_dmamap_load_bio(sc->mfi_buffer_dmat,
cm->cm_dmamap, cm->cm_private, mfi_data_cb, cm,
polled);
else
error = bus_dmamap_load(sc->mfi_buffer_dmat,
cm->cm_dmamap, cm->cm_data, cm->cm_len,
mfi_data_cb, cm, polled);
if (error == EINPROGRESS) {
sc->mfi_flags |= MFI_FLAGS_QFRZN;
return (0);
}
} else {
error = mfi_send_frame(sc, cm);
}
return (error);
}
static void
mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm;
union mfi_sgl *sgl;
struct mfi_softc *sc;
int i, j, first, dir;
int sge_size, locked;
cm = (struct mfi_command *)arg;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
sgl = cm->cm_sg;
/*
 * Check whether we already hold the lock: this is an async
 * callback, so even though our caller mfi_mapcmd() asserts that
 * it holds the lock, there is no guarantee that it wasn't
 * dropped if bus_dmamap_load() returned before our completion.
 */
if ((locked = mtx_owned(&sc->mfi_io_lock)) == 0)
mtx_lock(&sc->mfi_io_lock);
if (error) {
printf("error %d in callback\n", error);
cm->cm_error = error;
mfi_complete(sc, cm);
goto out;
}
/*
 * Use the IEEE SGL only for I/Os on a SKINNY controller.
 * For other commands on a SKINNY controller use either
 * sg32 or sg64, based on sizeof(bus_addr_t).
 * Also calculate the total frame size based on the type
 * of SGL used.
 */
if (((cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) ||
(cm->cm_frame->header.cmd == MFI_CMD_LD_READ) ||
(cm->cm_frame->header.cmd == MFI_CMD_LD_WRITE)) &&
(sc->mfi_flags & MFI_FLAGS_SKINNY)) {
for (i = 0; i < nsegs; i++) {
sgl->sg_skinny[i].addr = segs[i].ds_addr;
sgl->sg_skinny[i].len = segs[i].ds_len;
sgl->sg_skinny[i].flag = 0;
}
hdr->flags |= MFI_FRAME_IEEE_SGL | MFI_FRAME_SGL64;
sge_size = sizeof(struct mfi_sg_skinny);
hdr->sg_count = nsegs;
} else {
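/*
 * For MFI_CMD_STP the first cm_stp_len bytes of the first
 * segment get their own S/G entry; the remaining data is
 * entered starting at that offset.
 */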
j = 0;
if (cm->cm_frame->header.cmd == MFI_CMD_STP) {
first = cm->cm_stp_len;
if ((sc->mfi_flags & MFI_FLAGS_SG64) == 0) {
sgl->sg32[j].addr = segs[0].ds_addr;
sgl->sg32[j++].len = first;
} else {
sgl->sg64[j].addr = segs[0].ds_addr;
sgl->sg64[j++].len = first;
}
} else
first = 0;
if ((sc->mfi_flags & MFI_FLAGS_SG64) == 0) {
for (i = 0; i < nsegs; i++) {
sgl->sg32[j].addr = segs[i].ds_addr + first;
sgl->sg32[j++].len = segs[i].ds_len - first;
first = 0;
}
} else {
for (i = 0; i < nsegs; i++) {
sgl->sg64[j].addr = segs[i].ds_addr + first;
sgl->sg64[j++].len = segs[i].ds_len - first;
first = 0;
}
hdr->flags |= MFI_FRAME_SGL64;
}
hdr->sg_count = j;
sge_size = sc->mfi_sge_size;
}
dir = 0;
if (cm->cm_flags & MFI_CMD_DATAIN) {
dir |= BUS_DMASYNC_PREREAD;
hdr->flags |= MFI_FRAME_DIR_READ;
}
if (cm->cm_flags & MFI_CMD_DATAOUT) {
dir |= BUS_DMASYNC_PREWRITE;
hdr->flags |= MFI_FRAME_DIR_WRITE;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
cm->cm_flags |= MFI_CMD_MAPPED;
/*
 * A compound frame always contains at least one frame, so only
 * the number of extra frames is computed; the (size - 1) /
 * MFI_FRAME_SIZE form below needs no additional modulo
 * correction.
 */
cm->cm_total_frame_size += (sc->mfi_sge_size * nsegs);
cm->cm_extra_frames = (cm->cm_total_frame_size - 1) / MFI_FRAME_SIZE;
if ((error = mfi_send_frame(sc, cm)) != 0) {
printf("error %d in callback from mfi_send_frame\n", error);
cm->cm_error = error;
mfi_complete(sc, cm);
goto out;
}
out:
/* leave the lock in the state we found it */
if (locked == 0)
mtx_unlock(&sc->mfi_io_lock);
return;
}
static int
mfi_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
{
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if (sc->MFA_enabled)
error = mfi_tbolt_send_frame(sc, cm);
else
error = mfi_std_send_frame(sc, cm);
if (error != 0 && (cm->cm_flags & MFI_ON_MFIQ_BUSY) != 0)
mfi_remove_busy(cm);
return (error);
}
static int
mfi_std_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
int tm = mfi_polled_cmd_timeout * 1000;
hdr = &cm->cm_frame->header;
if ((cm->cm_flags & MFI_CMD_POLLED) == 0) {
cm->cm_timestamp = time_uptime;
mfi_enqueue_busy(cm);
} else {
hdr->cmd_status = MFI_STAT_INVALID_STATUS;
hdr->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
}
/*
* The bus address of the command is aligned on a 64 byte boundary,
* leaving the low 6 bits as zero. For whatever reason, the
* hardware wants the address shifted right by three, leaving just
* 3 zero bits. These three bits are then used as a prefetching
* hint for the hardware to predict how many frames need to be
* fetched across the bus. If a command has more than 8 frames
* then the 3 bits are set to 0x7 and the firmware uses other
* information in the command to determine the total amount to fetch.
* However, FreeBSD doesn't support I/O larger than 128K, so 8 frames
* is enough for both 32bit and 64bit systems.
*/
if (cm->cm_extra_frames > 7)
cm->cm_extra_frames = 7;
sc->mfi_issue_cmd(sc, cm->cm_frame_busaddr, cm->cm_extra_frames);
if ((cm->cm_flags & MFI_CMD_POLLED) == 0)
return (0);
/* This is a polled command, so busy-wait for it to complete. */
while (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
DELAY(1000);
tm -= 1;
if (tm <= 0)
break;
}
if (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
device_printf(sc->mfi_dev, "Frame %p timed out "
"command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
return (ETIMEDOUT);
}
return (0);
}
void
mfi_complete(struct mfi_softc *sc, struct mfi_command *cm)
{
int dir;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm->cm_flags & MFI_CMD_MAPPED) != 0) {
dir = 0;
if ((cm->cm_flags & MFI_CMD_DATAIN) ||
(cm->cm_frame->header.cmd == MFI_CMD_STP))
dir |= BUS_DMASYNC_POSTREAD;
if (cm->cm_flags & MFI_CMD_DATAOUT)
dir |= BUS_DMASYNC_POSTWRITE;
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
cm->cm_flags &= ~MFI_CMD_MAPPED;
}
cm->cm_flags |= MFI_CMD_COMPLETED;
if (cm->cm_complete != NULL)
cm->cm_complete(cm);
else
wakeup(cm);
}
static int
mfi_abort(struct mfi_softc *sc, struct mfi_command **cm_abort)
{
struct mfi_command *cm;
struct mfi_abort_frame *abort;
int i = 0, error;
uint32_t context = 0;
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
abort = &cm->cm_frame->abort;
abort->header.cmd = MFI_CMD_ABORT;
abort->header.flags = 0;
abort->header.scsi_status = 0;
abort->abort_context = (*cm_abort)->cm_frame->header.context;
abort->abort_mfi_addr_lo = (uint32_t)(*cm_abort)->cm_frame_busaddr;
abort->abort_mfi_addr_hi =
(uint32_t)((uint64_t)(*cm_abort)->cm_frame_busaddr >> 32);
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0)
device_printf(sc->mfi_dev, "failed to abort command\n");
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
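/*
 * Wait up to 5 * 5 seconds for the aborted command to complete; its
 * completion handler clears *cm_abort.  If it never does, force its
 * completion below so that waiters are not left stuck.
 */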
while (i < 5 && *cm_abort != NULL) {
tsleep(cm_abort, 0, "mfiabort",
5 * hz);
i++;
}
if (*cm_abort != NULL) {
/* Force a complete if command didn't abort */
mtx_lock(&sc->mfi_io_lock);
(*cm_abort)->cm_complete(*cm_abort);
mtx_unlock(&sc->mfi_io_lock);
}
return (error);
}
int
mfi_dump_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt,
int len)
{
struct mfi_command *cm;
struct mfi_io_frame *io;
int error;
uint32_t context = 0;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
/* Zero out the MFI frame */
context = cm->cm_frame->header.context;
bzero(cm->cm_frame, sizeof(union mfi_frame));
cm->cm_frame->header.context = context;
io = &cm->cm_frame->io;
io->header.cmd = MFI_CMD_LD_WRITE;
io->header.target_id = id;
io->header.timeout = 0;
io->header.flags = 0;
io->header.scsi_status = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = howmany(len, MFI_SECTOR_LEN);
io->sense_addr_lo = (uint32_t)cm->cm_sense_busaddr;
io->sense_addr_hi = (uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
io->lba_hi = (lba & 0xffffffff00000000) >> 32;
io->lba_lo = lba & 0xffffffff;
cm->cm_data = virt;
cm->cm_len = len;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT;
if ((error = mfi_mapcmd(sc, cm)) != 0)
device_printf(sc->mfi_dev, "failed dump blocks\n");
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
mfi_release_command(cm);
return (error);
}
int
mfi_dump_syspd_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt,
int len)
{
struct mfi_command *cm;
struct mfi_pass_frame *pass;
int error, readop, cdb_len;
uint32_t blkcount;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
pass = &cm->cm_frame->pass;
bzero(pass->cdb, 16);
pass->header.cmd = MFI_CMD_PD_SCSI_IO;
readop = 0;
blkcount = howmany(len, MFI_SECTOR_LEN);
cdb_len = mfi_build_cdb(readop, 0, lba, blkcount, pass->cdb);
pass->header.target_id = id;
pass->header.timeout = 0;
pass->header.flags = 0;
pass->header.scsi_status = 0;
pass->header.sense_len = MFI_SENSE_LEN;
pass->header.data_len = len;
pass->header.cdb_len = cdb_len;
pass->sense_addr_lo = (uint32_t)cm->cm_sense_busaddr;
pass->sense_addr_hi = (uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
cm->cm_data = virt;
cm->cm_len = len;
cm->cm_sg = &pass->sgl;
cm->cm_total_frame_size = MFI_PASS_FRAME_SIZE;
cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT | MFI_CMD_SCSI;
if ((error = mfi_mapcmd(sc, cm)) != 0)
device_printf(sc->mfi_dev, "failed dump blocks\n");
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
mfi_release_command(cm);
return (error);
}
static int
mfi_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
int error;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
if (sc->mfi_detaching)
error = ENXIO;
else {
sc->mfi_flags |= MFI_FLAGS_OPEN;
error = 0;
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_aen *mfi_aen_entry, *tmp;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
sc->mfi_flags &= ~MFI_FLAGS_OPEN;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link, tmp) {
if (mfi_aen_entry->p == curproc) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
}
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
static int
mfi_config_lock(struct mfi_softc *sc, uint32_t opcode)
{
switch (opcode) {
case MFI_DCMD_LD_DELETE:
case MFI_DCMD_CFG_ADD:
case MFI_DCMD_CFG_CLEAR:
case MFI_DCMD_CFG_FOREIGN_IMPORT:
sx_xlock(&sc->mfi_config_lock);
return (1);
default:
return (0);
}
}
static void
mfi_config_unlock(struct mfi_softc *sc, int locked)
{
if (locked)
sx_xunlock(&sc->mfi_config_lock);
}
/*
* Perform pre-issue checks on commands from userland and possibly veto
* them.
*/
static int
mfi_check_command_pre(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ld2;
int error;
struct mfi_system_pd *syspd = NULL;
uint16_t syspd_id;
uint16_t *mbox;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = 0;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
if (ld == NULL)
error = ENOENT;
else
error = mfi_disk_disable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
error = mfi_disk_disable(ld);
if (error)
break;
}
if (error) {
TAILQ_FOREACH(ld2, &sc->mfi_ld_tqh, ld_link) {
if (ld2 == ld)
break;
mfi_disk_enable(ld2);
}
}
break;
case MFI_DCMD_PD_STATE_SET:
mbox = (uint16_t *) cm->cm_frame->dcmd.mbox;
syspd_id = mbox[0];
if (mbox[2] == MFI_PD_STATE_UNCONFIGURED_GOOD) {
TAILQ_FOREACH(syspd, &sc->mfi_syspd_tqh, pd_link) {
if (syspd->pd_id == syspd_id)
break;
}
} else
break;
if (syspd)
error = mfi_syspd_disable(syspd);
break;
default:
break;
}
return (error);
}
/* Perform post-issue checks on commands from userland. */
static void
mfi_check_command_post(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ldn;
struct mfi_system_pd *syspd = NULL;
uint16_t syspd_id;
uint16_t *mbox;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
KASSERT(ld != NULL, ("volume disappeared"));
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
device_delete_child(sc->mfi_dev, ld->ld_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else
mfi_disk_enable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
TAILQ_FOREACH_SAFE(ld, &sc->mfi_ld_tqh, ld_link, ldn) {
device_delete_child(sc->mfi_dev, ld->ld_dev);
}
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link)
mfi_disk_enable(ld);
}
break;
case MFI_DCMD_CFG_ADD:
mfi_ldprobe(sc);
break;
case MFI_DCMD_CFG_FOREIGN_IMPORT:
mfi_ldprobe(sc);
break;
case MFI_DCMD_PD_STATE_SET:
mbox = (uint16_t *) cm->cm_frame->dcmd.mbox;
syspd_id = mbox[0];
if (mbox[2] == MFI_PD_STATE_UNCONFIGURED_GOOD) {
TAILQ_FOREACH(syspd, &sc->mfi_syspd_tqh,pd_link) {
if (syspd->pd_id == syspd_id)
break;
}
} else
break;
/* If the transition fails then enable the syspd again */
if (syspd && cm->cm_frame->header.cmd_status != MFI_STAT_OK)
mfi_syspd_enable(syspd);
break;
}
}
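/*
 * Check whether a CFG_ADD or LD_DELETE ioctl targets an SSCD volume.
 * SSCD volumes never get an mfid child device (see mfi_add_ld()), so
 * the disk enable/disable pre/post checks are skipped for them.
 */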
static int
mfi_check_for_sscd(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_config_data *conf_data;
struct mfi_command *ld_cm = NULL;
struct mfi_ld_info *ld_info = NULL;
struct mfi_ld_config *ld;
char *p;
int error = 0;
conf_data = (struct mfi_config_data *)cm->cm_data;
if (cm->cm_frame->dcmd.opcode == MFI_DCMD_CFG_ADD) {
p = (char *)conf_data->array;
p += conf_data->array_size * conf_data->array_count;
ld = (struct mfi_ld_config *)p;
if (ld->params.isSSCD == 1)
error = 1;
} else if (cm->cm_frame->dcmd.opcode == MFI_DCMD_LD_DELETE) {
error = mfi_dcmd_command(sc, &ld_cm, MFI_DCMD_LD_GET_INFO,
(void **)&ld_info, sizeof(*ld_info));
if (error) {
device_printf(sc->mfi_dev, "Failed to allocate"
"MFI_DCMD_LD_GET_INFO %d", error);
if (ld_info)
free(ld_info, M_MFIBUF);
return 0;
}
ld_cm->cm_flags = MFI_CMD_DATAIN;
ld_cm->cm_frame->dcmd.mbox[0] = cm->cm_frame->dcmd.mbox[0];
ld_cm->cm_frame->header.target_id = cm->cm_frame->dcmd.mbox[0];
if (mfi_wait_command(sc, ld_cm) != 0) {
device_printf(sc->mfi_dev, "failed to get log drv\n");
mfi_release_command(ld_cm);
free(ld_info, M_MFIBUF);
return 0;
}
if (ld_cm->cm_frame->header.cmd_status != MFI_STAT_OK) {
free(ld_info, M_MFIBUF);
mfi_release_command(ld_cm);
return 0;
}
else
ld_info = (struct mfi_ld_info *)ld_cm->cm_private;
if (ld_info->ld_config.params.isSSCD == 1)
error = 1;
mfi_release_command(ld_cm);
free(ld_info, M_MFIBUF);
}
return error;
}
static int
mfi_stp_cmd(struct mfi_softc *sc, struct mfi_command *cm, caddr_t arg)
{
uint8_t i;
int sge_size, error;
struct mfi_ioc_packet *ioc;
struct megasas_sge *kern_sge;
ioc = (struct mfi_ioc_packet *)arg;
memset(sc->kbuff_arr, 0, sizeof(sc->kbuff_arr));
kern_sge = (struct megasas_sge *)((uintptr_t)cm->cm_frame + ioc->mfi_sgl_off);
cm->cm_frame->header.sg_count = ioc->mfi_sge_count;
if (sizeof(bus_addr_t) == 8) {
cm->cm_frame->header.flags |= MFI_FRAME_SGL64;
cm->cm_extra_frames = 2;
sge_size = sizeof(struct mfi_sg64);
} else {
cm->cm_extra_frames = (cm->cm_total_frame_size - 1) / MFI_FRAME_SIZE;
sge_size = sizeof(struct mfi_sg32);
}
cm->cm_total_frame_size += (sge_size * ioc->mfi_sge_count);
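/*
 * For each user S/G element allocate a DMA-able kernel bounce buffer,
 * copy the user data in, and point both the kernel S/G list and the
 * frame's SGL at the bounce buffer's bus address.
 */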
for (i = 0; i < ioc->mfi_sge_count; i++) {
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
ioc->mfi_sgl[i].iov_len,/* maxsize */
2, /* nsegments */
ioc->mfi_sgl[i].iov_len,/* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_kbuff_arr_dmat[i])) {
device_printf(sc->mfi_dev,
"Cannot allocate mfi_kbuff_arr_dmat tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_kbuff_arr_dmat[i],
(void **)&sc->kbuff_arr[i], BUS_DMA_NOWAIT,
&sc->mfi_kbuff_arr_dmamap[i])) {
device_printf(sc->mfi_dev,
"Cannot allocate mfi_kbuff_arr_dmamap memory\n");
return (ENOMEM);
}
bus_dmamap_load(sc->mfi_kbuff_arr_dmat[i],
sc->mfi_kbuff_arr_dmamap[i], sc->kbuff_arr[i],
ioc->mfi_sgl[i].iov_len, mfi_addr_cb,
&sc->mfi_kbuff_arr_busaddr[i], 0);
if (!sc->kbuff_arr[i]) {
device_printf(sc->mfi_dev,
"Could not allocate memory for kbuff_arr info\n");
return -1;
}
kern_sge[i].phys_addr = sc->mfi_kbuff_arr_busaddr[i];
kern_sge[i].length = ioc->mfi_sgl[i].iov_len;
if (sizeof(bus_addr_t) == 8) {
cm->cm_frame->stp.sgl.sg64[i].addr =
kern_sge[i].phys_addr;
cm->cm_frame->stp.sgl.sg64[i].len =
ioc->mfi_sgl[i].iov_len;
} else {
cm->cm_frame->stp.sgl.sg32[i].addr =
kern_sge[i].phys_addr;
cm->cm_frame->stp.sgl.sg32[i].len =
ioc->mfi_sgl[i].iov_len;
}
error = copyin(ioc->mfi_sgl[i].iov_base,
sc->kbuff_arr[i],
ioc->mfi_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev, "Copy in failed\n");
return error;
}
}
cm->cm_flags |= MFI_CMD_MAPPED;
return 0;
}
static int
mfi_user_command(struct mfi_softc *sc, struct mfi_ioc_passthru *ioc)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *ioc_buf = NULL;
uint32_t context;
int error = 0, locked;
if (ioc->buf_size > 0) {
if (ioc->buf_size > 1024 * 1024)
return (ENOMEM);
ioc_buf = malloc(ioc->buf_size, M_MFIBUF, M_WAITOK);
error = copyin(ioc->buf, ioc_buf, ioc->buf_size);
if (error) {
device_printf(sc->mfi_dev, "failed to copyin\n");
free(ioc_buf, M_MFIBUF);
return (error);
}
}
locked = mfi_config_lock(sc, ioc->ioc_frame.opcode);
mtx_lock(&sc->mfi_io_lock);
while ((cm = mfi_dequeue_free(sc)) == NULL)
msleep(mfi_user_command, &sc->mfi_io_lock, 0, "mfiioc", hz);
/* Save context for later */
context = cm->cm_frame->header.context;
dcmd = &cm->cm_frame->dcmd;
bcopy(&ioc->ioc_frame, dcmd, sizeof(struct mfi_dcmd_frame));
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_data = ioc_buf;
cm->cm_len = ioc->buf_size;
/* restore context */
cm->cm_frame->header.context = context;
/* Cheat since we don't know if we're writing or reading */
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
error = mfi_check_command_pre(sc, cm);
if (error)
goto out;
error = mfi_wait_command(sc, cm);
if (error) {
device_printf(sc->mfi_dev, "ioctl failed %d\n", error);
goto out;
}
bcopy(dcmd, &ioc->ioc_frame, sizeof(struct mfi_dcmd_frame));
mfi_check_command_post(sc, cm);
out:
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mfi_config_unlock(sc, locked);
if (ioc->buf_size > 0)
error = copyout(ioc_buf, ioc->buf, ioc->buf_size);
if (ioc_buf)
free(ioc_buf, M_MFIBUF);
return (error);
}
#define PTRIN(p) ((void *)(uintptr_t)(p))
static int
mfi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
union mfi_statrequest *ms;
struct mfi_ioc_packet *ioc;
#ifdef COMPAT_FREEBSD32
struct mfi_ioc_packet32 *ioc32;
#endif
struct mfi_ioc_aen *aen;
struct mfi_command *cm = NULL;
uint32_t context = 0;
union mfi_sense_ptr sense_ptr;
uint8_t *data = NULL, *temp, *addr, skip_pre_post = 0;
size_t len;
int i, res;
struct mfi_ioc_passthru *iop = (struct mfi_ioc_passthru *)arg;
#ifdef COMPAT_FREEBSD32
struct mfi_ioc_passthru32 *iop32 = (struct mfi_ioc_passthru32 *)arg;
struct mfi_ioc_passthru iop_swab;
#endif
int error, locked;
union mfi_sgl *sgl;
sc = dev->si_drv1;
error = 0;
if (sc->adpreset)
return EBUSY;
if (sc->hw_crit_error)
return EBUSY;
if (sc->issuepend_done == 0)
return EBUSY;
switch (cmd) {
case MFIIO_STATS:
ms = (union mfi_statrequest *)arg;
switch (ms->ms_item) {
case MFIQ_FREE:
case MFIQ_BIO:
case MFIQ_READY:
case MFIQ_BUSY:
bcopy(&sc->mfi_qstat[ms->ms_item], &ms->ms_qstat,
sizeof(struct mfi_qstat));
break;
default:
error = ENOIOCTL;
break;
}
break;
case MFIIO_QUERY_DISK:
{
struct mfi_query_disk *qd;
struct mfi_disk *ld;
qd = (struct mfi_query_disk *)arg;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == qd->array_id)
break;
}
if (ld == NULL) {
qd->present = 0;
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
qd->present = 1;
if (ld->ld_flags & MFI_DISK_FLAGS_OPEN)
qd->open = 1;
bzero(qd->devname, SPECNAMELEN + 1);
snprintf(qd->devname, SPECNAMELEN, "mfid%d", ld->ld_unit);
mtx_unlock(&sc->mfi_io_lock);
break;
}
case MFI_CMD:
#ifdef COMPAT_FREEBSD32
case MFI_CMD32:
#endif
{
devclass_t devclass;
ioc = (struct mfi_ioc_packet *)arg;
int adapter;
adapter = ioc->mfi_adapter_no;
if (device_get_unit(sc->mfi_dev) == 0 && adapter != 0) {
devclass = devclass_find("mfi");
sc = devclass_get_softc(devclass, adapter);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
cm->cm_frame->header.context = cm->cm_index;
bcopy(ioc->mfi_frame.raw, cm->cm_frame,
2 * MEGAMFI_FRAME_SIZE);
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* ioc->mfi_sge_count) + ioc->mfi_sgl_off;
cm->cm_frame->header.scsi_status = 0;
cm->cm_frame->header.pad0 = 0;
if (ioc->mfi_sge_count) {
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[ioc->mfi_sgl_off];
}
sgl = cm->cm_sg;
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
/* Legacy app shim */
if (cm->cm_flags == 0)
cm->cm_flags |= MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_frame->header.cmd == MFI_CMD_STP) {
#ifdef COMPAT_FREEBSD32
if (cmd == MFI_CMD) {
#endif
/* Native */
cm->cm_stp_len = ioc->mfi_sgl[0].iov_len;
#ifdef COMPAT_FREEBSD32
} else {
/* 32bit on 64bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
cm->cm_stp_len = ioc32->mfi_sgl[0].iov_len;
}
#endif
cm->cm_len += cm->cm_stp_len;
}
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
if (cm->cm_frame->header.cmd == MFI_CMD_STP) {
res = mfi_stp_cmd(sc, cm, arg);
if (res != 0)
goto out;
} else {
temp = data;
if ((cm->cm_flags & MFI_CMD_DATAOUT) ||
(cm->cm_frame->header.cmd == MFI_CMD_STP)) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef COMPAT_FREEBSD32
if (cmd == MFI_CMD) {
#endif
/* Native */
addr = ioc->mfi_sgl[i].iov_base;
len = ioc->mfi_sgl[i].iov_len;
#ifdef COMPAT_FREEBSD32
} else {
/* 32bit on 64bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
addr = PTRIN(ioc32->mfi_sgl[i].iov_base);
len = ioc32->mfi_sgl[i].iov_len;
}
#endif
error = copyin(addr, temp, len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[len];
}
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc,
cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo =
(uint32_t)cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi =
(uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
}
mtx_lock(&sc->mfi_io_lock);
skip_pre_post = mfi_check_for_sscd(sc, cm);
if (!skip_pre_post) {
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if (!skip_pre_post) {
mfi_check_command_post(sc, cm);
}
mtx_unlock(&sc->mfi_io_lock);
if (cm->cm_frame->header.cmd != MFI_CMD_STP) {
temp = data;
if ((cm->cm_flags & MFI_CMD_DATAIN) ||
(cm->cm_frame->header.cmd == MFI_CMD_STP)) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef COMPAT_FREEBSD32
if (cmd == MFI_CMD) {
#endif
/* Native */
addr = ioc->mfi_sgl[i].iov_base;
len = ioc->mfi_sgl[i].iov_len;
#ifdef COMPAT_FREEBSD32
} else {
/* 32bit on 64bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
addr = PTRIN(ioc32->mfi_sgl[i].iov_base);
len = ioc32->mfi_sgl[i].iov_len;
}
#endif
error = copyout(temp, addr, len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[len];
}
}
}
if (ioc->mfi_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&ioc->mfi_frame.raw[ioc->mfi_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef COMPAT_FREEBSD32
if (cmd != MFI_CMD) {
/*
* not 64bit native so zero out any address
* over 32bit
*/
sense_ptr.addr.high = 0;
}
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
ioc->mfi_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
ioc->mfi_frame.hdr.cmd_status = cm->cm_frame->header.cmd_status;
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm->cm_frame->header.cmd == MFI_CMD_STP) {
for (i = 0; i < 2; i++) {
if (sc->kbuff_arr[i]) {
if (sc->mfi_kbuff_arr_busaddr[i] != 0)
bus_dmamap_unload(
sc->mfi_kbuff_arr_dmat[i],
sc->mfi_kbuff_arr_dmamap[i]
);
if (sc->kbuff_arr[i] != NULL)
bus_dmamem_free(
sc->mfi_kbuff_arr_dmat[i],
sc->kbuff_arr[i],
sc->mfi_kbuff_arr_dmamap[i]
);
if (sc->mfi_kbuff_arr_dmat[i] != NULL)
bus_dma_tag_destroy(
sc->mfi_kbuff_arr_dmat[i]);
}
}
}
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
break;
}
case MFI_SET_AEN:
aen = (struct mfi_ioc_aen *)arg;
mtx_lock(&sc->mfi_io_lock);
error = mfi_aen_register(sc, aen->aen_seq_num,
aen->aen_class_locale);
mtx_unlock(&sc->mfi_io_lock);
break;
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_packet l_ioc;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error)
return (error);
adapter = l_ioc.lioc_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_aen l_aen;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error)
return (error);
adapter = l_aen.laen_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
#ifdef COMPAT_FREEBSD32
case MFIIO_PASSTHRU32:
if (!SV_CURPROC_FLAG(SV_ILP32)) {
error = ENOTTY;
break;
}
iop_swab.ioc_frame = iop32->ioc_frame;
iop_swab.buf_size = iop32->buf_size;
iop_swab.buf = PTRIN(iop32->buf);
iop = &iop_swab;
/* FALLTHROUGH */
#endif
case MFIIO_PASSTHRU:
error = mfi_user_command(sc, iop);
#ifdef COMPAT_FREEBSD32
if (cmd == MFIIO_PASSTHRU32)
iop32->ioc_frame = iop_swab.ioc_frame;
#endif
break;
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOTTY;
break;
}
return (error);
}
static int
mfi_linux_ioctl_int(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_linux_ioc_packet l_ioc;
struct mfi_linux_ioc_aen l_aen;
struct mfi_command *cm = NULL;
struct mfi_aen *mfi_aen_entry;
union mfi_sense_ptr sense_ptr;
uint32_t context = 0;
uint8_t *data = NULL, *temp;
int i;
int error, locked;
sc = dev->si_drv1;
error = 0;
switch (cmd) {
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error != 0)
return (error);
if (l_ioc.lioc_sge_count > MAX_LINUX_IOCTL_SGE) {
return (EINVAL);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
bcopy(l_ioc.lioc_frame.raw, cm->cm_frame,
2 * MFI_DCMD_FRAME_SIZE); /* this isn't quite right */
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* l_ioc.lioc_sge_count) + l_ioc.lioc_sgl_off;
cm->cm_frame->header.scsi_status = 0;
cm->cm_frame->header.pad0 = 0;
if (l_ioc.lioc_sge_count)
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[l_ioc.lioc_sgl_off];
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
temp = data;
if (cm->cm_flags & MFI_CMD_DATAOUT) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyin(PTRIN(l_ioc.lioc_sgl[i].iov_base),
temp,
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc, cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo =
(uint32_t)cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi =
(uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32);
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
mfi_check_command_post(sc, cm);
mtx_unlock(&sc->mfi_io_lock);
temp = data;
if (cm->cm_flags & MFI_CMD_DATAIN) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyout(temp,
PTRIN(l_ioc.lioc_sgl[i].iov_base),
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (l_ioc.lioc_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.raw[l_ioc.lioc_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef __amd64__
/*
* only 32-bit Linux is supported, so zero out any
* address bits above 32
*/
sense_ptr.addr.high = 0;
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
l_ioc.lioc_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
error = copyout(&cm->cm_frame->header.cmd_status,
&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.hdr.cmd_status,
1);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
return (error);
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error != 0)
return (error);
printf("AEN IMPLEMENTED for pid %d\n", curproc->p_pid);
mfi_aen_entry = malloc(sizeof(struct mfi_aen), M_MFIBUF,
M_WAITOK);
mtx_lock(&sc->mfi_io_lock);
if (mfi_aen_entry != NULL) {
mfi_aen_entry->p = curproc;
TAILQ_INSERT_TAIL(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
}
error = mfi_aen_register(sc, l_aen.laen_seq_num,
l_aen.laen_class_locale);
if (error != 0) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOENT;
break;
}
return (error);
}
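/*
 * Poll handler for the control device: report the device readable once an
 * AEN has triggered, POLLERR when no AEN command is outstanding, and
 * otherwise record the thread for a later selwakeup().
 */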
static int
mfi_poll(struct cdev *dev, int poll_events, struct thread *td)
{
struct mfi_softc *sc;
int revents = 0;
sc = dev->si_drv1;
if (poll_events & (POLLIN | POLLRDNORM)) {
if (sc->mfi_aen_triggered != 0) {
revents |= poll_events & (POLLIN | POLLRDNORM);
sc->mfi_aen_triggered = 0;
}
if (sc->mfi_aen_triggered == 0 && sc->mfi_aen_cm == NULL) {
revents |= POLLERR;
}
}
if (revents == 0) {
if (poll_events & (POLLIN | POLLRDNORM)) {
sc->mfi_poll_waiting = 1;
selrecord(td, &sc->mfi_select);
}
}
return revents;
}
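/*
 * Debug helper: walk every mfi(4) instance and print any command that has
 * been on the busy queue for longer than mfi_cmd_timeout seconds.
 */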
static void
mfi_dump_all(void)
{
struct mfi_softc *sc;
struct mfi_command *cm;
devclass_t dc;
time_t deadline;
int timedout;
int i;
dc = devclass_find("mfi");
if (dc == NULL) {
printf("No mfi dev class\n");
return;
}
for (i = 0; ; i++) {
sc = devclass_get_softc(dc, i);
if (sc == NULL)
break;
device_printf(sc->mfi_dev, "Dumping\n\n");
timedout = 0;
deadline = time_uptime - mfi_cmd_timeout;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (cm->cm_timestamp <= deadline) {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n",
cm, (int)(time_uptime - cm->cm_timestamp));
MFI_PRINT_CMD(cm);
timedout++;
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(sc);
#endif
mtx_unlock(&sc->mfi_io_lock);
}
return;
}
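/*
 * Watchdog callout: report (but do not fail) commands that have been busy
 * longer than mfi_cmd_timeout seconds, then rearm itself for the next
 * interval.
 */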
static void
mfi_timeout(void *data)
{
struct mfi_softc *sc = (struct mfi_softc *)data;
struct mfi_command *cm, *tmp;
time_t deadline;
int timedout = 0;
deadline = time_uptime - mfi_cmd_timeout;
if (sc->adpreset == 0) {
if (!mfi_tbolt_reset(sc)) {
callout_reset(&sc->mfi_watchdog_callout,
mfi_cmd_timeout * hz, mfi_timeout, sc);
return;
}
}
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH_SAFE(cm, &sc->mfi_busy, cm_link, tmp) {
if (sc->mfi_aen_cm == cm || sc->mfi_map_sync_cm == cm)
continue;
if (cm->cm_timestamp <= deadline) {
if (sc->adpreset != 0 && sc->issuepend_done == 0) {
cm->cm_timestamp = time_uptime;
} else {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n",
cm, (int)(time_uptime - cm->cm_timestamp)
);
MFI_PRINT_CMD(cm);
MFI_VALIDATE_CMD(sc, cm);
/*
* While commands can get stuck forever, we do
* not fail them, as there is no way to tell
* whether the controller has actually processed
* them or not.
*
* In addition, it's very likely that force-
* failing a command here would cause a panic,
* e.g. in UFS.
*/
timedout++;
}
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(sc);
#endif
mtx_unlock(&sc->mfi_io_lock);
callout_reset(&sc->mfi_watchdog_callout, mfi_cmd_timeout * hz,
mfi_timeout, sc);
if (0)
mfi_dump_all();
return;
}
diff --git a/sys/dev/mpr/mpr.c b/sys/dev/mpr/mpr.c
index 197016562219..678fe5052fca 100644
--- a/sys/dev/mpr/mpr.c
+++ b/sys/dev/mpr/mpr.c
@@ -1,3981 +1,3981 @@
/*-
* Copyright (c) 2009 Yahoo! Inc.
* Copyright (c) 2011-2015 LSI Corp.
* Copyright (c) 2013-2016 Avago Technologies
* Copyright 2000-2020 Broadcom Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Broadcom Inc. (LSI) MPT-Fusion Host Adapter FreeBSD
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* Communications core for Avago Technologies (LSI) MPT3 */
/* TODO Move headers to mprvar */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/selinfo.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/taskqueue.h>
#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/sbuf.h>
#include <sys/priv.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/rman.h>
#include <sys/proc.h>
#include <dev/pci/pcivar.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/scsi/scsi_all.h>
#include <dev/mpr/mpi/mpi2_type.h>
#include <dev/mpr/mpi/mpi2.h>
#include <dev/mpr/mpi/mpi2_ioc.h>
#include <dev/mpr/mpi/mpi2_sas.h>
#include <dev/mpr/mpi/mpi2_pci.h>
#include <dev/mpr/mpi/mpi2_cnfg.h>
#include <dev/mpr/mpi/mpi2_init.h>
#include <dev/mpr/mpi/mpi2_tool.h>
#include <dev/mpr/mpr_ioctl.h>
#include <dev/mpr/mprvar.h>
#include <dev/mpr/mpr_table.h>
#include <dev/mpr/mpr_sas.h>
static int mpr_diag_reset(struct mpr_softc *sc, int sleep_flag);
static int mpr_init_queues(struct mpr_softc *sc);
static void mpr_resize_queues(struct mpr_softc *sc);
static int mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag);
static int mpr_transition_operational(struct mpr_softc *sc);
static int mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching);
static void mpr_iocfacts_free(struct mpr_softc *sc);
static void mpr_startup(void *arg);
static int mpr_send_iocinit(struct mpr_softc *sc);
static int mpr_alloc_queues(struct mpr_softc *sc);
static int mpr_alloc_hw_queues(struct mpr_softc *sc);
static int mpr_alloc_replies(struct mpr_softc *sc);
static int mpr_alloc_requests(struct mpr_softc *sc);
static int mpr_alloc_nvme_prp_pages(struct mpr_softc *sc);
static int mpr_attach_log(struct mpr_softc *sc);
static __inline void mpr_complete_command(struct mpr_softc *sc,
struct mpr_command *cm);
static void mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *reply);
static void mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm);
static void mpr_periodic(void *);
static int mpr_reregister_events(struct mpr_softc *sc);
static void mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm);
static int mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
static int mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag);
static int mpr_debug_sysctl(SYSCTL_HANDLER_ARGS);
static int mpr_dump_reqs(SYSCTL_HANDLER_ARGS);
static void mpr_parse_debug(struct mpr_softc *sc, char *list);
SYSCTL_NODE(_hw, OID_AUTO, mpr, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"MPR Driver Parameters");
MALLOC_DEFINE(M_MPR, "mpr", "mpr driver memory");
/*
* Do a "Diagnostic Reset" aka a hard reset. This should get the chip out of
* any state and back to its initialization state machine.
*/
static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };
/*
* This union is used to smoothly convert cm->cm_desc.Words for le64toh().
* The compiler only accepts a plain uint64_t argument; passing the
* aggregate directly produces this error:
* "aggregate value used where an integer was expected"
*/
typedef union {
u64 word;
struct {
u32 low;
u32 high;
} u;
} request_descriptor_t;
/* Rate limit chain-fail messages to 1 per minute */
static struct timeval mpr_chainfail_interval = { 60, 0 };
/*
* sleep_flag can be either CAN_SLEEP or NO_SLEEP.
* If this function is called from process context it can sleep, and
* there is no harm in sleeping.  If it is called from an interrupt
* handler, it must not sleep and the NO_SLEEP flag must be set.
* Based on the sleep flag the driver will call msleep, pause or DELAY.
* msleep and pause are variants of the same mechanism, but pause is
* used when mpr_mtx is not held by the driver.
*/
static int
mpr_diag_reset(struct mpr_softc *sc,int sleep_flag)
{
uint32_t reg;
int i, error, tries = 0;
uint8_t first_wait_done = FALSE;
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
/* Clear any pending interrupts */
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/*
* Force NO_SLEEP for threads that are prohibited from sleeping,
* e.g. threads running from an interrupt handler.
*/
if (curthread->td_no_sleeping)
sleep_flag = NO_SLEEP;
mpr_dprint(sc, MPR_INIT, "sequence start, sleep_flag=%d\n", sleep_flag);
/* Push the magic sequence */
error = ETIMEDOUT;
while (tries++ < 20) {
for (i = 0; i < sizeof(mpt2_reset_magic); i++)
mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
mpt2_reset_magic[i]);
/* wait 100 msec */
if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
"mprdiag", hz/10);
else if (sleep_flag == CAN_SLEEP)
pause("mprdiag", hz/10);
else
DELAY(100 * 1000);
reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
error = 0;
break;
}
}
if (error) {
mpr_dprint(sc, MPR_INIT, "sequence failed, error=%d, exit\n",
error);
return (error);
}
/* Send the actual reset. XXX need to refresh the reg? */
reg |= MPI2_DIAG_RESET_ADAPTER;
mpr_dprint(sc, MPR_INIT, "sequence success, sending reset, reg= 0x%x\n",
reg);
mpr_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg);
/* Wait up to 300 seconds in 50ms intervals */
error = ETIMEDOUT;
for (i = 0; i < 6000; i++) {
/*
* Wait 50 msec. If this is the first time through, wait 256
* msec to satisfy Diag Reset timing requirements.
*/
if (first_wait_done) {
if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
"mprdiag", hz/20);
else if (sleep_flag == CAN_SLEEP)
pause("mprdiag", hz/20);
else
DELAY(50 * 1000);
} else {
DELAY(256 * 1000);
first_wait_done = TRUE;
}
/*
* Check for the RESET_ADAPTER bit to be cleared first, then
* wait for the RESET state to be cleared, which takes a little
* longer.
*/
reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
if (reg & MPI2_DIAG_RESET_ADAPTER) {
continue;
}
reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
error = 0;
break;
}
}
if (error) {
mpr_dprint(sc, MPR_INIT, "reset failed, error= %d, exit\n",
error);
return (error);
}
mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
mpr_dprint(sc, MPR_INIT, "diag reset success, exit\n");
return (0);
}
static int
mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag)
{
int error;
MPR_FUNCTRACE(sc);
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
error = 0;
mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
MPI2_DOORBELL_FUNCTION_SHIFT);
if (mpr_wait_db_ack(sc, 5, sleep_flag) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"Doorbell handshake failed\n");
error = ETIMEDOUT;
}
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
return (error);
}
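/*
 * Bring the IOC to the READY state, issuing diagnostic or message unit
 * resets as needed based on the state reported in the doorbell register.
 */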
static int
mpr_transition_ready(struct mpr_softc *sc)
{
uint32_t reg, state;
int error, tries = 0;
int sleep_flags;
MPR_FUNCTRACE(sc);
/* If we are in attach call, do not sleep */
sleep_flags = (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE)
? CAN_SLEEP : NO_SLEEP;
error = 0;
mpr_dprint(sc, MPR_INIT, "%s entered, sleep_flags= %d\n",
__func__, sleep_flags);
while (tries++ < 1200) {
reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
mpr_dprint(sc, MPR_INIT, " Doorbell= 0x%x\n", reg);
/*
* Ensure the IOC is ready to talk. If it's not, try
* resetting it.
*/
if (reg & MPI2_DOORBELL_USED) {
mpr_dprint(sc, MPR_INIT, " Not ready, sending diag "
"reset\n");
mpr_diag_reset(sc, sleep_flags);
DELAY(50000);
continue;
}
/* Is the adapter owned by another peer? */
if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
(MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC is under the "
"control of another peer host, aborting "
"initialization.\n");
error = ENXIO;
break;
}
state = reg & MPI2_IOC_STATE_MASK;
if (state == MPI2_IOC_STATE_READY) {
/* Ready to go! */
error = 0;
break;
} else if (state == MPI2_IOC_STATE_FAULT) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC in fault "
"state 0x%x, resetting\n",
state & MPI2_DOORBELL_FAULT_CODE_MASK);
mpr_diag_reset(sc, sleep_flags);
} else if (state == MPI2_IOC_STATE_OPERATIONAL) {
/* Need to take ownership */
mpr_message_unit_reset(sc, sleep_flags);
} else if (state == MPI2_IOC_STATE_RESET) {
/* Wait a bit, IOC might be in transition */
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"IOC in unexpected reset state\n");
} else {
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"IOC in unknown state 0x%x\n", state);
error = EINVAL;
break;
}
/* Wait 50ms for things to settle down. */
DELAY(50000);
}
if (error)
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"Cannot transition IOC to ready\n");
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
return (error);
}
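/*
 * Bring the IOC to the OPERATIONAL state: transition it to READY first if
 * necessary, then send the IOCInit request.
 */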
static int
mpr_transition_operational(struct mpr_softc *sc)
{
uint32_t reg, state;
int error;
MPR_FUNCTRACE(sc);
error = 0;
reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
mpr_dprint(sc, MPR_INIT, "%s entered, Doorbell= 0x%x\n", __func__, reg);
state = reg & MPI2_IOC_STATE_MASK;
if (state != MPI2_IOC_STATE_READY) {
mpr_dprint(sc, MPR_INIT, "IOC not ready\n");
if ((error = mpr_transition_ready(sc)) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"failed to transition ready, exit\n");
return (error);
}
}
error = mpr_send_iocinit(sc);
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
return (error);
}
static void
mpr_resize_queues(struct mpr_softc *sc)
{
u_int reqcr, prireqcr, maxio, sges_per_frame, chain_seg_size;
/*
* Size the queues. Since the reply queues always need one free
* entry, we'll deduct one reply message here. The LSI documents
* suggest adding a count to the request queue instead, but I think
* it's better to deduct from the reply queue.
*/
prireqcr = MAX(1, sc->max_prireqframes);
prireqcr = MIN(prireqcr, sc->facts->HighPriorityCredit);
reqcr = MAX(2, sc->max_reqframes);
reqcr = MIN(reqcr, sc->facts->RequestCredit);
sc->num_reqs = prireqcr + reqcr;
sc->num_prireqs = prireqcr;
sc->num_replies = MIN(sc->max_replyframes + sc->max_evtframes,
sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;
/* Store the request frame size in bytes rather than as 32bit words */
sc->reqframesz = sc->facts->IOCRequestFrameSize * 4;
/*
* Gen3 and beyond use the IOCMaxChainSegmentSize from IOC Facts to
* get the size of a Chain Frame.  Previous generations use the Request
* Frame size as the Chain Frame size.  If IOCMaxChainSegmentSize is 0,
* use the default value.  IOCMaxChainSegmentSize is the number of
* 16-byte elements that can fit in a Chain Frame; 16 bytes is the size
* of an IEEE Simple SGE.
*/
if (sc->facts->MsgVersion >= MPI2_VERSION_02_05) {
chain_seg_size = htole16(sc->facts->IOCMaxChainSegmentSize);
if (chain_seg_size == 0)
chain_seg_size = MPR_DEFAULT_CHAIN_SEG_SIZE;
sc->chain_frame_size = chain_seg_size *
MPR_MAX_CHAIN_ELEMENT_SIZE;
} else {
sc->chain_frame_size = sc->reqframesz;
}
/*
* Max IO Size is Page Size * the following:
* ((SGEs per frame - 1 for chain element) * Max Chain Depth)
* + 1 for no chain needed in last frame
*
* If user suggests a Max IO size to use, use the smaller of the
* user's value and the calculated value as long as the user's
* value is larger than 0. The user's value is in pages.
*/
sges_per_frame = sc->chain_frame_size/sizeof(MPI2_IEEE_SGE_SIMPLE64)-1;
maxio = (sges_per_frame * sc->facts->MaxChainDepth + 1) * PAGE_SIZE;
/*
* If I/O size limitation requested then use it and pass up to CAM.
- * If not, use MAXPHYS as an optimization hint, but report HW limit.
+ * If not, use maxphys as an optimization hint, but report HW limit.
*/
if (sc->max_io_pages > 0) {
maxio = min(maxio, sc->max_io_pages * PAGE_SIZE);
sc->maxio = maxio;
} else {
sc->maxio = maxio;
- maxio = min(maxio, MAXPHYS);
+ maxio = min(maxio, maxphys);
}
sc->num_chains = (maxio / PAGE_SIZE + sges_per_frame - 2) /
sges_per_frame * reqcr;
if (sc->max_chains > 0 && sc->max_chains < sc->num_chains)
sc->num_chains = sc->max_chains;
/*
* Figure out the number of MSIx-based queues. If the firmware or
* user has done something crazy and not allowed enough credit for
* the queues to be useful then don't enable multi-queue.
*/
if (sc->facts->MaxMSIxVectors < 2)
sc->msi_msgs = 1;
if (sc->msi_msgs > 1) {
sc->msi_msgs = MIN(sc->msi_msgs, mp_ncpus);
sc->msi_msgs = MIN(sc->msi_msgs, sc->facts->MaxMSIxVectors);
if (sc->num_reqs / sc->msi_msgs < 2)
sc->msi_msgs = 1;
}
mpr_dprint(sc, MPR_INIT, "Sized queues to q=%d reqs=%d replies=%d\n",
sc->msi_msgs, sc->num_reqs, sc->num_replies);
}
/*
* This is called during attach and when re-initializing due to a Diag Reset.
* IOC Facts is used to allocate many of the structures needed by the driver.
* If called from attach, de-allocation is not required because the driver has
* not allocated any structures yet, but if called from a Diag Reset, previously
* allocated structures based on IOC Facts will need to be freed and
* reallocated based on the latest IOC Facts.
*/
static int
mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching)
{
int error;
Mpi2IOCFactsReply_t saved_facts;
uint8_t saved_mode, reallocating;
mpr_dprint(sc, MPR_INIT|MPR_TRACE, "%s entered\n", __func__);
/* Save old IOC Facts and then only reallocate if Facts have changed */
if (!attaching) {
bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
}
/*
* Get IOC Facts. In all cases throughout this function, panic if doing
* a re-initialization and only return the error if attaching so the OS
* can handle it.
*/
if ((error = mpr_get_iocfacts(sc, sc->facts)) != 0) {
if (attaching) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to get "
"IOC Facts with error %d, exit\n", error);
return (error);
} else {
panic("%s failed to get IOC Facts with error %d\n",
__func__, error);
}
}
MPR_DPRINT_PAGE(sc, MPR_XINFO, iocfacts, sc->facts);
snprintf(sc->fw_version, sizeof(sc->fw_version),
"%02d.%02d.%02d.%02d",
sc->facts->FWVersion.Struct.Major,
sc->facts->FWVersion.Struct.Minor,
sc->facts->FWVersion.Struct.Unit,
sc->facts->FWVersion.Struct.Dev);
snprintf(sc->msg_version, sizeof(sc->msg_version), "%d.%d",
(sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MAJOR_MASK) >>
MPI2_IOCFACTS_MSGVERSION_MAJOR_SHIFT,
(sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MINOR_MASK) >>
MPI2_IOCFACTS_MSGVERSION_MINOR_SHIFT);
mpr_dprint(sc, MPR_INFO, "Firmware: %s, Driver: %s\n", sc->fw_version,
MPR_DRIVER_VERSION);
mpr_dprint(sc, MPR_INFO,
"IOCCapabilities: %b\n", sc->facts->IOCCapabilities,
"\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
"\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
"\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"
"\22FastPath" "\23RDPQArray" "\24AtomicReqDesc" "\25PCIeSRIOV");
/*
* If the chip doesn't support event replay then a hard reset will be
* required to trigger a full discovery. Do the reset here then
* retransition to Ready. A hard reset might have already been done,
* but it doesn't hurt to do it again. Only do this if attaching, not
* for a Diag Reset.
*/
if (attaching && ((sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0)) {
mpr_dprint(sc, MPR_INIT, "No event replay, resetting\n");
mpr_diag_reset(sc, NO_SLEEP);
if ((error = mpr_transition_ready(sc)) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
"transition to ready with error %d, exit\n",
error);
return (error);
}
}
/*
* Set flag if IR Firmware is loaded. If the RAID Capability has
* changed from the previous IOC Facts, log a warning, but only if
* checking this after a Diag Reset and not during attach.
*/
saved_mode = sc->ir_firmware;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
sc->ir_firmware = 1;
if (!attaching) {
if (sc->ir_firmware != saved_mode) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "new IR/IT mode "
"in IOC Facts does not match previous mode\n");
}
}
/* Only deallocate and reallocate if relevant IOC Facts have changed */
reallocating = FALSE;
sc->mpr_flags &= ~MPR_FLAGS_REALLOCATED;
if ((!attaching) &&
((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
(saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
(saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
(saved_facts.RequestCredit != sc->facts->RequestCredit) ||
(saved_facts.ProductID != sc->facts->ProductID) ||
(saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
(saved_facts.IOCRequestFrameSize !=
sc->facts->IOCRequestFrameSize) ||
(saved_facts.IOCMaxChainSegmentSize !=
sc->facts->IOCMaxChainSegmentSize) ||
(saved_facts.MaxTargets != sc->facts->MaxTargets) ||
(saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
(saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
(saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
(saved_facts.MaxReplyDescriptorPostQueueDepth !=
sc->facts->MaxReplyDescriptorPostQueueDepth) ||
(saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
(saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
(saved_facts.MaxPersistentEntries !=
sc->facts->MaxPersistentEntries))) {
reallocating = TRUE;
/* Record that we reallocated everything */
sc->mpr_flags |= MPR_FLAGS_REALLOCATED;
}
/*
* Some things should be done if attaching or re-allocating after a Diag
* Reset, but are not needed after a Diag Reset if the FW has not
* changed.
*/
if (attaching || reallocating) {
/*
* Check if controller supports FW diag buffers and set flag to
* enable each type.
*/
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
enabled = TRUE;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
enabled = TRUE;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
enabled = TRUE;
/*
* Set flags for some supported items.
*/
if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
sc->eedp_enabled = TRUE;
if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
sc->control_TLR = TRUE;
if ((sc->facts->IOCCapabilities &
MPI26_IOCFACTS_CAPABILITY_ATOMIC_REQ) &&
(sc->mpr_flags & MPR_FLAGS_SEA_IOC))
sc->atomic_desc_capable = TRUE;
mpr_resize_queues(sc);
/*
* Initialize all Tail Queues
*/
TAILQ_INIT(&sc->req_list);
TAILQ_INIT(&sc->high_priority_req_list);
TAILQ_INIT(&sc->chain_list);
TAILQ_INIT(&sc->prp_page_list);
TAILQ_INIT(&sc->tm_list);
}
/*
* If doing a Diag Reset and the FW is significantly different
* (reallocating will be set above in IOC Facts comparison), then all
* buffers based on the IOC Facts will need to be freed before they are
* reallocated.
*/
if (reallocating) {
mpr_iocfacts_free(sc);
mprsas_realloc_targets(sc, saved_facts.MaxTargets +
saved_facts.MaxVolumes);
}
/*
* Any deallocation has been completed. Now start reallocating
* if needed. Will only need to reallocate if attaching or if the new
* IOC Facts are different from the previous IOC Facts after a Diag
* Reset. Targets have already been allocated above if needed.
*/
error = 0;
while (attaching || reallocating) {
if ((error = mpr_alloc_hw_queues(sc)) != 0)
break;
if ((error = mpr_alloc_replies(sc)) != 0)
break;
if ((error = mpr_alloc_requests(sc)) != 0)
break;
if ((error = mpr_alloc_queues(sc)) != 0)
break;
break;
}
if (error) {
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"Failed to alloc queues with error %d\n", error);
mpr_free(sc);
return (error);
}
/* Always initialize the queues */
bzero(sc->free_queue, sc->fqdepth * 4);
mpr_init_queues(sc);
/*
* Always get the chip out of the reset state, but only panic if not
* attaching. If attaching and there is an error, that is handled by
* the OS.
*/
error = mpr_transition_operational(sc);
if (error != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
"transition to operational with error %d\n", error);
mpr_free(sc);
return (error);
}
/*
* Finish the queue initialization.
* These are set here instead of in mpr_init_queues() because the
* IOC resets these values during the state transition in
* mpr_transition_operational(). The free index is set to 1
* because the corresponding index in the IOC is set to 0, and the
* IOC treats the queues as full if both are set to the same value.
* Hence the reason that the queue can't hold all of the possible
* replies.
*/
sc->replypostindex = 0;
mpr_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
/*
* Attach the subsystems so they can prepare their event masks.
* XXX Should be dynamic so that IM/IR and user modules can attach
*/
error = 0;
while (attaching) {
mpr_dprint(sc, MPR_INIT, "Attaching subsystems\n");
if ((error = mpr_attach_log(sc)) != 0)
break;
if ((error = mpr_attach_sas(sc)) != 0)
break;
if ((error = mpr_attach_user(sc)) != 0)
break;
break;
}
if (error) {
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"Failed to attach all subsystems: error %d\n", error);
mpr_free(sc);
return (error);
}
/*
* XXX If the number of MSI-X vectors changes during re-init, this
* won't see it and adjust.
*/
if (attaching && (error = mpr_pci_setup_interrupts(sc)) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"Failed to setup interrupts\n");
mpr_free(sc);
return (error);
}
return (error);
}
/*
* This is called when memory is being freed (during detach, for example) and when
* buffers need to be reallocated due to a Diag Reset.
*/
static void
mpr_iocfacts_free(struct mpr_softc *sc)
{
struct mpr_command *cm;
int i;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
if (sc->free_busaddr != 0)
bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
if (sc->free_queue != NULL)
bus_dmamem_free(sc->queues_dmat, sc->free_queue,
sc->queues_map);
if (sc->queues_dmat != NULL)
bus_dma_tag_destroy(sc->queues_dmat);
if (sc->chain_frames != NULL) {
bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
sc->chain_map);
}
if (sc->chain_dmat != NULL)
bus_dma_tag_destroy(sc->chain_dmat);
if (sc->sense_busaddr != 0)
bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
if (sc->sense_frames != NULL)
bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
sc->sense_map);
if (sc->sense_dmat != NULL)
bus_dma_tag_destroy(sc->sense_dmat);
if (sc->prp_page_busaddr != 0)
bus_dmamap_unload(sc->prp_page_dmat, sc->prp_page_map);
if (sc->prp_pages != NULL)
bus_dmamem_free(sc->prp_page_dmat, sc->prp_pages,
sc->prp_page_map);
if (sc->prp_page_dmat != NULL)
bus_dma_tag_destroy(sc->prp_page_dmat);
if (sc->reply_busaddr != 0)
bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
if (sc->reply_frames != NULL)
bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
sc->reply_map);
if (sc->reply_dmat != NULL)
bus_dma_tag_destroy(sc->reply_dmat);
if (sc->req_busaddr != 0)
bus_dmamap_unload(sc->req_dmat, sc->req_map);
if (sc->req_frames != NULL)
bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
if (sc->req_dmat != NULL)
bus_dma_tag_destroy(sc->req_dmat);
if (sc->chains != NULL)
free(sc->chains, M_MPR);
if (sc->prps != NULL)
free(sc->prps, M_MPR);
if (sc->commands != NULL) {
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
}
free(sc->commands, M_MPR);
}
if (sc->buffer_dmat != NULL)
bus_dma_tag_destroy(sc->buffer_dmat);
mpr_pci_free_interrupts(sc);
free(sc->queues, M_MPR);
sc->queues = NULL;
}
/*
* The terms diag reset and hard reset are used interchangeably in the MPI
* docs to mean resetting the controller chip. In this code diag reset
* cleans everything up, and the hard reset function just sends the reset
* sequence to the chip. This should probably be refactored so that every
* subsystem gets a reset notification of some sort, and can clean up
* appropriately.
*/
int
mpr_reinit(struct mpr_softc *sc)
{
int error;
struct mprsas_softc *sassc;
sassc = sc->sassc;
MPR_FUNCTRACE(sc);
mtx_assert(&sc->mpr_mtx, MA_OWNED);
mpr_dprint(sc, MPR_INIT|MPR_INFO, "Reinitializing controller\n");
if (sc->mpr_flags & MPR_FLAGS_DIAGRESET) {
mpr_dprint(sc, MPR_INIT, "Reset already in progress\n");
return 0;
}
/*
* Make sure the completion callbacks can recognize they're getting
* a NULL cm_reply due to a reset.
*/
sc->mpr_flags |= MPR_FLAGS_DIAGRESET;
/*
* Mask interrupts here.
*/
mpr_dprint(sc, MPR_INIT, "Masking interrupts and resetting\n");
mpr_mask_intr(sc);
error = mpr_diag_reset(sc, CAN_SLEEP);
if (error != 0) {
panic("%s hard reset failed with error %d\n", __func__, error);
}
/* Restore the PCI state, including the MSI-X registers */
mpr_pci_restore(sc);
/* Give the I/O subsystem special priority to get itself prepared */
mprsas_handle_reinit(sc);
/*
* Get IOC Facts and allocate all structures based on this information.
* The attach function will also call mpr_iocfacts_allocate at startup.
* If relevant values have changed in IOC Facts, this function will free
* all of the memory based on IOC Facts and reallocate that memory.
*/
if ((error = mpr_iocfacts_allocate(sc, FALSE)) != 0) {
panic("%s IOC Facts based allocation failed with error %d\n",
__func__, error);
}
/*
* Mapping structures will be re-allocated after getting IOC Page8, so
* free these structures here.
*/
mpr_mapping_exit(sc);
/*
* The static page function currently read is IOC Page8. Others can be
* added in future. It's possible that the values in IOC Page8 have
* changed after a Diag Reset due to user modification, so always read
* these. Interrupts are masked, so unmask them before getting config
* pages.
*/
mpr_unmask_intr(sc);
sc->mpr_flags &= ~MPR_FLAGS_DIAGRESET;
mpr_base_static_config_pages(sc);
/*
* Some mapping info is based in IOC Page8 data, so re-initialize the
* mapping tables.
*/
mpr_mapping_initialize(sc);
/*
* Restart will reload the event masks clobbered by the reset, and
* then enable the port.
*/
mpr_reregister_events(sc);
/* the end of discovery will release the simq, so we're done. */
mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Finished sc %p post %u free %u\n",
sc, sc->replypostindex, sc->replyfreeindex);
mprsas_release_simq_reinit(sassc);
mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);
return 0;
}
/* Wait for the chip to ACK a word that we've put into its FIFO.
 * Wait up to <timeout> seconds.  Each loop iteration busy-waits for
 * 500 microseconds.
 * Total is [ 0.5 * (2000 * <timeout>) ] in milliseconds.
 */
static int
mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag)
{
u32 cntdn, count;
u32 int_status;
u32 doorbell;
count = 0;
cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
do {
int_status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
mpr_dprint(sc, MPR_TRACE, "%s: successful count(%d), "
"timeout(%d)\n", __func__, count, timeout);
return 0;
} else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
doorbell = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
if ((doorbell & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_FAULT) {
mpr_dprint(sc, MPR_FAULT,
"fault_state(0x%04x)!\n", doorbell);
return (EFAULT);
}
} else if (int_status == 0xFFFFFFFF)
goto out;
/*
* If it can sleep, sleep for 1 millisecond, else busy-wait for
* 0.5 milliseconds
*/
if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0, "mprdba",
hz/1000);
else if (sleep_flag == CAN_SLEEP)
pause("mprdba", hz/1000);
else
DELAY(500);
count++;
} while (--cntdn);
out:
mpr_dprint(sc, MPR_FAULT, "%s: failed due to timeout count(%d), "
"int_status(%x)!\n", __func__, count, int_status);
return (ETIMEDOUT);
}
/* Wait for the chip to signal that the next word in its FIFO can be fetched */
static int
mpr_wait_db_int(struct mpr_softc *sc)
{
int retry;
for (retry = 0; retry < MPR_DB_MAX_WAIT; retry++) {
if ((mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
return (0);
DELAY(2000);
}
return (ETIMEDOUT);
}
/* Step through the synchronous command state machine, i.e. "Doorbell mode" */
static int
mpr_request_sync(struct mpr_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
int req_sz, int reply_sz, int timeout)
{
uint32_t *data32;
uint16_t *data16;
int i, count, ioc_sz, residual;
int sleep_flags = CAN_SLEEP;
if (curthread->td_no_sleeping)
sleep_flags = NO_SLEEP;
/* Step 1 */
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/* Step 2 */
if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
return (EBUSY);
/* Step 3
* Announce that a message is coming through the doorbell. Messages
* are pushed as 32-bit words, so round up if needed.
*/
count = (req_sz + 3) / 4;
mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
(MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
(count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));
/* Step 4 */
if (mpr_wait_db_int(sc) ||
(mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
mpr_dprint(sc, MPR_FAULT, "Doorbell failed to activate\n");
return (ENXIO);
}
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
mpr_dprint(sc, MPR_FAULT, "Doorbell handshake failed\n");
return (ENXIO);
}
/* Step 5 */
/* Clock out the message data synchronously in 32-bit dwords*/
data32 = (uint32_t *)req;
for (i = 0; i < count; i++) {
mpr_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
mpr_dprint(sc, MPR_FAULT,
"Timeout while writing doorbell\n");
return (ENXIO);
}
}
/* Step 6 */
/* Clock in the reply in 16-bit words. The total length of the
* message is always in the 4th byte, so clock out the first 2 words
* manually, then loop the rest.
*/
data16 = (uint16_t *)reply;
if (mpr_wait_db_int(sc) != 0) {
mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 0\n");
return (ENXIO);
}
data16[0] =
mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
if (mpr_wait_db_int(sc) != 0) {
mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 1\n");
return (ENXIO);
}
data16[1] =
mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/* Number of 32bit words in the message */
ioc_sz = reply->MsgLength;
/*
* Figure out how many 16bit words to clock in without overrunning.
* The precision loss with dividing reply_sz can safely be
* ignored because the messages can only be multiples of 32bits.
*/
residual = 0;
count = MIN((reply_sz / 4), ioc_sz) * 2;
if (count < ioc_sz * 2) {
residual = ioc_sz * 2 - count;
mpr_dprint(sc, MPR_ERROR, "Driver error, throwing away %d "
"residual message words\n", residual);
}
for (i = 2; i < count; i++) {
if (mpr_wait_db_int(sc) != 0) {
mpr_dprint(sc, MPR_FAULT,
"Timeout reading doorbell %d\n", i);
return (ENXIO);
}
data16[i] = mpr_regread(sc, MPI2_DOORBELL_OFFSET) &
MPI2_DOORBELL_DATA_MASK;
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
}
/*
* Pull out residual words that won't fit into the provided buffer.
* This keeps the chip from hanging due to a driver programming
* error.
*/
while (residual--) {
if (mpr_wait_db_int(sc) != 0) {
mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell\n");
return (ENXIO);
}
(void)mpr_regread(sc, MPI2_DOORBELL_OFFSET);
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
}
/* Step 7 */
if (mpr_wait_db_int(sc) != 0) {
mpr_dprint(sc, MPR_FAULT, "Timeout waiting to exit doorbell\n");
return (ENXIO);
}
if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
mpr_dprint(sc, MPR_FAULT, "Warning, doorbell still active\n");
mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
return (0);
}
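/*
 * Hand a request descriptor to the IOC.  Atomic-capable controllers take a
 * single 32-bit write of the low descriptor word; older controllers need
 * the low and high dwords written to separate registers.
 */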
static void
mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm)
{
request_descriptor_t rd;
MPR_FUNCTRACE(sc);
mpr_dprint(sc, MPR_TRACE, "SMID %u cm %p ccb %p\n",
cm->cm_desc.Default.SMID, cm, cm->cm_ccb);
if (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE && !(sc->mpr_flags &
MPR_FLAGS_SHUTDOWN))
mtx_assert(&sc->mpr_mtx, MA_OWNED);
if (++sc->io_cmds_active > sc->io_cmds_highwater)
sc->io_cmds_highwater++;
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY, ("command not busy\n"));
cm->cm_state = MPR_CM_STATE_INQUEUE;
if (sc->atomic_desc_capable) {
rd.u.low = cm->cm_desc.Words.Low;
mpr_regwrite(sc, MPI26_ATOMIC_REQUEST_DESCRIPTOR_POST_OFFSET,
rd.u.low);
} else {
rd.u.low = cm->cm_desc.Words.Low;
rd.u.high = cm->cm_desc.Words.High;
rd.word = htole64(rd.word);
mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET,
rd.u.low);
mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET,
rd.u.high);
}
}
/*
* Just the FACTS, ma'am.
*/
static int
mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
{
MPI2_DEFAULT_REPLY *reply;
MPI2_IOC_FACTS_REQUEST request;
int error, req_sz, reply_sz;
MPR_FUNCTRACE(sc);
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
req_sz = sizeof(MPI2_IOC_FACTS_REQUEST);
reply_sz = sizeof(MPI2_IOC_FACTS_REPLY);
reply = (MPI2_DEFAULT_REPLY *)facts;
bzero(&request, req_sz);
request.Function = MPI2_FUNCTION_IOC_FACTS;
error = mpr_request_sync(sc, &request, reply, req_sz, reply_sz, 5);
mpr_dprint(sc, MPR_INIT, "%s exit, error= %d\n", __func__, error);
return (error);
}
static int
mpr_send_iocinit(struct mpr_softc *sc)
{
MPI2_IOC_INIT_REQUEST init;
MPI2_DEFAULT_REPLY reply;
int req_sz, reply_sz, error;
struct timeval now;
uint64_t time_in_msec;
MPR_FUNCTRACE(sc);
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
/* Do a quick sanity check on proper initialization */
if ((sc->pqdepth == 0) || (sc->fqdepth == 0) || (sc->reqframesz == 0)
|| (sc->replyframesz == 0)) {
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"Driver not fully initialized for IOCInit\n");
return (EINVAL);
}
req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
bzero(&init, req_sz);
bzero(&reply, reply_sz);
/*
* Fill in the init block. Note that most addresses are
* deliberately in the lower 32 bits of memory.  This is a micro-
* optimization for PCI/PCI-X, though it's not clear if it helps PCIe.
*/
init.Function = MPI2_FUNCTION_IOC_INIT;
init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
init.MsgVersion = htole16(MPI2_VERSION);
init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
init.SystemRequestFrameSize = htole16((uint16_t)(sc->reqframesz / 4));
init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
init.SenseBufferAddressHigh = 0;
init.SystemReplyAddressHigh = 0;
init.SystemRequestFrameBaseAddress.High = 0;
init.SystemRequestFrameBaseAddress.Low =
htole32((uint32_t)sc->req_busaddr);
init.ReplyDescriptorPostQueueAddress.High = 0;
init.ReplyDescriptorPostQueueAddress.Low =
htole32((uint32_t)sc->post_busaddr);
init.ReplyFreeQueueAddress.High = 0;
init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
getmicrotime(&now);
time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000);
init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
init.HostPageSize = HOST_PAGE_SIZE_4K;
error = mpr_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
error = ENXIO;
mpr_dprint(sc, MPR_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus);
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
return (error);
}
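/*
 * Simple busdma callback: store the bus address of the single segment in
 * the caller-supplied pointer.
 */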
void
mpr_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
bus_addr_t *addr;
addr = arg;
*addr = segs[0].ds_addr;
}
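/*
 * Busdma callback for loads whose sleeping caller may time out and abandon
 * the context: record the bus address on success, unload any mapping on
 * failure or abandonment, wake the caller if it is still waiting, and free
 * the context if it is not.
 */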
void
mpr_memaddr_wait_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mpr_busdma_context *ctx;
int need_unload, need_free;
ctx = (struct mpr_busdma_context *)arg;
need_unload = 0;
need_free = 0;
mpr_lock(ctx->softc);
ctx->error = error;
ctx->completed = 1;
if ((error == 0) && (ctx->abandoned == 0)) {
*ctx->addr = segs[0].ds_addr;
} else {
if (nsegs != 0)
need_unload = 1;
if (ctx->abandoned != 0)
need_free = 1;
}
if (need_free == 0)
wakeup(ctx);
mpr_unlock(ctx->softc);
if (need_unload != 0) {
bus_dmamap_unload(ctx->buffer_dmat,
ctx->buffer_dmamap);
*ctx->addr = 0;
}
if (need_free != 0)
free(ctx, M_MPR);
}
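/*
 * Allocate the per-interrupt queue bookkeeping structures, one struct
 * mpr_queue per MSI-X message.
 */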
static int
mpr_alloc_queues(struct mpr_softc *sc)
{
struct mpr_queue *q;
int nq, i;
nq = sc->msi_msgs;
mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Allocating %d I/O queues\n", nq);
sc->queues = malloc(sizeof(struct mpr_queue) * nq, M_MPR,
M_NOWAIT|M_ZERO);
if (sc->queues == NULL)
return (ENOMEM);
for (i = 0; i < nq; i++) {
q = &sc->queues[i];
mpr_dprint(sc, MPR_INIT, "Configuring queue %d %p\n", i, q);
q->sc = sc;
q->qnum = i;
}
return (0);
}
static int
mpr_alloc_hw_queues(struct mpr_softc *sc)
{
bus_dma_template_t t;
bus_addr_t queues_busaddr;
uint8_t *queues;
int qsize, fqsize, pqsize;
/*
* The reply free queue contains 4 byte entries in multiples of 16 and
* aligned on a 16 byte boundary. There must always be an unused entry.
* This queue supplies fresh reply frames for the firmware to use.
*
* The reply descriptor post queue contains 8 byte entries in
* multiples of 16 and aligned on a 16 byte boundary. This queue
* contains filled-in reply frames sent from the firmware to the host.
*
* These two queues are allocated together for simplicity.
*/
sc->fqdepth = roundup2(sc->num_replies + 1, 16);
sc->pqdepth = roundup2(sc->num_replies + 1, 16);
fqsize= sc->fqdepth * 4;
pqsize = sc->pqdepth * 8;
qsize = fqsize + pqsize;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(qsize),
BD_MAXSEGSIZE(qsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->queues_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT,
&sc->queues_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues memory\n");
return (ENOMEM);
}
bzero(queues, qsize);
bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize,
mpr_memaddr_cb, &queues_busaddr, 0);
sc->free_queue = (uint32_t *)queues;
sc->free_busaddr = queues_busaddr;
sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize);
sc->post_busaddr = queues_busaddr + fqsize;
mpr_dprint(sc, MPR_INIT, "free queue busaddr= %#016jx size= %d\n",
(uintmax_t)sc->free_busaddr, fqsize);
mpr_dprint(sc, MPR_INIT, "reply queue busaddr= %#016jx size= %d\n",
(uintmax_t)sc->post_busaddr, pqsize);
return (0);
}
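/*
 * Allocate the DMA-able pool of reply frames that is handed to the firmware
 * through the reply free queue.
 */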
static int
mpr_alloc_replies(struct mpr_softc *sc)
{
bus_dma_template_t t;
int rsize, num_replies;
/* Store the reply frame size in bytes rather than as 32bit words */
sc->replyframesz = sc->facts->ReplyFrameSize * 4;
/*
* sc->num_replies should be one less than sc->fqdepth. We need to
* allocate space for sc->fqdepth replies, but only sc->num_replies
* replies can be used at once.
*/
num_replies = max(sc->fqdepth, sc->num_replies);
rsize = sc->replyframesz * num_replies;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->reply_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
BUS_DMA_NOWAIT, &sc->reply_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies memory\n");
return (ENOMEM);
}
bzero(sc->reply_frames, rsize);
bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize,
mpr_memaddr_cb, &sc->reply_busaddr, 0);
mpr_dprint(sc, MPR_INIT, "reply frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->reply_busaddr, rsize);
return (0);
}
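/*
 * Busdma callback that carves the loaded chain-frame buffer into
 * chain_frame_size pieces, never letting a frame straddle a segment
 * boundary, and places each piece on the free chain list.
 */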
static void
mpr_load_chains_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mpr_softc *sc = arg;
struct mpr_chain *chain;
bus_size_t bo;
int i, o, s;
if (error != 0)
return;
for (i = 0, o = 0, s = 0; s < nsegs; s++) {
for (bo = 0; bo + sc->chain_frame_size <= segs[s].ds_len;
bo += sc->chain_frame_size) {
chain = &sc->chains[i++];
chain->chain =(MPI2_SGE_IO_UNION *)(sc->chain_frames+o);
chain->chain_busaddr = segs[s].ds_addr + bo;
o += sc->chain_frame_size;
mpr_free_chain(sc, chain);
}
if (bo != segs[s].ds_len)
o += segs[s].ds_len - bo;
}
sc->chain_free_lowwater = i;
}
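/*
 * Allocate the DMA-able request frames, chain frames and sense buffers, the
 * NVMe PRP pages when the firmware supports NVMe devices, and the command
 * array that ties them all together.
 */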
static int
mpr_alloc_requests(struct mpr_softc *sc)
{
bus_dma_template_t t;
struct mpr_command *cm;
int i, rsize, nsegs;
rsize = sc->reqframesz * sc->num_reqs;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->req_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate request DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
BUS_DMA_NOWAIT, &sc->req_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate request memory\n");
return (ENOMEM);
}
bzero(sc->req_frames, rsize);
bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
mpr_memaddr_cb, &sc->req_busaddr, 0);
mpr_dprint(sc, MPR_INIT, "request frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->req_busaddr, rsize);
sc->chains = malloc(sizeof(struct mpr_chain) * sc->num_chains, M_MPR,
M_NOWAIT | M_ZERO);
if (!sc->chains) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
return (ENOMEM);
}
rsize = sc->chain_frame_size * sc->num_chains;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS((howmany(rsize, PAGE_SIZE))));
if (bus_dma_template_tag(&t, &sc->chain_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
BUS_DMA_NOWAIT | BUS_DMA_ZERO, &sc->chain_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames,
rsize, mpr_load_chains_cb, sc, BUS_DMA_NOWAIT)) {
mpr_dprint(sc, MPR_ERROR, "Cannot load chain memory\n");
bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
sc->chain_map);
return (ENOMEM);
}
rsize = MPR_SENSE_LEN * sc->num_reqs;
bus_dma_template_clone(&t, sc->req_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(1), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize));
if (bus_dma_template_tag(&t, &sc->sense_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
BUS_DMA_NOWAIT, &sc->sense_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense memory\n");
return (ENOMEM);
}
bzero(sc->sense_frames, rsize);
bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
mpr_memaddr_cb, &sc->sense_busaddr, 0);
mpr_dprint(sc, MPR_INIT, "sense frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->sense_busaddr, rsize);
/*
* Allocate NVMe PRP Pages for NVMe SGL support only if the FW supports
* these devices.
*/
if ((sc->facts->MsgVersion >= MPI2_VERSION_02_06) &&
(sc->facts->ProtocolFlags & MPI2_IOCFACTS_PROTOCOL_NVME_DEVICES)) {
if (mpr_alloc_nvme_prp_pages(sc) == ENOMEM)
return (ENOMEM);
}
nsegs = (sc->maxio / PAGE_SIZE) + 1;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_MAXSIZE(BUS_SPACE_MAXSIZE_32BIT),
BD_NSEGMENTS(nsegs), BD_MAXSEGSIZE(BUS_SPACE_MAXSIZE_32BIT),
BD_FLAGS(BUS_DMA_ALLOCNOW), BD_LOCKFUNC(busdma_lock_mutex),
BD_LOCKFUNCARG(&sc->mpr_mtx));
if (bus_dma_template_tag(&t, &sc->buffer_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate buffer DMA tag\n");
return (ENOMEM);
}
/*
* SMID 0 cannot be used as a free command per the firmware spec.
* Just drop that command instead of risking accounting bugs.
*/
sc->commands = malloc(sizeof(struct mpr_command) * sc->num_reqs,
M_MPR, M_WAITOK | M_ZERO);
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
cm->cm_req = sc->req_frames + i * sc->reqframesz;
cm->cm_req_busaddr = sc->req_busaddr + i * sc->reqframesz;
cm->cm_sense = &sc->sense_frames[i];
cm->cm_sense_busaddr = sc->sense_busaddr + i * MPR_SENSE_LEN;
cm->cm_desc.Default.SMID = i;
cm->cm_sc = sc;
cm->cm_state = MPR_CM_STATE_BUSY;
TAILQ_INIT(&cm->cm_chain_list);
TAILQ_INIT(&cm->cm_prp_page_list);
callout_init_mtx(&cm->cm_callout, &sc->mpr_mtx, 0);
/* XXX Is a failure here a critical problem? */
if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap)
== 0) {
if (i <= sc->num_prireqs)
mpr_free_high_priority_command(sc, cm);
else
mpr_free_command(sc, cm);
} else {
panic("failed to allocate command %d\n", i);
sc->num_reqs = i;
break;
}
}
return (0);
}
/*
* Allocate contiguous buffers for PCIe NVMe devices for building native PRPs,
* which are scatter/gather lists for NVMe devices.
*
* This buffer must be contiguous due to the nature of how NVMe PRPs are built
* and translated by FW.
*
* returns ENOMEM if memory could not be allocated, otherwise returns 0.
*/
static int
mpr_alloc_nvme_prp_pages(struct mpr_softc *sc)
{
bus_dma_template_t t;
struct mpr_prp_page *prp_page;
int PRPs_per_page, PRPs_required, pages_required;
int rsize, i;
/*
* Assuming a MAX_IO_SIZE of 1MB and a PAGE_SIZE of 4k, the max number
* of PRPs (NVMe's Scatter/Gather Element) needed per I/O is:
* MAX_IO_SIZE / PAGE_SIZE = 256
*
* 1 PRP entry in main frame for PRP list pointer still leaves 255 PRPs
* required for the remainder of the 1MB I/O. 512 PRPs can fit into one
* page (4096 / 8 = 512), so only one page is required for each I/O.
*
* Each of these buffers will need to be contiguous. For simplicity,
* only one buffer is allocated here, which has all of the space
* required for the NVMe Queue Depth. If there are problems allocating
* this one buffer, this function will need to change to allocate
* individual, contiguous NVME_QDEPTH buffers.
*
* The real calculation will use the real max io size. Above is just an
* example.
*
*/
PRPs_required = sc->maxio / PAGE_SIZE;
PRPs_per_page = (PAGE_SIZE / PRP_ENTRY_SIZE) - 1;
pages_required = (PRPs_required / PRPs_per_page) + 1;
sc->prp_buffer_size = PAGE_SIZE * pages_required;
rsize = sc->prp_buffer_size * NVME_QDEPTH;
bus_dma_template_init(&t, sc->mpr_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->prp_page_dmat)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP DMA "
"tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->prp_page_dmat, (void **)&sc->prp_pages,
BUS_DMA_NOWAIT, &sc->prp_page_map)) {
mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP memory\n");
return (ENOMEM);
}
bzero(sc->prp_pages, rsize);
bus_dmamap_load(sc->prp_page_dmat, sc->prp_page_map, sc->prp_pages,
rsize, mpr_memaddr_cb, &sc->prp_page_busaddr, 0);
sc->prps = malloc(sizeof(struct mpr_prp_page) * NVME_QDEPTH, M_MPR,
M_WAITOK | M_ZERO);
for (i = 0; i < NVME_QDEPTH; i++) {
prp_page = &sc->prps[i];
prp_page->prp_page = (uint64_t *)(sc->prp_pages +
i * sc->prp_buffer_size);
prp_page->prp_page_busaddr = (uint64_t)(sc->prp_page_busaddr +
i * sc->prp_buffer_size);
mpr_free_prp_page(sc, prp_page);
sc->prp_pages_free_lowwater++;
}
return (0);
}
static int
mpr_init_queues(struct mpr_softc *sc)
{
int i;
memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
/*
* According to the spec, we need to use one less reply than we
* have space for on the queue. So sc->num_replies (the number we
* use) should be less than sc->fqdepth (allocated size).
*/
if (sc->num_replies >= sc->fqdepth)
return (EINVAL);
/*
* Initialize all of the free queue entries.
*/
for (i = 0; i < sc->fqdepth; i++) {
sc->free_queue[i] = sc->reply_busaddr + (i * sc->replyframesz);
}
sc->replyfreeindex = sc->num_replies;
return (0);
}
/* Get the driver parameter tunables. Lowest priority are the driver defaults.
* Next are the global settings, if they exist. Highest are the per-unit
* settings, if they exist.
*/
void
mpr_get_tunables(struct mpr_softc *sc)
{
char tmpstr[80], mpr_debug[80];
/* XXX default to some debugging for now */
sc->mpr_debug = MPR_INFO | MPR_FAULT;
sc->disable_msix = 0;
sc->disable_msi = 0;
sc->max_msix = MPR_MSIX_MAX;
sc->max_chains = MPR_CHAIN_FRAMES;
sc->max_io_pages = MPR_MAXIO_PAGES;
sc->enable_ssu = MPR_SSU_ENABLE_SSD_DISABLE_HDD;
sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
sc->use_phynum = 1;
sc->max_reqframes = MPR_REQ_FRAMES;
sc->max_prireqframes = MPR_PRI_REQ_FRAMES;
sc->max_replyframes = MPR_REPLY_FRAMES;
sc->max_evtframes = MPR_EVT_REPLY_FRAMES;
/*
* Grab the global variables.
*/
bzero(mpr_debug, 80);
if (TUNABLE_STR_FETCH("hw.mpr.debug_level", mpr_debug, 80) != 0)
mpr_parse_debug(sc, mpr_debug);
TUNABLE_INT_FETCH("hw.mpr.disable_msix", &sc->disable_msix);
TUNABLE_INT_FETCH("hw.mpr.disable_msi", &sc->disable_msi);
TUNABLE_INT_FETCH("hw.mpr.max_msix", &sc->max_msix);
TUNABLE_INT_FETCH("hw.mpr.max_chains", &sc->max_chains);
TUNABLE_INT_FETCH("hw.mpr.max_io_pages", &sc->max_io_pages);
TUNABLE_INT_FETCH("hw.mpr.enable_ssu", &sc->enable_ssu);
TUNABLE_INT_FETCH("hw.mpr.spinup_wait_time", &sc->spinup_wait_time);
TUNABLE_INT_FETCH("hw.mpr.use_phy_num", &sc->use_phynum);
TUNABLE_INT_FETCH("hw.mpr.max_reqframes", &sc->max_reqframes);
TUNABLE_INT_FETCH("hw.mpr.max_prireqframes", &sc->max_prireqframes);
TUNABLE_INT_FETCH("hw.mpr.max_replyframes", &sc->max_replyframes);
TUNABLE_INT_FETCH("hw.mpr.max_evtframes", &sc->max_evtframes);
/* Grab the unit-instance variables */
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.debug_level",
device_get_unit(sc->mpr_dev));
bzero(mpr_debug, 80);
if (TUNABLE_STR_FETCH(tmpstr, mpr_debug, 80) != 0)
mpr_parse_debug(sc, mpr_debug);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msix",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msi",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_msix",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_msix);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_chains",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_io_pages",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.exclude_ids",
device_get_unit(sc->mpr_dev));
TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.enable_ssu",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.spinup_wait_time",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.use_phy_num",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_reqframes",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_reqframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_prireqframes",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_prireqframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_replyframes",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_replyframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_evtframes",
device_get_unit(sc->mpr_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_evtframes);
}
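/*
* Register the per-controller sysctl nodes, under the device's sysctl tree if
* one exists or under hw.mpr.<unit> otherwise, exposing the debug level,
* tunables, and driver statistics.
*/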
static void
mpr_setup_sysctl(struct mpr_softc *sc)
{
struct sysctl_ctx_list *sysctl_ctx = NULL;
struct sysctl_oid *sysctl_tree = NULL;
char tmpstr[80], tmpstr2[80];
/*
* Setup the sysctl variable so the user can change the debug level
* on the fly.
*/
snprintf(tmpstr, sizeof(tmpstr), "MPR controller %d",
device_get_unit(sc->mpr_dev));
snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mpr_dev));
sysctl_ctx = device_get_sysctl_ctx(sc->mpr_dev);
if (sysctl_ctx != NULL)
sysctl_tree = device_get_sysctl_tree(sc->mpr_dev);
if (sysctl_tree == NULL) {
sysctl_ctx_init(&sc->sysctl_ctx);
sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw_mpr), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr);
if (sc->sysctl_tree == NULL)
return;
sysctl_ctx = &sc->sysctl_ctx;
sysctl_tree = sc->sysctl_tree;
}
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "debug_level", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
sc, 0, mpr_debug_sysctl, "A", "mpr debug level");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
"Disable the use of MSI-X interrupts");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_msix", CTLFLAG_RD, &sc->max_msix, 0,
"User-defined maximum number of MSIX queues");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "msix_msgs", CTLFLAG_RD, &sc->msi_msgs, 0,
"Negotiated number of MSIX queues");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_reqframes", CTLFLAG_RD, &sc->max_reqframes, 0,
"Total number of allocated request frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_prireqframes", CTLFLAG_RD, &sc->max_prireqframes, 0,
"Total number of allocated high priority request frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_replyframes", CTLFLAG_RD, &sc->max_replyframes, 0,
"Total number of allocated reply frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_evtframes", CTLFLAG_RD, &sc->max_evtframes, 0,
"Total number of event frames allocated");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version,
strlen(sc->fw_version), "firmware version");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "driver_version", CTLFLAG_RD, MPR_DRIVER_VERSION,
strlen(MPR_DRIVER_VERSION), "driver version");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "msg_version", CTLFLAG_RD, sc->msg_version,
strlen(sc->msg_version), "message interface version");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "io_cmds_active", CTLFLAG_RD,
&sc->io_cmds_active, 0, "number of currently active commands");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
&sc->io_cmds_highwater, 0, "maximum active commands seen");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_free", CTLFLAG_RD,
&sc->chain_free, 0, "number of free chain elements");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
&sc->chain_free_lowwater, 0,"lowest number of free chain elements");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_chains", CTLFLAG_RD,
&sc->max_chains, 0,"maximum chain frames that will be allocated");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_io_pages", CTLFLAG_RD,
&sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
"IOCFacts)");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
"enable SSU to SATA SSD/HDD at shutdown");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
&sc->chain_alloc_fail, "chain allocation failures");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
&sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
"spinup after SATA ID error");
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "dump_reqs",
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT,
sc, 0, mpr_dump_reqs, "I", "Dump Active Requests");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
"Use the phy number for enumeration");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "prp_pages_free", CTLFLAG_RD,
&sc->prp_pages_free, 0, "number of free PRP pages");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "prp_pages_free_lowwater", CTLFLAG_RD,
&sc->prp_pages_free_lowwater, 0,"lowest number of free PRP pages");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "prp_page_alloc_fail", CTLFLAG_RD,
&sc->prp_page_alloc_fail, "PRP page allocation failures");
}
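/* Map of debug level names to flag bits, used by the debug_level sysctl. */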
static struct mpr_debug_string {
char *name;
int flag;
} mpr_debug_strings[] = {
{"info", MPR_INFO},
{"fault", MPR_FAULT},
{"event", MPR_EVENT},
{"log", MPR_LOG},
{"recovery", MPR_RECOVERY},
{"error", MPR_ERROR},
{"init", MPR_INIT},
{"xinfo", MPR_XINFO},
{"user", MPR_USER},
{"mapping", MPR_MAPPING},
{"trace", MPR_TRACE}
};
enum mpr_debug_level_combiner {
COMB_NONE,
COMB_ADD,
COMB_SUB
};
static int
mpr_debug_sysctl(SYSCTL_HANDLER_ARGS)
{
struct mpr_softc *sc;
struct mpr_debug_string *string;
struct sbuf *sbuf;
char *buffer;
size_t sz;
int i, len, debug, error;
sc = (struct mpr_softc *)arg1;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
debug = sc->mpr_debug;
sbuf_printf(sbuf, "%#x", debug);
sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
for (i = 0; i < sz; i++) {
string = &mpr_debug_strings[i];
if (debug & string->flag)
sbuf_printf(sbuf, ",%s", string->name);
}
error = sbuf_finish(sbuf);
sbuf_delete(sbuf);
if (error || req->newptr == NULL)
return (error);
len = req->newlen - req->newidx;
if (len == 0)
return (0);
buffer = malloc(len, M_MPR, M_ZERO|M_WAITOK);
error = SYSCTL_IN(req, buffer, len);
mpr_parse_debug(sc, buffer);
free(buffer, M_MPR);
return (error);
}
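/*
* Parse a debug level list of tokens separated by ',' or ':'. Each token is
* either a numeric flag value or one of the names in mpr_debug_strings. A
* leading '+' adds the resulting flags to the current level, a leading '-'
* subtracts them, and no prefix replaces the level outright.
*/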
static void
mpr_parse_debug(struct mpr_softc *sc, char *list)
{
struct mpr_debug_string *string;
enum mpr_debug_level_combiner op;
char *token, *endtoken;
size_t sz;
int flags, i;
if (list == NULL || *list == '\0')
return;
if (*list == '+') {
op = COMB_ADD;
list++;
} else if (*list == '-') {
op = COMB_SUB;
list++;
} else
op = COMB_NONE;
if (*list == '\0')
return;
flags = 0;
sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
while ((token = strsep(&list, ":,")) != NULL) {
/* Handle integer flags */
flags |= strtol(token, &endtoken, 0);
if (token != endtoken)
continue;
/* Handle text flags */
for (i = 0; i < sz; i++) {
string = &mpr_debug_strings[i];
if (strcasecmp(token, string->name) == 0) {
flags |= string->flag;
break;
}
}
}
switch (op) {
case COMB_NONE:
sc->mpr_debug = flags;
break;
case COMB_ADD:
sc->mpr_debug |= flags;
break;
case COMB_SUB:
sc->mpr_debug &= (~flags);
break;
}
return;
}
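/*
* Per-request header emitted by the dump_reqs sysctl ahead of each dumped
* command.
*/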
struct mpr_dumpreq_hdr {
uint32_t smid;
uint32_t state;
uint32_t numframes;
uint32_t deschi;
uint32_t desclo;
};
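/*
* Privileged sysctl handler that dumps every in-queue command: each entry is
* an mpr_dumpreq_hdr followed by the first 128 bytes of the request frame and
* of each chain frame. Best effort only; no locking is taken.
*/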
static int
mpr_dump_reqs(SYSCTL_HANDLER_ARGS)
{
struct mpr_softc *sc;
struct mpr_chain *chain, *chain1;
struct mpr_command *cm;
struct mpr_dumpreq_hdr hdr;
struct sbuf *sb;
uint32_t smid, state;
int i, numreqs, error = 0;
sc = (struct mpr_softc *)arg1;
if ((error = priv_check(curthread, PRIV_DRIVER)) != 0) {
printf("priv check error %d\n", error);
return (error);
}
state = MPR_CM_STATE_INQUEUE;
smid = 1;
numreqs = sc->num_reqs;
if (req->newptr != NULL)
return (EINVAL);
if (smid == 0 || smid > sc->num_reqs)
return (EINVAL);
if (numreqs <= 0 || (numreqs + smid > sc->num_reqs))
numreqs = sc->num_reqs;
sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
/* Best effort, no locking */
for (i = smid; i < numreqs; i++) {
cm = &sc->commands[i];
if (cm->cm_state != state)
continue;
hdr.smid = i;
hdr.state = cm->cm_state;
hdr.numframes = 1;
hdr.deschi = cm->cm_desc.Words.High;
hdr.desclo = cm->cm_desc.Words.Low;
TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
chain1)
hdr.numframes++;
sbuf_bcat(sb, &hdr, sizeof(hdr));
sbuf_bcat(sb, cm->cm_req, 128);
TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
chain1)
sbuf_bcat(sb, chain->chain, 128);
}
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
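/*
* Attach the controller: set up the lock and callouts, bring the IOC to the
* READY state, read IOC Facts and allocate resources based on them, start the
* periodic watchdog, and schedule the rest of initialization (port enable and
* discovery) via a config intrhook.
*/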
int
mpr_attach(struct mpr_softc *sc)
{
int error;
MPR_FUNCTRACE(sc);
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
mtx_init(&sc->mpr_mtx, "MPR lock", NULL, MTX_DEF);
callout_init_mtx(&sc->periodic, &sc->mpr_mtx, 0);
callout_init_mtx(&sc->device_check_callout, &sc->mpr_mtx, 0);
TAILQ_INIT(&sc->event_list);
timevalclear(&sc->lastfail);
if ((error = mpr_transition_ready(sc)) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"Failed to transition ready\n");
return (error);
}
sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPR,
M_ZERO|M_NOWAIT);
if (!sc->facts) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT,
"Cannot allocate memory, exit\n");
return (ENOMEM);
}
/*
* Get IOC Facts and allocate all structures based on this information.
* A Diag Reset will also call mpr_iocfacts_allocate and re-read the IOC
* Facts. If relevant values have changed in IOC Facts, this function
* will free all of the memory based on IOC Facts and reallocate that
* memory. If this fails, any allocated memory should already be freed.
*/
if ((error = mpr_iocfacts_allocate(sc, TRUE)) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC Facts allocation "
"failed with error %d\n", error);
return (error);
}
/* Start the periodic watchdog check on the IOC Doorbell */
mpr_periodic(sc);
/*
* The portenable will kick off discovery events that will drive the
* rest of the initialization process. The CAM/SAS module will
* hold up the boot sequence until discovery is complete.
*/
sc->mpr_ich.ich_func = mpr_startup;
sc->mpr_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mpr_ich) != 0) {
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"Cannot establish MPR config hook\n");
error = EINVAL;
}
/*
* Allow IR to shutdown gracefully when shutdown occurs.
*/
sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
mprsas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
if (sc->shutdown_eh == NULL)
mpr_dprint(sc, MPR_INIT|MPR_ERROR,
"shutdown event registration failed\n");
mpr_setup_sysctl(sc);
sc->mpr_flags |= MPR_FLAGS_ATTACH_DONE;
mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);
return (error);
}
/* Run through any late-start handlers. */
static void
mpr_startup(void *arg)
{
struct mpr_softc *sc;
sc = (struct mpr_softc *)arg;
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
mpr_lock(sc);
mpr_unmask_intr(sc);
/* initialize device mapping tables */
mpr_base_static_config_pages(sc);
mpr_mapping_initialize(sc);
mprsas_startup(sc);
mpr_unlock(sc);
mpr_dprint(sc, MPR_INIT, "disestablish config intrhook\n");
config_intrhook_disestablish(&sc->mpr_ich);
sc->mpr_ich.ich_arg = NULL;
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
}
/* Periodic watchdog. Is called with the driver lock already held. */
static void
mpr_periodic(void *arg)
{
struct mpr_softc *sc;
uint32_t db;
sc = (struct mpr_softc *)arg;
if (sc->mpr_flags & MPR_FLAGS_SHUTDOWN)
return;
db = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
if ((db & MPI2_DOORBELL_FAULT_CODE_MASK) ==
IFAULT_IOP_OVER_TEMP_THRESHOLD_EXCEEDED) {
panic("TEMPERATURE FAULT: STOPPING.");
}
mpr_dprint(sc, MPR_FAULT, "IOC Fault 0x%08x, Resetting\n", db);
mpr_reinit(sc);
}
callout_reset(&sc->periodic, MPR_PERIODIC_DELAY * hz, mpr_periodic, sc);
}
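/* Handler for the MPI2 log events registered by mpr_attach_log(). */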
static void
mpr_log_evt_handler(struct mpr_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *event)
{
MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
MPR_DPRINT_EVENT(sc, generic, event);
switch (event->Event) {
case MPI2_EVENT_LOG_DATA:
mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_DATA:\n");
if (sc->mpr_debug & MPR_EVENT)
hexdump(event->EventData, event->EventDataLength, NULL,
0);
break;
case MPI2_EVENT_LOG_ENTRY_ADDED:
entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
"0x%x Sequence %d:\n", entry->LogEntryQualifier,
entry->LogSequence);
break;
default:
break;
}
return;
}
static int
mpr_attach_log(struct mpr_softc *sc)
{
uint8_t events[16];
bzero(events, 16);
setbit(events, MPI2_EVENT_LOG_DATA);
setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
mpr_register_events(sc, events, mpr_log_evt_handler, NULL,
&sc->mpr_log_eh);
return (0);
}
static int
mpr_detach_log(struct mpr_softc *sc)
{
if (sc->mpr_log_eh != NULL)
mpr_deregister_events(sc, sc->mpr_log_eh);
return (0);
}
/*
* Free all of the driver resources and detach submodules. Should be called
* without the lock held.
*/
int
mpr_free(struct mpr_softc *sc)
{
int error;
mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
/* Turn off the watchdog */
mpr_lock(sc);
sc->mpr_flags |= MPR_FLAGS_SHUTDOWN;
mpr_unlock(sc);
/* Lock must not be held for this */
callout_drain(&sc->periodic);
callout_drain(&sc->device_check_callout);
if (((error = mpr_detach_log(sc)) != 0) ||
((error = mpr_detach_sas(sc)) != 0)) {
mpr_dprint(sc, MPR_INIT|MPR_FAULT, "failed to detach "
"subsystems, error= %d, exit\n", error);
return (error);
}
mpr_detach_user(sc);
/* Put the IOC back in the READY state. */
mpr_lock(sc);
if ((error = mpr_transition_ready(sc)) != 0) {
mpr_unlock(sc);
return (error);
}
mpr_unlock(sc);
if (sc->facts != NULL)
free(sc->facts, M_MPR);
/*
* Free all buffers that are based on IOC Facts. A Diag Reset may need
* to free these buffers too.
*/
mpr_iocfacts_free(sc);
if (sc->sysctl_tree != NULL)
sysctl_ctx_free(&sc->sysctl_ctx);
/* Deregister the shutdown function */
if (sc->shutdown_eh != NULL)
EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
mtx_destroy(&sc->mpr_mtx);
mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
return (0);
}
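/*
* Complete a command: mark it busy, run its completion callback and/or wake
* any waiter, and decrement the active I/O count.
*/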
static __inline void
mpr_complete_command(struct mpr_softc *sc, struct mpr_command *cm)
{
MPR_FUNCTRACE(sc);
if (cm == NULL) {
mpr_dprint(sc, MPR_ERROR, "Completing NULL command\n");
return;
}
cm->cm_state = MPR_CM_STATE_BUSY;
if (cm->cm_flags & MPR_CM_FLAGS_POLLED)
cm->cm_flags |= MPR_CM_FLAGS_COMPLETE;
if (cm->cm_complete != NULL) {
mpr_dprint(sc, MPR_TRACE,
"%s cm %p calling cm_complete %p data %p reply %p\n",
__func__, cm, cm->cm_complete, cm->cm_complete_data,
cm->cm_reply);
cm->cm_complete(sc, cm);
}
if (cm->cm_flags & MPR_CM_FLAGS_WAKEUP) {
mpr_dprint(sc, MPR_TRACE, "waking up %p\n", cm);
wakeup(cm);
}
if (sc->io_cmds_active != 0) {
sc->io_cmds_active--;
} else {
mpr_dprint(sc, MPR_ERROR, "Warning: io_cmds_active is "
"out of sync - resynching to 0\n");
}
}
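/* Decode an IOC LogInfo value and log it; only SAS bus-type entries are printed. */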
static void
mpr_sas_log_info(struct mpr_softc *sc, u32 log_info)
{
union loginfo_type {
u32 loginfo;
struct {
u32 subcode:16;
u32 code:8;
u32 originator:4;
u32 bus_type:4;
} dw;
};
union loginfo_type sas_loginfo;
char *originator_str = NULL;
sas_loginfo.loginfo = log_info;
if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
return;
/* each nexus loss loginfo */
if (log_info == 0x31170000)
return;
/* eat the loginfos associated with task aborts */
if ((log_info == 0x30050000) || (log_info == 0x31140000) ||
(log_info == 0x31130000))
return;
switch (sas_loginfo.dw.originator) {
case 0:
originator_str = "IOP";
break;
case 1:
originator_str = "PL";
break;
case 2:
originator_str = "IR";
break;
}
mpr_dprint(sc, MPR_LOG, "log_info(0x%08x): originator(%s), "
"code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str,
sas_loginfo.dw.code, sas_loginfo.dw.subcode);
}
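/*
* Log the IOCLogInfo from a reply frame when the IOC status indicates one is
* available.
*/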
static void
mpr_display_reply_info(struct mpr_softc *sc, uint8_t *reply)
{
MPI2DefaultReply_t *mpi_reply;
u16 sc_status;
mpi_reply = (MPI2DefaultReply_t*)reply;
sc_status = le16toh(mpi_reply->IOCStatus);
if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
mpr_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo));
}
void
mpr_intr(void *data)
{
struct mpr_softc *sc;
uint32_t status;
sc = (struct mpr_softc *)data;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
/*
* Check interrupt status register to flush the bus. This is
* needed for both INTx interrupts and driver-driven polling
*/
status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0)
return;
mpr_lock(sc);
mpr_intr_locked(data);
mpr_unlock(sc);
return;
}
/*
* In theory, MSI/MSIX interrupts shouldn't need to read any registers on the
* chip. Hopefully this theory is correct.
*/
void
mpr_intr_msi(void *data)
{
struct mpr_softc *sc;
sc = (struct mpr_softc *)data;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
mpr_lock(sc);
mpr_intr_locked(data);
mpr_unlock(sc);
return;
}
/*
* The locking is overly broad and simplistic, but easy to deal with for now.
*/
void
mpr_intr_locked(void *data)
{
MPI2_REPLY_DESCRIPTORS_UNION *desc;
MPI2_DIAG_RELEASE_REPLY *rel_rep;
mpr_fw_diagnostic_buffer_t *pBuffer;
struct mpr_softc *sc;
uint64_t tdesc;
struct mpr_command *cm = NULL;
uint8_t flags;
u_int pq;
sc = (struct mpr_softc *)data;
pq = sc->replypostindex;
mpr_dprint(sc, MPR_TRACE,
"%s sc %p starting with replypostindex %u\n",
__func__, sc, sc->replypostindex);
for ( ;; ) {
cm = NULL;
desc = &sc->post_queue[sc->replypostindex];
/*
* Copy and clear out the descriptor so that any reentry will
* immediately know that this descriptor has already been
* looked at. There is unfortunate casting magic because the
* MPI API doesn't have a cardinal 64bit type.
*/
tdesc = 0xffffffffffffffff;
tdesc = atomic_swap_64((uint64_t *)desc, tdesc);
desc = (MPI2_REPLY_DESCRIPTORS_UNION *)&tdesc;
flags = desc->Default.ReplyFlags &
MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) ||
(le32toh(desc->Words.High) == 0xffffffff))
break;
/*
* Increment replypostindex now so that event handlers and cm
* completion handlers that decide to do a diag reset can zero it
* without it being incremented again afterwards. We then break out
* of this loop on the next iteration, since the reply post queue
* has been cleared to 0xFF and all descriptors look unused (which
* they are).
*/
if (++sc->replypostindex >= sc->pqdepth)
sc->replypostindex = 0;
switch (flags) {
case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
case MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS:
case MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS:
cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
KASSERT(cm->cm_state == MPR_CM_STATE_INQUEUE,
("command not inqueue\n"));
cm->cm_state = MPR_CM_STATE_BUSY;
cm->cm_reply = NULL;
break;
case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
{
uint32_t baddr;
uint8_t *reply;
/*
* Re-compose the reply address from the address
* sent back from the chip. The ReplyFrameAddress
* is the lower 32 bits of the physical address of
* particular reply frame. Convert that address to
* host format, and then use that to provide the
* offset against the virtual address base
* (sc->reply_frames).
*/
baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
reply = sc->reply_frames +
(baddr - ((uint32_t)sc->reply_busaddr));
/*
* Make sure the reply we got back is in a valid
* range. If not, go ahead and panic here, since
* we'll probably panic as soon as we dereference the
* reply pointer anyway.
*/
if ((reply < sc->reply_frames)
|| (reply > (sc->reply_frames +
(sc->fqdepth * sc->replyframesz)))) {
printf("%s: WARNING: reply %p out of range!\n",
__func__, reply);
printf("%s: reply_frames %p, fqdepth %d, "
"frame size %d\n", __func__,
sc->reply_frames, sc->fqdepth,
sc->replyframesz);
printf("%s: baddr %#x,\n", __func__, baddr);
/* LSI-TODO. See Linux Code for Graceful exit */
panic("Reply address out of range");
}
if (le16toh(desc->AddressReply.SMID) == 0) {
if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
MPI2_FUNCTION_DIAG_BUFFER_POST) {
/*
* If SMID is 0 for Diag Buffer Post,
* this implies that the reply is due to
* a release function with a status that
* the buffer has been released. Set
* the buffer flags accordingly.
*/
rel_rep =
(MPI2_DIAG_RELEASE_REPLY *)reply;
if ((le16toh(rel_rep->IOCStatus) &
MPI2_IOCSTATUS_MASK) ==
MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
{
pBuffer =
&sc->fw_diag_buffer_list[
rel_rep->BufferType];
pBuffer->valid_data = TRUE;
pBuffer->owned_by_firmware =
FALSE;
pBuffer->immediate = FALSE;
}
} else
mpr_dispatch_event(sc, baddr,
(MPI2_EVENT_NOTIFICATION_REPLY *)
reply);
} else {
cm = &sc->commands[
le16toh(desc->AddressReply.SMID)];
if (cm->cm_state == MPR_CM_STATE_INQUEUE) {
cm->cm_reply = reply;
cm->cm_reply_data =
le32toh(desc->AddressReply.
ReplyFrameAddress);
} else {
mpr_dprint(sc, MPR_RECOVERY,
"Bad state for ADDRESS_REPLY status,"
" ignoring state %d cm %p\n",
cm->cm_state, cm);
}
}
break;
}
case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
default:
/* Unhandled */
mpr_dprint(sc, MPR_ERROR, "Unhandled reply 0x%x\n",
desc->Default.ReplyFlags);
cm = NULL;
break;
}
if (cm != NULL) {
/* Print error reply frame */
if (cm->cm_reply)
mpr_display_reply_info(sc,cm->cm_reply);
mpr_complete_command(sc, cm);
}
}
if (pq != sc->replypostindex) {
mpr_dprint(sc, MPR_TRACE, "%s sc %p writing postindex %d\n",
__func__, sc, sc->replypostindex);
mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET,
sc->replypostindex);
}
return;
}
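/*
* Dispatch an event notification reply to every registered handler whose mask
* includes the event, then free the reply frame.
*/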
static void
mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *reply)
{
struct mpr_event_handle *eh;
int event, handled = 0;
event = le16toh(reply->Event);
TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
if (isset(eh->mask, event)) {
eh->callback(sc, data, reply);
handled++;
}
}
if (handled == 0)
mpr_dprint(sc, MPR_EVENT, "Unhandled event 0x%x\n", event);
/*
* This is the only place that the event/reply should be freed.
* Anything wanting to hold onto the event data should have
* already copied it into their own storage.
*/
mpr_free_reply(sc, data);
}
static void
mpr_reregister_events_complete(struct mpr_softc *sc, struct mpr_command *cm)
{
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
if (cm->cm_reply)
MPR_DPRINT_EVENT(sc, generic,
(MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
mpr_free_command(sc, cm);
/* next, send a port enable */
mprsas_startup(sc);
}
/*
* For both register_events and update_events, the caller supplies a bitmap
* of events that it _wants_. These functions then turn that into a bitmask
* suitable for the controller.
*/
int
mpr_register_events(struct mpr_softc *sc, uint8_t *mask,
mpr_evt_callback_t *cb, void *data, struct mpr_event_handle **handle)
{
struct mpr_event_handle *eh;
int error = 0;
eh = malloc(sizeof(struct mpr_event_handle), M_MPR, M_WAITOK|M_ZERO);
eh->callback = cb;
eh->data = data;
TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
if (mask != NULL)
error = mpr_update_events(sc, eh, mask);
*handle = eh;
return (error);
}
int
mpr_update_events(struct mpr_softc *sc, struct mpr_event_handle *handle,
uint8_t *mask)
{
MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
struct mpr_command *cm = NULL;
struct mpr_event_handle *eh;
int error, i;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
if ((mask != NULL) && (handle != NULL))
bcopy(mask, &handle->mask[0], 16);
memset(sc->event_mask, 0xff, 16);
TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
for (i = 0; i < 16; i++)
sc->event_mask[i] &= ~eh->mask[i];
}
if ((cm = mpr_alloc_command(sc)) == NULL)
return (EBUSY);
evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
evtreq->MsgFlags = 0;
evtreq->SASBroadcastPrimitiveMasks = 0;
#ifdef MPR_DEBUG_ALL_EVENTS
{
u_char fullmask[16];
memset(fullmask, 0x00, 16);
bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
}
#else
bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, 16);
#endif
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_data = NULL;
error = mpr_request_polled(sc, &cm);
if (cm != NULL)
reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
if ((reply == NULL) ||
(reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
error = ENXIO;
if (reply)
MPR_DPRINT_EVENT(sc, generic, reply);
mpr_dprint(sc, MPR_TRACE, "%s finished error %d\n", __func__, error);
if (cm != NULL)
mpr_free_command(sc, cm);
return (error);
}
static int
mpr_reregister_events(struct mpr_softc *sc)
{
MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
struct mpr_command *cm;
struct mpr_event_handle *eh;
int error, i;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
/* first, reregister events */
memset(sc->event_mask, 0xff, 16);
TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
for (i = 0; i < 16; i++)
sc->event_mask[i] &= ~eh->mask[i];
}
if ((cm = mpr_alloc_command(sc)) == NULL)
return (EBUSY);
evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
evtreq->MsgFlags = 0;
evtreq->SASBroadcastPrimitiveMasks = 0;
#ifdef MPR_DEBUG_ALL_EVENTS
{
u_char fullmask[16];
memset(fullmask, 0x00, 16);
bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
}
#else
bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, 16);
#endif
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_data = NULL;
cm->cm_complete = mpr_reregister_events_complete;
error = mpr_map_command(sc, cm);
mpr_dprint(sc, MPR_TRACE, "%s finished with error %d\n", __func__,
error);
return (error);
}
int
mpr_deregister_events(struct mpr_softc *sc, struct mpr_event_handle *handle)
{
TAILQ_REMOVE(&sc->event_list, handle, eh_list);
free(handle, M_MPR);
return (mpr_update_events(sc, NULL, NULL));
}
/**
* mpr_build_nvme_prp - This function is called for NVMe end devices to build a
* native SGL (NVMe PRP). The native SGL is built starting in the first PRP entry
* of the NVMe message (PRP1). If the data buffer is small enough to be described
* entirely using PRP1, then PRP2 is not used. If needed, PRP2 is used to
* describe a larger data buffer. If the data buffer is too large to describe
* using the two PRP entries inside the NVMe message, then PRP1 describes the
* first data memory segment, and PRP2 contains a pointer to a PRP list located
* elsewhere in memory to describe the remaining data memory segments. The PRP
* list will be contiguous.
* The native SGL for NVMe devices is a Physical Region Page (PRP). A PRP
* consists of a list of PRP entries to describe a number of noncontiguous
* physical memory segments as a single memory buffer, just as an SGL does.
* Note, however, that this function is only used by the IOCTL call, so the
* memory given is guaranteed to be contiguous. There is no need to translate
* non-contiguous SGL into a PRP in this case. All PRPs will describe contiguous
* space that is one page size each.
*
* Each NVMe message contains two PRP entries. The first (PRP1) either contains
* a PRP list pointer or a PRP element, depending upon the command. PRP2 contains
* the second PRP element if the memory being described fits within 2 PRP
* entries, or a PRP list pointer if the PRP spans more than two entries.
*
* A PRP list pointer contains the address of a PRP list, structured as a linear
* array of PRP entries. Each PRP entry in this list describes a segment of
* physical memory.
*
* Each 64-bit PRP entry comprises an address and an offset field. The address
* always points to the beginning of a PAGE_SIZE physical memory page, and the
* offset describes where within that page the memory segment begins. Only the
* first element in a PRP list may contain a non-zero offset, implying that all
* memory segments following the first begin at the start of a PAGE_SIZE page.
*
* Each PRP element normally describes a chunk of PAGE_SIZE physical memory,
* with exceptions for the first and last elements in the list. If the memory
* being described by the list begins at a non-zero offset within the first page,
* then the first PRP element will contain a non-zero offset indicating where the
* region begins within the page. The last memory segment may end before the end
* of the PAGE_SIZE segment, depending upon the overall size of the memory being
* described by the PRP list.
*
* Since PRP entries lack any indication of size, the overall data buffer length
* is used to determine where the end of the data memory buffer is located, and
* how many PRP entries are required to describe it.
*
* Returns nothing.
*/
void
mpr_build_nvme_prp(struct mpr_softc *sc, struct mpr_command *cm,
Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request, void *data,
uint32_t data_in_sz, uint32_t data_out_sz)
{
int prp_size = PRP_ENTRY_SIZE;
uint64_t *prp_entry, *prp1_entry, *prp2_entry;
uint64_t *prp_entry_phys, *prp_page, *prp_page_phys;
uint32_t offset, entry_len, page_mask_result, page_mask;
bus_addr_t paddr;
size_t length;
struct mpr_prp_page *prp_page_info = NULL;
/*
* Not all commands require a data transfer. If no data, just return
* without constructing any PRP.
*/
if (!data_in_sz && !data_out_sz)
return;
/*
* Set pointers to PRP1 and PRP2, which are in the NVMe command. PRP1 is
* located at a 24 byte offset from the start of the NVMe command. Then
* set the current PRP entry pointer to PRP1.
*/
prp1_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
NVME_CMD_PRP1_OFFSET);
prp2_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
NVME_CMD_PRP2_OFFSET);
prp_entry = prp1_entry;
/*
* For the PRP entries, use the specially allocated buffer of
* contiguous memory. PRP Page allocation failures should not happen
* because there should be enough PRP page buffers to account for the
* possible NVMe QDepth.
*/
prp_page_info = mpr_alloc_prp_page(sc);
KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
"used for building a native NVMe SGL.\n", __func__));
prp_page = (uint64_t *)prp_page_info->prp_page;
prp_page_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
/*
* Insert the allocated PRP page into the command's PRP page list. This
* will be freed when the command is freed.
*/
TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
/*
* Check if we are within 1 entry of a page boundary; we don't want our
* first entry to be a PRP List entry.
*/
page_mask = PAGE_SIZE - 1;
page_mask_result = (uintptr_t)((uint8_t *)prp_page + prp_size) &
page_mask;
if (!page_mask_result)
{
/* Bump up to next page boundary. */
prp_page = (uint64_t *)((uint8_t *)prp_page + prp_size);
prp_page_phys = (uint64_t *)((uint8_t *)prp_page_phys +
prp_size);
}
/*
* Set PRP physical pointer, which initially points to the current PRP
* DMA memory page.
*/
prp_entry_phys = prp_page_phys;
/* Get physical address and length of the data buffer. */
paddr = (bus_addr_t)(uintptr_t)data;
if (data_in_sz)
length = data_in_sz;
else
length = data_out_sz;
/* Loop while the length is not zero. */
while (length)
{
/*
* Check if we need to put a list pointer here if we are at page
* boundary - prp_size (8 bytes).
*/
page_mask_result = (uintptr_t)((uint8_t *)prp_entry_phys +
prp_size) & page_mask;
if (!page_mask_result)
{
/*
* This is the last entry in a PRP List, so we need to
* put a PRP list pointer here. What this does is:
* - bump the current memory pointer to the next
* address, which will be the next full page.
* - set the PRP Entry to point to that page. This is
* now the PRP List pointer.
* - bump the PRP Entry pointer to the start of the next
* page. Since all of this PRP memory is contiguous,
* no need to get a new page - it's just the next
* address.
*/
prp_entry_phys++;
*prp_entry =
htole64((uint64_t)(uintptr_t)prp_entry_phys);
prp_entry++;
}
/* Need to handle if entry will be part of a page. */
offset = (uint32_t)paddr & page_mask;
entry_len = PAGE_SIZE - offset;
if (prp_entry == prp1_entry)
{
/*
* Must fill in the first PRP pointer (PRP1) before
* moving on.
*/
*prp1_entry = htole64((uint64_t)paddr);
/*
* Now point to the second PRP entry within the
* command (PRP2).
*/
prp_entry = prp2_entry;
}
else if (prp_entry == prp2_entry)
{
/*
* Should the PRP2 entry be a PRP List pointer or just a
* regular PRP pointer? If there is more than one more
* page of data, must use a PRP List pointer.
*/
if (length > PAGE_SIZE)
{
/*
* PRP2 will contain a PRP List pointer because
* more PRP's are needed with this command. The
* list will start at the beginning of the
* contiguous buffer.
*/
*prp2_entry =
htole64(
(uint64_t)(uintptr_t)prp_entry_phys);
/*
* The next PRP Entry will be the start of the
* first PRP List.
*/
prp_entry = prp_page;
}
else
{
/*
* After this, the PRP Entries are complete.
* This command uses 2 PRP's and no PRP list.
*/
*prp2_entry = htole64((uint64_t)paddr);
}
}
else
{
/*
* Put entry in list and bump the addresses.
*
* After PRP1 and PRP2 are filled in, this will fill in
* all remaining PRP entries in a PRP List, one per each
* time through the loop.
*/
*prp_entry = htole64((uint64_t)paddr);
prp_entry++;
prp_entry_phys++;
}
/*
* Bump the phys address of the command's data buffer by the
* entry_len.
*/
paddr += entry_len;
/* Decrement length accounting for last partial page. */
if (entry_len > length)
length = 0;
else
length -= entry_len;
}
}
/*
* mpr_check_pcie_native_sgl - This function is called for PCIe end devices to
* determine if the driver needs to build a native SGL. If so, that native SGL
* is built in the contiguous buffers allocated especially for PCIe SGL
* creation. If the driver will not build a native SGL, return TRUE and a
* normal IEEE SGL will be built. Currently this routine supports NVMe devices
* only.
*
* Returns FALSE (0) if native SGL was built, TRUE (1) if no SGL was built.
*/
static int
mpr_check_pcie_native_sgl(struct mpr_softc *sc, struct mpr_command *cm,
bus_dma_segment_t *segs, int segs_left)
{
uint32_t i, sge_dwords, length, offset, entry_len;
uint32_t num_entries, buff_len = 0, sges_in_segment;
uint32_t page_mask, page_mask_result, *curr_buff;
uint32_t *ptr_sgl, *ptr_first_sgl, first_page_offset;
uint32_t first_page_data_size, end_residual;
uint64_t *msg_phys;
bus_addr_t paddr;
int build_native_sgl = 0, first_prp_entry;
int prp_size = PRP_ENTRY_SIZE;
Mpi25IeeeSgeChain64_t *main_chain_element = NULL;
struct mpr_prp_page *prp_page_info = NULL;
mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
/*
* Add up the sizes of each segment length to get the total transfer
* size, which will be checked against the Maximum Data Transfer Size.
* If the data transfer length exceeds the MDTS for this device, just
* return 1 so a normal IEEE SGL will be built. F/W will break the I/O
* up into multiple I/O's. [nvme_mdts = 0 means unlimited]
*/
for (i = 0; i < segs_left; i++)
buff_len += htole32(segs[i].ds_len);
if ((cm->cm_targ->MDTS > 0) && (buff_len > cm->cm_targ->MDTS))
return 1;
/* Create page_mask (to get offset within page) */
page_mask = PAGE_SIZE - 1;
/*
* Check if the number of elements exceeds the max number that can be
* put in the main message frame (H/W can only translate an SGL that
* is contained entirely in the main message frame).
*/
sges_in_segment = (sc->reqframesz -
offsetof(Mpi25SCSIIORequest_t, SGL)) / sizeof(MPI25_SGE_IO_UNION);
if (segs_left > sges_in_segment)
build_native_sgl = 1;
else
{
/*
* NVMe uses one PRP for each physical page (or part of a physical
* page):
* - if 4 pages or less, an IEEE SGL is OK
* - if more than 5 pages, a native SGL must be built
* - if more than 4 but at most 5 pages, check the physical address of
*   the first SG entry; if its size within the first page is >= the
*   residual beyond 4 pages then use IEEE, otherwise use a native SGL
*/
if (buff_len > (PAGE_SIZE * 5))
build_native_sgl = 1;
else if ((buff_len > (PAGE_SIZE * 4)) &&
(buff_len <= (PAGE_SIZE * 5)) )
{
msg_phys = (uint64_t *)(uintptr_t)segs[0].ds_addr;
first_page_offset =
((uint32_t)(uint64_t)(uintptr_t)msg_phys &
page_mask);
first_page_data_size = PAGE_SIZE - first_page_offset;
end_residual = buff_len % PAGE_SIZE;
/*
* If offset into first page pushes the end of the data
* beyond end of the 5th page, we need the extra PRP
* list.
*/
if (first_page_data_size < end_residual)
build_native_sgl = 1;
/*
* Check if first SG entry size is < residual beyond 4
* pages.
*/
if (htole32(segs[0].ds_len) <
(buff_len - (PAGE_SIZE * 4)))
build_native_sgl = 1;
}
}
/* check if native SGL is needed */
if (!build_native_sgl)
return 1;
/*
* Native SGL is needed.
* Put a chain element in main message frame that points to the first
* chain buffer.
*
* NOTE: The ChainOffset field must be 0 when using a chain pointer to
* a native SGL.
*/
/* Set main message chain element pointer */
main_chain_element = (pMpi25IeeeSgeChain64_t)cm->cm_sge;
/*
* For NVMe the chain element needs to be the 2nd SGL entry in the main
* message.
*/
main_chain_element = (Mpi25IeeeSgeChain64_t *)
((uint8_t *)main_chain_element + sizeof(MPI25_IEEE_SGE_CHAIN64));
/*
* For the PRP entries, use the specially allocated buffer of
* contiguous memory. PRP Page allocation failures should not happen
* because there should be enough PRP page buffers to account for the
* possible NVMe QDepth.
*/
prp_page_info = mpr_alloc_prp_page(sc);
KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
"used for building a native NVMe SGL.\n", __func__));
curr_buff = (uint32_t *)prp_page_info->prp_page;
msg_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
/*
* Insert the allocated PRP page into the command's PRP page list. This
* will be freed when the command is freed.
*/
TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
/*
* Check if we are within 1 entry of a page boundary; we don't want our
* first entry to be a PRP List entry.
*/
page_mask_result = (uintptr_t)((uint8_t *)curr_buff + prp_size) &
page_mask;
if (!page_mask_result) {
/* Bump up to next page boundary. */
curr_buff = (uint32_t *)((uint8_t *)curr_buff + prp_size);
msg_phys = (uint64_t *)((uint8_t *)msg_phys + prp_size);
}
/* Fill in the chain element and make it an NVMe segment type. */
main_chain_element->Address.High =
htole32((uint32_t)((uint64_t)(uintptr_t)msg_phys >> 32));
main_chain_element->Address.Low =
htole32((uint32_t)(uintptr_t)msg_phys);
main_chain_element->NextChainOffset = 0;
main_chain_element->Flags = MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
MPI26_IEEE_SGE_FLAGS_NSF_NVME_PRP;
/* Set SGL pointer to start of contiguous PCIe buffer. */
ptr_sgl = curr_buff;
sge_dwords = 2;
num_entries = 0;
/*
* NVMe has a very convoluted PRP format. One PRP is required for each
* page or partial page. We need to split up OS SG entries if they are
* longer than one page or cross a page boundary. We also have to insert
* a PRP list pointer entry as the last entry in each physical page of
* the PRP list.
*
* NOTE: The first PRP "entry" is actually placed in the first SGL entry
* in the main message in IEEE 64 format. The 2nd entry in the main
* message is the chain element, and the rest of the PRP entries are
* built in the contiguous PCIe buffer.
*/
first_prp_entry = 1;
ptr_first_sgl = (uint32_t *)cm->cm_sge;
for (i = 0; i < segs_left; i++) {
/* Get physical address and length of this SG entry. */
paddr = segs[i].ds_addr;
length = segs[i].ds_len;
/*
* Check whether a given SGE buffer lies on a non-page boundary
* when it is not the first segment. That is not expected, so
* have the firmware build the SGL instead.
*/
if ((i != 0) && (((uint32_t)paddr & page_mask) != 0)) {
mpr_dprint(sc, MPR_ERROR, "Unaligned SGE while "
"building NVMe PRPs, low address is 0x%x\n",
(uint32_t)paddr);
return 1;
}
/* Apart from the last SGE, if any other SGE boundary is not page
* aligned then a hole exists in the buffer. Such a hole leads to
* data corruption, so fall back to IEEE SGEs.
*/
if (i != (segs_left - 1)) {
if (((uint32_t)paddr + length) & page_mask) {
mpr_dprint(sc, MPR_ERROR, "Unaligned SGE "
"boundary while building NVMe PRPs, low "
"address: 0x%x and length: %u\n",
(uint32_t)paddr, length);
return 1;
}
}
/* Loop while the length is not zero. */
while (length) {
/*
* Check if we need to put a list pointer here if we are
* at page boundary - prp_size.
*/
page_mask_result = (uintptr_t)((uint8_t *)ptr_sgl +
prp_size) & page_mask;
if (!page_mask_result) {
/*
* Need to put a PRP list pointer here.
*/
msg_phys = (uint64_t *)((uint8_t *)msg_phys +
prp_size);
*ptr_sgl = htole32((uintptr_t)msg_phys);
*(ptr_sgl+1) = htole32((uint64_t)(uintptr_t)
msg_phys >> 32);
ptr_sgl += sge_dwords;
num_entries++;
}
/* Need to handle if entry will be part of a page. */
offset = (uint32_t)paddr & page_mask;
entry_len = PAGE_SIZE - offset;
if (first_prp_entry) {
/*
* Put IEEE entry in first SGE in main message.
* (Simple element, System addr, not end of
* list.)
*/
*ptr_first_sgl = htole32((uint32_t)paddr);
*(ptr_first_sgl + 1) =
htole32((uint32_t)((uint64_t)paddr >> 32));
*(ptr_first_sgl + 2) = htole32(entry_len);
*(ptr_first_sgl + 3) = 0;
/* No longer the first PRP entry. */
first_prp_entry = 0;
} else {
/* Put entry in list. */
*ptr_sgl = htole32((uint32_t)paddr);
*(ptr_sgl + 1) =
htole32((uint32_t)((uint64_t)paddr >> 32));
/* Bump ptr_sgl, msg_phys, and num_entries. */
ptr_sgl += sge_dwords;
msg_phys = (uint64_t *)((uint8_t *)msg_phys +
prp_size);
num_entries++;
}
/* Bump the phys address by the entry_len. */
paddr += entry_len;
/* Decrement length accounting for last partial page. */
if (entry_len > length)
length = 0;
else
length -= entry_len;
}
}
/* Set chain element Length. */
main_chain_element->Length = htole32(num_entries * prp_size);
/* Return 0, indicating we built a native SGL. */
return 0;
}
/*
* Add a chain element as the next SGE for the specified command.
* Reset cm_sge and cm_sgesize to indicate all the available space. Chains are
* only required for IEEE commands. Therefore there is no code for commands
* that have the MPR_CM_FLAGS_SGE_SIMPLE flag set (and those commands
* shouldn't be requesting chains).
*/
static int
mpr_add_chain(struct mpr_command *cm, int segsleft)
{
struct mpr_softc *sc = cm->cm_sc;
MPI2_REQUEST_HEADER *req;
MPI25_IEEE_SGE_CHAIN64 *ieee_sgc;
struct mpr_chain *chain;
int sgc_size, current_segs, rem_segs, segs_per_frame;
uint8_t next_chain_offset = 0;
/*
* Fail if a command is requesting a chain for SIMPLE SGE's. For SAS3
* only IEEE commands should be requesting chains. Return some error
* code other than 0.
*/
if (cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE) {
mpr_dprint(sc, MPR_ERROR, "A chain element cannot be added to "
"an MPI SGL.\n");
return(ENOBUFS);
}
sgc_size = sizeof(MPI25_IEEE_SGE_CHAIN64);
if (cm->cm_sglsize < sgc_size)
panic("MPR: Need SGE Error Code\n");
chain = mpr_alloc_chain(cm->cm_sc);
if (chain == NULL)
return (ENOBUFS);
/*
* Note: a double-linked list is used to make it easier to walk for
* debugging.
*/
TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
/*
* Need to know if the number of frames left is more than 1 or not. If
* more than 1 frame is required, NextChainOffset will need to be set,
* which will just be the last segment of the frame.
*/
rem_segs = 0;
if (cm->cm_sglsize < (sgc_size * segsleft)) {
/*
* rem_segs is the number of segments remaining after the
* segments that will go into the current frame. Since it is
* known that at least one more frame is required, account for
* the chain element. To know if more than one more frame is
* required, just check if there will be a remainder after using
* the current frame (with this chain) and the next frame. If
* so the NextChainOffset must be the last element of the next
* frame.
*/
current_segs = (cm->cm_sglsize / sgc_size) - 1;
rem_segs = segsleft - current_segs;
segs_per_frame = sc->chain_frame_size / sgc_size;
if (rem_segs > segs_per_frame) {
next_chain_offset = segs_per_frame - 1;
}
}
ieee_sgc = &((MPI25_SGE_IO_UNION *)cm->cm_sge)->IeeeChain;
ieee_sgc->Length = next_chain_offset ?
htole32((uint32_t)sc->chain_frame_size) :
htole32((uint32_t)rem_segs * (uint32_t)sgc_size);
ieee_sgc->NextChainOffset = next_chain_offset;
ieee_sgc->Flags = (MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
ieee_sgc->Address.Low = htole32(chain->chain_busaddr);
ieee_sgc->Address.High = htole32(chain->chain_busaddr >> 32);
cm->cm_sge = &((MPI25_SGE_IO_UNION *)chain->chain)->IeeeSimple;
req = (MPI2_REQUEST_HEADER *)cm->cm_req;
req->ChainOffset = (sc->chain_frame_size - sgc_size) >> 4;
cm->cm_sglsize = sc->chain_frame_size;
return (0);
}
/*
* Add one scatter-gather element to the scatter-gather list for a command.
* Maintain cm_sglsize and cm_sge as the remaining size and pointer to the
* next SGE to fill in, respectively. In Gen3, the MPI SGL does not have a
* chain, so don't consider any chain additions.
*/
int
mpr_push_sge(struct mpr_command *cm, MPI2_SGE_SIMPLE64 *sge, size_t len,
int segsleft)
{
uint32_t saved_buf_len, saved_address_low, saved_address_high;
u32 sge_flags;
/*
* case 1: >=1 more segment, no room for anything (error)
* case 2: 1 more segment and enough room for it
*/
if (cm->cm_sglsize < (segsleft * sizeof(MPI2_SGE_SIMPLE64))) {
mpr_dprint(cm->cm_sc, MPR_ERROR,
"%s: warning: Not enough room for MPI SGL in frame.\n",
__func__);
return(ENOBUFS);
}
KASSERT(segsleft == 1,
("segsleft cannot be more than 1 for an MPI SGL; segsleft = %d\n",
segsleft));
/*
* There is one more segment left to add for the MPI SGL and there is
* enough room in the frame to add it. This is the normal case because
* MPI SGL's don't have chains, otherwise something is wrong.
*
* If this is a bi-directional request, need to account for that
* here. Save the pre-filled sge values. These will be used
* either for the 2nd SGL or for a single direction SGL. If
* cm_out_len is non-zero, this is a bi-directional request, so
* fill in the OUT SGL first, then the IN SGL, otherwise just
* fill in the IN SGL. Note that at this time, when filling in
* 2 SGL's for a bi-directional request, they both use the same
* DMA buffer (same cm command).
*/
saved_buf_len = sge->FlagsLength & 0x00FFFFFF;
saved_address_low = sge->Address.Low;
saved_address_high = sge->Address.High;
if (cm->cm_out_len) {
sge->FlagsLength = cm->cm_out_len |
((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_END_OF_BUFFER |
MPI2_SGE_FLAGS_HOST_TO_IOC |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
MPI2_SGE_FLAGS_SHIFT);
cm->cm_sglsize -= len;
/* Endian Safe code */
sge_flags = sge->FlagsLength;
sge->FlagsLength = htole32(sge_flags);
sge->Address.High = htole32(sge->Address.High);
sge->Address.Low = htole32(sge->Address.Low);
bcopy(sge, cm->cm_sge, len);
cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
}
sge->FlagsLength = saved_buf_len |
((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_END_OF_BUFFER |
MPI2_SGE_FLAGS_LAST_ELEMENT |
MPI2_SGE_FLAGS_END_OF_LIST |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
MPI2_SGE_FLAGS_SHIFT);
if (cm->cm_flags & MPR_CM_FLAGS_DATAIN) {
sge->FlagsLength |=
((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
MPI2_SGE_FLAGS_SHIFT);
} else {
sge->FlagsLength |=
((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
MPI2_SGE_FLAGS_SHIFT);
}
sge->Address.Low = saved_address_low;
sge->Address.High = saved_address_high;
cm->cm_sglsize -= len;
/* Endian Safe code */
sge_flags = sge->FlagsLength;
sge->FlagsLength = htole32(sge_flags);
sge->Address.High = htole32(sge->Address.High);
sge->Address.Low = htole32(sge->Address.Low);
bcopy(sge, cm->cm_sge, len);
cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
return (0);
}
/*
* Add one IEEE scatter-gather element (chain or simple) to the IEEE scatter-
* gather list for a command. Maintain cm_sglsize and cm_sge as the
* remaining size and pointer to the next SGE to fill in, respectively.
*/
int
mpr_push_ieee_sge(struct mpr_command *cm, void *sgep, int segsleft)
{
MPI2_IEEE_SGE_SIMPLE64 *sge = sgep;
int error, ieee_sge_size = sizeof(MPI25_SGE_IO_UNION);
uint32_t saved_buf_len, saved_address_low, saved_address_high;
uint32_t sge_length;
/*
* case 1: No room for chain or segment (error).
* case 2: Two or more segments left but only room for chain.
* case 3: Last segment and room for it, so set flags.
*/
/*
* There should be room for at least one element, or there is a big
* problem.
*/
if (cm->cm_sglsize < ieee_sge_size)
panic("MPR: Need SGE Error Code\n");
if ((segsleft >= 2) && (cm->cm_sglsize < (ieee_sge_size * 2))) {
if ((error = mpr_add_chain(cm, segsleft)) != 0)
return (error);
}
if (segsleft == 1) {
/*
* If this is a bi-directional request, need to account for that
* here. Save the pre-filled sge values. These will be used
* either for the 2nd SGL or for a single direction SGL. If
* cm_out_len is non-zero, this is a bi-directional request, so
* fill in the OUT SGL first, then the IN SGL, otherwise just
* fill in the IN SGL. Note that at this time, when filling in
* 2 SGL's for a bi-directional request, they both use the same
* DMA buffer (same cm command).
*/
saved_buf_len = sge->Length;
saved_address_low = sge->Address.Low;
saved_address_high = sge->Address.High;
if (cm->cm_out_len) {
sge->Length = cm->cm_out_len;
sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
cm->cm_sglsize -= ieee_sge_size;
/* Endian Safe code */
sge_length = sge->Length;
sge->Length = htole32(sge_length);
sge->Address.High = htole32(sge->Address.High);
sge->Address.Low = htole32(sge->Address.Low);
bcopy(sgep, cm->cm_sge, ieee_sge_size);
cm->cm_sge =
(MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
ieee_sge_size);
}
sge->Length = saved_buf_len;
sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
MPI25_IEEE_SGE_FLAGS_END_OF_LIST);
sge->Address.Low = saved_address_low;
sge->Address.High = saved_address_high;
}
cm->cm_sglsize -= ieee_sge_size;
/* Endian Safe code */
sge_length = sge->Length;
sge->Length = htole32(sge_length);
sge->Address.High = htole32(sge->Address.High);
sge->Address.Low = htole32(sge->Address.Low);
bcopy(sgep, cm->cm_sge, ieee_sge_size);
cm->cm_sge = (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
ieee_sge_size);
return (0);
}
/*
* Add one dma segment to the scatter-gather list for a command.
*/
int
mpr_add_dmaseg(struct mpr_command *cm, vm_paddr_t pa, size_t len, u_int flags,
int segsleft)
{
MPI2_SGE_SIMPLE64 sge;
MPI2_IEEE_SGE_SIMPLE64 ieee_sge;
if (!(cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE)) {
ieee_sge.Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
ieee_sge.Length = len;
mpr_from_u64(pa, &ieee_sge.Address);
return (mpr_push_ieee_sge(cm, &ieee_sge, segsleft));
} else {
/*
* This driver always uses 64-bit address elements for
* simplicity.
*/
flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
/* Set Endian safe macro in mpr_push_sge */
sge.FlagsLength = len | (flags << MPI2_SGE_FLAGS_SHIFT);
mpr_from_u64(pa, &sge.Address);
return (mpr_push_sge(cm, &sge, sizeof sge, segsleft));
}
}
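/*
* busdma callback: translate the segment list into either a native NVMe PRP
* list or a chain of SGEs, sync the data buffer for DMA, and enqueue the
* request. On a chain frame shortage the command is completed with
* MPR_CM_FLAGS_CHAIN_FAILED set.
*/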
static void
mpr_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mpr_softc *sc;
struct mpr_command *cm;
u_int i, dir, sflags;
cm = (struct mpr_command *)arg;
sc = cm->cm_sc;
/*
* In this case, just print out a warning and let the chip tell the
* user they did the wrong thing.
*/
if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
mpr_dprint(sc, MPR_ERROR, "%s: warning: busdma returned %d "
"segments, more than the %d allowed\n", __func__, nsegs,
cm->cm_max_segs);
}
/*
* Set up DMA direction flags. Bi-directional requests are also handled
* here. In that case, both direction flags will be set.
*/
sflags = 0;
if (cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) {
/*
* We have to add a special case for SMP passthrough, there
* is no easy way to generically handle it. The first
* S/G element is used for the command (therefore the
* direction bit needs to be set). The second one is used
* for the reply. We'll leave it to the caller to make
* sure we only have two buffers.
*/
/*
* Even though the busdma man page says it doesn't make
* sense to have both direction flags, it does in this case.
* We have one s/g element being accessed in each direction.
*/
dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
/*
* Set the direction flag on the first buffer in the SMP
* passthrough request. We'll clear it for the second one.
*/
sflags |= MPI2_SGE_FLAGS_DIRECTION |
MPI2_SGE_FLAGS_END_OF_BUFFER;
} else if (cm->cm_flags & MPR_CM_FLAGS_DATAOUT) {
sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
dir = BUS_DMASYNC_PREWRITE;
} else
dir = BUS_DMASYNC_PREREAD;
/* Check if a native SG list is needed for an NVMe PCIe device. */
if (cm->cm_targ && cm->cm_targ->is_nvme &&
mpr_check_pcie_native_sgl(sc, cm, segs, nsegs) == 0) {
/* A native SG list was built, skip to end. */
goto out;
}
for (i = 0; i < nsegs; i++) {
if ((cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) && (i != 0)) {
sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
}
error = mpr_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
sflags, nsegs - i);
if (error != 0) {
/* Resource shortage, roll back! */
if (ratecheck(&sc->lastfail, &mpr_chainfail_interval))
mpr_dprint(sc, MPR_INFO, "Out of chain frames, "
"consider increasing hw.mpr.max_chains.\n");
cm->cm_flags |= MPR_CM_FLAGS_CHAIN_FAILED;
mpr_complete_command(sc, cm);
return;
}
}
out:
bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
mpr_enqueue_request(sc, cm);
return;
}
static void
mpr_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
int error)
{
mpr_data_cb(arg, segs, nsegs, error);
}
/*
* This is the routine to enqueue commands asynchronously.
* Note that the only error path here is from bus_dmamap_load(), which can
* return EINPROGRESS if it is waiting for resources. Other than this, it's
* assumed that if you have a command in-hand, then you have enough credits
* to use it.
*/
int
mpr_map_command(struct mpr_softc *sc, struct mpr_command *cm)
{
int error = 0;
if (cm->cm_flags & MPR_CM_FLAGS_USE_UIO) {
error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
&cm->cm_uio, mpr_data_cb2, cm, 0);
} else if (cm->cm_flags & MPR_CM_FLAGS_USE_CCB) {
error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
cm->cm_data, mpr_data_cb, cm, 0);
} else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) {
error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
cm->cm_data, cm->cm_length, mpr_data_cb, cm, 0);
} else {
/* Add a zero-length element as needed */
if (cm->cm_sge != NULL)
mpr_add_dmaseg(cm, 0, 0, 0, 1);
mpr_enqueue_request(sc, cm);
}
return (error);
}
/*
* This is the routine to enqueue commands synchronously. An error of
* EINPROGRESS from mpr_map_command() is ignored since the command will
* be executed and enqueued automatically. Other errors come from msleep().
*/
int
mpr_wait_command(struct mpr_softc *sc, struct mpr_command **cmp, int timeout,
int sleep_flag)
{
int error, rc;
struct timeval cur_time, start_time;
struct mpr_command *cm = *cmp;
if (sc->mpr_flags & MPR_FLAGS_DIAGRESET)
return EBUSY;
cm->cm_complete = NULL;
cm->cm_flags |= (MPR_CM_FLAGS_WAKEUP + MPR_CM_FLAGS_POLLED);
error = mpr_map_command(sc, cm);
if ((error != 0) && (error != EINPROGRESS))
return (error);
/*
* Check for context and wait for 50 msec at a time until time has
* expired or the command has finished. If msleep can't be used, need
* to poll.
*/
if (curthread->td_no_sleeping)
sleep_flag = NO_SLEEP;
getmicrouptime(&start_time);
if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP) {
error = msleep(cm, &sc->mpr_mtx, 0, "mprwait", timeout*hz);
if (error == EWOULDBLOCK) {
/*
* Record the actual elapsed time in the case of a
* timeout for the message below.
*/
getmicrouptime(&cur_time);
timevalsub(&cur_time, &start_time);
}
} else {
while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
mpr_intr_locked(sc);
if (sleep_flag == CAN_SLEEP)
pause("mprwait", hz/20);
else
DELAY(50000);
getmicrouptime(&cur_time);
timevalsub(&cur_time, &start_time);
if (cur_time.tv_sec > timeout) {
error = EWOULDBLOCK;
break;
}
}
}
if (error == EWOULDBLOCK) {
if (cm->cm_timeout_handler == NULL) {
mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s, timeout=%d,"
" elapsed=%jd\n", __func__, timeout,
(intmax_t)cur_time.tv_sec);
rc = mpr_reinit(sc);
mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
"failed");
} else
cm->cm_timeout_handler(sc, cm);
if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
/*
* Tell the caller that we freed the command in a
* reinit.
*/
*cmp = NULL;
}
error = ETIMEDOUT;
}
return (error);
}
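/*
 * Illustrative sketch of the polling fallback used above: when msleep()
 * cannot be used, completion is polled at a fixed interval while the
 * elapsed time is tracked with getmicrouptime()/timevalsub(). The helper
 * below is a hypothetical, self-contained example of that pattern; the
 * names example_poll_for_completion/poll_done/poll_arg are not part of
 * this driver.
 */
#if 0	/* example only, never compiled */
static int
example_poll_for_completion(int (*poll_done)(void *), void *poll_arg,
    int timeout_secs)
{
    struct timeval cur_time, start_time;

    getmicrouptime(&start_time);
    while (!poll_done(poll_arg)) {
        DELAY(50000);			/* poll every 50 msec */
        getmicrouptime(&cur_time);
        timevalsub(&cur_time, &start_time);
        if (cur_time.tv_sec > timeout_secs)
            return (EWOULDBLOCK);	/* caller maps this to ETIMEDOUT */
    }
    return (0);
}
#endif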
/*
* This is the routine to enqueue a command synchronously and poll for
* completion. Its use should be rare.
*/
int
mpr_request_polled(struct mpr_softc *sc, struct mpr_command **cmp)
{
int error, rc;
struct timeval cur_time, start_time;
struct mpr_command *cm = *cmp;
error = 0;
cm->cm_flags |= MPR_CM_FLAGS_POLLED;
cm->cm_complete = NULL;
mpr_map_command(sc, cm);
getmicrouptime(&start_time);
while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
mpr_intr_locked(sc);
if (mtx_owned(&sc->mpr_mtx))
msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
"mprpoll", hz/20);
else
pause("mprpoll", hz/20);
/*
* Check for real-time timeout and fail if more than 60 seconds.
*/
getmicrouptime(&cur_time);
timevalsub(&cur_time, &start_time);
if (cur_time.tv_sec > 60) {
mpr_dprint(sc, MPR_FAULT, "polling failed\n");
error = ETIMEDOUT;
break;
}
}
cm->cm_state = MPR_CM_STATE_BUSY;
if (error) {
mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s\n", __func__);
rc = mpr_reinit(sc);
mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
"failed");
if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
/*
* Tell the caller that we freed the command in a
* reinit.
*/
*cmp = NULL;
}
}
return (error);
}
/*
* The MPT driver had a verbose interface for config pages. In this driver,
* it is reduced to much simpler terms, similar to the Linux driver.
*/
int
mpr_read_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
{
MPI2_CONFIG_REQUEST *req;
struct mpr_command *cm;
int error;
if (sc->mpr_flags & MPR_FLAGS_BUSY) {
return (EBUSY);
}
cm = mpr_alloc_command(sc);
if (cm == NULL) {
return (EBUSY);
}
req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
req->Function = MPI2_FUNCTION_CONFIG;
req->Action = params->action;
req->SGLFlags = 0;
req->ChainOffset = 0;
req->PageAddress = params->page_address;
if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr;
hdr = &params->hdr.Ext;
req->ExtPageType = hdr->ExtPageType;
req->ExtPageLength = hdr->ExtPageLength;
req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
req->Header.PageLength = 0; /* Must be set to zero */
req->Header.PageNumber = hdr->PageNumber;
req->Header.PageVersion = hdr->PageVersion;
} else {
MPI2_CONFIG_PAGE_HEADER *hdr;
hdr = &params->hdr.Struct;
req->Header.PageType = hdr->PageType;
req->Header.PageNumber = hdr->PageNumber;
req->Header.PageLength = hdr->PageLength;
req->Header.PageVersion = hdr->PageVersion;
}
cm->cm_data = params->buffer;
cm->cm_length = params->length;
if (cm->cm_data != NULL) {
cm->cm_sge = &req->PageBufferSGE;
cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
} else
cm->cm_sge = NULL;
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_complete_data = params;
if (params->callback != NULL) {
cm->cm_complete = mpr_config_complete;
return (mpr_map_command(sc, cm));
} else {
error = mpr_wait_command(sc, &cm, 0, CAN_SLEEP);
if (error) {
mpr_dprint(sc, MPR_FAULT,
"Error %d reading config page\n", error);
if (cm != NULL)
mpr_free_command(sc, cm);
return (error);
}
mpr_config_complete(sc, cm);
}
return (0);
}
int
mpr_write_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
{
return (EINVAL);
}
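/*
 * Illustrative sketch of a synchronous caller of mpr_read_config_page().
 * Leaving params.callback NULL makes the routine wait internally. The
 * action and page type/number values are assumptions about a typical
 * read; a real caller normally issues MPI2_CONFIG_ACTION_PAGE_HEADER
 * first to fill in the header, which is omitted here for brevity.
 */
#if 0	/* example only, never compiled */
static int
example_read_config_page(struct mpr_softc *sc, void *buf, u_int buflen)
{
    struct mpr_config_params params;

    bzero(&params, sizeof(params));
    params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT;	/* assumed */
    params.page_address = 0;
    params.hdr.Struct.PageType = MPI2_CONFIG_PAGETYPE_IO_UNIT;	/* assumed */
    params.hdr.Struct.PageNumber = 1;				/* assumed */
    params.buffer = buf;
    params.length = buflen;
    params.callback = NULL;		/* NULL => synchronous wait */

    return (mpr_read_config_page(sc, &params));
}
#endif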
static void
mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm)
{
MPI2_CONFIG_REPLY *reply;
struct mpr_config_params *params;
MPR_FUNCTRACE(sc);
params = cm->cm_complete_data;
if (cm->cm_data != NULL) {
bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
}
/*
* XXX KDM need to do more error recovery? This results in the
* device in question not getting probed.
*/
if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) {
params->status = MPI2_IOCSTATUS_BUSY;
goto done;
}
reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
if (reply == NULL) {
params->status = MPI2_IOCSTATUS_BUSY;
goto done;
}
params->status = reply->IOCStatus;
if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
params->hdr.Ext.ExtPageType = reply->ExtPageType;
params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
params->hdr.Ext.PageType = reply->Header.PageType;
params->hdr.Ext.PageNumber = reply->Header.PageNumber;
params->hdr.Ext.PageVersion = reply->Header.PageVersion;
} else {
params->hdr.Struct.PageType = reply->Header.PageType;
params->hdr.Struct.PageNumber = reply->Header.PageNumber;
params->hdr.Struct.PageLength = reply->Header.PageLength;
params->hdr.Struct.PageVersion = reply->Header.PageVersion;
}
done:
mpr_free_command(sc, cm);
if (params->callback != NULL)
params->callback(sc, params);
return;
}
diff --git a/sys/dev/mps/mps.c b/sys/dev/mps/mps.c
index 52d76a6c1924..e4d79b10e358 100644
--- a/sys/dev/mps/mps.c
+++ b/sys/dev/mps/mps.c
@@ -1,3235 +1,3235 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Yahoo! Inc.
* Copyright (c) 2011-2015 LSI Corp.
* Copyright (c) 2013-2015 Avago Technologies
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* Communications core for Avago Technologies (LSI) MPT2 */
/* TODO Move headers to mpsvar */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/selinfo.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/taskqueue.h>
#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/sbuf.h>
#include <sys/priv.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/rman.h>
#include <sys/proc.h>
#include <dev/pci/pcivar.h>
#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <dev/mps/mpi/mpi2_type.h>
#include <dev/mps/mpi/mpi2.h>
#include <dev/mps/mpi/mpi2_ioc.h>
#include <dev/mps/mpi/mpi2_sas.h>
#include <dev/mps/mpi/mpi2_cnfg.h>
#include <dev/mps/mpi/mpi2_init.h>
#include <dev/mps/mpi/mpi2_tool.h>
#include <dev/mps/mps_ioctl.h>
#include <dev/mps/mpsvar.h>
#include <dev/mps/mps_table.h>
static int mps_diag_reset(struct mps_softc *sc, int sleep_flag);
static int mps_init_queues(struct mps_softc *sc);
static void mps_resize_queues(struct mps_softc *sc);
static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag);
static int mps_transition_operational(struct mps_softc *sc);
static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching);
static void mps_iocfacts_free(struct mps_softc *sc);
static void mps_startup(void *arg);
static int mps_send_iocinit(struct mps_softc *sc);
static int mps_alloc_queues(struct mps_softc *sc);
static int mps_alloc_hw_queues(struct mps_softc *sc);
static int mps_alloc_replies(struct mps_softc *sc);
static int mps_alloc_requests(struct mps_softc *sc);
static int mps_attach_log(struct mps_softc *sc);
static __inline void mps_complete_command(struct mps_softc *sc,
struct mps_command *cm);
static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *reply);
static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm);
static void mps_periodic(void *);
static int mps_reregister_events(struct mps_softc *sc);
static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm);
static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag);
static int mps_debug_sysctl(SYSCTL_HANDLER_ARGS);
static int mps_dump_reqs(SYSCTL_HANDLER_ARGS);
static void mps_parse_debug(struct mps_softc *sc, char *list);
SYSCTL_NODE(_hw, OID_AUTO, mps, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"MPS Driver Parameters");
MALLOC_DEFINE(M_MPT2, "mps", "mpt2 driver memory");
MALLOC_DECLARE(M_MPSUSER);
/*
* Do a "Diagnostic Reset" aka a hard reset. This should get the chip out of
* any state and back to its initialization state machine.
*/
static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };
/* Added this union to smoothly convert le64toh(cm->cm_desc.Words).
* The compiler only supports a uint64_t being passed as the argument;
* otherwise it will throw the error below:
* "aggregate value used where an integer was expected"
*/
typedef union {
u64 word;
struct {
u32 low;
u32 high;
} u;
} request_descriptor_t;
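/*
 * Illustrative fragment showing how the union above is used (this mirrors
 * mps_enqueue_request() below): the two 32-bit descriptor words are packed
 * into one 64-bit value so htole64() can be applied to an integer, and the
 * converted halves are then written to the request post registers.
 */
#if 0	/* fragment, example only */
    request_descriptor_t rd;

    rd.u.low = cm->cm_desc.Words.Low;
    rd.u.high = cm->cm_desc.Words.High;
    rd.word = htole64(rd.word);		/* byte-swap the whole descriptor */
    mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET, rd.u.low);
    mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET, rd.u.high);
#endif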
/* Rate limit chain-fail messages to 1 per minute */
static struct timeval mps_chainfail_interval = { 60, 0 };
/*
* sleep_flag can be either CAN_SLEEP or NO_SLEEP.
* If this function is called from process context, it can sleep and
* there is no harm in sleeping.  If it is called from an interrupt
* handler, we cannot sleep and the NO_SLEEP flag must be set.
* Based on the sleep flag, the driver will call either msleep, pause, or DELAY.
* msleep and pause behave similarly, but pause is used when mps_mtx
* is not held by the driver.
*
*/
static int
mps_diag_reset(struct mps_softc *sc, int sleep_flag)
{
uint32_t reg;
int i, error, tries = 0;
uint8_t first_wait_done = FALSE;
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
/* Clear any pending interrupts */
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/*
* Force NO_SLEEP for threads that are prohibited from sleeping,
* e.g. threads running in an interrupt handler.
*/
if (curthread->td_no_sleeping != 0)
sleep_flag = NO_SLEEP;
mps_dprint(sc, MPS_INIT, "sequence start, sleep_flag= %d\n", sleep_flag);
/* Push the magic sequence */
error = ETIMEDOUT;
while (tries++ < 20) {
for (i = 0; i < sizeof(mpt2_reset_magic); i++)
mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
mpt2_reset_magic[i]);
/* wait 100 msec */
if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0,
"mpsdiag", hz/10);
else if (sleep_flag == CAN_SLEEP)
pause("mpsdiag", hz/10);
else
DELAY(100 * 1000);
reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
error = 0;
break;
}
}
if (error) {
mps_dprint(sc, MPS_INIT, "sequence failed, error=%d, exit\n",
error);
return (error);
}
/* Send the actual reset. XXX need to refresh the reg? */
reg |= MPI2_DIAG_RESET_ADAPTER;
mps_dprint(sc, MPS_INIT, "sequence success, sending reset, reg= 0x%x\n",
reg);
mps_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg);
/* Wait up to 300 seconds in 50ms intervals */
error = ETIMEDOUT;
for (i = 0; i < 6000; i++) {
/*
* Wait 50 msec. If this is the first time through, wait 256
* msec to satisfy Diag Reset timing requirements.
*/
if (first_wait_done) {
if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0,
"mpsdiag", hz/20);
else if (sleep_flag == CAN_SLEEP)
pause("mpsdiag", hz/20);
else
DELAY(50 * 1000);
} else {
DELAY(256 * 1000);
first_wait_done = TRUE;
}
/*
* Check for the RESET_ADAPTER bit to be cleared first, then
* wait for the RESET state to be cleared, which takes a little
* longer.
*/
reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
if (reg & MPI2_DIAG_RESET_ADAPTER) {
continue;
}
reg = mps_regread(sc, MPI2_DOORBELL_OFFSET);
if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
error = 0;
break;
}
}
if (error) {
mps_dprint(sc, MPS_INIT, "reset failed, error= %d, exit\n",
error);
return (error);
}
mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
mps_dprint(sc, MPS_INIT, "diag reset success, exit\n");
return (0);
}
static int
mps_message_unit_reset(struct mps_softc *sc, int sleep_flag)
{
int error;
MPS_FUNCTRACE(sc);
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
error = 0;
mps_regwrite(sc, MPI2_DOORBELL_OFFSET,
MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
MPI2_DOORBELL_FUNCTION_SHIFT);
if (mps_wait_db_ack(sc, 5, sleep_flag) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"Doorbell handshake failed\n");
error = ETIMEDOUT;
}
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return (error);
}
static int
mps_transition_ready(struct mps_softc *sc)
{
uint32_t reg, state;
int error, tries = 0;
int sleep_flags;
MPS_FUNCTRACE(sc);
/* If we are in attach call, do not sleep */
sleep_flags = (sc->mps_flags & MPS_FLAGS_ATTACH_DONE)
? CAN_SLEEP:NO_SLEEP;
error = 0;
mps_dprint(sc, MPS_INIT, "%s entered, sleep_flags= %d\n",
__func__, sleep_flags);
while (tries++ < 1200) {
reg = mps_regread(sc, MPI2_DOORBELL_OFFSET);
mps_dprint(sc, MPS_INIT, " Doorbell= 0x%x\n", reg);
/*
* Ensure the IOC is ready to talk. If it's not, try
* resetting it.
*/
if (reg & MPI2_DOORBELL_USED) {
mps_dprint(sc, MPS_INIT, " Not ready, sending diag "
"reset\n");
mps_diag_reset(sc, sleep_flags);
DELAY(50000);
continue;
}
/* Is the adapter owned by another peer? */
if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
(MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC is under the "
"control of another peer host, aborting "
"initialization.\n");
error = ENXIO;
break;
}
state = reg & MPI2_IOC_STATE_MASK;
if (state == MPI2_IOC_STATE_READY) {
/* Ready to go! */
error = 0;
break;
} else if (state == MPI2_IOC_STATE_FAULT) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC in fault "
"state 0x%x, resetting\n",
state & MPI2_DOORBELL_FAULT_CODE_MASK);
mps_diag_reset(sc, sleep_flags);
} else if (state == MPI2_IOC_STATE_OPERATIONAL) {
/* Need to take ownership */
mps_message_unit_reset(sc, sleep_flags);
} else if (state == MPI2_IOC_STATE_RESET) {
/* Wait a bit, IOC might be in transition */
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"IOC in unexpected reset state\n");
} else {
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"IOC in unknown state 0x%x\n", state);
error = EINVAL;
break;
}
/* Wait 50ms for things to settle down. */
DELAY(50000);
}
if (error)
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"Cannot transition IOC to ready\n");
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return (error);
}
static int
mps_transition_operational(struct mps_softc *sc)
{
uint32_t reg, state;
int error;
MPS_FUNCTRACE(sc);
error = 0;
reg = mps_regread(sc, MPI2_DOORBELL_OFFSET);
mps_dprint(sc, MPS_INIT, "%s entered, Doorbell= 0x%x\n", __func__, reg);
state = reg & MPI2_IOC_STATE_MASK;
if (state != MPI2_IOC_STATE_READY) {
mps_dprint(sc, MPS_INIT, "IOC not ready\n");
if ((error = mps_transition_ready(sc)) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"failed to transition ready, exit\n");
return (error);
}
}
error = mps_send_iocinit(sc);
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return (error);
}
static void
mps_resize_queues(struct mps_softc *sc)
{
u_int reqcr, prireqcr, maxio, sges_per_frame;
/*
* Size the queues. Since the reply queues always need one free
* entry, we'll deduct one reply message here. The LSI documents
* suggest instead adding a count to the request queue, but I think
* it's better to deduct from the reply queue.
*/
prireqcr = MAX(1, sc->max_prireqframes);
prireqcr = MIN(prireqcr, sc->facts->HighPriorityCredit);
reqcr = MAX(2, sc->max_reqframes);
reqcr = MIN(reqcr, sc->facts->RequestCredit);
sc->num_reqs = prireqcr + reqcr;
sc->num_prireqs = prireqcr;
sc->num_replies = MIN(sc->max_replyframes + sc->max_evtframes,
sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;
/* Store the request frame size in bytes rather than as 32bit words */
sc->reqframesz = sc->facts->IOCRequestFrameSize * 4;
/*
* Max IO Size is Page Size * the following:
* ((SGEs per frame - 1 for chain element) * Max Chain Depth)
* + 1 for no chain needed in last frame
*
* If user suggests a Max IO size to use, use the smaller of the
* user's value and the calculated value as long as the user's
* value is larger than 0. The user's value is in pages.
*/
sges_per_frame = sc->reqframesz / sizeof(MPI2_SGE_SIMPLE64) - 1;
maxio = (sges_per_frame * sc->facts->MaxChainDepth + 1) * PAGE_SIZE;
/*
* If I/O size limitation requested, then use it and pass up to CAM.
- * If not, use MAXPHYS as an optimization hint, but report HW limit.
+ * If not, use maxphys as an optimization hint, but report HW limit.
*/
if (sc->max_io_pages > 0) {
maxio = min(maxio, sc->max_io_pages * PAGE_SIZE);
sc->maxio = maxio;
} else {
sc->maxio = maxio;
- maxio = min(maxio, MAXPHYS);
+ maxio = min(maxio, maxphys);
}
sc->num_chains = (maxio / PAGE_SIZE + sges_per_frame - 2) /
sges_per_frame * reqcr;
if (sc->max_chains > 0 && sc->max_chains < sc->num_chains)
sc->num_chains = sc->max_chains;
/*
* Figure out the number of MSIx-based queues. If the firmware or
* user has done something crazy and not allowed enough credit for
* the queues to be useful then don't enable multi-queue.
*/
if (sc->facts->MaxMSIxVectors < 2)
sc->msi_msgs = 1;
if (sc->msi_msgs > 1) {
sc->msi_msgs = MIN(sc->msi_msgs, mp_ncpus);
sc->msi_msgs = MIN(sc->msi_msgs, sc->facts->MaxMSIxVectors);
if (sc->num_reqs / sc->msi_msgs < 2)
sc->msi_msgs = 1;
}
mps_dprint(sc, MPS_INIT, "Sized queues to q=%d reqs=%d replies=%d\n",
sc->msi_msgs, sc->num_reqs, sc->num_replies);
}
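/*
 * Worked example of the sizing arithmetic above, with purely illustrative
 * numbers: assuming a 128-byte request frame and a 12-byte MPI2_SGE_SIMPLE64,
 * sges_per_frame = 128 / 12 - 1 = 9 (one SGE per frame is reserved for the
 * chain element).  With MaxChainDepth = 128 and 4 KiB pages,
 * maxio = (9 * 128 + 1) * 4096 = 4722688 bytes (about 4.5 MiB), which is
 * then capped by max_io_pages if set, or reported as the HW limit and
 * capped by maxphys otherwise.
 */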
/*
* This is called during attach and when re-initializing due to a Diag Reset.
* IOC Facts is used to allocate many of the structures needed by the driver.
* If called from attach, de-allocation is not required because the driver has
* not allocated any structures yet, but if called from a Diag Reset, previously
* allocated structures based on IOC Facts will need to be freed and re-
* allocated based on the latest IOC Facts.
*/
static int
mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching)
{
int error;
Mpi2IOCFactsReply_t saved_facts;
uint8_t saved_mode, reallocating;
mps_dprint(sc, MPS_INIT|MPS_TRACE, "%s entered\n", __func__);
/* Save old IOC Facts and then only reallocate if Facts have changed */
if (!attaching) {
bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
}
/*
* Get IOC Facts. In all cases throughout this function, panic if doing
* a re-initialization and only return the error if attaching so the OS
* can handle it.
*/
if ((error = mps_get_iocfacts(sc, sc->facts)) != 0) {
if (attaching) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to get "
"IOC Facts with error %d, exit\n", error);
return (error);
} else {
panic("%s failed to get IOC Facts with error %d\n",
__func__, error);
}
}
MPS_DPRINT_PAGE(sc, MPS_XINFO, iocfacts, sc->facts);
snprintf(sc->fw_version, sizeof(sc->fw_version),
"%02d.%02d.%02d.%02d",
sc->facts->FWVersion.Struct.Major,
sc->facts->FWVersion.Struct.Minor,
sc->facts->FWVersion.Struct.Unit,
sc->facts->FWVersion.Struct.Dev);
snprintf(sc->msg_version, sizeof(sc->msg_version), "%d.%d",
(sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MAJOR_MASK) >>
MPI2_IOCFACTS_MSGVERSION_MAJOR_SHIFT,
(sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MINOR_MASK) >>
MPI2_IOCFACTS_MSGVERSION_MINOR_SHIFT);
mps_dprint(sc, MPS_INFO, "Firmware: %s, Driver: %s\n", sc->fw_version,
MPS_DRIVER_VERSION);
mps_dprint(sc, MPS_INFO, "IOCCapabilities: %b\n",
sc->facts->IOCCapabilities,
"\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
"\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
"\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc");
/*
* If the chip doesn't support event replay then a hard reset will be
* required to trigger a full discovery. Do the reset here then
* retransition to Ready. A hard reset might have already been done,
* but it doesn't hurt to do it again. Only do this if attaching, not
* for a Diag Reset.
*/
if (attaching && ((sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0)) {
mps_dprint(sc, MPS_INIT, "No event replay, reseting\n");
mps_diag_reset(sc, NO_SLEEP);
if ((error = mps_transition_ready(sc)) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to "
"transition to ready with error %d, exit\n",
error);
return (error);
}
}
/*
* Set flag if IR Firmware is loaded. If the RAID Capability has
* changed from the previous IOC Facts, log a warning, but only if
* checking this after a Diag Reset and not during attach.
*/
saved_mode = sc->ir_firmware;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
sc->ir_firmware = 1;
if (!attaching) {
if (sc->ir_firmware != saved_mode) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "new IR/IT mode "
"in IOC Facts does not match previous mode\n");
}
}
/* Only deallocate and reallocate if relevant IOC Facts have changed */
reallocating = FALSE;
sc->mps_flags &= ~MPS_FLAGS_REALLOCATED;
if ((!attaching) &&
((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
(saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
(saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
(saved_facts.RequestCredit != sc->facts->RequestCredit) ||
(saved_facts.ProductID != sc->facts->ProductID) ||
(saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
(saved_facts.IOCRequestFrameSize !=
sc->facts->IOCRequestFrameSize) ||
(saved_facts.MaxTargets != sc->facts->MaxTargets) ||
(saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
(saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
(saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
(saved_facts.MaxReplyDescriptorPostQueueDepth !=
sc->facts->MaxReplyDescriptorPostQueueDepth) ||
(saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
(saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
(saved_facts.MaxPersistentEntries !=
sc->facts->MaxPersistentEntries))) {
reallocating = TRUE;
/* Record that we reallocated everything */
sc->mps_flags |= MPS_FLAGS_REALLOCATED;
}
/*
* Some things should be done if attaching or re-allocating after a Diag
* Reset, but are not needed after a Diag Reset if the FW has not
* changed.
*/
if (attaching || reallocating) {
/*
* Check if controller supports FW diag buffers and set flag to
* enable each type.
*/
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
enabled = TRUE;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
enabled = TRUE;
if (sc->facts->IOCCapabilities &
MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
enabled = TRUE;
/*
* Set flag if EEDP is supported and if TLR is supported.
*/
if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
sc->eedp_enabled = TRUE;
if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
sc->control_TLR = TRUE;
mps_resize_queues(sc);
/*
* Initialize all Tail Queues
*/
TAILQ_INIT(&sc->req_list);
TAILQ_INIT(&sc->high_priority_req_list);
TAILQ_INIT(&sc->chain_list);
TAILQ_INIT(&sc->tm_list);
}
/*
* If doing a Diag Reset and the FW is significantly different
* (reallocating will be set above in IOC Facts comparison), then all
* buffers based on the IOC Facts will need to be freed before they are
* reallocated.
*/
if (reallocating) {
mps_iocfacts_free(sc);
mpssas_realloc_targets(sc, saved_facts.MaxTargets +
saved_facts.MaxVolumes);
}
/*
* Any deallocation has been completed. Now start reallocating
* if needed. Will only need to reallocate if attaching or if the new
* IOC Facts are different from the previous IOC Facts after a Diag
* Reset. Targets have already been allocated above if needed.
*/
error = 0;
while (attaching || reallocating) {
if ((error = mps_alloc_hw_queues(sc)) != 0)
break;
if ((error = mps_alloc_replies(sc)) != 0)
break;
if ((error = mps_alloc_requests(sc)) != 0)
break;
if ((error = mps_alloc_queues(sc)) != 0)
break;
break;
}
if (error) {
mps_dprint(sc, MPS_INIT|MPS_FAULT,
"Failed to alloc queues with error %d\n", error);
mps_free(sc);
return (error);
}
/* Always initialize the queues */
bzero(sc->free_queue, sc->fqdepth * 4);
mps_init_queues(sc);
/*
* Always get the chip out of the reset state, but only panic if not
* attaching. If attaching and there is an error, that is handled by
* the OS.
*/
error = mps_transition_operational(sc);
if (error != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to "
"transition to operational with error %d\n", error);
mps_free(sc);
return (error);
}
/*
* Finish the queue initialization.
* These are set here instead of in mps_init_queues() because the
* IOC resets these values during the state transition in
* mps_transition_operational(). The free index is set to 1
* because the corresponding index in the IOC is set to 0, and the
* IOC treats the queues as full if both are set to the same value.
* Hence the reason that the queue can't hold all of the possible
* replies.
*/
sc->replypostindex = 0;
mps_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
/*
* Attach the subsystems so they can prepare their event masks.
* XXX Should be dynamic so that IM/IR and user modules can attach
*/
error = 0;
while (attaching) {
mps_dprint(sc, MPS_INIT, "Attaching subsystems\n");
if ((error = mps_attach_log(sc)) != 0)
break;
if ((error = mps_attach_sas(sc)) != 0)
break;
if ((error = mps_attach_user(sc)) != 0)
break;
break;
}
if (error) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to attach all "
"subsystems: error %d\n", error);
mps_free(sc);
return (error);
}
/*
* XXX If the number of MSI-X vectors changes during re-init, this
* won't see it and adjust.
*/
if (attaching && (error = mps_pci_setup_interrupts(sc)) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to setup "
"interrupts\n");
mps_free(sc);
return (error);
}
/*
* Set flag if this is a WD controller. This shouldn't ever change, but
* reset it after a Diag Reset, just in case.
*/
sc->WD_available = FALSE;
if (pci_get_device(sc->mps_dev) == MPI2_MFGPAGE_DEVID_SSS6200)
sc->WD_available = TRUE;
return (error);
}
/*
* This is called if memory is being free (during detach for example) and when
* buffers need to be reallocated due to a Diag Reset.
*/
static void
mps_iocfacts_free(struct mps_softc *sc)
{
struct mps_command *cm;
int i;
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
if (sc->free_busaddr != 0)
bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
if (sc->free_queue != NULL)
bus_dmamem_free(sc->queues_dmat, sc->free_queue,
sc->queues_map);
if (sc->queues_dmat != NULL)
bus_dma_tag_destroy(sc->queues_dmat);
if (sc->chain_frames != NULL) {
bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
sc->chain_map);
}
if (sc->chain_dmat != NULL)
bus_dma_tag_destroy(sc->chain_dmat);
if (sc->sense_busaddr != 0)
bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
if (sc->sense_frames != NULL)
bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
sc->sense_map);
if (sc->sense_dmat != NULL)
bus_dma_tag_destroy(sc->sense_dmat);
if (sc->reply_busaddr != 0)
bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
if (sc->reply_frames != NULL)
bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
sc->reply_map);
if (sc->reply_dmat != NULL)
bus_dma_tag_destroy(sc->reply_dmat);
if (sc->req_busaddr != 0)
bus_dmamap_unload(sc->req_dmat, sc->req_map);
if (sc->req_frames != NULL)
bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
if (sc->req_dmat != NULL)
bus_dma_tag_destroy(sc->req_dmat);
if (sc->chains != NULL)
free(sc->chains, M_MPT2);
if (sc->commands != NULL) {
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
}
free(sc->commands, M_MPT2);
}
if (sc->buffer_dmat != NULL)
bus_dma_tag_destroy(sc->buffer_dmat);
mps_pci_free_interrupts(sc);
free(sc->queues, M_MPT2);
sc->queues = NULL;
}
/*
* The terms diag reset and hard reset are used interchangeably in the MPI
* docs to mean resetting the controller chip. In this code diag reset
* cleans everything up, and the hard reset function just sends the reset
* sequence to the chip. This should probably be refactored so that every
* subsystem gets a reset notification of some sort, and can clean up
* appropriately.
*/
int
mps_reinit(struct mps_softc *sc)
{
int error;
struct mpssas_softc *sassc;
sassc = sc->sassc;
MPS_FUNCTRACE(sc);
mtx_assert(&sc->mps_mtx, MA_OWNED);
mps_dprint(sc, MPS_INIT|MPS_INFO, "Reinitializing controller\n");
if (sc->mps_flags & MPS_FLAGS_DIAGRESET) {
mps_dprint(sc, MPS_INIT, "Reset already in progress\n");
return 0;
}
/* make sure the completion callbacks can recognize they're getting
* a NULL cm_reply due to a reset.
*/
sc->mps_flags |= MPS_FLAGS_DIAGRESET;
/*
* Mask interrupts here.
*/
mps_dprint(sc, MPS_INIT, "masking interrupts and resetting\n");
mps_mask_intr(sc);
error = mps_diag_reset(sc, CAN_SLEEP);
if (error != 0) {
/* XXXSL No need to panic here */
panic("%s hard reset failed with error %d\n",
__func__, error);
}
/* Restore the PCI state, including the MSI-X registers */
mps_pci_restore(sc);
/* Give the I/O subsystem special priority to get itself prepared */
mpssas_handle_reinit(sc);
/*
* Get IOC Facts and allocate all structures based on this information.
* The attach function will also call mps_iocfacts_allocate at startup.
* If relevant values have changed in IOC Facts, this function will free
* all of the memory based on IOC Facts and reallocate that memory.
*/
if ((error = mps_iocfacts_allocate(sc, FALSE)) != 0) {
panic("%s IOC Facts based allocation failed with error %d\n",
__func__, error);
}
/*
* Mapping structures will be re-allocated after getting IOC Page8, so
* free these structures here.
*/
mps_mapping_exit(sc);
/*
* The static page function currently read is IOC Page8. Others can be
* added in future. It's possible that the values in IOC Page8 have
* changed after a Diag Reset due to user modification, so always read
* these. Interrupts are masked, so unmask them before getting config
* pages.
*/
mps_unmask_intr(sc);
sc->mps_flags &= ~MPS_FLAGS_DIAGRESET;
mps_base_static_config_pages(sc);
/*
* Some mapping info is based in IOC Page8 data, so re-initialize the
* mapping tables.
*/
mps_mapping_initialize(sc);
/*
* Restart will reload the event masks clobbered by the reset, and
* then enable the port.
*/
mps_reregister_events(sc);
/* the end of discovery will release the simq, so we're done. */
mps_dprint(sc, MPS_INIT|MPS_XINFO, "Finished sc %p post %u free %u\n",
sc, sc->replypostindex, sc->replyfreeindex);
mpssas_release_simq_reinit(sassc);
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return 0;
}
/* Wait for the chip to ACK a word that we've put into its FIFO.
* Wait for up to <timeout> seconds.  Each loop iteration busy-waits for
* 500 microseconds (or sleeps for about 1 millisecond if sleeping is
* allowed), so the total is [ 0.5 * (2000 * <timeout>) ] milliseconds,
* i.e. <timeout> seconds.
*/
static int
mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag)
{
u32 cntdn, count;
u32 int_status;
u32 doorbell;
count = 0;
cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
do {
int_status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
mps_dprint(sc, MPS_TRACE,
"%s: successful count(%d), timeout(%d)\n",
__func__, count, timeout);
return 0;
} else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET);
if ((doorbell & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_FAULT) {
mps_dprint(sc, MPS_FAULT,
"fault_state(0x%04x)!\n", doorbell);
return (EFAULT);
}
} else if (int_status == 0xFFFFFFFF)
goto out;
/* If it can sleep, sleep for 1 millisecond, else busy loop for
* 0.5 millisecond */
if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP)
msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0,
"mpsdba", hz/1000);
else if (sleep_flag == CAN_SLEEP)
pause("mpsdba", hz/1000);
else
DELAY(500);
count++;
} while (--cntdn);
out:
mps_dprint(sc, MPS_FAULT, "%s: failed due to timeout count(%d), "
"int_status(%x)!\n", __func__, count, int_status);
return (ETIMEDOUT);
}
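/*
 * Worked example of the countdown above: with timeout = 5 and NO_SLEEP,
 * cntdn = 2000 * 5 = 10000 iterations of DELAY(500), i.e. 10000 * 0.5 msec
 * = 5 seconds.  With CAN_SLEEP, cntdn = 1000 * 5 = 5000 iterations of a
 * roughly 1 msec sleep, which is also about 5 seconds.
 */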
/* Wait for the chip to signal that the next word in its FIFO can be fetched */
static int
mps_wait_db_int(struct mps_softc *sc)
{
int retry;
for (retry = 0; retry < MPS_DB_MAX_WAIT; retry++) {
if ((mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
return (0);
DELAY(2000);
}
return (ETIMEDOUT);
}
/* Step through the synchronous command state machine, i.e. "Doorbell mode" */
static int
mps_request_sync(struct mps_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
int req_sz, int reply_sz, int timeout)
{
uint32_t *data32;
uint16_t *data16;
int i, count, ioc_sz, residual;
int sleep_flags = CAN_SLEEP;
if (curthread->td_no_sleeping != 0)
sleep_flags = NO_SLEEP;
/* Step 1 */
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/* Step 2 */
if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
return (EBUSY);
/* Step 3
* Announce that a message is coming through the doorbell. Messages
* are pushed as 32-bit words, so round up if needed.
*/
count = (req_sz + 3) / 4;
mps_regwrite(sc, MPI2_DOORBELL_OFFSET,
(MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
(count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));
/* Step 4 */
if (mps_wait_db_int(sc) ||
(mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
mps_dprint(sc, MPS_FAULT, "Doorbell failed to activate\n");
return (ENXIO);
}
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) {
mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed\n");
return (ENXIO);
}
/* Step 5 */
/* Clock out the message data synchronously in 32-bit dwords */
data32 = (uint32_t *)req;
for (i = 0; i < count; i++) {
mps_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) {
mps_dprint(sc, MPS_FAULT,
"Timeout while writing doorbell\n");
return (ENXIO);
}
}
/* Step 6 */
/* Clock in the reply in 16-bit words. The total length of the
* message is always in the 4th byte, so clock in the first 2 words
* manually, then loop the rest.
*/
data16 = (uint16_t *)reply;
if (mps_wait_db_int(sc) != 0) {
mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 0\n");
return (ENXIO);
}
data16[0] =
mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
if (mps_wait_db_int(sc) != 0) {
mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 1\n");
return (ENXIO);
}
data16[1] =
mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
/* Number of 32bit words in the message */
ioc_sz = reply->MsgLength;
/*
* Figure out how many 16bit words to clock in without overrunning.
* The precision loss with dividing reply_sz can safely be
* ignored because the messages can only be multiples of 32bits.
*/
residual = 0;
count = MIN((reply_sz / 4), ioc_sz) * 2;
if (count < ioc_sz * 2) {
residual = ioc_sz * 2 - count;
mps_dprint(sc, MPS_ERROR, "Driver error, throwing away %d "
"residual message words\n", residual);
}
for (i = 2; i < count; i++) {
if (mps_wait_db_int(sc) != 0) {
mps_dprint(sc, MPS_FAULT,
"Timeout reading doorbell %d\n", i);
return (ENXIO);
}
data16[i] = mps_regread(sc, MPI2_DOORBELL_OFFSET) &
MPI2_DOORBELL_DATA_MASK;
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
}
/*
* Pull out residual words that won't fit into the provided buffer.
* This keeps the chip from hanging due to a driver programming
* error.
*/
while (residual--) {
if (mps_wait_db_int(sc) != 0) {
mps_dprint(sc, MPS_FAULT,
"Timeout reading doorbell\n");
return (ENXIO);
}
(void)mps_regread(sc, MPI2_DOORBELL_OFFSET);
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
}
/* Step 7 */
if (mps_wait_db_int(sc) != 0) {
mps_dprint(sc, MPS_FAULT, "Timeout waiting to exit doorbell\n");
return (ENXIO);
}
if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
mps_dprint(sc, MPS_FAULT, "Warning, doorbell still active\n");
mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
return (0);
}
static void
mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm)
{
request_descriptor_t rd;
MPS_FUNCTRACE(sc);
mps_dprint(sc, MPS_TRACE, "SMID %u cm %p ccb %p\n",
cm->cm_desc.Default.SMID, cm, cm->cm_ccb);
if (sc->mps_flags & MPS_FLAGS_ATTACH_DONE && !(sc->mps_flags & MPS_FLAGS_SHUTDOWN))
mtx_assert(&sc->mps_mtx, MA_OWNED);
if (++sc->io_cmds_active > sc->io_cmds_highwater)
sc->io_cmds_highwater++;
rd.u.low = cm->cm_desc.Words.Low;
rd.u.high = cm->cm_desc.Words.High;
rd.word = htole64(rd.word);
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("command not busy\n"));
cm->cm_state = MPS_CM_STATE_INQUEUE;
/* TODO: We may need to make the register writes below atomic */
mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET,
rd.u.low);
mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET,
rd.u.high);
}
/*
* Just the FACTS, ma'am.
*/
static int
mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
{
MPI2_DEFAULT_REPLY *reply;
MPI2_IOC_FACTS_REQUEST request;
int error, req_sz, reply_sz;
MPS_FUNCTRACE(sc);
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
req_sz = sizeof(MPI2_IOC_FACTS_REQUEST);
reply_sz = sizeof(MPI2_IOC_FACTS_REPLY);
reply = (MPI2_DEFAULT_REPLY *)facts;
bzero(&request, req_sz);
request.Function = MPI2_FUNCTION_IOC_FACTS;
error = mps_request_sync(sc, &request, reply, req_sz, reply_sz, 5);
mps_dprint(sc, MPS_INIT, "%s exit error= %d\n", __func__, error);
return (error);
}
static int
mps_send_iocinit(struct mps_softc *sc)
{
MPI2_IOC_INIT_REQUEST init;
MPI2_DEFAULT_REPLY reply;
int req_sz, reply_sz, error;
struct timeval now;
uint64_t time_in_msec;
MPS_FUNCTRACE(sc);
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
/* Do a quick sanity check on proper initialization */
if ((sc->pqdepth == 0) || (sc->fqdepth == 0) || (sc->reqframesz == 0)
|| (sc->replyframesz == 0)) {
mps_dprint(sc, MPS_INIT|MPS_ERROR,
"Driver not fully initialized for IOCInit\n");
return (EINVAL);
}
req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
bzero(&init, req_sz);
bzero(&reply, reply_sz);
/*
* Fill in the init block. Note that most addresses are
* deliberately in the lower 32bits of memory. This is a micro-
* optimization for PCI/PCIX, though it's not clear if it helps PCIe.
*/
init.Function = MPI2_FUNCTION_IOC_INIT;
init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
init.MsgVersion = htole16(MPI2_VERSION);
init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
init.SystemRequestFrameSize = htole16((uint16_t)(sc->reqframesz / 4));
init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
init.SenseBufferAddressHigh = 0;
init.SystemReplyAddressHigh = 0;
init.SystemRequestFrameBaseAddress.High = 0;
init.SystemRequestFrameBaseAddress.Low = htole32((uint32_t)sc->req_busaddr);
init.ReplyDescriptorPostQueueAddress.High = 0;
init.ReplyDescriptorPostQueueAddress.Low = htole32((uint32_t)sc->post_busaddr);
init.ReplyFreeQueueAddress.High = 0;
init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
getmicrotime(&now);
time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000);
init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
error = mps_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
error = ENXIO;
mps_dprint(sc, MPS_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus);
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return (error);
}
void
mps_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
bus_addr_t *addr;
addr = arg;
*addr = segs[0].ds_addr;
}
void
mps_memaddr_wait_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mps_busdma_context *ctx;
int need_unload, need_free;
ctx = (struct mps_busdma_context *)arg;
need_unload = 0;
need_free = 0;
mps_lock(ctx->softc);
ctx->error = error;
ctx->completed = 1;
if ((error == 0) && (ctx->abandoned == 0)) {
*ctx->addr = segs[0].ds_addr;
} else {
if (nsegs != 0)
need_unload = 1;
if (ctx->abandoned != 0)
need_free = 1;
}
if (need_free == 0)
wakeup(ctx);
mps_unlock(ctx->softc);
if (need_unload != 0) {
bus_dmamap_unload(ctx->buffer_dmat,
ctx->buffer_dmamap);
*ctx->addr = 0;
}
if (need_free != 0)
free(ctx, M_MPSUSER);
}
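/*
 * Illustrative sketch of the caller side of mps_memaddr_wait_cb(): the
 * context is heap-allocated so that a caller which gives up waiting can
 * mark it abandoned and let the callback unload the map and free the
 * context when it eventually runs.  This is an example only; the buffer,
 * tag, map, and the wait/timeout policy shown here are assumptions, not
 * code taken from this driver.
 */
#if 0	/* fragment, example only */
    struct mps_busdma_context *ctx;
    bus_addr_t busaddr = 0;
    int error;

    ctx = malloc(sizeof(*ctx), M_MPSUSER, M_WAITOK | M_ZERO);
    ctx->softc = sc;
    ctx->addr = &busaddr;
    ctx->buffer_dmat = dmat;		/* assumed DMA tag */
    ctx->buffer_dmamap = map;		/* assumed DMA map */

    error = bus_dmamap_load(dmat, map, buf, len, mps_memaddr_wait_cb,
        ctx, 0);
    if (error == EINPROGRESS) {
        /* The load was deferred; wait for the callback. */
        mps_lock(sc);
        while (ctx->completed == 0) {
            if (msleep(ctx, &sc->mps_mtx, 0, "mpsload", hz) ==
                EWOULDBLOCK && ctx->completed == 0) {
                /* Give up; the callback cleans up later. */
                ctx->abandoned = 1;
                break;
            }
        }
        mps_unlock(sc);
    }
    if (ctx->abandoned == 0) {
        error = ctx->error;	/* busaddr is valid when this is 0 */
        free(ctx, M_MPSUSER);
    }
#endif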
static int
mps_alloc_queues(struct mps_softc *sc)
{
struct mps_queue *q;
u_int nq, i;
nq = sc->msi_msgs;
mps_dprint(sc, MPS_INIT|MPS_XINFO, "Allocating %d I/O queues\n", nq);
sc->queues = malloc(sizeof(struct mps_queue) * nq, M_MPT2,
M_NOWAIT|M_ZERO);
if (sc->queues == NULL)
return (ENOMEM);
for (i = 0; i < nq; i++) {
q = &sc->queues[i];
mps_dprint(sc, MPS_INIT, "Configuring queue %d %p\n", i, q);
q->sc = sc;
q->qnum = i;
}
return (0);
}
static int
mps_alloc_hw_queues(struct mps_softc *sc)
{
bus_dma_template_t t;
bus_addr_t queues_busaddr;
uint8_t *queues;
int qsize, fqsize, pqsize;
/*
* The reply free queue contains 4 byte entries in multiples of 16 and
* aligned on a 16 byte boundary. There must always be an unused entry.
* This queue supplies fresh reply frames for the firmware to use.
*
* The reply descriptor post queue contains 8 byte entries in
* multiples of 16 and aligned on a 16 byte boundary. This queue
* contains filled-in reply frames sent from the firmware to the host.
*
* These two queues are allocated together for simplicity.
*/
sc->fqdepth = roundup2(sc->num_replies + 1, 16);
sc->pqdepth = roundup2(sc->num_replies + 1, 16);
fqsize = sc->fqdepth * 4;
pqsize = sc->pqdepth * 8;
qsize = fqsize + pqsize;
bus_dma_template_init(&t, sc->mps_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(qsize),
BD_MAXSEGSIZE(qsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->queues_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate queues DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT,
&sc->queues_map)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate queues memory\n");
return (ENOMEM);
}
bzero(queues, qsize);
bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize,
mps_memaddr_cb, &queues_busaddr, 0);
sc->free_queue = (uint32_t *)queues;
sc->free_busaddr = queues_busaddr;
sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize);
sc->post_busaddr = queues_busaddr + fqsize;
mps_dprint(sc, MPS_INIT, "free queue busaddr= %#016jx size= %d\n",
(uintmax_t)sc->free_busaddr, fqsize);
mps_dprint(sc, MPS_INIT, "reply queue busaddr= %#016jx size= %d\n",
(uintmax_t)sc->post_busaddr, pqsize);
return (0);
}
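/*
 * Worked example of the queue sizing above, with illustrative numbers:
 * for num_replies = 1020, both depths become roundup2(1021, 16) = 1024,
 * so the reply free queue occupies 1024 * 4 = 4096 bytes and the reply
 * descriptor post queue occupies 1024 * 8 = 8192 bytes; they are carved
 * out of a single 12 KiB DMA allocation with the post queue starting at
 * offset fqsize.
 */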
static int
mps_alloc_replies(struct mps_softc *sc)
{
bus_dma_template_t t;
int rsize, num_replies;
/* Store the reply frame size in bytes rather than as 32bit words */
sc->replyframesz = sc->facts->ReplyFrameSize * 4;
/*
* sc->num_replies should be one less than sc->fqdepth. We need to
* allocate space for sc->fqdepth replies, but only sc->num_replies
* replies can be used at once.
*/
num_replies = max(sc->fqdepth, sc->num_replies);
rsize = sc->replyframesz * num_replies;
bus_dma_template_init(&t, sc->mps_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->reply_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate replies DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
BUS_DMA_NOWAIT, &sc->reply_map)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate replies memory\n");
return (ENOMEM);
}
bzero(sc->reply_frames, rsize);
bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize,
mps_memaddr_cb, &sc->reply_busaddr, 0);
mps_dprint(sc, MPS_INIT, "reply frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->reply_busaddr, rsize);
return (0);
}
static void
mps_load_chains_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mps_softc *sc = arg;
struct mps_chain *chain;
bus_size_t bo;
int i, o, s;
if (error != 0)
return;
for (i = 0, o = 0, s = 0; s < nsegs; s++) {
for (bo = 0; bo + sc->reqframesz <= segs[s].ds_len;
bo += sc->reqframesz) {
chain = &sc->chains[i++];
chain->chain = (MPI2_SGE_IO_UNION *)(sc->chain_frames + o);
chain->chain_busaddr = segs[s].ds_addr + bo;
o += sc->reqframesz;
mps_free_chain(sc, chain);
}
if (bo != segs[s].ds_len)
o += segs[s].ds_len - bo;
}
sc->chain_free_lowwater = i;
}
static int
mps_alloc_requests(struct mps_softc *sc)
{
bus_dma_template_t t;
struct mps_command *cm;
int i, rsize, nsegs;
rsize = sc->reqframesz * sc->num_reqs;
bus_dma_template_init(&t, sc->mps_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
if (bus_dma_template_tag(&t, &sc->req_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate request DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
BUS_DMA_NOWAIT, &sc->req_map)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate request memory\n");
return (ENOMEM);
}
bzero(sc->req_frames, rsize);
bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
mps_memaddr_cb, &sc->req_busaddr, 0);
mps_dprint(sc, MPS_INIT, "request frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->req_busaddr, rsize);
sc->chains = malloc(sizeof(struct mps_chain) * sc->num_chains, M_MPT2,
M_NOWAIT | M_ZERO);
if (!sc->chains) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate chain memory\n");
return (ENOMEM);
}
rsize = sc->reqframesz * sc->num_chains;
bus_dma_template_clone(&t, sc->req_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_MAXSIZE(rsize), BD_MAXSEGSIZE(rsize),
BD_NSEGMENTS(howmany(rsize, PAGE_SIZE)));
if (bus_dma_template_tag(&t, &sc->chain_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate chain DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
BUS_DMA_NOWAIT | BUS_DMA_ZERO, &sc->chain_map)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate chain memory\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames,
rsize, mps_load_chains_cb, sc, BUS_DMA_NOWAIT)) {
mps_dprint(sc, MPS_ERROR, "Cannot load chain memory\n");
bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
sc->chain_map);
return (ENOMEM);
}
rsize = MPS_SENSE_LEN * sc->num_reqs;
bus_dma_template_clone(&t, sc->req_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(1), BD_MAXSIZE(rsize),
BD_MAXSEGSIZE(rsize));
if (bus_dma_template_tag(&t, &sc->sense_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate sense DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
BUS_DMA_NOWAIT, &sc->sense_map)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate sense memory\n");
return (ENOMEM);
}
bzero(sc->sense_frames, rsize);
bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
mps_memaddr_cb, &sc->sense_busaddr, 0);
mps_dprint(sc, MPS_INIT, "sense frames busaddr= %#016jx size= %d\n",
(uintmax_t)sc->sense_busaddr, rsize);
nsegs = (sc->maxio / PAGE_SIZE) + 1;
bus_dma_template_init(&t, sc->mps_parent_dmat);
BUS_DMA_TEMPLATE_FILL(&t, BD_MAXSIZE(BUS_SPACE_MAXSIZE_32BIT),
BD_NSEGMENTS(nsegs), BD_MAXSEGSIZE(BUS_SPACE_MAXSIZE_24BIT),
BD_FLAGS(BUS_DMA_ALLOCNOW), BD_LOCKFUNC(busdma_lock_mutex),
BD_LOCKFUNCARG(&sc->mps_mtx));
if (bus_dma_template_tag(&t, &sc->buffer_dmat)) {
mps_dprint(sc, MPS_ERROR, "Cannot allocate buffer DMA tag\n");
return (ENOMEM);
}
/*
* SMID 0 cannot be used as a free command per the firmware spec.
* Just drop that command instead of risking accounting bugs.
*/
sc->commands = malloc(sizeof(struct mps_command) * sc->num_reqs,
M_MPT2, M_WAITOK | M_ZERO);
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
cm->cm_req = sc->req_frames + i * sc->reqframesz;
cm->cm_req_busaddr = sc->req_busaddr + i * sc->reqframesz;
cm->cm_sense = &sc->sense_frames[i];
cm->cm_sense_busaddr = sc->sense_busaddr + i * MPS_SENSE_LEN;
cm->cm_desc.Default.SMID = i;
cm->cm_sc = sc;
cm->cm_state = MPS_CM_STATE_BUSY;
TAILQ_INIT(&cm->cm_chain_list);
callout_init_mtx(&cm->cm_callout, &sc->mps_mtx, 0);
/* XXX Is a failure here a critical problem? */
if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap) == 0)
if (i <= sc->num_prireqs)
mps_free_high_priority_command(sc, cm);
else
mps_free_command(sc, cm);
else {
panic("failed to allocate command %d\n", i);
sc->num_reqs = i;
break;
}
}
return (0);
}
static int
mps_init_queues(struct mps_softc *sc)
{
int i;
memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
/*
* According to the spec, we need to use one less reply than we
* have space for on the queue. So sc->num_replies (the number we
* use) should be less than sc->fqdepth (allocated size).
*/
if (sc->num_replies >= sc->fqdepth)
return (EINVAL);
/*
* Initialize all of the free queue entries.
*/
for (i = 0; i < sc->fqdepth; i++)
sc->free_queue[i] = sc->reply_busaddr + (i * sc->replyframesz);
sc->replyfreeindex = sc->num_replies;
return (0);
}
/* Get the driver parameter tunables. Lowest priority are the driver defaults.
* Next are the global settings, if they exist. Highest are the per-unit
* settings, if they exist.
*/
void
mps_get_tunables(struct mps_softc *sc)
{
char tmpstr[80], mps_debug[80];
/* XXX default to some debugging for now */
sc->mps_debug = MPS_INFO|MPS_FAULT;
sc->disable_msix = 0;
sc->disable_msi = 0;
sc->max_msix = MPS_MSIX_MAX;
sc->max_chains = MPS_CHAIN_FRAMES;
sc->max_io_pages = MPS_MAXIO_PAGES;
sc->enable_ssu = MPS_SSU_ENABLE_SSD_DISABLE_HDD;
sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
sc->use_phynum = 1;
sc->max_reqframes = MPS_REQ_FRAMES;
sc->max_prireqframes = MPS_PRI_REQ_FRAMES;
sc->max_replyframes = MPS_REPLY_FRAMES;
sc->max_evtframes = MPS_EVT_REPLY_FRAMES;
/*
* Grab the global variables.
*/
bzero(mps_debug, 80);
if (TUNABLE_STR_FETCH("hw.mps.debug_level", mps_debug, 80) != 0)
mps_parse_debug(sc, mps_debug);
TUNABLE_INT_FETCH("hw.mps.disable_msix", &sc->disable_msix);
TUNABLE_INT_FETCH("hw.mps.disable_msi", &sc->disable_msi);
TUNABLE_INT_FETCH("hw.mps.max_msix", &sc->max_msix);
TUNABLE_INT_FETCH("hw.mps.max_chains", &sc->max_chains);
TUNABLE_INT_FETCH("hw.mps.max_io_pages", &sc->max_io_pages);
TUNABLE_INT_FETCH("hw.mps.enable_ssu", &sc->enable_ssu);
TUNABLE_INT_FETCH("hw.mps.spinup_wait_time", &sc->spinup_wait_time);
TUNABLE_INT_FETCH("hw.mps.use_phy_num", &sc->use_phynum);
TUNABLE_INT_FETCH("hw.mps.max_reqframes", &sc->max_reqframes);
TUNABLE_INT_FETCH("hw.mps.max_prireqframes", &sc->max_prireqframes);
TUNABLE_INT_FETCH("hw.mps.max_replyframes", &sc->max_replyframes);
TUNABLE_INT_FETCH("hw.mps.max_evtframes", &sc->max_evtframes);
/* Grab the unit-instance variables */
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.debug_level",
device_get_unit(sc->mps_dev));
bzero(mps_debug, 80);
if (TUNABLE_STR_FETCH(tmpstr, mps_debug, 80) != 0)
mps_parse_debug(sc, mps_debug);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msix",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msi",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_msix",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_msix);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_chains",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_io_pages",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.exclude_ids",
device_get_unit(sc->mps_dev));
TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.enable_ssu",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.spinup_wait_time",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.use_phy_num",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_reqframes",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_reqframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_prireqframes",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_prireqframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_replyframes",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_replyframes);
snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_evtframes",
device_get_unit(sc->mps_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->max_evtframes);
}
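/*
 * Illustrative sketch of the layering described above for a single
 * tunable: the driver default is overridden by a global hw.mps.* value,
 * which in turn is overridden by a per-unit dev.mps.<unit>.* value if one
 * is set.  max_chains is used as the example; no new knob is introduced.
 */
#if 0	/* fragment, example only */
    char tmpstr[80];

    sc->max_chains = MPS_CHAIN_FRAMES;				/* default */
    TUNABLE_INT_FETCH("hw.mps.max_chains", &sc->max_chains);	/* global */
    snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_chains",
        device_get_unit(sc->mps_dev));
    TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);			/* per-unit */
#endif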
static void
mps_setup_sysctl(struct mps_softc *sc)
{
struct sysctl_ctx_list *sysctl_ctx = NULL;
struct sysctl_oid *sysctl_tree = NULL;
char tmpstr[80], tmpstr2[80];
/*
* Setup the sysctl variable so the user can change the debug level
* on the fly.
*/
snprintf(tmpstr, sizeof(tmpstr), "MPS controller %d",
device_get_unit(sc->mps_dev));
snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mps_dev));
sysctl_ctx = device_get_sysctl_ctx(sc->mps_dev);
if (sysctl_ctx != NULL)
sysctl_tree = device_get_sysctl_tree(sc->mps_dev);
if (sysctl_tree == NULL) {
sysctl_ctx_init(&sc->sysctl_ctx);
sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw_mps), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr);
if (sc->sysctl_tree == NULL)
return;
sysctl_ctx = &sc->sysctl_ctx;
sysctl_tree = sc->sysctl_tree;
}
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "debug_level", CTLTYPE_STRING | CTLFLAG_RW |CTLFLAG_MPSAFE,
sc, 0, mps_debug_sysctl, "A", "mps debug level");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
"Disable the use of MSI-X interrupts");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "disable_msi", CTLFLAG_RD, &sc->disable_msi, 0,
"Disable the use of MSI interrupts");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_msix", CTLFLAG_RD, &sc->max_msix, 0,
"User-defined maximum number of MSIX queues");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "msix_msgs", CTLFLAG_RD, &sc->msi_msgs, 0,
"Negotiated number of MSIX queues");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_reqframes", CTLFLAG_RD, &sc->max_reqframes, 0,
"Total number of allocated request frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_prireqframes", CTLFLAG_RD, &sc->max_prireqframes, 0,
"Total number of allocated high priority request frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_replyframes", CTLFLAG_RD, &sc->max_replyframes, 0,
"Total number of allocated reply frames");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_evtframes", CTLFLAG_RD, &sc->max_evtframes, 0,
"Total number of event frames allocated");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version,
strlen(sc->fw_version), "firmware version");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "driver_version", CTLFLAG_RD, MPS_DRIVER_VERSION,
strlen(MPS_DRIVER_VERSION), "driver version");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "msg_version", CTLFLAG_RD, sc->msg_version,
strlen(sc->msg_version), "message interface version");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "io_cmds_active", CTLFLAG_RD,
&sc->io_cmds_active, 0, "number of currently active commands");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
&sc->io_cmds_highwater, 0, "maximum active commands seen");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_free", CTLFLAG_RD,
&sc->chain_free, 0, "number of free chain elements");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
&sc->chain_free_lowwater, 0,"lowest number of free chain elements");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_chains", CTLFLAG_RD,
&sc->max_chains, 0,"maximum chain frames that will be allocated");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "max_io_pages", CTLFLAG_RD,
&sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
"IOCFacts)");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
"enable SSU to SATA SSD/HDD at shutdown");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
&sc->chain_alloc_fail, "chain allocation failures");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
&sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
"spinup after SATA ID error");
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "mapping_table_dump",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
mps_mapping_dump, "A", "Mapping Table Dump");
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "encl_table_dump",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
mps_mapping_encl_dump, "A", "Enclosure Table Dump");
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "dump_reqs",
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT,
sc, 0, mps_dump_reqs, "I", "Dump Active Requests");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
"Use the phy number for enumeration");
}
static struct mps_debug_string {
char *name;
int flag;
} mps_debug_strings[] = {
{"info", MPS_INFO},
{"fault", MPS_FAULT},
{"event", MPS_EVENT},
{"log", MPS_LOG},
{"recovery", MPS_RECOVERY},
{"error", MPS_ERROR},
{"init", MPS_INIT},
{"xinfo", MPS_XINFO},
{"user", MPS_USER},
{"mapping", MPS_MAPPING},
{"trace", MPS_TRACE}
};
enum mps_debug_level_combiner {
COMB_NONE,
COMB_ADD,
COMB_SUB
};
static int
mps_debug_sysctl(SYSCTL_HANDLER_ARGS)
{
struct mps_softc *sc;
struct mps_debug_string *string;
struct sbuf *sbuf;
char *buffer;
size_t sz;
int i, len, debug, error;
sc = (struct mps_softc *)arg1;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
debug = sc->mps_debug;
sbuf_printf(sbuf, "%#x", debug);
sz = sizeof(mps_debug_strings) / sizeof(mps_debug_strings[0]);
for (i = 0; i < sz; i++) {
string = &mps_debug_strings[i];
if (debug & string->flag)
sbuf_printf(sbuf, ",%s", string->name);
}
error = sbuf_finish(sbuf);
sbuf_delete(sbuf);
if (error || req->newptr == NULL)
return (error);
len = req->newlen - req->newidx;
if (len == 0)
return (0);
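/* A new value was supplied; copy it in and fold it into the debug mask. */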
buffer = malloc(len, M_MPT2, M_ZERO|M_WAITOK);
error = SYSCTL_IN(req, buffer, len);
if (error == 0)
mps_parse_debug(sc, buffer);
free(buffer, M_MPT2);
return (error);
}
static void
mps_parse_debug(struct mps_softc *sc, char *list)
{
struct mps_debug_string *string;
enum mps_debug_level_combiner op;
char *token, *endtoken;
size_t sz;
int flags, i;
if (list == NULL || *list == '\0')
return;
if (*list == '+') {
op = COMB_ADD;
list++;
} else if (*list == '-') {
op = COMB_SUB;
list++;
} else
op = COMB_NONE;
if (*list == '\0')
return;
flags = 0;
sz = sizeof(mps_debug_strings) / sizeof(mps_debug_strings[0]);
while ((token = strsep(&list, ":,")) != NULL) {
/* Handle integer flags */
flags |= strtol(token, &endtoken, 0);
if (token != endtoken)
continue;
/* Handle text flags */
for (i = 0; i < sz; i++) {
string = &mps_debug_strings[i];
if (strcasecmp(token, string->name) == 0) {
flags |= string->flag;
break;
}
}
}
switch (op) {
case COMB_NONE:
sc->mps_debug = flags;
break;
case COMB_ADD:
sc->mps_debug |= flags;
break;
case COMB_SUB:
sc->mps_debug &= (~flags);
break;
}
return;
}
struct mps_dumpreq_hdr {
uint32_t smid;
uint32_t state;
uint32_t numframes;
uint32_t deschi;
uint32_t desclo;
};
static int
mps_dump_reqs(SYSCTL_HANDLER_ARGS)
{
struct mps_softc *sc;
struct mps_chain *chain, *chain1;
struct mps_command *cm;
struct mps_dumpreq_hdr hdr;
struct sbuf *sb;
uint32_t smid, state;
int i, numreqs, error = 0;
sc = (struct mps_softc *)arg1;
if ((error = priv_check(curthread, PRIV_DRIVER)) != 0) {
printf("priv check error %d\n", error);
return (error);
}
state = MPS_CM_STATE_INQUEUE;
smid = 1;
numreqs = sc->num_reqs;
if (req->newptr != NULL)
return (EINVAL);
if (smid == 0 || smid > sc->num_reqs)
return (EINVAL);
if (numreqs <= 0 || (numreqs + smid > sc->num_reqs))
numreqs = sc->num_reqs;
sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
/* Best effort, no locking */
for (i = smid; i < numreqs; i++) {
cm = &sc->commands[i];
if (cm->cm_state != state)
continue;
hdr.smid = i;
hdr.state = cm->cm_state;
hdr.numframes = 1;
hdr.deschi = cm->cm_desc.Words.High;
hdr.desclo = cm->cm_desc.Words.Low;
TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
chain1)
hdr.numframes++;
sbuf_bcat(sb, &hdr, sizeof(hdr));
sbuf_bcat(sb, cm->cm_req, 128);
TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
chain1)
sbuf_bcat(sb, chain->chain, 128);
}
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
int
mps_attach(struct mps_softc *sc)
{
int error;
MPS_FUNCTRACE(sc);
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
mtx_init(&sc->mps_mtx, "MPT2SAS lock", NULL, MTX_DEF);
callout_init_mtx(&sc->periodic, &sc->mps_mtx, 0);
callout_init_mtx(&sc->device_check_callout, &sc->mps_mtx, 0);
TAILQ_INIT(&sc->event_list);
timevalclear(&sc->lastfail);
if ((error = mps_transition_ready(sc)) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "failed to transition "
"ready\n");
return (error);
}
sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPT2,
M_ZERO|M_NOWAIT);
if(!sc->facts) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "Cannot allocate memory, "
"exit\n");
return (ENOMEM);
}
/*
* Get IOC Facts and allocate all structures based on this information.
* A Diag Reset will also call mps_iocfacts_allocate and re-read the IOC
* Facts. If relevant values have changed in IOC Facts, this function
* will free all of the memory based on IOC Facts and reallocate that
* memory. If this fails, any allocated memory should already be freed.
*/
if ((error = mps_iocfacts_allocate(sc, TRUE)) != 0) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC Facts based allocation "
"failed with error %d, exit\n", error);
return (error);
}
/* Start the periodic watchdog check on the IOC Doorbell */
mps_periodic(sc);
/*
* The portenable will kick off discovery events that will drive the
* rest of the initialization process. The CAM/SAS module will
* hold up the boot sequence until discovery is complete.
*/
sc->mps_ich.ich_func = mps_startup;
sc->mps_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mps_ich) != 0) {
mps_dprint(sc, MPS_INIT|MPS_ERROR,
"Cannot establish MPS config hook\n");
error = EINVAL;
}
/*
* Allow IR to shutdown gracefully when shutdown occurs.
*/
sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
mpssas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
if (sc->shutdown_eh == NULL)
mps_dprint(sc, MPS_INIT|MPS_ERROR,
"shutdown event registration failed\n");
mps_setup_sysctl(sc);
sc->mps_flags |= MPS_FLAGS_ATTACH_DONE;
mps_dprint(sc, MPS_INIT, "%s exit error= %d\n", __func__, error);
return (error);
}
/* Run through any late-start handlers. */
static void
mps_startup(void *arg)
{
struct mps_softc *sc;
sc = (struct mps_softc *)arg;
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
mps_lock(sc);
mps_unmask_intr(sc);
/* initialize device mapping tables */
mps_base_static_config_pages(sc);
mps_mapping_initialize(sc);
mpssas_startup(sc);
mps_unlock(sc);
mps_dprint(sc, MPS_INIT, "disestablish config intrhook\n");
config_intrhook_disestablish(&sc->mps_ich);
sc->mps_ich.ich_arg = NULL;
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
}
/* Periodic watchdog. Is called with the driver lock already held. */
static void
mps_periodic(void *arg)
{
struct mps_softc *sc;
uint32_t db;
sc = (struct mps_softc *)arg;
if (sc->mps_flags & MPS_FLAGS_SHUTDOWN)
return;
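/*
 * Read the doorbell register and reinitialize the IOC if it reports a
 * fault state, then reschedule the next check.
 */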
db = mps_regread(sc, MPI2_DOORBELL_OFFSET);
if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
mps_dprint(sc, MPS_FAULT, "IOC Fault 0x%08x, Resetting\n", db);
mps_reinit(sc);
}
callout_reset(&sc->periodic, MPS_PERIODIC_DELAY * hz, mps_periodic, sc);
}
static void
mps_log_evt_handler(struct mps_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *event)
{
MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
MPS_DPRINT_EVENT(sc, generic, event);
switch (event->Event) {
case MPI2_EVENT_LOG_DATA:
mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_DATA:\n");
if (sc->mps_debug & MPS_EVENT)
hexdump(event->EventData, event->EventDataLength, NULL, 0);
break;
case MPI2_EVENT_LOG_ENTRY_ADDED:
entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
"0x%x Sequence %d:\n", entry->LogEntryQualifier,
entry->LogSequence);
break;
default:
break;
}
return;
}
static int
mps_attach_log(struct mps_softc *sc)
{
u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS];
bzero(events, sizeof(events));
setbit(events, MPI2_EVENT_LOG_DATA);
setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
mps_register_events(sc, events, mps_log_evt_handler, NULL,
&sc->mps_log_eh);
return (0);
}
static int
mps_detach_log(struct mps_softc *sc)
{
if (sc->mps_log_eh != NULL)
mps_deregister_events(sc, sc->mps_log_eh);
return (0);
}
/*
* Free all of the driver resources and detach submodules. Should be called
* without the lock held.
*/
int
mps_free(struct mps_softc *sc)
{
int error;
mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
/* Turn off the watchdog */
mps_lock(sc);
sc->mps_flags |= MPS_FLAGS_SHUTDOWN;
mps_unlock(sc);
/* Lock must not be held for this */
callout_drain(&sc->periodic);
callout_drain(&sc->device_check_callout);
if (((error = mps_detach_log(sc)) != 0) ||
((error = mps_detach_sas(sc)) != 0)) {
mps_dprint(sc, MPS_INIT|MPS_FAULT, "failed to detach "
"subsystems, exit\n");
return (error);
}
mps_detach_user(sc);
/* Put the IOC back in the READY state. */
mps_lock(sc);
if ((error = mps_transition_ready(sc)) != 0) {
mps_unlock(sc);
return (error);
}
mps_unlock(sc);
if (sc->facts != NULL)
free(sc->facts, M_MPT2);
/*
* Free all buffers that are based on IOC Facts. A Diag Reset may need
* to free these buffers too.
*/
mps_iocfacts_free(sc);
if (sc->sysctl_tree != NULL)
sysctl_ctx_free(&sc->sysctl_ctx);
/* Deregister the shutdown function */
if (sc->shutdown_eh != NULL)
EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
mtx_destroy(&sc->mps_mtx);
mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
return (0);
}
static __inline void
mps_complete_command(struct mps_softc *sc, struct mps_command *cm)
{
MPS_FUNCTRACE(sc);
if (cm == NULL) {
mps_dprint(sc, MPS_ERROR, "Completing NULL command\n");
return;
}
if (cm->cm_flags & MPS_CM_FLAGS_POLLED)
cm->cm_flags |= MPS_CM_FLAGS_COMPLETE;
if (cm->cm_complete != NULL) {
mps_dprint(sc, MPS_TRACE,
"%s cm %p calling cm_complete %p data %p reply %p\n",
__func__, cm, cm->cm_complete, cm->cm_complete_data,
cm->cm_reply);
cm->cm_complete(sc, cm);
}
if (cm->cm_flags & MPS_CM_FLAGS_WAKEUP) {
mps_dprint(sc, MPS_TRACE, "waking up %p\n", cm);
wakeup(cm);
}
if (cm->cm_sc->io_cmds_active != 0) {
cm->cm_sc->io_cmds_active--;
} else {
mps_dprint(sc, MPS_ERROR, "Warning: io_cmds_active is "
"out of sync - resynching to 0\n");
}
}
static void
mps_sas_log_info(struct mps_softc *sc , u32 log_info)
{
union loginfo_type {
u32 loginfo;
struct {
u32 subcode:16;
u32 code:8;
u32 originator:4;
u32 bus_type:4;
} dw;
};
union loginfo_type sas_loginfo;
char *originator_str = NULL;
sas_loginfo.loginfo = log_info;
if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
return;
/* each nexus loss loginfo */
if (log_info == 0x31170000)
return;
/* eat the loginfos associated with task aborts */
if ((log_info == 0x30050000 || log_info ==
0x31140000 || log_info == 0x31130000))
return;
switch (sas_loginfo.dw.originator) {
case 0:
originator_str = "IOP";
break;
case 1:
originator_str = "PL";
break;
case 2:
originator_str = "IR";
break;
}
mps_dprint(sc, MPS_LOG, "log_info(0x%08x): originator(%s), "
"code(0x%02x), sub_code(0x%04x)\n", log_info,
originator_str, sas_loginfo.dw.code,
sas_loginfo.dw.subcode);
}
static void
mps_display_reply_info(struct mps_softc *sc, uint8_t *reply)
{
MPI2DefaultReply_t *mpi_reply;
u16 sc_status;
mpi_reply = (MPI2DefaultReply_t*)reply;
sc_status = le16toh(mpi_reply->IOCStatus);
if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
mps_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo));
}
void
mps_intr(void *data)
{
struct mps_softc *sc;
uint32_t status;
sc = (struct mps_softc *)data;
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
/*
* Check interrupt status register to flush the bus. This is
* needed for both INTx interrupts and driver-driven polling.
*/
status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0)
return;
mps_lock(sc);
mps_intr_locked(data);
mps_unlock(sc);
return;
}
/*
* In theory, MSI/MSIX interrupts shouldn't need to read any registers on the
* chip. Hopefully this theory is correct.
*/
void
mps_intr_msi(void *data)
{
struct mps_softc *sc;
sc = (struct mps_softc *)data;
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
mps_lock(sc);
mps_intr_locked(data);
mps_unlock(sc);
return;
}
/*
* The locking is overly broad and simplistic, but easy to deal with for now.
*/
void
mps_intr_locked(void *data)
{
MPI2_REPLY_DESCRIPTORS_UNION *desc;
MPI2_DIAG_RELEASE_REPLY *rel_rep;
mps_fw_diagnostic_buffer_t *pBuffer;
struct mps_softc *sc;
struct mps_command *cm = NULL;
uint64_t tdesc;
uint8_t flags;
u_int pq;
sc = (struct mps_softc *)data;
pq = sc->replypostindex;
mps_dprint(sc, MPS_TRACE,
"%s sc %p starting with replypostindex %u\n",
__func__, sc, sc->replypostindex);
for ( ;; ) {
cm = NULL;
desc = &sc->post_queue[sc->replypostindex];
/*
* Copy and clear out the descriptor so that any reentry will
* immediately know that this descriptor has already been
* looked at. There is unfortunate casting magic because the
* MPI API doesn't have a cardinal 64bit type.
*/
tdesc = 0xffffffffffffffff;
tdesc = atomic_swap_64((uint64_t *)desc, tdesc);
desc = (MPI2_REPLY_DESCRIPTORS_UNION *)&tdesc;
flags = desc->Default.ReplyFlags &
MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED)
|| (le32toh(desc->Words.High) == 0xffffffff))
break;
/* increment the replypostindex now, so that event handlers
* and cm completion handlers which decide to do a diag
* reset can zero it without it getting incremented again
* afterwards, and we break out of this loop on the next
* iteration since the reply post queue has been cleared to
* 0xFF and all descriptors look unused (which they are).
*/
if (++sc->replypostindex >= sc->pqdepth)
sc->replypostindex = 0;
switch (flags) {
case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
KASSERT(cm->cm_state == MPS_CM_STATE_INQUEUE,
("command not inqueue\n"));
cm->cm_state = MPS_CM_STATE_BUSY;
cm->cm_reply = NULL;
break;
case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
{
uint32_t baddr;
uint8_t *reply;
/*
* Re-compose the reply address from the address
* sent back from the chip. The ReplyFrameAddress
* is the lower 32 bits of the physical address of
* particular reply frame. Convert that address to
* host format, and then use that to provide the
* offset against the virtual address base
* (sc->reply_frames).
*/
baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
reply = sc->reply_frames +
(baddr - ((uint32_t)sc->reply_busaddr));
/*
* Make sure the reply we got back is in a valid
* range. If not, go ahead and panic here, since
* we'll probably panic as soon as we dereference the
* reply pointer anyway.
*/
if ((reply < sc->reply_frames)
|| (reply > (sc->reply_frames +
(sc->fqdepth * sc->replyframesz)))) {
printf("%s: WARNING: reply %p out of range!\n",
__func__, reply);
printf("%s: reply_frames %p, fqdepth %d, "
"frame size %d\n", __func__,
sc->reply_frames, sc->fqdepth,
sc->replyframesz);
printf("%s: baddr %#x,\n", __func__, baddr);
/* LSI-TODO. See Linux Code for Graceful exit */
panic("Reply address out of range");
}
if (le16toh(desc->AddressReply.SMID) == 0) {
if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
MPI2_FUNCTION_DIAG_BUFFER_POST) {
/*
* If SMID is 0 for Diag Buffer Post,
* this implies that the reply is due to
* a release function with a status that
* the buffer has been released. Set
* the buffer flags accordingly.
*/
rel_rep =
(MPI2_DIAG_RELEASE_REPLY *)reply;
if ((le16toh(rel_rep->IOCStatus) &
MPI2_IOCSTATUS_MASK) ==
MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
{
pBuffer =
&sc->fw_diag_buffer_list[
rel_rep->BufferType];
pBuffer->valid_data = TRUE;
pBuffer->owned_by_firmware =
FALSE;
pBuffer->immediate = FALSE;
}
} else
mps_dispatch_event(sc, baddr,
(MPI2_EVENT_NOTIFICATION_REPLY *)
reply);
} else {
/*
* Ignore commands not in INQUEUE state
* since they've already been completed
* via another path.
*/
cm = &sc->commands[
le16toh(desc->AddressReply.SMID)];
if (cm->cm_state == MPS_CM_STATE_INQUEUE) {
cm->cm_state = MPS_CM_STATE_BUSY;
cm->cm_reply = reply;
cm->cm_reply_data = le32toh(
desc->AddressReply.ReplyFrameAddress);
} else {
mps_dprint(sc, MPS_RECOVERY,
"Bad state for ADDRESS_REPLY status,"
" ignoring state %d cm %p\n",
cm->cm_state, cm);
}
}
break;
}
case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
default:
/* Unhandled */
mps_dprint(sc, MPS_ERROR, "Unhandled reply 0x%x\n",
desc->Default.ReplyFlags);
cm = NULL;
break;
}
if (cm != NULL) {
// Print Error reply frame
if (cm->cm_reply)
mps_display_reply_info(sc,cm->cm_reply);
mps_complete_command(sc, cm);
}
}
if (pq != sc->replypostindex) {
mps_dprint(sc, MPS_TRACE, "%s sc %p writing postindex %d\n",
__func__, sc, sc->replypostindex);
mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET,
sc->replypostindex);
}
return;
}
static void
mps_dispatch_event(struct mps_softc *sc, uintptr_t data,
MPI2_EVENT_NOTIFICATION_REPLY *reply)
{
struct mps_event_handle *eh;
int event, handled = 0;
event = le16toh(reply->Event);
TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
if (isset(eh->mask, event)) {
eh->callback(sc, data, reply);
handled++;
}
}
if (handled == 0)
mps_dprint(sc, MPS_EVENT, "Unhandled event 0x%x\n", le16toh(event));
/*
* This is the only place that the event/reply should be freed.
* Anything wanting to hold onto the event data should have
* already copied it into their own storage.
*/
mps_free_reply(sc, data);
}
static void
mps_reregister_events_complete(struct mps_softc *sc, struct mps_command *cm)
{
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
if (cm->cm_reply)
MPS_DPRINT_EVENT(sc, generic,
(MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
mps_free_command(sc, cm);
/* next, send a port enable */
mpssas_startup(sc);
}
/*
* For both register_events and update_events, the caller supplies a bitmap
* of events that it _wants_. These functions then turn that into a bitmask
* suitable for the controller.
*/
int
mps_register_events(struct mps_softc *sc, u32 *mask,
mps_evt_callback_t *cb, void *data, struct mps_event_handle **handle)
{
struct mps_event_handle *eh;
int error = 0;
eh = malloc(sizeof(struct mps_event_handle), M_MPT2, M_WAITOK|M_ZERO);
eh->callback = cb;
eh->data = data;
TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
if (mask != NULL)
error = mps_update_events(sc, eh, mask);
*handle = eh;
return (error);
}
int
mps_update_events(struct mps_softc *sc, struct mps_event_handle *handle,
u32 *mask)
{
MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
struct mps_command *cm;
int error, i;
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
if ((mask != NULL) && (handle != NULL))
bcopy(mask, &handle->mask[0], sizeof(u32) *
MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
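/*
 * Start with every event masked, then clear the mask bits for the
 * events this handler wants so the IOC will deliver them.
 */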
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
sc->event_mask[i] = -1;
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
sc->event_mask[i] &= ~handle->mask[i];
if ((cm = mps_alloc_command(sc)) == NULL)
return (EBUSY);
evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
evtreq->MsgFlags = 0;
evtreq->SASBroadcastPrimitiveMasks = 0;
#ifdef MPS_DEBUG_ALL_EVENTS
{
u_char fullmask[16];
memset(fullmask, 0x00, 16);
bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) *
MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
}
#else
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
evtreq->EventMasks[i] =
htole32(sc->event_mask[i]);
#endif
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_data = NULL;
error = mps_wait_command(sc, &cm, 60, 0);
if (cm != NULL)
reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
if ((reply == NULL) ||
(reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
error = ENXIO;
if (reply)
MPS_DPRINT_EVENT(sc, generic, reply);
mps_dprint(sc, MPS_TRACE, "%s finished error %d\n", __func__, error);
if (cm != NULL)
mps_free_command(sc, cm);
return (error);
}
static int
mps_reregister_events(struct mps_softc *sc)
{
MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
struct mps_command *cm;
struct mps_event_handle *eh;
int error, i;
mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
/* first, reregister events */
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
sc->event_mask[i] = -1;
TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
sc->event_mask[i] &= ~eh->mask[i];
}
if ((cm = mps_alloc_command(sc)) == NULL)
return (EBUSY);
evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
evtreq->MsgFlags = 0;
evtreq->SASBroadcastPrimitiveMasks = 0;
#ifdef MPS_DEBUG_ALL_EVENTS
{
u_char fullmask[16];
memset(fullmask, 0x00, 16);
bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) *
MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
}
#else
for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
evtreq->EventMasks[i] =
htole32(sc->event_mask[i]);
#endif
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_data = NULL;
cm->cm_complete = mps_reregister_events_complete;
error = mps_map_command(sc, cm);
mps_dprint(sc, MPS_TRACE, "%s finished with error %d\n", __func__,
error);
return (error);
}
void
mps_deregister_events(struct mps_softc *sc, struct mps_event_handle *handle)
{
TAILQ_REMOVE(&sc->event_list, handle, eh_list);
free(handle, M_MPT2);
}
/*
* Add a chain element as the next SGE for the specified command.
* Reset cm_sge and cm_sgesize to indicate all the available space.
*/
static int
mps_add_chain(struct mps_command *cm)
{
MPI2_SGE_CHAIN32 *sgc;
struct mps_chain *chain;
u_int space;
if (cm->cm_sglsize < MPS_SGC_SIZE)
panic("MPS: Need SGE Error Code\n");
chain = mps_alloc_chain(cm->cm_sc);
if (chain == NULL)
return (ENOBUFS);
space = cm->cm_sc->reqframesz;
/*
* Note: a double-linked list is used to make it easier to
* walk for debugging.
*/
TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
sgc = (MPI2_SGE_CHAIN32 *)&cm->cm_sge->MpiChain;
sgc->Length = htole16(space);
sgc->NextChainOffset = 0;
/* TODO: This looks like a bug in setting sgc->Flags; one might expect
 * sgc->Flags = (MPI2_SGE_FLAGS_CHAIN_ELEMENT | MPI2_SGE_FLAGS_64_BIT_ADDRESSING |
 * MPI2_SGE_FLAGS_SYSTEM_ADDRESS) << MPI2_SGE_FLAGS_SHIFT.
 * This is fine, however, because we are not using a simple element. In the
 * case of MPI2_SGE_CHAIN32, Length and Flags are separate fields.
 */
sgc->Flags = MPI2_SGE_FLAGS_CHAIN_ELEMENT;
sgc->Address = htole32(chain->chain_busaddr);
cm->cm_sge = (MPI2_SGE_IO_UNION *)&chain->chain->MpiSimple;
cm->cm_sglsize = space;
return (0);
}
/*
* Add one scatter-gather element (chain, simple, transaction context)
* to the scatter-gather list for a command. Maintain cm_sglsize and
* cm_sge as the remaining size and pointer to the next SGE to fill
* in, respectively.
*/
int
mps_push_sge(struct mps_command *cm, void *sgep, size_t len, int segsleft)
{
MPI2_SGE_TRANSACTION_UNION *tc = sgep;
MPI2_SGE_SIMPLE64 *sge = sgep;
int error, type;
uint32_t saved_buf_len, saved_address_low, saved_address_high;
type = (tc->Flags & MPI2_SGE_FLAGS_ELEMENT_MASK);
#ifdef INVARIANTS
switch (type) {
case MPI2_SGE_FLAGS_TRANSACTION_ELEMENT: {
if (len != tc->DetailsLength + 4)
panic("TC %p length %u or %zu?", tc,
tc->DetailsLength + 4, len);
}
break;
case MPI2_SGE_FLAGS_CHAIN_ELEMENT:
/* Driver only uses 32-bit chain elements */
if (len != MPS_SGC_SIZE)
panic("CHAIN %p length %u or %zu?", sgep,
MPS_SGC_SIZE, len);
break;
case MPI2_SGE_FLAGS_SIMPLE_ELEMENT:
/* Driver only uses 64-bit SGE simple elements */
if (len != MPS_SGE64_SIZE)
panic("SGE simple %p length %u or %zu?", sge,
MPS_SGE64_SIZE, len);
if (((le32toh(sge->FlagsLength) >> MPI2_SGE_FLAGS_SHIFT) &
MPI2_SGE_FLAGS_ADDRESS_SIZE) == 0)
panic("SGE simple %p not marked 64-bit?", sge);
break;
default:
panic("Unexpected SGE %p, flags %02x", tc, tc->Flags);
}
#endif
/*
* case 1: 1 more segment, enough room for it
* case 2: 2 more segments, enough room for both
* case 3: >=2 more segments, only enough room for 1 and a chain
* case 4: >=1 more segment, enough room for only a chain
* case 5: >=1 more segment, no room for anything (error)
*/
/*
* There should be room for at least a chain element, or this
* code is buggy. Case (5).
*/
if (cm->cm_sglsize < MPS_SGC_SIZE)
panic("MPS: Need SGE Error Code\n");
if (segsleft >= 1 && cm->cm_sglsize < len + MPS_SGC_SIZE) {
/*
* 1 or more segment, enough room for only a chain.
* Hope the previous element wasn't a Simple entry
* that needed to be marked with
* MPI2_SGE_FLAGS_LAST_ELEMENT. Case (4).
*/
if ((error = mps_add_chain(cm)) != 0)
return (error);
}
if (segsleft >= 2 &&
cm->cm_sglsize < len + MPS_SGC_SIZE + MPS_SGE64_SIZE) {
/*
* There are 2 or more segments left to add, and only
* enough room for 1 and a chain. Case (3).
*
* Mark as last element in this chain if necessary.
*/
if (type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) {
sge->FlagsLength |= htole32(
MPI2_SGE_FLAGS_LAST_ELEMENT << MPI2_SGE_FLAGS_SHIFT);
}
/*
* Add the item then a chain. Do the chain now,
* rather than on the next iteration, to simplify
* understanding the code.
*/
cm->cm_sglsize -= len;
bcopy(sgep, cm->cm_sge, len);
cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
return (mps_add_chain(cm));
}
#ifdef INVARIANTS
/* Case 1: 1 more segment, enough room for it. */
if (segsleft == 1 && cm->cm_sglsize < len)
panic("1 seg left and no room? %u versus %zu",
cm->cm_sglsize, len);
/* Case 2: 2 more segments, enough room for both */
if (segsleft == 2 && cm->cm_sglsize < len + MPS_SGE64_SIZE)
panic("2 segs left and no room? %u versus %zu",
cm->cm_sglsize, len);
#endif
if (segsleft == 1 && type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) {
/*
* If this is a bi-directional request, we need to account for that
* here. Save the pre-filled sge values. These will be used
* either for the 2nd SGL or for a single direction SGL. If
* cm_out_len is non-zero, this is a bi-directional request, so
* fill in the OUT SGL first, then the IN SGL, otherwise just
* fill in the IN SGL. Note that at this time, when filling in
* 2 SGL's for a bi-directional request, they both use the same
* DMA buffer (same cm command).
*/
saved_buf_len = le32toh(sge->FlagsLength) & 0x00FFFFFF;
saved_address_low = sge->Address.Low;
saved_address_high = sge->Address.High;
if (cm->cm_out_len) {
sge->FlagsLength = htole32(cm->cm_out_len |
((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_END_OF_BUFFER |
MPI2_SGE_FLAGS_HOST_TO_IOC |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
MPI2_SGE_FLAGS_SHIFT));
cm->cm_sglsize -= len;
bcopy(sgep, cm->cm_sge, len);
cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge
+ len);
}
saved_buf_len |=
((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_END_OF_BUFFER |
MPI2_SGE_FLAGS_LAST_ELEMENT |
MPI2_SGE_FLAGS_END_OF_LIST |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
MPI2_SGE_FLAGS_SHIFT);
if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) {
saved_buf_len |=
((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
MPI2_SGE_FLAGS_SHIFT);
} else {
saved_buf_len |=
((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
MPI2_SGE_FLAGS_SHIFT);
}
sge->FlagsLength = htole32(saved_buf_len);
sge->Address.Low = saved_address_low;
sge->Address.High = saved_address_high;
}
cm->cm_sglsize -= len;
bcopy(sgep, cm->cm_sge, len);
cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
return (0);
}
/*
* Add one dma segment to the scatter-gather list for a command.
*/
int
mps_add_dmaseg(struct mps_command *cm, vm_paddr_t pa, size_t len, u_int flags,
int segsleft)
{
MPI2_SGE_SIMPLE64 sge;
/*
* This driver always uses 64-bit address elements for simplicity.
*/
bzero(&sge, sizeof(sge));
flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
sge.FlagsLength = htole32(len | (flags << MPI2_SGE_FLAGS_SHIFT));
mps_from_u64(pa, &sge.Address);
return (mps_push_sge(cm, &sge, sizeof sge, segsleft));
}
static void
mps_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mps_softc *sc;
struct mps_command *cm;
u_int i, dir, sflags;
cm = (struct mps_command *)arg;
sc = cm->cm_sc;
/*
* In this case, just print out a warning and let the chip tell the
* user they did the wrong thing.
*/
if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
mps_dprint(sc, MPS_ERROR,
"%s: warning: busdma returned %d segments, "
"more than the %d allowed\n", __func__, nsegs,
cm->cm_max_segs);
}
/*
* Set up DMA direction flags. Bi-directional requests are also handled
* here. In that case, both direction flags will be set.
*/
sflags = 0;
if (cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) {
/*
* We have to add a special case for SMP passthrough, there
* is no easy way to generically handle it. The first
* S/G element is used for the command (therefore the
* direction bit needs to be set). The second one is used
* for the reply. We'll leave it to the caller to make
* sure we only have two buffers.
*/
/*
* Even though the busdma man page says it doesn't make
* sense to have both direction flags, it does in this case.
* We have one s/g element being accessed in each direction.
*/
dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
/*
* Set the direction flag on the first buffer in the SMP
* passthrough request. We'll clear it for the second one.
*/
sflags |= MPI2_SGE_FLAGS_DIRECTION |
MPI2_SGE_FLAGS_END_OF_BUFFER;
} else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) {
sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
dir = BUS_DMASYNC_PREWRITE;
} else
dir = BUS_DMASYNC_PREREAD;
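/*
 * Push one SGE per busdma segment. For SMP passthrough, the direction
 * flag set above applies only to the first (request) buffer and is
 * cleared for the second (response) buffer.
 */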
for (i = 0; i < nsegs; i++) {
if ((cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) && (i != 0)) {
sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
}
error = mps_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
sflags, nsegs - i);
if (error != 0) {
/* Resource shortage, roll back! */
if (ratecheck(&sc->lastfail, &mps_chainfail_interval))
mps_dprint(sc, MPS_INFO, "Out of chain frames, "
"consider increasing hw.mps.max_chains.\n");
cm->cm_flags |= MPS_CM_FLAGS_CHAIN_FAILED;
mps_complete_command(sc, cm);
return;
}
}
bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
mps_enqueue_request(sc, cm);
return;
}
static void
mps_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
int error)
{
mps_data_cb(arg, segs, nsegs, error);
}
/*
* This is the routine to enqueue commands asynchronously.
* Note that the only error path here is from bus_dmamap_load(), which can
* return EINPROGRESS if it is waiting for resources. Other than this, it's
* assumed that if you have a command in-hand, then you have enough credits
* to use it.
*/
int
mps_map_command(struct mps_softc *sc, struct mps_command *cm)
{
int error = 0;
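/*
 * Choose the busdma load path based on how the data is described: a
 * struct uio, a CAM CCB, or a plain kernel buffer. Commands with no
 * data get a zero-length SGE (if needed) and are enqueued directly.
 */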
if (cm->cm_flags & MPS_CM_FLAGS_USE_UIO) {
error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
&cm->cm_uio, mps_data_cb2, cm, 0);
} else if (cm->cm_flags & MPS_CM_FLAGS_USE_CCB) {
error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
cm->cm_data, mps_data_cb, cm, 0);
} else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) {
error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
cm->cm_data, cm->cm_length, mps_data_cb, cm, 0);
} else {
/* Add a zero-length element as needed */
if (cm->cm_sge != NULL)
mps_add_dmaseg(cm, 0, 0, 0, 1);
mps_enqueue_request(sc, cm);
}
return (error);
}
/*
* This is the routine to enqueue commands synchronously. An error of
* EINPROGRESS from mps_map_command() is ignored since the command will
* be executed and enqueued automatically. Other errors come from msleep().
*/
int
mps_wait_command(struct mps_softc *sc, struct mps_command **cmp, int timeout,
int sleep_flag)
{
int error, rc;
struct timeval cur_time, start_time;
struct mps_command *cm = *cmp;
if (sc->mps_flags & MPS_FLAGS_DIAGRESET)
return EBUSY;
cm->cm_complete = NULL;
cm->cm_flags |= MPS_CM_FLAGS_POLLED;
error = mps_map_command(sc, cm);
if ((error != 0) && (error != EINPROGRESS))
return (error);
/*
* Check the thread context and wait 50 msec at a time until the timeout
* has expired or the command has finished. If msleep can't be used,
* poll instead.
*/
if (curthread->td_no_sleeping != 0)
sleep_flag = NO_SLEEP;
getmicrouptime(&start_time);
if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) {
cm->cm_flags |= MPS_CM_FLAGS_WAKEUP;
error = msleep(cm, &sc->mps_mtx, 0, "mpswait", timeout*hz);
if (error == EWOULDBLOCK) {
/*
* Record the actual elapsed time in the case of a
* timeout for the message below.
*/
getmicrouptime(&cur_time);
timevalsub(&cur_time, &start_time);
}
} else {
while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) {
mps_intr_locked(sc);
if (sleep_flag == CAN_SLEEP)
pause("mpswait", hz/20);
else
DELAY(50000);
getmicrouptime(&cur_time);
timevalsub(&cur_time, &start_time);
if (cur_time.tv_sec > timeout) {
error = EWOULDBLOCK;
break;
}
}
}
if (error == EWOULDBLOCK) {
if (cm->cm_timeout_handler == NULL) {
mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s, timeout=%d,"
" elapsed=%jd\n", __func__, timeout,
(intmax_t)cur_time.tv_sec);
rc = mps_reinit(sc);
mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
"failed");
} else
cm->cm_timeout_handler(sc, cm);
if (sc->mps_flags & MPS_FLAGS_REALLOCATED) {
/*
* Tell the caller that we freed the command in a
* reinit.
*/
*cmp = NULL;
}
error = ETIMEDOUT;
}
return (error);
}
/*
* The MPT driver had a verbose interface for config pages. In this driver,
* reduce it to much simpler terms, similar to the Linux driver.
*/
int
mps_read_config_page(struct mps_softc *sc, struct mps_config_params *params)
{
MPI2_CONFIG_REQUEST *req;
struct mps_command *cm;
int error;
if (sc->mps_flags & MPS_FLAGS_BUSY) {
return (EBUSY);
}
cm = mps_alloc_command(sc);
if (cm == NULL) {
return (EBUSY);
}
req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
req->Function = MPI2_FUNCTION_CONFIG;
req->Action = params->action;
req->SGLFlags = 0;
req->ChainOffset = 0;
req->PageAddress = params->page_address;
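/*
 * Extended pages carry their size in ExtPageLength and require
 * Header.PageLength to be zero; standard pages use the supplied header
 * fields directly.
 */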
if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr;
hdr = &params->hdr.Ext;
req->ExtPageType = hdr->ExtPageType;
req->ExtPageLength = hdr->ExtPageLength;
req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
req->Header.PageLength = 0; /* Must be set to zero */
req->Header.PageNumber = hdr->PageNumber;
req->Header.PageVersion = hdr->PageVersion;
} else {
MPI2_CONFIG_PAGE_HEADER *hdr;
hdr = &params->hdr.Struct;
req->Header.PageType = hdr->PageType;
req->Header.PageNumber = hdr->PageNumber;
req->Header.PageLength = hdr->PageLength;
req->Header.PageVersion = hdr->PageVersion;
}
cm->cm_data = params->buffer;
cm->cm_length = params->length;
if (cm->cm_data != NULL) {
cm->cm_sge = &req->PageBufferSGE;
cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN;
} else
cm->cm_sge = NULL;
cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
cm->cm_complete_data = params;
if (params->callback != NULL) {
cm->cm_complete = mps_config_complete;
return (mps_map_command(sc, cm));
} else {
error = mps_wait_command(sc, &cm, 0, CAN_SLEEP);
if (error) {
mps_dprint(sc, MPS_FAULT,
"Error %d reading config page\n", error);
if (cm != NULL)
mps_free_command(sc, cm);
return (error);
}
mps_config_complete(sc, cm);
}
return (0);
}
int
mps_write_config_page(struct mps_softc *sc, struct mps_config_params *params)
{
return (EINVAL);
}
static void
mps_config_complete(struct mps_softc *sc, struct mps_command *cm)
{
MPI2_CONFIG_REPLY *reply;
struct mps_config_params *params;
MPS_FUNCTRACE(sc);
params = cm->cm_complete_data;
if (cm->cm_data != NULL) {
bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
}
/*
* XXX KDM need to do more error recovery? This results in the
* device in question not getting probed.
*/
if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
params->status = MPI2_IOCSTATUS_BUSY;
goto done;
}
reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
if (reply == NULL) {
params->status = MPI2_IOCSTATUS_BUSY;
goto done;
}
params->status = reply->IOCStatus;
if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
params->hdr.Ext.ExtPageType = reply->ExtPageType;
params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
params->hdr.Ext.PageType = reply->Header.PageType;
params->hdr.Ext.PageNumber = reply->Header.PageNumber;
params->hdr.Ext.PageVersion = reply->Header.PageVersion;
} else {
params->hdr.Struct.PageType = reply->Header.PageType;
params->hdr.Struct.PageNumber = reply->Header.PageNumber;
params->hdr.Struct.PageLength = reply->Header.PageLength;
params->hdr.Struct.PageVersion = reply->Header.PageVersion;
}
done:
mps_free_command(sc, cm);
if (params->callback != NULL)
params->callback(sc, params);
return;
}
diff --git a/sys/dev/mpt/mpt.c b/sys/dev/mpt/mpt.c
index 62722d9ce693..1b9e5c042fdc 100644
--- a/sys/dev/mpt/mpt.c
+++ b/sys/dev/mpt/mpt.c
@@ -1,3149 +1,3149 @@
/*-
* Generic routines for LSI Fusion adapters.
* FreeBSD Version.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-3-Clause
*
* Copyright (c) 2000, 2001 by Greg Ansley
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 2002, 2006 by Matthew Jacob
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon including
* a substantially similar Disclaimer requirement for further binary
* redistribution.
* 3. Neither the names of the above listed copyright holders nor the names
* of any contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF THE COPYRIGHT
* OWNER OR CONTRIBUTOR IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Support from Chris Ellsworth in order to make SAS adapters work
* is gratefully acknowledged.
*
*
* Support from LSI-Logic has also gone a great deal toward making this a
* workable subsystem and is gratefully acknowledged.
*/
/*-
* Copyright (c) 2004, Avid Technology, Inc. and its contributors.
* Copyright (c) 2005, WHEEL Sp. z o.o.
* Copyright (c) 2004, 2005 Justin T. Gibbs
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon including
* a substantially similar Disclaimer requirement for further binary
* redistribution.
* 3. Neither the names of the above listed copyright holders nor the names
* of any contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF THE COPYRIGHT
* OWNER OR CONTRIBUTOR IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <dev/mpt/mpt.h>
#include <dev/mpt/mpt_cam.h> /* XXX For static handler registration */
#include <dev/mpt/mpt_raid.h> /* XXX For static handler registration */
#include <dev/mpt/mpilib/mpi.h>
#include <dev/mpt/mpilib/mpi_ioc.h>
#include <dev/mpt/mpilib/mpi_fc.h>
#include <dev/mpt/mpilib/mpi_targ.h>
#include <sys/sysctl.h>
#define MPT_MAX_TRYS 3
#define MPT_MAX_WAIT 300000
static int maxwait_ack = 0;
static int maxwait_int = 0;
static int maxwait_state = 0;
static TAILQ_HEAD(, mpt_softc) mpt_tailq = TAILQ_HEAD_INITIALIZER(mpt_tailq);
mpt_reply_handler_t *mpt_reply_handlers[MPT_NUM_REPLY_HANDLERS];
static mpt_reply_handler_t mpt_default_reply_handler;
static mpt_reply_handler_t mpt_config_reply_handler;
static mpt_reply_handler_t mpt_handshake_reply_handler;
static mpt_reply_handler_t mpt_event_reply_handler;
static void mpt_send_event_ack(struct mpt_softc *mpt, request_t *ack_req,
MSG_EVENT_NOTIFY_REPLY *msg, uint32_t context);
static int mpt_send_event_request(struct mpt_softc *mpt, int onoff);
static int mpt_soft_reset(struct mpt_softc *mpt);
static void mpt_hard_reset(struct mpt_softc *mpt);
static int mpt_dma_buf_alloc(struct mpt_softc *mpt);
static void mpt_dma_buf_free(struct mpt_softc *mpt);
static int mpt_configure_ioc(struct mpt_softc *mpt, int, int);
static int mpt_enable_ioc(struct mpt_softc *mpt, int);
/************************* Personality Module Support *************************/
/*
* We include one extra entry that is guaranteed to be NULL
* to simplify our iterator.
*/
static struct mpt_personality *mpt_personalities[MPT_MAX_PERSONALITIES + 1];
static __inline struct mpt_personality*
mpt_pers_find(struct mpt_softc *, u_int);
static __inline struct mpt_personality*
mpt_pers_find_reverse(struct mpt_softc *, u_int);
static __inline struct mpt_personality *
mpt_pers_find(struct mpt_softc *mpt, u_int start_at)
{
KASSERT(start_at <= MPT_MAX_PERSONALITIES,
("mpt_pers_find: starting position out of range"));
while (start_at < MPT_MAX_PERSONALITIES
&& (mpt->mpt_pers_mask & (0x1 << start_at)) == 0) {
start_at++;
}
return (mpt_personalities[start_at]);
}
/*
* Used infrequently, so there is no need to optimize as in the forward
* traversal, which relies on the MAX+1 entry being guaranteed NULL.
*/
static __inline struct mpt_personality *
mpt_pers_find_reverse(struct mpt_softc *mpt, u_int start_at)
{
while (start_at < MPT_MAX_PERSONALITIES
&& (mpt->mpt_pers_mask & (0x1 << start_at)) == 0) {
start_at--;
}
if (start_at < MPT_MAX_PERSONALITIES)
return (mpt_personalities[start_at]);
return (NULL);
}
#define MPT_PERS_FOREACH(mpt, pers) \
for (pers = mpt_pers_find(mpt, /*start_at*/0); \
pers != NULL; \
pers = mpt_pers_find(mpt, /*start_at*/pers->id+1))
#define MPT_PERS_FOREACH_REVERSE(mpt, pers) \
for (pers = mpt_pers_find_reverse(mpt, MPT_MAX_PERSONALITIES-1);\
pers != NULL; \
pers = mpt_pers_find_reverse(mpt, /*start_at*/pers->id-1))
static mpt_load_handler_t mpt_stdload;
static mpt_probe_handler_t mpt_stdprobe;
static mpt_attach_handler_t mpt_stdattach;
static mpt_enable_handler_t mpt_stdenable;
static mpt_ready_handler_t mpt_stdready;
static mpt_event_handler_t mpt_stdevent;
static mpt_reset_handler_t mpt_stdreset;
static mpt_shutdown_handler_t mpt_stdshutdown;
static mpt_detach_handler_t mpt_stddetach;
static mpt_unload_handler_t mpt_stdunload;
static struct mpt_personality mpt_default_personality =
{
.load = mpt_stdload,
.probe = mpt_stdprobe,
.attach = mpt_stdattach,
.enable = mpt_stdenable,
.ready = mpt_stdready,
.event = mpt_stdevent,
.reset = mpt_stdreset,
.shutdown = mpt_stdshutdown,
.detach = mpt_stddetach,
.unload = mpt_stdunload
};
static mpt_load_handler_t mpt_core_load;
static mpt_attach_handler_t mpt_core_attach;
static mpt_enable_handler_t mpt_core_enable;
static mpt_reset_handler_t mpt_core_ioc_reset;
static mpt_event_handler_t mpt_core_event;
static mpt_shutdown_handler_t mpt_core_shutdown;
static mpt_shutdown_handler_t mpt_core_detach;
static mpt_unload_handler_t mpt_core_unload;
static struct mpt_personality mpt_core_personality =
{
.name = "mpt_core",
.load = mpt_core_load,
// .attach = mpt_core_attach,
// .enable = mpt_core_enable,
.event = mpt_core_event,
.reset = mpt_core_ioc_reset,
.shutdown = mpt_core_shutdown,
.detach = mpt_core_detach,
.unload = mpt_core_unload,
};
/*
* Manual declaration so that DECLARE_MPT_PERSONALITY doesn't need
* ordering information. We want the core to always register FIRST;
* other modules are set to SI_ORDER_SECOND.
*/
static moduledata_t mpt_core_mod = {
"mpt_core", mpt_modevent, &mpt_core_personality
};
DECLARE_MODULE(mpt_core, mpt_core_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(mpt_core, 1);
#define MPT_PERS_ATTACHED(pers, mpt) ((mpt)->mpt_pers_mask & (0x1 << pers->id))
int
mpt_modevent(module_t mod, int type, void *data)
{
struct mpt_personality *pers;
int error;
pers = (struct mpt_personality *)data;
error = 0;
switch (type) {
case MOD_LOAD:
{
mpt_load_handler_t **def_handler;
mpt_load_handler_t **pers_handler;
int i;
for (i = 0; i < MPT_MAX_PERSONALITIES; i++) {
if (mpt_personalities[i] == NULL)
break;
}
if (i >= MPT_MAX_PERSONALITIES) {
error = ENOMEM;
break;
}
pers->id = i;
mpt_personalities[i] = pers;
/* Install standard/noop handlers for any NULL entries. */
def_handler = MPT_PERS_FIRST_HANDLER(&mpt_default_personality);
pers_handler = MPT_PERS_FIRST_HANDLER(pers);
while (pers_handler <= MPT_PERS_LAST_HANDLER(pers)) {
if (*pers_handler == NULL)
*pers_handler = *def_handler;
pers_handler++;
def_handler++;
}
error = (pers->load(pers));
if (error != 0)
mpt_personalities[i] = NULL;
break;
}
case MOD_SHUTDOWN:
break;
case MOD_QUIESCE:
break;
case MOD_UNLOAD:
error = pers->unload(pers);
mpt_personalities[pers->id] = NULL;
break;
default:
error = EINVAL;
break;
}
return (error);
}
static int
mpt_stdload(struct mpt_personality *pers)
{
/* Load is always successful. */
return (0);
}
static int
mpt_stdprobe(struct mpt_softc *mpt)
{
/* Probe is always successful. */
return (0);
}
static int
mpt_stdattach(struct mpt_softc *mpt)
{
/* Attach is always successful. */
return (0);
}
static int
mpt_stdenable(struct mpt_softc *mpt)
{
/* Enable is always successful. */
return (0);
}
static void
mpt_stdready(struct mpt_softc *mpt)
{
}
static int
mpt_stdevent(struct mpt_softc *mpt, request_t *req, MSG_EVENT_NOTIFY_REPLY *msg)
{
mpt_lprt(mpt, MPT_PRT_DEBUG, "mpt_stdevent: 0x%x\n", msg->Event & 0xFF);
/* Event was not for us. */
return (0);
}
static void
mpt_stdreset(struct mpt_softc *mpt, int type)
{
}
static void
mpt_stdshutdown(struct mpt_softc *mpt)
{
}
static void
mpt_stddetach(struct mpt_softc *mpt)
{
}
static int
mpt_stdunload(struct mpt_personality *pers)
{
/* Unload is always successful. */
return (0);
}
/*
* Post driver attachment, we may want to perform some global actions.
* Here is the hook to do so.
*/
static void
mpt_postattach(void *unused)
{
struct mpt_softc *mpt;
struct mpt_personality *pers;
TAILQ_FOREACH(mpt, &mpt_tailq, links) {
MPT_PERS_FOREACH(mpt, pers)
pers->ready(mpt);
}
}
SYSINIT(mptdev, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, mpt_postattach, NULL);
/******************************* Bus DMA Support ******************************/
void
mpt_map_rquest(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
struct mpt_map_info *map_info;
map_info = (struct mpt_map_info *)arg;
map_info->error = error;
map_info->phys = segs->ds_addr;
}
/**************************** Reply/Event Handling ****************************/
int
mpt_register_handler(struct mpt_softc *mpt, mpt_handler_type type,
mpt_handler_t handler, uint32_t *phandler_id)
{
switch (type) {
case MPT_HANDLER_REPLY:
{
u_int cbi;
u_int free_cbi;
if (phandler_id == NULL)
return (EINVAL);
free_cbi = MPT_HANDLER_ID_NONE;
for (cbi = 0; cbi < MPT_NUM_REPLY_HANDLERS; cbi++) {
/*
* If the same handler is registered multiple
* times, don't error out. Just return the
* index of the original registration.
*/
if (mpt_reply_handlers[cbi] == handler.reply_handler) {
*phandler_id = MPT_CBI_TO_HID(cbi);
return (0);
}
/*
* Fill from the front in the hope that
* all registered handlers consume only a
* single cache line.
*
* We don't break on the first empty slot so
* that the full table is checked to see if
* this handler was previously registered.
*/
if (free_cbi == MPT_HANDLER_ID_NONE &&
(mpt_reply_handlers[cbi]
== mpt_default_reply_handler))
free_cbi = cbi;
}
if (free_cbi == MPT_HANDLER_ID_NONE) {
return (ENOMEM);
}
mpt_reply_handlers[free_cbi] = handler.reply_handler;
*phandler_id = MPT_CBI_TO_HID(free_cbi);
break;
}
default:
mpt_prt(mpt, "mpt_register_handler unknown type %d\n", type);
return (EINVAL);
}
return (0);
}
int
mpt_deregister_handler(struct mpt_softc *mpt, mpt_handler_type type,
mpt_handler_t handler, uint32_t handler_id)
{
switch (type) {
case MPT_HANDLER_REPLY:
{
u_int cbi;
cbi = MPT_CBI(handler_id);
if (cbi >= MPT_NUM_REPLY_HANDLERS
|| mpt_reply_handlers[cbi] != handler.reply_handler)
return (ENOENT);
mpt_reply_handlers[cbi] = mpt_default_reply_handler;
break;
}
default:
mpt_prt(mpt, "mpt_deregister_handler unknown type %d\n", type);
return (EINVAL);
}
return (0);
}
static int
mpt_default_reply_handler(struct mpt_softc *mpt, request_t *req,
uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame)
{
mpt_prt(mpt,
"Default Handler Called: req=%p:%u reply_descriptor=%x frame=%p\n",
req, req->serno, reply_desc, reply_frame);
if (reply_frame != NULL)
mpt_dump_reply_frame(mpt, reply_frame);
mpt_prt(mpt, "Reply Frame Ignored\n");
return (/*free_reply*/TRUE);
}
static int
mpt_config_reply_handler(struct mpt_softc *mpt, request_t *req,
uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame)
{
if (req != NULL) {
if (reply_frame != NULL) {
MSG_CONFIG *cfgp;
MSG_CONFIG_REPLY *reply;
cfgp = (MSG_CONFIG *)req->req_vbuf;
reply = (MSG_CONFIG_REPLY *)reply_frame;
req->IOCStatus = le16toh(reply_frame->IOCStatus);
bcopy(&reply->Header, &cfgp->Header,
sizeof(cfgp->Header));
cfgp->ExtPageLength = reply->ExtPageLength;
cfgp->ExtPageType = reply->ExtPageType;
}
req->state &= ~REQ_STATE_QUEUED;
req->state |= REQ_STATE_DONE;
TAILQ_REMOVE(&mpt->request_pending_list, req, links);
if ((req->state & REQ_STATE_NEED_WAKEUP) != 0) {
wakeup(req);
} else if ((req->state & REQ_STATE_TIMEDOUT) != 0) {
/*
* Whew- we can free this request (late completion)
*/
mpt_free_request(mpt, req);
}
}
return (TRUE);
}
static int
mpt_handshake_reply_handler(struct mpt_softc *mpt, request_t *req,
uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame)
{
/* Nothing to be done. */
return (TRUE);
}
static int
mpt_event_reply_handler(struct mpt_softc *mpt, request_t *req,
uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame)
{
int free_reply;
KASSERT(reply_frame != NULL, ("null reply in mpt_event_reply_handler"));
KASSERT(req != NULL, ("null request in mpt_event_reply_handler"));
free_reply = TRUE;
switch (reply_frame->Function) {
case MPI_FUNCTION_EVENT_NOTIFICATION:
{
MSG_EVENT_NOTIFY_REPLY *msg;
struct mpt_personality *pers;
u_int handled;
handled = 0;
msg = (MSG_EVENT_NOTIFY_REPLY *)reply_frame;
msg->EventDataLength = le16toh(msg->EventDataLength);
msg->IOCStatus = le16toh(msg->IOCStatus);
msg->IOCLogInfo = le32toh(msg->IOCLogInfo);
msg->Event = le32toh(msg->Event);
MPT_PERS_FOREACH(mpt, pers)
handled += pers->event(mpt, req, msg);
if (handled == 0 && mpt->mpt_pers_mask == 0) {
mpt_lprt(mpt, MPT_PRT_INFO,
"No Handlers For Any Event Notify Frames. "
"Event %#x (ACK %sequired).\n",
msg->Event, msg->AckRequired? "r" : "not r");
} else if (handled == 0) {
mpt_lprt(mpt,
msg->AckRequired? MPT_PRT_WARN : MPT_PRT_INFO,
"Unhandled Event Notify Frame. Event %#x "
"(ACK %sequired).\n",
msg->Event, msg->AckRequired? "r" : "not r");
}
if (msg->AckRequired) {
request_t *ack_req;
uint32_t context;
context = req->index | MPT_REPLY_HANDLER_EVENTS;
ack_req = mpt_get_request(mpt, FALSE);
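/*
 * If no request is available for the ACK, stash the reply frame on the
 * ack_frames list and keep ownership of it (free_reply = FALSE) so the
 * ACK can be sent later when a request frees up.
 */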
if (ack_req == NULL) {
struct mpt_evtf_record *evtf;
evtf = (struct mpt_evtf_record *)reply_frame;
evtf->context = context;
LIST_INSERT_HEAD(&mpt->ack_frames, evtf, links);
free_reply = FALSE;
break;
}
mpt_send_event_ack(mpt, ack_req, msg, context);
/*
* Don't check for CONTINUATION_REPLY here
*/
return (free_reply);
}
break;
}
case MPI_FUNCTION_PORT_ENABLE:
mpt_lprt(mpt, MPT_PRT_DEBUG , "enable port reply\n");
break;
case MPI_FUNCTION_EVENT_ACK:
break;
default:
mpt_prt(mpt, "unknown event function: %x\n",
reply_frame->Function);
break;
}
/*
* I'm not sure that this continuation stuff works as it should.
*
* I've had FC async events occur that free the frame up because
* the continuation bit isn't set, and then additional async events
* occur using the same context. As you might imagine, this
* leads to Very Bad Things.
*
* Let's just be safe for now and not free them up until we figure
* out what's actually happening here.
*/
#if 0
if ((reply_frame->MsgFlags & MPI_MSGFLAGS_CONTINUATION_REPLY) == 0) {
TAILQ_REMOVE(&mpt->request_pending_list, req, links);
mpt_free_request(mpt, req);
mpt_prt(mpt, "event_reply %x for req %p:%u NOT a continuation",
reply_frame->Function, req, req->serno);
if (reply_frame->Function == MPI_FUNCTION_EVENT_NOTIFICATION) {
MSG_EVENT_NOTIFY_REPLY *msg =
(MSG_EVENT_NOTIFY_REPLY *)reply_frame;
mpt_prtc(mpt, " Event=0x%x AckReq=%d",
msg->Event, msg->AckRequired);
}
} else {
mpt_prt(mpt, "event_reply %x for %p:%u IS a continuation",
reply_frame->Function, req, req->serno);
if (reply_frame->Function == MPI_FUNCTION_EVENT_NOTIFICATION) {
MSG_EVENT_NOTIFY_REPLY *msg =
(MSG_EVENT_NOTIFY_REPLY *)reply_frame;
mpt_prtc(mpt, " Event=0x%x AckReq=%d",
msg->Event, msg->AckRequired);
}
mpt_prtc(mpt, "\n");
}
#endif
return (free_reply);
}
/*
* Process an asynchronous event from the IOC.
*/
static int
mpt_core_event(struct mpt_softc *mpt, request_t *req,
MSG_EVENT_NOTIFY_REPLY *msg)
{
mpt_lprt(mpt, MPT_PRT_DEBUG, "mpt_core_event: 0x%x\n",
msg->Event & 0xFF);
switch(msg->Event & 0xFF) {
case MPI_EVENT_NONE:
break;
case MPI_EVENT_LOG_DATA:
{
int i;
/* Some error occurred that LSI wants logged */
mpt_prt(mpt, "EvtLogData: IOCLogInfo: 0x%08x\n",
msg->IOCLogInfo);
mpt_prt(mpt, "\tEvtLogData: Event Data:");
for (i = 0; i < msg->EventDataLength; i++)
mpt_prtc(mpt, " %08x", msg->Data[i]);
mpt_prtc(mpt, "\n");
break;
}
case MPI_EVENT_EVENT_CHANGE:
/*
* This is just an acknowledgement
* of our mpt_send_event_request.
*/
break;
case MPI_EVENT_SAS_DEVICE_STATUS_CHANGE:
break;
default:
return (0);
break;
}
return (1);
}
static void
mpt_send_event_ack(struct mpt_softc *mpt, request_t *ack_req,
MSG_EVENT_NOTIFY_REPLY *msg, uint32_t context)
{
MSG_EVENT_ACK *ackp;
ackp = (MSG_EVENT_ACK *)ack_req->req_vbuf;
memset(ackp, 0, sizeof (*ackp));
ackp->Function = MPI_FUNCTION_EVENT_ACK;
ackp->Event = htole32(msg->Event);
ackp->EventContext = htole32(msg->EventContext);
ackp->MsgContext = htole32(context);
mpt_check_doorbell(mpt);
mpt_send_cmd(mpt, ack_req);
}
/***************************** Interrupt Handling *****************************/
void
mpt_intr(void *arg)
{
struct mpt_softc *mpt;
uint32_t reply_desc;
int ntrips = 0;
mpt = (struct mpt_softc *)arg;
mpt_lprt(mpt, MPT_PRT_DEBUG2, "enter mpt_intr\n");
MPT_LOCK_ASSERT(mpt);
while ((reply_desc = mpt_pop_reply_queue(mpt)) != MPT_REPLY_EMPTY) {
request_t *req;
MSG_DEFAULT_REPLY *reply_frame;
uint32_t reply_baddr;
uint32_t ctxt_idx;
u_int cb_index;
u_int req_index;
u_int offset;
int free_rf;
req = NULL;
reply_frame = NULL;
reply_baddr = 0;
offset = 0;
if ((reply_desc & MPI_ADDRESS_REPLY_A_BIT) != 0) {
/*
* Ensure that the reply frame is coherent.
*/
reply_baddr = MPT_REPLY_BADDR(reply_desc);
offset = reply_baddr - (mpt->reply_phys & 0xFFFFFFFF);
bus_dmamap_sync_range(mpt->reply_dmat,
mpt->reply_dmap, offset, MPT_REPLY_SIZE,
BUS_DMASYNC_POSTREAD);
reply_frame = MPT_REPLY_OTOV(mpt, offset);
ctxt_idx = le32toh(reply_frame->MsgContext);
} else {
uint32_t type;
type = MPI_GET_CONTEXT_REPLY_TYPE(reply_desc);
ctxt_idx = reply_desc;
mpt_lprt(mpt, MPT_PRT_DEBUG1, "Context Reply: 0x%08x\n",
reply_desc);
switch (type) {
case MPI_CONTEXT_REPLY_TYPE_SCSI_INIT:
ctxt_idx &= MPI_CONTEXT_REPLY_CONTEXT_MASK;
break;
case MPI_CONTEXT_REPLY_TYPE_SCSI_TARGET:
ctxt_idx = GET_IO_INDEX(reply_desc);
if (mpt->tgt_cmd_ptrs == NULL) {
mpt_prt(mpt,
"mpt_intr: no target cmd ptrs\n");
reply_desc = MPT_REPLY_EMPTY;
break;
}
if (ctxt_idx >= mpt->tgt_cmds_allocated) {
mpt_prt(mpt,
"mpt_intr: bad tgt cmd ctxt %u\n",
ctxt_idx);
reply_desc = MPT_REPLY_EMPTY;
ntrips = 1000;
break;
}
req = mpt->tgt_cmd_ptrs[ctxt_idx];
if (req == NULL) {
mpt_prt(mpt, "no request backpointer "
"at index %u", ctxt_idx);
reply_desc = MPT_REPLY_EMPTY;
ntrips = 1000;
break;
}
/*
* Reformulate ctxt_idx to be just as if
* it were another type of context reply
* so the code below will find the request
* via indexing into the pool.
*/
ctxt_idx =
req->index | mpt->scsi_tgt_handler_id;
req = NULL;
break;
case MPI_CONTEXT_REPLY_TYPE_LAN:
mpt_prt(mpt, "LAN CONTEXT REPLY: 0x%08x\n",
reply_desc);
reply_desc = MPT_REPLY_EMPTY;
break;
default:
mpt_prt(mpt, "Context Reply 0x%08x?\n", type);
reply_desc = MPT_REPLY_EMPTY;
break;
}
if (reply_desc == MPT_REPLY_EMPTY) {
if (ntrips++ > 1000) {
break;
}
continue;
}
}
cb_index = MPT_CONTEXT_TO_CBI(ctxt_idx);
req_index = MPT_CONTEXT_TO_REQI(ctxt_idx);
if (req_index < MPT_MAX_REQUESTS(mpt)) {
req = &mpt->request_pool[req_index];
} else {
mpt_prt(mpt, "WARN: mpt_intr index == %d (reply_desc =="
" 0x%x)\n", req_index, reply_desc);
}
bus_dmamap_sync(mpt->request_dmat, mpt->request_dmap,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
free_rf = mpt_reply_handlers[cb_index](mpt, req,
reply_desc, reply_frame);
if (reply_frame != NULL && free_rf) {
bus_dmamap_sync_range(mpt->reply_dmat,
mpt->reply_dmap, offset, MPT_REPLY_SIZE,
BUS_DMASYNC_PREREAD);
mpt_free_reply(mpt, reply_baddr);
}
/*
* If we got ourselves disabled, don't get stuck in a loop
*/
if (mpt->disabled) {
mpt_disable_ints(mpt);
break;
}
if (ntrips++ > 1000) {
break;
}
}
mpt_lprt(mpt, MPT_PRT_DEBUG2, "exit mpt_intr\n");
}
/******************************* Error Recovery *******************************/
void
mpt_complete_request_chain(struct mpt_softc *mpt, struct req_queue *chain,
u_int iocstatus)
{
MSG_DEFAULT_REPLY ioc_status_frame;
request_t *req;
memset(&ioc_status_frame, 0, sizeof(ioc_status_frame));
ioc_status_frame.MsgLength = roundup2(sizeof(ioc_status_frame), 4);
ioc_status_frame.IOCStatus = iocstatus;
while((req = TAILQ_FIRST(chain)) != NULL) {
MSG_REQUEST_HEADER *msg_hdr;
u_int cb_index;
bus_dmamap_sync(mpt->request_dmat, mpt->request_dmap,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
msg_hdr = (MSG_REQUEST_HEADER *)req->req_vbuf;
ioc_status_frame.Function = msg_hdr->Function;
ioc_status_frame.MsgContext = msg_hdr->MsgContext;
cb_index = MPT_CONTEXT_TO_CBI(le32toh(msg_hdr->MsgContext));
mpt_reply_handlers[cb_index](mpt, req, msg_hdr->MsgContext,
&ioc_status_frame);
if (mpt_req_on_pending_list(mpt, req) != 0)
TAILQ_REMOVE(chain, req, links);
}
}
/********************************* Diagnostics ********************************/
/*
* Perform a diagnostic dump of a reply frame.
*/
void
mpt_dump_reply_frame(struct mpt_softc *mpt, MSG_DEFAULT_REPLY *reply_frame)
{
mpt_prt(mpt, "Address Reply:\n");
mpt_print_reply(reply_frame);
}
/******************************* Doorbell Access ******************************/
static __inline uint32_t mpt_rd_db(struct mpt_softc *mpt);
static __inline uint32_t mpt_rd_intr(struct mpt_softc *mpt);
static __inline uint32_t
mpt_rd_db(struct mpt_softc *mpt)
{
return mpt_read(mpt, MPT_OFFSET_DOORBELL);
}
static __inline uint32_t
mpt_rd_intr(struct mpt_softc *mpt)
{
return mpt_read(mpt, MPT_OFFSET_INTR_STATUS);
}
/* Busy wait for a door bell to be read by IOC */
static int
mpt_wait_db_ack(struct mpt_softc *mpt)
{
int i;
for (i=0; i < MPT_MAX_WAIT; i++) {
if (!MPT_DB_IS_BUSY(mpt_rd_intr(mpt))) {
maxwait_ack = i > maxwait_ack ? i : maxwait_ack;
return (MPT_OK);
}
DELAY(200);
}
return (MPT_FAIL);
}
/* Busy wait for a door bell interrupt */
static int
mpt_wait_db_int(struct mpt_softc *mpt)
{
int i;
for (i = 0; i < MPT_MAX_WAIT; i++) {
if (MPT_DB_INTR(mpt_rd_intr(mpt))) {
maxwait_int = i > maxwait_int ? i : maxwait_int;
return MPT_OK;
}
DELAY(100);
}
return (MPT_FAIL);
}
/* Warn if the IOC is not in the RUNNING state */
void
mpt_check_doorbell(struct mpt_softc *mpt)
{
uint32_t db = mpt_rd_db(mpt);
if (MPT_STATE(db) != MPT_DB_STATE_RUNNING) {
mpt_prt(mpt, "Device not running\n");
mpt_print_db(db);
}
}
/* Wait for IOC to transition to a given state */
static int
mpt_wait_state(struct mpt_softc *mpt, enum DB_STATE_BITS state)
{
int i;
for (i = 0; i < MPT_MAX_WAIT; i++) {
uint32_t db = mpt_rd_db(mpt);
if (MPT_STATE(db) == state) {
maxwait_state = i > maxwait_state ? i : maxwait_state;
return (MPT_OK);
}
DELAY(100);
}
return (MPT_FAIL);
}
/************************* Initialization/Configuration ************************/
static int mpt_download_fw(struct mpt_softc *mpt);
/* Issue the reset COMMAND to the IOC */
static int
mpt_soft_reset(struct mpt_softc *mpt)
{
mpt_lprt(mpt, MPT_PRT_DEBUG, "soft reset\n");
/* Have to use hard reset if we are not in Running state */
if (MPT_STATE(mpt_rd_db(mpt)) != MPT_DB_STATE_RUNNING) {
mpt_prt(mpt, "soft reset failed: device not running\n");
return (MPT_FAIL);
}
/* If the doorbell is in use, we don't have a chance of getting
* a word in, since the IOC probably crashed during message
* processing. So don't waste our time.
*/
if (MPT_DB_IS_IN_USE(mpt_rd_db(mpt))) {
mpt_prt(mpt, "soft reset failed: doorbell wedged\n");
return (MPT_FAIL);
}
/* Send the reset request to the IOC */
mpt_write(mpt, MPT_OFFSET_DOORBELL,
MPI_FUNCTION_IOC_MESSAGE_UNIT_RESET << MPI_DOORBELL_FUNCTION_SHIFT);
if (mpt_wait_db_ack(mpt) != MPT_OK) {
mpt_prt(mpt, "soft reset failed: ack timeout\n");
return (MPT_FAIL);
}
/* Wait for the IOC to reload and come out of reset state */
if (mpt_wait_state(mpt, MPT_DB_STATE_READY) != MPT_OK) {
mpt_prt(mpt, "soft reset failed: device did not restart\n");
return (MPT_FAIL);
}
return MPT_OK;
}
static int
mpt_enable_diag_mode(struct mpt_softc *mpt)
{
int try;
try = 20;
while (--try) {
if ((mpt_read(mpt, MPT_OFFSET_DIAGNOSTIC) & MPI_DIAG_DRWE) != 0)
break;
/* Enable diagnostic registers */
mpt_write(mpt, MPT_OFFSET_SEQUENCE, 0xFF);
mpt_write(mpt, MPT_OFFSET_SEQUENCE, MPI_WRSEQ_1ST_KEY_VALUE);
mpt_write(mpt, MPT_OFFSET_SEQUENCE, MPI_WRSEQ_2ND_KEY_VALUE);
mpt_write(mpt, MPT_OFFSET_SEQUENCE, MPI_WRSEQ_3RD_KEY_VALUE);
mpt_write(mpt, MPT_OFFSET_SEQUENCE, MPI_WRSEQ_4TH_KEY_VALUE);
mpt_write(mpt, MPT_OFFSET_SEQUENCE, MPI_WRSEQ_5TH_KEY_VALUE);
DELAY(100000);
}
if (try == 0)
return (EIO);
return (0);
}
static void
mpt_disable_diag_mode(struct mpt_softc *mpt)
{
mpt_write(mpt, MPT_OFFSET_SEQUENCE, 0xFFFFFFFF);
}
/* This is a magic diagnostic reset that resets all the ARM
* processors in the chip.
*/
static void
mpt_hard_reset(struct mpt_softc *mpt)
{
int error;
int wait;
uint32_t diagreg;
mpt_lprt(mpt, MPT_PRT_DEBUG, "hard reset\n");
if (mpt->is_1078) {
mpt_write(mpt, MPT_OFFSET_RESET_1078, 0x07);
DELAY(1000);
return;
}
error = mpt_enable_diag_mode(mpt);
if (error) {
mpt_prt(mpt, "WARNING - Could not enter diagnostic mode !\n");
mpt_prt(mpt, "Trying to reset anyway.\n");
}
diagreg = mpt_read(mpt, MPT_OFFSET_DIAGNOSTIC);
/*
* This appears to be a workaround required for some
* firmware or hardware revs.
*/
mpt_write(mpt, MPT_OFFSET_DIAGNOSTIC, diagreg | MPI_DIAG_DISABLE_ARM);
DELAY(1000);
/* The diagnostic port is now active, so we can hit the reset bit */
mpt_write(mpt, MPT_OFFSET_DIAGNOSTIC, diagreg | MPI_DIAG_RESET_ADAPTER);
/*
* Ensure that the reset has finished. We delay 1ms
* prior to reading the register to make sure the chip
* has sufficiently completed its reset to handle register
* accesses.
*/
wait = 5000;
do {
DELAY(1000);
diagreg = mpt_read(mpt, MPT_OFFSET_DIAGNOSTIC);
} while (--wait && (diagreg & MPI_DIAG_RESET_ADAPTER) == 0);
if (wait == 0) {
mpt_prt(mpt, "WARNING - Failed hard reset! "
"Trying to initialize anyway.\n");
}
/*
* If we have firmware to download, it must be loaded before
* the controller will become operational. Do so now.
*/
if (mpt->fw_image != NULL) {
error = mpt_download_fw(mpt);
if (error) {
mpt_prt(mpt, "WARNING - Firmware Download Failed!\n");
mpt_prt(mpt, "Trying to initialize anyway.\n");
}
}
/*
* Resetting the controller should have disabled write
* access to the diagnostic registers, but disable
* manually to be sure.
*/
mpt_disable_diag_mode(mpt);
}
static void
mpt_core_ioc_reset(struct mpt_softc *mpt, int type)
{
/*
* Complete all pending requests with a status
* appropriate for an IOC reset.
*/
mpt_complete_request_chain(mpt, &mpt->request_pending_list,
MPI_IOCSTATUS_INVALID_STATE);
}
/*
* Reset the IOC when needed. Try a soft reset first; if that fails,
* poke at the magic diagnostic reset. Note that a hard reset resets
* *both* IOCs on dual-function chips (FC929 && LSI1030) and also
* fouls up the PCI configuration registers.
*/
int
mpt_reset(struct mpt_softc *mpt, int reinit)
{
struct mpt_personality *pers;
int ret;
int retry_cnt = 0;
/*
* Try a soft reset. If that fails, get out the big hammer.
*/
again:
if ((ret = mpt_soft_reset(mpt)) != MPT_OK) {
int cnt;
for (cnt = 0; cnt < 5; cnt++) {
/* Failed; do a hard reset */
mpt_hard_reset(mpt);
/*
* Wait for the IOC to reload
* and come out of reset state
*/
ret = mpt_wait_state(mpt, MPT_DB_STATE_READY);
if (ret == MPT_OK) {
break;
}
/*
* Okay- try to check again...
*/
ret = mpt_wait_state(mpt, MPT_DB_STATE_READY);
if (ret == MPT_OK) {
break;
}
mpt_prt(mpt, "mpt_reset: failed hard reset (%d:%d)\n",
retry_cnt, cnt);
}
}
if (retry_cnt == 0) {
/*
* Invoke reset handlers. We bump the reset count so
* that mpt_wait_req() understands that regardless of
* the specified wait condition, it should stop its wait.
*/
mpt->reset_cnt++;
MPT_PERS_FOREACH(mpt, pers)
pers->reset(mpt, ret);
}
if (reinit) {
ret = mpt_enable_ioc(mpt, 1);
if (ret == MPT_OK) {
mpt_enable_ints(mpt);
}
}
if (ret != MPT_OK && retry_cnt++ < 2) {
goto again;
}
return ret;
}
/* Return a command buffer to the free queue */
void
mpt_free_request(struct mpt_softc *mpt, request_t *req)
{
request_t *nxt;
struct mpt_evtf_record *record;
uint32_t offset, reply_baddr;
if (req == NULL || req != &mpt->request_pool[req->index]) {
panic("mpt_free_request: bad req ptr");
}
if ((nxt = req->chain) != NULL) {
req->chain = NULL;
mpt_free_request(mpt, nxt); /* NB: recursion */
}
KASSERT(req->state != REQ_STATE_FREE, ("freeing free request"));
KASSERT(!(req->state & REQ_STATE_LOCKED), ("freeing locked request"));
MPT_LOCK_ASSERT(mpt);
KASSERT(mpt_req_on_free_list(mpt, req) == 0,
("mpt_free_request: req %p:%u func %x already on freelist",
req, req->serno, ((MSG_REQUEST_HEADER *)req->req_vbuf)->Function));
KASSERT(mpt_req_on_pending_list(mpt, req) == 0,
("mpt_free_request: req %p:%u func %x on pending list",
req, req->serno, ((MSG_REQUEST_HEADER *)req->req_vbuf)->Function));
#ifdef INVARIANTS
mpt_req_not_spcl(mpt, req, "mpt_free_request", __LINE__);
#endif
req->ccb = NULL;
if (LIST_EMPTY(&mpt->ack_frames)) {
/*
* Insert free ones at the tail
*/
req->serno = 0;
req->state = REQ_STATE_FREE;
#ifdef INVARIANTS
memset(req->req_vbuf, 0xff, sizeof (MSG_REQUEST_HEADER));
#endif
TAILQ_INSERT_TAIL(&mpt->request_free_list, req, links);
if (mpt->getreqwaiter != 0) {
mpt->getreqwaiter = 0;
wakeup(&mpt->request_free_list);
}
return;
}
/*
* Process an ack frame deferred due to resource shortage.
*/
record = LIST_FIRST(&mpt->ack_frames);
LIST_REMOVE(record, links);
req->state = REQ_STATE_ALLOCATED;
mpt_assign_serno(mpt, req);
mpt_send_event_ack(mpt, req, &record->reply, record->context);
offset = (uint32_t)((uint8_t *)record - mpt->reply);
reply_baddr = offset + (mpt->reply_phys & 0xFFFFFFFF);
bus_dmamap_sync_range(mpt->reply_dmat, mpt->reply_dmap, offset,
MPT_REPLY_SIZE, BUS_DMASYNC_PREREAD);
mpt_free_reply(mpt, reply_baddr);
}
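/*
* The deferred ACK frames replayed at the end of mpt_free_request() are
* queued by mpt_event_reply_handler() when mpt_get_request() fails for an
* event that requires an ACK; the reply frame itself doubles as the
* mpt_evtf_record until a request frees up here.
*/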
/* Get a command buffer from the free queue */
request_t *
mpt_get_request(struct mpt_softc *mpt, int sleep_ok)
{
request_t *req;
retry:
MPT_LOCK_ASSERT(mpt);
req = TAILQ_FIRST(&mpt->request_free_list);
if (req != NULL) {
KASSERT(req == &mpt->request_pool[req->index],
("mpt_get_request: corrupted request free list"));
KASSERT(req->state == REQ_STATE_FREE,
("req %p:%u not free on free list %x index %d function %x",
req, req->serno, req->state, req->index,
((MSG_REQUEST_HEADER *)req->req_vbuf)->Function));
TAILQ_REMOVE(&mpt->request_free_list, req, links);
req->state = REQ_STATE_ALLOCATED;
req->chain = NULL;
mpt_assign_serno(mpt, req);
} else if (sleep_ok != 0) {
mpt->getreqwaiter = 1;
mpt_sleep(mpt, &mpt->request_free_list, PUSER, "mptgreq", 0);
goto retry;
}
return (req);
}
/* Pass the command to the IOC */
void
mpt_send_cmd(struct mpt_softc *mpt, request_t *req)
{
if (mpt->verbose > MPT_PRT_DEBUG2) {
mpt_dump_request(mpt, req);
}
bus_dmamap_sync(mpt->request_dmat, mpt->request_dmap,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
req->state |= REQ_STATE_QUEUED;
KASSERT(mpt_req_on_free_list(mpt, req) == 0,
("req %p:%u func %x on freelist list in mpt_send_cmd",
req, req->serno, ((MSG_REQUEST_HEADER *)req->req_vbuf)->Function));
KASSERT(mpt_req_on_pending_list(mpt, req) == 0,
("req %p:%u func %x already on pending list in mpt_send_cmd",
req, req->serno, ((MSG_REQUEST_HEADER *)req->req_vbuf)->Function));
TAILQ_INSERT_HEAD(&mpt->request_pending_list, req, links);
mpt_write(mpt, MPT_OFFSET_REQUEST_Q, (uint32_t) req->req_pbuf);
}
/*
* Wait for a request to complete.
*
* Inputs:
* mpt softc of controller executing request
* req request to wait for
* sleep_ok nonzero implies may sleep in this context
* time_ms timeout in ms. 0 implies no timeout.
*
* Return Values:
* 0 Request completed
* non-0 Timeout fired before request completion.
*/
int
mpt_wait_req(struct mpt_softc *mpt, request_t *req,
mpt_req_state_t state, mpt_req_state_t mask,
int sleep_ok, int time_ms)
{
int timeout;
u_int saved_cnt;
sbintime_t sbt;
/*
* time_ms is in ms, 0 indicates infinite wait.
* Convert to sbintime_t or 500us units depending on
* our sleep mode.
*/
if (sleep_ok != 0) {
sbt = SBT_1MS * time_ms;
/* Set timeout as well so final timeout check works. */
timeout = time_ms;
} else {
sbt = 0; /* Squelch bogus gcc warning. */
timeout = time_ms * 2;
}
req->state |= REQ_STATE_NEED_WAKEUP;
mask &= ~REQ_STATE_NEED_WAKEUP;
saved_cnt = mpt->reset_cnt;
while ((req->state & mask) != state && mpt->reset_cnt == saved_cnt) {
if (sleep_ok != 0) {
if (mpt_sleep(mpt, req, PUSER, "mptreq", sbt) ==
EWOULDBLOCK) {
timeout = 0;
break;
}
} else {
if (time_ms != 0 && --timeout == 0) {
break;
}
DELAY(500);
mpt_intr(mpt);
}
}
req->state &= ~REQ_STATE_NEED_WAKEUP;
if (mpt->reset_cnt != saved_cnt) {
return (EIO);
}
if (time_ms && timeout <= 0) {
MSG_REQUEST_HEADER *msg_hdr = req->req_vbuf;
req->state |= REQ_STATE_TIMEDOUT;
mpt_prt(mpt, "mpt_wait_req(%x) timed out\n", msg_hdr->Function);
return (ETIMEDOUT);
}
return (0);
}
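/*
* Illustrative sketch (not part of the driver): the request life cycle
* implied by mpt_wait_req()'s contract -- allocate, build, send, wait for
* REQ_STATE_DONE, check IOCStatus and free.  The function name and the
* message body are hypothetical; error handling is abbreviated and the
* MPT lock is assumed to be held by the caller.
*/
#if 0
static int
mpt_example_sync_request(struct mpt_softc *mpt)
{
request_t *req;
int error;
req = mpt_get_request(mpt, /*sleep_ok*/TRUE);
if (req == NULL)
return (ENOMEM);
/* ... fill in *(MSG_REQUEST_HEADER *)req->req_vbuf here ... */
mpt_check_doorbell(mpt);
mpt_send_cmd(mpt, req);
error = mpt_wait_req(mpt, req, REQ_STATE_DONE, REQ_STATE_DONE,
/*sleep_ok*/TRUE, /*time_ms*/5000);
if (error != 0)
return (error);	/* on timeout, leave the request with the IOC */
if ((req->IOCStatus & MPI_IOCSTATUS_MASK) != MPI_IOCSTATUS_SUCCESS)
error = EIO;
mpt_free_request(mpt, req);
return (error);
}
#endif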
/*
* Send a command to the IOC via the handshake register.
*
* Only done at initialization time and for certain unusual
* commands such as device/bus reset as specified by LSI.
*/
int
mpt_send_handshake_cmd(struct mpt_softc *mpt, size_t len, void *cmd)
{
int i;
uint32_t data, *data32;
/* Check condition of the IOC */
data = mpt_rd_db(mpt);
if ((MPT_STATE(data) != MPT_DB_STATE_READY
&& MPT_STATE(data) != MPT_DB_STATE_RUNNING
&& MPT_STATE(data) != MPT_DB_STATE_FAULT)
|| MPT_DB_IS_IN_USE(data)) {
mpt_prt(mpt, "handshake aborted - invalid doorbell state\n");
mpt_print_db(data);
return (EBUSY);
}
/* We move things in 32 bit chunks */
len = (len + 3) >> 2;
data32 = cmd;
/* Clear any left over pending doorbell interrupts */
if (MPT_DB_INTR(mpt_rd_intr(mpt)))
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
/*
* Tell the handshake reg. we are going to send a command
* and how long it is going to be.
*/
data = (MPI_FUNCTION_HANDSHAKE << MPI_DOORBELL_FUNCTION_SHIFT) |
(len << MPI_DOORBELL_ADD_DWORDS_SHIFT);
mpt_write(mpt, MPT_OFFSET_DOORBELL, data);
/* Wait for the chip to notice */
if (mpt_wait_db_int(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_send_handshake_cmd: db ignored\n");
return (ETIMEDOUT);
}
/* Clear the interrupt */
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
if (mpt_wait_db_ack(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_send_handshake_cmd: db ack timed out\n");
return (ETIMEDOUT);
}
/* Send the command */
for (i = 0; i < len; i++) {
mpt_write_stream(mpt, MPT_OFFSET_DOORBELL, *data32++);
if (mpt_wait_db_ack(mpt) != MPT_OK) {
mpt_prt(mpt,
"mpt_send_handshake_cmd: timeout @ index %d\n", i);
return (ETIMEDOUT);
}
}
return MPT_OK;
}
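/*
* Callers pair this with mpt_recv_handshake_reply() below; see
* mpt_get_iocfacts() further down for the canonical send/receive sequence.
*/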
/* Get the response from the handshake register */
int
mpt_recv_handshake_reply(struct mpt_softc *mpt, size_t reply_len, void *reply)
{
int left, reply_left;
u_int16_t *data16;
uint32_t data;
MSG_DEFAULT_REPLY *hdr;
/* We move things out in 16 bit chunks */
reply_len >>= 1;
data16 = (u_int16_t *)reply;
hdr = (MSG_DEFAULT_REPLY *)reply;
/* Get first word */
if (mpt_wait_db_int(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_recv_handshake_cmd timeout1\n");
return ETIMEDOUT;
}
data = mpt_read(mpt, MPT_OFFSET_DOORBELL);
*data16++ = le16toh(data & MPT_DB_DATA_MASK);
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
/* Get second word */
if (mpt_wait_db_int(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_recv_handshake_cmd timeout2\n");
return ETIMEDOUT;
}
data = mpt_read(mpt, MPT_OFFSET_DOORBELL);
*data16++ = le16toh(data & MPT_DB_DATA_MASK);
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
/*
* With the second word, we can now look at the length.
* Warn about a reply that's too short (except for IOC FACTS REPLY)
*/
if ((reply_len >> 1) != hdr->MsgLength &&
(hdr->Function != MPI_FUNCTION_IOC_FACTS)){
mpt_prt(mpt, "reply length does not match message length: "
"got %x; expected %zx for function %x\n",
hdr->MsgLength << 2, reply_len << 1, hdr->Function);
}
/* Get rest of the reply; but don't overflow the provided buffer */
left = (hdr->MsgLength << 1) - 2;
reply_left = reply_len - 2;
while (left--) {
if (mpt_wait_db_int(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_recv_handshake_cmd timeout3\n");
return ETIMEDOUT;
}
data = mpt_read(mpt, MPT_OFFSET_DOORBELL);
if (reply_left-- > 0)
*data16++ = le16toh(data & MPT_DB_DATA_MASK);
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
}
/* One more wait & clear at the end */
if (mpt_wait_db_int(mpt) != MPT_OK) {
mpt_prt(mpt, "mpt_recv_handshake_cmd timeout4\n");
return ETIMEDOUT;
}
mpt_write(mpt, MPT_OFFSET_INTR_STATUS, 0);
if ((hdr->IOCStatus & MPI_IOCSTATUS_MASK) != MPI_IOCSTATUS_SUCCESS) {
if (mpt->verbose >= MPT_PRT_TRACE)
mpt_print_reply(hdr);
return (MPT_FAIL | hdr->IOCStatus);
}
return (0);
}
static int
mpt_get_iocfacts(struct mpt_softc *mpt, MSG_IOC_FACTS_REPLY *freplp)
{
MSG_IOC_FACTS f_req;
int error;
memset(&f_req, 0, sizeof f_req);
f_req.Function = MPI_FUNCTION_IOC_FACTS;
f_req.MsgContext = htole32(MPT_REPLY_HANDLER_HANDSHAKE);
error = mpt_send_handshake_cmd(mpt, sizeof f_req, &f_req);
if (error) {
return(error);
}
error = mpt_recv_handshake_reply(mpt, sizeof (*freplp), freplp);
return (error);
}
static int
mpt_get_portfacts(struct mpt_softc *mpt, U8 port, MSG_PORT_FACTS_REPLY *freplp)
{
MSG_PORT_FACTS f_req;
int error;
memset(&f_req, 0, sizeof f_req);
f_req.Function = MPI_FUNCTION_PORT_FACTS;
f_req.PortNumber = port;
f_req.MsgContext = htole32(MPT_REPLY_HANDLER_HANDSHAKE);
error = mpt_send_handshake_cmd(mpt, sizeof f_req, &f_req);
if (error) {
return(error);
}
error = mpt_recv_handshake_reply(mpt, sizeof (*freplp), freplp);
return (error);
}
/*
* Send the initialization request. This is where we specify how many
* SCSI buses and how many devices per bus we wish to emulate.
* This is also the command that specifies the max size of the reply
* frames from the IOC that we will be allocating.
*/
static int
mpt_send_ioc_init(struct mpt_softc *mpt, uint32_t who)
{
int error = 0;
MSG_IOC_INIT init;
MSG_IOC_INIT_REPLY reply;
memset(&init, 0, sizeof init);
init.WhoInit = who;
init.Function = MPI_FUNCTION_IOC_INIT;
init.MaxDevices = 0; /* at least 256 devices per bus */
init.MaxBuses = 16; /* at least 16 buses */
init.MsgVersion = htole16(MPI_VERSION);
init.HeaderVersion = htole16(MPI_HEADER_VERSION);
init.ReplyFrameSize = htole16(MPT_REPLY_SIZE);
init.MsgContext = htole32(MPT_REPLY_HANDLER_HANDSHAKE);
if ((error = mpt_send_handshake_cmd(mpt, sizeof init, &init)) != 0) {
return(error);
}
error = mpt_recv_handshake_reply(mpt, sizeof reply, &reply);
return (error);
}
/*
* Utility routine to read configuration headers and pages
*/
int
mpt_issue_cfg_req(struct mpt_softc *mpt, request_t *req, cfgparms_t *params,
bus_addr_t addr, bus_size_t len, int sleep_ok, int timeout_ms)
{
MSG_CONFIG *cfgp;
SGE_SIMPLE32 *se;
cfgp = req->req_vbuf;
memset(cfgp, 0, sizeof *cfgp);
cfgp->Action = params->Action;
cfgp->Function = MPI_FUNCTION_CONFIG;
cfgp->Header.PageVersion = params->PageVersion;
cfgp->Header.PageNumber = params->PageNumber;
cfgp->PageAddress = htole32(params->PageAddress);
if ((params->PageType & MPI_CONFIG_PAGETYPE_MASK) ==
MPI_CONFIG_PAGETYPE_EXTENDED) {
cfgp->Header.PageType = MPI_CONFIG_PAGETYPE_EXTENDED;
cfgp->Header.PageLength = 0;
cfgp->ExtPageLength = htole16(params->ExtPageLength);
cfgp->ExtPageType = params->ExtPageType;
} else {
cfgp->Header.PageType = params->PageType;
cfgp->Header.PageLength = params->PageLength;
}
se = (SGE_SIMPLE32 *)&cfgp->PageBufferSGE;
se->Address = htole32(addr);
MPI_pSGE_SET_LENGTH(se, len);
MPI_pSGE_SET_FLAGS(se, (MPI_SGE_FLAGS_SIMPLE_ELEMENT |
MPI_SGE_FLAGS_LAST_ELEMENT | MPI_SGE_FLAGS_END_OF_BUFFER |
MPI_SGE_FLAGS_END_OF_LIST |
((params->Action == MPI_CONFIG_ACTION_PAGE_WRITE_CURRENT
|| params->Action == MPI_CONFIG_ACTION_PAGE_WRITE_NVRAM)
? MPI_SGE_FLAGS_HOST_TO_IOC : MPI_SGE_FLAGS_IOC_TO_HOST)));
se->FlagsLength = htole32(se->FlagsLength);
cfgp->MsgContext = htole32(req->index | MPT_REPLY_HANDLER_CONFIG);
mpt_check_doorbell(mpt);
mpt_send_cmd(mpt, req);
return (mpt_wait_req(mpt, req, REQ_STATE_DONE, REQ_STATE_DONE,
sleep_ok, timeout_ms));
}
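/*
* Illustrative sketch (not part of the driver): the header-then-page read
* pattern built on top of mpt_issue_cfg_req(), shown here for IOC page 1.
* The function name, page number and error handling are hypothetical.
*/
#if 0
static int
mpt_example_read_ioc_page1(struct mpt_softc *mpt)
{
CONFIG_PAGE_HEADER hdr;
CONFIG_PAGE_IOC_1 *page;
size_t len;
int rv;
/* Fetch the header first to learn the page's length and version. */
rv = mpt_read_cfg_header(mpt, MPI_CONFIG_PAGETYPE_IOC, 1, 0, &hdr,
FALSE, 5000);
if (rv != 0)
return (rv);
/* Allocate a buffer, seed it with the header, then read the page. */
len = hdr.PageLength * sizeof(uint32_t);
page = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (page == NULL)
return (ENOMEM);
memcpy(&page->Header, &hdr, sizeof(hdr));
rv = mpt_read_cfg_page(mpt, MPI_CONFIG_ACTION_PAGE_READ_CURRENT, 0,
&page->Header, len, FALSE, 5000);
free(page, M_DEVBUF);
return (rv);
}
#endif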
int
mpt_read_extcfg_header(struct mpt_softc *mpt, int PageVersion, int PageNumber,
uint32_t PageAddress, int ExtPageType,
CONFIG_EXTENDED_PAGE_HEADER *rslt,
int sleep_ok, int timeout_ms)
{
request_t *req;
cfgparms_t params;
MSG_CONFIG_REPLY *cfgp;
int error;
req = mpt_get_request(mpt, sleep_ok);
if (req == NULL) {
mpt_prt(mpt, "mpt_extread_cfg_header: Get request failed!\n");
return (ENOMEM);
}
params.Action = MPI_CONFIG_ACTION_PAGE_HEADER;
params.PageVersion = PageVersion;
params.PageLength = 0;
params.PageNumber = PageNumber;
params.PageType = MPI_CONFIG_PAGETYPE_EXTENDED;
params.PageAddress = PageAddress;
params.ExtPageType = ExtPageType;
params.ExtPageLength = 0;
error = mpt_issue_cfg_req(mpt, req, &params, /*addr*/0, /*len*/0,
sleep_ok, timeout_ms);
if (error != 0) {
/*
* Leave the request. Without resetting the chip, it's
* still owned by the IOC and we'll just get into trouble
* freeing it now. Mark it as abandoned so that if it
* shows up later it can be freed.
*/
mpt_prt(mpt, "read_extcfg_header timed out\n");
return (ETIMEDOUT);
}
switch (req->IOCStatus & MPI_IOCSTATUS_MASK) {
case MPI_IOCSTATUS_SUCCESS:
cfgp = req->req_vbuf;
rslt->PageVersion = cfgp->Header.PageVersion;
rslt->PageNumber = cfgp->Header.PageNumber;
rslt->PageType = cfgp->Header.PageType;
rslt->ExtPageLength = le16toh(cfgp->ExtPageLength);
rslt->ExtPageType = cfgp->ExtPageType;
error = 0;
break;
case MPI_IOCSTATUS_CONFIG_INVALID_PAGE:
mpt_lprt(mpt, MPT_PRT_DEBUG,
"Invalid Page Type %d Number %d Addr 0x%0x\n",
MPI_CONFIG_PAGETYPE_EXTENDED, PageNumber, PageAddress);
error = EINVAL;
break;
default:
mpt_prt(mpt, "mpt_read_extcfg_header: Config Info Status %x\n",
req->IOCStatus);
error = EIO;
break;
}
mpt_free_request(mpt, req);
return (error);
}
int
mpt_read_extcfg_page(struct mpt_softc *mpt, int Action, uint32_t PageAddress,
CONFIG_EXTENDED_PAGE_HEADER *hdr, void *buf, size_t len,
int sleep_ok, int timeout_ms)
{
request_t *req;
cfgparms_t params;
int error;
req = mpt_get_request(mpt, sleep_ok);
if (req == NULL) {
mpt_prt(mpt, "mpt_read_extcfg_page: Get request failed!\n");
return (-1);
}
params.Action = Action;
params.PageVersion = hdr->PageVersion;
params.PageLength = 0;
params.PageNumber = hdr->PageNumber;
params.PageType = MPI_CONFIG_PAGETYPE_EXTENDED;
params.PageAddress = PageAddress;
params.ExtPageType = hdr->ExtPageType;
params.ExtPageLength = hdr->ExtPageLength;
error = mpt_issue_cfg_req(mpt, req, &params,
req->req_pbuf + MPT_RQSL(mpt),
len, sleep_ok, timeout_ms);
if (error != 0) {
mpt_prt(mpt, "read_extcfg_page(%d) timed out\n", Action);
return (-1);
}
if ((req->IOCStatus & MPI_IOCSTATUS_MASK) != MPI_IOCSTATUS_SUCCESS) {
mpt_prt(mpt, "mpt_read_extcfg_page: Config Info Status %x\n",
req->IOCStatus);
mpt_free_request(mpt, req);
return (-1);
}
memcpy(buf, ((uint8_t *)req->req_vbuf)+MPT_RQSL(mpt), len);
mpt_free_request(mpt, req);
return (0);
}
int
mpt_read_cfg_header(struct mpt_softc *mpt, int PageType, int PageNumber,
uint32_t PageAddress, CONFIG_PAGE_HEADER *rslt,
int sleep_ok, int timeout_ms)
{
request_t *req;
cfgparms_t params;
MSG_CONFIG *cfgp;
int error;
req = mpt_get_request(mpt, sleep_ok);
if (req == NULL) {
mpt_prt(mpt, "mpt_read_cfg_header: Get request failed!\n");
return (ENOMEM);
}
params.Action = MPI_CONFIG_ACTION_PAGE_HEADER;
params.PageVersion = 0;
params.PageLength = 0;
params.PageNumber = PageNumber;
params.PageType = PageType;
params.PageAddress = PageAddress;
error = mpt_issue_cfg_req(mpt, req, &params, /*addr*/0, /*len*/0,
sleep_ok, timeout_ms);
if (error != 0) {
/*
* Leave the request. Without resetting the chip, it's
* still owned by the IOC and we'll just get into trouble
* freeing it now. Mark it as abandoned so that if it
* shows up later it can be freed.
*/
mpt_prt(mpt, "read_cfg_header timed out\n");
return (ETIMEDOUT);
}
switch (req->IOCStatus & MPI_IOCSTATUS_MASK) {
case MPI_IOCSTATUS_SUCCESS:
cfgp = req->req_vbuf;
bcopy(&cfgp->Header, rslt, sizeof(*rslt));
error = 0;
break;
case MPI_IOCSTATUS_CONFIG_INVALID_PAGE:
mpt_lprt(mpt, MPT_PRT_DEBUG,
"Invalid Page Type %d Number %d Addr 0x%0x\n",
PageType, PageNumber, PageAddress);
error = EINVAL;
break;
default:
mpt_prt(mpt, "mpt_read_cfg_header: Config Info Status %x\n",
req->IOCStatus);
error = EIO;
break;
}
mpt_free_request(mpt, req);
return (error);
}
int
mpt_read_cfg_page(struct mpt_softc *mpt, int Action, uint32_t PageAddress,
CONFIG_PAGE_HEADER *hdr, size_t len, int sleep_ok,
int timeout_ms)
{
request_t *req;
cfgparms_t params;
int error;
req = mpt_get_request(mpt, sleep_ok);
if (req == NULL) {
mpt_prt(mpt, "mpt_read_cfg_page: Get request failed!\n");
return (-1);
}
params.Action = Action;
params.PageVersion = hdr->PageVersion;
params.PageLength = hdr->PageLength;
params.PageNumber = hdr->PageNumber;
params.PageType = hdr->PageType & MPI_CONFIG_PAGETYPE_MASK;
params.PageAddress = PageAddress;
error = mpt_issue_cfg_req(mpt, req, &params,
req->req_pbuf + MPT_RQSL(mpt),
len, sleep_ok, timeout_ms);
if (error != 0) {
mpt_prt(mpt, "read_cfg_page(%d) timed out\n", Action);
return (-1);
}
if ((req->IOCStatus & MPI_IOCSTATUS_MASK) != MPI_IOCSTATUS_SUCCESS) {
mpt_prt(mpt, "mpt_read_cfg_page: Config Info Status %x\n",
req->IOCStatus);
mpt_free_request(mpt, req);
return (-1);
}
memcpy(hdr, ((uint8_t *)req->req_vbuf)+MPT_RQSL(mpt), len);
mpt_free_request(mpt, req);
return (0);
}
int
mpt_write_cfg_page(struct mpt_softc *mpt, int Action, uint32_t PageAddress,
CONFIG_PAGE_HEADER *hdr, size_t len, int sleep_ok,
int timeout_ms)
{
request_t *req;
cfgparms_t params;
u_int hdr_attr;
int error;
hdr_attr = hdr->PageType & MPI_CONFIG_PAGEATTR_MASK;
if (hdr_attr != MPI_CONFIG_PAGEATTR_CHANGEABLE &&
hdr_attr != MPI_CONFIG_PAGEATTR_PERSISTENT) {
mpt_prt(mpt, "page type 0x%x not changeable\n",
hdr->PageType & MPI_CONFIG_PAGETYPE_MASK);
return (-1);
}
#if 0
/*
* We shouldn't mask off other bits here.
*/
hdr->PageType &= MPI_CONFIG_PAGETYPE_MASK;
#endif
req = mpt_get_request(mpt, sleep_ok);
if (req == NULL)
return (-1);
memcpy(((caddr_t)req->req_vbuf) + MPT_RQSL(mpt), hdr, len);
/*
* There isn't any point in restoring stripped out attributes
* if you then mask them going down to issue the request.
*/
params.Action = Action;
params.PageVersion = hdr->PageVersion;
params.PageLength = hdr->PageLength;
params.PageNumber = hdr->PageNumber;
params.PageAddress = PageAddress;
#if 0
/* Restore stripped out attributes */
hdr->PageType |= hdr_attr;
params.PageType = hdr->PageType & MPI_CONFIG_PAGETYPE_MASK;
#else
params.PageType = hdr->PageType;
#endif
error = mpt_issue_cfg_req(mpt, req, &params,
req->req_pbuf + MPT_RQSL(mpt),
len, sleep_ok, timeout_ms);
if (error != 0) {
mpt_prt(mpt, "mpt_write_cfg_page timed out\n");
return (-1);
}
if ((req->IOCStatus & MPI_IOCSTATUS_MASK) != MPI_IOCSTATUS_SUCCESS) {
mpt_prt(mpt, "mpt_write_cfg_page: Config Info Status %x\n",
req->IOCStatus);
mpt_free_request(mpt, req);
return (-1);
}
mpt_free_request(mpt, req);
return (0);
}
/*
* Read IOC configuration information
*/
static int
mpt_read_config_info_ioc(struct mpt_softc *mpt)
{
CONFIG_PAGE_HEADER hdr;
struct mpt_raid_volume *mpt_raid;
int rv;
int i;
size_t len;
rv = mpt_read_cfg_header(mpt, MPI_CONFIG_PAGETYPE_IOC,
2, 0, &hdr, FALSE, 5000);
/*
* If it's an invalid page, so what? Not a supported function....
*/
if (rv == EINVAL) {
return (0);
}
if (rv) {
return (rv);
}
mpt_lprt(mpt, MPT_PRT_DEBUG,
"IOC Page 2 Header: Version %x len %x PageNumber %x PageType %x\n",
hdr.PageVersion, hdr.PageLength << 2,
hdr.PageNumber, hdr.PageType);
len = hdr.PageLength * sizeof(uint32_t);
mpt->ioc_page2 = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt->ioc_page2 == NULL) {
mpt_prt(mpt, "unable to allocate memory for IOC page 2\n");
mpt_raid_free_mem(mpt);
return (ENOMEM);
}
memcpy(&mpt->ioc_page2->Header, &hdr, sizeof(hdr));
rv = mpt_read_cur_cfg_page(mpt, 0,
&mpt->ioc_page2->Header, len, FALSE, 5000);
if (rv) {
mpt_prt(mpt, "failed to read IOC Page 2\n");
mpt_raid_free_mem(mpt);
return (EIO);
}
mpt2host_config_page_ioc2(mpt->ioc_page2);
if (mpt->ioc_page2->CapabilitiesFlags != 0) {
uint32_t mask;
mpt_prt(mpt, "Capabilities: (");
for (mask = 1; mask != 0; mask <<= 1) {
if ((mpt->ioc_page2->CapabilitiesFlags & mask) == 0) {
continue;
}
switch (mask) {
case MPI_IOCPAGE2_CAP_FLAGS_IS_SUPPORT:
mpt_prtc(mpt, " RAID-0");
break;
case MPI_IOCPAGE2_CAP_FLAGS_IME_SUPPORT:
mpt_prtc(mpt, " RAID-1E");
break;
case MPI_IOCPAGE2_CAP_FLAGS_IM_SUPPORT:
mpt_prtc(mpt, " RAID-1");
break;
case MPI_IOCPAGE2_CAP_FLAGS_SES_SUPPORT:
mpt_prtc(mpt, " SES");
break;
case MPI_IOCPAGE2_CAP_FLAGS_SAFTE_SUPPORT:
mpt_prtc(mpt, " SAFTE");
break;
case MPI_IOCPAGE2_CAP_FLAGS_CROSS_CHANNEL_SUPPORT:
mpt_prtc(mpt, " Multi-Channel-Arrays");
default:
break;
}
}
mpt_prtc(mpt, " )\n");
if ((mpt->ioc_page2->CapabilitiesFlags
& (MPI_IOCPAGE2_CAP_FLAGS_IS_SUPPORT
| MPI_IOCPAGE2_CAP_FLAGS_IME_SUPPORT
| MPI_IOCPAGE2_CAP_FLAGS_IM_SUPPORT)) != 0) {
mpt_prt(mpt, "%d Active Volume%s(%d Max)\n",
mpt->ioc_page2->NumActiveVolumes,
mpt->ioc_page2->NumActiveVolumes != 1
? "s " : " ",
mpt->ioc_page2->MaxVolumes);
mpt_prt(mpt, "%d Hidden Drive Member%s(%d Max)\n",
mpt->ioc_page2->NumActivePhysDisks,
mpt->ioc_page2->NumActivePhysDisks != 1
? "s " : " ",
mpt->ioc_page2->MaxPhysDisks);
}
}
len = mpt->ioc_page2->MaxVolumes * sizeof(struct mpt_raid_volume);
mpt->raid_volumes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt->raid_volumes == NULL) {
mpt_prt(mpt, "Could not allocate RAID volume data\n");
mpt_raid_free_mem(mpt);
return (ENOMEM);
}
/*
* Copy critical data out of ioc_page2 so that we can
* safely refresh the page without windows of unreliable
* data.
*/
mpt->raid_max_volumes = mpt->ioc_page2->MaxVolumes;
len = sizeof(*mpt->raid_volumes->config_page) +
(sizeof (RAID_VOL0_PHYS_DISK) * (mpt->ioc_page2->MaxPhysDisks - 1));
for (i = 0; i < mpt->ioc_page2->MaxVolumes; i++) {
mpt_raid = &mpt->raid_volumes[i];
mpt_raid->config_page =
malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt_raid->config_page == NULL) {
mpt_prt(mpt, "Could not allocate RAID page data\n");
mpt_raid_free_mem(mpt);
return (ENOMEM);
}
}
mpt->raid_page0_len = len;
len = mpt->ioc_page2->MaxPhysDisks * sizeof(struct mpt_raid_disk);
mpt->raid_disks = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt->raid_disks == NULL) {
mpt_prt(mpt, "Could not allocate RAID disk data\n");
mpt_raid_free_mem(mpt);
return (ENOMEM);
}
mpt->raid_max_disks = mpt->ioc_page2->MaxPhysDisks;
/*
* Load page 3.
*/
rv = mpt_read_cfg_header(mpt, MPI_CONFIG_PAGETYPE_IOC,
3, 0, &hdr, FALSE, 5000);
if (rv) {
mpt_raid_free_mem(mpt);
return (EIO);
}
mpt_lprt(mpt, MPT_PRT_DEBUG, "IOC Page 3 Header: %x %x %x %x\n",
hdr.PageVersion, hdr.PageLength, hdr.PageNumber, hdr.PageType);
len = hdr.PageLength * sizeof(uint32_t);
mpt->ioc_page3 = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt->ioc_page3 == NULL) {
mpt_prt(mpt, "unable to allocate memory for IOC page 3\n");
mpt_raid_free_mem(mpt);
return (ENOMEM);
}
memcpy(&mpt->ioc_page3->Header, &hdr, sizeof(hdr));
rv = mpt_read_cur_cfg_page(mpt, 0,
&mpt->ioc_page3->Header, len, FALSE, 5000);
if (rv) {
mpt_raid_free_mem(mpt);
return (EIO);
}
mpt2host_config_page_ioc3(mpt->ioc_page3);
mpt_raid_wakeup(mpt);
return (0);
}
/*
* Enable IOC port
*/
static int
mpt_send_port_enable(struct mpt_softc *mpt, int port)
{
request_t *req;
MSG_PORT_ENABLE *enable_req;
int error;
req = mpt_get_request(mpt, /*sleep_ok*/FALSE);
if (req == NULL)
return (-1);
enable_req = req->req_vbuf;
memset(enable_req, 0, MPT_RQSL(mpt));
enable_req->Function = MPI_FUNCTION_PORT_ENABLE;
enable_req->MsgContext = htole32(req->index | MPT_REPLY_HANDLER_CONFIG);
enable_req->PortNumber = port;
mpt_check_doorbell(mpt);
mpt_lprt(mpt, MPT_PRT_DEBUG, "enabling port %d\n", port);
mpt_send_cmd(mpt, req);
error = mpt_wait_req(mpt, req, REQ_STATE_DONE, REQ_STATE_DONE,
FALSE, (mpt->is_sas || mpt->is_fc)? 300000 : 30000);
if (error != 0) {
mpt_prt(mpt, "port %d enable timed out\n", port);
return (-1);
}
mpt_free_request(mpt, req);
mpt_lprt(mpt, MPT_PRT_DEBUG, "enabled port %d\n", port);
return (0);
}
/*
* Enable/Disable asynchronous event reporting.
*/
static int
mpt_send_event_request(struct mpt_softc *mpt, int onoff)
{
request_t *req;
MSG_EVENT_NOTIFY *enable_req;
req = mpt_get_request(mpt, FALSE);
if (req == NULL) {
return (ENOMEM);
}
enable_req = req->req_vbuf;
memset(enable_req, 0, sizeof *enable_req);
enable_req->Function = MPI_FUNCTION_EVENT_NOTIFICATION;
enable_req->MsgContext = htole32(req->index | MPT_REPLY_HANDLER_EVENTS);
enable_req->Switch = onoff;
mpt_check_doorbell(mpt);
mpt_lprt(mpt, MPT_PRT_DEBUG, "%sabling async events\n",
onoff ? "en" : "dis");
/*
* Send the command off, but don't wait for it.
*/
mpt_send_cmd(mpt, req);
return (0);
}
/*
* Un-mask the interrupts on the chip.
*/
void
mpt_enable_ints(struct mpt_softc *mpt)
{
/* Unmask everything except the doorbell interrupt */
mpt_write(mpt, MPT_OFFSET_INTR_MASK, MPT_INTR_DB_MASK);
}
/*
* Mask the interrupts on the chip.
*/
void
mpt_disable_ints(struct mpt_softc *mpt)
{
/* Mask all interrupts */
mpt_write(mpt, MPT_OFFSET_INTR_MASK,
MPT_INTR_REPLY_MASK | MPT_INTR_DB_MASK);
}
static void
mpt_sysctl_attach(struct mpt_softc *mpt)
{
struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(mpt->dev);
struct sysctl_oid *tree = device_get_sysctl_tree(mpt->dev);
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"debug", CTLFLAG_RW, &mpt->verbose, 0,
"Debugging/Verbose level");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"role", CTLFLAG_RD, &mpt->role, 0,
"HBA role");
#ifdef MPT_TEST_MULTIPATH
SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"failure_id", CTLFLAG_RW, &mpt->failure_id, -1,
"Next Target to Fail");
#endif
}
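/*
* Illustrative usage (device name assumed): the knobs registered above show
* up under the device's sysctl tree, e.g. "sysctl dev.mpt.0.debug=3" raises
* the driver's verbosity at run time, and "sysctl dev.mpt.0.role" reports
* the HBA role.
*/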
int
mpt_attach(struct mpt_softc *mpt)
{
struct mpt_personality *pers;
int i;
int error;
mpt_core_attach(mpt);
mpt_core_enable(mpt);
TAILQ_INSERT_TAIL(&mpt_tailq, mpt, links);
for (i = 0; i < MPT_MAX_PERSONALITIES; i++) {
pers = mpt_personalities[i];
if (pers == NULL) {
continue;
}
if (pers->probe(mpt) == 0) {
error = pers->attach(mpt);
if (error != 0) {
mpt_detach(mpt);
return (error);
}
mpt->mpt_pers_mask |= (0x1 << pers->id);
pers->use_count++;
}
}
/*
* Now that we've attached everything, do the enable function
* for all of the personalities. This allows the personalities
* to do setups that are appropriate for them prior to enabling
* any ports.
*/
for (i = 0; i < MPT_MAX_PERSONALITIES; i++) {
pers = mpt_personalities[i];
if (pers != NULL && MPT_PERS_ATTACHED(pers, mpt) != 0) {
error = pers->enable(mpt);
if (error != 0) {
mpt_prt(mpt, "personality %s attached but would"
" not enable (%d)\n", pers->name, error);
mpt_detach(mpt);
return (error);
}
}
}
return (0);
}
int
mpt_shutdown(struct mpt_softc *mpt)
{
struct mpt_personality *pers;
MPT_PERS_FOREACH_REVERSE(mpt, pers) {
pers->shutdown(mpt);
}
return (0);
}
int
mpt_detach(struct mpt_softc *mpt)
{
struct mpt_personality *pers;
MPT_PERS_FOREACH_REVERSE(mpt, pers) {
pers->detach(mpt);
mpt->mpt_pers_mask &= ~(0x1 << pers->id);
pers->use_count--;
}
TAILQ_REMOVE(&mpt_tailq, mpt, links);
return (0);
}
static int
mpt_core_load(struct mpt_personality *pers)
{
int i;
/*
* Set up core handlers and insert the default handler
* into all "empty slots".
*/
for (i = 0; i < MPT_NUM_REPLY_HANDLERS; i++) {
mpt_reply_handlers[i] = mpt_default_reply_handler;
}
mpt_reply_handlers[MPT_CBI(MPT_REPLY_HANDLER_EVENTS)] =
mpt_event_reply_handler;
mpt_reply_handlers[MPT_CBI(MPT_REPLY_HANDLER_CONFIG)] =
mpt_config_reply_handler;
mpt_reply_handlers[MPT_CBI(MPT_REPLY_HANDLER_HANDSHAKE)] =
mpt_handshake_reply_handler;
return (0);
}
/*
* Initialize per-instance driver data and perform
* initial controller configuration.
*/
static int
mpt_core_attach(struct mpt_softc *mpt)
{
int val, error;
LIST_INIT(&mpt->ack_frames);
/* Put all request buffers on the free list */
TAILQ_INIT(&mpt->request_pending_list);
TAILQ_INIT(&mpt->request_free_list);
TAILQ_INIT(&mpt->request_timeout_list);
for (val = 0; val < MPT_MAX_LUNS; val++) {
STAILQ_INIT(&mpt->trt[val].atios);
STAILQ_INIT(&mpt->trt[val].inots);
}
STAILQ_INIT(&mpt->trt_wildcard.atios);
STAILQ_INIT(&mpt->trt_wildcard.inots);
#ifdef MPT_TEST_MULTIPATH
mpt->failure_id = -1;
#endif
mpt->scsi_tgt_handler_id = MPT_HANDLER_ID_NONE;
mpt_sysctl_attach(mpt);
mpt_lprt(mpt, MPT_PRT_DEBUG, "doorbell req = %s\n",
mpt_ioc_diag(mpt_read(mpt, MPT_OFFSET_DOORBELL)));
MPT_LOCK(mpt);
error = mpt_configure_ioc(mpt, 0, 0);
MPT_UNLOCK(mpt);
return (error);
}
static int
mpt_core_enable(struct mpt_softc *mpt)
{
/*
* We enter with the IOC enabled, but async events
* not enabled, ports not enabled and interrupts
* not enabled.
*/
MPT_LOCK(mpt);
/*
* Enable asynchronous event reporting. All personalities
* have attached, so they should now be able to field
* async events.
*/
mpt_send_event_request(mpt, 1);
/*
* Catch any pending interrupts
*
* This seems to be crucial- otherwise
* the portenable below times out.
*/
mpt_intr(mpt);
/*
* Enable Interrupts
*/
mpt_enable_ints(mpt);
/*
* Catch any pending interrupts
*
* This seems to be crucial- otherwise
* the portenable below times out.
*/
mpt_intr(mpt);
/*
* Enable the port.
*/
if (mpt_send_port_enable(mpt, 0) != MPT_OK) {
mpt_prt(mpt, "failed to enable port 0\n");
MPT_UNLOCK(mpt);
return (ENXIO);
}
MPT_UNLOCK(mpt);
return (0);
}
static void
mpt_core_shutdown(struct mpt_softc *mpt)
{
mpt_disable_ints(mpt);
}
static void
mpt_core_detach(struct mpt_softc *mpt)
{
int val;
/*
* XXX: FREE MEMORY
*/
mpt_disable_ints(mpt);
/* Make sure no request has pending timeouts. */
for (val = 0; val < MPT_MAX_REQUESTS(mpt); val++) {
request_t *req = &mpt->request_pool[val];
mpt_callout_drain(mpt, &req->callout);
}
mpt_dma_buf_free(mpt);
}
static int
mpt_core_unload(struct mpt_personality *pers)
{
/* Unload is always successful. */
return (0);
}
#define FW_UPLOAD_REQ_SIZE \
(sizeof(MSG_FW_UPLOAD) - sizeof(SGE_MPI_UNION) \
+ sizeof(FW_UPLOAD_TCSGE) + sizeof(SGE_SIMPLE32))
static int
mpt_upload_fw(struct mpt_softc *mpt)
{
uint8_t fw_req_buf[FW_UPLOAD_REQ_SIZE];
MSG_FW_UPLOAD_REPLY fw_reply;
MSG_FW_UPLOAD *fw_req;
FW_UPLOAD_TCSGE *tsge;
SGE_SIMPLE32 *sge;
uint32_t flags;
int error;
memset(&fw_req_buf, 0, sizeof(fw_req_buf));
fw_req = (MSG_FW_UPLOAD *)fw_req_buf;
fw_req->ImageType = MPI_FW_UPLOAD_ITYPE_FW_IOC_MEM;
fw_req->Function = MPI_FUNCTION_FW_UPLOAD;
fw_req->MsgContext = htole32(MPT_REPLY_HANDLER_HANDSHAKE);
tsge = (FW_UPLOAD_TCSGE *)&fw_req->SGL;
tsge->DetailsLength = 12;
tsge->Flags = MPI_SGE_FLAGS_TRANSACTION_ELEMENT;
tsge->ImageSize = htole32(mpt->fw_image_size);
sge = (SGE_SIMPLE32 *)(tsge + 1);
flags = (MPI_SGE_FLAGS_LAST_ELEMENT | MPI_SGE_FLAGS_END_OF_BUFFER
| MPI_SGE_FLAGS_END_OF_LIST | MPI_SGE_FLAGS_SIMPLE_ELEMENT
| MPI_SGE_FLAGS_32_BIT_ADDRESSING | MPI_SGE_FLAGS_IOC_TO_HOST);
flags <<= MPI_SGE_FLAGS_SHIFT;
sge->FlagsLength = htole32(flags | mpt->fw_image_size);
sge->Address = htole32(mpt->fw_phys);
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap, BUS_DMASYNC_PREREAD);
error = mpt_send_handshake_cmd(mpt, sizeof(fw_req_buf), &fw_req_buf);
if (error)
return(error);
error = mpt_recv_handshake_reply(mpt, sizeof(fw_reply), &fw_reply);
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap, BUS_DMASYNC_POSTREAD);
return (error);
}
static void
mpt_diag_outsl(struct mpt_softc *mpt, uint32_t addr,
uint32_t *data, bus_size_t len)
{
uint32_t *data_end;
data_end = data + (roundup2(len, sizeof(uint32_t)) / 4);
if (mpt->is_sas) {
pci_enable_io(mpt->dev, SYS_RES_IOPORT);
}
mpt_pio_write(mpt, MPT_OFFSET_DIAG_ADDR, addr);
while (data != data_end) {
mpt_pio_write(mpt, MPT_OFFSET_DIAG_DATA, *data);
data++;
}
if (mpt->is_sas) {
pci_disable_io(mpt->dev, SYS_RES_IOPORT);
}
}
static int
mpt_download_fw(struct mpt_softc *mpt)
{
MpiFwHeader_t *fw_hdr;
int error;
uint32_t ext_offset;
uint32_t data;
if (mpt->pci_pio_reg == NULL) {
mpt_prt(mpt, "No PIO resource!\n");
return (ENXIO);
}
mpt_prt(mpt, "Downloading Firmware - Image Size %d\n",
mpt->fw_image_size);
error = mpt_enable_diag_mode(mpt);
if (error != 0) {
mpt_prt(mpt, "Could not enter diagnostic mode!\n");
return (EIO);
}
mpt_write(mpt, MPT_OFFSET_DIAGNOSTIC,
MPI_DIAG_RW_ENABLE|MPI_DIAG_DISABLE_ARM);
fw_hdr = (MpiFwHeader_t *)mpt->fw_image;
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap, BUS_DMASYNC_PREWRITE);
mpt_diag_outsl(mpt, fw_hdr->LoadStartAddress, (uint32_t*)fw_hdr,
fw_hdr->ImageSize);
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap, BUS_DMASYNC_POSTWRITE);
ext_offset = fw_hdr->NextImageHeaderOffset;
while (ext_offset != 0) {
MpiExtImageHeader_t *ext;
ext = (MpiExtImageHeader_t *)((uintptr_t)fw_hdr + ext_offset);
ext_offset = ext->NextImageHeaderOffset;
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap,
BUS_DMASYNC_PREWRITE);
mpt_diag_outsl(mpt, ext->LoadStartAddress, (uint32_t*)ext,
ext->ImageSize);
bus_dmamap_sync(mpt->fw_dmat, mpt->fw_dmap,
BUS_DMASYNC_POSTWRITE);
}
if (mpt->is_sas) {
pci_enable_io(mpt->dev, SYS_RES_IOPORT);
}
/* Setup the address to jump to on reset. */
mpt_pio_write(mpt, MPT_OFFSET_DIAG_ADDR, fw_hdr->IopResetRegAddr);
mpt_pio_write(mpt, MPT_OFFSET_DIAG_DATA, fw_hdr->IopResetVectorValue);
/*
* The controller sets the "flash bad" status after attempting
* to auto-boot from flash. Clear the status so that the controller
* will continue the boot process with our newly installed firmware.
*/
mpt_pio_write(mpt, MPT_OFFSET_DIAG_ADDR, MPT_DIAG_MEM_CFG_BASE);
data = mpt_pio_read(mpt, MPT_OFFSET_DIAG_DATA) | MPT_DIAG_MEM_CFG_BADFL;
mpt_pio_write(mpt, MPT_OFFSET_DIAG_ADDR, MPT_DIAG_MEM_CFG_BASE);
mpt_pio_write(mpt, MPT_OFFSET_DIAG_DATA, data);
if (mpt->is_sas) {
pci_disable_io(mpt->dev, SYS_RES_IOPORT);
}
/*
* Re-enable the processor and clear the boot halt flag.
*/
data = mpt_read(mpt, MPT_OFFSET_DIAGNOSTIC);
data &= ~(MPI_DIAG_PREVENT_IOC_BOOT|MPI_DIAG_DISABLE_ARM);
mpt_write(mpt, MPT_OFFSET_DIAGNOSTIC, data);
mpt_disable_diag_mode(mpt);
return (0);
}
static int
mpt_dma_buf_alloc(struct mpt_softc *mpt)
{
struct mpt_map_info mi;
uint8_t *vptr;
uint32_t pptr, end;
int i, error;
/* Create a child tag for data buffers */
if (mpt_dma_tag_create(mpt, mpt->parent_dmat, 1,
0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, (mpt->max_cam_seg_cnt - 1) * PAGE_SIZE,
mpt->max_cam_seg_cnt, BUS_SPACE_MAXSIZE_32BIT, 0,
&mpt->buffer_dmat) != 0) {
mpt_prt(mpt, "cannot create a dma tag for data buffers\n");
return (1);
}
/* Create a child tag for request buffers */
if (mpt_dma_tag_create(mpt, mpt->parent_dmat, PAGE_SIZE, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
NULL, NULL, MPT_REQ_MEM_SIZE(mpt), 1, BUS_SPACE_MAXSIZE_32BIT, 0,
&mpt->request_dmat) != 0) {
mpt_prt(mpt, "cannot create a dma tag for requests\n");
return (1);
}
/* Allocate some DMA accessible memory for requests */
if (bus_dmamem_alloc(mpt->request_dmat, (void **)&mpt->request,
BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &mpt->request_dmap) != 0) {
mpt_prt(mpt, "cannot allocate %d bytes of request memory\n",
MPT_REQ_MEM_SIZE(mpt));
return (1);
}
mi.mpt = mpt;
mi.error = 0;
/* Load and lock it into "bus space" */
bus_dmamap_load(mpt->request_dmat, mpt->request_dmap, mpt->request,
MPT_REQ_MEM_SIZE(mpt), mpt_map_rquest, &mi, 0);
if (mi.error) {
mpt_prt(mpt, "error %d loading dma map for DMA request queue\n",
mi.error);
return (1);
}
mpt->request_phys = mi.phys;
/*
* Now create per-request dma maps
*/
i = 0;
pptr = mpt->request_phys;
vptr = mpt->request;
end = pptr + MPT_REQ_MEM_SIZE(mpt);
while(pptr < end) {
request_t *req = &mpt->request_pool[i];
req->index = i++;
/* Store location of Request Data */
req->req_pbuf = pptr;
req->req_vbuf = vptr;
pptr += MPT_REQUEST_AREA;
vptr += MPT_REQUEST_AREA;
req->sense_pbuf = (pptr - MPT_SENSE_SIZE);
req->sense_vbuf = (vptr - MPT_SENSE_SIZE);
error = bus_dmamap_create(mpt->buffer_dmat, 0, &req->dmap);
if (error) {
mpt_prt(mpt, "error %d creating per-cmd DMA maps\n",
error);
return (1);
}
}
return (0);
}
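/*
* Illustrative layout of one request area as set up above (sizes symbolic):
* [ request frame | chain frames / per-command scratch | sense data ],
* where req_pbuf/req_vbuf point at the start of the area and
* sense_pbuf/sense_vbuf point MPT_SENSE_SIZE bytes before the start of
* the next area.
*/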
static void
mpt_dma_buf_free(struct mpt_softc *mpt)
{
int i;
if (mpt->request_dmat == 0) {
mpt_lprt(mpt, MPT_PRT_DEBUG, "already released dma memory\n");
return;
}
for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
bus_dmamap_destroy(mpt->buffer_dmat, mpt->request_pool[i].dmap);
}
bus_dmamap_unload(mpt->request_dmat, mpt->request_dmap);
bus_dmamem_free(mpt->request_dmat, mpt->request, mpt->request_dmap);
bus_dma_tag_destroy(mpt->request_dmat);
mpt->request_dmat = 0;
bus_dma_tag_destroy(mpt->buffer_dmat);
}
/*
* Allocate/Initialize data structures for the controller. Called
* once at instance startup.
*/
static int
mpt_configure_ioc(struct mpt_softc *mpt, int tn, int needreset)
{
PTR_MSG_PORT_FACTS_REPLY pfp;
int error, port, val;
size_t len;
if (tn == MPT_MAX_TRYS) {
return (-1);
}
/*
* No need to reset if the IOC is already in the READY state.
*
* Force reset if initialization failed previously.
* Note that a hard_reset of the second channel of a '929
* will stop operation of the first channel. Hopefully, if the
* first channel is ok, the second will not require a hard
* reset.
*/
if (needreset || MPT_STATE(mpt_rd_db(mpt)) != MPT_DB_STATE_READY) {
if (mpt_reset(mpt, FALSE) != MPT_OK) {
return (mpt_configure_ioc(mpt, tn++, 1));
}
needreset = 0;
}
if (mpt_get_iocfacts(mpt, &mpt->ioc_facts) != MPT_OK) {
mpt_prt(mpt, "mpt_get_iocfacts failed\n");
return (mpt_configure_ioc(mpt, tn++, 1));
}
mpt2host_iocfacts_reply(&mpt->ioc_facts);
mpt_prt(mpt, "MPI Version=%d.%d.%d.%d\n",
mpt->ioc_facts.MsgVersion >> 8,
mpt->ioc_facts.MsgVersion & 0xFF,
mpt->ioc_facts.HeaderVersion >> 8,
mpt->ioc_facts.HeaderVersion & 0xFF);
/*
* Now that we know request frame size, we can calculate
* the actual (reasonable) segment limit for read/write I/O.
*
* This limit is constrained by:
*
* + The size of each area we allocate per command (and how
* many chain segments we can fit into it).
* + The total number of areas we've set up.
* + The actual chain depth the card will allow.
*
* The first area's segment count is limited by the I/O request
* at the head of it. Realistically, we cannot allocate more
* than MPT_MAX_REQUESTS areas. Therefore, to account for both
* conditions, we'll just start out with MPT_MAX_REQUESTS-2.
*
*/
/* total number of request areas we (can) allocate */
mpt->max_seg_cnt = MPT_MAX_REQUESTS(mpt) - 2;
/* converted to the number of chain areas possible */
mpt->max_seg_cnt *= MPT_NRFM(mpt);
/* limited by the number of chain areas the card will support */
if (mpt->max_seg_cnt > mpt->ioc_facts.MaxChainDepth) {
mpt_lprt(mpt, MPT_PRT_INFO,
"chain depth limited to %u (from %u)\n",
mpt->ioc_facts.MaxChainDepth, mpt->max_seg_cnt);
mpt->max_seg_cnt = mpt->ioc_facts.MaxChainDepth;
}
/* converted to the number of simple sges in chain segments. */
mpt->max_seg_cnt *= (MPT_NSGL(mpt) - 1);
/*
* Use this as the basis for reporting the maximum I/O size to CAM.
*/
- mpt->max_cam_seg_cnt = min(mpt->max_seg_cnt, (MAXPHYS / PAGE_SIZE) + 1);
+ mpt->max_cam_seg_cnt = min(mpt->max_seg_cnt, btoc(maxphys) + 1);
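/*
* Worked example with hypothetical values: given MPT_MAX_REQUESTS(mpt) ==
* 256, MPT_NRFM(mpt) == 3 and MPT_NSGL(mpt) == 16, max_seg_cnt starts at
* (256 - 2) * 3 = 762 chain frames, is capped by the IOC's MaxChainDepth,
* and then becomes at most 762 * 15 = 11430 simple SGEs; max_cam_seg_cnt
* is that value clipped to btoc(maxphys) + 1 pages of I/O.
*/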
/* XXX Lame Locking! */
MPT_UNLOCK(mpt);
error = mpt_dma_buf_alloc(mpt);
MPT_LOCK(mpt);
if (error != 0) {
mpt_prt(mpt, "mpt_dma_buf_alloc() failed!\n");
return (EIO);
}
for (val = 0; val < MPT_MAX_REQUESTS(mpt); val++) {
request_t *req = &mpt->request_pool[val];
req->state = REQ_STATE_ALLOCATED;
mpt_callout_init(mpt, &req->callout);
mpt_free_request(mpt, req);
}
mpt_lprt(mpt, MPT_PRT_INFO, "Maximum Segment Count: %u, Maximum "
"CAM Segment Count: %u\n", mpt->max_seg_cnt,
mpt->max_cam_seg_cnt);
mpt_lprt(mpt, MPT_PRT_INFO, "MsgLength=%u IOCNumber = %d\n",
mpt->ioc_facts.MsgLength, mpt->ioc_facts.IOCNumber);
mpt_lprt(mpt, MPT_PRT_INFO,
"IOCFACTS: GlobalCredits=%d BlockSize=%u bytes "
"Request Frame Size %u bytes Max Chain Depth %u\n",
mpt->ioc_facts.GlobalCredits, mpt->ioc_facts.BlockSize,
mpt->ioc_facts.RequestFrameSize << 2,
mpt->ioc_facts.MaxChainDepth);
mpt_lprt(mpt, MPT_PRT_INFO, "IOCFACTS: Num Ports %d, FWImageSize %d, "
"Flags=%#x\n", mpt->ioc_facts.NumberOfPorts,
mpt->ioc_facts.FWImageSize, mpt->ioc_facts.Flags);
len = mpt->ioc_facts.NumberOfPorts * sizeof (MSG_PORT_FACTS_REPLY);
mpt->port_facts = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (mpt->port_facts == NULL) {
mpt_prt(mpt, "unable to allocate memory for port facts\n");
return (ENOMEM);
}
if ((mpt->ioc_facts.Flags & MPI_IOCFACTS_FLAGS_FW_DOWNLOAD_BOOT) &&
(mpt->fw_uploaded == 0)) {
struct mpt_map_info mi;
/*
* In some configurations, the IOC's firmware is
* stored in a shared piece of system NVRAM that
* is only accessible via the BIOS. In this
* case, the IOC keeps a copy of the firmware in
* RAM until the OS driver retrieves it. Once
* retrieved, we are responsible for re-downloading
* the firmware after any hard-reset.
*/
MPT_UNLOCK(mpt);
mpt->fw_image_size = mpt->ioc_facts.FWImageSize;
error = mpt_dma_tag_create(mpt, mpt->parent_dmat, 1, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
mpt->fw_image_size, 1, mpt->fw_image_size, 0,
&mpt->fw_dmat);
if (error != 0) {
mpt_prt(mpt, "cannot create firmware dma tag\n");
MPT_LOCK(mpt);
return (ENOMEM);
}
error = bus_dmamem_alloc(mpt->fw_dmat,
(void **)&mpt->fw_image, BUS_DMA_NOWAIT |
BUS_DMA_COHERENT, &mpt->fw_dmap);
if (error != 0) {
mpt_prt(mpt, "cannot allocate firmware memory\n");
bus_dma_tag_destroy(mpt->fw_dmat);
MPT_LOCK(mpt);
return (ENOMEM);
}
mi.mpt = mpt;
mi.error = 0;
bus_dmamap_load(mpt->fw_dmat, mpt->fw_dmap,
mpt->fw_image, mpt->fw_image_size, mpt_map_rquest, &mi, 0);
mpt->fw_phys = mi.phys;
MPT_LOCK(mpt);
error = mpt_upload_fw(mpt);
if (error != 0) {
mpt_prt(mpt, "firmware upload failed.\n");
bus_dmamap_unload(mpt->fw_dmat, mpt->fw_dmap);
bus_dmamem_free(mpt->fw_dmat, mpt->fw_image,
mpt->fw_dmap);
bus_dma_tag_destroy(mpt->fw_dmat);
mpt->fw_image = NULL;
return (EIO);
}
mpt->fw_uploaded = 1;
}
for (port = 0; port < mpt->ioc_facts.NumberOfPorts; port++) {
pfp = &mpt->port_facts[port];
error = mpt_get_portfacts(mpt, 0, pfp);
if (error != MPT_OK) {
mpt_prt(mpt,
"mpt_get_portfacts on port %d failed\n", port);
free(mpt->port_facts, M_DEVBUF);
mpt->port_facts = NULL;
return (mpt_configure_ioc(mpt, tn++, 1));
}
mpt2host_portfacts_reply(pfp);
if (port > 0) {
error = MPT_PRT_INFO;
} else {
error = MPT_PRT_DEBUG;
}
mpt_lprt(mpt, error,
"PORTFACTS[%d]: Type %x PFlags %x IID %d MaxDev %d\n",
port, pfp->PortType, pfp->ProtocolFlags, pfp->PortSCSIID,
pfp->MaxDevices);
}
/*
* XXX: Not yet supporting more than port 0
*/
pfp = &mpt->port_facts[0];
if (pfp->PortType == MPI_PORTFACTS_PORTTYPE_FC) {
mpt->is_fc = 1;
mpt->is_sas = 0;
mpt->is_spi = 0;
} else if (pfp->PortType == MPI_PORTFACTS_PORTTYPE_SAS) {
mpt->is_fc = 0;
mpt->is_sas = 1;
mpt->is_spi = 0;
} else if (pfp->PortType == MPI_PORTFACTS_PORTTYPE_SCSI) {
mpt->is_fc = 0;
mpt->is_sas = 0;
mpt->is_spi = 1;
if (mpt->mpt_ini_id == MPT_INI_ID_NONE)
mpt->mpt_ini_id = pfp->PortSCSIID;
} else if (pfp->PortType == MPI_PORTFACTS_PORTTYPE_ISCSI) {
mpt_prt(mpt, "iSCSI not supported yet\n");
return (ENXIO);
} else if (pfp->PortType == MPI_PORTFACTS_PORTTYPE_INACTIVE) {
mpt_prt(mpt, "Inactive Port\n");
return (ENXIO);
} else {
mpt_prt(mpt, "unknown Port Type %#x\n", pfp->PortType);
return (ENXIO);
}
/*
* Set our role based on what this port supports.
*
* Note that this may be changed later by individual modules
* if it differs from what is wanted.
*/
mpt->role = MPT_ROLE_NONE;
if (pfp->ProtocolFlags & MPI_PORTFACTS_PROTOCOL_INITIATOR) {
mpt->role |= MPT_ROLE_INITIATOR;
}
if (pfp->ProtocolFlags & MPI_PORTFACTS_PROTOCOL_TARGET) {
mpt->role |= MPT_ROLE_TARGET;
}
/*
* Enable the IOC
*/
if (mpt_enable_ioc(mpt, 1) != MPT_OK) {
mpt_prt(mpt, "unable to initialize IOC\n");
return (ENXIO);
}
/*
* Read IOC configuration information.
*
* We need this to determine whether or not we have certain
* settings (e.g. for Integrated Mirroring).
*/
mpt_read_config_info_ioc(mpt);
return (0);
}
static int
mpt_enable_ioc(struct mpt_softc *mpt, int portenable)
{
uint32_t pptr;
int val;
if (mpt_send_ioc_init(mpt, MPI_WHOINIT_HOST_DRIVER) != MPT_OK) {
mpt_prt(mpt, "mpt_send_ioc_init failed\n");
return (EIO);
}
mpt_lprt(mpt, MPT_PRT_DEBUG, "mpt_send_ioc_init ok\n");
if (mpt_wait_state(mpt, MPT_DB_STATE_RUNNING) != MPT_OK) {
mpt_prt(mpt, "IOC failed to go to run state\n");
return (ENXIO);
}
mpt_lprt(mpt, MPT_PRT_DEBUG, "IOC now at RUNSTATE\n");
/*
* Give it reply buffers
*
* Do *not* exceed global credits.
*/
for (val = 0, pptr = mpt->reply_phys;
(pptr + MPT_REPLY_SIZE) < (mpt->reply_phys + PAGE_SIZE);
pptr += MPT_REPLY_SIZE) {
mpt_free_reply(mpt, pptr);
if (++val == mpt->ioc_facts.GlobalCredits - 1)
break;
}
/*
* Enable the port if asked. This is only done if we're resetting
* the IOC after initial startup.
*/
if (portenable) {
/*
* Enable asynchronous event reporting
*/
mpt_send_event_request(mpt, 1);
if (mpt_send_port_enable(mpt, 0) != MPT_OK) {
mpt_prt(mpt, "%s: failed to enable port 0\n", __func__);
return (ENXIO);
}
}
return (MPT_OK);
}
/*
* Endian Conversion Functions- only used on Big Endian machines
*/
#if _BYTE_ORDER == _BIG_ENDIAN
void
mpt2host_sge_simple_union(SGE_SIMPLE_UNION *sge)
{
MPT_2_HOST32(sge, FlagsLength);
MPT_2_HOST32(sge, u.Address64.Low);
MPT_2_HOST32(sge, u.Address64.High);
}
void
mpt2host_iocfacts_reply(MSG_IOC_FACTS_REPLY *rp)
{
MPT_2_HOST16(rp, MsgVersion);
MPT_2_HOST16(rp, HeaderVersion);
MPT_2_HOST32(rp, MsgContext);
MPT_2_HOST16(rp, IOCExceptions);
MPT_2_HOST16(rp, IOCStatus);
MPT_2_HOST32(rp, IOCLogInfo);
MPT_2_HOST16(rp, ReplyQueueDepth);
MPT_2_HOST16(rp, RequestFrameSize);
MPT_2_HOST16(rp, Reserved_0101_FWVersion);
MPT_2_HOST16(rp, ProductID);
MPT_2_HOST32(rp, CurrentHostMfaHighAddr);
MPT_2_HOST16(rp, GlobalCredits);
MPT_2_HOST32(rp, CurrentSenseBufferHighAddr);
MPT_2_HOST16(rp, CurReplyFrameSize);
MPT_2_HOST32(rp, FWImageSize);
MPT_2_HOST32(rp, IOCCapabilities);
MPT_2_HOST32(rp, FWVersion.Word);
MPT_2_HOST16(rp, HighPriorityQueueDepth);
MPT_2_HOST16(rp, Reserved2);
mpt2host_sge_simple_union(&rp->HostPageBufferSGE);
MPT_2_HOST32(rp, ReplyFifoHostSignalingAddr);
}
void
mpt2host_portfacts_reply(MSG_PORT_FACTS_REPLY *pfp)
{
MPT_2_HOST16(pfp, Reserved);
MPT_2_HOST16(pfp, Reserved1);
MPT_2_HOST32(pfp, MsgContext);
MPT_2_HOST16(pfp, Reserved2);
MPT_2_HOST16(pfp, IOCStatus);
MPT_2_HOST32(pfp, IOCLogInfo);
MPT_2_HOST16(pfp, MaxDevices);
MPT_2_HOST16(pfp, PortSCSIID);
MPT_2_HOST16(pfp, ProtocolFlags);
MPT_2_HOST16(pfp, MaxPostedCmdBuffers);
MPT_2_HOST16(pfp, MaxPersistentIDs);
MPT_2_HOST16(pfp, MaxLanBuckets);
MPT_2_HOST16(pfp, Reserved4);
MPT_2_HOST32(pfp, Reserved5);
}
void
mpt2host_config_page_ioc2(CONFIG_PAGE_IOC_2 *ioc2)
{
int i;
MPT_2_HOST32(ioc2, CapabilitiesFlags);
for (i = 0; i < MPI_IOC_PAGE_2_RAID_VOLUME_MAX; i++) {
MPT_2_HOST16(ioc2, RaidVolume[i].Reserved3);
}
}
void
mpt2host_config_page_ioc3(CONFIG_PAGE_IOC_3 *ioc3)
{
MPT_2_HOST16(ioc3, Reserved2);
}
void
mpt2host_config_page_scsi_port_0(CONFIG_PAGE_SCSI_PORT_0 *sp0)
{
MPT_2_HOST32(sp0, Capabilities);
MPT_2_HOST32(sp0, PhysicalInterface);
}
void
mpt2host_config_page_scsi_port_1(CONFIG_PAGE_SCSI_PORT_1 *sp1)
{
MPT_2_HOST32(sp1, Configuration);
MPT_2_HOST32(sp1, OnBusTimerValue);
MPT_2_HOST16(sp1, IDConfig);
}
void
host2mpt_config_page_scsi_port_1(CONFIG_PAGE_SCSI_PORT_1 *sp1)
{
HOST_2_MPT32(sp1, Configuration);
HOST_2_MPT32(sp1, OnBusTimerValue);
HOST_2_MPT16(sp1, IDConfig);
}
void
mpt2host_config_page_scsi_port_2(CONFIG_PAGE_SCSI_PORT_2 *sp2)
{
int i;
MPT_2_HOST32(sp2, PortFlags);
MPT_2_HOST32(sp2, PortSettings);
for (i = 0; i < sizeof(sp2->DeviceSettings) /
sizeof(*sp2->DeviceSettings); i++) {
MPT_2_HOST16(sp2, DeviceSettings[i].DeviceFlags);
}
}
void
mpt2host_config_page_scsi_device_0(CONFIG_PAGE_SCSI_DEVICE_0 *sd0)
{
MPT_2_HOST32(sd0, NegotiatedParameters);
MPT_2_HOST32(sd0, Information);
}
void
mpt2host_config_page_scsi_device_1(CONFIG_PAGE_SCSI_DEVICE_1 *sd1)
{
MPT_2_HOST32(sd1, RequestedParameters);
MPT_2_HOST32(sd1, Reserved);
MPT_2_HOST32(sd1, Configuration);
}
void
host2mpt_config_page_scsi_device_1(CONFIG_PAGE_SCSI_DEVICE_1 *sd1)
{
HOST_2_MPT32(sd1, RequestedParameters);
HOST_2_MPT32(sd1, Reserved);
HOST_2_MPT32(sd1, Configuration);
}
void
mpt2host_config_page_fc_port_0(CONFIG_PAGE_FC_PORT_0 *fp0)
{
MPT_2_HOST32(fp0, Flags);
MPT_2_HOST32(fp0, PortIdentifier);
MPT_2_HOST32(fp0, WWNN.Low);
MPT_2_HOST32(fp0, WWNN.High);
MPT_2_HOST32(fp0, WWPN.Low);
MPT_2_HOST32(fp0, WWPN.High);
MPT_2_HOST32(fp0, SupportedServiceClass);
MPT_2_HOST32(fp0, SupportedSpeeds);
MPT_2_HOST32(fp0, CurrentSpeed);
MPT_2_HOST32(fp0, MaxFrameSize);
MPT_2_HOST32(fp0, FabricWWNN.Low);
MPT_2_HOST32(fp0, FabricWWNN.High);
MPT_2_HOST32(fp0, FabricWWPN.Low);
MPT_2_HOST32(fp0, FabricWWPN.High);
MPT_2_HOST32(fp0, DiscoveredPortsCount);
MPT_2_HOST32(fp0, MaxInitiators);
}
void
mpt2host_config_page_fc_port_1(CONFIG_PAGE_FC_PORT_1 *fp1)
{
MPT_2_HOST32(fp1, Flags);
MPT_2_HOST32(fp1, NoSEEPROMWWNN.Low);
MPT_2_HOST32(fp1, NoSEEPROMWWNN.High);
MPT_2_HOST32(fp1, NoSEEPROMWWPN.Low);
MPT_2_HOST32(fp1, NoSEEPROMWWPN.High);
}
void
host2mpt_config_page_fc_port_1(CONFIG_PAGE_FC_PORT_1 *fp1)
{
HOST_2_MPT32(fp1, Flags);
HOST_2_MPT32(fp1, NoSEEPROMWWNN.Low);
HOST_2_MPT32(fp1, NoSEEPROMWWNN.High);
HOST_2_MPT32(fp1, NoSEEPROMWWPN.Low);
HOST_2_MPT32(fp1, NoSEEPROMWWPN.High);
}
void
mpt2host_config_page_raid_vol_0(CONFIG_PAGE_RAID_VOL_0 *volp)
{
int i;
MPT_2_HOST16(volp, VolumeStatus.Reserved);
MPT_2_HOST16(volp, VolumeSettings.Settings);
MPT_2_HOST32(volp, MaxLBA);
MPT_2_HOST32(volp, MaxLBAHigh);
MPT_2_HOST32(volp, StripeSize);
MPT_2_HOST32(volp, Reserved2);
MPT_2_HOST32(volp, Reserved3);
for (i = 0; i < MPI_RAID_VOL_PAGE_0_PHYSDISK_MAX; i++) {
MPT_2_HOST16(volp, PhysDisk[i].Reserved);
}
}
void
mpt2host_config_page_raid_phys_disk_0(CONFIG_PAGE_RAID_PHYS_DISK_0 *rpd0)
{
MPT_2_HOST32(rpd0, Reserved1);
MPT_2_HOST16(rpd0, PhysDiskStatus.Reserved);
MPT_2_HOST32(rpd0, MaxLBA);
MPT_2_HOST16(rpd0, ErrorData.Reserved);
MPT_2_HOST16(rpd0, ErrorData.ErrorCount);
MPT_2_HOST16(rpd0, ErrorData.SmartCount);
}
void
mpt2host_mpi_raid_vol_indicator(MPI_RAID_VOL_INDICATOR *vi)
{
MPT_2_HOST16(vi, TotalBlocks.High);
MPT_2_HOST16(vi, TotalBlocks.Low);
MPT_2_HOST16(vi, BlocksRemaining.High);
MPT_2_HOST16(vi, BlocksRemaining.Low);
}
#endif
diff --git a/sys/dev/mpt/mpt.h b/sys/dev/mpt/mpt.h
index 5aecbc82a2f8..52cd3dda827e 100644
--- a/sys/dev/mpt/mpt.h
+++ b/sys/dev/mpt/mpt.h
@@ -1,1140 +1,1140 @@
/* $FreeBSD$ */
/*-
* Generic defines for LSI '909 FC adapters.
* FreeBSD Version.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-3-Clause
*
* Copyright (c) 2000, 2001 by Greg Ansley
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 2002, 2006 by Matthew Jacob
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon including
* a substantially similar Disclaimer requirement for further binary
* redistribution.
* 3. Neither the names of the above listed copyright holders nor the names
* of any contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF THE COPYRIGHT
* OWNER OR CONTRIBUTOR IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Support from Chris Ellsworth in order to make SAS adapters work
* is gratefully acknowledged.
*
*
* Support from LSI-Logic has also gone a great deal toward making this a
* workable subsystem and is gratefully acknowledged.
*/
/*
* Copyright (c) 2004, Avid Technology, Inc. and its contributors.
* Copyright (c) 2004, 2005 Justin T. Gibbs
* Copyright (c) 2005, WHEEL Sp. z o.o.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon including
* a substantially similar Disclaimer requirement for further binary
* redistribution.
* 3. Neither the names of the above listed copyright holders nor the names
* of any contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF THE COPYRIGHT
* OWNER OR CONTRIBUTOR IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _MPT_H_
#define _MPT_H_
/********************************* OS Includes ********************************/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/condvar.h>
#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/types.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/resource.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include "opt_ddb.h"
/**************************** Register Definitions ****************************/
#include <dev/mpt/mpt_reg.h>
/******************************* MPI Definitions ******************************/
#include <dev/mpt/mpilib/mpi_type.h>
#include <dev/mpt/mpilib/mpi.h>
#include <dev/mpt/mpilib/mpi_cnfg.h>
#include <dev/mpt/mpilib/mpi_ioc.h>
#include <dev/mpt/mpilib/mpi_raid.h>
/* XXX For mpt_debug.c */
#include <dev/mpt/mpilib/mpi_init.h>
#define MPT_S64_2_SCALAR(y) ((((int64_t)y.High) << 32) | (y.Low))
#define MPT_U64_2_SCALAR(y) ((((uint64_t)y.High) << 32) | (y.Low))
/****************************** Misc Definitions ******************************/
/* #define MPT_TEST_MULTIPATH 1 */
#define MPT_OK (0)
#define MPT_FAIL (0x10000)
#define NUM_ELEMENTS(array) (sizeof(array) / sizeof(*array))
#define MPT_ROLE_NONE 0
#define MPT_ROLE_INITIATOR 1
#define MPT_ROLE_TARGET 2
#define MPT_ROLE_BOTH 3
#define MPT_ROLE_DEFAULT MPT_ROLE_INITIATOR
#define MPT_INI_ID_NONE -1
/**************************** Forward Declarations ****************************/
struct mpt_softc;
struct mpt_personality;
typedef struct req_entry request_t;
/************************* Personality Module Support *************************/
typedef int mpt_load_handler_t(struct mpt_personality *);
typedef int mpt_probe_handler_t(struct mpt_softc *);
typedef int mpt_attach_handler_t(struct mpt_softc *);
typedef int mpt_enable_handler_t(struct mpt_softc *);
typedef void mpt_ready_handler_t(struct mpt_softc *);
typedef int mpt_event_handler_t(struct mpt_softc *, request_t *,
MSG_EVENT_NOTIFY_REPLY *);
typedef void mpt_reset_handler_t(struct mpt_softc *, int /*type*/);
/* XXX Add return value and use for veto? */
typedef void mpt_shutdown_handler_t(struct mpt_softc *);
typedef void mpt_detach_handler_t(struct mpt_softc *);
typedef int mpt_unload_handler_t(struct mpt_personality *);
struct mpt_personality
{
const char *name;
uint32_t id; /* Assigned identifier. */
u_int use_count; /* Instances using personality */
mpt_load_handler_t *load; /* configure personality */
#define MPT_PERS_FIRST_HANDLER(pers) (&(pers)->load)
mpt_probe_handler_t *probe; /* configure personality */
mpt_attach_handler_t *attach; /* initialize device instance */
mpt_enable_handler_t *enable; /* enable device */
mpt_ready_handler_t *ready; /* final open for business */
mpt_event_handler_t *event; /* Handle MPI event. */
mpt_reset_handler_t *reset; /* Re-init after reset. */
mpt_shutdown_handler_t *shutdown; /* Shutdown instance. */
mpt_detach_handler_t *detach; /* release device instance */
mpt_unload_handler_t *unload; /* Shutdown personality */
#define MPT_PERS_LAST_HANDLER(pers) (&(pers)->unload)
};
int mpt_modevent(module_t, int, void *);
/* Maximum supported number of personalities. */
#define MPT_MAX_PERSONALITIES (15)
#define MPT_PERSONALITY_DEPEND(name, dep, vmin, vpref, vmax) \
MODULE_DEPEND(name, dep, vmin, vpref, vmax)
#define DECLARE_MPT_PERSONALITY(name, order) \
static moduledata_t name##_mod = { \
#name, mpt_modevent, &name##_personality \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, order); \
MODULE_VERSION(name, 1); \
MPT_PERSONALITY_DEPEND(name, mpt_core, 1, 1, 1)
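/*
 * Illustrative sketch (editorial addition, hypothetical module): a
 * personality named "mpt_example" would fill in a struct mpt_personality
 * and then declare itself with the macro above, e.g.:
 *
 *	static struct mpt_personality mpt_example_personality = {
 *		.name = "mpt_example",
 *		.load = mpt_example_load,
 *		.unload = mpt_example_unload,
 *	};
 *	DECLARE_MPT_PERSONALITY(mpt_example, SI_ORDER_SECOND);
 */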
/******************************* Bus DMA Support ******************************/
/* XXX Need to update bus_dmamap_sync to take a range argument. */
#define bus_dmamap_sync_range(dma_tag, dmamap, offset, len, op) \
bus_dmamap_sync(dma_tag, dmamap, op)
#define mpt_dma_tag_create(mpt, parent_tag, alignment, boundary, \
lowaddr, highaddr, filter, filterarg, \
maxsize, nsegments, maxsegsz, flags, \
dma_tagp) \
bus_dma_tag_create(parent_tag, alignment, boundary, \
lowaddr, highaddr, filter, filterarg, \
maxsize, nsegments, maxsegsz, flags, \
busdma_lock_mutex, &(mpt)->mpt_lock, \
dma_tagp)
struct mpt_map_info {
struct mpt_softc *mpt;
int error;
uint32_t phys;
};
void mpt_map_rquest(void *, bus_dma_segment_t *, int, int);
/********************************* Endianness *********************************/
#define MPT_2_HOST64(ptr, tag) ptr->tag = le64toh(ptr->tag)
#define MPT_2_HOST32(ptr, tag) ptr->tag = le32toh(ptr->tag)
#define MPT_2_HOST16(ptr, tag) ptr->tag = le16toh(ptr->tag)
#define HOST_2_MPT64(ptr, tag) ptr->tag = htole64(ptr->tag)
#define HOST_2_MPT32(ptr, tag) ptr->tag = htole32(ptr->tag)
#define HOST_2_MPT16(ptr, tag) ptr->tag = htole16(ptr->tag)
#if _BYTE_ORDER == _BIG_ENDIAN
void mpt2host_sge_simple_union(SGE_SIMPLE_UNION *);
void mpt2host_iocfacts_reply(MSG_IOC_FACTS_REPLY *);
void mpt2host_portfacts_reply(MSG_PORT_FACTS_REPLY *);
void mpt2host_config_page_ioc2(CONFIG_PAGE_IOC_2 *);
void mpt2host_config_page_ioc3(CONFIG_PAGE_IOC_3 *);
void mpt2host_config_page_scsi_port_0(CONFIG_PAGE_SCSI_PORT_0 *);
void mpt2host_config_page_scsi_port_1(CONFIG_PAGE_SCSI_PORT_1 *);
void host2mpt_config_page_scsi_port_1(CONFIG_PAGE_SCSI_PORT_1 *);
void mpt2host_config_page_scsi_port_2(CONFIG_PAGE_SCSI_PORT_2 *);
void mpt2host_config_page_scsi_device_0(CONFIG_PAGE_SCSI_DEVICE_0 *);
void mpt2host_config_page_scsi_device_1(CONFIG_PAGE_SCSI_DEVICE_1 *);
void host2mpt_config_page_scsi_device_1(CONFIG_PAGE_SCSI_DEVICE_1 *);
void mpt2host_config_page_fc_port_0(CONFIG_PAGE_FC_PORT_0 *);
void mpt2host_config_page_fc_port_1(CONFIG_PAGE_FC_PORT_1 *);
void host2mpt_config_page_fc_port_1(CONFIG_PAGE_FC_PORT_1 *);
void mpt2host_config_page_raid_vol_0(CONFIG_PAGE_RAID_VOL_0 *);
void mpt2host_config_page_raid_phys_disk_0(CONFIG_PAGE_RAID_PHYS_DISK_0 *);
void mpt2host_mpi_raid_vol_indicator(MPI_RAID_VOL_INDICATOR *);
#else
#define mpt2host_sge_simple_union(x) do { ; } while (0)
#define mpt2host_iocfacts_reply(x) do { ; } while (0)
#define mpt2host_portfacts_reply(x) do { ; } while (0)
#define mpt2host_config_page_ioc2(x) do { ; } while (0)
#define mpt2host_config_page_ioc3(x) do { ; } while (0)
#define mpt2host_config_page_scsi_port_0(x) do { ; } while (0)
#define mpt2host_config_page_scsi_port_1(x) do { ; } while (0)
#define host2mpt_config_page_scsi_port_1(x) do { ; } while (0)
#define mpt2host_config_page_scsi_port_2(x) do { ; } while (0)
#define mpt2host_config_page_scsi_device_0(x) do { ; } while (0)
#define mpt2host_config_page_scsi_device_1(x) do { ; } while (0)
#define host2mpt_config_page_scsi_device_1(x) do { ; } while (0)
#define mpt2host_config_page_fc_port_0(x) do { ; } while (0)
#define mpt2host_config_page_fc_port_1(x) do { ; } while (0)
#define host2mpt_config_page_fc_port_1(x) do { ; } while (0)
#define mpt2host_config_page_raid_vol_0(x) do { ; } while (0)
#define mpt2host_config_page_raid_phys_disk_0(x) \
do { ; } while (0)
#define mpt2host_mpi_raid_vol_indicator(x) do { ; } while (0)
#endif
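/*
 * Illustrative sketch (editorial addition, hypothetical helper): the
 * conversion routines above are called unconditionally by the core.  On
 * little-endian hosts they compile away to empty statements; on
 * big-endian hosts they byte-swap every multi-byte field in place,
 * which is equivalent to applying MPT_2_HOST16()/MPT_2_HOST32() to each
 * field by hand.
 */
static __inline void
mpt_swap_portfacts_example(MSG_PORT_FACTS_REPLY *pfp)
{
	mpt2host_portfacts_reply(pfp);
}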
/**************************** MPI Transaction State ***************************/
typedef enum {
REQ_STATE_NIL = 0x00,
REQ_STATE_FREE = 0x01,
REQ_STATE_ALLOCATED = 0x02,
REQ_STATE_QUEUED = 0x04,
REQ_STATE_DONE = 0x08,
REQ_STATE_TIMEDOUT = 0x10,
REQ_STATE_NEED_WAKEUP = 0x20,
REQ_STATE_LOCKED = 0x80, /* can't be freed */
REQ_STATE_MASK = 0xFF
} mpt_req_state_t;
struct req_entry {
TAILQ_ENTRY(req_entry) links; /* Pointer to next in list */
mpt_req_state_t state; /* Request State Information */
uint16_t index; /* Index of this entry */
uint16_t IOCStatus; /* Completion status */
uint16_t ResponseCode; /* TMF Response Code */
uint16_t serno; /* serial number */
union ccb *ccb; /* CAM request */
void *req_vbuf; /* Virtual Address of Entry */
void *sense_vbuf; /* Virtual Address of sense data */
bus_addr_t req_pbuf; /* Physical Address of Entry */
bus_addr_t sense_pbuf; /* Physical Address of sense data */
bus_dmamap_t dmap; /* DMA map for data buffers */
struct req_entry *chain; /* for SGE overallocations */
struct callout callout; /* Timeout for the request */
};
typedef struct mpt_config_params {
u_int Action;
u_int PageVersion;
u_int PageLength;
u_int PageNumber;
u_int PageType;
u_int PageAddress;
u_int ExtPageLength;
u_int ExtPageType;
} cfgparms_t;
/**************************** MPI Target State Info ***************************/
typedef struct {
uint32_t reply_desc; /* current reply descriptor */
uint32_t bytes_xfered; /* current relative offset */
int resid; /* current data residual */
union ccb *ccb; /* pointer to currently active ccb */
request_t *req; /* pointer to currently active assist request */
uint32_t
is_local : 1,
nxfers : 31;
uint32_t tag_id; /* Our local tag. */
uint16_t itag; /* Initiator tag. */
enum {
TGT_STATE_NIL,
TGT_STATE_LOADING,
TGT_STATE_LOADED,
TGT_STATE_IN_CAM,
TGT_STATE_SETTING_UP_FOR_DATA,
TGT_STATE_MOVING_DATA,
TGT_STATE_MOVING_DATA_AND_STATUS,
TGT_STATE_SENDING_STATUS
} state;
} mpt_tgt_state_t;
/*
* When we get an incoming command it has its own tag which is called the
* IoIndex. This is the value we gave that particular command buffer when
* we originally assigned it. It's just a number, really. The FC card uses
* it as an RX_ID. We can use it to index into mpt->tgt_cmd_ptrs, which
* contains pointers to the request_t structures related to that IoIndex.
*
* What *we* do is construct a tag out of the index for the target command
* which owns the incoming ATIO plus a rolling sequence number.
*/
#define MPT_MAKE_TAGID(mpt, req, ioindex) \
((ioindex << 18) | (((mpt->sequence++) & 0x3f) << 12) | (req->index & 0xfff))
#ifdef INVARIANTS
#define MPT_TAG_2_REQ(a, b) mpt_tag_2_req(a, (uint32_t) b)
#else
#define MPT_TAG_2_REQ(mpt, tag) mpt->tgt_cmd_ptrs[tag >> 18]
#endif
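/*
 * Illustrative sketch (editorial addition, hypothetical helper): take a
 * tag built by MPT_MAKE_TAGID() back apart.  Bits 31..18 carry the
 * IoIndex, bits 17..12 the rolling sequence number and bits 11..0 the
 * request index, mirroring the macro above.
 */
static __inline void
mpt_tag_decode_example(uint32_t tag, u_int *ioindex, u_int *seq, u_int *reqidx)
{
	*ioindex = tag >> 18;
	*seq = (tag >> 12) & 0x3f;
	*reqidx = tag & 0xfff;
}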
#define MPT_TGT_STATE(mpt, req) ((mpt_tgt_state_t *) \
(&((uint8_t *)req->req_vbuf)[MPT_RQSL(mpt) - sizeof (mpt_tgt_state_t)]))
STAILQ_HEAD(mpt_hdr_stailq, ccb_hdr);
#define MPT_MAX_LUNS 256
typedef struct {
struct mpt_hdr_stailq atios;
struct mpt_hdr_stailq inots;
int enabled;
} tgt_resource_t;
#define MPT_MAX_ELS 64
/**************************** Handler Registration ****************************/
/*
* Global table of registered reply handlers. The
* handler is indicated by byte 3 of the request
* index submitted to the IOC. This allows the
* driver core to perform generic processing without
* any knowledge of per-personality behavior.
*
* MPT_NUM_REPLY_HANDLERS must be a power of 2
* to allow the easy generation of a mask.
*
* The handler offsets used by the core are hard-coded,
* allowing faster code generation when assigning a handler
* to a request. All "personalities" must use the
* handler registration mechanism.
*
* The IOC handlers that are rarely executed are placed
* at the tail of the table to make it more likely that
* all commonly executed handlers fit in a single cache
* line.
*/
#define MPT_NUM_REPLY_HANDLERS (32)
#define MPT_REPLY_HANDLER_EVENTS MPT_CBI_TO_HID(0)
#define MPT_REPLY_HANDLER_CONFIG MPT_CBI_TO_HID(MPT_NUM_REPLY_HANDLERS-1)
#define MPT_REPLY_HANDLER_HANDSHAKE MPT_CBI_TO_HID(MPT_NUM_REPLY_HANDLERS-2)
typedef int mpt_reply_handler_t(struct mpt_softc *mpt, request_t *request,
uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame);
typedef union {
mpt_reply_handler_t *reply_handler;
} mpt_handler_t;
typedef enum {
MPT_HANDLER_REPLY,
MPT_HANDLER_EVENT,
MPT_HANDLER_RESET,
MPT_HANDLER_SHUTDOWN
} mpt_handler_type;
struct mpt_handler_record
{
LIST_ENTRY(mpt_handler_record) links;
mpt_handler_t handler;
};
LIST_HEAD(mpt_handler_list, mpt_handler_record);
/*
* The handler_id is currently unused but would contain the
* handler ID used in the MsgContext field so that replies can
* be directed to the handler. Registrations that don't require
* a handler id can pass in NULL for the handler_id.
*
* Deregistrations for handlers without a handler id should
* pass in MPT_HANDLER_ID_NONE.
*/
#define MPT_HANDLER_ID_NONE (0xFFFFFFFF)
int mpt_register_handler(struct mpt_softc *, mpt_handler_type,
mpt_handler_t, uint32_t *);
int mpt_deregister_handler(struct mpt_softc *, mpt_handler_type,
mpt_handler_t, uint32_t);
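/*
 * Illustrative sketch (editorial addition, hypothetical names): how a
 * personality would register a reply handler with the declarations
 * above.
 */
static __inline int
example_reply_handler(struct mpt_softc *mpt, request_t *req,
    uint32_t reply_desc, MSG_DEFAULT_REPLY *reply_frame)
{

	/* A real handler would act on reply_desc/reply_frame here. */
	return (1);
}

static __inline void
example_register_reply_handler(struct mpt_softc *mpt)
{
	mpt_handler_t handler;
	uint32_t handler_id;

	handler.reply_handler = example_reply_handler;
	(void)mpt_register_handler(mpt, MPT_HANDLER_REPLY, handler,
	    &handler_id);
}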
/******************* Per-Controller Instance Data Structures ******************/
TAILQ_HEAD(req_queue, req_entry);
/* Structure for saving proper values for modifiable PCI config registers */
struct mpt_pci_cfg {
uint16_t Command;
uint16_t LatencyTimer_LineSize;
uint32_t IO_BAR;
uint32_t Mem0_BAR[2];
uint32_t Mem1_BAR[2];
uint32_t ROM_BAR;
uint8_t IntLine;
uint32_t PMCSR;
};
typedef enum {
MPT_RVF_NONE = 0x0,
MPT_RVF_ACTIVE = 0x1,
MPT_RVF_ANNOUNCED = 0x2,
MPT_RVF_UP2DATE = 0x4,
MPT_RVF_REFERENCED = 0x8,
MPT_RVF_WCE_CHANGED = 0x10
} mpt_raid_volume_flags;
struct mpt_raid_volume {
CONFIG_PAGE_RAID_VOL_0 *config_page;
MPI_RAID_VOL_INDICATOR sync_progress;
mpt_raid_volume_flags flags;
u_int quiesced_disks;
};
typedef enum {
MPT_RDF_NONE = 0x00,
MPT_RDF_ACTIVE = 0x01,
MPT_RDF_ANNOUNCED = 0x02,
MPT_RDF_UP2DATE = 0x04,
MPT_RDF_REFERENCED = 0x08,
MPT_RDF_QUIESCING = 0x10,
MPT_RDF_QUIESCED = 0x20
} mpt_raid_disk_flags;
struct mpt_raid_disk {
CONFIG_PAGE_RAID_PHYS_DISK_0 config_page;
struct mpt_raid_volume *volume;
u_int member_number;
u_int pass_thru_active;
mpt_raid_disk_flags flags;
};
struct mpt_evtf_record {
MSG_EVENT_NOTIFY_REPLY reply;
uint32_t context;
LIST_ENTRY(mpt_evtf_record) links;
};
LIST_HEAD(mpt_evtf_list, mpt_evtf_record);
struct mptsas_devinfo {
uint16_t dev_handle;
uint16_t parent_dev_handle;
uint16_t enclosure_handle;
uint16_t slot;
uint8_t phy_num;
uint8_t physical_port;
uint8_t target_id;
uint8_t bus;
uint64_t sas_address;
uint32_t device_info;
};
struct mptsas_phyinfo {
uint16_t handle;
uint8_t phy_num;
uint8_t port_id;
uint8_t negotiated_link_rate;
uint8_t hw_link_rate;
uint8_t programmed_link_rate;
uint8_t sas_port_add_phy;
struct mptsas_devinfo identify;
struct mptsas_devinfo attached;
};
struct mptsas_portinfo {
uint16_t num_phys;
struct mptsas_phyinfo *phy_info;
};
struct mpt_softc {
device_t dev;
struct mtx mpt_lock;
int mpt_locksetup;
uint32_t mpt_pers_mask;
uint32_t
: 7,
unit : 8,
ready : 1,
fw_uploaded : 1,
msi_enable : 1,
twildcard : 1,
tenabled : 1,
do_cfg_role : 1,
raid_enabled : 1,
raid_mwce_set : 1,
getreqwaiter : 1,
shutdwn_raid : 1,
shutdwn_recovery: 1,
outofbeer : 1,
disabled : 1,
is_spi : 1,
is_sas : 1,
is_fc : 1,
is_1078 : 1;
u_int cfg_role;
u_int role; /* role: none, ini, target, both */
u_int verbose;
#ifdef MPT_TEST_MULTIPATH
int failure_id;
#endif
/*
* IOC Facts
*/
MSG_IOC_FACTS_REPLY ioc_facts;
/*
* Port Facts
*/
MSG_PORT_FACTS_REPLY * port_facts;
#define mpt_max_tgtcmds port_facts[0].MaxPostedCmdBuffers
/*
* Device Configuration Information
*/
union {
struct mpt_spi_cfg {
CONFIG_PAGE_SCSI_PORT_0 _port_page0;
CONFIG_PAGE_SCSI_PORT_1 _port_page1;
CONFIG_PAGE_SCSI_PORT_2 _port_page2;
CONFIG_PAGE_SCSI_DEVICE_0 _dev_page0[16];
CONFIG_PAGE_SCSI_DEVICE_1 _dev_page1[16];
int _ini_id;
uint16_t _tag_enable;
uint16_t _disc_enable;
} spi;
#define mpt_port_page0 cfg.spi._port_page0
#define mpt_port_page1 cfg.spi._port_page1
#define mpt_port_page2 cfg.spi._port_page2
#define mpt_dev_page0 cfg.spi._dev_page0
#define mpt_dev_page1 cfg.spi._dev_page1
#define mpt_ini_id cfg.spi._ini_id
#define mpt_tag_enable cfg.spi._tag_enable
#define mpt_disc_enable cfg.spi._disc_enable
struct mpi_fc_cfg {
CONFIG_PAGE_FC_PORT_0 _port_page0;
uint32_t _port_speed;
#define mpt_fcport_page0 cfg.fc._port_page0
#define mpt_fcport_speed cfg.fc._port_speed
} fc;
} cfg;
/*
* Device config information stored up for sysctl to access
*/
union {
struct {
unsigned int initiator_id;
} spi;
struct {
uint64_t wwnn;
uint64_t wwpn;
uint32_t portid;
} fc;
} scinfo;
/* Controller Info for RAID information */
CONFIG_PAGE_IOC_2 * ioc_page2;
CONFIG_PAGE_IOC_3 * ioc_page3;
/* Raid Data */
struct mpt_raid_volume* raid_volumes;
struct mpt_raid_disk* raid_disks;
u_int raid_max_volumes;
u_int raid_max_disks;
u_int raid_page0_len;
u_int raid_wakeup;
u_int raid_rescan;
u_int raid_resync_rate;
u_int raid_mwce_setting;
u_int raid_queue_depth;
u_int raid_nonopt_volumes;
struct proc *raid_thread;
struct callout raid_timer;
/*
* PCI Hardware info
*/
struct resource * pci_irq; /* Interrupt map for chip */
void * ih; /* Interrupt handle */
#if 0
struct mpt_pci_cfg pci_cfg; /* saved PCI conf registers */
#endif
/*
* DMA Mapping Stuff
*/
struct resource * pci_reg; /* Register map for chip */
bus_space_tag_t pci_st; /* Bus tag for registers */
bus_space_handle_t pci_sh; /* Bus handle for registers */
/* PIO versions of above. */
struct resource * pci_pio_reg;
bus_space_tag_t pci_pio_st;
bus_space_handle_t pci_pio_sh;
bus_dma_tag_t parent_dmat; /* DMA tag for parent PCI bus */
bus_dma_tag_t reply_dmat; /* DMA tag for reply memory */
bus_dmamap_t reply_dmap; /* DMA map for reply memory */
uint8_t *reply; /* KVA of reply memory */
bus_addr_t reply_phys; /* BusAddr of reply memory */
bus_dma_tag_t buffer_dmat; /* DMA tag for buffers */
bus_dma_tag_t request_dmat; /* DMA tag for request memory */
bus_dmamap_t request_dmap; /* DMA map for request memory */
uint8_t *request; /* KVA of Request memory */
bus_addr_t request_phys; /* BusAddr of request memory */
uint32_t max_seg_cnt; /* calculated after IOC facts */
- uint32_t max_cam_seg_cnt;/* calculated from MAXPHYS*/
+ uint32_t max_cam_seg_cnt;/* calculated from maxphys */
/*
* Hardware management
*/
u_int reset_cnt;
/*
* CAM && Software Management
*/
request_t *request_pool;
struct req_queue request_free_list;
struct req_queue request_pending_list;
struct req_queue request_timeout_list;
struct cam_sim *sim;
struct cam_path *path;
struct cam_sim *phydisk_sim;
struct cam_path *phydisk_path;
struct proc *recovery_thread;
request_t *tmf_req;
/*
* Deferred frame acks due to resource shortage.
*/
struct mpt_evtf_list ack_frames;
/*
* Target Mode Support
*/
uint32_t scsi_tgt_handler_id;
request_t ** tgt_cmd_ptrs;
request_t ** els_cmd_ptrs; /* FC only */
/*
* *snork*- this is chosen to be here *just in case* somebody
* forgets to point to it exactly and we index off of trt with
* CAM_LUN_WILDCARD.
*/
tgt_resource_t trt_wildcard; /* wildcard luns */
tgt_resource_t trt[MPT_MAX_LUNS];
uint16_t tgt_cmds_allocated;
uint16_t els_cmds_allocated; /* FC only */
uint16_t timeouts; /* timeout count */
uint16_t success; /* successes after timeout */
uint16_t sequence; /* Sequence Number */
uint16_t pad3;
#if 0
/* Paired port in some dual adapters configurations */
struct mpt_softc * mpt2;
#endif
/* FW Image management */
uint32_t fw_image_size;
uint8_t *fw_image;
bus_dma_tag_t fw_dmat; /* DMA tag for firmware image */
bus_dmamap_t fw_dmap; /* DMA map for firmware image */
bus_addr_t fw_phys; /* BusAddr of firmware image */
/* SAS Topology */
struct mptsas_portinfo *sas_portinfo;
/* Shutdown Event Handler. */
eventhandler_tag eh;
/* Userland management interface. */
struct cdev *cdev;
TAILQ_ENTRY(mpt_softc) links;
};
static __inline void mpt_assign_serno(struct mpt_softc *, request_t *);
static __inline void
mpt_assign_serno(struct mpt_softc *mpt, request_t *req)
{
if ((req->serno = mpt->sequence++) == 0) {
req->serno = mpt->sequence++;
}
}
/***************************** Locking Primitives *****************************/
#define MPT_IFLAGS INTR_TYPE_CAM | INTR_ENTROPY | INTR_MPSAFE
#define MPT_LOCK_SETUP(mpt) \
mtx_init(&mpt->mpt_lock, "mpt", NULL, MTX_DEF); \
mpt->mpt_locksetup = 1
#define MPT_LOCK_DESTROY(mpt) \
if (mpt->mpt_locksetup) { \
mtx_destroy(&mpt->mpt_lock); \
mpt->mpt_locksetup = 0; \
}
#define MPT_LOCK(mpt) mtx_lock(&(mpt)->mpt_lock)
#define MPT_UNLOCK(mpt) mtx_unlock(&(mpt)->mpt_lock)
#define MPT_OWNED(mpt) mtx_owned(&(mpt)->mpt_lock)
#define MPT_LOCK_ASSERT(mpt) mtx_assert(&(mpt)->mpt_lock, MA_OWNED)
#define mpt_sleep(mpt, ident, priority, wmesg, sbt) \
msleep_sbt(ident, &(mpt)->mpt_lock, priority, wmesg, sbt, 0, 0)
#define mpt_req_timeout(req, sbt, func, arg) \
callout_reset_sbt(&(req)->callout, (sbt), 0, (func), (arg), 0)
#define mpt_req_untimeout(req, func, arg) \
callout_stop(&(req)->callout)
#define mpt_callout_init(mpt, c) \
callout_init_mtx(c, &(mpt)->mpt_lock, 0)
#define mpt_callout_drain(mpt, c) \
callout_drain(c)
/******************************* Register Access ******************************/
static __inline void mpt_write(struct mpt_softc *, size_t, uint32_t);
static __inline void mpt_write_stream(struct mpt_softc *, size_t, uint32_t);
static __inline uint32_t mpt_read(struct mpt_softc *, int);
static __inline void mpt_pio_write(struct mpt_softc *, size_t, uint32_t);
static __inline uint32_t mpt_pio_read(struct mpt_softc *, int);
static __inline void
mpt_write(struct mpt_softc *mpt, size_t offset, uint32_t val)
{
bus_space_write_4(mpt->pci_st, mpt->pci_sh, offset, val);
}
static __inline void
mpt_write_stream(struct mpt_softc *mpt, size_t offset, uint32_t val)
{
bus_space_write_stream_4(mpt->pci_st, mpt->pci_sh, offset, val);
}
static __inline uint32_t
mpt_read(struct mpt_softc *mpt, int offset)
{
return (bus_space_read_4(mpt->pci_st, mpt->pci_sh, offset));
}
/*
* Some operations (e.g. diagnostic register writes while the ARM processor
* is disabled) must be performed using "PCI pio" operations. On non-PCI
* buses, these operations likely map to normal register accesses.
*/
static __inline void
mpt_pio_write(struct mpt_softc *mpt, size_t offset, uint32_t val)
{
KASSERT(mpt->pci_pio_reg != NULL, ("no PIO resource"));
bus_space_write_4(mpt->pci_pio_st, mpt->pci_pio_sh, offset, val);
}
static __inline uint32_t
mpt_pio_read(struct mpt_softc *mpt, int offset)
{
KASSERT(mpt->pci_pio_reg != NULL, ("no PIO resource"));
return (bus_space_read_4(mpt->pci_pio_st, mpt->pci_pio_sh, offset));
}
/*********************** Reply Frame/Request Management ***********************/
/* Max MPT Reply we are willing to accept (must be power of 2) */
#define MPT_REPLY_SIZE 256
/*
* Must be less than 16384 in order for target mode to work
*/
#define MPT_MAX_REQUESTS(mpt) 512
#define MPT_REQUEST_AREA 512
#define MPT_SENSE_SIZE 32 /* included in MPT_REQUEST_AREA */
#define MPT_REQ_MEM_SIZE(mpt) (MPT_MAX_REQUESTS(mpt) * MPT_REQUEST_AREA)
#define MPT_CONTEXT_CB_SHIFT (16)
#define MPT_CBI(handle) (handle >> MPT_CONTEXT_CB_SHIFT)
#define MPT_CBI_TO_HID(cbi) ((cbi) << MPT_CONTEXT_CB_SHIFT)
#define MPT_CONTEXT_TO_CBI(x) \
(((x) >> MPT_CONTEXT_CB_SHIFT) & (MPT_NUM_REPLY_HANDLERS - 1))
#define MPT_CONTEXT_REQI_MASK 0xFFFF
#define MPT_CONTEXT_TO_REQI(x) ((x) & MPT_CONTEXT_REQI_MASK)
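/*
 * Illustrative sketch (editorial addition, hypothetical helper): split a
 * MsgContext into the reply-handler (callback) index and the request
 * index using the macros above.
 */
static __inline void
mpt_context_decode_example(uint32_t context, u_int *cbi, u_int *reqi)
{
	*cbi = MPT_CONTEXT_TO_CBI(context);	/* which reply handler */
	*reqi = MPT_CONTEXT_TO_REQI(context);	/* which request_t */
}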
/*
* Convert a 32-bit physical address returned from the IOC to an
* offset into our reply frame memory or the kernel virtual address needed
* to access the data. The returned address is only the low
* 32 bits, so mask our base physical address accordingly.
*/
#define MPT_REPLY_BADDR(x) \
(x << 1)
#define MPT_REPLY_OTOV(m, i) \
((void *)(&m->reply[i]))
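/*
 * Illustrative sketch (editorial addition, hypothetical helper): recover
 * the kernel virtual address of a reply frame from the value popped off
 * the reply queue.  The MPT_REPLY_BADDR() macro suggests the IOC hands
 * back the bus address shifted right by one; undoing the shift and
 * subtracting the low 32 bits of reply_phys yields the offset into the
 * reply area.
 */
static __inline void *
mpt_reply_kva_example(struct mpt_softc *mpt, uint32_t reply_baddr)
{
	uint32_t offset;

	offset = MPT_REPLY_BADDR(reply_baddr) -
	    (mpt->reply_phys & 0xffffffff);
	return (MPT_REPLY_OTOV(mpt, offset));
}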
#define MPT_DUMP_REPLY_FRAME(mpt, reply_frame) \
do { \
if (mpt->verbose > MPT_PRT_DEBUG) \
mpt_dump_reply_frame(mpt, reply_frame); \
} while(0)
static __inline uint32_t mpt_pop_reply_queue(struct mpt_softc *mpt);
static __inline void mpt_free_reply(struct mpt_softc *mpt, uint32_t ptr);
/*
* Give the reply buffer back to the IOC after we have
* finished processing it.
*/
static __inline void
mpt_free_reply(struct mpt_softc *mpt, uint32_t ptr)
{
mpt_write(mpt, MPT_OFFSET_REPLY_Q, ptr);
}
/* Get a reply from the IOC */
static __inline uint32_t
mpt_pop_reply_queue(struct mpt_softc *mpt)
{
return mpt_read(mpt, MPT_OFFSET_REPLY_Q);
}
void
mpt_complete_request_chain(struct mpt_softc *, struct req_queue *, u_int);
/************************** Scatter Gather Management **************************/
/* MPT_RQSL- size of request frame, in bytes */
#define MPT_RQSL(mpt) (mpt->ioc_facts.RequestFrameSize << 2)
/* MPT_NSGL- how many SG entries can fit in a request frame size */
#define MPT_NSGL(mpt) (MPT_RQSL(mpt) / sizeof (SGE_IO_UNION))
/* MPT_NRFM- how many request frames can fit in each request alloc we make */
#define MPT_NRFM(mpt) (MPT_REQUEST_AREA / MPT_RQSL(mpt))
/*
* MPT_NSGL_FIRST- # of SG elements that can fit after
* an I/O request but still within the request frame.
* Do this safely based upon SGE_IO_UNION.
*
* Note that the first element is *within* the SCSI request.
*/
#define MPT_NSGL_FIRST(mpt) \
((MPT_RQSL(mpt) - sizeof (MSG_SCSI_IO_REQUEST) + sizeof (SGE_IO_UNION)) / \
sizeof (SGE_IO_UNION))
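/*
 * Illustrative sketch (editorial addition, hypothetical helper): the
 * "+ sizeof (SGE_IO_UNION)" term above accounts for the SGE that is
 * already embedded in MSG_SCSI_IO_REQUEST, so at least one SG element
 * always fits.  A quick sanity check of that invariant:
 */
static __inline int
mpt_nsgl_first_ok_example(struct mpt_softc *mpt)
{
	return (MPT_NSGL_FIRST(mpt) >= 1);
}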
/***************************** IOC Initialization *****************************/
int mpt_reset(struct mpt_softc *, int /*reinit*/);
/****************************** Debugging ************************************/
void mpt_dump_data(struct mpt_softc *, const char *, void *, int);
void mpt_dump_request(struct mpt_softc *, request_t *);
enum {
MPT_PRT_ALWAYS,
MPT_PRT_FATAL,
MPT_PRT_ERROR,
MPT_PRT_WARN,
MPT_PRT_INFO,
MPT_PRT_NEGOTIATION,
MPT_PRT_DEBUG,
MPT_PRT_DEBUG1,
MPT_PRT_DEBUG2,
MPT_PRT_DEBUG3,
MPT_PRT_TRACE,
MPT_PRT_NONE=100
};
#define mpt_lprt(mpt, level, ...) \
do { \
if ((level) <= (mpt)->verbose) \
mpt_prt(mpt, __VA_ARGS__); \
} while (0)
#if 0
#define mpt_lprtc(mpt, level, ...) \
do { \
if ((level) <= (mpt)->verbose) \
mpt_prtc(mpt, __VA_ARGS__); \
} while (0)
#endif
void mpt_prt(struct mpt_softc *, const char *, ...)
__printflike(2, 3);
void mpt_prtc(struct mpt_softc *, const char *, ...)
__printflike(2, 3);
/**************************** Target Mode Related ***************************/
#ifdef INVARIANTS
static __inline request_t * mpt_tag_2_req(struct mpt_softc *, uint32_t);
static __inline request_t *
mpt_tag_2_req(struct mpt_softc *mpt, uint32_t tag)
{
uint16_t rtg = (tag >> 18);
KASSERT(rtg < mpt->tgt_cmds_allocated, ("bad tag %d", tag));
KASSERT(mpt->tgt_cmd_ptrs, ("no cmd backpointer array"));
KASSERT(mpt->tgt_cmd_ptrs[rtg], ("no cmd backpointer"));
return (mpt->tgt_cmd_ptrs[rtg]);
}
#endif
static __inline int
mpt_req_on_free_list(struct mpt_softc *, request_t *);
static __inline int
mpt_req_on_pending_list(struct mpt_softc *, request_t *);
/*
* Is request on freelist?
*/
static __inline int
mpt_req_on_free_list(struct mpt_softc *mpt, request_t *req)
{
request_t *lrq;
TAILQ_FOREACH(lrq, &mpt->request_free_list, links) {
if (lrq == req) {
return (1);
}
}
return (0);
}
/*
* Is request on pending list?
*/
static __inline int
mpt_req_on_pending_list(struct mpt_softc *mpt, request_t *req)
{
request_t *lrq;
TAILQ_FOREACH(lrq, &mpt->request_pending_list, links) {
if (lrq == req) {
return (1);
}
}
return (0);
}
#ifdef INVARIANTS
static __inline void
mpt_req_spcl(struct mpt_softc *, request_t *, const char *, int);
static __inline void
mpt_req_not_spcl(struct mpt_softc *, request_t *, const char *, int);
/*
* Make sure that req *is* part of one of the special lists
*/
static __inline void
mpt_req_spcl(struct mpt_softc *mpt, request_t *req, const char *s, int line)
{
int i;
for (i = 0; i < mpt->els_cmds_allocated; i++) {
if (req == mpt->els_cmd_ptrs[i]) {
return;
}
}
for (i = 0; i < mpt->tgt_cmds_allocated; i++) {
if (req == mpt->tgt_cmd_ptrs[i]) {
return;
}
}
panic("%s(%d): req %p:%u function %x not in els or tgt ptrs",
s, line, req, req->serno,
((PTR_MSG_REQUEST_HEADER)req->req_vbuf)->Function);
}
/*
* Make sure that req is *not* part of one of the special lists.
*/
static __inline void
mpt_req_not_spcl(struct mpt_softc *mpt, request_t *req, const char *s, int line)
{
int i;
for (i = 0; i < mpt->els_cmds_allocated; i++) {
KASSERT(req != mpt->els_cmd_ptrs[i],
("%s(%d): req %p:%u func %x in els ptrs at ioindex %d",
s, line, req, req->serno,
((PTR_MSG_REQUEST_HEADER)req->req_vbuf)->Function, i));
}
for (i = 0; i < mpt->tgt_cmds_allocated; i++) {
KASSERT(req != mpt->tgt_cmd_ptrs[i],
("%s(%d): req %p:%u func %x in tgt ptrs at ioindex %d",
s, line, req, req->serno,
((PTR_MSG_REQUEST_HEADER)req->req_vbuf)->Function, i));
}
}
#endif
/*
* Task Management Types, purely for internal consumption
*/
typedef enum {
MPT_QUERY_TASK_SET=1234,
MPT_ABORT_TASK_SET,
MPT_CLEAR_TASK_SET,
MPT_QUERY_ASYNC_EVENT,
MPT_LOGICAL_UNIT_RESET,
MPT_TARGET_RESET,
MPT_CLEAR_ACA,
MPT_NIL_TMT_VALUE=5678
} mpt_task_mgmt_t;
/**************************** Unclassified Routines ***************************/
void mpt_send_cmd(struct mpt_softc *mpt, request_t *req);
int mpt_recv_handshake_reply(struct mpt_softc *mpt,
size_t reply_len, void *reply);
int mpt_wait_req(struct mpt_softc *mpt, request_t *req,
mpt_req_state_t state, mpt_req_state_t mask,
int sleep_ok, int time_ms);
void mpt_enable_ints(struct mpt_softc *mpt);
void mpt_disable_ints(struct mpt_softc *mpt);
int mpt_attach(struct mpt_softc *mpt);
int mpt_shutdown(struct mpt_softc *mpt);
int mpt_detach(struct mpt_softc *mpt);
int mpt_send_handshake_cmd(struct mpt_softc *mpt,
size_t len, void *cmd);
request_t * mpt_get_request(struct mpt_softc *mpt, int sleep_ok);
void mpt_free_request(struct mpt_softc *mpt, request_t *req);
void mpt_intr(void *arg);
void mpt_check_doorbell(struct mpt_softc *mpt);
void mpt_dump_reply_frame(struct mpt_softc *mpt,
MSG_DEFAULT_REPLY *reply_frame);
int mpt_issue_cfg_req(struct mpt_softc */*mpt*/, request_t */*req*/,
cfgparms_t *params,
bus_addr_t /*addr*/, bus_size_t/*len*/,
int /*sleep_ok*/, int /*timeout_ms*/);
int mpt_read_extcfg_header(struct mpt_softc *mpt, int PageVersion,
int PageNumber, uint32_t PageAddress,
int ExtPageType,
CONFIG_EXTENDED_PAGE_HEADER *rslt,
int sleep_ok, int timeout_ms);
int mpt_read_extcfg_page(struct mpt_softc *mpt, int Action,
uint32_t PageAddress,
CONFIG_EXTENDED_PAGE_HEADER *hdr,
void *buf, size_t len, int sleep_ok,
int timeout_ms);
int mpt_read_cfg_header(struct mpt_softc *, int /*PageType*/,
int /*PageNumber*/,
uint32_t /*PageAddress*/,
CONFIG_PAGE_HEADER *,
int /*sleep_ok*/, int /*timeout_ms*/);
int mpt_read_cfg_page(struct mpt_softc *t, int /*Action*/,
uint32_t /*PageAddress*/,
CONFIG_PAGE_HEADER *, size_t /*len*/,
int /*sleep_ok*/, int /*timeout_ms*/);
int mpt_write_cfg_page(struct mpt_softc *, int /*Action*/,
uint32_t /*PageAddress*/,
CONFIG_PAGE_HEADER *, size_t /*len*/,
int /*sleep_ok*/, int /*timeout_ms*/);
static __inline int
mpt_read_cur_cfg_page(struct mpt_softc *mpt, uint32_t PageAddress,
CONFIG_PAGE_HEADER *hdr, size_t len,
int sleep_ok, int timeout_ms)
{
return (mpt_read_cfg_page(mpt, MPI_CONFIG_ACTION_PAGE_READ_CURRENT,
PageAddress, hdr, len, sleep_ok, timeout_ms));
}
static __inline int
mpt_write_cur_cfg_page(struct mpt_softc *mpt, uint32_t PageAddress,
CONFIG_PAGE_HEADER *hdr, size_t len, int sleep_ok,
int timeout_ms)
{
return (mpt_write_cfg_page(mpt, MPI_CONFIG_ACTION_PAGE_WRITE_CURRENT,
PageAddress, hdr, len, sleep_ok,
timeout_ms));
}
/* mpt_debug.c functions */
void mpt_print_reply(void *vmsg);
void mpt_print_db(uint32_t mb);
void mpt_print_config_reply(void *vmsg);
char *mpt_ioc_diag(uint32_t diag);
void mpt_req_state(mpt_req_state_t state);
void mpt_print_config_request(void *vmsg);
void mpt_print_request(void *vmsg);
void mpt_dump_sgl(SGE_IO_UNION *se, int offset);
#endif /* _MPT_H_ */
diff --git a/sys/dev/mrsas/mrsas.c b/sys/dev/mrsas/mrsas.c
index ee5279fc9c81..32d85c803938 100644
--- a/sys/dev/mrsas/mrsas.c
+++ b/sys/dev/mrsas/mrsas.c
@@ -1,5068 +1,5068 @@
/*
* Copyright (c) 2015, AVAGO Tech. All rights reserved. Author: Marian Choy
* Copyright (c) 2014, LSI Corp. All rights reserved. Author: Marian Choy
* Support: freebsdraid@avagotech.com
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer. 2. Redistributions
* in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other
* materials provided with the distribution. 3. Neither the name of the
* <ORGANIZATION> nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are
* those of the authors and should not be interpreted as representing
* official policies,either expressed or implied, of the FreeBSD Project.
*
* Send feedback to: <megaraidfbsd@avagotech.com> Mail to: AVAGO TECHNOLOGIES 1621
* Barber Lane, Milpitas, CA 95035 ATTN: MegaRaid FreeBSD
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <dev/mrsas/mrsas.h>
#include <dev/mrsas/mrsas_ioctl.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/sysent.h>
#include <sys/kthread.h>
#include <sys/taskqueue.h>
#include <sys/smp.h>
/*
* Function prototypes
*/
static d_open_t mrsas_open;
static d_close_t mrsas_close;
static d_read_t mrsas_read;
static d_write_t mrsas_write;
static d_ioctl_t mrsas_ioctl;
static d_poll_t mrsas_poll;
static void mrsas_ich_startup(void *arg);
static struct mrsas_mgmt_info mrsas_mgmt_info;
static struct mrsas_ident *mrsas_find_ident(device_t);
static int mrsas_setup_msix(struct mrsas_softc *sc);
static int mrsas_allocate_msix(struct mrsas_softc *sc);
static void mrsas_shutdown_ctlr(struct mrsas_softc *sc, u_int32_t opcode);
static void mrsas_flush_cache(struct mrsas_softc *sc);
static void mrsas_reset_reply_desc(struct mrsas_softc *sc);
static void mrsas_ocr_thread(void *arg);
static int mrsas_get_map_info(struct mrsas_softc *sc);
static int mrsas_get_ld_map_info(struct mrsas_softc *sc);
static int mrsas_sync_map_info(struct mrsas_softc *sc);
static int mrsas_get_pd_list(struct mrsas_softc *sc);
static int mrsas_get_ld_list(struct mrsas_softc *sc);
static int mrsas_setup_irq(struct mrsas_softc *sc);
static int mrsas_alloc_mem(struct mrsas_softc *sc);
static int mrsas_init_fw(struct mrsas_softc *sc);
static int mrsas_setup_raidmap(struct mrsas_softc *sc);
static void megasas_setup_jbod_map(struct mrsas_softc *sc);
static int megasas_sync_pd_seq_num(struct mrsas_softc *sc, boolean_t pend);
static int mrsas_clear_intr(struct mrsas_softc *sc);
static int mrsas_get_ctrl_info(struct mrsas_softc *sc);
static void mrsas_update_ext_vd_details(struct mrsas_softc *sc);
static int
mrsas_issue_blocked_abort_cmd(struct mrsas_softc *sc,
struct mrsas_mfi_cmd *cmd_to_abort);
static void
mrsas_get_pd_info(struct mrsas_softc *sc, u_int16_t device_id);
static struct mrsas_softc *
mrsas_get_softc_instance(struct cdev *dev,
u_long cmd, caddr_t arg);
u_int32_t
mrsas_read_reg_with_retries(struct mrsas_softc *sc, int offset);
u_int32_t mrsas_read_reg(struct mrsas_softc *sc, int offset);
u_int8_t
mrsas_build_mptmfi_passthru(struct mrsas_softc *sc,
struct mrsas_mfi_cmd *mfi_cmd);
void mrsas_complete_outstanding_ioctls(struct mrsas_softc *sc);
int mrsas_transition_to_ready(struct mrsas_softc *sc, int ocr);
int mrsas_init_adapter(struct mrsas_softc *sc);
int mrsas_alloc_mpt_cmds(struct mrsas_softc *sc);
int mrsas_alloc_ioc_cmd(struct mrsas_softc *sc);
int mrsas_alloc_ctlr_info_cmd(struct mrsas_softc *sc);
int mrsas_ioc_init(struct mrsas_softc *sc);
int mrsas_bus_scan(struct mrsas_softc *sc);
int mrsas_issue_dcmd(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
int mrsas_issue_polled(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
int mrsas_reset_ctrl(struct mrsas_softc *sc, u_int8_t reset_reason);
int mrsas_wait_for_outstanding(struct mrsas_softc *sc, u_int8_t check_reason);
int mrsas_complete_cmd(struct mrsas_softc *sc, u_int32_t MSIxIndex);
int mrsas_reset_targets(struct mrsas_softc *sc);
int
mrsas_issue_blocked_cmd(struct mrsas_softc *sc,
struct mrsas_mfi_cmd *cmd);
int
mrsas_alloc_tmp_dcmd(struct mrsas_softc *sc, struct mrsas_tmp_dcmd *tcmd,
int size);
void mrsas_release_mfi_cmd(struct mrsas_mfi_cmd *cmd);
void mrsas_wakeup(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
void mrsas_complete_aen(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
void mrsas_complete_abort(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
void mrsas_disable_intr(struct mrsas_softc *sc);
void mrsas_enable_intr(struct mrsas_softc *sc);
void mrsas_free_ioc_cmd(struct mrsas_softc *sc);
void mrsas_free_mem(struct mrsas_softc *sc);
void mrsas_free_tmp_dcmd(struct mrsas_tmp_dcmd *tmp);
void mrsas_isr(void *arg);
void mrsas_teardown_intr(struct mrsas_softc *sc);
void mrsas_addr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
void mrsas_kill_hba(struct mrsas_softc *sc);
void mrsas_aen_handler(struct mrsas_softc *sc);
void
mrsas_write_reg(struct mrsas_softc *sc, int offset,
u_int32_t value);
void
mrsas_fire_cmd(struct mrsas_softc *sc, u_int32_t req_desc_lo,
u_int32_t req_desc_hi);
void mrsas_free_ctlr_info_cmd(struct mrsas_softc *sc);
void
mrsas_complete_mptmfi_passthru(struct mrsas_softc *sc,
struct mrsas_mfi_cmd *cmd, u_int8_t status);
struct mrsas_mfi_cmd *mrsas_get_mfi_cmd(struct mrsas_softc *sc);
MRSAS_REQUEST_DESCRIPTOR_UNION *mrsas_build_mpt_cmd
(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
extern int mrsas_cam_attach(struct mrsas_softc *sc);
extern void mrsas_cam_detach(struct mrsas_softc *sc);
extern void mrsas_cmd_done(struct mrsas_softc *sc, struct mrsas_mpt_cmd *cmd);
extern void mrsas_free_frame(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd);
extern int mrsas_alloc_mfi_cmds(struct mrsas_softc *sc);
extern struct mrsas_mpt_cmd *mrsas_get_mpt_cmd(struct mrsas_softc *sc);
extern int mrsas_passthru(struct mrsas_softc *sc, void *arg, u_long ioctlCmd);
extern uint8_t MR_ValidateMapInfo(struct mrsas_softc *sc);
extern u_int16_t MR_GetLDTgtId(u_int32_t ld, MR_DRV_RAID_MAP_ALL * map);
extern MR_LD_RAID *MR_LdRaidGet(u_int32_t ld, MR_DRV_RAID_MAP_ALL * map);
extern void mrsas_xpt_freeze(struct mrsas_softc *sc);
extern void mrsas_xpt_release(struct mrsas_softc *sc);
extern MRSAS_REQUEST_DESCRIPTOR_UNION *
mrsas_get_request_desc(struct mrsas_softc *sc,
u_int16_t index);
extern int mrsas_bus_scan_sim(struct mrsas_softc *sc, struct cam_sim *sim);
static int mrsas_alloc_evt_log_info_cmd(struct mrsas_softc *sc);
static void mrsas_free_evt_log_info_cmd(struct mrsas_softc *sc);
void mrsas_release_mpt_cmd(struct mrsas_mpt_cmd *cmd);
void mrsas_map_mpt_cmd_status(struct mrsas_mpt_cmd *cmd,
union ccb *ccb_ptr, u_int8_t status, u_int8_t extStatus,
u_int32_t data_length, u_int8_t *sense);
void
mrsas_write_64bit_req_desc(struct mrsas_softc *sc, u_int32_t req_desc_lo,
u_int32_t req_desc_hi);
SYSCTL_NODE(_hw, OID_AUTO, mrsas, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"MRSAS Driver Parameters");
/*
* PCI device struct and table
*
*/
typedef struct mrsas_ident {
uint16_t vendor;
uint16_t device;
uint16_t subvendor;
uint16_t subdevice;
const char *desc;
} MRSAS_CTLR_ID;
MRSAS_CTLR_ID device_table[] = {
{0x1000, MRSAS_TBOLT, 0xffff, 0xffff, "AVAGO Thunderbolt SAS Controller"},
{0x1000, MRSAS_INVADER, 0xffff, 0xffff, "AVAGO Invader SAS Controller"},
{0x1000, MRSAS_FURY, 0xffff, 0xffff, "AVAGO Fury SAS Controller"},
{0x1000, MRSAS_INTRUDER, 0xffff, 0xffff, "AVAGO Intruder SAS Controller"},
{0x1000, MRSAS_INTRUDER_24, 0xffff, 0xffff, "AVAGO Intruder_24 SAS Controller"},
{0x1000, MRSAS_CUTLASS_52, 0xffff, 0xffff, "AVAGO Cutlass_52 SAS Controller"},
{0x1000, MRSAS_CUTLASS_53, 0xffff, 0xffff, "AVAGO Cutlass_53 SAS Controller"},
{0x1000, MRSAS_VENTURA, 0xffff, 0xffff, "AVAGO Ventura SAS Controller"},
{0x1000, MRSAS_CRUSADER, 0xffff, 0xffff, "AVAGO Crusader SAS Controller"},
{0x1000, MRSAS_HARPOON, 0xffff, 0xffff, "AVAGO Harpoon SAS Controller"},
{0x1000, MRSAS_TOMCAT, 0xffff, 0xffff, "AVAGO Tomcat SAS Controller"},
{0x1000, MRSAS_VENTURA_4PORT, 0xffff, 0xffff, "AVAGO Ventura_4Port SAS Controller"},
{0x1000, MRSAS_CRUSADER_4PORT, 0xffff, 0xffff, "AVAGO Crusader_4Port SAS Controller"},
{0x1000, MRSAS_AERO_10E0, 0xffff, 0xffff, "BROADCOM AERO-10E0 SAS Controller"},
{0x1000, MRSAS_AERO_10E1, 0xffff, 0xffff, "BROADCOM AERO-10E1 SAS Controller"},
{0x1000, MRSAS_AERO_10E2, 0xffff, 0xffff, "BROADCOM AERO-10E2 SAS Controller"},
{0x1000, MRSAS_AERO_10E3, 0xffff, 0xffff, "BROADCOM AERO-10E3 SAS Controller"},
{0x1000, MRSAS_AERO_10E4, 0xffff, 0xffff, "BROADCOM AERO-10E4 SAS Controller"},
{0x1000, MRSAS_AERO_10E5, 0xffff, 0xffff, "BROADCOM AERO-10E5 SAS Controller"},
{0x1000, MRSAS_AERO_10E6, 0xffff, 0xffff, "BROADCOM AERO-10E6 SAS Controller"},
{0x1000, MRSAS_AERO_10E7, 0xffff, 0xffff, "BROADCOM AERO-10E7 SAS Controller"},
{0, 0, 0, 0, NULL}
};
/*
* Character device entry points
*
*/
static struct cdevsw mrsas_cdevsw = {
.d_version = D_VERSION,
.d_open = mrsas_open,
.d_close = mrsas_close,
.d_read = mrsas_read,
.d_write = mrsas_write,
.d_ioctl = mrsas_ioctl,
.d_poll = mrsas_poll,
.d_name = "mrsas",
};
MALLOC_DEFINE(M_MRSAS, "mrsasbuf", "Buffers for the MRSAS driver");
/*
* In the cdevsw routines, we find our softc by using the si_drv1 member of
* struct cdev. We set this variable to point to our softc in our attach
* routine when we create the /dev entry.
*/
int
mrsas_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct mrsas_softc *sc;
sc = dev->si_drv1;
return (0);
}
int
mrsas_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
struct mrsas_softc *sc;
sc = dev->si_drv1;
return (0);
}
int
mrsas_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct mrsas_softc *sc;
sc = dev->si_drv1;
return (0);
}
int
mrsas_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct mrsas_softc *sc;
sc = dev->si_drv1;
return (0);
}
u_int32_t
mrsas_read_reg_with_retries(struct mrsas_softc *sc, int offset)
{
u_int32_t i = 0, ret_val;
if (sc->is_aero) {
do {
ret_val = mrsas_read_reg(sc, offset);
i++;
} while(ret_val == 0 && i < 3);
} else
ret_val = mrsas_read_reg(sc, offset);
return ret_val;
}
/*
* Register Read/Write Functions
*
*/
void
mrsas_write_reg(struct mrsas_softc *sc, int offset,
u_int32_t value)
{
bus_space_tag_t bus_tag = sc->bus_tag;
bus_space_handle_t bus_handle = sc->bus_handle;
bus_space_write_4(bus_tag, bus_handle, offset, value);
}
u_int32_t
mrsas_read_reg(struct mrsas_softc *sc, int offset)
{
bus_space_tag_t bus_tag = sc->bus_tag;
bus_space_handle_t bus_handle = sc->bus_handle;
return ((u_int32_t)bus_space_read_4(bus_tag, bus_handle, offset));
}
/*
* Interrupt Disable/Enable/Clear Functions
*
*/
void
mrsas_disable_intr(struct mrsas_softc *sc)
{
u_int32_t mask = 0xFFFFFFFF;
u_int32_t status;
sc->mask_interrupts = 1;
mrsas_write_reg(sc, offsetof(mrsas_reg_set, outbound_intr_mask), mask);
/* Dummy read to force pci flush */
status = mrsas_read_reg(sc, offsetof(mrsas_reg_set, outbound_intr_mask));
}
void
mrsas_enable_intr(struct mrsas_softc *sc)
{
u_int32_t mask = MFI_FUSION_ENABLE_INTERRUPT_MASK;
u_int32_t status;
sc->mask_interrupts = 0;
mrsas_write_reg(sc, offsetof(mrsas_reg_set, outbound_intr_status), ~0);
status = mrsas_read_reg(sc, offsetof(mrsas_reg_set, outbound_intr_status));
mrsas_write_reg(sc, offsetof(mrsas_reg_set, outbound_intr_mask), ~mask);
status = mrsas_read_reg(sc, offsetof(mrsas_reg_set, outbound_intr_mask));
}
static int
mrsas_clear_intr(struct mrsas_softc *sc)
{
u_int32_t status;
/* Read received interrupt */
status = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_intr_status));
/* Not our interrupt, so just return */
if (!(status & MFI_FUSION_ENABLE_INTERRUPT_MASK))
return (0);
/* We got a reply interrupt */
return (1);
}
/*
* PCI Support Functions
*
*/
static struct mrsas_ident *
mrsas_find_ident(device_t dev)
{
struct mrsas_ident *pci_device;
for (pci_device = device_table; pci_device->vendor != 0; pci_device++) {
if ((pci_device->vendor == pci_get_vendor(dev)) &&
(pci_device->device == pci_get_device(dev)) &&
((pci_device->subvendor == pci_get_subvendor(dev)) ||
(pci_device->subvendor == 0xffff)) &&
((pci_device->subdevice == pci_get_subdevice(dev)) ||
(pci_device->subdevice == 0xffff)))
return (pci_device);
}
return (NULL);
}
static int
mrsas_probe(device_t dev)
{
static u_int8_t first_ctrl = 1;
struct mrsas_ident *id;
if ((id = mrsas_find_ident(dev)) != NULL) {
if (first_ctrl) {
printf("AVAGO MegaRAID SAS FreeBSD mrsas driver version: %s\n",
MRSAS_VERSION);
first_ctrl = 0;
}
device_set_desc(dev, id->desc);
/* between BUS_PROBE_DEFAULT and BUS_PROBE_LOW_PRIORITY */
return (-30);
}
return (ENXIO);
}
/*
* mrsas_setup_sysctl: setup sysctl values for mrsas
* input: Adapter instance soft state
*
* Set up sysctl entries for the mrsas driver.
*/
static void
mrsas_setup_sysctl(struct mrsas_softc *sc)
{
struct sysctl_ctx_list *sysctl_ctx = NULL;
struct sysctl_oid *sysctl_tree = NULL;
char tmpstr[80], tmpstr2[80];
/*
* Setup the sysctl variable so the user can change the debug level
* on the fly.
*/
snprintf(tmpstr, sizeof(tmpstr), "MRSAS controller %d",
device_get_unit(sc->mrsas_dev));
snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mrsas_dev));
sysctl_ctx = device_get_sysctl_ctx(sc->mrsas_dev);
if (sysctl_ctx != NULL)
sysctl_tree = device_get_sysctl_tree(sc->mrsas_dev);
if (sysctl_tree == NULL) {
sysctl_ctx_init(&sc->sysctl_ctx);
sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw_mrsas), OID_AUTO, tmpstr2,
CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr);
if (sc->sysctl_tree == NULL)
return;
sysctl_ctx = &sc->sysctl_ctx;
sysctl_tree = sc->sysctl_tree;
}
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "disable_ocr", CTLFLAG_RW, &sc->disableOnlineCtrlReset, 0,
"Disable the use of OCR");
SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "driver_version", CTLFLAG_RD, MRSAS_VERSION,
strlen(MRSAS_VERSION), "driver version");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "reset_count", CTLFLAG_RD,
&sc->reset_count, 0, "number of ocr from start of the day");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "fw_outstanding", CTLFLAG_RD,
&sc->fw_outstanding.val_rdonly, 0, "FW outstanding commands");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
&sc->io_cmds_highwater, 0, "Max FW outstanding commands");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "mrsas_debug", CTLFLAG_RW, &sc->mrsas_debug, 0,
"Driver debug level");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "mrsas_io_timeout", CTLFLAG_RW, &sc->mrsas_io_timeout,
0, "Driver IO timeout value in mili-second.");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "mrsas_fw_fault_check_delay", CTLFLAG_RW,
&sc->mrsas_fw_fault_check_delay,
0, "FW fault check thread delay in seconds. <default is 1 sec>");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "reset_in_progress", CTLFLAG_RD,
&sc->reset_in_progress, 0, "ocr in progress status");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "block_sync_cache", CTLFLAG_RW,
&sc->block_sync_cache, 0,
"Block SYNC CACHE at driver. <default: 0, send it to FW>");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "stream detection", CTLFLAG_RW,
&sc->drv_stream_detection, 0,
"Disable/Enable Stream detection. <default: 1, Enable Stream Detection>");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "prp_count", CTLFLAG_RD,
&sc->prp_count.val_rdonly, 0, "Number of IOs for which PRPs are built");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "SGE holes", CTLFLAG_RD,
&sc->sge_holes.val_rdonly, 0, "Number of IOs with holes in SGEs");
}
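/*
* Example (hypothetical unit 0): the debug level added above can be changed
* at runtime with "sysctl dev.mrsas.0.mrsas_debug=<mask>", or under
* hw.mrsas.0 when the driver had to create its own sysctl tree.
*/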
/*
* mrsas_get_tunables: get tunable parameters.
* input: Adapter instance soft state
*
* Get tunable parameters. These help to debug the driver at boot time.
*/
static void
mrsas_get_tunables(struct mrsas_softc *sc)
{
char tmpstr[80];
/* XXX default to some debugging for now */
sc->mrsas_debug =
(MRSAS_FAULT | MRSAS_OCR | MRSAS_INFO | MRSAS_TRACE | MRSAS_AEN);
sc->mrsas_io_timeout = MRSAS_IO_TIMEOUT;
sc->mrsas_fw_fault_check_delay = 1;
sc->reset_count = 0;
sc->reset_in_progress = 0;
sc->block_sync_cache = 0;
sc->drv_stream_detection = 1;
/*
* Grab the global variables.
*/
TUNABLE_INT_FETCH("hw.mrsas.debug_level", &sc->mrsas_debug);
/*
* Grab the global lb_pending_cmds tunable.
*/
TUNABLE_INT_FETCH("hw.mrsas.lb_pending_cmds", &sc->lb_pending_cmds);
/* Grab the unit-instance variables */
snprintf(tmpstr, sizeof(tmpstr), "dev.mrsas.%d.debug_level",
device_get_unit(sc->mrsas_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->mrsas_debug);
}
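/*
* Example: these values can also be overridden at boot via loader tunables,
* e.g. hw.mrsas.debug_level="0" or dev.mrsas.0.debug_level="0" (for a
* hypothetical unit 0) in /boot/loader.conf.
*/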
/*
* mrsas_alloc_evt_log_info_cmd: Allocates memory to get event log information.
* Used to get sequence number at driver load time.
* input: Adapter soft state
*
* Allocates DMAable memory for the event log info internal command.
*/
int
mrsas_alloc_evt_log_info_cmd(struct mrsas_softc *sc)
{
int el_info_size;
/* Allocate get event log info command */
el_info_size = sizeof(struct mrsas_evt_log_info);
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
el_info_size,
1,
el_info_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->el_info_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate event log info tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->el_info_tag, (void **)&sc->el_info_mem,
BUS_DMA_NOWAIT, &sc->el_info_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate event log info cmd mem\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->el_info_tag, sc->el_info_dmamap,
sc->el_info_mem, el_info_size, mrsas_addr_cb,
&sc->el_info_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load event log info cmd mem\n");
return (ENOMEM);
}
memset(sc->el_info_mem, 0, el_info_size);
return (0);
}
/*
* mrsas_free_evt_log_info_cmd: Free memory for event log info command
* input: Adapter soft state
*
* Deallocates memory for the event log info internal command.
*/
void
mrsas_free_evt_log_info_cmd(struct mrsas_softc *sc)
{
if (sc->el_info_phys_addr)
bus_dmamap_unload(sc->el_info_tag, sc->el_info_dmamap);
if (sc->el_info_mem != NULL)
bus_dmamem_free(sc->el_info_tag, sc->el_info_mem, sc->el_info_dmamap);
if (sc->el_info_tag != NULL)
bus_dma_tag_destroy(sc->el_info_tag);
}
/*
* mrsas_get_seq_num: Get latest event sequence number
* @sc: Adapter soft state
* @eli: Firmware event log sequence number information.
*
* Firmware maintains a log of all events in a non-volatile area.
* The driver gets the sequence number using the DCMD
* "MR_DCMD_CTRL_EVENT_GET_INFO" at driver load time.
*/
static int
mrsas_get_seq_num(struct mrsas_softc *sc,
struct mrsas_evt_log_info *eli)
{
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
u_int8_t do_ocr = 1, retcode = 0;
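/* Assume an OCR will be needed unless the DCMD completes without timing out. */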
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Failed to get a free cmd\n");
return -ENOMEM;
}
dcmd = &cmd->frame->dcmd;
if (mrsas_alloc_evt_log_info_cmd(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Cannot allocate evt log info cmd\n");
mrsas_release_mfi_cmd(cmd);
return -ENOMEM;
}
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0x0;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sizeof(struct mrsas_evt_log_info);
dcmd->opcode = MR_DCMD_CTRL_EVENT_GET_INFO;
dcmd->sgl.sge32[0].phys_addr = sc->el_info_phys_addr;
dcmd->sgl.sge32[0].length = sizeof(struct mrsas_evt_log_info);
retcode = mrsas_issue_blocked_cmd(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
do_ocr = 0;
/*
* Copy the data back into the caller's buffer
*/
memcpy(eli, sc->el_info_mem, sizeof(struct mrsas_evt_log_info));
mrsas_free_evt_log_info_cmd(sc);
dcmd_timeout:
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
else
mrsas_release_mfi_cmd(cmd);
return retcode;
}
/*
* mrsas_register_aen: Register for asynchronous event notification
* @sc: Adapter soft state
* @seq_num: Starting sequence number
* @class_locale: Class of the event
*
* This function subscribes for events beyond the @seq_num
* and type @class_locale.
*
*/
static int
mrsas_register_aen(struct mrsas_softc *sc, u_int32_t seq_num,
u_int32_t class_locale_word)
{
int ret_val;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
union mrsas_evt_class_locale curr_aen;
union mrsas_evt_class_locale prev_aen;
/*
* If there is an AEN pending already (aen_cmd), check whether the
* class_locale of that pending AEN is inclusive of the new AEN
* request we currently have. If it is, then we don't have to do
* anything. In other words, whichever events the current AEN request
* is subscribing to have already been subscribed to. If the old cmd
* is _not_ inclusive, then we have to abort that command, form a
* class_locale that is a superset of both the old and the current
* one, and re-issue it to the FW.
*/
curr_aen.word = class_locale_word;
if (sc->aen_cmd) {
prev_aen.word = sc->aen_cmd->frame->dcmd.mbox.w[1];
/*
* A class whose enum value is smaller is inclusive of all
* higher values. If a PROGRESS (= -1) was previously
* registered, then new registration requests for higher
* classes need not be sent to FW; they are automatically
* included. Locale numbers don't have such a hierarchy; they
* are bitmap values.
*/
if ((prev_aen.members.class <= curr_aen.members.class) &&
!((prev_aen.members.locale & curr_aen.members.locale) ^
curr_aen.members.locale)) {
/*
* Previously issued event registration includes
* current request. Nothing to do.
*/
return 0;
} else {
curr_aen.members.locale |= prev_aen.members.locale;
if (prev_aen.members.class < curr_aen.members.class)
curr_aen.members.class = prev_aen.members.class;
sc->aen_cmd->abort_aen = 1;
ret_val = mrsas_issue_blocked_abort_cmd(sc,
sc->aen_cmd);
if (ret_val) {
printf("mrsas: Failed to abort previous AEN command\n");
return ret_val;
} else
sc->aen_cmd = NULL;
}
}
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd)
return ENOMEM;
dcmd = &cmd->frame->dcmd;
memset(sc->evt_detail_mem, 0, sizeof(struct mrsas_evt_detail));
/*
* Prepare DCMD for aen registration
*/
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0x0;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sizeof(struct mrsas_evt_detail);
dcmd->opcode = MR_DCMD_CTRL_EVENT_WAIT;
dcmd->mbox.w[0] = seq_num;
sc->last_seq_num = seq_num;
dcmd->mbox.w[1] = curr_aen.word;
dcmd->sgl.sge32[0].phys_addr = (u_int32_t)sc->evt_detail_phys_addr;
dcmd->sgl.sge32[0].length = sizeof(struct mrsas_evt_detail);
if (sc->aen_cmd != NULL) {
mrsas_release_mfi_cmd(cmd);
return 0;
}
/*
* Store reference to the cmd used to register for AEN. When an
* application wants us to register for AEN, we have to abort this
* cmd and re-register with a new EVENT LOCALE supplied by that app
*/
sc->aen_cmd = cmd;
/*
* Issue the aen registration frame
*/
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev, "Cannot issue AEN DCMD command.\n");
return (1);
}
return 0;
}
/*
* mrsas_start_aen: Subscribes to AEN during driver load time
* @instance: Adapter soft state
*/
static int
mrsas_start_aen(struct mrsas_softc *sc)
{
struct mrsas_evt_log_info eli;
union mrsas_evt_class_locale class_locale;
/* Get the latest sequence number from FW */
memset(&eli, 0, sizeof(eli));
if (mrsas_get_seq_num(sc, &eli))
return -1;
/* Register AEN with FW for latest sequence number plus 1 */
class_locale.members.reserved = 0;
class_locale.members.locale = MR_EVT_LOCALE_ALL;
class_locale.members.class = MR_EVT_CLASS_DEBUG;
return mrsas_register_aen(sc, eli.newest_seq_num + 1,
class_locale.word);
}
/*
* mrsas_setup_msix: Allocate IRQ resources and set up MSI-x interrupt handlers
* @sc: adapter soft state
*/
static int
mrsas_setup_msix(struct mrsas_softc *sc)
{
int i;
for (i = 0; i < sc->msix_vectors; i++) {
sc->irq_context[i].sc = sc;
sc->irq_context[i].MSIxIndex = i;
sc->irq_id[i] = i + 1;
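/* SYS_RES_IRQ rid 0 is the legacy INTx line; MSI-x vectors use rids starting at 1. */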
sc->mrsas_irq[i] = bus_alloc_resource_any
(sc->mrsas_dev, SYS_RES_IRQ, &sc->irq_id[i]
,RF_ACTIVE);
if (sc->mrsas_irq[i] == NULL) {
device_printf(sc->mrsas_dev, "Can't allocate MSI-x\n");
goto irq_alloc_failed;
}
if (bus_setup_intr(sc->mrsas_dev,
sc->mrsas_irq[i],
INTR_MPSAFE | INTR_TYPE_CAM,
NULL, mrsas_isr, &sc->irq_context[i],
&sc->intr_handle[i])) {
device_printf(sc->mrsas_dev,
"Cannot set up MSI-x interrupt handler\n");
goto irq_alloc_failed;
}
}
return SUCCESS;
irq_alloc_failed:
mrsas_teardown_intr(sc);
return (FAIL);
}
/*
* mrsas_allocate_msix: Allocate MSI-x vectors from the PCI layer
* @sc: adapter soft state
*/
static int
mrsas_allocate_msix(struct mrsas_softc *sc)
{
if (pci_alloc_msix(sc->mrsas_dev, &sc->msix_vectors) == 0) {
device_printf(sc->mrsas_dev, "Using MSI-X with %d number"
" of vectors\n", sc->msix_vectors);
} else {
device_printf(sc->mrsas_dev, "MSI-x setup failed\n");
goto irq_alloc_failed;
}
return SUCCESS;
irq_alloc_failed:
mrsas_teardown_intr(sc);
return (FAIL);
}
/*
* mrsas_attach: PCI entry point
* input: pointer to device struct
*
* Performs setup of PCI and registers, initializes mutexes and linked lists,
* registers interrupts and CAM, and initializes the adapter/controller to
* its proper state.
*/
static int
mrsas_attach(device_t dev)
{
struct mrsas_softc *sc = device_get_softc(dev);
uint32_t cmd, error;
memset(sc, 0, sizeof(struct mrsas_softc));
/* Look up our softc and initialize its fields. */
sc->mrsas_dev = dev;
sc->device_id = pci_get_device(dev);
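/* Classify the controller generation (Gen3, Ventura, Aero) from the PCI device ID. */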
switch (sc->device_id) {
case MRSAS_INVADER:
case MRSAS_FURY:
case MRSAS_INTRUDER:
case MRSAS_INTRUDER_24:
case MRSAS_CUTLASS_52:
case MRSAS_CUTLASS_53:
sc->mrsas_gen3_ctrl = 1;
break;
case MRSAS_VENTURA:
case MRSAS_CRUSADER:
case MRSAS_HARPOON:
case MRSAS_TOMCAT:
case MRSAS_VENTURA_4PORT:
case MRSAS_CRUSADER_4PORT:
sc->is_ventura = true;
break;
case MRSAS_AERO_10E1:
case MRSAS_AERO_10E5:
device_printf(dev, "Adapter is in configurable secure mode\n");
case MRSAS_AERO_10E2:
case MRSAS_AERO_10E6:
sc->is_aero = true;
break;
case MRSAS_AERO_10E0:
case MRSAS_AERO_10E3:
case MRSAS_AERO_10E4:
case MRSAS_AERO_10E7:
device_printf(dev, "Adapter is in non-secure mode\n");
return SUCCESS;
}
mrsas_get_tunables(sc);
/*
* Set up PCI and registers
*/
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
if ((cmd & PCIM_CMD_PORTEN) == 0) {
return (ENXIO);
}
/* Force the busmaster enable bit on. */
cmd |= PCIM_CMD_BUSMASTEREN;
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/* For Ventura/Aero system registers are mapped to BAR0 */
if (sc->is_ventura || sc->is_aero)
sc->reg_res_id = PCIR_BAR(0); /* BAR0 offset */
else
sc->reg_res_id = PCIR_BAR(1); /* BAR1 offset */
if ((sc->reg_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&(sc->reg_res_id), RF_ACTIVE))
== NULL) {
device_printf(dev, "Cannot allocate PCI registers\n");
goto attach_fail;
}
sc->bus_tag = rman_get_bustag(sc->reg_res);
sc->bus_handle = rman_get_bushandle(sc->reg_res);
/* Initialize mutexes */
mtx_init(&sc->sim_lock, "mrsas_sim_lock", NULL, MTX_DEF);
mtx_init(&sc->pci_lock, "mrsas_pci_lock", NULL, MTX_DEF);
mtx_init(&sc->io_lock, "mrsas_io_lock", NULL, MTX_DEF);
mtx_init(&sc->aen_lock, "mrsas_aen_lock", NULL, MTX_DEF);
mtx_init(&sc->ioctl_lock, "mrsas_ioctl_lock", NULL, MTX_SPIN);
mtx_init(&sc->mpt_cmd_pool_lock, "mrsas_mpt_cmd_pool_lock", NULL, MTX_DEF);
mtx_init(&sc->mfi_cmd_pool_lock, "mrsas_mfi_cmd_pool_lock", NULL, MTX_DEF);
mtx_init(&sc->raidmap_lock, "mrsas_raidmap_lock", NULL, MTX_DEF);
mtx_init(&sc->stream_lock, "mrsas_stream_lock", NULL, MTX_DEF);
/* Initialize linked lists */
TAILQ_INIT(&sc->mrsas_mpt_cmd_list_head);
TAILQ_INIT(&sc->mrsas_mfi_cmd_list_head);
mrsas_atomic_set(&sc->fw_outstanding, 0);
mrsas_atomic_set(&sc->target_reset_outstanding, 0);
mrsas_atomic_set(&sc->prp_count, 0);
mrsas_atomic_set(&sc->sge_holes, 0);
sc->io_cmds_highwater = 0;
sc->adprecovery = MRSAS_HBA_OPERATIONAL;
sc->UnevenSpanSupport = 0;
sc->msix_enable = 0;
/* Initialize Firmware */
if (mrsas_init_fw(sc) != SUCCESS) {
goto attach_fail_fw;
}
/* Register mrsas to CAM layer */
if ((mrsas_cam_attach(sc) != SUCCESS)) {
goto attach_fail_cam;
}
/* Register IRQs */
if (mrsas_setup_irq(sc) != SUCCESS) {
goto attach_fail_irq;
}
error = mrsas_kproc_create(mrsas_ocr_thread, sc,
&sc->ocr_thread, 0, 0, "mrsas_ocr%d",
device_get_unit(sc->mrsas_dev));
if (error) {
device_printf(sc->mrsas_dev, "Error %d starting OCR thread\n", error);
goto attach_fail_ocr_thread;
}
/*
* After FW initialization and OCR thread creation
* we will defer the cdev creation, AEN setup on ICH callback
*/
sc->mrsas_ich.ich_func = mrsas_ich_startup;
sc->mrsas_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mrsas_ich) != 0) {
device_printf(sc->mrsas_dev, "Config hook is already established\n");
}
mrsas_setup_sysctl(sc);
return SUCCESS;
attach_fail_ocr_thread:
if (sc->ocr_thread_active)
wakeup(&sc->ocr_chan);
attach_fail_irq:
mrsas_teardown_intr(sc);
attach_fail_cam:
mrsas_cam_detach(sc);
attach_fail_fw:
/* if MSIX vector is allocated and FW Init FAILED then release MSIX */
if (sc->msix_enable == 1)
pci_release_msi(sc->mrsas_dev);
mrsas_free_mem(sc);
mtx_destroy(&sc->sim_lock);
mtx_destroy(&sc->aen_lock);
mtx_destroy(&sc->pci_lock);
mtx_destroy(&sc->io_lock);
mtx_destroy(&sc->ioctl_lock);
mtx_destroy(&sc->mpt_cmd_pool_lock);
mtx_destroy(&sc->mfi_cmd_pool_lock);
mtx_destroy(&sc->raidmap_lock);
mtx_destroy(&sc->stream_lock);
attach_fail:
if (sc->reg_res) {
bus_release_resource(sc->mrsas_dev, SYS_RES_MEMORY,
sc->reg_res_id, sc->reg_res);
}
return (ENXIO);
}
/*
* Interrupt config hook
*/
static void
mrsas_ich_startup(void *arg)
{
int i = 0;
struct mrsas_softc *sc = (struct mrsas_softc *)arg;
/*
* Initialize a counting semaphore to limit the number of concurrent IOCTLs
*/
sema_init(&sc->ioctl_count_sema, MRSAS_MAX_IOCTL_CMDS,
IOCTL_SEMA_DESCRIPTION);
/* Create a /dev entry for mrsas controller. */
sc->mrsas_cdev = make_dev(&mrsas_cdevsw, device_get_unit(sc->mrsas_dev), UID_ROOT,
GID_OPERATOR, (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP), "mrsas%u",
device_get_unit(sc->mrsas_dev));
if (device_get_unit(sc->mrsas_dev) == 0) {
make_dev_alias_p(MAKEDEV_CHECKNAME,
&sc->mrsas_linux_emulator_cdev, sc->mrsas_cdev,
"megaraid_sas_ioctl_node");
}
if (sc->mrsas_cdev)
sc->mrsas_cdev->si_drv1 = sc;
/*
* Add this controller to mrsas_mgmt_info structure so that it can be
* exported to management applications
*/
if (device_get_unit(sc->mrsas_dev) == 0)
memset(&mrsas_mgmt_info, 0, sizeof(mrsas_mgmt_info));
mrsas_mgmt_info.count++;
mrsas_mgmt_info.sc_ptr[mrsas_mgmt_info.max_index] = sc;
mrsas_mgmt_info.max_index++;
/* Enable Interrupts */
mrsas_enable_intr(sc);
/* Call DCMD get_pd_info for all system PDs */
for (i = 0; i < MRSAS_MAX_PD; i++) {
if ((sc->target_list[i].target_id != 0xffff) &&
sc->pd_info_mem)
mrsas_get_pd_info(sc, sc->target_list[i].target_id);
}
/* Initiate AEN (Asynchronous Event Notification) */
if (mrsas_start_aen(sc)) {
device_printf(sc->mrsas_dev, "Error: AEN registration FAILED !!! "
"Further events from the controller will not be communicated.\n"
"Either there is some problem in the controller"
"or the controller does not support AEN.\n"
"Please contact to the SUPPORT TEAM if the problem persists\n");
}
if (sc->mrsas_ich.ich_arg != NULL) {
device_printf(sc->mrsas_dev, "Disestablish mrsas intr hook\n");
config_intrhook_disestablish(&sc->mrsas_ich);
sc->mrsas_ich.ich_arg = NULL;
}
}
/*
* mrsas_detach: De-allocates and teardown resources
* input: pointer to device struct
*
* This function is the entry point for device disconnect and detach.
* It performs memory de-allocations, shutdown of the controller and various
* teardown and destroy resource functions.
*/
static int
mrsas_detach(device_t dev)
{
struct mrsas_softc *sc;
int i = 0;
sc = device_get_softc(dev);
sc->remove_in_progress = 1;
/* Destroy the character device so no other IOCTL will be handled */
if ((device_get_unit(dev) == 0) && sc->mrsas_linux_emulator_cdev)
destroy_dev(sc->mrsas_linux_emulator_cdev);
destroy_dev(sc->mrsas_cdev);
/*
* Take the instance off the instance array. Note that we will not
* decrement the max_index; we let this array be a sparse array.
*/
for (i = 0; i < mrsas_mgmt_info.max_index; i++) {
if (mrsas_mgmt_info.sc_ptr[i] == sc) {
mrsas_mgmt_info.count--;
mrsas_mgmt_info.sc_ptr[i] = NULL;
break;
}
}
if (sc->ocr_thread_active)
wakeup(&sc->ocr_chan);
while (sc->reset_in_progress) {
i++;
if (!(i % MRSAS_RESET_NOTICE_INTERVAL)) {
mrsas_dprint(sc, MRSAS_INFO,
"[%2d]waiting for OCR to be finished from %s\n", i, __func__);
}
pause("mr_shutdown", hz);
}
i = 0;
while (sc->ocr_thread_active) {
i++;
if (!(i % MRSAS_RESET_NOTICE_INTERVAL)) {
mrsas_dprint(sc, MRSAS_INFO,
"[%2d]waiting for "
"mrsas_ocr thread to quit ocr %d\n", i,
sc->ocr_thread_active);
}
pause("mr_shutdown", hz);
}
mrsas_flush_cache(sc);
mrsas_shutdown_ctlr(sc, MR_DCMD_CTRL_SHUTDOWN);
mrsas_disable_intr(sc);
if ((sc->is_ventura || sc->is_aero) && sc->streamDetectByLD) {
for (i = 0; i < MAX_LOGICAL_DRIVES_EXT; ++i)
free(sc->streamDetectByLD[i], M_MRSAS);
free(sc->streamDetectByLD, M_MRSAS);
sc->streamDetectByLD = NULL;
}
mrsas_cam_detach(sc);
mrsas_teardown_intr(sc);
mrsas_free_mem(sc);
mtx_destroy(&sc->sim_lock);
mtx_destroy(&sc->aen_lock);
mtx_destroy(&sc->pci_lock);
mtx_destroy(&sc->io_lock);
mtx_destroy(&sc->ioctl_lock);
mtx_destroy(&sc->mpt_cmd_pool_lock);
mtx_destroy(&sc->mfi_cmd_pool_lock);
mtx_destroy(&sc->raidmap_lock);
mtx_destroy(&sc->stream_lock);
/* Wait until the IOCTL counting semaphore is fully released */
while (sema_value(&sc->ioctl_count_sema) != MRSAS_MAX_IOCTL_CMDS)
pause("mr_shutdown", hz);
/* Destroy the counting semaphore created for Ioctl */
sema_destroy(&sc->ioctl_count_sema);
if (sc->reg_res) {
bus_release_resource(sc->mrsas_dev,
SYS_RES_MEMORY, sc->reg_res_id, sc->reg_res);
}
if (sc->sysctl_tree != NULL)
sysctl_ctx_free(&sc->sysctl_ctx);
return (0);
}
static int
mrsas_shutdown(device_t dev)
{
struct mrsas_softc *sc;
int i;
sc = device_get_softc(dev);
sc->remove_in_progress = 1;
if (!KERNEL_PANICKED()) {
if (sc->ocr_thread_active)
wakeup(&sc->ocr_chan);
i = 0;
while (sc->reset_in_progress && i < 15) {
i++;
if ((i % MRSAS_RESET_NOTICE_INTERVAL) == 0) {
mrsas_dprint(sc, MRSAS_INFO,
"[%2d]waiting for OCR to be finished "
"from %s\n", i, __func__);
}
pause("mr_shutdown", hz);
}
if (sc->reset_in_progress) {
mrsas_dprint(sc, MRSAS_INFO,
"gave up waiting for OCR to be finished\n");
}
}
mrsas_flush_cache(sc);
mrsas_shutdown_ctlr(sc, MR_DCMD_CTRL_SHUTDOWN);
mrsas_disable_intr(sc);
return (0);
}
/*
* mrsas_free_mem: Frees allocated memory
* input: Adapter instance soft state
*
* This function is called from mrsas_detach() to free previously allocated
* memory.
*/
void
mrsas_free_mem(struct mrsas_softc *sc)
{
int i;
u_int32_t max_fw_cmds;
struct mrsas_mfi_cmd *mfi_cmd;
struct mrsas_mpt_cmd *mpt_cmd;
/*
* Free RAID map memory
*/
for (i = 0; i < 2; i++) {
if (sc->raidmap_phys_addr[i])
bus_dmamap_unload(sc->raidmap_tag[i], sc->raidmap_dmamap[i]);
if (sc->raidmap_mem[i] != NULL)
bus_dmamem_free(sc->raidmap_tag[i], sc->raidmap_mem[i], sc->raidmap_dmamap[i]);
if (sc->raidmap_tag[i] != NULL)
bus_dma_tag_destroy(sc->raidmap_tag[i]);
if (sc->ld_drv_map[i] != NULL)
free(sc->ld_drv_map[i], M_MRSAS);
}
for (i = 0; i < 2; i++) {
if (sc->jbodmap_phys_addr[i])
bus_dmamap_unload(sc->jbodmap_tag[i], sc->jbodmap_dmamap[i]);
if (sc->jbodmap_mem[i] != NULL)
bus_dmamem_free(sc->jbodmap_tag[i], sc->jbodmap_mem[i], sc->jbodmap_dmamap[i]);
if (sc->jbodmap_tag[i] != NULL)
bus_dma_tag_destroy(sc->jbodmap_tag[i]);
}
/*
* Free version buffer memory
*/
if (sc->verbuf_phys_addr)
bus_dmamap_unload(sc->verbuf_tag, sc->verbuf_dmamap);
if (sc->verbuf_mem != NULL)
bus_dmamem_free(sc->verbuf_tag, sc->verbuf_mem, sc->verbuf_dmamap);
if (sc->verbuf_tag != NULL)
bus_dma_tag_destroy(sc->verbuf_tag);
/*
* Free sense buffer memory
*/
if (sc->sense_phys_addr)
bus_dmamap_unload(sc->sense_tag, sc->sense_dmamap);
if (sc->sense_mem != NULL)
bus_dmamem_free(sc->sense_tag, sc->sense_mem, sc->sense_dmamap);
if (sc->sense_tag != NULL)
bus_dma_tag_destroy(sc->sense_tag);
/*
* Free chain frame memory
*/
if (sc->chain_frame_phys_addr)
bus_dmamap_unload(sc->chain_frame_tag, sc->chain_frame_dmamap);
if (sc->chain_frame_mem != NULL)
bus_dmamem_free(sc->chain_frame_tag, sc->chain_frame_mem, sc->chain_frame_dmamap);
if (sc->chain_frame_tag != NULL)
bus_dma_tag_destroy(sc->chain_frame_tag);
/*
* Free IO Request memory
*/
if (sc->io_request_phys_addr)
bus_dmamap_unload(sc->io_request_tag, sc->io_request_dmamap);
if (sc->io_request_mem != NULL)
bus_dmamem_free(sc->io_request_tag, sc->io_request_mem, sc->io_request_dmamap);
if (sc->io_request_tag != NULL)
bus_dma_tag_destroy(sc->io_request_tag);
/*
* Free Reply Descriptor memory
*/
if (sc->reply_desc_phys_addr)
bus_dmamap_unload(sc->reply_desc_tag, sc->reply_desc_dmamap);
if (sc->reply_desc_mem != NULL)
bus_dmamem_free(sc->reply_desc_tag, sc->reply_desc_mem, sc->reply_desc_dmamap);
if (sc->reply_desc_tag != NULL)
bus_dma_tag_destroy(sc->reply_desc_tag);
/*
* Free event detail memory
*/
if (sc->evt_detail_phys_addr)
bus_dmamap_unload(sc->evt_detail_tag, sc->evt_detail_dmamap);
if (sc->evt_detail_mem != NULL)
bus_dmamem_free(sc->evt_detail_tag, sc->evt_detail_mem, sc->evt_detail_dmamap);
if (sc->evt_detail_tag != NULL)
bus_dma_tag_destroy(sc->evt_detail_tag);
/*
* Free PD info memory
*/
if (sc->pd_info_phys_addr)
bus_dmamap_unload(sc->pd_info_tag, sc->pd_info_dmamap);
if (sc->pd_info_mem != NULL)
bus_dmamem_free(sc->pd_info_tag, sc->pd_info_mem, sc->pd_info_dmamap);
if (sc->pd_info_tag != NULL)
bus_dma_tag_destroy(sc->pd_info_tag);
/*
* Free MFI frames
*/
if (sc->mfi_cmd_list) {
for (i = 0; i < MRSAS_MAX_MFI_CMDS; i++) {
mfi_cmd = sc->mfi_cmd_list[i];
mrsas_free_frame(sc, mfi_cmd);
}
}
if (sc->mficmd_frame_tag != NULL)
bus_dma_tag_destroy(sc->mficmd_frame_tag);
/*
* Free MPT internal command list
*/
max_fw_cmds = sc->max_fw_cmds;
if (sc->mpt_cmd_list) {
for (i = 0; i < max_fw_cmds; i++) {
mpt_cmd = sc->mpt_cmd_list[i];
bus_dmamap_destroy(sc->data_tag, mpt_cmd->data_dmamap);
free(sc->mpt_cmd_list[i], M_MRSAS);
}
free(sc->mpt_cmd_list, M_MRSAS);
sc->mpt_cmd_list = NULL;
}
/*
* Free MFI internal command list
*/
if (sc->mfi_cmd_list) {
for (i = 0; i < MRSAS_MAX_MFI_CMDS; i++) {
free(sc->mfi_cmd_list[i], M_MRSAS);
}
free(sc->mfi_cmd_list, M_MRSAS);
sc->mfi_cmd_list = NULL;
}
/*
* Free request descriptor memory
*/
free(sc->req_desc, M_MRSAS);
sc->req_desc = NULL;
/*
* Destroy parent tag
*/
if (sc->mrsas_parent_tag != NULL)
bus_dma_tag_destroy(sc->mrsas_parent_tag);
/*
* Free ctrl_info memory
*/
if (sc->ctrl_info != NULL)
free(sc->ctrl_info, M_MRSAS);
}
/*
* mrsas_teardown_intr: Tear down interrupts
* input: Adapter instance soft state
*
* This function is called from mrsas_detach() to tear down and release bus
* interrupt resources.
*/
void
mrsas_teardown_intr(struct mrsas_softc *sc)
{
int i;
if (!sc->msix_enable) {
if (sc->intr_handle[0])
bus_teardown_intr(sc->mrsas_dev, sc->mrsas_irq[0], sc->intr_handle[0]);
if (sc->mrsas_irq[0] != NULL)
bus_release_resource(sc->mrsas_dev, SYS_RES_IRQ,
sc->irq_id[0], sc->mrsas_irq[0]);
sc->intr_handle[0] = NULL;
} else {
for (i = 0; i < sc->msix_vectors; i++) {
if (sc->intr_handle[i])
bus_teardown_intr(sc->mrsas_dev, sc->mrsas_irq[i],
sc->intr_handle[i]);
if (sc->mrsas_irq[i] != NULL)
bus_release_resource(sc->mrsas_dev, SYS_RES_IRQ,
sc->irq_id[i], sc->mrsas_irq[i]);
sc->intr_handle[i] = NULL;
}
pci_release_msi(sc->mrsas_dev);
}
}
/*
* mrsas_suspend: Suspend entry point
* input: Device struct pointer
*
* This function is the entry point for system suspend from the OS.
*/
static int
mrsas_suspend(device_t dev)
{
/* This will be filled in when the driver gains hibernation support */
return (0);
}
/*
* mrsas_resume: Resume entry point
* input: Device struct pointer
*
* This function is the entry point for system resume from the OS.
*/
static int
mrsas_resume(device_t dev)
{
/* This will be filled in when the driver gains hibernation support */
return (0);
}
/**
* mrsas_get_softc_instance: Find softc instance based on cmd type
*
* This function returns the softc instance based on the cmd type.
* In some cases the application fires the ioctl on the required management
* instance and does not provide host_no; use cdev->si_drv1 to get the softc
* instance in those cases, else get the softc instance from the host_no
* provided by the application in the user data.
*/
static struct mrsas_softc *
mrsas_get_softc_instance(struct cdev *dev, u_long cmd, caddr_t arg)
{
struct mrsas_softc *sc = NULL;
struct mrsas_iocpacket *user_ioc = (struct mrsas_iocpacket *)arg;
if (cmd == MRSAS_IOC_GET_PCI_INFO) {
sc = dev->si_drv1;
} else {
/*
* get the Host number & the softc from data sent by the
* Application
*/
sc = mrsas_mgmt_info.sc_ptr[user_ioc->host_no];
if (sc == NULL)
printf("There is no Controller number %d\n",
user_ioc->host_no);
else if (user_ioc->host_no >= mrsas_mgmt_info.max_index)
mrsas_dprint(sc, MRSAS_FAULT,
"Invalid Controller number %d\n", user_ioc->host_no);
}
return sc;
}
/*
* mrsas_ioctl: IOCtl commands entry point.
*
* This function is the entry point for IOCtls from the OS. It calls the
* appropriate function for processing depending on the command received.
*/
static int
mrsas_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct mrsas_softc *sc;
int ret = 0, i = 0;
MRSAS_DRV_PCI_INFORMATION *pciDrvInfo;
sc = mrsas_get_softc_instance(dev, cmd, arg);
if (!sc)
return ENOENT;
if (sc->remove_in_progress ||
(sc->adprecovery == MRSAS_HW_CRITICAL_ERROR)) {
mrsas_dprint(sc, MRSAS_INFO,
"Either driver remove or shutdown called or "
"HW is in unrecoverable critical error state.\n");
return ENOENT;
}
mtx_lock_spin(&sc->ioctl_lock);
if (!sc->reset_in_progress) {
mtx_unlock_spin(&sc->ioctl_lock);
goto do_ioctl;
}
mtx_unlock_spin(&sc->ioctl_lock);
while (sc->reset_in_progress) {
i++;
if (!(i % MRSAS_RESET_NOTICE_INTERVAL)) {
mrsas_dprint(sc, MRSAS_INFO,
"[%2d]waiting for OCR to be finished from %s\n", i, __func__);
}
pause("mr_ioctl", hz);
}
do_ioctl:
switch (cmd) {
case MRSAS_IOC_FIRMWARE_PASS_THROUGH64:
#ifdef COMPAT_FREEBSD32
case MRSAS_IOC_FIRMWARE_PASS_THROUGH32:
#endif
/*
* Decrement the Ioctl counting Semaphore before getting an
* mfi command
*/
sema_wait(&sc->ioctl_count_sema);
ret = mrsas_passthru(sc, (void *)arg, cmd);
/* Increment the Ioctl counting semaphore value */
sema_post(&sc->ioctl_count_sema);
break;
case MRSAS_IOC_SCAN_BUS:
ret = mrsas_bus_scan(sc);
break;
case MRSAS_IOC_GET_PCI_INFO:
pciDrvInfo = (MRSAS_DRV_PCI_INFORMATION *) arg;
memset(pciDrvInfo, 0, sizeof(MRSAS_DRV_PCI_INFORMATION));
pciDrvInfo->busNumber = pci_get_bus(sc->mrsas_dev);
pciDrvInfo->deviceNumber = pci_get_slot(sc->mrsas_dev);
pciDrvInfo->functionNumber = pci_get_function(sc->mrsas_dev);
pciDrvInfo->domainID = pci_get_domain(sc->mrsas_dev);
mrsas_dprint(sc, MRSAS_INFO, "pci bus no: %d,"
"pci device no: %d, pci function no: %d,"
"pci domain ID: %d\n",
pciDrvInfo->busNumber, pciDrvInfo->deviceNumber,
pciDrvInfo->functionNumber, pciDrvInfo->domainID);
ret = 0;
break;
default:
mrsas_dprint(sc, MRSAS_TRACE, "IOCTL command 0x%lx is not handled\n", cmd);
ret = ENOENT;
}
return (ret);
}
/*
* mrsas_poll: poll entry point for mrsas driver fd
*
* This function is the entry point for poll from the OS. It waits for AEN
* events to be triggered by the controller and notifies the caller.
*/
static int
mrsas_poll(struct cdev *dev, int poll_events, struct thread *td)
{
struct mrsas_softc *sc;
int revents = 0;
sc = dev->si_drv1;
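/* Report the fd readable if an AEN has already fired; otherwise record this thread via selrecord() so it can be woken when one arrives. */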
if (poll_events & (POLLIN | POLLRDNORM)) {
if (sc->mrsas_aen_triggered) {
revents |= poll_events & (POLLIN | POLLRDNORM);
}
}
if (revents == 0) {
if (poll_events & (POLLIN | POLLRDNORM)) {
mtx_lock(&sc->aen_lock);
sc->mrsas_poll_waiting = 1;
selrecord(td, &sc->mrsas_select);
mtx_unlock(&sc->aen_lock);
}
}
return revents;
}
/*
* mrsas_setup_irq: Set up interrupt
* input: Adapter instance soft state
*
* This function sets up the interrupt as a bus resource, with flags
* indicating that the resource permits contemporaneous sharing and is
* activated atomically.
*/
static int
mrsas_setup_irq(struct mrsas_softc *sc)
{
if (sc->msix_enable && (mrsas_setup_msix(sc) == SUCCESS))
device_printf(sc->mrsas_dev, "MSI-x interrupts setup success\n");
else {
device_printf(sc->mrsas_dev, "Fall back to legacy interrupt\n");
sc->irq_context[0].sc = sc;
sc->irq_context[0].MSIxIndex = 0;
sc->irq_id[0] = 0;
sc->mrsas_irq[0] = bus_alloc_resource_any(sc->mrsas_dev,
SYS_RES_IRQ, &sc->irq_id[0], RF_SHAREABLE | RF_ACTIVE);
if (sc->mrsas_irq[0] == NULL) {
device_printf(sc->mrsas_dev, "Cannot allocate legcay"
"interrupt\n");
return (FAIL);
}
if (bus_setup_intr(sc->mrsas_dev, sc->mrsas_irq[0],
INTR_MPSAFE | INTR_TYPE_CAM, NULL, mrsas_isr,
&sc->irq_context[0], &sc->intr_handle[0])) {
device_printf(sc->mrsas_dev, "Cannot set up legacy"
"interrupt\n");
return (FAIL);
}
}
return (0);
}
/*
* mrsas_isr: ISR entry point
* input: argument pointer
*
* This function is the interrupt service routine entry point. There are two
* types of interrupts, state change interrupt and response interrupt. If an
* interrupt is not ours, we just return.
*/
void
mrsas_isr(void *arg)
{
struct mrsas_irq_context *irq_context = (struct mrsas_irq_context *)arg;
struct mrsas_softc *sc = irq_context->sc;
int status = 0;
if (sc->mask_interrupts)
return;
if (!sc->msix_vectors) {
status = mrsas_clear_intr(sc);
if (!status)
return;
}
/* If we are resetting, bail */
if (mrsas_test_bit(MRSAS_FUSION_IN_RESET, &sc->reset_flags)) {
printf(" Entered into ISR when OCR is going active. \n");
mrsas_clear_intr(sc);
return;
}
/* Process for reply request and clear response interrupt */
if (mrsas_complete_cmd(sc, irq_context->MSIxIndex) != SUCCESS)
mrsas_clear_intr(sc);
return;
}
/*
* mrsas_complete_cmd: Process reply request
* input: Adapter instance soft state
*
* This function is called from mrsas_isr() to process reply request and clear
* response interrupt. Processing of the reply request entails walking
* through the reply descriptor array for the command request pended from
* Firmware. We look at the Function field to determine the command type and
* perform the appropriate action. Before we return, we clear the response
* interrupt.
*/
int
mrsas_complete_cmd(struct mrsas_softc *sc, u_int32_t MSIxIndex)
{
Mpi2ReplyDescriptorsUnion_t *desc;
MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *reply_desc;
MRSAS_RAID_SCSI_IO_REQUEST *scsi_io_req;
struct mrsas_mpt_cmd *cmd_mpt, *r1_cmd = NULL;
struct mrsas_mfi_cmd *cmd_mfi;
u_int8_t reply_descript_type, *sense;
u_int16_t smid, num_completed;
u_int8_t status, extStatus;
union desc_value desc_val;
PLD_LOAD_BALANCE_INFO lbinfo;
u_int32_t device_id, data_length;
int threshold_reply_count = 0;
#if TM_DEBUG
MR_TASK_MANAGE_REQUEST *mr_tm_req;
MPI2_SCSI_TASK_MANAGE_REQUEST *mpi_tm_req;
#endif
/* If we have a hardware error, there is no need to continue */
if (sc->adprecovery == MRSAS_HW_CRITICAL_ERROR)
return (DONE);
desc = sc->reply_desc_mem;
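/* Position at this MSI-x queue's slice of the reply ring, at the last processed index. */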
desc += ((MSIxIndex * sc->reply_alloc_sz) / sizeof(MPI2_REPLY_DESCRIPTORS_UNION))
+ sc->last_reply_idx[MSIxIndex];
reply_desc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *) desc;
desc_val.word = desc->Words;
num_completed = 0;
reply_descript_type = reply_desc->ReplyFlags & MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
/* Find our reply descriptor for the command and process */
while ((desc_val.u.low != 0xFFFFFFFF) && (desc_val.u.high != 0xFFFFFFFF)) {
smid = reply_desc->SMID;
cmd_mpt = sc->mpt_cmd_list[smid - 1];
scsi_io_req = (MRSAS_RAID_SCSI_IO_REQUEST *) cmd_mpt->io_request;
status = scsi_io_req->RaidContext.raid_context.status;
extStatus = scsi_io_req->RaidContext.raid_context.exStatus;
sense = cmd_mpt->sense;
data_length = scsi_io_req->DataLength;
switch (scsi_io_req->Function) {
case MPI2_FUNCTION_SCSI_TASK_MGMT:
#if TM_DEBUG
mr_tm_req = (MR_TASK_MANAGE_REQUEST *) cmd_mpt->io_request;
mpi_tm_req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)
&mr_tm_req->TmRequest;
device_printf(sc->mrsas_dev, "TM completion type 0x%X, "
"TaskMID: 0x%X", mpi_tm_req->TaskType, mpi_tm_req->TaskMID);
#endif
wakeup_one((void *)&sc->ocr_chan);
break;
case MPI2_FUNCTION_SCSI_IO_REQUEST: /* Fast Path IO. */
device_id = cmd_mpt->ccb_ptr->ccb_h.target_id;
lbinfo = &sc->load_balance_info[device_id];
/* R1 load balancing for READ */
if (cmd_mpt->load_balance == MRSAS_LOAD_BALANCE_FLAG) {
mrsas_atomic_dec(&lbinfo->scsi_pending_cmds[cmd_mpt->pd_r1_lb]);
cmd_mpt->load_balance &= ~MRSAS_LOAD_BALANCE_FLAG;
}
/* Fall thru and complete IO */
case MRSAS_MPI2_FUNCTION_LD_IO_REQUEST:
if (cmd_mpt->r1_alt_dev_handle == MR_DEVHANDLE_INVALID) {
mrsas_map_mpt_cmd_status(cmd_mpt, cmd_mpt->ccb_ptr, status,
extStatus, data_length, sense);
mrsas_cmd_done(sc, cmd_mpt);
mrsas_atomic_dec(&sc->fw_outstanding);
} else {
/*
* If the peer Raid 1/10 fast path failed,
* mark IO as failed to the scsi layer.
* Overwrite the current status by the failed status
* and make sure that if any command fails,
* driver returns fail status to CAM.
*/
cmd_mpt->cmd_completed = 1;
r1_cmd = cmd_mpt->peer_cmd;
if (r1_cmd->cmd_completed) {
if (r1_cmd->io_request->RaidContext.raid_context.status != MFI_STAT_OK) {
status = r1_cmd->io_request->RaidContext.raid_context.status;
extStatus = r1_cmd->io_request->RaidContext.raid_context.exStatus;
data_length = r1_cmd->io_request->DataLength;
sense = r1_cmd->sense;
}
r1_cmd->ccb_ptr = NULL;
if (r1_cmd->callout_owner) {
callout_stop(&r1_cmd->cm_callout);
r1_cmd->callout_owner = false;
}
mrsas_release_mpt_cmd(r1_cmd);
mrsas_atomic_dec(&sc->fw_outstanding);
mrsas_map_mpt_cmd_status(cmd_mpt, cmd_mpt->ccb_ptr, status,
extStatus, data_length, sense);
mrsas_cmd_done(sc, cmd_mpt);
mrsas_atomic_dec(&sc->fw_outstanding);
}
}
break;
case MRSAS_MPI2_FUNCTION_PASSTHRU_IO_REQUEST: /* MFI command */
cmd_mfi = sc->mfi_cmd_list[cmd_mpt->sync_cmd_idx];
/*
* Make sure NOT TO release the mfi command from the called
* function's context if it was fired with an issue_polled call.
* Also make sure that the issue_polled call is only used while
* INTERRUPTS ARE DISABLED.
*/
if (cmd_mfi->frame->hdr.flags & MFI_FRAME_DONT_POST_IN_REPLY_QUEUE)
mrsas_release_mfi_cmd(cmd_mfi);
else
mrsas_complete_mptmfi_passthru(sc, cmd_mfi, status);
break;
}
sc->last_reply_idx[MSIxIndex]++;
if (sc->last_reply_idx[MSIxIndex] >= sc->reply_q_depth)
sc->last_reply_idx[MSIxIndex] = 0;
desc->Words = ~((uint64_t)0x00); /* set it back to all
* 0xFFFFFFFFs */
num_completed++;
threshold_reply_count++;
/* Get the next reply descriptor */
if (!sc->last_reply_idx[MSIxIndex]) {
desc = sc->reply_desc_mem;
desc += ((MSIxIndex * sc->reply_alloc_sz) / sizeof(MPI2_REPLY_DESCRIPTORS_UNION));
} else
desc++;
reply_desc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *) desc;
desc_val.word = desc->Words;
reply_descript_type = reply_desc->ReplyFlags & MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
if (reply_descript_type == MPI2_RPY_DESCRIPT_FLAGS_UNUSED)
break;
/*
* Write to reply post index after completing threshold reply
* count and still there are more replies in reply queue
* pending to be completed.
*/
if (threshold_reply_count >= THRESHOLD_REPLY_COUNT) {
if (sc->msix_enable) {
if (sc->msix_combined)
mrsas_write_reg(sc, sc->msix_reg_offset[MSIxIndex / 8],
((MSIxIndex & 0x7) << 24) |
sc->last_reply_idx[MSIxIndex]);
else
mrsas_write_reg(sc, sc->msix_reg_offset[0], (MSIxIndex << 24) |
sc->last_reply_idx[MSIxIndex]);
} else
mrsas_write_reg(sc, offsetof(mrsas_reg_set,
reply_post_host_index), sc->last_reply_idx[0]);
threshold_reply_count = 0;
}
}
/* No match, just return */
if (num_completed == 0)
return (DONE);
/* Clear response interrupt */
if (sc->msix_enable) {
if (sc->msix_combined) {
mrsas_write_reg(sc, sc->msix_reg_offset[MSIxIndex / 8],
((MSIxIndex & 0x7) << 24) |
sc->last_reply_idx[MSIxIndex]);
} else
mrsas_write_reg(sc, sc->msix_reg_offset[0], (MSIxIndex << 24) |
sc->last_reply_idx[MSIxIndex]);
} else
mrsas_write_reg(sc, offsetof(mrsas_reg_set,
reply_post_host_index), sc->last_reply_idx[0]);
return (0);
}
/*
* mrsas_map_mpt_cmd_status: Map firmware command status to CAM status.
* input: MPT command, CCB, firmware status/extStatus, data length, sense buffer
*
* This function is called from mrsas_complete_cmd(), for LD IO and FastPath IO.
* It checks the command status and maps the appropriate CAM status for the
* CCB.
*/
void
mrsas_map_mpt_cmd_status(struct mrsas_mpt_cmd *cmd, union ccb *ccb_ptr, u_int8_t status,
u_int8_t extStatus, u_int32_t data_length, u_int8_t *sense)
{
struct mrsas_softc *sc = cmd->sc;
u_int8_t *sense_data;
switch (status) {
case MFI_STAT_OK:
ccb_ptr->ccb_h.status = CAM_REQ_CMP;
break;
case MFI_STAT_SCSI_IO_FAILED:
case MFI_STAT_SCSI_DONE_WITH_ERROR:
ccb_ptr->ccb_h.status = CAM_SCSI_STATUS_ERROR;
sense_data = (u_int8_t *)&ccb_ptr->csio.sense_data;
if (sense_data) {
/* For now just copy 18 bytes back */
memcpy(sense_data, sense, 18);
ccb_ptr->csio.sense_len = 18;
ccb_ptr->ccb_h.status |= CAM_AUTOSNS_VALID;
}
break;
case MFI_STAT_LD_OFFLINE:
case MFI_STAT_DEVICE_NOT_FOUND:
if (ccb_ptr->ccb_h.target_lun)
ccb_ptr->ccb_h.status |= CAM_LUN_INVALID;
else
ccb_ptr->ccb_h.status |= CAM_DEV_NOT_THERE;
break;
case MFI_STAT_CONFIG_SEQ_MISMATCH:
ccb_ptr->ccb_h.status |= CAM_REQUEUE_REQ;
break;
default:
device_printf(sc->mrsas_dev, "FW cmd complete status %x\n", status);
ccb_ptr->ccb_h.status = CAM_REQ_CMP_ERR;
ccb_ptr->csio.scsi_status = status;
}
return;
}
/*
* mrsas_alloc_mem: Allocate DMAable memory
* input: Adapter instance soft state
*
* This function creates the parent DMA tag and allocates DMAable memory. The
* DMA tag describes the constraints of the DMA mapping. Allocated memory is
* mapped into the kernel virtual address space; the callback argument
* receives the physical memory address.
*/
static int
mrsas_alloc_mem(struct mrsas_softc *sc)
{
u_int32_t verbuf_size, io_req_size, reply_desc_size, sense_size, chain_frame_size,
evt_detail_size, count, pd_info_size;
/*
* Allocate parent DMA tag
*/
if (bus_dma_tag_create(NULL, /* parent */
1, /* alignment */
0, /* boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
- MAXPHYS, /* maxsize */
+ maxphys, /* maxsize */
sc->max_num_sge, /* nsegments */
- MAXPHYS, /* maxsegsize */
+ maxphys, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mrsas_parent_tag /* tag */
)) {
device_printf(sc->mrsas_dev, "Cannot allocate parent DMA tag\n");
return (ENOMEM);
}
/*
* Allocate for version buffer
*/
verbuf_size = MRSAS_MAX_NAME_LENGTH * (sizeof(bus_addr_t));
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
verbuf_size,
1,
verbuf_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->verbuf_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate verbuf DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->verbuf_tag, (void **)&sc->verbuf_mem,
BUS_DMA_NOWAIT, &sc->verbuf_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate verbuf memory\n");
return (ENOMEM);
}
bzero(sc->verbuf_mem, verbuf_size);
if (bus_dmamap_load(sc->verbuf_tag, sc->verbuf_dmamap, sc->verbuf_mem,
verbuf_size, mrsas_addr_cb, &sc->verbuf_phys_addr,
BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load verbuf DMA map\n");
return (ENOMEM);
}
/*
* Allocate IO Request Frames
*/
io_req_size = sc->io_frames_alloc_sz;
if (bus_dma_tag_create(sc->mrsas_parent_tag,
16, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
io_req_size,
1,
io_req_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->io_request_tag)) {
device_printf(sc->mrsas_dev, "Cannot create IO request tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->io_request_tag, (void **)&sc->io_request_mem,
BUS_DMA_NOWAIT, &sc->io_request_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot alloc IO request memory\n");
return (ENOMEM);
}
bzero(sc->io_request_mem, io_req_size);
if (bus_dmamap_load(sc->io_request_tag, sc->io_request_dmamap,
sc->io_request_mem, io_req_size, mrsas_addr_cb,
&sc->io_request_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load IO request memory\n");
return (ENOMEM);
}
/*
* Allocate Chain Frames
*/
chain_frame_size = sc->chain_frames_alloc_sz;
if (bus_dma_tag_create(sc->mrsas_parent_tag,
4, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
chain_frame_size,
1,
chain_frame_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->chain_frame_tag)) {
device_printf(sc->mrsas_dev, "Cannot create chain frame tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->chain_frame_tag, (void **)&sc->chain_frame_mem,
BUS_DMA_NOWAIT, &sc->chain_frame_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot alloc chain frame memory\n");
return (ENOMEM);
}
bzero(sc->chain_frame_mem, chain_frame_size);
if (bus_dmamap_load(sc->chain_frame_tag, sc->chain_frame_dmamap,
sc->chain_frame_mem, chain_frame_size, mrsas_addr_cb,
&sc->chain_frame_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load chain frame memory\n");
return (ENOMEM);
}
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
/*
* Allocate Reply Descriptor Array
*/
reply_desc_size = sc->reply_alloc_sz * count;
if (bus_dma_tag_create(sc->mrsas_parent_tag,
16, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
reply_desc_size,
1,
reply_desc_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->reply_desc_tag)) {
device_printf(sc->mrsas_dev, "Cannot create reply descriptor tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->reply_desc_tag, (void **)&sc->reply_desc_mem,
BUS_DMA_NOWAIT, &sc->reply_desc_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot alloc reply descriptor memory\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->reply_desc_tag, sc->reply_desc_dmamap,
sc->reply_desc_mem, reply_desc_size, mrsas_addr_cb,
&sc->reply_desc_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load reply descriptor memory\n");
return (ENOMEM);
}
/*
* Allocate Sense Buffer Array. Keep in lower 4GB
*/
sense_size = sc->max_fw_cmds * MRSAS_SENSE_LEN;
if (bus_dma_tag_create(sc->mrsas_parent_tag,
64, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
sense_size,
1,
sense_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->sense_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate sense buf tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->sense_tag, (void **)&sc->sense_mem,
BUS_DMA_NOWAIT, &sc->sense_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate sense buf memory\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->sense_tag, sc->sense_dmamap,
sc->sense_mem, sense_size, mrsas_addr_cb, &sc->sense_phys_addr,
BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load sense buf memory\n");
return (ENOMEM);
}
/*
* Allocate for Event detail structure
*/
evt_detail_size = sizeof(struct mrsas_evt_detail);
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
evt_detail_size,
1,
evt_detail_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->evt_detail_tag)) {
device_printf(sc->mrsas_dev, "Cannot create Event detail tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->evt_detail_tag, (void **)&sc->evt_detail_mem,
BUS_DMA_NOWAIT, &sc->evt_detail_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot alloc Event detail buffer memory\n");
return (ENOMEM);
}
bzero(sc->evt_detail_mem, evt_detail_size);
if (bus_dmamap_load(sc->evt_detail_tag, sc->evt_detail_dmamap,
sc->evt_detail_mem, evt_detail_size, mrsas_addr_cb,
&sc->evt_detail_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load Event detail buffer memory\n");
return (ENOMEM);
}
/*
* Allocate for PD INFO structure
*/
pd_info_size = sizeof(struct mrsas_pd_info);
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
pd_info_size,
1,
pd_info_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->pd_info_tag)) {
device_printf(sc->mrsas_dev, "Cannot create PD INFO tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->pd_info_tag, (void **)&sc->pd_info_mem,
BUS_DMA_NOWAIT, &sc->pd_info_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot alloc PD INFO buffer memory\n");
return (ENOMEM);
}
bzero(sc->pd_info_mem, pd_info_size);
if (bus_dmamap_load(sc->pd_info_tag, sc->pd_info_dmamap,
sc->pd_info_mem, pd_info_size, mrsas_addr_cb,
&sc->pd_info_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load PD INFO buffer memory\n");
return (ENOMEM);
}
/*
* Create a dma tag for data buffers; size will be the maximum
* possible I/O size (280kB).
*/
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1,
0,
BUS_SPACE_MAXADDR,
BUS_SPACE_MAXADDR,
NULL, NULL,
- MAXPHYS,
+ maxphys,
sc->max_num_sge, /* nsegments */
- MAXPHYS,
+ maxphys,
BUS_DMA_ALLOCNOW,
busdma_lock_mutex,
&sc->io_lock,
&sc->data_tag)) {
device_printf(sc->mrsas_dev, "Cannot create data dma tag\n");
return (ENOMEM);
}
return (0);
}
/*
* mrsas_addr_cb: Callback function of bus_dmamap_load()
* input: callback argument, machine dependent type
* that describes DMA segments, number of segments, error code
*
* This function lets the driver receive the mapping information resulting
* from bus_dmamap_load(). Most of that information is not used; only the
* first segment's physical address is saved.
*/
void
mrsas_addr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
bus_addr_t *addr;
addr = arg;
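/* The control-structure tags loaded with this callback are created with a single segment, so the first segment's address fully describes the buffer. */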
*addr = segs[0].ds_addr;
}
/*
* mrsas_setup_raidmap: Set up RAID map.
* input: Adapter instance soft state
*
* Allocate DMA memory for the RAID maps and perform setup.
*/
static int
mrsas_setup_raidmap(struct mrsas_softc *sc)
{
int i;
for (i = 0; i < 2; i++) {
sc->ld_drv_map[i] =
(void *)malloc(sc->drv_map_sz, M_MRSAS, M_NOWAIT);
/* Do Error handling */
if (!sc->ld_drv_map[i]) {
device_printf(sc->mrsas_dev, "Could not allocate memory for local map");
if (i == 1)
free(sc->ld_drv_map[0], M_MRSAS);
/* ABORT driver initialization */
goto ABORT;
}
}
for (int i = 0; i < 2; i++) {
if (bus_dma_tag_create(sc->mrsas_parent_tag,
4, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
sc->max_map_sz,
1,
sc->max_map_sz,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->raidmap_tag[i])) {
device_printf(sc->mrsas_dev,
"Cannot allocate raid map tag.\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->raidmap_tag[i],
(void **)&sc->raidmap_mem[i],
BUS_DMA_NOWAIT, &sc->raidmap_dmamap[i])) {
device_printf(sc->mrsas_dev,
"Cannot allocate raidmap memory.\n");
return (ENOMEM);
}
bzero(sc->raidmap_mem[i], sc->max_map_sz);
if (bus_dmamap_load(sc->raidmap_tag[i], sc->raidmap_dmamap[i],
sc->raidmap_mem[i], sc->max_map_sz,
mrsas_addr_cb, &sc->raidmap_phys_addr[i],
BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load raidmap memory.\n");
return (ENOMEM);
}
if (!sc->raidmap_mem[i]) {
device_printf(sc->mrsas_dev,
"Cannot allocate memory for raid map.\n");
return (ENOMEM);
}
}
if (!mrsas_get_map_info(sc))
mrsas_sync_map_info(sc);
return (0);
ABORT:
return (1);
}
/**
* megasas_setup_jbod_map - setup jbod map for FP seq_number.
* @sc: Adapter soft state
*
* Sets sc->use_seqnum_jbod_fp when the JBOD sequence-number map is usable.
*/
void
megasas_setup_jbod_map(struct mrsas_softc *sc)
{
int i;
uint32_t pd_seq_map_sz;
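/* Map size: the sync header (which embeds the first MR_PD_CFG_SEQ entry) plus one entry per remaining physical device. */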
pd_seq_map_sz = sizeof(struct MR_PD_CFG_SEQ_NUM_SYNC) +
(sizeof(struct MR_PD_CFG_SEQ) * (MAX_PHYSICAL_DEVICES - 1));
if (!sc->ctrl_info->adapterOperations3.useSeqNumJbodFP) {
sc->use_seqnum_jbod_fp = 0;
return;
}
if (sc->jbodmap_mem[0])
goto skip_alloc;
for (i = 0; i < 2; i++) {
if (bus_dma_tag_create(sc->mrsas_parent_tag,
4, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
pd_seq_map_sz,
1,
pd_seq_map_sz,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->jbodmap_tag[i])) {
device_printf(sc->mrsas_dev,
"Cannot allocate jbod map tag.\n");
return;
}
if (bus_dmamem_alloc(sc->jbodmap_tag[i],
(void **)&sc->jbodmap_mem[i],
BUS_DMA_NOWAIT, &sc->jbodmap_dmamap[i])) {
device_printf(sc->mrsas_dev,
"Cannot allocate jbod map memory.\n");
return;
}
bzero(sc->jbodmap_mem[i], pd_seq_map_sz);
if (bus_dmamap_load(sc->jbodmap_tag[i], sc->jbodmap_dmamap[i],
sc->jbodmap_mem[i], pd_seq_map_sz,
mrsas_addr_cb, &sc->jbodmap_phys_addr[i],
BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load jbod map memory.\n");
return;
}
if (!sc->jbodmap_mem[i]) {
device_printf(sc->mrsas_dev,
"Cannot allocate memory for jbod map.\n");
sc->use_seqnum_jbod_fp = 0;
return;
}
}
skip_alloc:
if (!megasas_sync_pd_seq_num(sc, false) &&
!megasas_sync_pd_seq_num(sc, true))
sc->use_seqnum_jbod_fp = 1;
else
sc->use_seqnum_jbod_fp = 0;
device_printf(sc->mrsas_dev, "Jbod map is supported\n");
}
/*
* mrsas_init_fw: Initialize Firmware
* input: Adapter soft state
*
* Calls transition_to_ready() to make sure Firmware is in operational state and
* calls mrsas_init_adapter() to send IOC_INIT command to Firmware. It
* issues internal commands to get the controller info after the IOC_INIT
* command response is received from Firmware. Note: the code relating to
* get_pdlist, get_ld_list and max_sectors is currently not being used; it
* is left here as a placeholder.
*/
static int
mrsas_init_fw(struct mrsas_softc *sc)
{
int ret, loop, ocr = 0;
u_int32_t max_sectors_1;
u_int32_t max_sectors_2;
u_int32_t tmp_sectors;
u_int32_t scratch_pad_2, scratch_pad_3, scratch_pad_4;
int msix_enable = 0;
int fw_msix_count = 0;
int i, j;
/* Make sure Firmware is ready */
ret = mrsas_transition_to_ready(sc, ocr);
if (ret != SUCCESS) {
return (ret);
}
if (sc->is_ventura || sc->is_aero) {
scratch_pad_3 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_scratch_pad_3));
#if VD_EXT_DEBUG
device_printf(sc->mrsas_dev, "scratch_pad_3 0x%x\n", scratch_pad_3);
#endif
sc->maxRaidMapSize = ((scratch_pad_3 >>
MR_MAX_RAID_MAP_SIZE_OFFSET_SHIFT) &
MR_MAX_RAID_MAP_SIZE_MASK);
}
/* MSI-x index 0- reply post host index register */
sc->msix_reg_offset[0] = MPI2_REPLY_POST_HOST_INDEX_OFFSET;
/* Check if MSI-X is supported while in ready state (bit 26 of outbound_scratch_pad) */
msix_enable = (mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_scratch_pad)) & 0x4000000) >> 0x1a;
if (msix_enable) {
scratch_pad_2 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad_2));
/* Check max MSI-X vectors */
if (sc->device_id == MRSAS_TBOLT) {
sc->msix_vectors = (scratch_pad_2
& MR_MAX_REPLY_QUEUES_OFFSET) + 1;
fw_msix_count = sc->msix_vectors;
} else {
/* Invader/Fury supports 96 MSI-X vectors */
sc->msix_vectors = ((scratch_pad_2
& MR_MAX_REPLY_QUEUES_EXT_OFFSET)
>> MR_MAX_REPLY_QUEUES_EXT_OFFSET_SHIFT) + 1;
fw_msix_count = sc->msix_vectors;
if ((sc->mrsas_gen3_ctrl && (sc->msix_vectors > 8)) ||
((sc->is_ventura || sc->is_aero) && (sc->msix_vectors > 16)))
sc->msix_combined = true;
/*
* Save 1-15 reply post index
* address to local memory Index 0
* is already saved from reg offset
* MPI2_REPLY_POST_HOST_INDEX_OFFSET
*/
for (loop = 1; loop < MR_MAX_MSIX_REG_ARRAY;
loop++) {
sc->msix_reg_offset[loop] =
MPI2_SUP_REPLY_POST_HOST_INDEX_OFFSET +
(loop * 0x10);
}
}
/* Don't bother allocating more MSI-X vectors than cpus */
sc->msix_vectors = min(sc->msix_vectors,
mp_ncpus);
/* Allocate MSI-x vectors */
if (mrsas_allocate_msix(sc) == SUCCESS)
sc->msix_enable = 1;
else
sc->msix_enable = 0;
device_printf(sc->mrsas_dev, "FW supports <%d> MSIX vector,"
"Online CPU %d Current MSIX <%d>\n",
fw_msix_count, mp_ncpus, sc->msix_vectors);
}
/*
* MSI-X host index 0 is common to all adapters.
* It is used by all MPT-based adapters.
*/
if (sc->msix_combined) {
sc->msix_reg_offset[0] =
MPI2_SUP_REPLY_POST_HOST_INDEX_OFFSET;
}
if (mrsas_init_adapter(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Adapter initialize Fail.\n");
return (1);
}
if (sc->is_ventura || sc->is_aero) {
scratch_pad_4 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad_4));
if ((scratch_pad_4 & MR_NVME_PAGE_SIZE_MASK) >= MR_DEFAULT_NVME_PAGE_SHIFT)
sc->nvme_page_size = 1 << (scratch_pad_4 & MR_NVME_PAGE_SIZE_MASK);
device_printf(sc->mrsas_dev, "NVME page size\t: (%d)\n", sc->nvme_page_size);
}
/* Allocate internal commands for pass-thru */
if (mrsas_alloc_mfi_cmds(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Allocate MFI cmd failed.\n");
return (1);
}
sc->ctrl_info = malloc(sizeof(struct mrsas_ctrl_info), M_MRSAS, M_NOWAIT);
if (!sc->ctrl_info) {
device_printf(sc->mrsas_dev, "Malloc for ctrl_info failed.\n");
return (1);
}
/*
* Get the controller info from FW, so that the MAX VD support
* availability can be decided.
*/
if (mrsas_get_ctrl_info(sc)) {
device_printf(sc->mrsas_dev, "Unable to get FW ctrl_info.\n");
return (1);
}
sc->secure_jbod_support =
(u_int8_t)sc->ctrl_info->adapterOperations3.supportSecurityonJBOD;
if (sc->secure_jbod_support)
device_printf(sc->mrsas_dev, "FW supports SED \n");
if (sc->use_seqnum_jbod_fp)
device_printf(sc->mrsas_dev, "FW supports JBOD Map \n");
if (sc->support_morethan256jbod)
device_printf(sc->mrsas_dev, "FW supports JBOD Map Ext \n");
if (mrsas_setup_raidmap(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Error: RAID map setup FAILED !!! "
"There seems to be some problem in the controller\n"
"Please contact to the SUPPORT TEAM if the problem persists\n");
}
megasas_setup_jbod_map(sc);
memset(sc->target_list, 0,
MRSAS_MAX_TM_TARGETS * sizeof(struct mrsas_target));
for (i = 0; i < MRSAS_MAX_TM_TARGETS; i++)
sc->target_list[i].target_id = 0xffff;
/* For pass-thru, get PD/LD list and controller info */
memset(sc->pd_list, 0,
MRSAS_MAX_PD * sizeof(struct mrsas_pd_list));
if (mrsas_get_pd_list(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Get PD list failed.\n");
return (1);
}
memset(sc->ld_ids, 0xff, MRSAS_MAX_LD_IDS);
if (mrsas_get_ld_list(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Get LD lsit failed.\n");
return (1);
}
if ((sc->is_ventura || sc->is_aero) && sc->drv_stream_detection) {
sc->streamDetectByLD = malloc(sizeof(PTR_LD_STREAM_DETECT) *
MAX_LOGICAL_DRIVES_EXT, M_MRSAS, M_NOWAIT);
if (!sc->streamDetectByLD) {
device_printf(sc->mrsas_dev,
"unable to allocate stream detection for pool of LDs\n");
return (1);
}
for (i = 0; i < MAX_LOGICAL_DRIVES_EXT; ++i) {
sc->streamDetectByLD[i] = malloc(sizeof(LD_STREAM_DETECT), M_MRSAS, M_NOWAIT);
if (!sc->streamDetectByLD[i]) {
device_printf(sc->mrsas_dev, "unable to allocate stream detect by LD\n");
for (j = 0; j < i; ++j)
free(sc->streamDetectByLD[j], M_MRSAS);
free(sc->streamDetectByLD, M_MRSAS);
sc->streamDetectByLD = NULL;
return (1);
}
memset(sc->streamDetectByLD[i], 0, sizeof(LD_STREAM_DETECT));
sc->streamDetectByLD[i]->mruBitMap = MR_STREAM_BITMAP;
}
}
/*
* Compute the max allowed sectors per IO: The controller info has
* two limits on max sectors. The driver should use the minimum of these
* two.
*
* 1 << stripe_sz_ops.min = max sectors per strip
*
* Note that older firmwares ( < FW ver 30) didn't report information to
* calculate max_sectors_1, so the number always ended up as zero.
*/
tmp_sectors = 0;
max_sectors_1 = (1 << sc->ctrl_info->stripe_sz_ops.min) *
sc->ctrl_info->max_strips_per_io;
max_sectors_2 = sc->ctrl_info->max_request_size;
tmp_sectors = min(max_sectors_1, max_sectors_2);
sc->max_sectors_per_req = sc->max_num_sge * MRSAS_PAGE_SIZE / 512;
if (tmp_sectors && (sc->max_sectors_per_req > tmp_sectors))
sc->max_sectors_per_req = tmp_sectors;
sc->disableOnlineCtrlReset =
sc->ctrl_info->properties.OnOffProperties.disableOnlineCtrlReset;
sc->UnevenSpanSupport =
sc->ctrl_info->adapterOperations2.supportUnevenSpans;
if (sc->UnevenSpanSupport) {
device_printf(sc->mrsas_dev, "FW supports: UnevenSpanSupport=%x\n\n",
sc->UnevenSpanSupport);
if (MR_ValidateMapInfo(sc))
sc->fast_path_io = 1;
else
sc->fast_path_io = 0;
}
device_printf(sc->mrsas_dev, "max_fw_cmds: %u max_scsi_cmds: %u\n",
sc->max_fw_cmds, sc->max_scsi_cmds);
return (0);
}
/*
* mrsas_init_adapter: Initializes the adapter/controller
* input: Adapter soft state
*
* Prepares for the issuing of the IOC Init cmd to FW for initializing the
* ROC/controller. The FW register is read to determine the number of
* commands that are supported. All memory allocations for IO are based on
* max_fw_cmds. Appropriate calculations are performed in this function.
*/
int
mrsas_init_adapter(struct mrsas_softc *sc)
{
uint32_t status;
u_int32_t scratch_pad_2;
int ret;
int i = 0;
/* Read FW status register */
status = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_scratch_pad));
sc->max_fw_cmds = status & MRSAS_FWSTATE_MAXCMD_MASK;
/* Decrement the max supported by 1, to correlate with FW */
sc->max_fw_cmds = sc->max_fw_cmds - 1;
sc->max_scsi_cmds = sc->max_fw_cmds - MRSAS_MAX_MFI_CMDS;
/* Determine allocation size of command frames */
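/* Reply queue depth: (max_fw_cmds + 1) rounded up to a multiple of 16, then doubled. */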
sc->reply_q_depth = ((sc->max_fw_cmds + 1 + 15) / 16 * 16) * 2;
sc->request_alloc_sz = sizeof(MRSAS_REQUEST_DESCRIPTOR_UNION) * sc->max_fw_cmds;
sc->reply_alloc_sz = sizeof(MPI2_REPLY_DESCRIPTORS_UNION) * (sc->reply_q_depth);
sc->io_frames_alloc_sz = MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE +
(MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE * (sc->max_fw_cmds + 1));
scratch_pad_2 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad_2));
/*
* If scratch_pad_2 & MEGASAS_MAX_CHAIN_SIZE_UNITS_MASK is set, the
* Firmware supports an extended IO chain frame that is 4 times larger
* than the legacy one. Legacy Firmware: frame size is (8 * 128) = 1K.
* 1M IO Firmware: frame size is (8 * 128 * 4) = 4K.
*/
if (scratch_pad_2 & MEGASAS_MAX_CHAIN_SIZE_UNITS_MASK)
sc->max_chain_frame_sz =
((scratch_pad_2 & MEGASAS_MAX_CHAIN_SIZE_MASK) >> 5)
* MEGASAS_1MB_IO;
else
sc->max_chain_frame_sz =
((scratch_pad_2 & MEGASAS_MAX_CHAIN_SIZE_MASK) >> 5)
* MEGASAS_256K_IO;
sc->chain_frames_alloc_sz = sc->max_chain_frame_sz * sc->max_fw_cmds;
sc->max_sge_in_main_msg = (MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE -
offsetof(MRSAS_RAID_SCSI_IO_REQUEST, SGL)) / 16;
sc->max_sge_in_chain = sc->max_chain_frame_sz / sizeof(MPI2_SGE_IO_UNION);
sc->max_num_sge = sc->max_sge_in_main_msg + sc->max_sge_in_chain - 2;
mrsas_dprint(sc, MRSAS_INFO,
"max sge: 0x%x, max chain frame size: 0x%x, "
"max fw cmd: 0x%x\n", sc->max_num_sge,
sc->max_chain_frame_sz, sc->max_fw_cmds);
/* Used for pass thru MFI frame (DCMD) */
sc->chain_offset_mfi_pthru = offsetof(MRSAS_RAID_SCSI_IO_REQUEST, SGL) / 16;
sc->chain_offset_io_request = (MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE -
sizeof(MPI2_SGE_IO_UNION)) / 16;
int count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
for (i = 0; i < count; i++)
sc->last_reply_idx[i] = 0;
ret = mrsas_alloc_mem(sc);
if (ret != SUCCESS)
return (ret);
ret = mrsas_alloc_mpt_cmds(sc);
if (ret != SUCCESS)
return (ret);
ret = mrsas_ioc_init(sc);
if (ret != SUCCESS)
return (ret);
return (0);
}
/*
* mrsas_alloc_ioc_cmd: Allocates memory for IOC Init command
* input: Adapter soft state
*
* Allocates for the IOC Init cmd to FW to initialize the ROC/controller.
*/
int
mrsas_alloc_ioc_cmd(struct mrsas_softc *sc)
{
int ioc_init_size;
/* Allocate IOC INIT command */
ioc_init_size = 1024 + sizeof(MPI2_IOC_INIT_REQUEST);
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
ioc_init_size,
1,
ioc_init_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->ioc_init_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate ioc init tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->ioc_init_tag, (void **)&sc->ioc_init_mem,
BUS_DMA_NOWAIT, &sc->ioc_init_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate ioc init cmd mem\n");
return (ENOMEM);
}
bzero(sc->ioc_init_mem, ioc_init_size);
if (bus_dmamap_load(sc->ioc_init_tag, sc->ioc_init_dmamap,
sc->ioc_init_mem, ioc_init_size, mrsas_addr_cb,
&sc->ioc_init_phys_mem, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load ioc init cmd mem\n");
return (ENOMEM);
}
return (0);
}
/*
* mrsas_free_ioc_cmd: Frees memory for IOC Init command
* input: Adapter soft state
*
* Deallocates memory of the IOC Init cmd.
*/
void
mrsas_free_ioc_cmd(struct mrsas_softc *sc)
{
if (sc->ioc_init_phys_mem)
bus_dmamap_unload(sc->ioc_init_tag, sc->ioc_init_dmamap);
if (sc->ioc_init_mem != NULL)
bus_dmamem_free(sc->ioc_init_tag, sc->ioc_init_mem, sc->ioc_init_dmamap);
if (sc->ioc_init_tag != NULL)
bus_dma_tag_destroy(sc->ioc_init_tag);
}
/*
* mrsas_ioc_init: Sends IOC Init command to FW
* input: Adapter soft state
*
* Issues the IOC Init cmd to FW to initialize the ROC/controller.
*/
int
mrsas_ioc_init(struct mrsas_softc *sc)
{
struct mrsas_init_frame *init_frame;
pMpi2IOCInitRequest_t IOCInitMsg;
MRSAS_REQUEST_DESCRIPTOR_UNION req_desc;
u_int8_t max_wait = MRSAS_INTERNAL_CMD_WAIT_TIME;
bus_addr_t phys_addr;
int i, retcode = 0;
u_int32_t scratch_pad_2;
/* Allocate memory for the IOC INIT command */
if (mrsas_alloc_ioc_cmd(sc)) {
device_printf(sc->mrsas_dev, "Cannot allocate IOC command.\n");
return (1);
}
if (!sc->block_sync_cache) {
scratch_pad_2 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad_2));
sc->fw_sync_cache_support = (scratch_pad_2 &
MR_CAN_HANDLE_SYNC_CACHE_OFFSET) ? 1 : 0;
}
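/*
 * Layout of the IOC INIT DMA buffer: the MFI init frame sits at offset 0
 * and the MPI2 IOC INIT request at offset 1024; the frame's queue_info
 * pointer is set to the request's physical address further below.
 */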
IOCInitMsg = (pMpi2IOCInitRequest_t)(((char *)sc->ioc_init_mem) + 1024);
IOCInitMsg->Function = MPI2_FUNCTION_IOC_INIT;
IOCInitMsg->WhoInit = MPI2_WHOINIT_HOST_DRIVER;
IOCInitMsg->MsgVersion = MPI2_VERSION;
IOCInitMsg->HeaderVersion = MPI2_HEADER_VERSION;
IOCInitMsg->SystemRequestFrameSize = MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE / 4;
IOCInitMsg->ReplyDescriptorPostQueueDepth = sc->reply_q_depth;
IOCInitMsg->ReplyDescriptorPostQueueAddress = sc->reply_desc_phys_addr;
IOCInitMsg->SystemRequestFrameBaseAddress = sc->io_request_phys_addr;
IOCInitMsg->HostMSIxVectors = (sc->msix_vectors > 0 ? sc->msix_vectors : 0);
IOCInitMsg->HostPageSize = MR_DEFAULT_NVME_PAGE_SHIFT;
init_frame = (struct mrsas_init_frame *)sc->ioc_init_mem;
init_frame->cmd = MFI_CMD_INIT;
init_frame->cmd_status = 0xFF;
init_frame->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
/* driver support Extended MSIX */
if (sc->mrsas_gen3_ctrl || sc->is_ventura || sc->is_aero) {
init_frame->driver_operations.
mfi_capabilities.support_additional_msix = 1;
}
if (sc->verbuf_mem) {
snprintf((char *)sc->verbuf_mem, strlen(MRSAS_VERSION) + 2, "%s\n",
MRSAS_VERSION);
init_frame->driver_ver_lo = (bus_addr_t)sc->verbuf_phys_addr;
init_frame->driver_ver_hi = 0;
}
init_frame->driver_operations.mfi_capabilities.support_ndrive_r1_lb = 1;
init_frame->driver_operations.mfi_capabilities.support_max_255lds = 1;
init_frame->driver_operations.mfi_capabilities.security_protocol_cmds_fw = 1;
if (sc->max_chain_frame_sz > MEGASAS_CHAIN_FRAME_SZ_MIN)
init_frame->driver_operations.mfi_capabilities.support_ext_io_size = 1;
phys_addr = (bus_addr_t)sc->ioc_init_phys_mem + 1024;
init_frame->queue_info_new_phys_addr_lo = phys_addr;
init_frame->data_xfer_len = sizeof(Mpi2IOCInitRequest_t);
req_desc.addr.Words = (bus_addr_t)sc->ioc_init_phys_mem;
req_desc.MFAIo.RequestFlags =
(MRSAS_REQ_DESCRIPT_FLAGS_MFA << MRSAS_REQ_DESCRIPT_FLAGS_TYPE_SHIFT);
mrsas_disable_intr(sc);
mrsas_dprint(sc, MRSAS_OCR, "Issuing IOC INIT command to FW.\n");
mrsas_write_64bit_req_desc(sc, req_desc.addr.u.low, req_desc.addr.u.high);
/*
* Poll response timer to wait for Firmware response. While this
* timer with the DELAY call could block CPU, the time interval for
* this is only 1 millisecond.
*/
if (init_frame->cmd_status == 0xFF) {
for (i = 0; i < (max_wait * 1000); i++) {
if (init_frame->cmd_status == 0xFF)
DELAY(1000);
else
break;
}
}
if (init_frame->cmd_status == 0)
mrsas_dprint(sc, MRSAS_OCR,
"IOC INIT response received from FW.\n");
else {
if (init_frame->cmd_status == 0xFF)
device_printf(sc->mrsas_dev, "IOC Init timed out after %d seconds.\n", max_wait);
else
device_printf(sc->mrsas_dev, "IOC Init failed, status = 0x%x\n", init_frame->cmd_status);
retcode = 1;
}
if (sc->is_aero) {
scratch_pad_2 = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad_2));
sc->atomic_desc_support = (scratch_pad_2 &
MR_ATOMIC_DESCRIPTOR_SUPPORT_OFFSET) ? 1 : 0;
device_printf(sc->mrsas_dev, "FW supports atomic descriptor: %s\n",
sc->atomic_desc_support ? "Yes" : "No");
}
mrsas_free_ioc_cmd(sc);
return (retcode);
}
/*
* mrsas_alloc_mpt_cmds: Allocates the command packets
* input: Adapter instance soft state
*
* This function allocates the internal commands for IOs. Each command that is
* issued to FW is wrapped in a local data structure called mrsas_mpt_cmd. An
* array is allocated with mrsas_mpt_cmd context. The free commands are
* maintained in a linked list (cmd pool). SMID value range is from 1 to
* max_fw_cmds.
*/
int
mrsas_alloc_mpt_cmds(struct mrsas_softc *sc)
{
int i, j;
u_int32_t max_fw_cmds, count;
struct mrsas_mpt_cmd *cmd;
pMpi2ReplyDescriptorsUnion_t reply_desc;
u_int32_t offset, chain_offset, sense_offset;
bus_addr_t io_req_base_phys, chain_frame_base_phys, sense_base_phys;
u_int8_t *io_req_base, *chain_frame_base, *sense_base;
max_fw_cmds = sc->max_fw_cmds;
sc->req_desc = malloc(sc->request_alloc_sz, M_MRSAS, M_NOWAIT);
if (!sc->req_desc) {
device_printf(sc->mrsas_dev, "Out of memory, cannot alloc req desc\n");
return (ENOMEM);
}
memset(sc->req_desc, 0, sc->request_alloc_sz);
/*
* sc->mpt_cmd_list is an array of struct mrsas_mpt_cmd pointers.
* Allocate the dynamic array first and then allocate individual
* commands.
*/
sc->mpt_cmd_list = malloc(sizeof(struct mrsas_mpt_cmd *) * max_fw_cmds,
M_MRSAS, M_NOWAIT);
if (!sc->mpt_cmd_list) {
device_printf(sc->mrsas_dev, "Cannot alloc memory for mpt_cmd_list.\n");
return (ENOMEM);
}
memset(sc->mpt_cmd_list, 0, sizeof(struct mrsas_mpt_cmd *) * max_fw_cmds);
for (i = 0; i < max_fw_cmds; i++) {
sc->mpt_cmd_list[i] = malloc(sizeof(struct mrsas_mpt_cmd),
M_MRSAS, M_NOWAIT);
if (!sc->mpt_cmd_list[i]) {
for (j = 0; j < i; j++)
free(sc->mpt_cmd_list[j], M_MRSAS);
free(sc->mpt_cmd_list, M_MRSAS);
sc->mpt_cmd_list = NULL;
return (ENOMEM);
}
}
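/*
 * Carve up the preallocated IO frame, chain frame and sense buffers.
 * The first IO frame is skipped because command SMIDs start at 1.
 */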
io_req_base = (u_int8_t *)sc->io_request_mem + MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;
io_req_base_phys = (bus_addr_t)sc->io_request_phys_addr + MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;
chain_frame_base = (u_int8_t *)sc->chain_frame_mem;
chain_frame_base_phys = (bus_addr_t)sc->chain_frame_phys_addr;
sense_base = (u_int8_t *)sc->sense_mem;
sense_base_phys = (bus_addr_t)sc->sense_phys_addr;
for (i = 0; i < max_fw_cmds; i++) {
cmd = sc->mpt_cmd_list[i];
offset = MRSAS_MPI2_RAID_DEFAULT_IO_FRAME_SIZE * i;
chain_offset = sc->max_chain_frame_sz * i;
sense_offset = MRSAS_SENSE_LEN * i;
memset(cmd, 0, sizeof(struct mrsas_mpt_cmd));
cmd->index = i + 1;
cmd->ccb_ptr = NULL;
cmd->r1_alt_dev_handle = MR_DEVHANDLE_INVALID;
callout_init_mtx(&cmd->cm_callout, &sc->sim_lock, 0);
cmd->sync_cmd_idx = (u_int32_t)MRSAS_ULONG_MAX;
cmd->sc = sc;
cmd->io_request = (MRSAS_RAID_SCSI_IO_REQUEST *) (io_req_base + offset);
memset(cmd->io_request, 0, sizeof(MRSAS_RAID_SCSI_IO_REQUEST));
cmd->io_request_phys_addr = io_req_base_phys + offset;
cmd->chain_frame = (MPI2_SGE_IO_UNION *) (chain_frame_base + chain_offset);
cmd->chain_frame_phys_addr = chain_frame_base_phys + chain_offset;
cmd->sense = sense_base + sense_offset;
cmd->sense_phys_addr = sense_base_phys + sense_offset;
if (bus_dmamap_create(sc->data_tag, 0, &cmd->data_dmamap)) {
return (FAIL);
}
TAILQ_INSERT_TAIL(&(sc->mrsas_mpt_cmd_list_head), cmd, next);
}
/* Initialize reply descriptor array to 0xFFFFFFFF */
reply_desc = sc->reply_desc_mem;
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
for (i = 0; i < sc->reply_q_depth * count; i++, reply_desc++) {
reply_desc->Words = MRSAS_ULONG_MAX;
}
return (0);
}
/*
* mrsas_write_64bit_req_desc: Writes 64 bit request descriptor to FW
* input: Adapter softstate
* request descriptor address low
* request descriptor address high
*/
void
mrsas_write_64bit_req_desc(struct mrsas_softc *sc, u_int32_t req_desc_lo,
u_int32_t req_desc_hi)
{
mtx_lock(&sc->pci_lock);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, inbound_low_queue_port),
req_desc_lo);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, inbound_high_queue_port),
req_desc_hi);
mtx_unlock(&sc->pci_lock);
}
/*
* mrsas_fire_cmd: Sends command to FW
* input: Adapter softstate
* request descriptor address low
* request descriptor address high
*
* This function fires the command to the Firmware by writing to the
* inbound_low_queue_port and inbound_high_queue_port.
*/
void
mrsas_fire_cmd(struct mrsas_softc *sc, u_int32_t req_desc_lo,
u_int32_t req_desc_hi)
{
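/*
 * Aero controllers advertise atomic descriptor support (see
 * mrsas_ioc_init()); they accept the descriptor via a single register
 * write, while other controllers need the 64-bit low/high write.
 */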
if (sc->atomic_desc_support)
mrsas_write_reg(sc, offsetof(mrsas_reg_set, inbound_single_queue_port),
req_desc_lo);
else
mrsas_write_64bit_req_desc(sc, req_desc_lo, req_desc_hi);
}
/*
* mrsas_transition_to_ready: Move FW to Ready state
* input: Adapter instance soft state
*
* During initialization, the FW can potentially be in any one of several
* possible states. If the FW is in the operational or waiting-for-handshake
* state, the driver must take steps to bring it to the ready state.
* Otherwise, it has to wait for the ready state.
*/
int
mrsas_transition_to_ready(struct mrsas_softc *sc, int ocr)
{
int i;
u_int8_t max_wait;
u_int32_t val, fw_state;
u_int32_t cur_state;
u_int32_t abs_state, curr_abs_state;
val = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_scratch_pad));
fw_state = val & MFI_STATE_MASK;
max_wait = MRSAS_RESET_WAIT_TIME;
if (fw_state != MFI_STATE_READY)
device_printf(sc->mrsas_dev, "Waiting for FW to come to ready state\n");
while (fw_state != MFI_STATE_READY) {
abs_state = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, outbound_scratch_pad));
switch (fw_state) {
case MFI_STATE_FAULT:
device_printf(sc->mrsas_dev, "FW is in FAULT state!!\n");
if (ocr) {
cur_state = MFI_STATE_FAULT;
break;
} else
return -ENODEV;
case MFI_STATE_WAIT_HANDSHAKE:
/* Set the CLR bit in inbound doorbell */
mrsas_write_reg(sc, offsetof(mrsas_reg_set, doorbell),
MFI_INIT_CLEAR_HANDSHAKE | MFI_INIT_HOTPLUG);
cur_state = MFI_STATE_WAIT_HANDSHAKE;
break;
case MFI_STATE_BOOT_MESSAGE_PENDING:
mrsas_write_reg(sc, offsetof(mrsas_reg_set, doorbell),
MFI_INIT_HOTPLUG);
cur_state = MFI_STATE_BOOT_MESSAGE_PENDING;
break;
case MFI_STATE_OPERATIONAL:
/*
* Bring it to READY state; assuming max wait 10
* secs
*/
mrsas_disable_intr(sc);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, doorbell), MFI_RESET_FLAGS);
for (i = 0; i < max_wait * 1000; i++) {
if (mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set, doorbell)) & 1)
DELAY(1000);
else
break;
}
cur_state = MFI_STATE_OPERATIONAL;
break;
case MFI_STATE_UNDEFINED:
/*
* This state should not last for more than 2
* seconds
*/
cur_state = MFI_STATE_UNDEFINED;
break;
case MFI_STATE_BB_INIT:
cur_state = MFI_STATE_BB_INIT;
break;
case MFI_STATE_FW_INIT:
cur_state = MFI_STATE_FW_INIT;
break;
case MFI_STATE_FW_INIT_2:
cur_state = MFI_STATE_FW_INIT_2;
break;
case MFI_STATE_DEVICE_SCAN:
cur_state = MFI_STATE_DEVICE_SCAN;
break;
case MFI_STATE_FLUSH_CACHE:
cur_state = MFI_STATE_FLUSH_CACHE;
break;
default:
device_printf(sc->mrsas_dev, "Unknown state 0x%x\n", fw_state);
return -ENODEV;
}
/*
* The cur_state should not last for more than max_wait secs
*/
for (i = 0; i < (max_wait * 1000); i++) {
fw_state = (mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad)) & MFI_STATE_MASK);
curr_abs_state = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad));
if (abs_state == curr_abs_state)
DELAY(1000);
else
break;
}
/*
* Return error if fw_state hasn't changed after max_wait
*/
if (curr_abs_state == abs_state) {
device_printf(sc->mrsas_dev, "FW state [%d] hasn't changed "
"in %d secs\n", fw_state, max_wait);
return -ENODEV;
}
}
mrsas_dprint(sc, MRSAS_OCR, "FW now in Ready state\n");
return 0;
}
/*
* mrsas_get_mfi_cmd: Get a cmd from free command pool
* input: Adapter soft state
*
* This function removes an MFI command from the command list.
*/
struct mrsas_mfi_cmd *
mrsas_get_mfi_cmd(struct mrsas_softc *sc)
{
struct mrsas_mfi_cmd *cmd = NULL;
mtx_lock(&sc->mfi_cmd_pool_lock);
if (!TAILQ_EMPTY(&sc->mrsas_mfi_cmd_list_head)) {
cmd = TAILQ_FIRST(&sc->mrsas_mfi_cmd_list_head);
TAILQ_REMOVE(&sc->mrsas_mfi_cmd_list_head, cmd, next);
}
mtx_unlock(&sc->mfi_cmd_pool_lock);
return cmd;
}
/*
* mrsas_ocr_thread: Thread to handle OCR/Kill Adapter.
* input: Adapter Context.
*
* This function checks the FW status register and the do_timedout_reset
* flag. It performs OCR or kills the adapter if the FW is in a fault state
* or an IO timeout has triggered a reset.
*/
static void
mrsas_ocr_thread(void *arg)
{
struct mrsas_softc *sc;
u_int32_t fw_status, fw_state;
u_int8_t tm_target_reset_failed = 0;
sc = (struct mrsas_softc *)arg;
mrsas_dprint(sc, MRSAS_TRACE, "%s\n", __func__);
sc->ocr_thread_active = 1;
mtx_lock(&sc->sim_lock);
for (;;) {
/* Sleep for 1 second and check the queue status */
msleep(&sc->ocr_chan, &sc->sim_lock, PRIBIO,
"mrsas_ocr", sc->mrsas_fw_fault_check_delay * hz);
if (sc->remove_in_progress ||
sc->adprecovery == MRSAS_HW_CRITICAL_ERROR) {
mrsas_dprint(sc, MRSAS_OCR,
"Exit due to %s from %s\n",
sc->remove_in_progress ? "Shutdown" :
"Hardware critical error", __func__);
break;
}
fw_status = mrsas_read_reg_with_retries(sc,
offsetof(mrsas_reg_set, outbound_scratch_pad));
fw_state = fw_status & MFI_STATE_MASK;
if (fw_state == MFI_STATE_FAULT || sc->do_timedout_reset ||
mrsas_atomic_read(&sc->target_reset_outstanding)) {
/* First, freeze further IOs to come to the SIM */
mrsas_xpt_freeze(sc);
/* If this is an IO timeout then go for target reset */
if (mrsas_atomic_read(&sc->target_reset_outstanding)) {
device_printf(sc->mrsas_dev, "Initiating Target RESET "
"because of SCSI IO timeout!\n");
/* Let the remaining IOs complete */
msleep(&sc->ocr_chan, &sc->sim_lock, PRIBIO,
"mrsas_reset_targets", 5 * hz);
/* Try to reset the target device */
if (mrsas_reset_targets(sc) == FAIL)
tm_target_reset_failed = 1;
}
/* If this is a DCMD timeout or FW fault,
* then go for controller reset
*/
if (fw_state == MFI_STATE_FAULT || tm_target_reset_failed ||
(sc->do_timedout_reset == MFI_DCMD_TIMEOUT_OCR)) {
if (tm_target_reset_failed)
device_printf(sc->mrsas_dev, "Initiaiting OCR because of "
"TM FAILURE!\n");
else
device_printf(sc->mrsas_dev, "Initiaiting OCR "
"because of %s!\n", sc->do_timedout_reset ?
"DCMD IO Timeout" : "FW fault");
mtx_lock_spin(&sc->ioctl_lock);
sc->reset_in_progress = 1;
mtx_unlock_spin(&sc->ioctl_lock);
sc->reset_count++;
/*
* Wait for the AEN task to be completed if it is running.
*/
mtx_unlock(&sc->sim_lock);
taskqueue_drain(sc->ev_tq, &sc->ev_task);
mtx_lock(&sc->sim_lock);
taskqueue_block(sc->ev_tq);
/* Try to reset the controller */
mrsas_reset_ctrl(sc, sc->do_timedout_reset);
sc->do_timedout_reset = 0;
sc->reset_in_progress = 0;
tm_target_reset_failed = 0;
mrsas_atomic_set(&sc->target_reset_outstanding, 0);
memset(sc->target_reset_pool, 0,
sizeof(sc->target_reset_pool));
taskqueue_unblock(sc->ev_tq);
}
/* Now allow IOs to come to the SIM */
mrsas_xpt_release(sc);
}
}
mtx_unlock(&sc->sim_lock);
sc->ocr_thread_active = 0;
mrsas_kproc_exit(0);
}
/*
* mrsas_reset_reply_desc: Reset Reply descriptor as part of OCR.
* input: Adapter Context.
*
* This function clears the reply descriptors so that, after OCR, the driver
* and FW do not carry over the old history.
*/
void
mrsas_reset_reply_desc(struct mrsas_softc *sc)
{
int i, count;
pMpi2ReplyDescriptorsUnion_t reply_desc;
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
for (i = 0; i < count; i++)
sc->last_reply_idx[i] = 0;
reply_desc = sc->reply_desc_mem;
for (i = 0; i < sc->reply_q_depth; i++, reply_desc++) {
reply_desc->Words = MRSAS_ULONG_MAX;
}
}
/*
* mrsas_reset_ctrl: Core function to OCR/Kill adapter.
* input: Adapter Context.
*
* This function runs from thread context so that it can sleep.
* 1. Do not handle OCR if the FW is in a HW critical error.
* 2. Wait up to 180 seconds for outstanding commands to complete.
* 3. If #2 does not find any outstanding command, the controller is in a
*    working state, so skip OCR. Otherwise, do OCR/kill adapter based on
*    the disableOnlineCtrlReset flag.
* 4. At the start of the OCR, return all SCSI commands that have a ccb_ptr
*    back to the CAM layer.
* 5. Post OCR, re-fire management commands and move the controller to the
*    operational state.
*/
int
mrsas_reset_ctrl(struct mrsas_softc *sc, u_int8_t reset_reason)
{
int retval = SUCCESS, i, j, retry = 0;
u_int32_t host_diag, abs_state, status_reg, reset_adapter;
union ccb *ccb;
struct mrsas_mfi_cmd *mfi_cmd;
struct mrsas_mpt_cmd *mpt_cmd;
union mrsas_evt_class_locale class_locale;
MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc;
if (sc->adprecovery == MRSAS_HW_CRITICAL_ERROR) {
device_printf(sc->mrsas_dev,
"mrsas: Hardware critical error, returning FAIL.\n");
return FAIL;
}
mrsas_set_bit(MRSAS_FUSION_IN_RESET, &sc->reset_flags);
sc->adprecovery = MRSAS_ADPRESET_SM_INFAULT;
mrsas_disable_intr(sc);
msleep(&sc->ocr_chan, &sc->sim_lock, PRIBIO, "mrsas_ocr",
sc->mrsas_fw_fault_check_delay * hz);
/* First try waiting for commands to complete */
if (mrsas_wait_for_outstanding(sc, reset_reason)) {
mrsas_dprint(sc, MRSAS_OCR,
"resetting adapter from %s.\n",
__func__);
/* Now return commands back to the CAM layer */
mtx_unlock(&sc->sim_lock);
for (i = 0; i < sc->max_fw_cmds; i++) {
mpt_cmd = sc->mpt_cmd_list[i];
if (mpt_cmd->peer_cmd) {
mrsas_dprint(sc, MRSAS_OCR,
"R1 FP command [%d] - (mpt_cmd) %p, (peer_cmd) %p\n",
i, mpt_cmd, mpt_cmd->peer_cmd);
}
if (mpt_cmd->ccb_ptr) {
if (mpt_cmd->callout_owner) {
ccb = (union ccb *)(mpt_cmd->ccb_ptr);
ccb->ccb_h.status = CAM_SCSI_BUS_RESET;
mrsas_cmd_done(sc, mpt_cmd);
} else {
mpt_cmd->ccb_ptr = NULL;
mrsas_release_mpt_cmd(mpt_cmd);
}
}
}
mrsas_atomic_set(&sc->fw_outstanding, 0);
mtx_lock(&sc->sim_lock);
status_reg = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad));
abs_state = status_reg & MFI_STATE_MASK;
reset_adapter = status_reg & MFI_RESET_ADAPTER;
if (sc->disableOnlineCtrlReset ||
(abs_state == MFI_STATE_FAULT && !reset_adapter)) {
/* Reset not supported, kill adapter */
mrsas_dprint(sc, MRSAS_OCR, "Reset not supported, killing adapter.\n");
mrsas_kill_hba(sc);
retval = FAIL;
goto out;
}
/* Now try to reset the chip */
for (i = 0; i < MRSAS_FUSION_MAX_RESET_TRIES; i++) {
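/* Unlock the host diagnostic register by writing the six-step key sequence. */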
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_FLUSH_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_1ST_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_2ND_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_3RD_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_4TH_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_5TH_KEY_VALUE);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_seq_offset),
MPI2_WRSEQ_6TH_KEY_VALUE);
/* Check that the diag write enable (DRWE) bit is on */
host_diag = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
fusion_host_diag));
retry = 0;
while (!(host_diag & HOST_DIAG_WRITE_ENABLE)) {
DELAY(100 * 1000);
host_diag = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
fusion_host_diag));
if (retry++ == 100) {
mrsas_dprint(sc, MRSAS_OCR,
"Host diag unlock failed!\n");
break;
}
}
if (!(host_diag & HOST_DIAG_WRITE_ENABLE))
continue;
/* Send chip reset command */
mrsas_write_reg(sc, offsetof(mrsas_reg_set, fusion_host_diag),
host_diag | HOST_DIAG_RESET_ADAPTER);
DELAY(3000 * 1000);
/* Make sure reset adapter bit is cleared */
host_diag = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
fusion_host_diag));
retry = 0;
while (host_diag & HOST_DIAG_RESET_ADAPTER) {
DELAY(100 * 1000);
host_diag = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
fusion_host_diag));
if (retry++ == 1000) {
mrsas_dprint(sc, MRSAS_OCR,
"Diag reset adapter never cleared!\n");
break;
}
}
if (host_diag & HOST_DIAG_RESET_ADAPTER)
continue;
abs_state = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad)) & MFI_STATE_MASK;
retry = 0;
while ((abs_state <= MFI_STATE_FW_INIT) && (retry++ < 1000)) {
DELAY(100 * 1000);
abs_state = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad)) & MFI_STATE_MASK;
}
if (abs_state <= MFI_STATE_FW_INIT) {
mrsas_dprint(sc, MRSAS_OCR, "firmware state < MFI_STATE_FW_INIT,"
" state = 0x%x\n", abs_state);
continue;
}
/* Wait for FW to become ready */
if (mrsas_transition_to_ready(sc, 1)) {
mrsas_dprint(sc, MRSAS_OCR,
"mrsas: Failed to transition controller to ready.\n");
continue;
}
mrsas_reset_reply_desc(sc);
if (mrsas_ioc_init(sc)) {
mrsas_dprint(sc, MRSAS_OCR, "mrsas_ioc_init() failed!\n");
continue;
}
for (j = 0; j < sc->max_fw_cmds; j++) {
mpt_cmd = sc->mpt_cmd_list[j];
if (mpt_cmd->sync_cmd_idx != (u_int32_t)MRSAS_ULONG_MAX) {
mfi_cmd = sc->mfi_cmd_list[mpt_cmd->sync_cmd_idx];
/* If not an IOCTL then release the command else re-fire */
if (!mfi_cmd->sync_cmd) {
mrsas_release_mfi_cmd(mfi_cmd);
} else {
req_desc = mrsas_get_request_desc(sc,
mfi_cmd->cmd_id.context.smid - 1);
mrsas_dprint(sc, MRSAS_OCR,
"Re-fire command DCMD opcode 0x%x index %d\n ",
mfi_cmd->frame->dcmd.opcode, j);
if (!req_desc)
device_printf(sc->mrsas_dev,
"Cannot build MPT cmd.\n");
else
mrsas_fire_cmd(sc, req_desc->addr.u.low,
req_desc->addr.u.high);
}
}
}
/* Reset load balance info */
memset(sc->load_balance_info, 0,
sizeof(LD_LOAD_BALANCE_INFO) * MAX_LOGICAL_DRIVES_EXT);
if (mrsas_get_ctrl_info(sc)) {
mrsas_kill_hba(sc);
retval = FAIL;
goto out;
}
if (!mrsas_get_map_info(sc))
mrsas_sync_map_info(sc);
megasas_setup_jbod_map(sc);
if ((sc->is_ventura || sc->is_aero) && sc->streamDetectByLD) {
for (j = 0; j < MAX_LOGICAL_DRIVES_EXT; ++j) {
memset(sc->streamDetectByLD[j], 0, sizeof(LD_STREAM_DETECT));
sc->streamDetectByLD[j]->mruBitMap = MR_STREAM_BITMAP;
}
}
mrsas_clear_bit(MRSAS_FUSION_IN_RESET, &sc->reset_flags);
mrsas_enable_intr(sc);
sc->adprecovery = MRSAS_HBA_OPERATIONAL;
/* Register AEN with FW for last sequence number */
class_locale.members.reserved = 0;
class_locale.members.locale = MR_EVT_LOCALE_ALL;
class_locale.members.class = MR_EVT_CLASS_DEBUG;
mtx_unlock(&sc->sim_lock);
if (mrsas_register_aen(sc, sc->last_seq_num,
class_locale.word)) {
device_printf(sc->mrsas_dev,
"ERROR: AEN registration FAILED from OCR !!! "
"Further events from the controller cannot be notified."
"Either there is some problem in the controller"
"or the controller does not support AEN.\n"
"Please contact to the SUPPORT TEAM if the problem persists\n");
}
mtx_lock(&sc->sim_lock);
/* Adapter reset completed successfully */
device_printf(sc->mrsas_dev, "Reset successful\n");
retval = SUCCESS;
goto out;
}
/* Reset failed, kill the adapter */
device_printf(sc->mrsas_dev, "Reset failed, killing adapter.\n");
mrsas_kill_hba(sc);
retval = FAIL;
} else {
mrsas_clear_bit(MRSAS_FUSION_IN_RESET, &sc->reset_flags);
mrsas_enable_intr(sc);
sc->adprecovery = MRSAS_HBA_OPERATIONAL;
}
out:
mrsas_clear_bit(MRSAS_FUSION_IN_RESET, &sc->reset_flags);
mrsas_dprint(sc, MRSAS_OCR,
"Reset Exit with %d.\n", retval);
return retval;
}
/*
* mrsas_kill_hba: Kill HBA when OCR is not supported
* input: Adapter Context.
*
* This function will kill HBA when OCR is not supported.
*/
void
mrsas_kill_hba(struct mrsas_softc *sc)
{
sc->adprecovery = MRSAS_HW_CRITICAL_ERROR;
DELAY(1000 * 1000);
mrsas_dprint(sc, MRSAS_OCR, "%s\n", __func__);
mrsas_write_reg(sc, offsetof(mrsas_reg_set, doorbell),
MFI_STOP_ADP);
/* Flush */
mrsas_read_reg(sc, offsetof(mrsas_reg_set, doorbell));
mrsas_complete_outstanding_ioctls(sc);
}
/*
* mrsas_complete_outstanding_ioctls: Complete pending IOCTLs after kill_hba
* input: Controller softc
*
* Returns void
*/
void
mrsas_complete_outstanding_ioctls(struct mrsas_softc *sc)
{
int i;
struct mrsas_mpt_cmd *cmd_mpt;
struct mrsas_mfi_cmd *cmd_mfi;
u_int32_t count, MSIxIndex;
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
for (i = 0; i < sc->max_fw_cmds; i++) {
cmd_mpt = sc->mpt_cmd_list[i];
if (cmd_mpt->sync_cmd_idx != (u_int32_t)MRSAS_ULONG_MAX) {
cmd_mfi = sc->mfi_cmd_list[cmd_mpt->sync_cmd_idx];
if (cmd_mfi->sync_cmd && cmd_mfi->frame->hdr.cmd != MFI_CMD_ABORT) {
for (MSIxIndex = 0; MSIxIndex < count; MSIxIndex++)
mrsas_complete_mptmfi_passthru(sc, cmd_mfi,
cmd_mpt->io_request->RaidContext.raid_context.status);
}
}
}
}
/*
* mrsas_wait_for_outstanding: Wait for outstanding commands
* input: Adapter Context.
*
* This function will wait for 180 seconds for outstanding commands to be
* completed.
*/
int
mrsas_wait_for_outstanding(struct mrsas_softc *sc, u_int8_t check_reason)
{
int i, outstanding, retval = 0;
u_int32_t fw_state, count, MSIxIndex;
for (i = 0; i < MRSAS_RESET_WAIT_TIME; i++) {
if (sc->remove_in_progress) {
mrsas_dprint(sc, MRSAS_OCR,
"Driver remove or shutdown called.\n");
retval = 1;
goto out;
}
/* Check if firmware is in fault state */
fw_state = mrsas_read_reg_with_retries(sc, offsetof(mrsas_reg_set,
outbound_scratch_pad)) & MFI_STATE_MASK;
if (fw_state == MFI_STATE_FAULT) {
mrsas_dprint(sc, MRSAS_OCR,
"Found FW in FAULT state, will reset adapter.\n");
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
mtx_unlock(&sc->sim_lock);
for (MSIxIndex = 0; MSIxIndex < count; MSIxIndex++)
mrsas_complete_cmd(sc, MSIxIndex);
mtx_lock(&sc->sim_lock);
retval = 1;
goto out;
}
if (check_reason == MFI_DCMD_TIMEOUT_OCR) {
mrsas_dprint(sc, MRSAS_OCR,
"DCMD IO TIMEOUT detected, will reset adapter.\n");
retval = 1;
goto out;
}
outstanding = mrsas_atomic_read(&sc->fw_outstanding);
if (!outstanding)
goto out;
if (!(i % MRSAS_RESET_NOTICE_INTERVAL)) {
mrsas_dprint(sc, MRSAS_OCR, "[%2d]waiting for %d "
"commands to complete\n", i, outstanding);
count = sc->msix_vectors > 0 ? sc->msix_vectors : 1;
mtx_unlock(&sc->sim_lock);
for (MSIxIndex = 0; MSIxIndex < count; MSIxIndex++)
mrsas_complete_cmd(sc, MSIxIndex);
mtx_lock(&sc->sim_lock);
}
DELAY(1000 * 1000);
}
if (mrsas_atomic_read(&sc->fw_outstanding)) {
mrsas_dprint(sc, MRSAS_OCR,
" pending commands remain after waiting,"
" will reset adapter.\n");
retval = 1;
}
out:
return retval;
}
/*
* mrsas_release_mfi_cmd: Return a cmd to free command pool
* input: Command packet for return to free cmd pool
*
* This function returns the MFI & MPT command to the command list.
*/
void
mrsas_release_mfi_cmd(struct mrsas_mfi_cmd *cmd_mfi)
{
struct mrsas_softc *sc = cmd_mfi->sc;
struct mrsas_mpt_cmd *cmd_mpt;
mtx_lock(&sc->mfi_cmd_pool_lock);
/*
* Release the mpt command (if one was allocated) associated with the
* mfi command.
*/
if (cmd_mfi->cmd_id.context.smid) {
mtx_lock(&sc->mpt_cmd_pool_lock);
/* Get the mpt cmd from mfi cmd frame's smid value */
cmd_mpt = sc->mpt_cmd_list[cmd_mfi->cmd_id.context.smid-1];
cmd_mpt->flags = 0;
cmd_mpt->sync_cmd_idx = (u_int32_t)MRSAS_ULONG_MAX;
TAILQ_INSERT_HEAD(&(sc->mrsas_mpt_cmd_list_head), cmd_mpt, next);
mtx_unlock(&sc->mpt_cmd_pool_lock);
}
/* Release the mfi command */
cmd_mfi->ccb_ptr = NULL;
cmd_mfi->cmd_id.frame_count = 0;
TAILQ_INSERT_HEAD(&(sc->mrsas_mfi_cmd_list_head), cmd_mfi, next);
mtx_unlock(&sc->mfi_cmd_pool_lock);
return;
}
/*
* mrsas_get_ctrl_info: Returns FW's controller structure
* input: Adapter soft state
* Controller information structure
*
* Issues an internal command (DCMD) to get the FW's controller structure. This
* information is mainly used to find out the maximum IO transfer per command
* supported by the FW.
*/
static int
mrsas_get_ctrl_info(struct mrsas_softc *sc)
{
int retcode = 0;
u_int8_t do_ocr = 1;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Failed to get a free cmd\n");
return -ENOMEM;
}
dcmd = &cmd->frame->dcmd;
if (mrsas_alloc_ctlr_info_cmd(sc) != SUCCESS) {
device_printf(sc->mrsas_dev, "Cannot allocate get ctlr info cmd\n");
mrsas_release_mfi_cmd(cmd);
return -ENOMEM;
}
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sizeof(struct mrsas_ctrl_info);
dcmd->opcode = MR_DCMD_CTRL_GET_INFO;
dcmd->sgl.sge32[0].phys_addr = sc->ctlr_info_phys_addr;
dcmd->sgl.sge32[0].length = sizeof(struct mrsas_ctrl_info);
if (!sc->mask_interrupts)
retcode = mrsas_issue_blocked_cmd(sc, cmd);
else
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
else
memcpy(sc->ctrl_info, sc->ctlr_info_mem, sizeof(struct mrsas_ctrl_info));
do_ocr = 0;
mrsas_update_ext_vd_details(sc);
sc->use_seqnum_jbod_fp =
sc->ctrl_info->adapterOperations3.useSeqNumJbodFP;
sc->support_morethan256jbod =
sc->ctrl_info->adapterOperations4.supportPdMapTargetId;
sc->disableOnlineCtrlReset =
sc->ctrl_info->properties.OnOffProperties.disableOnlineCtrlReset;
dcmd_timeout:
mrsas_free_ctlr_info_cmd(sc);
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
if (!sc->mask_interrupts)
mrsas_release_mfi_cmd(cmd);
return (retcode);
}
/*
* mrsas_update_ext_vd_details: Update details w.r.t. Extended VD
* input:
* sc - Controller's softc
*/
static void
mrsas_update_ext_vd_details(struct mrsas_softc *sc)
{
u_int32_t ventura_map_sz = 0;
sc->max256vdSupport =
sc->ctrl_info->adapterOperations3.supportMaxExtLDs;
/* Below is additional check to address future FW enhancement */
if (sc->ctrl_info->max_lds > 64)
sc->max256vdSupport = 1;
sc->drv_supported_vd_count = MRSAS_MAX_LD_CHANNELS
* MRSAS_MAX_DEV_PER_CHANNEL;
sc->drv_supported_pd_count = MRSAS_MAX_PD_CHANNELS
* MRSAS_MAX_DEV_PER_CHANNEL;
if (sc->max256vdSupport) {
sc->fw_supported_vd_count = MAX_LOGICAL_DRIVES_EXT;
sc->fw_supported_pd_count = MAX_PHYSICAL_DEVICES;
} else {
sc->fw_supported_vd_count = MAX_LOGICAL_DRIVES;
sc->fw_supported_pd_count = MAX_PHYSICAL_DEVICES;
}
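/*
 * A non-zero maxRaidMapSize (reported by newer, Ventura-class FW) gives
 * the dynamic RAID map size in MR_MIN_MAP_SIZE units; otherwise fall
 * back to the fixed legacy/extended map sizes computed below.
 */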
if (sc->maxRaidMapSize) {
ventura_map_sz = sc->maxRaidMapSize *
MR_MIN_MAP_SIZE;
sc->current_map_sz = ventura_map_sz;
sc->max_map_sz = ventura_map_sz;
} else {
sc->old_map_sz = sizeof(MR_FW_RAID_MAP) +
(sizeof(MR_LD_SPAN_MAP) * (sc->fw_supported_vd_count - 1));
sc->new_map_sz = sizeof(MR_FW_RAID_MAP_EXT);
sc->max_map_sz = max(sc->old_map_sz, sc->new_map_sz);
if (sc->max256vdSupport)
sc->current_map_sz = sc->new_map_sz;
else
sc->current_map_sz = sc->old_map_sz;
}
sc->drv_map_sz = sizeof(MR_DRV_RAID_MAP_ALL);
#if VD_EXT_DEBUG
device_printf(sc->mrsas_dev, "sc->maxRaidMapSize 0x%x \n",
sc->maxRaidMapSize);
device_printf(sc->mrsas_dev,
"new_map_sz = 0x%x, old_map_sz = 0x%x, "
"ventura_map_sz = 0x%x, current_map_sz = 0x%x "
"fusion->drv_map_sz =0x%x, size of driver raid map 0x%lx \n",
sc->new_map_sz, sc->old_map_sz, ventura_map_sz,
sc->current_map_sz, sc->drv_map_sz, sizeof(MR_DRV_RAID_MAP_ALL));
#endif
}
/*
* mrsas_alloc_ctlr_info_cmd: Allocates memory for controller info command
* input: Adapter soft state
*
* Allocates DMAable memory for the controller info internal command.
*/
int
mrsas_alloc_ctlr_info_cmd(struct mrsas_softc *sc)
{
int ctlr_info_size;
/* Allocate get controller info command */
ctlr_info_size = sizeof(struct mrsas_ctrl_info);
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
ctlr_info_size,
1,
ctlr_info_size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&sc->ctlr_info_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate ctlr info tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->ctlr_info_tag, (void **)&sc->ctlr_info_mem,
BUS_DMA_NOWAIT, &sc->ctlr_info_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate ctlr info cmd mem\n");
return (ENOMEM);
}
if (bus_dmamap_load(sc->ctlr_info_tag, sc->ctlr_info_dmamap,
sc->ctlr_info_mem, ctlr_info_size, mrsas_addr_cb,
&sc->ctlr_info_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load ctlr info cmd mem\n");
return (ENOMEM);
}
memset(sc->ctlr_info_mem, 0, ctlr_info_size);
return (0);
}
/*
* mrsas_free_ctlr_info_cmd: Free memory for controller info command
* input: Adapter soft state
*
* Deallocates memory of the get controller info cmd.
*/
void
mrsas_free_ctlr_info_cmd(struct mrsas_softc *sc)
{
if (sc->ctlr_info_phys_addr)
bus_dmamap_unload(sc->ctlr_info_tag, sc->ctlr_info_dmamap);
if (sc->ctlr_info_mem != NULL)
bus_dmamem_free(sc->ctlr_info_tag, sc->ctlr_info_mem, sc->ctlr_info_dmamap);
if (sc->ctlr_info_tag != NULL)
bus_dma_tag_destroy(sc->ctlr_info_tag);
}
/*
* mrsas_issue_polled: Issues a polling command
* inputs: Adapter soft state
* Command packet to be issued
*
* This function is for posting of internal commands to Firmware. MFI requires
* the cmd_status to be set to 0xFF before posting. The maximum wait time of
* the poll response timer is 180 seconds.
*/
int
mrsas_issue_polled(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
struct mrsas_header *frame_hdr = &cmd->frame->hdr;
u_int8_t max_wait = MRSAS_INTERNAL_CMD_WAIT_TIME;
int i, retcode = SUCCESS;
frame_hdr->cmd_status = 0xFF;
frame_hdr->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
/* Issue the frame using inbound queue port */
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev, "Cannot issue DCMD internal command.\n");
return (1);
}
/*
* Poll response timer to wait for Firmware response. While this
* timer with the DELAY call could block CPU, the time interval for
* this is only 1 millisecond.
*/
if (frame_hdr->cmd_status == 0xFF) {
for (i = 0; i < (max_wait * 1000); i++) {
if (frame_hdr->cmd_status == 0xFF)
DELAY(1000);
else
break;
}
}
if (frame_hdr->cmd_status == 0xFF) {
device_printf(sc->mrsas_dev, "DCMD timed out after %d "
"seconds from %s\n", max_wait, __func__);
device_printf(sc->mrsas_dev, "DCMD opcode 0x%X\n",
cmd->frame->dcmd.opcode);
retcode = ETIMEDOUT;
}
return (retcode);
}
/*
* mrsas_issue_dcmd: Issues a MFI Pass thru cmd
* input: Adapter soft state mfi cmd pointer
*
* This function is called by mrsas_issue_blocked_cmd() and
* mrsas_issue_polled() to build the MPT command and then fire the command
* to the Firmware.
*/
int
mrsas_issue_dcmd(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc;
req_desc = mrsas_build_mpt_cmd(sc, cmd);
if (!req_desc) {
device_printf(sc->mrsas_dev, "Cannot build MPT cmd.\n");
return (1);
}
mrsas_fire_cmd(sc, req_desc->addr.u.low, req_desc->addr.u.high);
return (0);
}
/*
* mrsas_build_mpt_cmd: Calls helper function to build Passthru cmd
* input: Adapter soft state mfi cmd to build
*
* This function is called by mrsas_issue_dcmd() to build the MPT-MFI passthru
* command and prepares the MPT command to send to the Firmware.
*/
MRSAS_REQUEST_DESCRIPTOR_UNION *
mrsas_build_mpt_cmd(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc;
u_int16_t index;
if (mrsas_build_mptmfi_passthru(sc, cmd)) {
device_printf(sc->mrsas_dev, "Cannot build MPT-MFI passthru cmd.\n");
return NULL;
}
index = cmd->cmd_id.context.smid;
req_desc = mrsas_get_request_desc(sc, index - 1);
if (!req_desc)
return NULL;
req_desc->addr.Words = 0;
req_desc->SCSIIO.RequestFlags = (MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO << MRSAS_REQ_DESCRIPT_FLAGS_TYPE_SHIFT);
req_desc->SCSIIO.SMID = index;
return (req_desc);
}
/*
* mrsas_build_mptmfi_passthru: Builds a MPT MFI Passthru command
* input: Adapter soft state mfi cmd pointer
*
* The MPT command and the io_request are setup as a passthru command. The SGE
* chain address is set to frame_phys_addr of the MFI command.
*/
u_int8_t
mrsas_build_mptmfi_passthru(struct mrsas_softc *sc, struct mrsas_mfi_cmd *mfi_cmd)
{
MPI25_IEEE_SGE_CHAIN64 *mpi25_ieee_chain;
PTR_MRSAS_RAID_SCSI_IO_REQUEST io_req;
struct mrsas_mpt_cmd *mpt_cmd;
struct mrsas_header *frame_hdr = &mfi_cmd->frame->hdr;
mpt_cmd = mrsas_get_mpt_cmd(sc);
if (!mpt_cmd)
return (1);
/* Save the smid. To be used for returning the cmd */
mfi_cmd->cmd_id.context.smid = mpt_cmd->index;
mpt_cmd->sync_cmd_idx = mfi_cmd->index;
/*
* For cmds where the flag is set, store the flag and check on
* completion. For cmds with this flag, don't call
* mrsas_complete_cmd.
*/
if (frame_hdr->flags & MFI_FRAME_DONT_POST_IN_REPLY_QUEUE)
mpt_cmd->flags = MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
io_req = mpt_cmd->io_request;
if (sc->mrsas_gen3_ctrl || sc->is_ventura || sc->is_aero) {
pMpi25IeeeSgeChain64_t sgl_ptr_end = (pMpi25IeeeSgeChain64_t)&io_req->SGL;
sgl_ptr_end += sc->max_sge_in_main_msg - 1;
sgl_ptr_end->Flags = 0;
}
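/*
 * Chain the IEEE SGE to the MFI frame so that the FW fetches the MFI
 * pass-through frame through the chain element.
 */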
mpi25_ieee_chain = (MPI25_IEEE_SGE_CHAIN64 *) & io_req->SGL.IeeeChain;
io_req->Function = MRSAS_MPI2_FUNCTION_PASSTHRU_IO_REQUEST;
io_req->SGLOffset0 = offsetof(MRSAS_RAID_SCSI_IO_REQUEST, SGL) / 4;
io_req->ChainOffset = sc->chain_offset_mfi_pthru;
mpi25_ieee_chain->Address = mfi_cmd->frame_phys_addr;
mpi25_ieee_chain->Flags = IEEE_SGE_FLAGS_CHAIN_ELEMENT |
MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR;
mpi25_ieee_chain->Length = sc->max_chain_frame_sz;
return (0);
}
/*
* mrsas_issue_blocked_cmd: Synchronous wrapper around regular FW cmds
* input: Adapter soft state Command to be issued
*
* This function waits on an event for the command to be returned from the ISR.
* Max wait time is MRSAS_INTERNAL_CMD_WAIT_TIME secs. Used for issuing
* internal and ioctl commands.
*/
int
mrsas_issue_blocked_cmd(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
u_int8_t max_wait = MRSAS_INTERNAL_CMD_WAIT_TIME;
unsigned long total_time = 0;
int retcode = SUCCESS;
/* Initialize cmd_status */
cmd->cmd_status = 0xFF;
/* Build MPT-MFI command for issue to FW */
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev, "Cannot issue DCMD internal command.\n");
return (1);
}
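/*
 * Sleep on sc->chan until mrsas_wakeup() is called from the completion
 * path; wake up once per second so non-IOCTL commands can enforce the
 * MRSAS_INTERNAL_CMD_WAIT_TIME timeout.
 */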
sc->chan = (void *)&cmd;
while (1) {
if (cmd->cmd_status == 0xFF) {
tsleep((void *)&sc->chan, 0, "mrsas_sleep", hz);
} else
break;
if (!cmd->sync_cmd) { /* cmd->sync_cmd is set for an IOCTL
* command */
total_time++;
if (total_time >= max_wait) {
device_printf(sc->mrsas_dev,
"Internal command timed out after %d seconds.\n", max_wait);
retcode = 1;
break;
}
}
}
if (cmd->cmd_status == 0xFF) {
device_printf(sc->mrsas_dev, "DCMD timed out after %d "
"seconds from %s\n", max_wait, __func__);
device_printf(sc->mrsas_dev, "DCMD opcode 0x%X\n",
cmd->frame->dcmd.opcode);
retcode = ETIMEDOUT;
}
return (retcode);
}
/*
* mrsas_complete_mptmfi_passthru: Completes a command
* input: @sc: Adapter soft state
* @cmd: Command to be completed
* @status: cmd completion status
*
* This function is called from mrsas_complete_cmd() after an interrupt is
* received from Firmware, and io_request->Function is
* MRSAS_MPI2_FUNCTION_PASSTHRU_IO_REQUEST.
*/
void
mrsas_complete_mptmfi_passthru(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd,
u_int8_t status)
{
struct mrsas_header *hdr = &cmd->frame->hdr;
u_int8_t cmd_status = cmd->frame->hdr.cmd_status;
/* Reset the retry counter for future re-tries */
cmd->retry_for_fw_reset = 0;
if (cmd->ccb_ptr)
cmd->ccb_ptr = NULL;
switch (hdr->cmd) {
case MFI_CMD_INVALID:
device_printf(sc->mrsas_dev, "MFI_CMD_INVALID command.\n");
break;
case MFI_CMD_PD_SCSI_IO:
case MFI_CMD_LD_SCSI_IO:
/*
* MFI_CMD_PD_SCSI_IO and MFI_CMD_LD_SCSI_IO could have been
* issued either through an IO path or an IOCTL path. If it
* was via IOCTL, we will send it to internal completion.
*/
if (cmd->sync_cmd) {
cmd->sync_cmd = 0;
mrsas_wakeup(sc, cmd);
break;
}
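/* FALLTHROUGH */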
case MFI_CMD_SMP:
case MFI_CMD_STP:
case MFI_CMD_DCMD:
/* Check for LD map update */
if ((cmd->frame->dcmd.opcode == MR_DCMD_LD_MAP_GET_INFO) &&
(cmd->frame->dcmd.mbox.b[1] == 1)) {
sc->fast_path_io = 0;
mtx_lock(&sc->raidmap_lock);
sc->map_update_cmd = NULL;
if (cmd_status != 0) {
if (cmd_status != MFI_STAT_NOT_FOUND)
device_printf(sc->mrsas_dev, "map sync failed, status=%x\n", cmd_status);
else {
mrsas_release_mfi_cmd(cmd);
mtx_unlock(&sc->raidmap_lock);
break;
}
} else
sc->map_id++;
mrsas_release_mfi_cmd(cmd);
if (MR_ValidateMapInfo(sc))
sc->fast_path_io = 0;
else
sc->fast_path_io = 1;
mrsas_sync_map_info(sc);
mtx_unlock(&sc->raidmap_lock);
break;
}
if (cmd->frame->dcmd.opcode == MR_DCMD_CTRL_EVENT_GET_INFO ||
cmd->frame->dcmd.opcode == MR_DCMD_CTRL_EVENT_GET) {
sc->mrsas_aen_triggered = 0;
}
/* FW has an updated PD sequence */
if ((cmd->frame->dcmd.opcode ==
MR_DCMD_SYSTEM_PD_MAP_GET_INFO) &&
(cmd->frame->dcmd.mbox.b[0] == 1)) {
mtx_lock(&sc->raidmap_lock);
sc->jbod_seq_cmd = NULL;
mrsas_release_mfi_cmd(cmd);
if (cmd_status == MFI_STAT_OK) {
sc->pd_seq_map_id++;
/* Re-register a pd sync seq num cmd */
if (megasas_sync_pd_seq_num(sc, true))
sc->use_seqnum_jbod_fp = 0;
} else {
sc->use_seqnum_jbod_fp = 0;
device_printf(sc->mrsas_dev,
"Jbod map sync failed, status=%x\n", cmd_status);
}
mtx_unlock(&sc->raidmap_lock);
break;
}
/* See if we got an event notification */
if (cmd->frame->dcmd.opcode == MR_DCMD_CTRL_EVENT_WAIT)
mrsas_complete_aen(sc, cmd);
else
mrsas_wakeup(sc, cmd);
break;
case MFI_CMD_ABORT:
/* Command issued to abort another cmd return */
mrsas_complete_abort(sc, cmd);
break;
default:
device_printf(sc->mrsas_dev, "Unknown command completed! [0x%X]\n", hdr->cmd);
break;
}
}
/*
* mrsas_wakeup: Completes an internal command
* input: Adapter soft state
* Command to be completed
*
* In mrsas_issue_blocked_cmd(), after a command is issued to Firmware, a wait
* timer is started. This function is called from
* mrsas_complete_mptmfi_passthru() as it completes the command, to wake up
* from the command wait.
*/
void
mrsas_wakeup(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
cmd->cmd_status = cmd->frame->io.cmd_status;
if (cmd->cmd_status == 0xFF)
cmd->cmd_status = 0;
sc->chan = (void *)&cmd;
wakeup_one((void *)&sc->chan);
return;
}
/*
* mrsas_shutdown_ctlr: Instructs FW to shut down the controller
* input: Adapter soft state, Shutdown/Hibernate opcode
*
* This function issues a DCMD internal command to Firmware to initiate shutdown
* of the controller.
*/
static void
mrsas_shutdown_ctlr(struct mrsas_softc *sc, u_int32_t opcode)
{
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
if (sc->adprecovery == MRSAS_HW_CRITICAL_ERROR)
return;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Cannot allocate for shutdown cmd.\n");
return;
}
if (sc->aen_cmd)
mrsas_issue_blocked_abort_cmd(sc, sc->aen_cmd);
if (sc->map_update_cmd)
mrsas_issue_blocked_abort_cmd(sc, sc->map_update_cmd);
if (sc->jbod_seq_cmd)
mrsas_issue_blocked_abort_cmd(sc, sc->jbod_seq_cmd);
dcmd = &cmd->frame->dcmd;
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0x0;
dcmd->sge_count = 0;
dcmd->flags = MFI_FRAME_DIR_NONE;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = 0;
dcmd->opcode = opcode;
device_printf(sc->mrsas_dev, "Preparing to shut down controller.\n");
mrsas_issue_blocked_cmd(sc, cmd);
mrsas_release_mfi_cmd(cmd);
return;
}
/*
* mrsas_flush_cache: Requests FW to flush all its caches
* input: Adapter soft state
*
* This function issues a DCMD internal command to Firmware to initiate
* flushing of all caches.
*/
static void
mrsas_flush_cache(struct mrsas_softc *sc)
{
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
if (sc->adprecovery == MRSAS_HW_CRITICAL_ERROR)
return;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Cannot allocate for flush cache cmd.\n");
return;
}
dcmd = &cmd->frame->dcmd;
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0x0;
dcmd->sge_count = 0;
dcmd->flags = MFI_FRAME_DIR_NONE;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = 0;
dcmd->opcode = MR_DCMD_CTRL_CACHE_FLUSH;
dcmd->mbox.b[0] = MR_FLUSH_CTRL_CACHE | MR_FLUSH_DISK_CACHE;
mrsas_issue_blocked_cmd(sc, cmd);
mrsas_release_mfi_cmd(cmd);
return;
}
int
megasas_sync_pd_seq_num(struct mrsas_softc *sc, boolean_t pend)
{
int retcode = 0;
u_int8_t do_ocr = 1;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
uint32_t pd_seq_map_sz;
struct MR_PD_CFG_SEQ_NUM_SYNC *pd_sync;
bus_addr_t pd_seq_h;
pd_seq_map_sz = sizeof(struct MR_PD_CFG_SEQ_NUM_SYNC) +
(sizeof(struct MR_PD_CFG_SEQ) *
(MAX_PHYSICAL_DEVICES - 1));
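/*
 * The base MR_PD_CFG_SEQ_NUM_SYNC structure already carries one
 * MR_PD_CFG_SEQ entry, hence the (MAX_PHYSICAL_DEVICES - 1) term
 * (an assumption based on the sizing arithmetic above).
 */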
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev,
"Cannot alloc for ld map info cmd.\n");
return 1;
}
dcmd = &cmd->frame->dcmd;
pd_sync = (void *)sc->jbodmap_mem[(sc->pd_seq_map_id & 1)];
pd_seq_h = sc->jbodmap_phys_addr[(sc->pd_seq_map_id & 1)];
if (!pd_sync) {
device_printf(sc->mrsas_dev,
"Failed to alloc mem for jbod map info.\n");
mrsas_release_mfi_cmd(cmd);
return (ENOMEM);
}
memset(pd_sync, 0, pd_seq_map_sz);
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = (pd_seq_map_sz);
dcmd->opcode = (MR_DCMD_SYSTEM_PD_MAP_GET_INFO);
dcmd->sgl.sge32[0].phys_addr = (pd_seq_h);
dcmd->sgl.sge32[0].length = (pd_seq_map_sz);
if (pend) {
dcmd->mbox.b[0] = MRSAS_DCMD_MBOX_PEND_FLAG;
dcmd->flags = (MFI_FRAME_DIR_WRITE);
sc->jbod_seq_cmd = cmd;
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev,
"Fail to send sync map info command.\n");
return 1;
} else
return 0;
} else
dcmd->flags = MFI_FRAME_DIR_READ;
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
if (pd_sync->count > MAX_PHYSICAL_DEVICES) {
device_printf(sc->mrsas_dev,
"driver supports max %d JBOD, but FW reports %d\n",
MAX_PHYSICAL_DEVICES, pd_sync->count);
retcode = -EINVAL;
}
if (!retcode)
sc->pd_seq_map_id++;
do_ocr = 0;
dcmd_timeout:
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
return (retcode);
}
/*
* mrsas_get_map_info: Load and validate RAID map input:
* Adapter instance soft state
*
* This function calls mrsas_get_ld_map_info() and MR_ValidateMapInfo() to load
* and validate the RAID map. It returns 0 if successful, 1 otherwise.
*/
static int
mrsas_get_map_info(struct mrsas_softc *sc)
{
uint8_t retcode = 0;
sc->fast_path_io = 0;
if (!mrsas_get_ld_map_info(sc)) {
retcode = MR_ValidateMapInfo(sc);
if (retcode == 0) {
sc->fast_path_io = 1;
return 0;
}
}
return 1;
}
/*
* mrsas_get_ld_map_info: Get FW's ld_map structure
* input: Adapter instance soft state
*
* Issues an internal command (DCMD) to get the FW's LD map
* structure.
*/
static int
mrsas_get_ld_map_info(struct mrsas_softc *sc)
{
int retcode = 0;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
void *map;
bus_addr_t map_phys_addr = 0;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev,
"Cannot alloc for ld map info cmd.\n");
return 1;
}
dcmd = &cmd->frame->dcmd;
map = (void *)sc->raidmap_mem[(sc->map_id & 1)];
map_phys_addr = sc->raidmap_phys_addr[(sc->map_id & 1)];
if (!map) {
device_printf(sc->mrsas_dev,
"Failed to alloc mem for ld map info.\n");
mrsas_release_mfi_cmd(cmd);
return (ENOMEM);
}
memset(map, 0, sc->max_map_sz);
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sc->current_map_sz;
dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO;
dcmd->sgl.sge32[0].phys_addr = map_phys_addr;
dcmd->sgl.sge32[0].length = sc->current_map_sz;
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
return (retcode);
}
/*
* mrsas_sync_map_info: Sync LD map info with the FW
* input: Adapter instance soft state
*
* Issues an internal command (DCMD) that sends the driver's LD target ID and
* sequence number information to the FW; the command is left pending for map updates.
*/
static int
mrsas_sync_map_info(struct mrsas_softc *sc)
{
int retcode = 0, i;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
uint32_t size_sync_info, num_lds;
MR_LD_TARGET_SYNC *target_map = NULL;
MR_DRV_RAID_MAP_ALL *map;
MR_LD_RAID *raid;
MR_LD_TARGET_SYNC *ld_sync;
bus_addr_t map_phys_addr = 0;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Cannot alloc for sync map info cmd\n");
return ENOMEM;
}
map = sc->ld_drv_map[sc->map_id & 1];
num_lds = map->raidMap.ldCount;
dcmd = &cmd->frame->dcmd;
size_sync_info = sizeof(MR_LD_TARGET_SYNC) * num_lds;
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
target_map = (MR_LD_TARGET_SYNC *) sc->raidmap_mem[(sc->map_id - 1) & 1];
memset(target_map, 0, sc->max_map_sz);
map_phys_addr = sc->raidmap_phys_addr[(sc->map_id - 1) & 1];
ld_sync = (MR_LD_TARGET_SYNC *) target_map;
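/* Build the per-LD target ID / sequence number entries the FW consumes. */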
for (i = 0; i < num_lds; i++, ld_sync++) {
raid = MR_LdRaidGet(i, map);
ld_sync->targetId = MR_GetLDTgtId(i, map);
ld_sync->seqNum = raid->seqNum;
}
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_WRITE;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sc->current_map_sz;
dcmd->mbox.b[0] = num_lds;
dcmd->mbox.b[1] = MRSAS_DCMD_MBOX_PEND_FLAG;
dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO;
dcmd->sgl.sge32[0].phys_addr = map_phys_addr;
dcmd->sgl.sge32[0].length = sc->current_map_sz;
sc->map_update_cmd = cmd;
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev,
"Fail to send sync map info command.\n");
return (1);
}
return (retcode);
}
/* Input: dcmd.opcode - MR_DCMD_PD_GET_INFO
* dcmd.mbox.s[0] - deviceId for this physical drive
* dcmd.sge IN - ptr to returned MR_PD_INFO structure
* Desc: Firmware returns the physical drive info structure
*
*/
static void
mrsas_get_pd_info(struct mrsas_softc *sc, u_int16_t device_id)
{
int retcode;
u_int8_t do_ocr = 1;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev,
"Cannot alloc for get PD info cmd\n");
return;
}
dcmd = &cmd->frame->dcmd;
memset(sc->pd_info_mem, 0, sizeof(struct mrsas_pd_info));
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->mbox.s[0] = device_id;
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = sizeof(struct mrsas_pd_info);
dcmd->opcode = MR_DCMD_PD_GET_INFO;
dcmd->sgl.sge32[0].phys_addr = (u_int32_t)sc->pd_info_phys_addr;
dcmd->sgl.sge32[0].length = sizeof(struct mrsas_pd_info);
if (!sc->mask_interrupts)
retcode = mrsas_issue_blocked_cmd(sc, cmd);
else
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
sc->target_list[device_id].interface_type =
sc->pd_info_mem->state.ddf.pdType.intf;
do_ocr = 0;
dcmd_timeout:
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
if (!sc->mask_interrupts)
mrsas_release_mfi_cmd(cmd);
}
/*
* mrsas_add_target: Add target ID of system PD/VD to driver's data structure.
* sc: Adapter's soft state
* target_id: Unique target id per controller (managed by driver)
* for system PDs - target ID ranges from 0 to (MRSAS_MAX_PD - 1)
* for VDs - target ID ranges from MRSAS_MAX_PD to MRSAS_MAX_TM_TARGETS
* return: void
* Description: This function will be called whenever a system PD or VD is created.
*/
static void mrsas_add_target(struct mrsas_softc *sc,
u_int16_t target_id)
{
sc->target_list[target_id].target_id = target_id;
device_printf(sc->mrsas_dev,
"%s created target ID: 0x%x\n",
(target_id < MRSAS_MAX_PD ? "System PD" : "VD"),
(target_id < MRSAS_MAX_PD ? target_id : (target_id - MRSAS_MAX_PD)));
/*
* Fire the DCMD to get pd_info only for system PDs, and only when
* interrupts are enabled.
*/
if (!sc->mask_interrupts && sc->pd_info_mem &&
(target_id < MRSAS_MAX_PD))
mrsas_get_pd_info(sc, target_id);
}
/*
* mrsas_remove_target: Remove target ID of system PD/VD from driver's data structure.
* sc: Adapter's soft state
* target_id: Unique target id per controller (managed by driver)
* for system PDs - target ID ranges from 0 to (MRSAS_MAX_PD - 1)
* for VDs - target ID ranges from MRSAS_MAX_PD to MRSAS_MAX_TM_TARGETS
* return: void
* Description: This function will be called whenever a system PD or VD is deleted.
*/
static void mrsas_remove_target(struct mrsas_softc *sc,
u_int16_t target_id)
{
sc->target_list[target_id].target_id = 0xffff;
device_printf(sc->mrsas_dev,
"%s deleted target ID: 0x%x\n",
(target_id < MRSAS_MAX_PD ? "System PD" : "VD"),
(target_id < MRSAS_MAX_PD ? target_id : (target_id - MRSAS_MAX_PD)));
}
/*
* mrsas_get_pd_list: Returns FW's PD list structure
* input: Adapter soft state
*
* Issues an internal command (DCMD) to get the FW's controller PD list
* structure. This information is mainly used to find out the system PDs
* exposed by the Firmware.
*/
static int
mrsas_get_pd_list(struct mrsas_softc *sc)
{
int retcode = 0, pd_index = 0, pd_count = 0, pd_list_size;
u_int8_t do_ocr = 1;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
struct MR_PD_LIST *pd_list_mem;
struct MR_PD_ADDRESS *pd_addr;
bus_addr_t pd_list_phys_addr = 0;
struct mrsas_tmp_dcmd *tcmd;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev,
"Cannot alloc for get PD list cmd\n");
return 1;
}
dcmd = &cmd->frame->dcmd;
tcmd = malloc(sizeof(struct mrsas_tmp_dcmd), M_MRSAS, M_NOWAIT);
pd_list_size = MRSAS_MAX_PD * sizeof(struct MR_PD_LIST);
if (mrsas_alloc_tmp_dcmd(sc, tcmd, pd_list_size) != SUCCESS) {
device_printf(sc->mrsas_dev,
"Cannot alloc dmamap for get PD list cmd\n");
mrsas_release_mfi_cmd(cmd);
mrsas_free_tmp_dcmd(tcmd);
free(tcmd, M_MRSAS);
return (ENOMEM);
} else {
pd_list_mem = tcmd->tmp_dcmd_mem;
pd_list_phys_addr = tcmd->tmp_dcmd_phys_addr;
}
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
dcmd->mbox.b[0] = MR_PD_QUERY_TYPE_EXPOSED_TO_HOST;
dcmd->mbox.b[1] = 0;
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->pad_0 = 0;
dcmd->data_xfer_len = MRSAS_MAX_PD * sizeof(struct MR_PD_LIST);
dcmd->opcode = MR_DCMD_PD_LIST_QUERY;
dcmd->sgl.sge32[0].phys_addr = pd_list_phys_addr;
dcmd->sgl.sge32[0].length = MRSAS_MAX_PD * sizeof(struct MR_PD_LIST);
if (!sc->mask_interrupts)
retcode = mrsas_issue_blocked_cmd(sc, cmd);
else
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
/* Get the instance PD list */
pd_count = MRSAS_MAX_PD;
pd_addr = pd_list_mem->addr;
if (pd_list_mem->count < pd_count) {
memset(sc->local_pd_list, 0,
MRSAS_MAX_PD * sizeof(struct mrsas_pd_list));
for (pd_index = 0; pd_index < pd_list_mem->count; pd_index++) {
sc->local_pd_list[pd_addr->deviceId].tid = pd_addr->deviceId;
sc->local_pd_list[pd_addr->deviceId].driveType =
pd_addr->scsiDevType;
sc->local_pd_list[pd_addr->deviceId].driveState =
MR_PD_STATE_SYSTEM;
if (sc->target_list[pd_addr->deviceId].target_id == 0xffff)
mrsas_add_target(sc, pd_addr->deviceId);
pd_addr++;
}
for (pd_index = 0; pd_index < MRSAS_MAX_PD; pd_index++) {
if ((sc->local_pd_list[pd_index].driveState !=
MR_PD_STATE_SYSTEM) &&
(sc->target_list[pd_index].target_id !=
0xffff)) {
mrsas_remove_target(sc, pd_index);
}
}
/*
* Use a mutex/spinlock if the pd_list component size grows beyond
* 32 bits.
*/
memcpy(sc->pd_list, sc->local_pd_list, sizeof(sc->local_pd_list));
do_ocr = 0;
}
dcmd_timeout:
mrsas_free_tmp_dcmd(tcmd);
free(tcmd, M_MRSAS);
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
if (!sc->mask_interrupts)
mrsas_release_mfi_cmd(cmd);
return (retcode);
}
/*
* mrsas_get_ld_list: Returns FW's LD list structure input:
* Adapter soft state
*
* Issues an internal command (DCMD) to get the FW's controller LD list
* structure. This information is mainly used to find out which LDs are
* supported by the FW.
*/
static int
mrsas_get_ld_list(struct mrsas_softc *sc)
{
int ld_list_size, retcode = 0, ld_index = 0, ids = 0, drv_tgt_id;
u_int8_t do_ocr = 1;
struct mrsas_mfi_cmd *cmd;
struct mrsas_dcmd_frame *dcmd;
struct MR_LD_LIST *ld_list_mem;
bus_addr_t ld_list_phys_addr = 0;
struct mrsas_tmp_dcmd *tcmd;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev,
"Cannot alloc for get LD list cmd\n");
return 1;
}
dcmd = &cmd->frame->dcmd;
tcmd = malloc(sizeof(struct mrsas_tmp_dcmd), M_MRSAS, M_NOWAIT);
ld_list_size = sizeof(struct MR_LD_LIST);
if (mrsas_alloc_tmp_dcmd(sc, tcmd, ld_list_size) != SUCCESS) {
device_printf(sc->mrsas_dev,
"Cannot alloc dmamap for get LD list cmd\n");
mrsas_release_mfi_cmd(cmd);
mrsas_free_tmp_dcmd(tcmd);
free(tcmd, M_MRSAS);
return (ENOMEM);
} else {
ld_list_mem = tcmd->tmp_dcmd_mem;
ld_list_phys_addr = tcmd->tmp_dcmd_phys_addr;
}
memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
if (sc->max256vdSupport)
dcmd->mbox.b[0] = 1;
dcmd->cmd = MFI_CMD_DCMD;
dcmd->cmd_status = 0xFF;
dcmd->sge_count = 1;
dcmd->flags = MFI_FRAME_DIR_READ;
dcmd->timeout = 0;
dcmd->data_xfer_len = sizeof(struct MR_LD_LIST);
dcmd->opcode = MR_DCMD_LD_GET_LIST;
dcmd->sgl.sge32[0].phys_addr = ld_list_phys_addr;
dcmd->sgl.sge32[0].length = sizeof(struct MR_LD_LIST);
dcmd->pad_0 = 0;
if (!sc->mask_interrupts)
retcode = mrsas_issue_blocked_cmd(sc, cmd);
else
retcode = mrsas_issue_polled(sc, cmd);
if (retcode == ETIMEDOUT)
goto dcmd_timeout;
#if VD_EXT_DEBUG
printf("Number of LDs %d\n", ld_list_mem->ldCount);
#endif
/* Get the instance LD list */
if (ld_list_mem->ldCount <= sc->fw_supported_vd_count) {
sc->CurLdCount = ld_list_mem->ldCount;
memset(sc->ld_ids, 0xff, MAX_LOGICAL_DRIVES_EXT);
for (ld_index = 0; ld_index < ld_list_mem->ldCount; ld_index++) {
ids = ld_list_mem->ldList[ld_index].ref.ld_context.targetId;
drv_tgt_id = ids + MRSAS_MAX_PD;
if (ld_list_mem->ldList[ld_index].state != 0) {
sc->ld_ids[ids] = ld_list_mem->ldList[ld_index].ref.ld_context.targetId;
if (sc->target_list[drv_tgt_id].target_id ==
0xffff)
mrsas_add_target(sc, drv_tgt_id);
} else {
if (sc->target_list[drv_tgt_id].target_id !=
0xffff)
mrsas_remove_target(sc,
drv_tgt_id);
}
}
do_ocr = 0;
}
dcmd_timeout:
mrsas_free_tmp_dcmd(tcmd);
free(tcmd, M_MRSAS);
if (do_ocr)
sc->do_timedout_reset = MFI_DCMD_TIMEOUT_OCR;
if (!sc->mask_interrupts)
mrsas_release_mfi_cmd(cmd);
return (retcode);
}
/*
* mrsas_alloc_tmp_dcmd: Allocates memory for temporary command input:
* Adapter soft state, Temp command, Size of allocation
*
* Allocates DMAable memory for a temporary internal command. The allocated
* memory is initialized to all zeros upon successful loading of the
* DMA-mapped memory.
*/
int
mrsas_alloc_tmp_dcmd(struct mrsas_softc *sc,
struct mrsas_tmp_dcmd *tcmd, int size)
{
if (bus_dma_tag_create(sc->mrsas_parent_tag,
1, 0,
BUS_SPACE_MAXADDR_32BIT,
BUS_SPACE_MAXADDR,
NULL, NULL,
size,
1,
size,
BUS_DMA_ALLOCNOW,
NULL, NULL,
&tcmd->tmp_dcmd_tag)) {
device_printf(sc->mrsas_dev, "Cannot allocate tmp dcmd tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(tcmd->tmp_dcmd_tag, (void **)&tcmd->tmp_dcmd_mem,
BUS_DMA_NOWAIT, &tcmd->tmp_dcmd_dmamap)) {
device_printf(sc->mrsas_dev, "Cannot allocate tmp dcmd mem\n");
return (ENOMEM);
}
if (bus_dmamap_load(tcmd->tmp_dcmd_tag, tcmd->tmp_dcmd_dmamap,
tcmd->tmp_dcmd_mem, size, mrsas_addr_cb,
&tcmd->tmp_dcmd_phys_addr, BUS_DMA_NOWAIT)) {
device_printf(sc->mrsas_dev, "Cannot load tmp dcmd mem\n");
return (ENOMEM);
}
memset(tcmd->tmp_dcmd_mem, 0, size);
return (0);
}
/*
* mrsas_free_tmp_dcmd: Free memory for temporary command input:
* temporary dcmd pointer
*
* Deallocates memory of the temporary command for use in the construction of
* the internal DCMD.
*/
void
mrsas_free_tmp_dcmd(struct mrsas_tmp_dcmd *tmp)
{
if (tmp->tmp_dcmd_phys_addr)
bus_dmamap_unload(tmp->tmp_dcmd_tag, tmp->tmp_dcmd_dmamap);
if (tmp->tmp_dcmd_mem != NULL)
bus_dmamem_free(tmp->tmp_dcmd_tag, tmp->tmp_dcmd_mem, tmp->tmp_dcmd_dmamap);
if (tmp->tmp_dcmd_tag != NULL)
bus_dma_tag_destroy(tmp->tmp_dcmd_tag);
}
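/*
 * Usage sketch (example only): the allocate/use/free round trip for a
 * temporary DCMD buffer, mirroring what mrsas_get_pd_list() and
 * mrsas_get_ld_list() above do.  The helper name is hypothetical.
 */
#if 0	/* example, not compiled */
static int
ex_tmp_dcmd_roundtrip(struct mrsas_softc *sc, int size)
{
	struct mrsas_tmp_dcmd *tcmd;

	tcmd = malloc(sizeof(struct mrsas_tmp_dcmd), M_MRSAS, M_NOWAIT);
	if (tcmd == NULL)
		return (ENOMEM);
	if (mrsas_alloc_tmp_dcmd(sc, tcmd, size) != SUCCESS) {
		free(tcmd, M_MRSAS);
		return (ENOMEM);
	}
	/*
	 * tcmd->tmp_dcmd_mem is now a zeroed buffer and
	 * tcmd->tmp_dcmd_phys_addr is the bus address to place into
	 * dcmd->sgl.sge32[0].phys_addr before issuing the DCMD.
	 */
	mrsas_free_tmp_dcmd(tcmd);
	free(tcmd, M_MRSAS);
	return (0);
}
#endif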
/*
* mrsas_issue_blocked_abort_cmd: Aborts previously issued cmd input:
* Adapter soft state Previously issued cmd to be aborted
*
* This function is used to abort previously issued commands, such as AEN and
* RAID map sync commands. The abort command is sent as a DCMD internal
* command and subsequently the driver will wait for a return status. The
* max wait time is MRSAS_INTERNAL_CMD_WAIT_TIME seconds.
*/
static int
mrsas_issue_blocked_abort_cmd(struct mrsas_softc *sc,
struct mrsas_mfi_cmd *cmd_to_abort)
{
struct mrsas_mfi_cmd *cmd;
struct mrsas_abort_frame *abort_fr;
u_int8_t retcode = 0;
unsigned long total_time = 0;
u_int8_t max_wait = MRSAS_INTERNAL_CMD_WAIT_TIME;
cmd = mrsas_get_mfi_cmd(sc);
if (!cmd) {
device_printf(sc->mrsas_dev, "Cannot alloc for abort cmd\n");
return (1);
}
abort_fr = &cmd->frame->abort;
/* Prepare and issue the abort frame */
abort_fr->cmd = MFI_CMD_ABORT;
abort_fr->cmd_status = 0xFF;
abort_fr->flags = 0;
abort_fr->abort_context = cmd_to_abort->index;
abort_fr->abort_mfi_phys_addr_lo = cmd_to_abort->frame_phys_addr;
abort_fr->abort_mfi_phys_addr_hi = 0;
cmd->sync_cmd = 1;
cmd->cmd_status = 0xFF;
if (mrsas_issue_dcmd(sc, cmd)) {
device_printf(sc->mrsas_dev, "Fail to send abort command.\n");
return (1);
}
/* Wait for this cmd to complete */
sc->chan = (void *)&cmd;
while (1) {
if (cmd->cmd_status == 0xFF) {
tsleep((void *)&sc->chan, 0, "mrsas_sleep", hz);
} else
break;
total_time++;
if (total_time >= max_wait) {
device_printf(sc->mrsas_dev, "Abort cmd timed out after %d sec.\n", max_wait);
retcode = 1;
break;
}
}
cmd->sync_cmd = 0;
mrsas_release_mfi_cmd(cmd);
return (retcode);
}
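/*
 * Usage sketch (example only): a caller that wants to cancel a pending
 * AEN registration would mark the saved command and pass it here, so that
 * mrsas_complete_aen() does not signal userland for the aborted AEN.
 * The helper name is hypothetical.
 */
#if 0	/* example, not compiled */
static void
ex_abort_pending_aen(struct mrsas_softc *sc)
{
	struct mrsas_mfi_cmd *aen = sc->aen_cmd;

	if (aen == NULL)
		return;
	aen->abort_aen = 1;
	if (mrsas_issue_blocked_abort_cmd(sc, aen))
		device_printf(sc->mrsas_dev, "AEN abort timed out\n");
}
#endif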
/*
* mrsas_complete_abort: Completes aborting a command input:
* Adapter soft state Cmd that was issued to abort another cmd
*
* The mrsas_issue_blocked_abort_cmd() function waits for the command status to
* change after sending the command. This function is called from
* mrsas_complete_mptmfi_passthru() to wake up the associated sleeping thread.
*/
void
mrsas_complete_abort(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
if (cmd->sync_cmd) {
cmd->sync_cmd = 0;
cmd->cmd_status = 0;
sc->chan = (void *)&cmd;
wakeup_one((void *)&sc->chan);
}
return;
}
/*
* mrsas_aen_handler: AEN processing callback function from thread context
* input: Adapter soft state
*
* Asynchronous event handler
*/
void
mrsas_aen_handler(struct mrsas_softc *sc)
{
union mrsas_evt_class_locale class_locale;
int doscan = 0;
u_int32_t seq_num;
int error, fail_aen = 0;
if (sc == NULL) {
printf("invalid instance!\n");
return;
}
if (sc->remove_in_progress || sc->reset_in_progress) {
device_printf(sc->mrsas_dev, "Returning from %s, line no %d\n",
__func__, __LINE__);
return;
}
if (sc->evt_detail_mem) {
switch (sc->evt_detail_mem->code) {
case MR_EVT_PD_INSERTED:
fail_aen = mrsas_get_pd_list(sc);
if (!fail_aen)
mrsas_bus_scan_sim(sc, sc->sim_1);
else
goto skip_register_aen;
break;
case MR_EVT_PD_REMOVED:
fail_aen = mrsas_get_pd_list(sc);
if (!fail_aen)
mrsas_bus_scan_sim(sc, sc->sim_1);
else
goto skip_register_aen;
break;
case MR_EVT_LD_OFFLINE:
case MR_EVT_CFG_CLEARED:
case MR_EVT_LD_DELETED:
mrsas_bus_scan_sim(sc, sc->sim_0);
break;
case MR_EVT_LD_CREATED:
fail_aen = mrsas_get_ld_list(sc);
if (!fail_aen)
mrsas_bus_scan_sim(sc, sc->sim_0);
else
goto skip_register_aen;
break;
case MR_EVT_CTRL_HOST_BUS_SCAN_REQUESTED:
case MR_EVT_FOREIGN_CFG_IMPORTED:
case MR_EVT_LD_STATE_CHANGE:
doscan = 1;
break;
case MR_EVT_CTRL_PROP_CHANGED:
fail_aen = mrsas_get_ctrl_info(sc);
if (fail_aen)
goto skip_register_aen;
break;
default:
break;
}
} else {
device_printf(sc->mrsas_dev, "invalid evt_detail\n");
return;
}
if (doscan) {
fail_aen = mrsas_get_pd_list(sc);
if (!fail_aen) {
mrsas_dprint(sc, MRSAS_AEN, "scanning ...sim 1\n");
mrsas_bus_scan_sim(sc, sc->sim_1);
} else
goto skip_register_aen;
fail_aen = mrsas_get_ld_list(sc);
if (!fail_aen) {
mrsas_dprint(sc, MRSAS_AEN, "scanning ...sim 0\n");
mrsas_bus_scan_sim(sc, sc->sim_0);
} else
goto skip_register_aen;
}
seq_num = sc->evt_detail_mem->seq_num + 1;
/* Register AEN with FW for latest sequence number plus 1 */
class_locale.members.reserved = 0;
class_locale.members.locale = MR_EVT_LOCALE_ALL;
class_locale.members.class = MR_EVT_CLASS_DEBUG;
if (sc->aen_cmd != NULL)
return;
mtx_lock(&sc->aen_lock);
error = mrsas_register_aen(sc, seq_num,
class_locale.word);
mtx_unlock(&sc->aen_lock);
if (error)
device_printf(sc->mrsas_dev, "register aen failed error %x\n", error);
skip_register_aen:
return;
}
/*
* mrsas_complete_aen: Completes AEN command
* input: Adapter soft state
* Completed AEN cmd
*
* This function will be called from the ISR and will continue event processing
* from thread context by enqueuing a task in ev_tq (callback function
* "mrsas_aen_handler").
*/
void
mrsas_complete_aen(struct mrsas_softc *sc, struct mrsas_mfi_cmd *cmd)
{
/*
* Don't signal app if it is just an aborted previously registered
* aen
*/
if ((!cmd->abort_aen) && (sc->remove_in_progress == 0)) {
sc->mrsas_aen_triggered = 1;
mtx_lock(&sc->aen_lock);
if (sc->mrsas_poll_waiting) {
sc->mrsas_poll_waiting = 0;
selwakeup(&sc->mrsas_select);
}
mtx_unlock(&sc->aen_lock);
} else
cmd->abort_aen = 0;
sc->aen_cmd = NULL;
mrsas_release_mfi_cmd(cmd);
taskqueue_enqueue(sc->ev_tq, &sc->ev_task);
return;
}
static device_method_t mrsas_methods[] = {
DEVMETHOD(device_probe, mrsas_probe),
DEVMETHOD(device_attach, mrsas_attach),
DEVMETHOD(device_detach, mrsas_detach),
DEVMETHOD(device_shutdown, mrsas_shutdown),
DEVMETHOD(device_suspend, mrsas_suspend),
DEVMETHOD(device_resume, mrsas_resume),
DEVMETHOD(bus_print_child, bus_generic_print_child),
DEVMETHOD(bus_driver_added, bus_generic_driver_added),
{0, 0}
};
static driver_t mrsas_driver = {
"mrsas",
mrsas_methods,
sizeof(struct mrsas_softc)
};
static devclass_t mrsas_devclass;
DRIVER_MODULE(mrsas, pci, mrsas_driver, mrsas_devclass, 0, 0);
MODULE_DEPEND(mrsas, cam, 1, 1, 1);
diff --git a/sys/dev/mvs/mvs.c b/sys/dev/mvs/mvs.c
index e018aba5bdea..80c5e8cfe05f 100644
--- a/sys/dev/mvs/mvs.c
+++ b/sys/dev/mvs/mvs.c
@@ -1,2457 +1,2455 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ata.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <vm/uma.h>
#include <machine/stdarg.h>
#include <machine/resource.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <dev/pci/pcivar.h>
#include "mvs.h"
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
/* local prototypes */
static int mvs_ch_init(device_t dev);
static int mvs_ch_deinit(device_t dev);
static int mvs_ch_suspend(device_t dev);
static int mvs_ch_resume(device_t dev);
static void mvs_dmainit(device_t dev);
static void mvs_dmasetupc_cb(void *xsc,
bus_dma_segment_t *segs, int nsegs, int error);
static void mvs_dmafini(device_t dev);
static void mvs_slotsalloc(device_t dev);
static void mvs_slotsfree(device_t dev);
static void mvs_setup_edma_queues(device_t dev);
static void mvs_set_edma_mode(device_t dev, enum mvs_edma_mode mode);
static void mvs_ch_pm(void *arg);
static void mvs_ch_intr_locked(void *data);
static void mvs_ch_intr(void *data);
static void mvs_reset(device_t dev);
static void mvs_softreset(device_t dev, union ccb *ccb);
static int mvs_sata_connect(struct mvs_channel *ch);
static int mvs_sata_phy_reset(device_t dev);
static int mvs_wait(device_t dev, u_int s, u_int c, int t);
static void mvs_tfd_read(device_t dev, union ccb *ccb);
static void mvs_tfd_write(device_t dev, union ccb *ccb);
static void mvs_legacy_intr(device_t dev, int poll);
static void mvs_crbq_intr(device_t dev);
static void mvs_begin_transaction(device_t dev, union ccb *ccb);
static void mvs_legacy_execute_transaction(struct mvs_slot *slot);
static void mvs_timeout(void *arg);
static void mvs_dmasetprd(void *arg,
bus_dma_segment_t *segs, int nsegs, int error);
static void mvs_requeue_frozen(device_t dev);
static void mvs_execute_transaction(struct mvs_slot *slot);
static void mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et);
static void mvs_issue_recovery(device_t dev);
static void mvs_process_read_log(device_t dev, union ccb *ccb);
static void mvs_process_request_sense(device_t dev, union ccb *ccb);
static void mvsaction(struct cam_sim *sim, union ccb *ccb);
static void mvspoll(struct cam_sim *sim);
static MALLOC_DEFINE(M_MVS, "MVS driver", "MVS driver data buffers");
#define recovery_type spriv_field0
#define RECOVERY_NONE 0
#define RECOVERY_READ_LOG 1
#define RECOVERY_REQUEST_SENSE 2
#define recovery_slot spriv_field1
static int
mvs_ch_probe(device_t dev)
{
device_set_desc_copy(dev, "Marvell SATA channel");
return (BUS_PROBE_DEFAULT);
}
static int
mvs_ch_attach(device_t dev)
{
struct mvs_controller *ctlr = device_get_softc(device_get_parent(dev));
struct mvs_channel *ch = device_get_softc(dev);
struct cam_devq *devq;
int rid, error, i, sata_rev = 0;
ch->dev = dev;
ch->unit = (intptr_t)device_get_ivars(dev);
ch->quirks = ctlr->quirks;
mtx_init(&ch->mtx, "MVS channel lock", NULL, MTX_DEF);
ch->pm_level = 0;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "pm_level", &ch->pm_level);
if (ch->pm_level > 3)
callout_init_mtx(&ch->pm_timer, &ch->mtx, 0);
callout_init_mtx(&ch->reset_timer, &ch->mtx, 0);
resource_int_value(device_get_name(dev),
device_get_unit(dev), "sata_rev", &sata_rev);
for (i = 0; i < 16; i++) {
ch->user[i].revision = sata_rev;
ch->user[i].mode = 0;
ch->user[i].bytecount = (ch->quirks & MVS_Q_GENIIE) ? 8192 : 2048;
ch->user[i].tags = MVS_MAX_SLOTS;
ch->curr[i] = ch->user[i];
if (ch->pm_level) {
ch->user[i].caps = CTS_SATA_CAPS_H_PMREQ |
CTS_SATA_CAPS_H_APST |
CTS_SATA_CAPS_D_PMREQ | CTS_SATA_CAPS_D_APST;
}
ch->user[i].caps |= CTS_SATA_CAPS_H_AN;
}
rid = ch->unit;
if (!(ch->r_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE)))
return (ENXIO);
mvs_dmainit(dev);
mvs_slotsalloc(dev);
mvs_ch_init(dev);
mtx_lock(&ch->mtx);
rid = ATA_IRQ_RID;
if (!(ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "Unable to map interrupt\n");
error = ENXIO;
goto err0;
}
if ((bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
mvs_ch_intr_locked, dev, &ch->ih))) {
device_printf(dev, "Unable to setup interrupt\n");
error = ENXIO;
goto err1;
}
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(MVS_MAX_SLOTS - 1);
if (devq == NULL) {
device_printf(dev, "Unable to allocate simq\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
ch->sim = cam_sim_alloc(mvsaction, mvspoll, "mvsch", ch,
device_get_unit(dev), &ch->mtx,
2, (ch->quirks & MVS_Q_GENI) ? 0 : MVS_MAX_SLOTS - 1,
devq);
if (ch->sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "unable to allocate sim\n");
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "unable to create path\n");
error = ENXIO;
goto err3;
}
if (ch->pm_level > 3) {
callout_reset(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8,
mvs_ch_pm, dev);
}
mtx_unlock(&ch->mtx);
return (0);
err3:
xpt_bus_deregister(cam_sim_path(ch->sim));
err2:
cam_sim_free(ch->sim, /*free_devq*/TRUE);
err1:
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
err0:
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_unlock(&ch->mtx);
mtx_destroy(&ch->mtx);
return (error);
}
static int
mvs_ch_detach(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_async(AC_LOST_DEVICE, ch->path, NULL);
/* Forget about reset. */
if (ch->resetting) {
ch->resetting = 0;
xpt_release_simq(ch->sim, TRUE);
}
xpt_free_path(ch->path);
xpt_bus_deregister(cam_sim_path(ch->sim));
cam_sim_free(ch->sim, /*free_devq*/TRUE);
mtx_unlock(&ch->mtx);
if (ch->pm_level > 3)
callout_drain(&ch->pm_timer);
callout_drain(&ch->reset_timer);
bus_teardown_intr(dev, ch->r_irq, ch->ih);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
mvs_ch_deinit(dev);
mvs_slotsfree(dev);
mvs_dmafini(dev);
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_destroy(&ch->mtx);
return (0);
}
static int
mvs_ch_init(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
uint32_t reg;
/* Disable port interrupts */
ATA_OUTL(ch->r_mem, EDMA_IEM, 0);
/* Stop EDMA */
ch->curr_mode = MVS_EDMA_UNKNOWN;
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
/* Clear and configure FIS interrupts. */
ATA_OUTL(ch->r_mem, SATA_FISIC, 0);
reg = ATA_INL(ch->r_mem, SATA_FISC);
reg |= SATA_FISC_FISWAIT4HOSTRDYEN_B1;
ATA_OUTL(ch->r_mem, SATA_FISC, reg);
reg = ATA_INL(ch->r_mem, SATA_FISIM);
reg |= SATA_FISC_FISWAIT4HOSTRDYEN_B1;
ATA_OUTL(ch->r_mem, SATA_FISC, reg);
/* Clear SATA error register. */
ATA_OUTL(ch->r_mem, SATA_SE, 0xffffffff);
/* Clear any outstanding error interrupts. */
ATA_OUTL(ch->r_mem, EDMA_IEC, 0);
/* Unmask all error interrupts */
ATA_OUTL(ch->r_mem, EDMA_IEM, ~EDMA_IE_TRANSIENT);
return (0);
}
static int
mvs_ch_deinit(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
/* Stop EDMA */
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
/* Disable port interrupts. */
ATA_OUTL(ch->r_mem, EDMA_IEM, 0);
return (0);
}
static int
mvs_ch_suspend(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_freeze_simq(ch->sim, 1);
while (ch->oslots)
msleep(ch, &ch->mtx, PRIBIO, "mvssusp", hz/100);
/* Forget about reset. */
if (ch->resetting) {
ch->resetting = 0;
callout_stop(&ch->reset_timer);
xpt_release_simq(ch->sim, TRUE);
}
mvs_ch_deinit(dev);
mtx_unlock(&ch->mtx);
return (0);
}
static int
mvs_ch_resume(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
mvs_ch_init(dev);
mvs_reset(dev);
xpt_release_simq(ch->sim, TRUE);
mtx_unlock(&ch->mtx);
return (0);
}
struct mvs_dc_cb_args {
bus_addr_t maddr;
int error;
};
static void
mvs_dmainit(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
struct mvs_dc_cb_args dcba;
/* EDMA command request area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 1024, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, MVS_WORKRQ_SIZE, 1, MVS_WORKRQ_SIZE,
0, NULL, NULL, &ch->dma.workrq_tag))
goto error;
if (bus_dmamem_alloc(ch->dma.workrq_tag, (void **)&ch->dma.workrq, 0,
&ch->dma.workrq_map))
goto error;
if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map,
ch->dma.workrq, MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
dcba.error) {
bus_dmamem_free(ch->dma.workrq_tag,
ch->dma.workrq, ch->dma.workrq_map);
goto error;
}
ch->dma.workrq_bus = dcba.maddr;
/* EDMA command response area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 256, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, MVS_WORKRP_SIZE, 1, MVS_WORKRP_SIZE,
0, NULL, NULL, &ch->dma.workrp_tag))
goto error;
if (bus_dmamem_alloc(ch->dma.workrp_tag, (void **)&ch->dma.workrp, 0,
&ch->dma.workrp_map))
goto error;
if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map,
ch->dma.workrp, MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
dcba.error) {
bus_dmamem_free(ch->dma.workrp_tag,
ch->dma.workrp, ch->dma.workrp_map);
goto error;
}
ch->dma.workrp_bus = dcba.maddr;
/* Data area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 2, MVS_EPRD_MAX,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL,
- MVS_SG_ENTRIES * PAGE_SIZE * MVS_MAX_SLOTS,
- MVS_SG_ENTRIES, MVS_EPRD_MAX,
+ MVS_SG_ENTRIES * PAGE_SIZE, MVS_SG_ENTRIES, MVS_EPRD_MAX,
0, busdma_lock_mutex, &ch->mtx, &ch->dma.data_tag)) {
goto error;
}
return;
error:
device_printf(dev, "WARNING - DMA initialization failed\n");
mvs_dmafini(dev);
}
static void
mvs_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mvs_dc_cb_args *dcba = (struct mvs_dc_cb_args *)xsc;
if (!(dcba->error = error))
dcba->maddr = segs[0].ds_addr;
}
static void
mvs_dmafini(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
if (ch->dma.data_tag) {
bus_dma_tag_destroy(ch->dma.data_tag);
ch->dma.data_tag = NULL;
}
if (ch->dma.workrp_bus) {
bus_dmamap_unload(ch->dma.workrp_tag, ch->dma.workrp_map);
bus_dmamem_free(ch->dma.workrp_tag,
ch->dma.workrp, ch->dma.workrp_map);
ch->dma.workrp_bus = 0;
ch->dma.workrp = NULL;
}
if (ch->dma.workrp_tag) {
bus_dma_tag_destroy(ch->dma.workrp_tag);
ch->dma.workrp_tag = NULL;
}
if (ch->dma.workrq_bus) {
bus_dmamap_unload(ch->dma.workrq_tag, ch->dma.workrq_map);
bus_dmamem_free(ch->dma.workrq_tag,
ch->dma.workrq, ch->dma.workrq_map);
ch->dma.workrq_bus = 0;
ch->dma.workrq = NULL;
}
if (ch->dma.workrq_tag) {
bus_dma_tag_destroy(ch->dma.workrq_tag);
ch->dma.workrq_tag = NULL;
}
}
static void
mvs_slotsalloc(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
/* Alloc and setup command/dma slots */
bzero(ch->slot, sizeof(ch->slot));
for (i = 0; i < MVS_MAX_SLOTS; i++) {
struct mvs_slot *slot = &ch->slot[i];
slot->dev = dev;
slot->slot = i;
slot->state = MVS_SLOT_EMPTY;
+ slot->eprd_offset = MVS_EPRD_OFFSET + MVS_EPRD_SIZE * i;
slot->ccb = NULL;
callout_init_mtx(&slot->timeout, &ch->mtx, 0);
if (bus_dmamap_create(ch->dma.data_tag, 0, &slot->dma.data_map))
device_printf(ch->dev, "FAILURE - create data_map\n");
}
}
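/*
 * Illustrative sketch only (not driver code): the per-slot EPRD placement
 * that slot->eprd_offset caches above.  The EX_* constants are stand-in
 * assumptions; the real MVS_EPRD_OFFSET/MVS_EPRD_SIZE come from mvs.h.
 */
#if 0	/* example, not compiled */
#include <stddef.h>
#include <stdio.h>

#define EX_EPRD_OFFSET	1024	/* assumed start of the EPRD area in workrq */
#define EX_EPRD_SIZE	512	/* assumed bytes reserved per slot */

int
main(void)
{
	for (int slot = 0; slot < 4; slot++)
		printf("slot %d -> workrq offset %zu\n", slot,
		    (size_t)EX_EPRD_OFFSET + (size_t)EX_EPRD_SIZE * slot);
	return (0);
}
#endif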
static void
mvs_slotsfree(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
/* Free all dma slots */
for (i = 0; i < MVS_MAX_SLOTS; i++) {
struct mvs_slot *slot = &ch->slot[i];
callout_drain(&slot->timeout);
if (slot->dma.data_map) {
bus_dmamap_destroy(ch->dma.data_tag, slot->dma.data_map);
slot->dma.data_map = NULL;
}
}
}
static void
mvs_setup_edma_queues(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
uint64_t work;
/* Requests queue. */
work = ch->dma.workrq_bus;
ATA_OUTL(ch->r_mem, EDMA_REQQBAH, work >> 32);
ATA_OUTL(ch->r_mem, EDMA_REQQIP, work & 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_REQQOP, work & 0xffffffff);
bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
BUS_DMASYNC_PREWRITE);
/* Responses queue. */
memset(ch->dma.workrp, 0xff, MVS_WORKRP_SIZE);
work = ch->dma.workrp_bus;
ATA_OUTL(ch->r_mem, EDMA_RESQBAH, work >> 32);
ATA_OUTL(ch->r_mem, EDMA_RESQIP, work & 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_RESQOP, work & 0xffffffff);
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
BUS_DMASYNC_PREREAD);
ch->out_idx = 0;
ch->in_idx = 0;
}
static void
mvs_set_edma_mode(device_t dev, enum mvs_edma_mode mode)
{
struct mvs_channel *ch = device_get_softc(dev);
int timeout;
uint32_t ecfg, fcfg, hc, ltm, unkn;
if (mode == ch->curr_mode)
return;
/* If we are running, we should stop first. */
if (ch->curr_mode != MVS_EDMA_OFF) {
ATA_OUTL(ch->r_mem, EDMA_CMD, EDMA_CMD_EDSEDMA);
timeout = 0;
while (ATA_INL(ch->r_mem, EDMA_CMD) & EDMA_CMD_EENEDMA) {
DELAY(1000);
if (timeout++ > 1000) {
device_printf(dev, "stopping EDMA engine failed\n");
break;
}
}
}
ch->curr_mode = mode;
ch->fbs_enabled = 0;
ch->fake_busy = 0;
/* Report mode to controller. Needed for correct CCC operation. */
MVS_EDMA(device_get_parent(dev), dev, mode);
/* Configure new mode. */
ecfg = EDMA_CFG_RESERVED | EDMA_CFG_RESERVED2 | EDMA_CFG_EHOSTQUEUECACHEEN;
if (ch->pm_present) {
ecfg |= EDMA_CFG_EMASKRXPM;
if (ch->quirks & MVS_Q_GENIIE) {
ecfg |= EDMA_CFG_EEDMAFBS;
ch->fbs_enabled = 1;
}
}
if (ch->quirks & MVS_Q_GENI)
ecfg |= EDMA_CFG_ERDBSZ;
else if (ch->quirks & MVS_Q_GENII)
ecfg |= EDMA_CFG_ERDBSZEXT | EDMA_CFG_EWRBUFFERLEN;
if (ch->quirks & MVS_Q_CT)
ecfg |= EDMA_CFG_ECUTTHROUGHEN;
if (mode != MVS_EDMA_OFF)
ecfg |= EDMA_CFG_EEARLYCOMPLETIONEN;
if (mode == MVS_EDMA_QUEUED)
ecfg |= EDMA_CFG_EQUE;
else if (mode == MVS_EDMA_NCQ)
ecfg |= EDMA_CFG_ESATANATVCMDQUE;
ATA_OUTL(ch->r_mem, EDMA_CFG, ecfg);
mvs_setup_edma_queues(dev);
if (ch->quirks & MVS_Q_GENIIE) {
/* Configure FBS-related registers */
fcfg = ATA_INL(ch->r_mem, SATA_FISC);
ltm = ATA_INL(ch->r_mem, SATA_LTM);
hc = ATA_INL(ch->r_mem, EDMA_HC);
if (ch->fbs_enabled) {
fcfg |= SATA_FISC_FISDMAACTIVATESYNCRESP;
if (mode == MVS_EDMA_NCQ) {
fcfg &= ~SATA_FISC_FISWAIT4HOSTRDYEN_B0;
hc &= ~EDMA_IE_EDEVERR;
} else {
fcfg |= SATA_FISC_FISWAIT4HOSTRDYEN_B0;
hc |= EDMA_IE_EDEVERR;
}
ltm |= (1 << 8);
} else {
fcfg &= ~SATA_FISC_FISDMAACTIVATESYNCRESP;
fcfg &= ~SATA_FISC_FISWAIT4HOSTRDYEN_B0;
hc |= EDMA_IE_EDEVERR;
ltm &= ~(1 << 8);
}
ATA_OUTL(ch->r_mem, SATA_FISC, fcfg);
ATA_OUTL(ch->r_mem, SATA_LTM, ltm);
ATA_OUTL(ch->r_mem, EDMA_HC, hc);
/* This is some magic, required to handle several DRQs
* with basic DMA. */
unkn = ATA_INL(ch->r_mem, EDMA_UNKN_RESD);
if (mode == MVS_EDMA_OFF)
unkn |= 1;
else
unkn &= ~1;
ATA_OUTL(ch->r_mem, EDMA_UNKN_RESD, unkn);
}
/* Run EDMA. */
if (mode != MVS_EDMA_OFF)
ATA_OUTL(ch->r_mem, EDMA_CMD, EDMA_CMD_EENEDMA);
}
devclass_t mvs_devclass;
devclass_t mvsch_devclass;
static device_method_t mvsch_methods[] = {
DEVMETHOD(device_probe, mvs_ch_probe),
DEVMETHOD(device_attach, mvs_ch_attach),
DEVMETHOD(device_detach, mvs_ch_detach),
DEVMETHOD(device_suspend, mvs_ch_suspend),
DEVMETHOD(device_resume, mvs_ch_resume),
{ 0, 0 }
};
static driver_t mvsch_driver = {
"mvsch",
mvsch_methods,
sizeof(struct mvs_channel)
};
DRIVER_MODULE(mvsch, mvs, mvsch_driver, mvsch_devclass, 0, 0);
DRIVER_MODULE(mvsch, sata, mvsch_driver, mvsch_devclass, 0, 0);
static void
mvs_phy_check_events(device_t dev, u_int32_t serr)
{
struct mvs_channel *ch = device_get_softc(dev);
if (ch->pm_level == 0) {
u_int32_t status = ATA_INL(ch->r_mem, SATA_SS);
union ccb *ccb;
if (bootverbose) {
if (((status & SATA_SS_DET_MASK) == SATA_SS_DET_PHY_ONLINE) &&
((status & SATA_SS_SPD_MASK) != SATA_SS_SPD_NO_SPEED) &&
((status & SATA_SS_IPM_MASK) == SATA_SS_IPM_ACTIVE)) {
device_printf(dev, "CONNECT requested\n");
} else
device_printf(dev, "DISCONNECT requested\n");
}
mvs_reset(dev);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return;
if (xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
}
}
static void
mvs_notify_events(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
struct cam_path *dpath;
uint32_t fis;
int d;
/* Try to read PMP field from SDB FIS. Present only for Gen-IIe. */
fis = ATA_INL(ch->r_mem, SATA_FISDW0);
if ((fis & 0x80ff) == 0x80a1)
d = (fis & 0x0f00) >> 8;
else
d = ch->pm_present ? 15 : 0;
if (bootverbose)
device_printf(dev, "SNTF %d\n", d);
if (xpt_create_path(&dpath, NULL,
xpt_path_path_id(ch->path), d, 0) == CAM_REQ_CMP) {
xpt_async(AC_SCSI_AEN, dpath, NULL);
xpt_free_path(dpath);
}
}
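/*
 * Illustrative sketch only (not driver code): one reading of the SDB FIS
 * word-0 check above.  Bits 7:0 carry the FIS type (0xa1 for Set Device
 * Bits), bit 15 the notification flag, and bits 11:8 the PMP port, so the
 * 0x80ff mask selects "SDB FIS with notification set".  The sample value
 * below is made up for the demonstration.
 */
#if 0	/* example, not compiled */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t fis = 0x000083a1;	/* hypothetical SDB FIS from PMP port 3 */

	if ((fis & 0x80ff) == 0x80a1)
		printf("notification from PMP port %u\n",
		    (unsigned)((fis & 0x0f00) >> 8));
	return (0);
}
#endif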
static void
mvs_ch_intr_locked(void *data)
{
struct mvs_intr_arg *arg = (struct mvs_intr_arg *)data;
device_t dev = (device_t)arg->arg;
struct mvs_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
mvs_ch_intr(data);
mtx_unlock(&ch->mtx);
}
static void
mvs_ch_pm(void *arg)
{
device_t dev = (device_t)arg;
struct mvs_channel *ch = device_get_softc(dev);
uint32_t work;
if (ch->numrslots != 0)
return;
/* If we are idle - request power state transition. */
work = ATA_INL(ch->r_mem, SATA_SC);
work &= ~SATA_SC_SPM_MASK;
if (ch->pm_level == 4)
work |= SATA_SC_SPM_PARTIAL;
else
work |= SATA_SC_SPM_SLUMBER;
ATA_OUTL(ch->r_mem, SATA_SC, work);
}
static void
mvs_ch_pm_wake(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
uint32_t work;
int timeout = 0;
work = ATA_INL(ch->r_mem, SATA_SS);
if (work & SATA_SS_IPM_ACTIVE)
return;
/* If we are not in active state - request power state transition. */
work = ATA_INL(ch->r_mem, SATA_SC);
work &= ~SATA_SC_SPM_MASK;
work |= SATA_SC_SPM_ACTIVE;
ATA_OUTL(ch->r_mem, SATA_SC, work);
/* Wait for transition to happen. */
while ((ATA_INL(ch->r_mem, SATA_SS) & SATA_SS_IPM_ACTIVE) == 0 &&
timeout++ < 100) {
DELAY(100);
}
}
static void
mvs_ch_intr(void *data)
{
struct mvs_intr_arg *arg = (struct mvs_intr_arg *)data;
device_t dev = (device_t)arg->arg;
struct mvs_channel *ch = device_get_softc(dev);
uint32_t iec, serr = 0, fisic = 0;
enum mvs_err_type et;
int i, ccs, port = -1, selfdis = 0;
int edma = (ch->numtslots != 0 || ch->numdslots != 0);
/* New item in response queue. */
if ((arg->cause & 2) && edma)
mvs_crbq_intr(dev);
/* Some error or special event. */
if (arg->cause & 1) {
iec = ATA_INL(ch->r_mem, EDMA_IEC);
if (iec & EDMA_IE_SERRINT) {
serr = ATA_INL(ch->r_mem, SATA_SE);
ATA_OUTL(ch->r_mem, SATA_SE, serr);
}
/* EDMA self-disabled due to error. */
if (iec & EDMA_IE_ESELFDIS)
selfdis = 1;
/* Transport interrupt. */
if (iec & EDMA_IE_ETRANSINT) {
/* For Gen-I this bit means self-disable. */
if (ch->quirks & MVS_Q_GENI)
selfdis = 1;
/* For Gen-II this bit means SDB-N. */
else if (ch->quirks & MVS_Q_GENII)
fisic = SATA_FISC_FISWAIT4HOSTRDYEN_B1;
else /* For Gen-IIe - read FIS interrupt cause. */
fisic = ATA_INL(ch->r_mem, SATA_FISIC);
}
if (selfdis)
ch->curr_mode = MVS_EDMA_UNKNOWN;
ATA_OUTL(ch->r_mem, EDMA_IEC, ~iec);
/* Interface errors or Device error. */
if (iec & (0xfc1e9000 | EDMA_IE_EDEVERR)) {
port = -1;
if (ch->numpslots != 0) {
ccs = 0;
} else {
if (ch->quirks & MVS_Q_GENIIE)
ccs = EDMA_S_EIOID(ATA_INL(ch->r_mem, EDMA_S));
else
ccs = EDMA_S_EDEVQUETAG(ATA_INL(ch->r_mem, EDMA_S));
/* Check if error is one-PMP-port-specific, */
if (ch->fbs_enabled) {
/* Which ports were active. */
for (i = 0; i < 16; i++) {
if (ch->numrslotspd[i] == 0)
continue;
if (port == -1)
port = i;
else if (port != i) {
port = -2;
break;
}
}
/* If several ports were active and EDMA still enabled -
* other ports are probably unaffected and may continue.
*/
if (port == -2 && !selfdis) {
uint16_t p = ATA_INL(ch->r_mem, SATA_SATAITC) >> 16;
port = ffs(p) - 1;
if (port != (fls(p) - 1))
port = -2;
}
}
}
mvs_requeue_frozen(dev);
for (i = 0; i < MVS_MAX_SLOTS; i++) {
/* XXX: requests in loading state. */
if (((ch->rslots >> i) & 1) == 0)
continue;
if (port >= 0 &&
ch->slot[i].ccb->ccb_h.target_id != port)
continue;
if (iec & EDMA_IE_EDEVERR) { /* Device error. */
if (port != -2) {
if (ch->numtslots == 0) {
/* Untagged operation. */
if (i == ccs)
et = MVS_ERR_TFE;
else
et = MVS_ERR_INNOCENT;
} else {
/* Tagged operation. */
et = MVS_ERR_NCQ;
}
} else {
et = MVS_ERR_TFE;
ch->fatalerr = 1;
}
} else if (iec & 0xfc1e9000) {
if (ch->numtslots == 0 &&
i != ccs && port != -2)
et = MVS_ERR_INNOCENT;
else
et = MVS_ERR_SATA;
} else
et = MVS_ERR_INVALID;
mvs_end_transaction(&ch->slot[i], et);
}
}
/* Process SDB-N. */
if (fisic & SATA_FISC_FISWAIT4HOSTRDYEN_B1)
mvs_notify_events(dev);
if (fisic)
ATA_OUTL(ch->r_mem, SATA_FISIC, ~fisic);
/* Process hot-plug. */
if ((iec & (EDMA_IE_EDEVDIS | EDMA_IE_EDEVCON)) ||
(serr & SATA_SE_PHY_CHANGED))
mvs_phy_check_events(dev, serr);
}
/* Legacy mode device interrupt. */
if ((arg->cause & 2) && !edma)
mvs_legacy_intr(dev, arg->cause & 4);
}
static uint8_t
mvs_getstatus(device_t dev, int clear)
{
struct mvs_channel *ch = device_get_softc(dev);
uint8_t status = ATA_INB(ch->r_mem, clear ? ATA_STATUS : ATA_ALTSTAT);
if (ch->fake_busy) {
if (status & (ATA_S_BUSY | ATA_S_DRQ | ATA_S_ERROR))
ch->fake_busy = 0;
else
status |= ATA_S_BUSY;
}
return (status);
}
static void
mvs_legacy_intr(device_t dev, int poll)
{
struct mvs_channel *ch = device_get_softc(dev);
struct mvs_slot *slot = &ch->slot[0]; /* PIO is always in slot 0. */
union ccb *ccb = slot->ccb;
enum mvs_err_type et = MVS_ERR_NONE;
int port;
u_int length, resid, size;
uint8_t buf[2];
uint8_t status, ireason;
/* Clear interrupt and get status. */
status = mvs_getstatus(dev, 1);
if (slot->state < MVS_SLOT_RUNNING)
return;
port = ccb->ccb_h.target_id & 0x0f;
/* Wait a bit for late !BUSY status update. */
if (status & ATA_S_BUSY) {
if (poll)
return;
DELAY(100);
if ((status = mvs_getstatus(dev, 1)) & ATA_S_BUSY) {
DELAY(1000);
if ((status = mvs_getstatus(dev, 1)) & ATA_S_BUSY)
return;
}
}
/* If we got an error, we are done. */
if (status & ATA_S_ERROR) {
et = MVS_ERR_TFE;
goto end_finished;
}
if (ccb->ccb_h.func_code == XPT_ATA_IO) { /* ATA PIO */
ccb->ataio.res.status = status;
/* Are we moving data? */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
/* If data read command - get them. */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
device_printf(dev, "timeout waiting for read DRQ\n");
et = MVS_ERR_TIMEOUT;
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
goto end_finished;
}
ATA_INSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->ataio.data_ptr + ch->donecount),
ch->transfersize / 2);
}
/* Update how far we've gotten. */
ch->donecount += ch->transfersize;
/* Do we need more? */
if (ccb->ataio.dxfer_len > ch->donecount) {
/* Set this transfer size according to HW capabilities */
ch->transfersize = min(ccb->ataio.dxfer_len - ch->donecount,
ch->transfersize);
/* If data write command - put them */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
device_printf(dev,
"timeout waiting for write DRQ\n");
et = MVS_ERR_TIMEOUT;
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
goto end_finished;
}
ATA_OUTSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->ataio.data_ptr + ch->donecount),
ch->transfersize / 2);
return;
}
/* If data read command, return & wait for interrupt */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
return;
}
}
} else if (ch->basic_dma) { /* ATAPI DMA */
if (status & ATA_S_DWF)
et = MVS_ERR_TFE;
else if (ATA_INL(ch->r_mem, DMA_S) & DMA_S_ERR)
et = MVS_ERR_TFE;
/* Stop basic DMA. */
ATA_OUTL(ch->r_mem, DMA_C, 0);
goto end_finished;
} else { /* ATAPI PIO */
length = ATA_INB(ch->r_mem,ATA_CYL_LSB) |
(ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8);
size = min(ch->transfersize, length);
ireason = ATA_INB(ch->r_mem,ATA_IREASON);
switch ((ireason & (ATA_I_CMD | ATA_I_IN)) |
(status & ATA_S_DRQ)) {
case ATAPI_P_CMDOUT:
device_printf(dev, "ATAPI CMDOUT\n");
/* Return wait for interrupt */
return;
case ATAPI_P_WRITE:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
device_printf(dev, "trying to write on read buffer\n");
et = MVS_ERR_TFE;
goto end_finished;
break;
}
ATA_OUTSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->csio.data_ptr + ch->donecount),
(size + 1) / 2);
for (resid = ch->transfersize + (size & 1);
resid < length; resid += sizeof(int16_t))
ATA_OUTW(ch->r_mem, ATA_DATA, 0);
ch->donecount += length;
/* Set next transfer size according to HW capabilities */
ch->transfersize = min(ccb->csio.dxfer_len - ch->donecount,
ch->curr[ccb->ccb_h.target_id].bytecount);
/* Return wait for interrupt */
return;
case ATAPI_P_READ:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
device_printf(dev, "trying to read on write buffer\n");
et = MVS_ERR_TFE;
goto end_finished;
}
if (size >= 2) {
ATA_INSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->csio.data_ptr + ch->donecount),
size / 2);
}
if (size & 1) {
ATA_INSW_STRM(ch->r_mem, ATA_DATA, (void*)buf, 1);
((uint8_t *)ccb->csio.data_ptr + ch->donecount +
(size & ~1))[0] = buf[0];
}
for (resid = ch->transfersize + (size & 1);
resid < length; resid += sizeof(int16_t))
ATA_INW(ch->r_mem, ATA_DATA);
ch->donecount += length;
/* Set next transfer size according to HW capabilities */
ch->transfersize = min(ccb->csio.dxfer_len - ch->donecount,
ch->curr[ccb->ccb_h.target_id].bytecount);
/* Return wait for interrupt */
return;
case ATAPI_P_DONEDRQ:
device_printf(dev,
"WARNING - DONEDRQ non conformant device\n");
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
ATA_INSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->csio.data_ptr + ch->donecount),
length / 2);
ch->donecount += length;
}
else if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
ATA_OUTSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->csio.data_ptr + ch->donecount),
length / 2);
ch->donecount += length;
}
else
et = MVS_ERR_TFE;
/* FALLTHROUGH */
case ATAPI_P_ABORT:
case ATAPI_P_DONE:
if (status & (ATA_S_ERROR | ATA_S_DWF))
et = MVS_ERR_TFE;
goto end_finished;
default:
device_printf(dev, "unknown transfer phase"
" (status %02x, ireason %02x)\n",
status, ireason);
et = MVS_ERR_TFE;
}
}
end_finished:
mvs_end_transaction(slot, et);
}
static void
mvs_crbq_intr(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
struct mvs_crpb *crpb;
union ccb *ccb;
int in_idx, fin_idx, cin_idx, slot;
uint32_t val;
uint16_t flags;
val = ATA_INL(ch->r_mem, EDMA_RESQIP);
if (val == 0)
val = ATA_INL(ch->r_mem, EDMA_RESQIP);
in_idx = (val & EDMA_RESQP_ERPQP_MASK) >>
EDMA_RESQP_ERPQP_SHIFT;
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
BUS_DMASYNC_POSTREAD);
fin_idx = cin_idx = ch->in_idx;
ch->in_idx = in_idx;
while (in_idx != cin_idx) {
crpb = (struct mvs_crpb *)
(ch->dma.workrp + MVS_CRPB_OFFSET +
(MVS_CRPB_SIZE * cin_idx));
slot = le16toh(crpb->id) & MVS_CRPB_TAG_MASK;
flags = le16toh(crpb->rspflg);
/*
* Handle only successful completions here.
* Errors will be handled by main intr handler.
*/
#if defined(__i386__) || defined(__amd64__)
if (crpb->id == 0xffff && crpb->rspflg == 0xffff) {
device_printf(dev, "Unfilled CRPB "
"%d (%d->%d) tag %d flags %04x rs %08x\n",
cin_idx, fin_idx, in_idx, slot, flags, ch->rslots);
} else
#endif
if (ch->numtslots != 0 ||
(flags & EDMA_IE_EDEVERR) == 0) {
#if defined(__i386__) || defined(__amd64__)
crpb->id = 0xffff;
crpb->rspflg = 0xffff;
#endif
if (ch->slot[slot].state >= MVS_SLOT_RUNNING) {
ccb = ch->slot[slot].ccb;
ccb->ataio.res.status =
(flags & MVS_CRPB_ATASTS_MASK) >>
MVS_CRPB_ATASTS_SHIFT;
mvs_end_transaction(&ch->slot[slot], MVS_ERR_NONE);
} else {
device_printf(dev, "Unused tag in CRPB "
"%d (%d->%d) tag %d flags %04x rs %08x\n",
cin_idx, fin_idx, in_idx, slot, flags,
ch->rslots);
}
} else {
device_printf(dev,
"CRPB with error %d tag %d flags %04x\n",
cin_idx, slot, flags);
}
cin_idx = (cin_idx + 1) & (MVS_MAX_SLOTS - 1);
}
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
BUS_DMASYNC_PREREAD);
if (cin_idx == ch->in_idx) {
ATA_OUTL(ch->r_mem, EDMA_RESQOP,
ch->dma.workrp_bus | (cin_idx << EDMA_RESQP_ERPQP_SHIFT));
}
}
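/*
 * Illustrative sketch only (not driver code): the completion-queue index
 * arithmetic used above.  Because the slot count is a power of two, the
 * wrap-around is a simple mask instead of a modulo.  EX_SLOTS is an
 * assumed stand-in for MVS_MAX_SLOTS.
 */
#if 0	/* example, not compiled */
#include <stdio.h>

#define EX_SLOTS 32	/* assumed power-of-two queue depth */

int
main(void)
{
	int idx = EX_SLOTS - 2;

	for (int i = 0; i < 4; i++) {
		printf("%d ", idx);		/* prints: 30 31 0 1 */
		idx = (idx + 1) & (EX_SLOTS - 1);
	}
	printf("\n");
	return (0);
}
#endif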
/* Must be called with channel locked. */
static int
mvs_check_collision(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
/* NCQ DMA */
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
/* Can't mix NCQ and non-NCQ DMA commands. */
if (ch->numdslots != 0)
return (1);
/* Can't mix NCQ and PIO commands. */
if (ch->numpslots != 0)
return (1);
/* If we have no FBS */
if (!ch->fbs_enabled) {
/* Tagged command while tagged to other target is active. */
if (ch->numtslots != 0 &&
ch->taggedtarget != ccb->ccb_h.target_id)
return (1);
}
/* Non-NCQ DMA */
} else if (ccb->ataio.cmd.flags & CAM_ATAIO_DMA) {
/* Can't mix non-NCQ DMA and NCQ commands. */
if (ch->numtslots != 0)
return (1);
/* Can't mix non-NCQ DMA and PIO commands. */
if (ch->numpslots != 0)
return (1);
/* PIO */
} else {
/* Can't mix PIO with anything. */
if (ch->numrslots != 0)
return (1);
}
if (ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT)) {
/* Atomic command while anything active. */
if (ch->numrslots != 0)
return (1);
}
} else { /* ATAPI */
/* ATAPI goes without EDMA, so can't mix it with anything. */
if (ch->numrslots != 0)
return (1);
}
/* We have some atomic command running. */
if (ch->aslots != 0)
return (1);
return (0);
}
static void
mvs_tfd_read(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
struct ata_res *res = &ccb->ataio.res;
res->status = ATA_INB(ch->r_mem, ATA_ALTSTAT);
res->error = ATA_INB(ch->r_mem, ATA_ERROR);
res->device = ATA_INB(ch->r_mem, ATA_DRIVE);
ATA_OUTB(ch->r_mem, ATA_CONTROL, ATA_A_HOB);
res->sector_count_exp = ATA_INB(ch->r_mem, ATA_COUNT);
res->lba_low_exp = ATA_INB(ch->r_mem, ATA_SECTOR);
res->lba_mid_exp = ATA_INB(ch->r_mem, ATA_CYL_LSB);
res->lba_high_exp = ATA_INB(ch->r_mem, ATA_CYL_MSB);
ATA_OUTB(ch->r_mem, ATA_CONTROL, 0);
res->sector_count = ATA_INB(ch->r_mem, ATA_COUNT);
res->lba_low = ATA_INB(ch->r_mem, ATA_SECTOR);
res->lba_mid = ATA_INB(ch->r_mem, ATA_CYL_LSB);
res->lba_high = ATA_INB(ch->r_mem, ATA_CYL_MSB);
}
static void
mvs_tfd_write(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
struct ata_cmd *cmd = &ccb->ataio.cmd;
ATA_OUTB(ch->r_mem, ATA_DRIVE, cmd->device);
ATA_OUTB(ch->r_mem, ATA_CONTROL, cmd->control);
ATA_OUTB(ch->r_mem, ATA_FEATURE, cmd->features_exp);
ATA_OUTB(ch->r_mem, ATA_FEATURE, cmd->features);
ATA_OUTB(ch->r_mem, ATA_COUNT, cmd->sector_count_exp);
ATA_OUTB(ch->r_mem, ATA_COUNT, cmd->sector_count);
ATA_OUTB(ch->r_mem, ATA_SECTOR, cmd->lba_low_exp);
ATA_OUTB(ch->r_mem, ATA_SECTOR, cmd->lba_low);
ATA_OUTB(ch->r_mem, ATA_CYL_LSB, cmd->lba_mid_exp);
ATA_OUTB(ch->r_mem, ATA_CYL_LSB, cmd->lba_mid);
ATA_OUTB(ch->r_mem, ATA_CYL_MSB, cmd->lba_high_exp);
ATA_OUTB(ch->r_mem, ATA_CYL_MSB, cmd->lba_high);
ATA_OUTB(ch->r_mem, ATA_COMMAND, cmd->command);
}
/* Must be called with channel locked. */
static void
mvs_begin_transaction(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
struct mvs_slot *slot;
int slotn, tag;
if (ch->pm_level > 0)
mvs_ch_pm_wake(dev);
/* Softreset is a special case. */
if (ccb->ccb_h.func_code == XPT_ATA_IO &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL)) {
mvs_softreset(dev, ccb);
return;
}
/* Choose empty slot. */
slotn = ffs(~ch->oslots) - 1;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
if (ch->quirks & MVS_Q_GENIIE)
tag = ffs(~ch->otagspd[ccb->ccb_h.target_id]) - 1;
else
tag = slotn;
} else
tag = 0;
/* Occupy chosen slot. */
slot = &ch->slot[slotn];
slot->ccb = ccb;
slot->tag = tag;
/* Stop PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3)
callout_stop(&ch->pm_timer);
/* Update channel stats. */
ch->oslots |= (1 << slot->slot);
ch->numrslots++;
ch->numrslotspd[ccb->ccb_h.target_id]++;
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
ch->otagspd[ccb->ccb_h.target_id] |= (1 << slot->tag);
ch->numtslots++;
ch->numtslotspd[ccb->ccb_h.target_id]++;
ch->taggedtarget = ccb->ccb_h.target_id;
mvs_set_edma_mode(dev, MVS_EDMA_NCQ);
} else if (ccb->ataio.cmd.flags & CAM_ATAIO_DMA) {
ch->numdslots++;
mvs_set_edma_mode(dev, MVS_EDMA_ON);
} else {
ch->numpslots++;
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
}
if (ccb->ataio.cmd.flags &
(CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT)) {
ch->aslots |= (1 << slot->slot);
}
} else {
uint8_t *cdb = (ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes;
ch->numpslots++;
/* Use ATAPI DMA only for commands without under-/overruns. */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA &&
(ch->quirks & MVS_Q_SOC) == 0 &&
(cdb[0] == 0x08 ||
cdb[0] == 0x0a ||
cdb[0] == 0x28 ||
cdb[0] == 0x2a ||
cdb[0] == 0x88 ||
cdb[0] == 0x8a ||
cdb[0] == 0xa8 ||
cdb[0] == 0xaa ||
cdb[0] == 0xbe)) {
ch->basic_dma = 1;
}
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
}
if (ch->numpslots == 0 || ch->basic_dma) {
slot->state = MVS_SLOT_LOADING;
bus_dmamap_load_ccb(ch->dma.data_tag, slot->dma.data_map,
ccb, mvs_dmasetprd, slot, 0);
} else
mvs_legacy_execute_transaction(slot);
}
/* Locked by busdma engine. */
static void
mvs_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mvs_slot *slot = arg;
struct mvs_channel *ch = device_get_softc(slot->dev);
struct mvs_eprd *eprd;
int i;
if (error) {
device_printf(slot->dev, "DMA load error\n");
mvs_end_transaction(slot, MVS_ERR_INVALID);
return;
}
KASSERT(nsegs <= MVS_SG_ENTRIES, ("too many DMA segment entries\n"));
/* If there is only one segment - no need to use S/G table on Gen-IIe. */
if (nsegs == 1 && ch->basic_dma == 0 && (ch->quirks & MVS_Q_GENIIE)) {
slot->dma.addr = segs[0].ds_addr;
slot->dma.len = segs[0].ds_len;
} else {
slot->dma.addr = 0;
/* Get a piece of the workspace for this EPRD */
- eprd = (struct mvs_eprd *)
- (ch->dma.workrq + MVS_EPRD_OFFSET + (MVS_EPRD_SIZE * slot->slot));
+ eprd = (struct mvs_eprd *)(ch->dma.workrq + slot->eprd_offset);
/* Fill S/G table */
for (i = 0; i < nsegs; i++) {
eprd[i].prdbal = htole32(segs[i].ds_addr);
eprd[i].bytecount = htole32(segs[i].ds_len & MVS_EPRD_MASK);
eprd[i].prdbah = htole32((segs[i].ds_addr >> 16) >> 16);
}
eprd[i - 1].bytecount |= htole32(MVS_EPRD_EOF);
}
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
((slot->ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE));
if (ch->basic_dma)
mvs_legacy_execute_transaction(slot);
else
mvs_execute_transaction(slot);
}
static void
mvs_legacy_execute_transaction(struct mvs_slot *slot)
{
device_t dev = slot->dev;
struct mvs_channel *ch = device_get_softc(dev);
bus_addr_t eprd;
union ccb *ccb = slot->ccb;
int port = ccb->ccb_h.target_id & 0x0f;
int timeout;
slot->state = MVS_SLOT_RUNNING;
ch->rslots |= (1 << slot->slot);
ATA_OUTB(ch->r_mem, SATA_SATAICTL, port << SATA_SATAICTL_PMPTX_SHIFT);
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
mvs_tfd_write(dev, ccb);
/* Device reset doesn't interrupt. */
if (ccb->ataio.cmd.command == ATA_DEVICE_RESET) {
int timeout = 1000000;
do {
DELAY(10);
ccb->ataio.res.status = ATA_INB(ch->r_mem, ATA_STATUS);
} while (ccb->ataio.res.status & ATA_S_BUSY && timeout--);
mvs_legacy_intr(dev, 1);
return;
}
ch->donecount = 0;
if (ccb->ataio.cmd.command == ATA_READ_MUL ||
ccb->ataio.cmd.command == ATA_READ_MUL48 ||
ccb->ataio.cmd.command == ATA_WRITE_MUL ||
ccb->ataio.cmd.command == ATA_WRITE_MUL48) {
ch->transfersize = min(ccb->ataio.dxfer_len,
ch->curr[port].bytecount);
} else
ch->transfersize = min(ccb->ataio.dxfer_len, 512);
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE)
ch->fake_busy = 1;
/* If data write command - output the data */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
device_printf(dev,
"timeout waiting for write DRQ\n");
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
return;
}
ATA_OUTSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)(ccb->ataio.data_ptr + ch->donecount),
ch->transfersize / 2);
}
} else {
ch->donecount = 0;
ch->transfersize = min(ccb->csio.dxfer_len,
ch->curr[port].bytecount);
/* Write ATA PACKET command. */
if (ch->basic_dma) {
ATA_OUTB(ch->r_mem, ATA_FEATURE, ATA_F_DMA);
ATA_OUTB(ch->r_mem, ATA_CYL_LSB, 0);
ATA_OUTB(ch->r_mem, ATA_CYL_MSB, 0);
} else {
ATA_OUTB(ch->r_mem, ATA_FEATURE, 0);
ATA_OUTB(ch->r_mem, ATA_CYL_LSB, ch->transfersize);
ATA_OUTB(ch->r_mem, ATA_CYL_MSB, ch->transfersize >> 8);
}
ATA_OUTB(ch->r_mem, ATA_COMMAND, ATA_PACKET_CMD);
ch->fake_busy = 1;
/* Wait for ready to write ATAPI command block */
if (mvs_wait(dev, 0, ATA_S_BUSY, 1000) < 0) {
device_printf(dev, "timeout waiting for ATAPI !BUSY\n");
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
return;
}
timeout = 5000;
while (timeout--) {
int reason = ATA_INB(ch->r_mem, ATA_IREASON);
int status = ATA_INB(ch->r_mem, ATA_STATUS);
if (((reason & (ATA_I_CMD | ATA_I_IN)) |
(status & (ATA_S_DRQ | ATA_S_BUSY))) == ATAPI_P_CMDOUT)
break;
DELAY(20);
}
if (timeout <= 0) {
device_printf(dev,
"timeout waiting for ATAPI command ready\n");
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
return;
}
/* Write ATAPI command. */
ATA_OUTSW_STRM(ch->r_mem, ATA_DATA,
(uint16_t *)((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes),
ch->curr[port].atapi / 2);
DELAY(10);
if (ch->basic_dma) {
/* Start basic DMA. */
- eprd = ch->dma.workrq_bus + MVS_EPRD_OFFSET +
- (MVS_EPRD_SIZE * slot->slot);
+ eprd = ch->dma.workrq_bus + slot->eprd_offset;
ATA_OUTL(ch->r_mem, DMA_DTLBA, eprd);
ATA_OUTL(ch->r_mem, DMA_DTHBA, (eprd >> 16) >> 16);
ATA_OUTL(ch->r_mem, DMA_C, DMA_C_START |
(((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) ?
DMA_C_READ : 0));
}
}
/* Start command execution timeout */
callout_reset_sbt(&slot->timeout, SBT_1MS * ccb->ccb_h.timeout, 0,
mvs_timeout, slot, 0);
}
/* Must be called with channel locked. */
static void
mvs_execute_transaction(struct mvs_slot *slot)
{
device_t dev = slot->dev;
struct mvs_channel *ch = device_get_softc(dev);
bus_addr_t eprd;
struct mvs_crqb *crqb;
struct mvs_crqb_gen2e *crqb2e;
union ccb *ccb = slot->ccb;
int port = ccb->ccb_h.target_id & 0x0f;
int i;
/* Get address of the prepared EPRD */
- eprd = ch->dma.workrq_bus + MVS_EPRD_OFFSET + (MVS_EPRD_SIZE * slot->slot);
+ eprd = ch->dma.workrq_bus + slot->eprd_offset;
/* Prepare CRQB. Gen IIe uses different CRQB format. */
if (ch->quirks & MVS_Q_GENIIE) {
crqb2e = (struct mvs_crqb_gen2e *)
(ch->dma.workrq + MVS_CRQB_OFFSET + (MVS_CRQB_SIZE * ch->out_idx));
crqb2e->ctrlflg = htole32(
((ccb->ccb_h.flags & CAM_DIR_IN) ? MVS_CRQB2E_READ : 0) |
(slot->tag << MVS_CRQB2E_DTAG_SHIFT) |
(port << MVS_CRQB2E_PMP_SHIFT) |
(slot->slot << MVS_CRQB2E_HTAG_SHIFT));
/* If there is only one segment - no need to use S/G table. */
if (slot->dma.addr != 0) {
eprd = slot->dma.addr;
crqb2e->ctrlflg |= htole32(MVS_CRQB2E_CPRD);
crqb2e->drbc = slot->dma.len;
}
crqb2e->cprdbl = htole32(eprd);
crqb2e->cprdbh = htole32((eprd >> 16) >> 16);
crqb2e->cmd[0] = 0;
crqb2e->cmd[1] = 0;
crqb2e->cmd[2] = ccb->ataio.cmd.command;
crqb2e->cmd[3] = ccb->ataio.cmd.features;
crqb2e->cmd[4] = ccb->ataio.cmd.lba_low;
crqb2e->cmd[5] = ccb->ataio.cmd.lba_mid;
crqb2e->cmd[6] = ccb->ataio.cmd.lba_high;
crqb2e->cmd[7] = ccb->ataio.cmd.device;
crqb2e->cmd[8] = ccb->ataio.cmd.lba_low_exp;
crqb2e->cmd[9] = ccb->ataio.cmd.lba_mid_exp;
crqb2e->cmd[10] = ccb->ataio.cmd.lba_high_exp;
crqb2e->cmd[11] = ccb->ataio.cmd.features_exp;
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
crqb2e->cmd[12] = slot->tag << 3;
crqb2e->cmd[13] = 0;
} else {
crqb2e->cmd[12] = ccb->ataio.cmd.sector_count;
crqb2e->cmd[13] = ccb->ataio.cmd.sector_count_exp;
}
crqb2e->cmd[14] = 0;
crqb2e->cmd[15] = 0;
} else {
crqb = (struct mvs_crqb *)
(ch->dma.workrq + MVS_CRQB_OFFSET + (MVS_CRQB_SIZE * ch->out_idx));
crqb->cprdbl = htole32(eprd);
crqb->cprdbh = htole32((eprd >> 16) >> 16);
crqb->ctrlflg = htole16(
((ccb->ccb_h.flags & CAM_DIR_IN) ? MVS_CRQB_READ : 0) |
(slot->slot << MVS_CRQB_TAG_SHIFT) |
(port << MVS_CRQB_PMP_SHIFT));
i = 0;
/*
* Controller can handle only 11 of 12 ATA registers,
* so we have to choose which one to skip.
*/
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
crqb->cmd[i++] = ccb->ataio.cmd.features_exp;
crqb->cmd[i++] = 0x11;
}
crqb->cmd[i++] = ccb->ataio.cmd.features;
crqb->cmd[i++] = 0x11;
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
crqb->cmd[i++] = (slot->tag << 3) |
(ccb->ataio.cmd.sector_count & 0x07);
crqb->cmd[i++] = 0x12;
} else {
crqb->cmd[i++] = ccb->ataio.cmd.sector_count_exp;
crqb->cmd[i++] = 0x12;
crqb->cmd[i++] = ccb->ataio.cmd.sector_count;
crqb->cmd[i++] = 0x12;
}
crqb->cmd[i++] = ccb->ataio.cmd.lba_low_exp;
crqb->cmd[i++] = 0x13;
crqb->cmd[i++] = ccb->ataio.cmd.lba_low;
crqb->cmd[i++] = 0x13;
crqb->cmd[i++] = ccb->ataio.cmd.lba_mid_exp;
crqb->cmd[i++] = 0x14;
crqb->cmd[i++] = ccb->ataio.cmd.lba_mid;
crqb->cmd[i++] = 0x14;
crqb->cmd[i++] = ccb->ataio.cmd.lba_high_exp;
crqb->cmd[i++] = 0x15;
crqb->cmd[i++] = ccb->ataio.cmd.lba_high;
crqb->cmd[i++] = 0x15;
crqb->cmd[i++] = ccb->ataio.cmd.device;
crqb->cmd[i++] = 0x16;
crqb->cmd[i++] = ccb->ataio.cmd.command;
crqb->cmd[i++] = 0x97;
}
bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
BUS_DMASYNC_PREWRITE);
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
BUS_DMASYNC_PREREAD);
slot->state = MVS_SLOT_RUNNING;
ch->rslots |= (1 << slot->slot);
/* Issue command to the controller. */
ch->out_idx = (ch->out_idx + 1) & (MVS_MAX_SLOTS - 1);
ATA_OUTL(ch->r_mem, EDMA_REQQIP,
ch->dma.workrq_bus + MVS_CRQB_OFFSET + (MVS_CRQB_SIZE * ch->out_idx));
/* Start command execution timeout */
callout_reset_sbt(&slot->timeout, SBT_1MS * ccb->ccb_h.timeout, 0,
mvs_timeout, slot, 0);
return;
}
/* Must be called with channel locked. */
static void
mvs_process_timeout(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
mtx_assert(&ch->mtx, MA_OWNED);
/* Handle the rest of commands. */
for (i = 0; i < MVS_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < MVS_SLOT_RUNNING)
continue;
mvs_end_transaction(&ch->slot[i], MVS_ERR_TIMEOUT);
}
}
/* Must be called with channel locked. */
static void
mvs_rearm_timeout(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
mtx_assert(&ch->mtx, MA_OWNED);
for (i = 0; i < MVS_MAX_SLOTS; i++) {
struct mvs_slot *slot = &ch->slot[i];
/* Do we have a running request on slot? */
if (slot->state < MVS_SLOT_RUNNING)
continue;
if ((ch->toslots & (1 << i)) == 0)
continue;
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout / 2, 0,
mvs_timeout, slot, 0);
}
}
/* Locked by callout mechanism. */
static void
mvs_timeout(void *arg)
{
struct mvs_slot *slot = arg;
device_t dev = slot->dev;
struct mvs_channel *ch = device_get_softc(dev);
/* Check for stale timeout. */
if (slot->state < MVS_SLOT_RUNNING)
return;
device_printf(dev, "Timeout on slot %d\n", slot->slot);
device_printf(dev, "iec %08x sstat %08x serr %08x edma_s %08x "
"dma_c %08x dma_s %08x rs %08x status %02x\n",
ATA_INL(ch->r_mem, EDMA_IEC),
ATA_INL(ch->r_mem, SATA_SS), ATA_INL(ch->r_mem, SATA_SE),
ATA_INL(ch->r_mem, EDMA_S), ATA_INL(ch->r_mem, DMA_C),
ATA_INL(ch->r_mem, DMA_S), ch->rslots,
ATA_INB(ch->r_mem, ATA_ALTSTAT));
/* Handle frozen command. */
mvs_requeue_frozen(dev);
/* We wait for the other commands to time out and pray. */
if (ch->toslots == 0)
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
if ((ch->rslots & ~ch->toslots) == 0)
mvs_process_timeout(dev);
else
device_printf(dev, " ... waiting for slots %08x\n",
ch->rslots & ~ch->toslots);
}
/* Must be called with channel locked. */
static void
mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et)
{
device_t dev = slot->dev;
struct mvs_channel *ch = device_get_softc(dev);
union ccb *ccb = slot->ccb;
int lastto;
bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
BUS_DMASYNC_POSTWRITE);
/*
 * Read result registers into the result struct.
 * It may be incorrect if several commands finished at the same time,
 * so read only when we are sure or have to.
 */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
struct ata_res *res = &ccb->ataio.res;
if ((et == MVS_ERR_TFE) ||
(ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)) {
mvs_tfd_read(dev, ccb);
} else
bzero(res, sizeof(*res));
} else {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->basic_dma == 0)
ccb->csio.resid = ccb->csio.dxfer_len - ch->donecount;
}
if (ch->numpslots == 0 || ch->basic_dma) {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
(ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(ch->dma.data_tag, slot->dma.data_map);
}
}
if (et != MVS_ERR_NONE)
ch->eslots |= (1 << slot->slot);
/* In case of error, freeze device for proper recovery. */
if ((et != MVS_ERR_NONE) && (!ch->recoverycmd) &&
!(ccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status |= CAM_DEV_QFRZN;
}
/* Set proper result status. */
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
switch (et) {
case MVS_ERR_NONE:
ccb->ccb_h.status |= CAM_REQ_CMP;
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.scsi_status = SCSI_STATUS_OK;
break;
case MVS_ERR_INVALID:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_INVALID;
break;
case MVS_ERR_INNOCENT:
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
break;
case MVS_ERR_TFE:
case MVS_ERR_NCQ:
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else {
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
}
break;
case MVS_ERR_SATA:
ch->fatalerr = 1;
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_UNCOR_PARITY;
break;
case MVS_ERR_TIMEOUT:
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
break;
default:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
}
/* Free slot. */
ch->oslots &= ~(1 << slot->slot);
ch->rslots &= ~(1 << slot->slot);
ch->aslots &= ~(1 << slot->slot);
slot->state = MVS_SLOT_EMPTY;
slot->ccb = NULL;
/* Update channel stats. */
ch->numrslots--;
ch->numrslotspd[ccb->ccb_h.target_id]--;
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
ch->otagspd[ccb->ccb_h.target_id] &= ~(1 << slot->tag);
ch->numtslots--;
ch->numtslotspd[ccb->ccb_h.target_id]--;
} else if (ccb->ataio.cmd.flags & CAM_ATAIO_DMA) {
ch->numdslots--;
} else {
ch->numpslots--;
}
} else {
ch->numpslots--;
ch->basic_dma = 0;
}
/* Cancel timeout state if request completed normally. */
if (et != MVS_ERR_TIMEOUT) {
lastto = (ch->toslots == (1 << slot->slot));
ch->toslots &= ~(1 << slot->slot);
if (lastto)
xpt_release_simq(ch->sim, TRUE);
}
/* If it was our READ LOG command - process it. */
if (ccb->ccb_h.recovery_type == RECOVERY_READ_LOG) {
mvs_process_read_log(dev, ccb);
/* If it was our REQUEST SENSE command - process it. */
} else if (ccb->ccb_h.recovery_type == RECOVERY_REQUEST_SENSE) {
mvs_process_request_sense(dev, ccb);
/* If it was an NCQ or ATAPI command error, put the result on hold. */
} else if (et == MVS_ERR_NCQ ||
((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)) {
ch->hold[slot->slot] = ccb;
ch->holdtag[slot->slot] = slot->tag;
ch->numhslots++;
} else
xpt_done(ccb);
/* If we have no other active commands, ... */
if (ch->rslots == 0) {
/* if there was a fatal error, reset the port. */
if (ch->toslots != 0 || ch->fatalerr) {
mvs_reset(dev);
} else {
/* if we have slots in error, we can reinit the port. */
if (ch->eslots != 0) {
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
ch->eslots = 0;
}
/* if there are commands on hold, we can do READ LOG. */
if (!ch->recoverycmd && ch->numhslots)
mvs_issue_recovery(dev);
}
/* If all the remaining commands are in timeout, give them a chance. */
} else if ((ch->rslots & ~ch->toslots) == 0 &&
et != MVS_ERR_TIMEOUT)
mvs_rearm_timeout(dev);
/* Unfreeze frozen command. */
if (ch->frozen && !mvs_check_collision(dev, ch->frozen)) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
mvs_begin_transaction(dev, fccb);
xpt_release_simq(ch->sim, TRUE);
}
/* Start PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3 &&
(ch->curr[ch->pm_present ? 15 : 0].caps & CTS_SATA_CAPS_D_PMREQ)) {
callout_schedule(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8);
}
}
static void
mvs_issue_recovery(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
union ccb *ccb;
struct ccb_ataio *ataio;
struct ccb_scsiio *csio;
int i;
/* Find some held command. */
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (ch->hold[i])
break;
}
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
device_printf(dev, "Unable to allocate recovery command\n");
completeall:
/* We can't do anything -- complete held commands. */
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (ch->hold[i] == NULL)
continue;
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_RESRC_UNAVAIL;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
mvs_reset(dev);
return;
}
ccb->ccb_h = ch->hold[i]->ccb_h; /* Reuse old header. */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
/* READ LOG */
ccb->ccb_h.recovery_type = RECOVERY_READ_LOG;
ccb->ccb_h.func_code = XPT_ATA_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
ataio = &ccb->ataio;
ataio->data_ptr = malloc(512, M_MVS, M_NOWAIT);
if (ataio->data_ptr == NULL) {
xpt_free_ccb(ccb);
device_printf(dev,
"Unable to allocate memory for READ LOG command\n");
goto completeall;
}
ataio->dxfer_len = 512;
bzero(&ataio->cmd, sizeof(ataio->cmd));
ataio->cmd.flags = CAM_ATAIO_48BIT;
ataio->cmd.command = 0x2F; /* READ LOG EXT */
ataio->cmd.sector_count = 1;
ataio->cmd.sector_count_exp = 0;
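/* Log address 0x10: SATA NCQ Command Error log. */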
ataio->cmd.lba_low = 0x10;
ataio->cmd.lba_mid = 0;
ataio->cmd.lba_mid_exp = 0;
} else {
/* REQUEST SENSE */
ccb->ccb_h.recovery_type = RECOVERY_REQUEST_SENSE;
ccb->ccb_h.recovery_slot = i;
ccb->ccb_h.func_code = XPT_SCSI_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.status = 0;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
csio = &ccb->csio;
csio->data_ptr = (void *)&ch->hold[i]->csio.sense_data;
csio->dxfer_len = ch->hold[i]->csio.sense_len;
csio->cdb_len = 6;
bzero(&csio->cdb_io, sizeof(csio->cdb_io));
csio->cdb_io.cdb_bytes[0] = 0x03;
csio->cdb_io.cdb_bytes[4] = csio->dxfer_len;
}
/* Freeze SIM while doing recovery. */
ch->recoverycmd = 1;
xpt_freeze_simq(ch->sim, 1);
mvs_begin_transaction(dev, ccb);
}
static void
mvs_process_read_log(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
uint8_t *data;
struct ata_res *res;
int i;
ch->recoverycmd = 0;
data = ccb->ataio.data_ptr;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP &&
(data[0] & 0x80) == 0) {
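/* data[0]: bit 7 - non-queued command error, bits 4:0 - tag of the failed command. */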
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.target_id != ccb->ccb_h.target_id)
continue;
if ((data[0] & 0x1F) == ch->holdtag[i]) {
res = &ch->hold[i]->ataio.res;
res->status = data[2];
res->error = data[3];
res->lba_low = data[4];
res->lba_mid = data[5];
res->lba_high = data[6];
res->device = data[7];
res->lba_low_exp = data[8];
res->lba_mid_exp = data[9];
res->lba_high_exp = data[10];
res->sector_count = data[12];
res->sector_count_exp = data[13];
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_REQUEUE_REQ;
}
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
} else {
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
device_printf(dev, "Error while READ LOG EXT\n");
else if ((data[0] & 0x80) == 0) {
device_printf(dev,
"Non-queued command error in READ LOG EXT\n");
}
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.target_id != ccb->ccb_h.target_id)
continue;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
}
free(ccb->ataio.data_ptr, M_MVS);
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
static void
mvs_process_request_sense(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
ch->recoverycmd = 0;
i = ccb->ccb_h.recovery_slot;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
ch->hold[i]->ccb_h.status |= CAM_AUTOSNS_VALID;
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
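/*
 * Wait until all bits in 's' are set and all bits in 'c' are clear in the
 * status register, polling every 1ms for up to 't' ms. Returns the elapsed
 * time in ms, or -1 on timeout.
 */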
static int
mvs_wait(device_t dev, u_int s, u_int c, int t)
{
int timeout = 0;
uint8_t st;
while (((st = mvs_getstatus(dev, 0)) & (s | c)) != s) {
if (timeout >= t) {
if (t != 0)
device_printf(dev, "Wait status %02x\n", st);
return (-1);
}
DELAY(1000);
timeout++;
}
return (timeout);
}
static void
mvs_requeue_frozen(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
union ccb *fccb = ch->frozen;
if (fccb) {
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
xpt_done(fccb);
}
}
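/* Poll every 100ms (up to 31s total) for the device to become ready after reset. */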
static void
mvs_reset_to(void *arg)
{
device_t dev = arg;
struct mvs_channel *ch = device_get_softc(dev);
int t;
if (ch->resetting == 0)
return;
ch->resetting--;
if ((t = mvs_wait(dev, 0, ATA_S_BUSY | ATA_S_DRQ, 0)) >= 0) {
if (bootverbose) {
device_printf(dev,
"MVS reset: device ready after %dms\n",
(310 - ch->resetting) * 100);
}
ch->resetting = 0;
xpt_release_simq(ch->sim, TRUE);
return;
}
if (ch->resetting == 0) {
device_printf(dev,
"MVS reset: device not ready after 31000ms\n");
xpt_release_simq(ch->sim, TRUE);
return;
}
callout_schedule(&ch->reset_timer, hz / 10);
}
static void
mvs_errata(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
uint32_t val;
if (ch->quirks & MVS_Q_SOC65) {
val = ATA_INL(ch->r_mem, SATA_PHYM3);
val &= ~(0x3 << 27); /* SELMUPF = 1 */
val |= (0x1 << 27);
val &= ~(0x3 << 29); /* SELMUPI = 1 */
val |= (0x1 << 29);
ATA_OUTL(ch->r_mem, SATA_PHYM3, val);
val = ATA_INL(ch->r_mem, SATA_PHYM4);
val &= ~0x1; /* SATU_OD8 = 0 */
val |= (0x1 << 16); /* reserved bit 16 = 1 */
ATA_OUTL(ch->r_mem, SATA_PHYM4, val);
val = ATA_INL(ch->r_mem, SATA_PHYM9_GEN2);
val &= ~0xf; /* TXAMP[3:0] = 8 */
val |= 0x8;
val &= ~(0x1 << 14); /* TXAMP[4] = 0 */
ATA_OUTL(ch->r_mem, SATA_PHYM9_GEN2, val);
val = ATA_INL(ch->r_mem, SATA_PHYM9_GEN1);
val &= ~0xf; /* TXAMP[3:0] = 8 */
val |= 0x8;
val &= ~(0x1 << 14); /* TXAMP[4] = 0 */
ATA_OUTL(ch->r_mem, SATA_PHYM9_GEN1, val);
}
}
static void
mvs_reset(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int i;
xpt_freeze_simq(ch->sim, 1);
if (bootverbose)
device_printf(dev, "MVS reset...\n");
/* Forget about previous reset. */
if (ch->resetting) {
ch->resetting = 0;
callout_stop(&ch->reset_timer);
xpt_release_simq(ch->sim, TRUE);
}
/* Requeue frozen command. */
mvs_requeue_frozen(dev);
/* Kill the engine and requeue all running commands. */
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
ATA_OUTL(ch->r_mem, DMA_C, 0);
for (i = 0; i < MVS_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < MVS_SLOT_RUNNING)
continue;
/* XXX: Commands in loading state. */
mvs_end_transaction(&ch->slot[i], MVS_ERR_INNOCENT);
}
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
if (ch->toslots != 0)
xpt_release_simq(ch->sim, TRUE);
ch->eslots = 0;
ch->toslots = 0;
ch->fatalerr = 0;
ch->fake_busy = 0;
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
ATA_OUTL(ch->r_mem, EDMA_IEM, 0);
ATA_OUTL(ch->r_mem, EDMA_CMD, EDMA_CMD_EATARST);
DELAY(25);
ATA_OUTL(ch->r_mem, EDMA_CMD, 0);
mvs_errata(dev);
/* Reset and reconnect the PHY. */
if (!mvs_sata_phy_reset(dev)) {
if (bootverbose)
device_printf(dev, "MVS reset: device not found\n");
ch->devices = 0;
ATA_OUTL(ch->r_mem, SATA_SE, 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_IEC, 0);
ATA_OUTL(ch->r_mem, EDMA_IEM, ~EDMA_IE_TRANSIENT);
xpt_release_simq(ch->sim, TRUE);
return;
}
if (bootverbose)
device_printf(dev, "MVS reset: device found\n");
/* Wait for the busy status to clear. */
if ((i = mvs_wait(dev, 0, ATA_S_BUSY | ATA_S_DRQ,
dumping ? 31000 : 0)) < 0) {
if (dumping) {
device_printf(dev,
"MVS reset: device not ready after 31000ms\n");
} else
ch->resetting = 310;
} else if (bootverbose)
device_printf(dev, "MVS reset: device ready after %dms\n", i);
ch->devices = 1;
ATA_OUTL(ch->r_mem, SATA_SE, 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_IEC, 0);
ATA_OUTL(ch->r_mem, EDMA_IEM, ~EDMA_IE_TRANSIENT);
if (ch->resetting)
callout_reset(&ch->reset_timer, hz / 10, mvs_reset_to, dev);
else
xpt_release_simq(ch->sim, TRUE);
}
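/* Issue ATA soft-reset to the selected PMP port and read back the signature. */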
static void
mvs_softreset(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
int port = ccb->ccb_h.target_id & 0x0f;
int i, stuck;
uint8_t status;
mvs_set_edma_mode(dev, MVS_EDMA_OFF);
ATA_OUTB(ch->r_mem, SATA_SATAICTL, port << SATA_SATAICTL_PMPTX_SHIFT);
ATA_OUTB(ch->r_mem, ATA_CONTROL, ATA_A_RESET);
DELAY(10000);
ATA_OUTB(ch->r_mem, ATA_CONTROL, 0);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
/* Wait for the busy status to clear. */
if ((i = mvs_wait(dev, 0, ATA_S_BUSY, ccb->ccb_h.timeout)) < 0) {
ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
stuck = 1;
} else {
status = mvs_getstatus(dev, 0);
if (status & ATA_S_ERROR)
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
else
ccb->ccb_h.status |= CAM_REQ_CMP;
if (status & ATA_S_DRQ)
stuck = 1;
else
stuck = 0;
}
mvs_tfd_read(dev, ccb);
/*
* XXX: If some device on PMP failed to soft-reset,
* try to recover by sending dummy soft-reset to PMP.
*/
if (stuck && ch->pm_present && port != 15) {
ATA_OUTB(ch->r_mem, SATA_SATAICTL,
15 << SATA_SATAICTL_PMPTX_SHIFT);
ATA_OUTB(ch->r_mem, ATA_CONTROL, ATA_A_RESET);
DELAY(10000);
ATA_OUTB(ch->r_mem, ATA_CONTROL, 0);
mvs_wait(dev, 0, ATA_S_BUSY | ATA_S_DRQ, ccb->ccb_h.timeout);
}
xpt_done(ccb);
}
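/* Wait for the PHY to report an established connection; returns 1 if a device is present. */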
static int
mvs_sata_connect(struct mvs_channel *ch)
{
u_int32_t status;
int timeout, found = 0;
/* Wait up to 100ms for "connect well" */
for (timeout = 0; timeout < 1000 ; timeout++) {
status = ATA_INL(ch->r_mem, SATA_SS);
if ((status & SATA_SS_DET_MASK) != SATA_SS_DET_NO_DEVICE)
found = 1;
if (((status & SATA_SS_DET_MASK) == SATA_SS_DET_PHY_ONLINE) &&
((status & SATA_SS_SPD_MASK) != SATA_SS_SPD_NO_SPEED) &&
((status & SATA_SS_IPM_MASK) == SATA_SS_IPM_ACTIVE))
break;
if ((status & SATA_SS_DET_MASK) == SATA_SS_DET_PHY_OFFLINE) {
if (bootverbose) {
device_printf(ch->dev, "SATA offline status=%08x\n",
status);
}
return (0);
}
if (found == 0 && timeout >= 100)
break;
DELAY(100);
}
if (timeout >= 1000 || !found) {
if (bootverbose) {
device_printf(ch->dev,
"SATA connect timeout time=%dus status=%08x\n",
timeout * 100, status);
}
return (0);
}
if (bootverbose) {
device_printf(ch->dev, "SATA connect time=%dus status=%08x\n",
timeout * 100, status);
}
/* Clear SATA error register */
ATA_OUTL(ch->r_mem, SATA_SE, 0xffffffff);
return (1);
}
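/* Hard-reset (COMRESET) the SATA PHY at the configured speed limit and check for a device. */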
static int
mvs_sata_phy_reset(device_t dev)
{
struct mvs_channel *ch = device_get_softc(dev);
int sata_rev;
uint32_t val;
sata_rev = ch->user[ch->pm_present ? 15 : 0].revision;
if (sata_rev == 1)
val = SATA_SC_SPD_SPEED_GEN1;
else if (sata_rev == 2)
val = SATA_SC_SPD_SPEED_GEN2;
else if (sata_rev == 3)
val = SATA_SC_SPD_SPEED_GEN3;
else
val = 0;
ATA_OUTL(ch->r_mem, SATA_SC,
SATA_SC_DET_RESET | val |
SATA_SC_IPM_DIS_PARTIAL | SATA_SC_IPM_DIS_SLUMBER);
DELAY(1000);
ATA_OUTL(ch->r_mem, SATA_SC,
SATA_SC_DET_IDLE | val | ((ch->pm_level > 0) ? 0 :
(SATA_SC_IPM_DIS_PARTIAL | SATA_SC_IPM_DIS_SLUMBER)));
if (!mvs_sata_connect(ch)) {
if (ch->pm_level > 0)
ATA_OUTL(ch->r_mem, SATA_SC, SATA_SC_DET_DISABLE);
return (0);
}
return (1);
}
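/* Validate CCB target and LUN; completes the CCB and returns -1 if invalid. */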
static int
mvs_check_ids(device_t dev, union ccb *ccb)
{
struct mvs_channel *ch = device_get_softc(dev);
if (ccb->ccb_h.target_id > ((ch->quirks & MVS_Q_GENI) ? 0 : 15)) {
ccb->ccb_h.status = CAM_TID_INVALID;
xpt_done(ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
xpt_done(ccb);
return (-1);
}
/*
* It's a programming error to see AUXILIARY register requests.
*/
KASSERT(ccb->ccb_h.func_code != XPT_ATA_IO ||
((ccb->ataio.ata_flags & ATA_FLAG_AUX) == 0),
("AUX register unsupported"));
return (0);
}
static void
mvsaction(struct cam_sim *sim, union ccb *ccb)
{
device_t dev, parent;
struct mvs_channel *ch;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("mvsaction func_code=%x\n",
ccb->ccb_h.func_code));
ch = (struct mvs_channel *)cam_sim_softc(sim);
dev = ch->dev;
switch (ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_ATA_IO: /* Execute the requested I/O operation */
case XPT_SCSI_IO:
if (mvs_check_ids(dev, ccb))
return;
if (ch->devices == 0 ||
(ch->pm_present == 0 &&
ccb->ccb_h.target_id > 0 && ccb->ccb_h.target_id < 15)) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
ccb->ccb_h.recovery_type = RECOVERY_NONE;
/* Check for command collision. */
if (mvs_check_collision(dev, ccb)) {
/* Freeze command. */
ch->frozen = ccb;
/* We have only one frozen slot, so freeze simq also. */
xpt_freeze_simq(ch->sim, 1);
return;
}
mvs_begin_transaction(dev, ccb);
return;
case XPT_ABORT: /* Abort the specified CCB */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_SET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct mvs_device *d;
if (mvs_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
d->revision = cts->xport_specific.sata.revision;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE)
d->mode = cts->xport_specific.sata.mode;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT) {
d->bytecount = min((ch->quirks & MVS_Q_GENIIE) ? 8192 : 2048,
cts->xport_specific.sata.bytecount);
}
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_TAGS)
d->tags = min(MVS_MAX_SLOTS, cts->xport_specific.sata.tags);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_PM)
ch->pm_present = cts->xport_specific.sata.pm_present;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_ATAPI)
d->atapi = cts->xport_specific.sata.atapi;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
d->caps = cts->xport_specific.sata.caps;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
/* Get default/user set transfer settings for the target */
{
struct ccb_trans_settings *cts = &ccb->cts;
struct mvs_device *d;
uint32_t status;
if (mvs_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
cts->protocol = PROTO_UNSPECIFIED;
cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
cts->transport = XPORT_SATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->proto_specific.valid = 0;
cts->xport_specific.sata.valid = 0;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS &&
(ccb->ccb_h.target_id == 15 ||
(ccb->ccb_h.target_id == 0 && !ch->pm_present))) {
status = ATA_INL(ch->r_mem, SATA_SS) & SATA_SS_SPD_MASK;
if (status & 0x0f0) {
cts->xport_specific.sata.revision =
(status & 0x0f0) >> 4;
cts->xport_specific.sata.valid |=
CTS_SATA_VALID_REVISION;
}
cts->xport_specific.sata.caps = d->caps & CTS_SATA_CAPS_D;
// if (ch->pm_level)
// cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_PMREQ;
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
} else {
cts->xport_specific.sata.revision = d->revision;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
cts->xport_specific.sata.caps = d->caps;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS/* &&
(ch->quirks & MVS_Q_GENIIE) == 0*/)
cts->xport_specific.sata.caps &= ~CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
}
cts->xport_specific.sata.mode = d->mode;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
cts->xport_specific.sata.bytecount = d->bytecount;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
cts->xport_specific.sata.pm_present = ch->pm_present;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_PM;
cts->xport_specific.sata.tags = d->tags;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_TAGS;
cts->xport_specific.sata.atapi = d->atapi;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified SCSI device */
mvs_reset(dev);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_TERM_IO: /* Terminate the I/O process */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
parent = device_get_parent(dev);
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE;
if (!(ch->quirks & MVS_Q_GENI)) {
cpi->hba_inquiry |= PI_SATAPM;
/* Gen-II is extremely slow with NCQ on PMP. */
if ((ch->quirks & MVS_Q_GENIIE) || ch->pm_present == 0)
cpi->hba_inquiry |= PI_TAG_ABLE;
}
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN;
cpi->hba_eng_cnt = 0;
if (!(ch->quirks & MVS_Q_GENI))
cpi->max_target = 15;
else
cpi->max_target = 0;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "Marvell", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->transport = XPORT_SATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
if ((ch->quirks & MVS_Q_SOC) == 0) {
cpi->hba_vendor = pci_get_vendor(parent);
cpi->hba_device = pci_get_device(parent);
cpi->hba_subvendor = pci_get_subvendor(parent);
cpi->hba_subdevice = pci_get_subdevice(parent);
}
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
}
static void
mvspoll(struct cam_sim *sim)
{
struct mvs_channel *ch = (struct mvs_channel *)cam_sim_softc(sim);
struct mvs_intr_arg arg;
arg.arg = ch->dev;
arg.cause = 2 | 4; /* XXX */
mvs_ch_intr(&arg);
if (ch->resetting != 0 &&
(--ch->resetpolldiv <= 0 || !callout_pending(&ch->reset_timer))) {
ch->resetpolldiv = 1000;
mvs_reset_to(ch->dev);
}
}
diff --git a/sys/dev/mvs/mvs.h b/sys/dev/mvs/mvs.h
index a25faed18901..4a5e58ffbe72 100644
--- a/sys/dev/mvs/mvs.h
+++ b/sys/dev/mvs/mvs.h
@@ -1,663 +1,664 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "mvs_if.h"
/* Chip registers */
#define CHIP_PCIEIC 0x1900 /* PCIe Interrupt Cause */
#define CHIP_PCIEIM 0x1910 /* PCIe Interrupt Mask */
#define CHIP_PCIIC 0x1d58 /* PCI Interrupt Cause */
#define CHIP_PCIIM 0x1d5c /* PCI Interrupt Mask */
#define CHIP_MIC 0x1d60 /* Main Interrupt Cause */
#define CHIP_MIM 0x1d64 /* Main Interrupt Mask */
#define CHIP_SOC_MIC 0x20 /* SoC Main Interrupt Cause */
#define CHIP_SOC_MIM 0x24 /* SoC Main Interrupt Mask */
#define IC_ERR_IRQ (1 << 0) /* shift by (2 * port #) */
#define IC_DONE_IRQ (1 << 1) /* shift by (2 * port #) */
#define IC_HC0 0x000001ff /* bits 0-8 = HC0 */
#define IC_HC_SHIFT 9 /* HC1 shift */
#define IC_HC1 (IC_HC0 << IC_HC_SHIFT) /* 9-17 = HC1 */
#define IC_ERR_HC0 0x00000055 /* HC0 ERR_IRQ */
#define IC_DONE_HC0 0x000000aa /* HC0 DONE_IRQ */
#define IC_ERR_HC1 (IC_ERR_HC0 << IC_HC_SHIFT) /* HC1 ERR_IRQ */
#define IC_DONE_HC1 (IC_DONE_HC0 << IC_HC_SHIFT) /* HC1 DONE_IRQ */
#define IC_HC0_COAL_DONE (1 << 8) /* HC0 IRQ coalescing */
#define IC_HC1_COAL_DONE (1 << 17) /* HC1 IRQ coalescing */
#define IC_PCI_ERR (1 << 18)
#define IC_TRAN_COAL_LO_DONE (1 << 19) /* transaction coalescing */
#define IC_TRAN_COAL_HI_DONE (1 << 20) /* transaction coalescing */
#define IC_ALL_PORTS_COAL_DONE (1 << 21) /* GEN_II(E) IRQ coalescing */
#define IC_GPIO_INT (1 << 22)
#define IC_SELF_INT (1 << 23)
#define IC_TWSI_INT (1 << 24)
#define IC_MAIN_RSVD (0xfe000000) /* bits 31-25 */
#define IC_MAIN_RSVD_5 (0xfff10000) /* bits 31-19 */
#define IC_MAIN_RSVD_SOC (0xfffffec0) /* bits 31-9, 7-6 */
#define CHIP_SOC_LED 0x2C /* SoC LED Configuration */
/* Additional mask for SoC devices with less than 4 channels */
#define CHIP_SOC_HC0_MASK(num) (0xff >> ((4 - (num)) * 2))
/* Chip CCC registers */
#define CHIP_ICC 0x18008
#define CHIP_ICC_ALL_PORTS (1 << 4) /* all ports irq event */
#define CHIP_ICT 0x180cc
#define CHIP_ITT 0x180d0
#define CHIP_TRAN_COAL_CAUSE_LO 0x18088
#define CHIP_TRAN_COAL_CAUSE_HI 0x1808c
/* Host Controller registers */
#define HC_SIZE 0x10000
#define HC_OFFSET 0x20000
#define HC_BASE(hc) ((hc) * HC_SIZE + HC_OFFSET)
#define HC_CFG 0x0 /* Configuration */
#define HC_CFG_TIMEOUT_MASK (0xff << 0)
#define HC_CFG_NODMABS (1 << 8)
#define HC_CFG_NOEDMABS (1 << 9)
#define HC_CFG_NOPRDBS (1 << 10)
#define HC_CFG_TIMEOUTEN (1 << 16) /* Timer Enable */
#define HC_CFG_COALDIS(p) (1 << ((p) + 24))/* Coalescing Disable*/
#define HC_RQOP 0x4 /* Request Queue Out-Pointer */
#define HC_RQIP 0x8 /* Response Queue In-Pointer */
#define HC_ICT 0xc /* Interrupt Coalescing Threshold */
#define HC_ICT_SAICOALT_MASK 0x000000ff
#define HC_ITT 0x10 /* Interrupt Time Threshold */
#define HC_ITT_SAITMTH_MASK 0x00ffffff
#define HC_IC 0x14 /* Interrupt Cause */
#define HC_IC_DONE(p) (1 << (p)) /* SaCrpb/DMA Done */
#define HC_IC_COAL (1 << 4) /* Intr Coalescing */
#define HC_IC_DEV(p) (1 << ((p) + 8)) /* Device Intr */
/* Port registers */
#define PORT_SIZE 0x2000
#define PORT_OFFSET 0x2000
#define PORT_BASE(hc) ((hc) * PORT_SIZE + PORT_OFFSET)
#define EDMA_CFG 0x0 /* Configuration */
#define EDMA_CFG_RESERVED (0x1f << 0) /* Queue len ? */
#define EDMA_CFG_ESATANATVCMDQUE (1 << 5)
#define EDMA_CFG_ERDBSZ (1 << 8)
#define EDMA_CFG_EQUE (1 << 9)
#define EDMA_CFG_ERDBSZEXT (1 << 11)
#define EDMA_CFG_RESERVED2 (1 << 12)
#define EDMA_CFG_EWRBUFFERLEN (1 << 13)
#define EDMA_CFG_EDEVERR (1 << 14)
#define EDMA_CFG_EEDMAFBS (1 << 16)
#define EDMA_CFG_ECUTTHROUGHEN (1 << 17)
#define EDMA_CFG_EEARLYCOMPLETIONEN (1 << 18)
#define EDMA_CFG_EEDMAQUELEN (1 << 19)
#define EDMA_CFG_EHOSTQUEUECACHEEN (1 << 22)
#define EDMA_CFG_EMASKRXPM (1 << 23)
#define EDMA_CFG_RESUMEDIS (1 << 24)
#define EDMA_CFG_EDMAFBS (1 << 26)
#define EDMA_T 0x4 /* Timer */
#define EDMA_IEC 0x8 /* Interrupt Error Cause */
#define EDMA_IEM 0xc /* Interrupt Error Mask */
#define EDMA_IE_EDEVERR (1 << 2) /* EDMA Device Error */
#define EDMA_IE_EDEVDIS (1 << 3) /* EDMA Dev Disconn */
#define EDMA_IE_EDEVCON (1 << 4) /* EDMA Dev Conn */
#define EDMA_IE_SERRINT (1 << 5)
#define EDMA_IE_ESELFDIS (1 << 7) /* EDMA Self Disable */
#define EDMA_IE_ETRANSINT (1 << 8) /* Transport Layer */
#define EDMA_IE_EIORDYERR (1 << 12) /* EDMA IORdy Error */
#define EDMA_IE_LINKXERR_SATACRC (1 << 0) /* SATA CRC error */
#define EDMA_IE_LINKXERR_INTERNALFIFO (1 << 1) /* internal FIFO err */
#define EDMA_IE_LINKXERR_LINKLAYERRESET (1 << 2)
/* Link Layer is reset by the reception of SYNC primitive from device */
#define EDMA_IE_LINKXERR_OTHERERRORS (1 << 3)
/*
* Link state errors, coding errors, or running disparity errors occur
* during FIS reception.
*/
#define EDMA_IE_LINKTXERR_FISTXABORTED (1 << 4) /* FIS Tx is aborted */
#define EDMA_IE_LINKCTLRXERR(x) ((x) << 13) /* Link Ctrl Recv Err */
#define EDMA_IE_LINKDATARXERR(x) ((x) << 17) /* Link Data Recv Err */
#define EDMA_IE_LINKCTLTXERR(x) ((x) << 21) /* Link Ctrl Tx Error */
#define EDMA_IE_LINKDATATXERR(x) ((x) << 26) /* Link Data Tx Error */
#define EDMA_IE_TRANSPROTERR (1U << 31) /* Transport Proto E */
#define EDMA_IE_TRANSIENT (EDMA_IE_LINKCTLRXERR(0x0b) | \
EDMA_IE_LINKCTLTXERR(0x1f))
/* Non-fatal Errors */
#define EDMA_REQQBAH 0x10 /* Request Queue Base Address High */
#define EDMA_REQQIP 0x14 /* Request Queue In-Pointer */
#define EDMA_REQQOP 0x18 /* Request Queue Out-Pointer */
#define EDMA_REQQP_ERQQP_SHIFT 5
#define EDMA_REQQP_ERQQP_MASK 0x000003e0
#define EDMA_REQQP_ERQQBAP_MASK 0x00000c00
#define EDMA_REQQP_ERQQBA_MASK 0xfffff000
#define EDMA_RESQBAH 0x1c /* Response Queue Base Address High */
#define EDMA_RESQIP 0x20 /* Response Queue In-Pointer */
#define EDMA_RESQOP 0x24 /* Response Queue Out-Pointer */
#define EDMA_RESQP_ERPQP_SHIFT 3
#define EDMA_RESQP_ERPQP_MASK 0x000000f8
#define EDMA_RESQP_ERPQBAP_MASK 0x00000300
#define EDMA_RESQP_ERPQBA_MASK 0xfffffc00
#define EDMA_CMD 0x28 /* Command */
#define EDMA_CMD_EENEDMA (1 << 0) /* Enable EDMA */
#define EDMA_CMD_EDSEDMA (1 << 1) /* Disable EDMA */
#define EDMA_CMD_EATARST (1 << 2) /* ATA Device Reset */
#define EDMA_CMD_EEDMAFRZ (1 << 4) /* EDMA Freeze */
#define EDMA_TC 0x2c /* Test Control */
#define EDMA_S 0x30 /* Status */
#define EDMA_S_EDEVQUETAG(s) ((s) & 0x0000001f)
#define EDMA_S_EDEVDIR_WRITE (0 << 5)
#define EDMA_S_EDEVDIR_READ (1 << 5)
#define EDMA_S_ECACHEEMPTY (1 << 6)
#define EDMA_S_EDMAIDLE (1 << 7)
#define EDMA_S_ESTATE(s) (((s) & 0x0000ff00) >> 8)
#define EDMA_S_EIOID(s) (((s) & 0x003f0000) >> 16)
#define EDMA_IORT 0x34 /* IORdy Timeout */
#define EDMA_CDT 0x40 /* Command Delay Threshold */
#define EDMA_HC 0x60 /* Halt Condition */
#define EDMA_UNKN_RESD 0x6C /* Unknown register */
#define EDMA_CQDCQOS(x) (0x90 + ((x) << 2))
/* NCQ Done/TCQ Outstanding Status */
/* ATA register defines */
#define ATA_DATA 0x100 /* (RW) data */
#define ATA_FEATURE 0x104 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_ERROR 0x104 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_COUNT 0x108 /* (W) sector count */
#define ATA_IREASON 0x108 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_SECTOR 0x10c /* (RW) sector # */
#define ATA_CYL_LSB 0x110 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 0x114 /* (RW) cylinder# MSB */
#define ATA_DRIVE 0x118 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 0x11c /* (W) command */
#define ATA_STATUS 0x11c /* (R) status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 0x120 /* (W) control */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#define ATA_A_4BIT 0x08 /* 4 head bits */
#define ATA_A_HOB 0x80 /* High Order Byte enable */
#define ATA_ALTSTAT 0x120 /* (R) alternate status */
#define ATAPI_P_READ (ATA_S_DRQ | ATA_I_IN)
#define ATAPI_P_WRITE (ATA_S_DRQ)
#define ATAPI_P_CMDOUT (ATA_S_DRQ | ATA_I_CMD)
#define ATAPI_P_DONEDRQ (ATA_S_DRQ | ATA_I_CMD | ATA_I_IN)
#define ATAPI_P_DONE (ATA_I_CMD | ATA_I_IN)
#define ATAPI_P_ABORT 0
/* Basic DMA Registers */
#define DMA_C 0x224 /* Basic DMA Command */
#define DMA_C_START (1 << 0)
#define DMA_C_READ (1 << 3)
#define DMA_C_DREGIONVALID (1 << 8)
#define DMA_C_DREGIONLAST (1 << 9)
#define DMA_C_CONTFROMPREV (1 << 10)
#define DMA_C_DRBC(n) (((n) & 0xffff) << 16)
#define DMA_S 0x228 /* Basic DMA Status */
#define DMA_S_ACT (1 << 0) /* Active */
#define DMA_S_ERR (1 << 1) /* Error */
#define DMA_S_PAUSED (1 << 2) /* Paused */
#define DMA_S_LAST (1 << 3) /* Last */
#define DMA_DTLBA 0x22c /* Descriptor Table Low Base Address */
#define DMA_DTLBA_MASK 0xfffffff0
#define DMA_DTHBA 0x230 /* Descriptor Table High Base Address */
#define DMA_DRLA 0x234 /* Data Region Low Address */
#define DMA_DRHA 0x238 /* Data Region High Address */
/* Serial-ATA Registers */
#define SATA_SS 0x300 /* SStatus */
#define SATA_SS_DET_MASK 0x0000000f
#define SATA_SS_DET_NO_DEVICE 0x00000000
#define SATA_SS_DET_DEV_PRESENT 0x00000001
#define SATA_SS_DET_PHY_ONLINE 0x00000003
#define SATA_SS_DET_PHY_OFFLINE 0x00000004
#define SATA_SS_SPD_MASK 0x000000f0
#define SATA_SS_SPD_NO_SPEED 0x00000000
#define SATA_SS_SPD_GEN1 0x00000010
#define SATA_SS_SPD_GEN2 0x00000020
#define SATA_SS_SPD_GEN3 0x00000030
#define SATA_SS_IPM_MASK 0x00000f00
#define SATA_SS_IPM_NO_DEVICE 0x00000000
#define SATA_SS_IPM_ACTIVE 0x00000100
#define SATA_SS_IPM_PARTIAL 0x00000200
#define SATA_SS_IPM_SLUMBER 0x00000600
#define SATA_SE 0x304 /* SError */
#define SATA_SEIM 0x340 /* SError Interrupt Mask */
#define SATA_SE_DATA_CORRECTED 0x00000001
#define SATA_SE_COMM_CORRECTED 0x00000002
#define SATA_SE_DATA_ERR 0x00000100
#define SATA_SE_COMM_ERR 0x00000200
#define SATA_SE_PROT_ERR 0x00000400
#define SATA_SE_HOST_ERR 0x00000800
#define SATA_SE_PHY_CHANGED 0x00010000
#define SATA_SE_PHY_IERROR 0x00020000
#define SATA_SE_COMM_WAKE 0x00040000
#define SATA_SE_DECODE_ERR 0x00080000
#define SATA_SE_PARITY_ERR 0x00100000
#define SATA_SE_CRC_ERR 0x00200000
#define SATA_SE_HANDSHAKE_ERR 0x00400000
#define SATA_SE_LINKSEQ_ERR 0x00800000
#define SATA_SE_TRANSPORT_ERR 0x01000000
#define SATA_SE_UNKNOWN_FIS 0x02000000
#define SATA_SC 0x308 /* SControl */
#define SATA_SC_DET_MASK 0x0000000f
#define SATA_SC_DET_IDLE 0x00000000
#define SATA_SC_DET_RESET 0x00000001
#define SATA_SC_DET_DISABLE 0x00000004
#define SATA_SC_SPD_MASK 0x000000f0
#define SATA_SC_SPD_NO_SPEED 0x00000000
#define SATA_SC_SPD_SPEED_GEN1 0x00000010
#define SATA_SC_SPD_SPEED_GEN2 0x00000020
#define SATA_SC_SPD_SPEED_GEN3 0x00000030
#define SATA_SC_IPM_MASK 0x00000f00
#define SATA_SC_IPM_NONE 0x00000000
#define SATA_SC_IPM_DIS_PARTIAL 0x00000100
#define SATA_SC_IPM_DIS_SLUMBER 0x00000200
#define SATA_SC_SPM_MASK 0x0000f000
#define SATA_SC_SPM_NONE 0x00000000
#define SATA_SC_SPM_PARTIAL 0x00001000
#define SATA_SC_SPM_SLUMBER 0x00002000
#define SATA_SC_SPM_ACTIVE 0x00004000
#define SATA_LTM 0x30c /* LTMode */
#define SATA_PHYM3 0x310 /* PHY Mode 3 */
#define SATA_PHYM4 0x314 /* PHY Mode 4 */
#define SATA_PHYM1 0x32c /* PHY Mode 1 */
#define SATA_PHYM2 0x330 /* PHY Mode 2 */
#define SATA_BISTC 0x334 /* BIST Control */
#define SATA_BISTDW1 0x338 /* BIST DW1 */
#define SATA_BISTDW2 0x33c /* BIST DW2 */
#define SATA_SATAICFG 0x050 /* Serial-ATA Interface Configuration */
#define SATA_SATAICFG_REFCLKCNF_20MHZ (0 << 0)
#define SATA_SATAICFG_REFCLKCNF_25MHZ (1 << 0)
#define SATA_SATAICFG_REFCLKCNF_30MHZ (2 << 0)
#define SATA_SATAICFG_REFCLKCNF_40MHZ (3 << 0)
#define SATA_SATAICFG_REFCLKCNF_MASK (3 << 0)
#define SATA_SATAICFG_REFCLKDIV_1 (0 << 2)
#define SATA_SATAICFG_REFCLKDIV_2 (1 << 2) /* Used 20 or 25MHz */
#define SATA_SATAICFG_REFCLKDIV_4 (2 << 2) /* Used 40MHz */
#define SATA_SATAICFG_REFCLKDIV_3 (3 << 2) /* Used 30MHz */
#define SATA_SATAICFG_REFCLKDIV_MASK (3 << 2)
#define SATA_SATAICFG_REFCLKFEEDDIV_50 (0 << 4) /* or 100, when Gen2En is 1 */
#define SATA_SATAICFG_REFCLKFEEDDIV_60 (1 << 4) /* or 120. Used 25MHz */
#define SATA_SATAICFG_REFCLKFEEDDIV_75 (2 << 4) /* or 150. Used 20MHz */
#define SATA_SATAICFG_REFCLKFEEDDIV_90 (3 << 4) /* or 180 */
#define SATA_SATAICFG_REFCLKFEEDDIV_MASK (3 << 4)
#define SATA_SATAICFG_PHYSSCEN (1 << 6)
#define SATA_SATAICFG_GEN2EN (1 << 7)
#define SATA_SATAICFG_COMMEN (1 << 8)
#define SATA_SATAICFG_PHYSHUTDOWN (1 << 9)
#define SATA_SATAICFG_TARGETMODE (1 << 10) /* 1 = Initiator */
#define SATA_SATAICFG_COMCHANNEL (1 << 11)
#define SATA_SATAICFG_IGNOREBSY (1 << 24)
#define SATA_SATAICFG_LINKRSTEN (1 << 25)
#define SATA_SATAICFG_CMDRETXDS (1 << 26)
#define SATA_SATAICTL 0x344 /* Serial-ATA Interface Control */
#define SATA_SATAICTL_PMPTX_MASK 0x0000000f
#define SATA_SATAICTL_PMPTX_SHIFT 0
#define SATA_SATAICTL_VUM (1 << 8)
#define SATA_SATAICTL_VUS (1 << 9)
#define SATA_SATAICTL_EDMAACT (1 << 16)
#define SATA_SATAICTL_CLEARSTAT (1 << 24)
#define SATA_SATAICTL_SRST (1 << 25)
#define SATA_SATAITC 0x348 /* Serial-ATA Interface Test Control */
#define SATA_SATAIS 0x34c /* Serial-ATA Interface Status */
#define SATA_VU 0x35c /* Vendor Unique */
#define SATA_FISC 0x360 /* FIS Configuration */
#define SATA_FISC_FISWAIT4RDYEN_B0 (1 << 0) /* Device to Host FIS */
#define SATA_FISC_FISWAIT4RDYEN_B1 (1 << 1) /* SDB FIS rcv with <N>bit 0 */
#define SATA_FISC_FISWAIT4RDYEN_B2 (1 << 2) /* DMA Activate FIS */
#define SATA_FISC_FISWAIT4RDYEN_B3 (1 << 3) /* DMA Setup FIS */
#define SATA_FISC_FISWAIT4RDYEN_B4 (1 << 4) /* Data FIS first DW */
#define SATA_FISC_FISWAIT4RDYEN_B5 (1 << 5) /* Data FIS entire FIS */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B0 (1 << 8)
/* Device to Host FIS with <ERR> or <DF> */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B1 (1 << 9) /* SDB FIS rcv with <N>bit */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B2 (1 << 10) /* SDB FIS rcv with <ERR> */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B3 (1 << 11) /* BIST Activate FIS */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B4 (1 << 12) /* PIO Setup FIS */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B5 (1 << 13) /* Data FIS with Link error */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B6 (1 << 14) /* Unrecognized FIS type */
#define SATA_FISC_FISWAIT4HOSTRDYEN_B7 (1 << 15) /* Any FIS */
#define SATA_FISC_FISDMAACTIVATESYNCRESP (1 << 16)
#define SATA_FISC_FISUNRECTYPECONT (1 << 17)
#define SATA_FISIC 0x364 /* FIS Interrupt Cause */
#define SATA_FISIM 0x368 /* FIS Interrupt Mask */
#define SATA_FISDW0 0x370 /* FIS DW0 */
#define SATA_FISDW1 0x374 /* FIS DW1 */
#define SATA_FISDW2 0x378 /* FIS DW2 */
#define SATA_FISDW3 0x37c /* FIS DW3 */
#define SATA_FISDW4 0x380 /* FIS DW4 */
#define SATA_FISDW5 0x384 /* FIS DW5 */
#define SATA_FISDW6 0x388 /* FIS DW6 */
#define SATA_PHYM9_GEN2 0x398
#define SATA_PHYM9_GEN1 0x39c
#define SATA_PHYCFG_OFS 0x3a0 /* 65nm SoCs only */
#define MVS_MAX_PORTS 8
#define MVS_MAX_SLOTS 32
/* Pessimistic prognosis on number of required S/G entries */
-#define MVS_SG_ENTRIES (btoc(MAXPHYS) + 1)
+#define MVS_SG_ENTRIES (btoc(maxphys) + 1)
/* EDMA Command Request Block (CRQB) Data */
struct mvs_crqb {
uint32_t cprdbl; /* cPRD Descriptor Table Base Low Address */
uint32_t cprdbh; /* cPRD Descriptor Table Base High Address */
uint16_t ctrlflg; /* Control Flags */
#define MVS_CRQB_READ 0x0001
#define MVS_CRQB_TAG_MASK 0x003e
#define MVS_CRQB_TAG_SHIFT 1
#define MVS_CRQB_PMP_MASK 0xf000
#define MVS_CRQB_PMP_SHIFT 12
uint8_t cmd[22];
} __packed;
struct mvs_crqb_gen2e {
uint32_t cprdbl; /* cPRD Descriptor Table Base Low Address */
uint32_t cprdbh; /* cPRD Descriptor Table Base High Address */
uint32_t ctrlflg; /* Control Flags */
#define MVS_CRQB2E_READ 0x00000001
#define MVS_CRQB2E_DTAG_MASK 0x0000003e
#define MVS_CRQB2E_DTAG_SHIFT 1
#define MVS_CRQB2E_PMP_MASK 0x0000f000
#define MVS_CRQB2E_PMP_SHIFT 12
#define MVS_CRQB2E_CPRD 0x00010000
#define MVS_CRQB2E_HTAG_MASK 0x003e0000
#define MVS_CRQB2E_HTAG_SHIFT 17
uint32_t drbc; /* Data Region Byte Count */
uint8_t cmd[16];
} __packed;
/* EDMA Physical Region Descriptors (ePRD) Table Data Structure */
struct mvs_eprd {
uint32_t prdbal; /* Address bits[31:1] */
uint32_t bytecount; /* Byte Count */
#define MVS_EPRD_MASK 0x0000ffff /* max 64KB */
#define MVS_EPRD_MAX (MVS_EPRD_MASK + 1)
#define MVS_EPRD_EOF 0x80000000
uint32_t prdbah; /* Address bits[63:32] */
uint32_t resv;
} __packed;
/* Command request blocks. 32 commands. First 1Kbyte aligned. */
#define MVS_CRQB_OFFSET 0
#define MVS_CRQB_SIZE 32 /* sizeof(struct mvs_crqb) */
#define MVS_CRQB_MASK 0x000003e0
#define MVS_CRQB_SHIFT 5
#define MVS_CRQB_TO_ADDR(slot) ((slot) << MVS_CRQB_SHIFT)
#define MVS_ADDR_TO_CRQB(addr) (((addr) & MVS_CRQB_MASK) >> MVS_CRQB_SHIFT)
/* ePRD blocks. Up to 32 commands, each 16-byte aligned. */
#define MVS_EPRD_OFFSET (MVS_CRQB_OFFSET + MVS_CRQB_SIZE * MVS_MAX_SLOTS)
#define MVS_EPRD_SIZE (MVS_SG_ENTRIES * 16) /* sizeof(struct mvs_eprd) */
/* Request work area. */
#define MVS_WORKRQ_SIZE (MVS_EPRD_OFFSET + MVS_EPRD_SIZE * MVS_MAX_SLOTS)
/* EDMA Command Response Block (CRPB) Data */
struct mvs_crpb {
uint16_t id; /* CRPB ID */
#define MVS_CRPB_TAG_MASK 0x001F
#define MVS_CRPB_TAG_SHIFT 0
uint16_t rspflg; /* CPRB Response Flags */
#define MVS_CRPB_EDMASTS_MASK 0x007F
#define MVS_CRPB_EDMASTS_SHIFT 0
#define MVS_CRPB_ATASTS_MASK 0xFF00
#define MVS_CRPB_ATASTS_SHIFT 8
uint32_t ts; /* CPRB Time Stamp */
} __packed;
/* Command response blocks. 32 commands. First 256byte aligned. */
#define MVS_CRPB_OFFSET 0
#define MVS_CRPB_SIZE sizeof(struct mvs_crpb)
#define MVS_CRPB_MASK 0x000000f8
#define MVS_CRPB_SHIFT 3
#define MVS_CRPB_TO_ADDR(slot) ((slot) << MVS_CRPB_SHIFT)
#define MVS_ADDR_TO_CRPB(addr) (((addr) & MVS_CRPB_MASK) >> MVS_CRPB_SHIFT)
/* Reply work area. */
#define MVS_WORKRP_SIZE (MVS_CRPB_OFFSET + MVS_CRPB_SIZE * MVS_MAX_SLOTS)
/* misc defines */
#define ATA_IRQ_RID 0
#define ATA_INTR_FLAGS (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
struct ata_dmaslot {
bus_dmamap_t data_map; /* Data DMA map */
bus_addr_t addr; /* Data address */
uint16_t len; /* Data size */
};
/* structure holding DMA related information */
struct mvs_dma {
bus_dma_tag_t workrq_tag; /* Request workspace DMA tag */
bus_dmamap_t workrq_map; /* Request workspace DMA map */
uint8_t *workrq; /* Request workspace */
bus_addr_t workrq_bus; /* Request bus address */
bus_dma_tag_t workrp_tag; /* Reply workspace DMA tag */
bus_dmamap_t workrp_map; /* Reply workspace DMA map */
uint8_t *workrp; /* Reply workspace */
bus_addr_t workrp_bus; /* Reply bus address */
bus_dma_tag_t data_tag; /* Data DMA tag */
};
enum mvs_slot_states {
MVS_SLOT_EMPTY,
MVS_SLOT_LOADING,
MVS_SLOT_RUNNING,
MVS_SLOT_EXECUTING
};
struct mvs_slot {
device_t dev; /* Device handle */
int slot; /* Number of this slot */
int tag; /* Used command tag */
enum mvs_slot_states state; /* Slot state */
+ u_int eprd_offset; /* EPRD offset */
union ccb *ccb; /* CCB occupying slot */
struct ata_dmaslot dma; /* DMA data of this slot */
struct callout timeout; /* Execution timeout */
};
struct mvs_device {
int revision;
int mode;
u_int bytecount;
u_int atapi;
u_int tags;
u_int caps;
};
enum mvs_edma_mode {
MVS_EDMA_UNKNOWN,
MVS_EDMA_OFF,
MVS_EDMA_ON,
MVS_EDMA_QUEUED,
MVS_EDMA_NCQ,
};
/* structure describing an ATA channel */
struct mvs_channel {
device_t dev; /* Device handle */
int unit; /* Physical channel */
struct resource *r_mem; /* Memory of this channel */
struct resource *r_irq; /* Interrupt of this channel */
void *ih; /* Interrupt handle */
struct mvs_dma dma; /* DMA data */
struct cam_sim *sim;
struct cam_path *path;
int quirks;
#define MVS_Q_GENI 1
#define MVS_Q_GENII 2
#define MVS_Q_GENIIE 4
#define MVS_Q_SOC 8
#define MVS_Q_CT 16
#define MVS_Q_SOC65 32
int pm_level; /* power management level */
struct mvs_slot slot[MVS_MAX_SLOTS];
union ccb *hold[MVS_MAX_SLOTS];
int holdtag[MVS_MAX_SLOTS]; /* Tags used for held commands. */
struct mtx mtx; /* state lock */
int devices; /* What is present */
int pm_present; /* PM presence reported */
enum mvs_edma_mode curr_mode; /* Current EDMA mode */
int fbs_enabled; /* FIS-based switching enabled */
uint32_t oslots; /* Occupied slots */
uint32_t otagspd[16]; /* Occupied device tags */
uint32_t rslots; /* Running slots */
uint32_t aslots; /* Slots with atomic commands */
uint32_t eslots; /* Slots in error */
uint32_t toslots; /* Slots in timeout */
int numrslots; /* Number of running slots */
int numrslotspd[16];/* Number of running slots per dev */
int numpslots; /* Number of PIO slots */
int numdslots; /* Number of DMA slots */
int numtslots; /* Number of NCQ slots */
int numtslotspd[16];/* Number of NCQ slots per dev */
int numhslots; /* Number of held slots */
int recoverycmd; /* Our READ LOG active */
int fatalerr; /* Fatal error happened */
int lastslot; /* Last used slot */
int taggedtarget; /* Last tagged target */
int resetting; /* Hard-reset in progress. */
int resetpolldiv; /* Hard-reset poll divider. */
int out_idx; /* Next written CRQB */
int in_idx; /* Next read CRPB */
u_int transfersize; /* PIO transfer size */
u_int donecount; /* PIO bytes sent/received */
u_int basic_dma; /* Basic DMA used for ATAPI */
u_int fake_busy; /* Fake busy bit after command submission */
union ccb *frozen; /* Frozen command */
struct callout pm_timer; /* Power management events */
struct callout reset_timer; /* Hard-reset timeout */
struct mvs_device user[16]; /* User-specified settings */
struct mvs_device curr[16]; /* Current settings */
};
/* structure describing a MVS controller */
struct mvs_controller {
device_t dev;
int r_rid;
struct resource *r_mem;
struct rman sc_iomem;
struct mvs_controller_irq {
struct resource *r_irq;
void *handle;
int r_irq_rid;
} irq;
int quirks;
int channels;
int ccc; /* CCC timeout */
int cccc; /* CCC commands */
struct mtx mtx; /* MIM access lock */
int gmim; /* Globally wanted MIM bits */
int pmim; /* Port wanted MIM bits */
int mim; /* Current MIM bits */
int msi; /* MSI enabled */
int msia; /* MSI active */
struct {
void (*function)(void *);
void *argument;
} interrupt[MVS_MAX_PORTS];
};
enum mvs_err_type {
MVS_ERR_NONE, /* No error */
MVS_ERR_INVALID, /* Error detected by us before submitting. */
MVS_ERR_INNOCENT, /* Innocent victim. */
MVS_ERR_TFE, /* Task File Error. */
MVS_ERR_SATA, /* SATA error. */
MVS_ERR_TIMEOUT, /* Command execution timeout. */
MVS_ERR_NCQ, /* NCQ command error. CCB should be put on hold
* until READ LOG executed to reveal error. */
};
struct mvs_intr_arg {
void *arg;
u_int cause;
};
extern devclass_t mvs_devclass;
/* macros to hide bus space ugliness */
#define ATA_INB(res, offset) \
bus_read_1((res), (offset))
#define ATA_INW(res, offset) \
bus_read_2((res), (offset))
#define ATA_INL(res, offset) \
bus_read_4((res), (offset))
#define ATA_INSW(res, offset, addr, count) \
bus_read_multi_2((res), (offset), (addr), (count))
#define ATA_INSW_STRM(res, offset, addr, count) \
bus_read_multi_stream_2((res), (offset), (addr), (count))
#define ATA_INSL(res, offset, addr, count) \
bus_read_multi_4((res), (offset), (addr), (count))
#define ATA_INSL_STRM(res, offset, addr, count) \
bus_read_multi_stream_4((res), (offset), (addr), (count))
#define ATA_OUTB(res, offset, value) \
bus_write_1((res), (offset), (value))
#define ATA_OUTW(res, offset, value) \
bus_write_2((res), (offset), (value))
#define ATA_OUTL(res, offset, value) \
bus_write_4((res), (offset), (value))
#define ATA_OUTSW(res, offset, addr, count) \
bus_write_multi_2((res), (offset), (addr), (count))
#define ATA_OUTSW_STRM(res, offset, addr, count) \
bus_write_multi_stream_2((res), (offset), (addr), (count))
#define ATA_OUTSL(res, offset, addr, count) \
bus_write_multi_4((res), (offset), (addr), (count))
#define ATA_OUTSL_STRM(res, offset, addr, count) \
bus_write_multi_stream_4((res), (offset), (addr), (count))
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index 50c81d47d663..a2aaed355b15 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -1,1986 +1,1986 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (C) 2012-2013 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __NVME_H__
#define __NVME_H__
#ifdef _KERNEL
#include <sys/types.h>
#endif
#include <sys/param.h>
#include <sys/endian.h>
#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
#define NVME_RESET_CONTROLLER _IO('n', 1)
#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid)
#define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t)
#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
/*
* Macros to deal with NVME revisions, as defined in the VS register
*/
#define NVME_REV(x, y) (((x) << 16) | ((y) << 8))
#define NVME_MAJOR(r) (((r) >> 16) & 0xffff)
#define NVME_MINOR(r) (((r) >> 8) & 0xff)
/*
* Use to mark a command to apply to all namespaces, or to retrieve global
* log pages.
*/
#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
/* Cap nvme to 1MB transfers; the driver explodes with larger sizes */
-#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20))
+#define NVME_MAX_XFER_SIZE (maxphys < (1<<20) ? maxphys : (1<<20))
/* Register field definitions */
#define NVME_CAP_LO_REG_MQES_SHIFT (0)
#define NVME_CAP_LO_REG_MQES_MASK (0xFFFF)
#define NVME_CAP_LO_REG_CQR_SHIFT (16)
#define NVME_CAP_LO_REG_CQR_MASK (0x1)
#define NVME_CAP_LO_REG_AMS_SHIFT (17)
#define NVME_CAP_LO_REG_AMS_MASK (0x3)
#define NVME_CAP_LO_REG_TO_SHIFT (24)
#define NVME_CAP_LO_REG_TO_MASK (0xFF)
#define NVME_CAP_LO_MQES(x) \
(((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK)
#define NVME_CAP_LO_CQR(x) \
(((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK)
#define NVME_CAP_LO_AMS(x) \
(((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK)
#define NVME_CAP_LO_TO(x) \
(((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK)
#define NVME_CAP_HI_REG_DSTRD_SHIFT (0)
#define NVME_CAP_HI_REG_DSTRD_MASK (0xF)
#define NVME_CAP_HI_REG_NSSRS_SHIFT (4)
#define NVME_CAP_HI_REG_NSSRS_MASK (0x1)
#define NVME_CAP_HI_REG_CSS_SHIFT (5)
#define NVME_CAP_HI_REG_CSS_MASK (0xff)
#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5)
#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1)
#define NVME_CAP_HI_REG_BPS_SHIFT (13)
#define NVME_CAP_HI_REG_BPS_MASK (0x1)
#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16)
#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF)
#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20)
#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF)
#define NVME_CAP_HI_REG_PMRS_SHIFT (24)
#define NVME_CAP_HI_REG_PMRS_MASK (0x1)
#define NVME_CAP_HI_REG_CMBS_SHIFT (25)
#define NVME_CAP_HI_REG_CMBS_MASK (0x1)
#define NVME_CAP_HI_DSTRD(x) \
(((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK)
#define NVME_CAP_HI_NSSRS(x) \
(((x) >> NVME_CAP_HI_REG_NSSRS_SHIFT) & NVME_CAP_HI_REG_NSSRS_MASK)
#define NVME_CAP_HI_CSS(x) \
(((x) >> NVME_CAP_HI_REG_CSS_SHIFT) & NVME_CAP_HI_REG_CSS_MASK)
#define NVME_CAP_HI_CSS_NVM(x) \
(((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK)
#define NVME_CAP_HI_BPS(x) \
(((x) >> NVME_CAP_HI_REG_BPS_SHIFT) & NVME_CAP_HI_REG_BPS_MASK)
#define NVME_CAP_HI_MPSMIN(x) \
(((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK)
#define NVME_CAP_HI_MPSMAX(x) \
(((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK)
#define NVME_CAP_HI_PMRS(x) \
(((x) >> NVME_CAP_HI_REG_PMRS_SHIFT) & NVME_CAP_HI_REG_PMRS_MASK)
#define NVME_CAP_HI_CMBS(x) \
(((x) >> NVME_CAP_HI_REG_CMBS_SHIFT) & NVME_CAP_HI_REG_CMBS_MASK)
#define NVME_CC_REG_EN_SHIFT (0)
#define NVME_CC_REG_EN_MASK (0x1)
#define NVME_CC_REG_CSS_SHIFT (4)
#define NVME_CC_REG_CSS_MASK (0x7)
#define NVME_CC_REG_MPS_SHIFT (7)
#define NVME_CC_REG_MPS_MASK (0xF)
#define NVME_CC_REG_AMS_SHIFT (11)
#define NVME_CC_REG_AMS_MASK (0x7)
#define NVME_CC_REG_SHN_SHIFT (14)
#define NVME_CC_REG_SHN_MASK (0x3)
#define NVME_CC_REG_IOSQES_SHIFT (16)
#define NVME_CC_REG_IOSQES_MASK (0xF)
#define NVME_CC_REG_IOCQES_SHIFT (20)
#define NVME_CC_REG_IOCQES_MASK (0xF)
#define NVME_CSTS_REG_RDY_SHIFT (0)
#define NVME_CSTS_REG_RDY_MASK (0x1)
#define NVME_CSTS_REG_CFS_SHIFT (1)
#define NVME_CSTS_REG_CFS_MASK (0x1)
#define NVME_CSTS_REG_SHST_SHIFT (2)
#define NVME_CSTS_REG_SHST_MASK (0x3)
#define NVME_CSTS_REG_NVSRO_SHIFT (4)
#define NVME_CSTS_REG_NVSRO_MASK (0x1)
#define NVME_CSTS_REG_PP_SHIFT (5)
#define NVME_CSTS_REG_PP_MASK (0x1)
#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
#define NVME_AQA_REG_ASQS_SHIFT (0)
#define NVME_AQA_REG_ASQS_MASK (0xFFF)
#define NVME_AQA_REG_ACQS_SHIFT (16)
#define NVME_AQA_REG_ACQS_MASK (0xFFF)
#define NVME_PMRCAP_REG_RDS_SHIFT (3)
#define NVME_PMRCAP_REG_RDS_MASK (0x1)
#define NVME_PMRCAP_REG_WDS_SHIFT (4)
#define NVME_PMRCAP_REG_WDS_MASK (0x1)
#define NVME_PMRCAP_REG_BIR_SHIFT (5)
#define NVME_PMRCAP_REG_BIR_MASK (0x7)
#define NVME_PMRCAP_REG_PMRTU_SHIFT (8)
#define NVME_PMRCAP_REG_PMRTU_MASK (0x3)
#define NVME_PMRCAP_REG_PMRWBM_SHIFT (10)
#define NVME_PMRCAP_REG_PMRWBM_MASK (0xf)
#define NVME_PMRCAP_REG_PMRTO_SHIFT (16)
#define NVME_PMRCAP_REG_PMRTO_MASK (0xff)
#define NVME_PMRCAP_REG_CMSS_SHIFT (24)
#define NVME_PMRCAP_REG_CMSS_MASK (0x1)
#define NVME_PMRCAP_RDS(x) \
(((x) >> NVME_PMRCAP_REG_RDS_SHIFT) & NVME_PMRCAP_REG_RDS_MASK)
#define NVME_PMRCAP_WDS(x) \
(((x) >> NVME_PMRCAP_REG_WDS_SHIFT) & NVME_PMRCAP_REG_WDS_MASK)
#define NVME_PMRCAP_BIR(x) \
(((x) >> NVME_PMRCAP_REG_BIR_SHIFT) & NVME_PMRCAP_REG_BIR_MASK)
#define NVME_PMRCAP_PMRTU(x) \
(((x) >> NVME_PMRCAP_REG_PMRTU_SHIFT) & NVME_PMRCAP_REG_PMRTU_MASK)
#define NVME_PMRCAP_PMRWBM(x) \
(((x) >> NVME_PMRCAP_REG_PMRWBM_SHIFT) & NVME_PMRCAP_REG_PMRWBM_MASK)
#define NVME_PMRCAP_PMRTO(x) \
(((x) >> NVME_PMRCAP_REG_PMRTO_SHIFT) & NVME_PMRCAP_REG_PMRTO_MASK)
#define NVME_PMRCAP_CMSS(x) \
(((x) >> NVME_PMRCAP_REG_CMSS_SHIFT) & NVME_PMRCAP_REG_CMSS_MASK)
/* Command field definitions */
#define NVME_CMD_FUSE_SHIFT (8)
#define NVME_CMD_FUSE_MASK (0x3)
#define NVME_STATUS_P_SHIFT (0)
#define NVME_STATUS_P_MASK (0x1)
#define NVME_STATUS_SC_SHIFT (1)
#define NVME_STATUS_SC_MASK (0xFF)
#define NVME_STATUS_SCT_SHIFT (9)
#define NVME_STATUS_SCT_MASK (0x7)
#define NVME_STATUS_CRD_SHIFT (12)
#define NVME_STATUS_CRD_MASK (0x3)
#define NVME_STATUS_M_SHIFT (14)
#define NVME_STATUS_M_MASK (0x1)
#define NVME_STATUS_DNR_SHIFT (15)
#define NVME_STATUS_DNR_MASK (0x1)
#define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK)
#define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)
#define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK)
#define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK)
#define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK)
#define NVME_PWR_ST_MPS_SHIFT (0)
#define NVME_PWR_ST_MPS_MASK (0x1)
#define NVME_PWR_ST_NOPS_SHIFT (1)
#define NVME_PWR_ST_NOPS_MASK (0x1)
#define NVME_PWR_ST_RRT_SHIFT (0)
#define NVME_PWR_ST_RRT_MASK (0x1F)
#define NVME_PWR_ST_RRL_SHIFT (0)
#define NVME_PWR_ST_RRL_MASK (0x1F)
#define NVME_PWR_ST_RWT_SHIFT (0)
#define NVME_PWR_ST_RWT_MASK (0x1F)
#define NVME_PWR_ST_RWL_SHIFT (0)
#define NVME_PWR_ST_RWL_MASK (0x1F)
#define NVME_PWR_ST_IPS_SHIFT (6)
#define NVME_PWR_ST_IPS_MASK (0x3)
#define NVME_PWR_ST_APW_SHIFT (0)
#define NVME_PWR_ST_APW_MASK (0x7)
#define NVME_PWR_ST_APS_SHIFT (6)
#define NVME_PWR_ST_APS_MASK (0x3)
/** Controller Multi-path I/O and Namespace Sharing Capabilities */
/* More than one port */
#define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0)
#define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1)
/* More than one controller */
#define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1)
#define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1)
/* SR-IOV Virtual Function */
#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2)
#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1)
/* Asymmetric Namespace Access Reporting */
#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3)
#define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1)
/** OACS - optional admin command support */
/* supports security send/receive commands */
#define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0)
#define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1)
/* supports format nvm command */
#define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1)
#define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1)
/* supports firmware activate/download commands */
#define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2)
#define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1)
/* supports namespace management commands */
#define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3)
#define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1)
/* supports Device Self-test command */
#define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4)
#define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1)
/* supports Directives */
#define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5)
#define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1)
/* supports NVMe-MI Send/Receive */
#define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6)
#define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1)
/* supports Virtualization Management */
#define NVME_CTRLR_DATA_OACS_VM_SHIFT (7)
#define NVME_CTRLR_DATA_OACS_VM_MASK (0x1)
/* supports Doorbell Buffer Config */
#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8)
#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1)
/* supports Get LBA Status */
#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9)
#define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1)
/** firmware updates */
/* first slot is read-only */
#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0)
#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1)
/* number of firmware slots */
#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1)
#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7)
/* firmware activation without reset */
#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4)
#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1)
/** log page attributes */
/* per namespace smart/health log page */
#define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0)
#define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1)
/** AVSCC - admin vendor specific command configuration */
/* admin vendor specific commands use spec format */
#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0)
#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1)
/** Autonomous Power State Transition Attributes */
/* Autonomous Power State Transitions supported */
#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0)
#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1)
/** Sanitize Capabilities */
/* Crypto Erase Support */
#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0)
#define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1)
/* Block Erase Support */
#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1)
#define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1)
/* Overwrite Support */
#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2)
#define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1)
/* No-Deallocate Inhibited */
#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29)
#define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1)
/* No-Deallocate Modifies Media After Sanitize */
#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30)
#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3)
#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0)
#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1)
#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2)
/** submission queue entry size */
#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0)
#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF)
#define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4)
#define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF)
/** completion queue entry size */
#define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0)
#define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF)
#define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4)
#define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF)
/** optional nvm command support */
#define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0)
#define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1)
#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2)
#define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3)
#define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4)
#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5)
#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6)
#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7)
#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1)
/** Fused Operation Support */
#define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0)
#define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1)
/** Format NVM Attributes */
#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0)
#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1)
#define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1)
#define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1)
#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2)
#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1)
/** volatile write cache */
/* volatile write cache present */
#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0)
#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1)
/* flush all namespaces supported */
#define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1)
#define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3)
#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0)
#define NVME_CTRLR_DATA_VWC_ALL_NO (2)
#define NVME_CTRLR_DATA_VWC_ALL_YES (3)
/** namespace features */
/* thin provisioning */
#define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0)
#define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1)
/* NAWUN, NAWUPF, and NACWU fields are valid */
#define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1)
#define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1)
/* Deallocated or Unwritten Logical Block errors supported */
#define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2)
#define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1)
/* NGUID and EUI64 fields are not reusable */
#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3)
#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1)
/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */
#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4)
#define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1)
/** formatted lba size */
#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0)
#define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF)
#define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4)
#define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1)
/** metadata capabilities */
/* metadata can be transferred as part of data prp list */
#define NVME_NS_DATA_MC_EXTENDED_SHIFT (0)
#define NVME_NS_DATA_MC_EXTENDED_MASK (0x1)
/* metadata can be transferred with separate metadata pointer */
#define NVME_NS_DATA_MC_POINTER_SHIFT (1)
#define NVME_NS_DATA_MC_POINTER_MASK (0x1)
/** end-to-end data protection capabilities */
/* protection information type 1 */
#define NVME_NS_DATA_DPC_PIT1_SHIFT (0)
#define NVME_NS_DATA_DPC_PIT1_MASK (0x1)
/* protection information type 2 */
#define NVME_NS_DATA_DPC_PIT2_SHIFT (1)
#define NVME_NS_DATA_DPC_PIT2_MASK (0x1)
/* protection information type 3 */
#define NVME_NS_DATA_DPC_PIT3_SHIFT (2)
#define NVME_NS_DATA_DPC_PIT3_MASK (0x1)
/* first eight bytes of metadata */
#define NVME_NS_DATA_DPC_MD_START_SHIFT (3)
#define NVME_NS_DATA_DPC_MD_START_MASK (0x1)
/* last eight bytes of metadata */
#define NVME_NS_DATA_DPC_MD_END_SHIFT (4)
#define NVME_NS_DATA_DPC_MD_END_MASK (0x1)
/** end-to-end data protection type settings */
/* protection information type */
#define NVME_NS_DATA_DPS_PIT_SHIFT (0)
#define NVME_NS_DATA_DPS_PIT_MASK (0x7)
/* 1 == protection info transferred at start of metadata */
/* 0 == protection info transferred at end of metadata */
#define NVME_NS_DATA_DPS_MD_START_SHIFT (3)
#define NVME_NS_DATA_DPS_MD_START_MASK (0x1)
/** Namespace Multi-path I/O and Namespace Sharing Capabilities */
/* the namespace may be attached to two or more controllers */
#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0)
#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1)
/** Reservation Capabilities */
/* Persist Through Power Loss */
#define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0)
#define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1)
/* supports the Write Exclusive */
#define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1)
#define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1)
/* supports the Exclusive Access */
#define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2)
#define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1)
/* supports the Write Exclusive – Registrants Only */
#define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3)
#define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1)
/* supports the Exclusive Access - Registrants Only */
#define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4)
#define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1)
/* supports the Write Exclusive – All Registrants */
#define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5)
#define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1)
/* supports the Exclusive Access - All Registrants */
#define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6)
#define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1)
/* Ignore Existing Key is used as defined in revision 1.3 or later */
#define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7)
#define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1)
/** Format Progress Indicator */
/* percentage of the Format NVM command that remains to be completed */
#define NVME_NS_DATA_FPI_PERC_SHIFT (0)
#define NVME_NS_DATA_FPI_PERC_MASK (0x7f)
/* namespace supports the Format Progress Indicator */
#define NVME_NS_DATA_FPI_SUPP_SHIFT (7)
#define NVME_NS_DATA_FPI_SUPP_MASK (0x1)
/** Deallocate Logical Block Features */
/* deallocated logical block read behavior */
#define NVME_NS_DATA_DLFEAT_READ_SHIFT (0)
#define NVME_NS_DATA_DLFEAT_READ_MASK (0x07)
#define NVME_NS_DATA_DLFEAT_READ_NR (0x00)
#define NVME_NS_DATA_DLFEAT_READ_00 (0x01)
#define NVME_NS_DATA_DLFEAT_READ_FF (0x02)
/* supports the Deallocate bit in the Write Zeroes command */
#define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3)
#define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01)
/* Guard field for deallocated logical blocks is set to the CRC */
#define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4)
#define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01)
/** lba format support */
/* metadata size */
#define NVME_NS_DATA_LBAF_MS_SHIFT (0)
#define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF)
/* lba data size */
#define NVME_NS_DATA_LBAF_LBADS_SHIFT (16)
#define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF)
/* relative performance */
#define NVME_NS_DATA_LBAF_RP_SHIFT (24)
#define NVME_NS_DATA_LBAF_RP_MASK (0x3)
enum nvme_critical_warning_state {
NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1,
NVME_CRIT_WARN_ST_TEMPERATURE = 0x2,
NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4,
NVME_CRIT_WARN_ST_READ_ONLY = 0x8,
NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10,
};
#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0)
#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100)
#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200)
/* slot for current FW */
#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0)
#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7)
/* Commands Supported and Effects */
#define NVME_CE_PAGE_CSUP_SHIFT (0)
#define NVME_CE_PAGE_CSUP_MASK (0x1)
#define NVME_CE_PAGE_LBCC_SHIFT (1)
#define NVME_CE_PAGE_LBCC_MASK (0x1)
#define NVME_CE_PAGE_NCC_SHIFT (2)
#define NVME_CE_PAGE_NCC_MASK (0x1)
#define NVME_CE_PAGE_NIC_SHIFT (3)
#define NVME_CE_PAGE_NIC_MASK (0x1)
#define NVME_CE_PAGE_CCC_SHIFT (4)
#define NVME_CE_PAGE_CCC_MASK (0x1)
#define NVME_CE_PAGE_CSE_SHIFT (16)
#define NVME_CE_PAGE_CSE_MASK (0x7)
#define NVME_CE_PAGE_UUID_SHIFT (19)
#define NVME_CE_PAGE_UUID_MASK (0x1)
/* Sanitize Status */
#define NVME_SS_PAGE_SSTAT_STATUS_SHIFT (0)
#define NVME_SS_PAGE_SSTAT_STATUS_MASK (0x7)
#define NVME_SS_PAGE_SSTAT_STATUS_NEVER (0)
#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETED (1)
#define NVME_SS_PAGE_SSTAT_STATUS_INPROG (2)
#define NVME_SS_PAGE_SSTAT_STATUS_FAILED (3)
#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETEDWD (4)
#define NVME_SS_PAGE_SSTAT_PASSES_SHIFT (3)
#define NVME_SS_PAGE_SSTAT_PASSES_MASK (0x1f)
#define NVME_SS_PAGE_SSTAT_GDE_SHIFT (8)
#define NVME_SS_PAGE_SSTAT_GDE_MASK (0x1)
/* CC register SHN field values */
enum shn_value {
NVME_SHN_NORMAL = 0x1,
NVME_SHN_ABRUPT = 0x2,
};
/* CSTS register SHST field values */
enum shst_value {
NVME_SHST_NORMAL = 0x0,
NVME_SHST_OCCURRING = 0x1,
NVME_SHST_COMPLETE = 0x2,
};
struct nvme_registers
{
uint32_t cap_lo; /* controller capabilities */
uint32_t cap_hi;
uint32_t vs; /* version */
uint32_t intms; /* interrupt mask set */
uint32_t intmc; /* interrupt mask clear */
uint32_t cc; /* controller configuration */
uint32_t reserved1;
uint32_t csts; /* controller status */
uint32_t nssr; /* NVM Subsystem Reset */
uint32_t aqa; /* admin queue attributes */
uint64_t asq; /* admin submission queue base addr */
uint64_t acq; /* admin completion queue base addr */
uint32_t cmbloc; /* Controller Memory Buffer Location */
uint32_t cmbsz; /* Controller Memory Buffer Size */
uint32_t bpinfo; /* Boot Partition Information */
uint32_t bprsel; /* Boot Partition Read Select */
uint64_t bpmbl; /* Boot Partition Memory Buffer Location */
uint64_t cmbmsc; /* Controller Memory Buffer Memory Space Control */
uint32_t cmbsts; /* Controller Memory Buffer Status */
uint8_t reserved3[3492]; /* 5Ch - DFFh */
uint32_t pmrcap; /* Persistent Memory Capabilities */
uint32_t pmrctl; /* Persistent Memory Region Control */
uint32_t pmrsts; /* Persistent Memory Region Status */
uint32_t pmrebs; /* Persistent Memory Region Elasticity Buffer Size */
uint32_t pmrswtp; /* Persistent Memory Region Sustained Write Throughput */
uint32_t pmrmsc_lo; /* Persistent Memory Region Controller Memory Space Control */
uint32_t pmrmsc_hi;
uint8_t reserved4[484]; /* E1Ch - FFFh */
struct {
uint32_t sq_tdbl; /* submission queue tail doorbell */
uint32_t cq_hdbl; /* completion queue head doorbell */
} doorbell[1] __packed;
} __packed;
_Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers");
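/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * the doorbell array above begins at register offset 0x1000 and each doorbell
 * is spaced 4 << CAP.DSTRD bytes apart, so a queue's submission (is_cq == 0)
 * or completion (is_cq == 1) doorbell offset can be derived from CAP.
 */
static inline uint32_t
nvme_doorbell_offset(uint32_t cap_hi, uint32_t qid, uint32_t is_cq)
{
	uint32_t stride = 4 << NVME_CAP_HI_DSTRD(cap_hi);

	return (0x1000 + (2 * qid + is_cq) * stride);
}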
struct nvme_command
{
/* dword 0 */
uint8_t opc; /* opcode */
uint8_t fuse; /* fused operation */
uint16_t cid; /* command identifier */
/* dword 1 */
uint32_t nsid; /* namespace identifier */
/* dword 2-3 */
uint32_t rsvd2;
uint32_t rsvd3;
/* dword 4-5 */
uint64_t mptr; /* metadata pointer */
/* dword 6-7 */
uint64_t prp1; /* prp entry 1 */
/* dword 8-9 */
uint64_t prp2; /* prp entry 2 */
/* dword 10-15 */
uint32_t cdw10; /* command-specific */
uint32_t cdw11; /* command-specific */
uint32_t cdw12; /* command-specific */
uint32_t cdw13; /* command-specific */
uint32_t cdw14; /* command-specific */
uint32_t cdw15; /* command-specific */
} __packed;
_Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
struct nvme_completion {
/* dword 0 */
uint32_t cdw0; /* command-specific */
/* dword 1 */
uint32_t rsvd1;
/* dword 2 */
uint16_t sqhd; /* submission queue head pointer */
uint16_t sqid; /* submission queue identifier */
/* dword 3 */
uint16_t cid; /* command identifier */
uint16_t status;
} __packed;
_Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
struct nvme_dsm_range {
uint32_t attributes;
uint32_t length;
uint64_t starting_lba;
} __packed;
/* Largest DSM Trim that can be done */
#define NVME_MAX_DSM_TRIM 4096
_Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_range");
/* status code types */
enum nvme_status_code_type {
NVME_SCT_GENERIC = 0x0,
NVME_SCT_COMMAND_SPECIFIC = 0x1,
NVME_SCT_MEDIA_ERROR = 0x2,
NVME_SCT_PATH_RELATED = 0x3,
/* 0x4-0x6 - reserved */
NVME_SCT_VENDOR_SPECIFIC = 0x7,
};
/* generic command status codes */
enum nvme_generic_command_status_code {
NVME_SC_SUCCESS = 0x00,
NVME_SC_INVALID_OPCODE = 0x01,
NVME_SC_INVALID_FIELD = 0x02,
NVME_SC_COMMAND_ID_CONFLICT = 0x03,
NVME_SC_DATA_TRANSFER_ERROR = 0x04,
NVME_SC_ABORTED_POWER_LOSS = 0x05,
NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
NVME_SC_ABORTED_BY_REQUEST = 0x07,
NVME_SC_ABORTED_SQ_DELETION = 0x08,
NVME_SC_ABORTED_FAILED_FUSED = 0x09,
NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d,
NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e,
NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f,
NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10,
NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11,
NVME_SC_INVALID_USE_OF_CMB = 0x12,
NVME_SC_PRP_OFFET_INVALID = 0x13,
NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14,
NVME_SC_OPERATION_DENIED = 0x15,
NVME_SC_SGL_OFFSET_INVALID = 0x16,
/* 0x17 - reserved */
NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18,
NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19,
NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a,
NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b,
NVME_SC_SANITIZE_FAILED = 0x1c,
NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
NVME_SC_NAMESPACE_IS_WRITE_PROTECTED = 0x20,
NVME_SC_COMMAND_INTERRUPTED = 0x21,
NVME_SC_TRANSIENT_TRANSPORT_ERROR = 0x22,
NVME_SC_LBA_OUT_OF_RANGE = 0x80,
NVME_SC_CAPACITY_EXCEEDED = 0x81,
NVME_SC_NAMESPACE_NOT_READY = 0x82,
NVME_SC_RESERVATION_CONFLICT = 0x83,
NVME_SC_FORMAT_IN_PROGRESS = 0x84,
};
/* command specific status codes */
enum nvme_command_specific_status_code {
NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
/* 0x04 - reserved */
NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
NVME_SC_INVALID_LOG_PAGE = 0x09,
NVME_SC_INVALID_FORMAT = 0x0a,
NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
NVME_SC_INVALID_QUEUE_DELETION = 0x0c,
NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d,
NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e,
NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f,
NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10,
NVME_SC_FW_ACT_REQUIRES_RESET = 0x11,
NVME_SC_FW_ACT_REQUIRES_TIME = 0x12,
NVME_SC_FW_ACT_PROHIBITED = 0x13,
NVME_SC_OVERLAPPING_RANGE = 0x14,
NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15,
NVME_SC_NS_ID_UNAVAILABLE = 0x16,
/* 0x17 - reserved */
NVME_SC_NS_ALREADY_ATTACHED = 0x18,
NVME_SC_NS_IS_PRIVATE = 0x19,
NVME_SC_NS_NOT_ATTACHED = 0x1a,
NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
NVME_SC_CTRLR_LIST_INVALID = 0x1c,
NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
NVME_SC_INVALID_CTRLR_ID = 0x1f,
NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
NVME_SC_INVALID_RESOURCE_ID = 0x22,
NVME_SC_SANITIZE_PROHIBITED_WPMRE = 0x23,
NVME_SC_ANA_GROUP_ID_INVALID = 0x24,
NVME_SC_ANA_ATTACH_FAILED = 0x25,
NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
NVME_SC_INVALID_PROTECTION_INFO = 0x81,
NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
};
/* media error status codes */
enum nvme_media_error_status_code {
NVME_SC_WRITE_FAULTS = 0x80,
NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
NVME_SC_GUARD_CHECK_ERROR = 0x82,
NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
NVME_SC_COMPARE_FAILURE = 0x85,
NVME_SC_ACCESS_DENIED = 0x86,
NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
};
/* path related status codes */
enum nvme_path_related_status_code {
NVME_SC_INTERNAL_PATH_ERROR = 0x00,
NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01,
NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02,
NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03,
NVME_SC_CONTROLLER_PATHING_ERROR = 0x60,
NVME_SC_HOST_PATHING_ERROR = 0x70,
NVME_SC_COMMAND_ABOTHED_BY_HOST = 0x71,
};
/* admin opcodes */
enum nvme_admin_opcode {
NVME_OPC_DELETE_IO_SQ = 0x00,
NVME_OPC_CREATE_IO_SQ = 0x01,
NVME_OPC_GET_LOG_PAGE = 0x02,
/* 0x03 - reserved */
NVME_OPC_DELETE_IO_CQ = 0x04,
NVME_OPC_CREATE_IO_CQ = 0x05,
NVME_OPC_IDENTIFY = 0x06,
/* 0x07 - reserved */
NVME_OPC_ABORT = 0x08,
NVME_OPC_SET_FEATURES = 0x09,
NVME_OPC_GET_FEATURES = 0x0a,
/* 0x0b - reserved */
NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d,
/* 0x0e-0x0f - reserved */
NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
/* 0x12-0x13 - reserved */
NVME_OPC_DEVICE_SELF_TEST = 0x14,
NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
/* 0x16-0x17 - reserved */
NVME_OPC_KEEP_ALIVE = 0x18,
NVME_OPC_DIRECTIVE_SEND = 0x19,
NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
/* 0x1b - reserved */
NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
NVME_OPC_NVME_MI_SEND = 0x1d,
NVME_OPC_NVME_MI_RECEIVE = 0x1e,
/* 0x1f-0x7b - reserved */
NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
NVME_OPC_FORMAT_NVM = 0x80,
NVME_OPC_SECURITY_SEND = 0x81,
NVME_OPC_SECURITY_RECEIVE = 0x82,
/* 0x83 - reserved */
NVME_OPC_SANITIZE = 0x84,
/* 0x85 - reserved */
NVME_OPC_GET_LBA_STATUS = 0x86,
};
/* nvme nvm opcodes */
enum nvme_nvm_opcode {
NVME_OPC_FLUSH = 0x00,
NVME_OPC_WRITE = 0x01,
NVME_OPC_READ = 0x02,
/* 0x03 - reserved */
NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
NVME_OPC_COMPARE = 0x05,
/* 0x06-0x07 - reserved */
NVME_OPC_WRITE_ZEROES = 0x08,
NVME_OPC_DATASET_MANAGEMENT = 0x09,
/* 0x0a-0x0b - reserved */
NVME_OPC_VERIFY = 0x0c,
NVME_OPC_RESERVATION_REGISTER = 0x0d,
NVME_OPC_RESERVATION_REPORT = 0x0e,
/* 0x0f-0x10 - reserved */
NVME_OPC_RESERVATION_ACQUIRE = 0x11,
/* 0x12-0x14 - reserved */
NVME_OPC_RESERVATION_RELEASE = 0x15,
};
enum nvme_feature {
/* 0x00 - reserved */
NVME_FEAT_ARBITRATION = 0x01,
NVME_FEAT_POWER_MANAGEMENT = 0x02,
NVME_FEAT_LBA_RANGE_TYPE = 0x03,
NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
NVME_FEAT_ERROR_RECOVERY = 0x05,
NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
NVME_FEAT_INTERRUPT_COALESCING = 0x08,
NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
NVME_FEAT_WRITE_ATOMICITY = 0x0A,
NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D,
NVME_FEAT_TIMESTAMP = 0x0E,
NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG = 0x12,
NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG = 0x13,
NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW = 0x14,
NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES = 0x15,
NVME_FEAT_HOST_BEHAVIOR_SUPPORT = 0x16,
NVME_FEAT_SANITIZE_CONFIG = 0x17,
NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION = 0x18,
/* 0x19-0x77 - reserved */
/* 0x78-0x7f - NVMe Management Interface */
NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
NVME_FEAT_HOST_IDENTIFIER = 0x81,
NVME_FEAT_RESERVATION_NOTIFICATION_MASK = 0x82,
NVME_FEAT_RESERVATION_PERSISTENCE = 0x83,
NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG = 0x84,
/* 0x85-0xBF - command set specific (reserved) */
/* 0xC0-0xFF - vendor specific */
};
enum nvme_dsm_attribute {
NVME_DSM_ATTR_INTEGRAL_READ = 0x1,
NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2,
NVME_DSM_ATTR_DEALLOCATE = 0x4,
};
enum nvme_activate_action {
NVME_AA_REPLACE_NO_ACTIVATE = 0x0,
NVME_AA_REPLACE_ACTIVATE = 0x1,
NVME_AA_ACTIVATE = 0x2,
};
struct nvme_power_state {
/** Maximum Power */
uint16_t mp; /* Maximum Power */
uint8_t ps_rsvd1;
uint8_t mps_nops; /* Max Power Scale, Non-Operational State */
uint32_t enlat; /* Entry Latency */
uint32_t exlat; /* Exit Latency */
uint8_t rrt; /* Relative Read Throughput */
uint8_t rrl; /* Relative Read Latency */
uint8_t rwt; /* Relative Write Throughput */
uint8_t rwl; /* Relative Write Latency */
uint16_t idlp; /* Idle Power */
uint8_t ips; /* Idle Power Scale */
uint8_t ps_rsvd8;
uint16_t actp; /* Active Power */
uint8_t apw_aps; /* Active Power Workload, Active Power Scale */
uint8_t ps_rsvd10[9];
} __packed;
_Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state");
#define NVME_SERIAL_NUMBER_LENGTH 20
#define NVME_MODEL_NUMBER_LENGTH 40
#define NVME_FIRMWARE_REVISION_LENGTH 8
struct nvme_controller_data {
/* bytes 0-255: controller capabilities and features */
/** pci vendor id */
uint16_t vid;
/** pci subsystem vendor id */
uint16_t ssvid;
/** serial number */
uint8_t sn[NVME_SERIAL_NUMBER_LENGTH];
/** model number */
uint8_t mn[NVME_MODEL_NUMBER_LENGTH];
/** firmware revision */
uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH];
/** recommended arbitration burst */
uint8_t rab;
/** ieee oui identifier */
uint8_t ieee[3];
/** multi-interface capabilities */
uint8_t mic;
/** maximum data transfer size */
uint8_t mdts;
/** Controller ID */
uint16_t ctrlr_id;
/** Version */
uint32_t ver;
/** RTD3 Resume Latency */
uint32_t rtd3r;
/** RTD3 Enter Latency */
uint32_t rtd3e;
/** Optional Asynchronous Events Supported */
uint32_t oaes; /* bitfield really */
/** Controller Attributes */
uint32_t ctratt; /* bitfield really */
/** Read Recovery Levels Supported */
uint16_t rrls;
uint8_t reserved1[9];
/** Controller Type */
uint8_t cntrltype;
/** FRU Globally Unique Identifier */
uint8_t fguid[16];
/** Command Retry Delay Time 1 */
uint16_t crdt1;
/** Command Retry Delay Time 2 */
uint16_t crdt2;
/** Command Retry Delay Time 3 */
uint16_t crdt3;
uint8_t reserved2[122];
/* bytes 256-511: admin command set attributes */
/** optional admin command support */
uint16_t oacs;
/** abort command limit */
uint8_t acl;
/** asynchronous event request limit */
uint8_t aerl;
/** firmware updates */
uint8_t frmw;
/** log page attributes */
uint8_t lpa;
/** error log page entries */
uint8_t elpe;
/** number of power states supported */
uint8_t npss;
/** admin vendor specific command configuration */
uint8_t avscc;
/** Autonomous Power State Transition Attributes */
uint8_t apsta;
/** Warning Composite Temperature Threshold */
uint16_t wctemp;
/** Critical Composite Temperature Threshold */
uint16_t cctemp;
/** Maximum Time for Firmware Activation */
uint16_t mtfa;
/** Host Memory Buffer Preferred Size */
uint32_t hmpre;
/** Host Memory Buffer Minimum Size */
uint32_t hmmin;
/** Namespace capabilities */
struct {
/* if nsmgmt, report tnvmcap and unvmcap */
uint8_t tnvmcap[16];
uint8_t unvmcap[16];
} __packed untncap;
/** Replay Protected Memory Block Support */
uint32_t rpmbs; /* Really a bitfield */
/** Extended Device Self-test Time */
uint16_t edstt;
/** Device Self-test Options */
uint8_t dsto; /* Really a bitfield */
/** Firmware Update Granularity */
uint8_t fwug;
/** Keep Alive Support */
uint16_t kas;
/** Host Controlled Thermal Management Attributes */
uint16_t hctma; /* Really a bitfield */
/** Minimum Thermal Management Temperature */
uint16_t mntmt;
/** Maximum Thermal Management Temperature */
uint16_t mxtmt;
/** Sanitize Capabilities */
uint32_t sanicap; /* Really a bitfield */
/** Host Memory Buffer Minimum Descriptor Entry Size */
uint32_t hmminds;
/** Host Memory Maximum Descriptors Entries */
uint16_t hmmaxd;
/** NVM Set Identifier Maximum */
uint16_t nsetidmax;
/** Endurance Group Identifier Maximum */
uint16_t endgidmax;
/** ANA Transition Time */
uint8_t anatt;
/** Asymmetric Namespace Access Capabilities */
uint8_t anacap;
/** ANA Group Identifier Maximum */
uint32_t anagrpmax;
/** Number of ANA Group Identifiers */
uint32_t nanagrpid;
/** Persistent Event Log Size */
uint32_t pels;
uint8_t reserved3[156];
/* bytes 512-703: nvm command set attributes */
/** submission queue entry size */
uint8_t sqes;
/** completion queue entry size */
uint8_t cqes;
/** Maximum Outstanding Commands */
uint16_t maxcmd;
/** number of namespaces */
uint32_t nn;
/** optional nvm command support */
uint16_t oncs;
/** fused operation support */
uint16_t fuses;
/** format nvm attributes */
uint8_t fna;
/** volatile write cache */
uint8_t vwc;
/** Atomic Write Unit Normal */
uint16_t awun;
/** Atomic Write Unit Power Fail */
uint16_t awupf;
/** NVM Vendor Specific Command Configuration */
uint8_t nvscc;
/** Namespace Write Protection Capabilities */
uint8_t nwpc;
/** Atomic Compare & Write Unit */
uint16_t acwu;
uint16_t reserved6;
/** SGL Support */
uint32_t sgls;
/** Maximum Number of Allowed Namespaces */
uint32_t mnan;
/* bytes 544-767: Reserved */
uint8_t reserved7[224];
/** NVM Subsystem NVMe Qualified Name */
uint8_t subnqn[256];
/* bytes 1024-1791: Reserved */
uint8_t reserved8[768];
/* bytes 1792-2047: NVMe over Fabrics specification */
uint8_t reserved9[256];
/* bytes 2048-3071: power state descriptors */
struct nvme_power_state power_state[32];
/* bytes 3072-4095: vendor specific */
uint8_t vs[1024];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
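/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * MDTS expresses the maximum data transfer size as a power-of-two multiple of
 * the controller's minimum page size (CAP.MPSMIN), with 0 meaning "no limit".
 * Assumes the structure was byte-swapped via nvme_controller_data_swapbytes()
 * and that min_page_size was derived from CAP (e.g. 1 << (12 + MPSMIN)).
 */
static inline uint64_t
nvme_ctrlr_data_max_xfer_bytes(const struct nvme_controller_data *cd,
    uint32_t min_page_size)
{
	if (cd->mdts == 0)
		return (0);	/* 0 == no transfer size limit reported */
	return ((uint64_t)min_page_size << cd->mdts);
}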
struct nvme_namespace_data {
/** namespace size */
uint64_t nsze;
/** namespace capacity */
uint64_t ncap;
/** namespace utilization */
uint64_t nuse;
/** namespace features */
uint8_t nsfeat;
/** number of lba formats */
uint8_t nlbaf;
/** formatted lba size */
uint8_t flbas;
/** metadata capabilities */
uint8_t mc;
/** end-to-end data protection capabilities */
uint8_t dpc;
/** end-to-end data protection type settings */
uint8_t dps;
/** Namespace Multi-path I/O and Namespace Sharing Capabilities */
uint8_t nmic;
/** Reservation Capabilities */
uint8_t rescap;
/** Format Progress Indicator */
uint8_t fpi;
/** Deallocate Logical Block Features */
uint8_t dlfeat;
/** Namespace Atomic Write Unit Normal */
uint16_t nawun;
/** Namespace Atomic Write Unit Power Fail */
uint16_t nawupf;
/** Namespace Atomic Compare & Write Unit */
uint16_t nacwu;
/** Namespace Atomic Boundary Size Normal */
uint16_t nabsn;
/** Namespace Atomic Boundary Offset */
uint16_t nabo;
/** Namespace Atomic Boundary Size Power Fail */
uint16_t nabspf;
/** Namespace Optimal IO Boundary */
uint16_t noiob;
/** NVM Capacity */
uint8_t nvmcap[16];
/** Namespace Preferred Write Granularity */
uint16_t npwg;
/** Namespace Preferred Write Alignment */
uint16_t npwa;
/** Namespace Preferred Deallocate Granularity */
uint16_t npdg;
/** Namespace Preferred Deallocate Alignment */
uint16_t npda;
/** Namespace Optimal Write Size */
uint16_t nows;
/* bytes 74-91: Reserved */
uint8_t reserved5[18];
/** ANA Group Identifier */
uint32_t anagrpid;
/* bytes 96-98: Reserved */
uint8_t reserved6[3];
/** Namespace Attributes */
uint8_t nsattr;
/** NVM Set Identifier */
uint16_t nvmsetid;
/** Endurance Group Identifier */
uint16_t endgid;
/** Namespace Globally Unique Identifier */
uint8_t nguid[16];
/** IEEE Extended Unique Identifier */
uint8_t eui64[8];
/** lba format support */
uint32_t lbaf[16];
uint8_t reserved7[192];
uint8_t vendor_specific[3712];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
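/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * deriving the logical block size of the namespace's currently formatted LBA
 * format from FLBAS and the LBAF array, using the shift/mask macros defined
 * above.  Assumes the structure was byte-swapped via
 * nvme_namespace_data_swapbytes().
 */
static inline uint32_t
nvme_ns_data_lba_size(const struct nvme_namespace_data *nsdata)
{
	uint8_t fmt;
	uint32_t lbads;

	fmt = (nsdata->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
	    NVME_NS_DATA_FLBAS_FORMAT_MASK;
	lbads = (nsdata->lbaf[fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
	    NVME_NS_DATA_LBAF_LBADS_MASK;
	return (1u << lbads);	/* LBADS is a power-of-two exponent */
}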
enum nvme_log_page {
/* 0x00 - reserved */
NVME_LOG_ERROR = 0x01,
NVME_LOG_HEALTH_INFORMATION = 0x02,
NVME_LOG_FIRMWARE_SLOT = 0x03,
NVME_LOG_CHANGED_NAMESPACE = 0x04,
NVME_LOG_COMMAND_EFFECT = 0x05,
NVME_LOG_DEVICE_SELF_TEST = 0x06,
NVME_LOG_TELEMETRY_HOST_INITIATED = 0x07,
NVME_LOG_TELEMETRY_CONTROLLER_INITIATED = 0x08,
NVME_LOG_ENDURANCE_GROUP_INFORMATION = 0x09,
NVME_LOG_PREDICTABLE_LATENCY_PER_NVM_SET = 0x0a,
NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE = 0x0b,
NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS = 0x0c,
NVME_LOG_PERSISTENT_EVENT_LOG = 0x0d,
NVME_LOG_LBA_STATUS_INFORMATION = 0x0e,
NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE = 0x0f,
/* 0x10-0x7F - reserved */
/* 0x80-0xBF - I/O command set specific */
NVME_LOG_RES_NOTIFICATION = 0x80,
NVME_LOG_SANITIZE_STATUS = 0x81,
/* 0x82-0xBF - reserved */
/* 0xC0-0xFF - vendor specific */
/*
* The following are Intel-specific log pages, but they seem
* to be widely implemented.
*/
INTEL_LOG_READ_LAT_LOG = 0xc1,
INTEL_LOG_WRITE_LAT_LOG = 0xc2,
INTEL_LOG_TEMP_STATS = 0xc5,
INTEL_LOG_ADD_SMART = 0xca,
INTEL_LOG_DRIVE_MKT_NAME = 0xdd,
/*
* HGST log page, with lots of sub-pages.
*/
HGST_INFO_LOG = 0xc1,
};
struct nvme_error_information_entry {
uint64_t error_count;
uint16_t sqid;
uint16_t cid;
uint16_t status;
uint16_t error_location;
uint64_t lba;
uint32_t nsid;
uint8_t vendor_specific;
uint8_t trtype;
uint16_t reserved30;
uint64_t csi;
uint16_t ttsi;
uint8_t reserved[22];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
struct nvme_health_information_page {
uint8_t critical_warning;
uint16_t temperature;
uint8_t available_spare;
uint8_t available_spare_threshold;
uint8_t percentage_used;
uint8_t reserved[26];
/*
* Note that the following are 128-bit values, but are
* defined as an array of 2 64-bit values.
*/
/* Data Units Read is always in 512-byte units. */
uint64_t data_units_read[2];
/* Data Units Written is always in 512-byte units. */
uint64_t data_units_written[2];
/* For NVM command set, this includes Compare commands. */
uint64_t host_read_commands[2];
uint64_t host_write_commands[2];
/* Controller Busy Time is reported in minutes. */
uint64_t controller_busy_time[2];
uint64_t power_cycles[2];
uint64_t power_on_hours[2];
uint64_t unsafe_shutdowns[2];
uint64_t media_errors[2];
uint64_t num_error_info_log_entries[2];
uint32_t warning_temp_time;
uint32_t error_temp_time;
uint16_t temp_sensor[8];
/* Thermal Management Temperature 1 Transition Count */
uint32_t tmt1tc;
/* Thermal Management Temperature 2 Transition Count */
uint32_t tmt2tc;
/* Total Time For Thermal Management Temperature 1 */
uint32_t ttftmt1;
/* Total Time For Thermal Management Temperature 2 */
uint32_t ttftmt2;
uint8_t reserved2[280];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
struct nvme_firmware_page {
uint8_t afi;
uint8_t reserved[7];
uint64_t revision[7]; /* revisions for 7 slots */
uint8_t reserved2[448];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
struct nvme_ns_list {
uint32_t ns[1024];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
struct nvme_command_effects_page {
uint32_t acs[256];
uint32_t iocs[256];
uint8_t reserved[2048];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_command_effects_page) == 4096,
"bad size for nvme_command_effects_page");
struct nvme_res_notification_page {
uint64_t log_page_count;
uint8_t log_page_type;
uint8_t available_log_pages;
uint8_t reserved2;
uint32_t nsid;
uint8_t reserved[48];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_res_notification_page) == 64,
"bad size for nvme_res_notification_page");
struct nvme_sanitize_status_page {
uint16_t sprog;
uint16_t sstat;
uint32_t scdw10;
uint32_t etfo;
uint32_t etfbe;
uint32_t etfce;
uint32_t etfownd;
uint32_t etfbewnd;
uint32_t etfcewnd;
uint8_t reserved[480];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_sanitize_status_page) == 512,
"bad size for nvme_sanitize_status_page");
struct intel_log_temp_stats
{
uint64_t current;
uint64_t overtemp_flag_last;
uint64_t overtemp_flag_life;
uint64_t max_temp;
uint64_t min_temp;
uint64_t _rsvd[5];
uint64_t max_oper_temp;
uint64_t min_oper_temp;
uint64_t est_offset;
} __packed __aligned(4);
_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
struct nvme_resv_reg_ctrlr
{
uint16_t ctrlr_id; /* Controller ID */
uint8_t rcsts; /* Reservation Status */
uint8_t reserved3[5];
uint64_t hostid; /* Host Identifier */
uint64_t rkey; /* Reservation Key */
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_resv_reg_ctrlr) == 24, "bad size for nvme_resv_reg_ctrlr");
struct nvme_resv_reg_ctrlr_ext
{
uint16_t ctrlr_id; /* Controller ID */
uint8_t rcsts; /* Reservation Status */
uint8_t reserved3[5];
uint64_t rkey; /* Reservation Key */
uint64_t hostid[2]; /* Host Identifier */
uint8_t reserved32[32];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_resv_reg_ctrlr_ext) == 64, "bad size for nvme_resv_reg_ctrlr_ext");
struct nvme_resv_status
{
uint32_t gen; /* Generation */
uint8_t rtype; /* Reservation Type */
uint8_t regctl[2]; /* Number of Registered Controllers */
uint8_t reserved7[2];
uint8_t ptpls; /* Persist Through Power Loss State */
uint8_t reserved10[14];
struct nvme_resv_reg_ctrlr ctrlr[0];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_resv_status) == 24, "bad size for nvme_resv_status");
struct nvme_resv_status_ext
{
uint32_t gen; /* Generation */
uint8_t rtype; /* Reservation Type */
uint8_t regctl[2]; /* Number of Registered Controllers */
uint8_t reserved7[2];
uint8_t ptpls; /* Persist Through Power Loss State */
uint8_t reserved10[14];
uint8_t reserved24[40];
struct nvme_resv_reg_ctrlr_ext ctrlr[0];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_resv_status_ext) == 64, "bad size for nvme_resv_status_ext");
#define NVME_TEST_MAX_THREADS 128
struct nvme_io_test {
enum nvme_nvm_opcode opc;
uint32_t size;
uint32_t time; /* in seconds */
uint32_t num_threads;
uint32_t flags;
uint64_t io_completed[NVME_TEST_MAX_THREADS];
};
enum nvme_io_test_flags {
/*
* Specifies whether dev_refthread/dev_relthread should be
* called during NVME_BIO_TEST. Ignored for other test
* types.
*/
NVME_TEST_FLAG_REFTHREAD = 0x1,
};
struct nvme_pt_command {
/*
* cmd is used to specify a passthrough command to a controller or
* namespace.
*
* The following fields from cmd may be specified by the caller:
* * opc (opcode)
* * nsid (namespace id) - for admin commands only
* * cdw10-cdw15
*
* Remaining fields must be set to 0 by the caller.
*/
struct nvme_command cmd;
/*
* cpl returns completion status for the passthrough command
* specified by cmd.
*
* The following fields will be filled out by the driver, for
* consumption by the caller:
* * cdw0
* * status (except for phase)
*
* Remaining fields will be set to 0 by the driver.
*/
struct nvme_completion cpl;
/* buf is the data buffer associated with this passthrough command. */
void * buf;
/*
* len is the length of the data buffer associated with this
* passthrough command.
*/
uint32_t len;
/*
* is_read = 1 if the passthrough command will read data into the
* supplied buffer from the controller.
*
* is_read = 0 if the passthrough command will write data from the
* supplied buffer to the controller.
*/
uint32_t is_read;
/*
* driver_lock is used by the driver only. It must be set to 0
* by the caller.
*/
struct mtx * driver_lock;
};
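/*
 * Illustrative sketch (hypothetical helper, not part of the driver API):
 * filling out an Identify Controller admin passthrough request per the field
 * contract documented above.  Assumes *pt was zeroed by the caller; the
 * request would then be submitted via the passthrough ioctl (userland) or
 * nvme_ctrlr_passthrough_cmd() (kernel).
 */
static inline void
nvme_pt_identify_ctrlr_init(struct nvme_pt_command *pt,
    struct nvme_controller_data *payload)
{
	pt->cmd.opc = NVME_OPC_IDENTIFY;
	pt->cmd.cdw10 = htole32(1);	/* CNS = 1: identify controller */
	pt->buf = payload;
	pt->len = sizeof(*payload);
	pt->is_read = 1;		/* data flows controller -> host */
}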
struct nvme_get_nsid {
char cdev[SPECNAMELEN + 1];
uint32_t nsid;
};
struct nvme_hmb_desc {
uint64_t addr;
uint32_t size;
uint32_t reserved;
};
#define nvme_completion_is_error(cpl) \
(NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0)
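/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * comparing a completion's status word against a specific (SCT, SC) pair
 * using the NVME_STATUS_GET_* macros.  Assumes cpl->status is already in
 * host byte order (e.g. after nvme_completion_swapbytes()).
 */
static inline int
nvme_completion_status_is(const struct nvme_completion *cpl, uint16_t sct,
    uint16_t sc)
{
	return (NVME_STATUS_GET_SCT(cpl->status) == sct &&
	    NVME_STATUS_GET_SC(cpl->status) == sc);
}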
void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
#ifdef _KERNEL
struct bio;
struct thread;
struct nvme_namespace;
struct nvme_controller;
struct nvme_consumer;
typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
uint32_t, void *, uint32_t);
typedef void (*nvme_cons_fail_fn_t)(void *);
enum nvme_namespace_flags {
NVME_NS_DEALLOCATE_SUPPORTED = 0x1,
NVME_NS_FLUSH_SUPPORTED = 0x2,
};
int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
struct nvme_pt_command *pt,
uint32_t nsid, int is_user_buffer,
int is_admin_cmd);
/* Admin functions */
void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
uint8_t feature, uint32_t cdw11,
uint32_t cdw12, uint32_t cdw13,
uint32_t cdw14, uint32_t cdw15,
void *payload, uint32_t payload_size,
nvme_cb_fn_t cb_fn, void *cb_arg);
void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
uint8_t feature, uint32_t cdw11,
void *payload, uint32_t payload_size,
nvme_cb_fn_t cb_fn, void *cb_arg);
void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr,
uint8_t log_page, uint32_t nsid,
void *payload, uint32_t payload_size,
nvme_cb_fn_t cb_fn, void *cb_arg);
/* NVM I/O functions */
int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload,
uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
void *cb_arg);
int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
nvme_cb_fn_t cb_fn, void *cb_arg);
int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload,
uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
void *cb_arg);
int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
nvme_cb_fn_t cb_fn, void *cb_arg);
int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
uint8_t num_ranges, nvme_cb_fn_t cb_fn,
void *cb_arg);
int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
void *cb_arg);
int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset,
size_t len);
/* Registration functions */
struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
nvme_cons_ctrlr_fn_t ctrlr_fn,
nvme_cons_async_fn_t async_fn,
nvme_cons_fail_fn_t fail_fn);
void nvme_unregister_consumer(struct nvme_consumer *consumer);
/* Controller helper functions */
device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
static inline bool
nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd)
{
/* Assumes cd was byte swapped by nvme_controller_data_swapbytes() */
return ((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) &
NVME_CTRLR_DATA_ONCS_DSM_MASK);
}
/* Namespace helper functions */
uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
const struct nvme_namespace_data *
nvme_ns_get_data(struct nvme_namespace *ns);
uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
nvme_cb_fn_t cb_fn);
int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd,
caddr_t arg, int flag, struct thread *td);
/*
* Command building helper functions -- shared with CAM.
* These functions assume the allocator zeros out the cmd structure;
* CAM's xpt_get_ccb and the request allocator for nvme both do
* zeroed allocations.
*/
static inline
void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
{
cmd->opc = NVME_OPC_FLUSH;
cmd->nsid = htole32(nsid);
}
static inline
void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
uint64_t lba, uint32_t count)
{
cmd->opc = rwcmd;
cmd->nsid = htole32(nsid);
cmd->cdw10 = htole32(lba & 0xffffffffu);
cmd->cdw11 = htole32(lba >> 32);
cmd->cdw12 = htole32(count-1);
}
static inline
void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid,
uint64_t lba, uint32_t count)
{
nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
}
static inline
void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid,
uint64_t lba, uint32_t count)
{
nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
}
static inline
void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid,
uint32_t num_ranges)
{
cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
cmd->nsid = htole32(nsid);
cmd->cdw10 = htole32(num_ranges - 1);
cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
}
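/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * filling one entry of the struct nvme_dsm_range array that accompanies
 * nvme_ns_trim_cmd() above.  The range fields are little-endian on the wire;
 * the context attributes field is left at zero (no hints).
 */
static inline
void nvme_dsm_range_fill(struct nvme_dsm_range *r, uint64_t lba, uint32_t nlb)
{
	r->attributes = 0;		/* no context attribute hints */
	r->length = htole32(nlb);	/* number of logical blocks */
	r->starting_lba = htole64(lba);	/* first LBA to deallocate */
}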
extern int nvme_use_nvd;
#endif /* _KERNEL */
/* Endianess conversion functions for NVMe structs */
static inline
void nvme_completion_swapbytes(struct nvme_completion *s)
{
s->cdw0 = le32toh(s->cdw0);
/* omit rsvd1 */
s->sqhd = le16toh(s->sqhd);
s->sqid = le16toh(s->sqid);
/* omit cid */
s->status = le16toh(s->status);
}
static inline
void nvme_power_state_swapbytes(struct nvme_power_state *s)
{
s->mp = le16toh(s->mp);
s->enlat = le32toh(s->enlat);
s->exlat = le32toh(s->exlat);
s->idlp = le16toh(s->idlp);
s->actp = le16toh(s->actp);
}
static inline
void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
{
int i;
s->vid = le16toh(s->vid);
s->ssvid = le16toh(s->ssvid);
s->ctrlr_id = le16toh(s->ctrlr_id);
s->ver = le32toh(s->ver);
s->rtd3r = le32toh(s->rtd3r);
s->rtd3e = le32toh(s->rtd3e);
s->oaes = le32toh(s->oaes);
s->ctratt = le32toh(s->ctratt);
s->rrls = le16toh(s->rrls);
s->crdt1 = le16toh(s->crdt1);
s->crdt2 = le16toh(s->crdt2);
s->crdt3 = le16toh(s->crdt3);
s->oacs = le16toh(s->oacs);
s->wctemp = le16toh(s->wctemp);
s->cctemp = le16toh(s->cctemp);
s->mtfa = le16toh(s->mtfa);
s->hmpre = le32toh(s->hmpre);
s->hmmin = le32toh(s->hmmin);
s->rpmbs = le32toh(s->rpmbs);
s->edstt = le16toh(s->edstt);
s->kas = le16toh(s->kas);
s->hctma = le16toh(s->hctma);
s->mntmt = le16toh(s->mntmt);
s->mxtmt = le16toh(s->mxtmt);
s->sanicap = le32toh(s->sanicap);
s->hmminds = le32toh(s->hmminds);
s->hmmaxd = le16toh(s->hmmaxd);
s->nsetidmax = le16toh(s->nsetidmax);
s->endgidmax = le16toh(s->endgidmax);
s->anagrpmax = le32toh(s->anagrpmax);
s->nanagrpid = le32toh(s->nanagrpid);
s->pels = le32toh(s->pels);
s->maxcmd = le16toh(s->maxcmd);
s->nn = le32toh(s->nn);
s->oncs = le16toh(s->oncs);
s->fuses = le16toh(s->fuses);
s->awun = le16toh(s->awun);
s->awupf = le16toh(s->awupf);
s->acwu = le16toh(s->acwu);
s->sgls = le32toh(s->sgls);
s->mnan = le32toh(s->mnan);
for (i = 0; i < 32; i++)
nvme_power_state_swapbytes(&s->power_state[i]);
}
static inline
void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s)
{
int i;
s->nsze = le64toh(s->nsze);
s->ncap = le64toh(s->ncap);
s->nuse = le64toh(s->nuse);
s->nawun = le16toh(s->nawun);
s->nawupf = le16toh(s->nawupf);
s->nacwu = le16toh(s->nacwu);
s->nabsn = le16toh(s->nabsn);
s->nabo = le16toh(s->nabo);
s->nabspf = le16toh(s->nabspf);
s->noiob = le16toh(s->noiob);
s->npwg = le16toh(s->npwg);
s->npwa = le16toh(s->npwa);
s->npdg = le16toh(s->npdg);
s->npda = le16toh(s->npda);
s->nows = le16toh(s->nows);
s->anagrpid = le32toh(s->anagrpid);
s->nvmsetid = le16toh(s->nvmsetid);
s->endgid = le16toh(s->endgid);
for (i = 0; i < 16; i++)
s->lbaf[i] = le32toh(s->lbaf[i]);
}
static inline
void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s)
{
s->error_count = le64toh(s->error_count);
s->sqid = le16toh(s->sqid);
s->cid = le16toh(s->cid);
s->status = le16toh(s->status);
s->error_location = le16toh(s->error_location);
s->lba = le64toh(s->lba);
s->nsid = le32toh(s->nsid);
s->csi = le64toh(s->csi);
s->ttsi = le16toh(s->ttsi);
}
static inline
void nvme_le128toh(void *p)
{
#if _BYTE_ORDER != _LITTLE_ENDIAN
/* Swap 16 bytes in place */
char *tmp = (char*)p;
char b;
int i;
for (i = 0; i < 8; i++) {
b = tmp[i];
tmp[i] = tmp[15-i];
tmp[15-i] = b;
}
#else
(void)p;
#endif
}
static inline
void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s)
{
int i;
s->temperature = le16toh(s->temperature);
nvme_le128toh((void *)s->data_units_read);
nvme_le128toh((void *)s->data_units_written);
nvme_le128toh((void *)s->host_read_commands);
nvme_le128toh((void *)s->host_write_commands);
nvme_le128toh((void *)s->controller_busy_time);
nvme_le128toh((void *)s->power_cycles);
nvme_le128toh((void *)s->power_on_hours);
nvme_le128toh((void *)s->unsafe_shutdowns);
nvme_le128toh((void *)s->media_errors);
nvme_le128toh((void *)s->num_error_info_log_entries);
s->warning_temp_time = le32toh(s->warning_temp_time);
s->error_temp_time = le32toh(s->error_temp_time);
for (i = 0; i < 8; i++)
s->temp_sensor[i] = le16toh(s->temp_sensor[i]);
s->tmt1tc = le32toh(s->tmt1tc);
s->tmt2tc = le32toh(s->tmt2tc);
s->ttftmt1 = le32toh(s->ttftmt1);
s->ttftmt2 = le32toh(s->ttftmt2);
}
static inline
void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s)
{
int i;
for (i = 0; i < 7; i++)
s->revision[i] = le64toh(s->revision[i]);
}
static inline
void nvme_ns_list_swapbytes(struct nvme_ns_list *s)
{
int i;
for (i = 0; i < 1024; i++)
s->ns[i] = le32toh(s->ns[i]);
}
static inline
void nvme_command_effects_page_swapbytes(struct nvme_command_effects_page *s)
{
int i;
for (i = 0; i < 256; i++)
s->acs[i] = le32toh(s->acs[i]);
for (i = 0; i < 256; i++)
s->iocs[i] = le32toh(s->iocs[i]);
}
static inline
void nvme_res_notification_page_swapbytes(struct nvme_res_notification_page *s)
{
s->log_page_count = le64toh(s->log_page_count);
s->nsid = le32toh(s->nsid);
}
static inline
void nvme_sanitize_status_page_swapbytes(struct nvme_sanitize_status_page *s)
{
s->sprog = le16toh(s->sprog);
s->sstat = le16toh(s->sstat);
s->scdw10 = le32toh(s->scdw10);
s->etfo = le32toh(s->etfo);
s->etfbe = le32toh(s->etfbe);
s->etfce = le32toh(s->etfce);
s->etfownd = le32toh(s->etfownd);
s->etfbewnd = le32toh(s->etfbewnd);
s->etfcewnd = le32toh(s->etfcewnd);
}
static inline
void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s)
{
s->current = le64toh(s->current);
s->overtemp_flag_last = le64toh(s->overtemp_flag_last);
s->overtemp_flag_life = le64toh(s->overtemp_flag_life);
s->max_temp = le64toh(s->max_temp);
s->min_temp = le64toh(s->min_temp);
/* omit _rsvd[] */
s->max_oper_temp = le64toh(s->max_oper_temp);
s->min_oper_temp = le64toh(s->min_oper_temp);
s->est_offset = le64toh(s->est_offset);
}
static inline
void nvme_resv_status_swapbytes(struct nvme_resv_status *s, size_t size)
{
u_int i, n;
s->gen = le32toh(s->gen);
n = (s->regctl[1] << 8) | s->regctl[0];
n = MIN(n, (size - sizeof(*s)) / sizeof(s->ctrlr[0]));
for (i = 0; i < n; i++) {
s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id);
s->ctrlr[i].hostid = le64toh(s->ctrlr[i].hostid);
s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey);
}
}
static inline
void nvme_resv_status_ext_swapbytes(struct nvme_resv_status_ext *s, size_t size)
{
u_int i, n;
s->gen = le32toh(s->gen);
n = (s->regctl[1] << 8) | s->regctl[0];
n = MIN(n, (size - sizeof(*s)) / sizeof(s->ctrlr[0]));
for (i = 0; i < n; i++) {
s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id);
s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey);
nvme_le128toh((void *)s->ctrlr[i].hostid);
}
}
#endif /* __NVME_H__ */
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 5dc434acb11a..32756afa0a1e 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -1,1693 +1,1693 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (C) 2012-2016 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_cam.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/uio.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
#include <machine/stdarg.h>
#include <vm/vm.h>
#include "nvme_private.h"
#define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
struct nvme_async_event_request *aer);
static void
nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
{
struct sbuf sb;
va_list ap;
int error;
if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
return;
sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
va_start(ap, msg);
sbuf_vprintf(&sb, msg, ap);
va_end(ap);
error = sbuf_finish(&sb);
if (error == 0)
printf("%s\n", sbuf_data(&sb));
sbuf_clear(&sb);
sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev));
va_start(ap, msg);
sbuf_vprintf(&sb, msg, ap);
va_end(ap);
sbuf_printf(&sb, "\"");
error = sbuf_finish(&sb);
if (error == 0)
devctl_notify("nvme", "controller", type, sbuf_data(&sb));
sbuf_delete(&sb);
}
static int
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
struct nvme_qpair *qpair;
uint32_t num_entries;
int error;
qpair = &ctrlr->adminq;
qpair->id = 0;
qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
qpair->domain = ctrlr->domain;
num_entries = NVME_ADMIN_ENTRIES;
TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
/*
* If admin_entries was overridden to an invalid value, revert it to
* our default value.
*/
if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
num_entries > NVME_MAX_ADMIN_ENTRIES) {
nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
"specified\n", num_entries);
num_entries = NVME_ADMIN_ENTRIES;
}
/*
* The admin queue's max xfer size is treated differently from the
* max I/O xfer size.  16KB is sufficient here - maybe even less?
*/
error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
ctrlr);
return (error);
}
#define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus)
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
struct nvme_qpair *qpair;
uint32_t cap_lo;
uint16_t mqes;
int c, error, i, n;
int num_entries, num_trackers, max_entries;
/*
* NVMe spec sets a hard limit of 64K max entries, but devices may
* specify a smaller limit, so we need to check the MQES field in the
* capabilities register.  We also have to cap the number of entries to
* what the current doorbell stride allows for in BAR 0/1; otherwise the
* remaining entries are inaccessible.  MQES should reflect this, and this
* is just a fail-safe.
*/
max_entries =
(rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
(1 << (ctrlr->dstrd + 1));
num_entries = NVME_IO_ENTRIES;
TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
mqes = NVME_CAP_LO_MQES(cap_lo);
num_entries = min(num_entries, mqes + 1);
num_entries = min(num_entries, max_entries);
num_trackers = NVME_IO_TRACKERS;
TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
/*
* No need to have more trackers than entries in the submit queue. Note
* also that for a queue size of N, we can only have (N-1) commands
* outstanding, hence the "-1" here.
*/
num_trackers = min(num_trackers, (num_entries-1));
/*
* Our best estimate for the maximum number of I/Os that we should
* normally have in flight at one time.  This should be viewed as a hint,
* not a hard limit, and it will need to be revisited when the upper
* layers of the storage system grow multi-queue support.
*/
ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
M_NVME, M_ZERO | M_WAITOK);
for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
qpair = &ctrlr->ioq[i];
/*
* Admin queue has ID=0. IO queues start at ID=1 -
* hence the 'i+1' here.
*/
qpair->id = i + 1;
if (ctrlr->num_io_queues > 1) {
/* Find number of CPUs served by this queue. */
for (n = 1; QP(ctrlr, c + n) == i; n++)
;
/* Shuffle multiple NVMe devices between CPUs. */
qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
} else {
qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
qpair->domain = ctrlr->domain;
}
/*
* For I/O queues, use the controller-wide max_xfer_size
* calculated in nvme_attach().
*/
error = nvme_qpair_construct(qpair, num_entries, num_trackers,
ctrlr);
if (error)
return (error);
/*
* Do not bother binding interrupts if we only have one I/O
* interrupt thread for this controller.
*/
if (ctrlr->num_io_queues > 1)
bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
}
return (0);
}
static void
nvme_ctrlr_fail(struct nvme_controller *ctrlr)
{
int i;
ctrlr->is_failed = true;
nvme_admin_qpair_disable(&ctrlr->adminq);
nvme_qpair_fail(&ctrlr->adminq);
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++) {
nvme_io_qpair_disable(&ctrlr->ioq[i]);
nvme_qpair_fail(&ctrlr->ioq[i]);
}
}
nvme_notify_fail_consumers(ctrlr);
}
void
nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
struct nvme_request *req)
{
mtx_lock(&ctrlr->lock);
STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
mtx_unlock(&ctrlr->lock);
taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
}
static void
nvme_ctrlr_fail_req_task(void *arg, int pending)
{
struct nvme_controller *ctrlr = arg;
struct nvme_request *req;
mtx_lock(&ctrlr->lock);
while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
mtx_unlock(&ctrlr->lock);
nvme_qpair_manual_complete_request(req->qpair, req,
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
mtx_lock(&ctrlr->lock);
}
mtx_unlock(&ctrlr->lock);
}
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
{
int ms_waited;
uint32_t csts;
ms_waited = 0;
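/* Poll CSTS.RDY roughly once per millisecond until it matches desired_val. */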
while (1) {
csts = nvme_mmio_read_4(ctrlr, csts);
if (csts == 0xffffffff) /* Hot unplug. */
return (ENXIO);
if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)
== desired_val)
break;
if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
nvme_printf(ctrlr, "controller ready did not become %d "
"within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
return (ENXIO);
}
DELAY(1000);
}
return (0);
}
static int
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
uint32_t cc;
uint32_t csts;
uint8_t en, rdy;
int err;
cc = nvme_mmio_read_4(ctrlr, cc);
csts = nvme_mmio_read_4(ctrlr, csts);
en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
/*
* Per Section 3.1.5 of the NVMe 1.3 spec, transitioning CC.EN from 0 to 1
* when CSTS.RDY is 1, or transitioning CC.EN from 1 to 0 when
* CSTS.RDY is 0, "has undefined results". So make sure that CSTS.RDY
* isn't the desired value. Short circuit if we're already disabled.
*/
if (en == 1) {
if (rdy == 0) {
/* EN == 1, wait for RDY == 1 or fail */
err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
if (err != 0)
return (err);
}
} else {
/* EN == 0 already; wait for RDY == 0 */
if (rdy == 0)
return (0);
else
return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
}
cc &= ~NVME_CC_REG_EN_MASK;
nvme_mmio_write_4(ctrlr, cc, cc);
/*
* Some drives have issues with accessing the mmio after we
* disable, so delay for a bit after we write the bit to
* cope with these issues.
*/
if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000);
return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
}
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
uint32_t cc;
uint32_t csts;
uint32_t aqa;
uint32_t qsize;
uint8_t en, rdy;
int err;
cc = nvme_mmio_read_4(ctrlr, cc);
csts = nvme_mmio_read_4(ctrlr, csts);
en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
/*
* See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
*/
if (en == 1) {
if (rdy == 1)
return (0);
else
return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
} else {
/* EN == 0 already; wait for RDY == 0 or fail */
err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
if (err != 0)
return (err);
}
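/* Program the admin queue base addresses and sizes before enabling. */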
nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
DELAY(5000);
nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
DELAY(5000);
/* acqs and asqs are 0-based. */
qsize = ctrlr->adminq.num_entries - 1;
aqa = 0;
aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
nvme_mmio_write_4(ctrlr, aqa, aqa);
DELAY(5000);
/* Initialization values for CC */
cc = 0;
cc |= 1 << NVME_CC_REG_EN_SHIFT;
cc |= 0 << NVME_CC_REG_CSS_SHIFT;
cc |= 0 << NVME_CC_REG_AMS_SHIFT;
cc |= 0 << NVME_CC_REG_SHN_SHIFT;
cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
/* This evaluates to 0, as required by the spec. */
cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT;
nvme_mmio_write_4(ctrlr, cc, cc);
return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
}
static void
nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
{
int i;
nvme_admin_qpair_disable(&ctrlr->adminq);
/*
* I/O queues are not allocated before the initial HW
* reset, so do not try to disable them. Use is_initialized
* to determine if this is the initial HW reset.
*/
if (ctrlr->is_initialized) {
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_io_qpair_disable(&ctrlr->ioq[i]);
}
}
int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
int err;
nvme_ctrlr_disable_qpairs(ctrlr);
DELAY(100*1000);
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
return err;
return (nvme_ctrlr_enable(ctrlr));
}
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
int cmpset;
cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
if (cmpset == 0 || ctrlr->is_failed)
/*
* Controller is already resetting or has failed. Return
* immediately since there is no need to kick off another
* reset in these cases.
*/
return;
taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
struct nvme_completion_poll_status status;
status.done = 0;
nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
return (ENXIO);
}
/* Convert data to host endian */
nvme_controller_data_swapbytes(&ctrlr->cdata);
/*
* Use MDTS to ensure our default max_xfer_size doesn't exceed what the
* controller supports.
*/
if (ctrlr->cdata.mdts > 0)
ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
return (0);
}
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
struct nvme_completion_poll_status status;
int cq_allocated, sq_allocated;
status.done = 0;
nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
return (ENXIO);
}
/*
* Data in cdw0 is 0-based.
* Lower 16-bits indicate number of submission queues allocated.
* Upper 16-bits indicate number of completion queues allocated.
*/
sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
cq_allocated = (status.cpl.cdw0 >> 16) + 1;
/*
* Controller may allocate more queues than we requested,
* so use the minimum of the number requested and what was
* actually allocated.
*/
ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
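/*
* Keep the I/O queue count a multiple of the number of memory domains so
* that the queues can be spread evenly across domains.
*/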
if (ctrlr->num_io_queues > vm_ndomains)
ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
return (0);
}
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
struct nvme_completion_poll_status status;
struct nvme_qpair *qpair;
int i;
for (i = 0; i < ctrlr->num_io_queues; i++) {
qpair = &ctrlr->ioq[i];
status.done = 0;
nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
return (ENXIO);
}
status.done = 0;
nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
return (ENXIO);
}
}
return (0);
}
static int
nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
{
struct nvme_completion_poll_status status;
struct nvme_qpair *qpair;
for (int i = 0; i < ctrlr->num_io_queues; i++) {
qpair = &ctrlr->ioq[i];
status.done = 0;
nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
return (ENXIO);
}
status.done = 0;
nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
return (ENXIO);
}
}
return (0);
}
static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
struct nvme_namespace *ns;
uint32_t i;
for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
ns = &ctrlr->ns[i];
nvme_ns_construct(ns, i+1, ctrlr);
}
return (0);
}
static bool
is_log_page_id_valid(uint8_t page_id)
{
switch (page_id) {
case NVME_LOG_ERROR:
case NVME_LOG_HEALTH_INFORMATION:
case NVME_LOG_FIRMWARE_SLOT:
case NVME_LOG_CHANGED_NAMESPACE:
case NVME_LOG_COMMAND_EFFECT:
case NVME_LOG_RES_NOTIFICATION:
case NVME_LOG_SANITIZE_STATUS:
return (true);
}
return (false);
}
static uint32_t
nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
{
uint32_t log_page_size;
switch (page_id) {
case NVME_LOG_ERROR:
log_page_size = min(
sizeof(struct nvme_error_information_entry) *
(ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
break;
case NVME_LOG_HEALTH_INFORMATION:
log_page_size = sizeof(struct nvme_health_information_page);
break;
case NVME_LOG_FIRMWARE_SLOT:
log_page_size = sizeof(struct nvme_firmware_page);
break;
case NVME_LOG_CHANGED_NAMESPACE:
log_page_size = sizeof(struct nvme_ns_list);
break;
case NVME_LOG_COMMAND_EFFECT:
log_page_size = sizeof(struct nvme_command_effects_page);
break;
case NVME_LOG_RES_NOTIFICATION:
log_page_size = sizeof(struct nvme_res_notification_page);
break;
case NVME_LOG_SANITIZE_STATUS:
log_page_size = sizeof(struct nvme_sanitize_status_page);
break;
default:
log_page_size = 0;
break;
}
return (log_page_size);
}
static void
nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
uint8_t state)
{
if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"available spare space below threshold");
if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"temperature above threshold");
if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"device reliability degraded");
if (state & NVME_CRIT_WARN_ST_READ_ONLY)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"media placed in read only mode");
if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"volatile memory backup device failed");
if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
nvme_ctrlr_devctl_log(ctrlr, "critical",
"unknown critical warning(s): state = 0x%02x", state);
}
static void
nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
{
struct nvme_async_event_request *aer = arg;
struct nvme_health_information_page *health_info;
struct nvme_ns_list *nsl;
struct nvme_error_information_entry *err;
int i;
/*
* If the log page fetch for some reason completed with an error,
* don't pass log page data to the consumers. In practice, this case
* should never happen.
*/
if (nvme_completion_is_error(cpl))
nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
aer->log_page_id, NULL, 0);
else {
/* Convert data to host endian */
switch (aer->log_page_id) {
case NVME_LOG_ERROR:
err = (struct nvme_error_information_entry *)aer->log_page_buffer;
for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
nvme_error_information_entry_swapbytes(err++);
break;
case NVME_LOG_HEALTH_INFORMATION:
nvme_health_information_page_swapbytes(
(struct nvme_health_information_page *)aer->log_page_buffer);
break;
case NVME_LOG_FIRMWARE_SLOT:
nvme_firmware_page_swapbytes(
(struct nvme_firmware_page *)aer->log_page_buffer);
break;
case NVME_LOG_CHANGED_NAMESPACE:
nvme_ns_list_swapbytes(
(struct nvme_ns_list *)aer->log_page_buffer);
break;
case NVME_LOG_COMMAND_EFFECT:
nvme_command_effects_page_swapbytes(
(struct nvme_command_effects_page *)aer->log_page_buffer);
break;
case NVME_LOG_RES_NOTIFICATION:
nvme_res_notification_page_swapbytes(
(struct nvme_res_notification_page *)aer->log_page_buffer);
break;
case NVME_LOG_SANITIZE_STATUS:
nvme_sanitize_status_page_swapbytes(
(struct nvme_sanitize_status_page *)aer->log_page_buffer);
break;
case INTEL_LOG_TEMP_STATS:
intel_log_temp_stats_swapbytes(
(struct intel_log_temp_stats *)aer->log_page_buffer);
break;
default:
break;
}
if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
health_info = (struct nvme_health_information_page *)
aer->log_page_buffer;
nvme_ctrlr_log_critical_warnings(aer->ctrlr,
health_info->critical_warning);
/*
* Critical warnings reported through the
* SMART/health log page are persistent, so
* clear the associated bits in the async event
* config so that we do not receive repeated
* notifications for the same event.
*/
aer->ctrlr->async_event_config &=
~health_info->critical_warning;
nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
aer->ctrlr->async_event_config, NULL, NULL);
} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
!nvme_use_nvd) {
nsl = (struct nvme_ns_list *)aer->log_page_buffer;
for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
if (nsl->ns[i] > NVME_MAX_NAMESPACES)
break;
nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
}
}
/*
* Pass the cpl data from the original async event completion,
* not the log page fetch.
*/
nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
}
/*
* Repost another asynchronous event request to replace the one
* that just completed.
*/
nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}
static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
struct nvme_async_event_request *aer = arg;
if (nvme_completion_is_error(cpl)) {
/*
* Do not retry failed async event requests. This avoids
* infinite loops where a new async event request is submitted
* to replace the one just failed, only to fail again and
* perpetuate the loop.
*/
return;
}
/* Associated log page is in bits 23:16 of completion entry dw0. */
aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
" page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8,
aer->log_page_id);
if (is_log_page_id_valid(aer->log_page_id)) {
aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
aer->log_page_id);
memcpy(&aer->cpl, cpl, sizeof(*cpl));
nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
aer);
/* Wait to notify consumers until after log page is fetched. */
} else {
nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
NULL, 0);
/*
* Repost another asynchronous event request to replace the one
* that just completed.
*/
nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}
}
static void
nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
struct nvme_async_event_request *aer)
{
struct nvme_request *req;
aer->ctrlr = ctrlr;
req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
aer->req = req;
/*
* Disable timeout here, since asynchronous event requests should by
* nature never be timed out.
*/
req->timeout = false;
req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
nvme_ctrlr_submit_admin_request(ctrlr, req);
}
static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
struct nvme_completion_poll_status status;
struct nvme_async_event_request *aer;
uint32_t i;
ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
NVME_CRIT_WARN_ST_READ_ONLY |
NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
if (ctrlr->cdata.ver >= NVME_REV(1, 2))
ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE |
NVME_ASYNC_EVENT_FW_ACTIVATE;
status.done = 0;
nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
0, NULL, 0, nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl) ||
(status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
(status.cpl.cdw0 & 0xFFFF) == 0x0000) {
nvme_printf(ctrlr, "temperature threshold not supported\n");
} else
ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
nvme_ctrlr_cmd_set_async_event_config(ctrlr,
ctrlr->async_event_config, NULL, NULL);
/* aerl is a zero-based value, so we need to add 1 here. */
ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
for (i = 0; i < ctrlr->num_aers; i++) {
aer = &ctrlr->aer[i];
nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
}
}
static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{
ctrlr->int_coal_time = 0;
TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
&ctrlr->int_coal_time);
ctrlr->int_coal_threshold = 0;
TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
&ctrlr->int_coal_threshold);
nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
ctrlr->int_coal_threshold, NULL, NULL);
}
static void
nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
{
struct nvme_hmb_chunk *hmbc;
int i;
if (ctrlr->hmb_desc_paddr) {
bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
ctrlr->hmb_desc_map);
ctrlr->hmb_desc_paddr = 0;
}
if (ctrlr->hmb_desc_tag) {
bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
ctrlr->hmb_desc_tag = NULL;
}
for (i = 0; i < ctrlr->hmb_nchunks; i++) {
hmbc = &ctrlr->hmb_chunks[i];
bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
hmbc->hmbc_map);
}
ctrlr->hmb_nchunks = 0;
if (ctrlr->hmb_tag) {
bus_dma_tag_destroy(ctrlr->hmb_tag);
ctrlr->hmb_tag = NULL;
}
if (ctrlr->hmb_chunks) {
free(ctrlr->hmb_chunks, M_NVME);
ctrlr->hmb_chunks = NULL;
}
}
static void
nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
{
struct nvme_hmb_chunk *hmbc;
size_t pref, min, minc, size;
int err, i;
uint64_t max;
/* Limit HMB to 5% of RAM size per device by default. */
max = (uint64_t)physmem * PAGE_SIZE / 20;
TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
min = (long long unsigned)ctrlr->cdata.hmmin * 4096;
if (max == 0 || max < min)
return;
pref = MIN((long long unsigned)ctrlr->cdata.hmpre * 4096, max);
minc = MAX(ctrlr->cdata.hmminds * 4096, PAGE_SIZE);
if (min > 0 && ctrlr->cdata.hmmaxd > 0)
minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
ctrlr->hmb_chunk = pref;
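/*
* Retry loop: if the allocation falls short of the controller's minimum,
* halve the chunk size (down to the minimum chunk size) and try again.
*/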
again:
ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, PAGE_SIZE);
ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
if (err != 0) {
nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
nvme_ctrlr_hmb_free(ctrlr);
return;
}
for (i = 0; i < ctrlr->hmb_nchunks; i++) {
hmbc = &ctrlr->hmb_chunks[i];
if (bus_dmamem_alloc(ctrlr->hmb_tag,
(void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
&hmbc->hmbc_map)) {
nvme_printf(ctrlr, "failed to alloc HMB\n");
break;
}
if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
&hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
hmbc->hmbc_map);
nvme_printf(ctrlr, "failed to load HMB\n");
break;
}
bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
}
if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
ctrlr->hmb_chunk / 2 >= minc) {
ctrlr->hmb_nchunks = i;
nvme_ctrlr_hmb_free(ctrlr);
ctrlr->hmb_chunk /= 2;
goto again;
}
ctrlr->hmb_nchunks = i;
if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
nvme_ctrlr_hmb_free(ctrlr);
return;
}
size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
if (err != 0) {
nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
nvme_ctrlr_hmb_free(ctrlr);
return;
}
if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
(void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
&ctrlr->hmb_desc_map)) {
nvme_printf(ctrlr, "failed to alloc HMB desc\n");
nvme_ctrlr_hmb_free(ctrlr);
return;
}
if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
ctrlr->hmb_desc_vaddr, size, nvme_single_map,
&ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
ctrlr->hmb_desc_map);
nvme_printf(ctrlr, "failed to load HMB desc\n");
nvme_ctrlr_hmb_free(ctrlr);
return;
}
for (i = 0; i < ctrlr->hmb_nchunks; i++) {
ctrlr->hmb_desc_vaddr[i].addr =
htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / 4096);
}
bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
BUS_DMASYNC_PREWRITE);
nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
(long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
/ 1024 / 1024);
}
static void
nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
{
struct nvme_completion_poll_status status;
uint32_t cdw11;
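/*
* cdw11 bit 0 enables the host memory buffer; bit 1 tells the controller
* that the previously allocated memory is being returned unmodified.
*/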
cdw11 = 0;
if (enable)
cdw11 |= 1;
if (memret)
cdw11 |= 2;
status.done = 0;
nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 4096, ctrlr->hmb_desc_paddr,
ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl))
nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
}
static void
nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
{
struct nvme_controller *ctrlr = ctrlr_arg;
uint32_t old_num_io_queues;
int i;
/*
* Only reset adminq here when we are restarting the
* controller after a reset. During initialization,
* we have already submitted admin commands to get
* the number of I/O queues supported, so cannot reset
* the adminq again here.
*/
if (resetting)
nvme_qpair_reset(&ctrlr->adminq);
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_qpair_reset(&ctrlr->ioq[i]);
}
nvme_admin_qpair_enable(&ctrlr->adminq);
/*
* If this was a reset due to an initialization command timeout,
* just return here and let the initialization code fail gracefully.
*/
if (resetting && !ctrlr->is_initialized)
return;
if (nvme_ctrlr_identify(ctrlr) != 0) {
nvme_ctrlr_fail(ctrlr);
return;
}
/*
* The number of qpairs is determined during controller initialization,
* including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
* HW limit. We call SET_FEATURES again here so that it gets called
* after any reset for controllers that depend on the driver to
* explicitly specify how many queues it will use. This value should
* never change between resets, so panic if it somehow does.
*/
if (resetting) {
old_num_io_queues = ctrlr->num_io_queues;
if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
nvme_ctrlr_fail(ctrlr);
return;
}
if (old_num_io_queues != ctrlr->num_io_queues) {
panic("num_io_queues changed from %u to %u",
old_num_io_queues, ctrlr->num_io_queues);
}
}
if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
nvme_ctrlr_hmb_alloc(ctrlr);
if (ctrlr->hmb_nchunks > 0)
nvme_ctrlr_hmb_enable(ctrlr, true, false);
} else if (ctrlr->hmb_nchunks > 0)
nvme_ctrlr_hmb_enable(ctrlr, true, true);
if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
nvme_ctrlr_fail(ctrlr);
return;
}
if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
nvme_ctrlr_fail(ctrlr);
return;
}
nvme_ctrlr_configure_aer(ctrlr);
nvme_ctrlr_configure_int_coalescing(ctrlr);
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_io_qpair_enable(&ctrlr->ioq[i]);
}
void
nvme_ctrlr_start_config_hook(void *arg)
{
struct nvme_controller *ctrlr = arg;
/*
* Reset the controller twice to ensure we do a transition from cc.en==1 to
* cc.en==0. This is because we don't really know what state the
* controller was left in when the boot loader handed off to the OS.
* Linux doesn't do this, however. If we adopt that policy, see also
* nvme_ctrlr_resume().
*/
if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
fail:
nvme_ctrlr_fail(ctrlr);
config_intrhook_disestablish(&ctrlr->config_hook);
ctrlr->config_hook.ich_arg = NULL;
return;
}
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
nvme_qpair_reset(&ctrlr->adminq);
nvme_admin_qpair_enable(&ctrlr->adminq);
if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
nvme_ctrlr_start(ctrlr, false);
else
goto fail;
nvme_sysctl_initialize_ctrlr(ctrlr);
config_intrhook_disestablish(&ctrlr->config_hook);
ctrlr->config_hook.ich_arg = NULL;
ctrlr->is_initialized = 1;
nvme_notify_new_controller(ctrlr);
}
static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
struct nvme_controller *ctrlr = arg;
int status;
nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
status = nvme_ctrlr_hw_reset(ctrlr);
/*
* Use pause instead of DELAY, so that we yield to any nvme interrupt
* handlers on this CPU that were blocked on a qpair lock. We want
* all nvme interrupts completed before proceeding with restarting the
* controller.
*
* XXX - any way to guarantee the interrupt handlers have quiesced?
*/
pause("nvmereset", hz / 10);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
nvme_ctrlr_fail(ctrlr);
atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}
/*
* Poll all the queues enabled on the device for completion.
*/
void
nvme_ctrlr_poll(struct nvme_controller *ctrlr)
{
int i;
nvme_qpair_process_completions(&ctrlr->adminq);
for (i = 0; i < ctrlr->num_io_queues; i++)
if (ctrlr->ioq && ctrlr->ioq[i].cpl)
nvme_qpair_process_completions(&ctrlr->ioq[i]);
}
/*
* Poll the single-vector interrupt case: num_io_queues will be 1 and
* there's only a single vector. While we're polling, we mask further
* interrupts in the controller.
*/
void
nvme_ctrlr_intx_handler(void *arg)
{
struct nvme_controller *ctrlr = arg;
nvme_mmio_write_4(ctrlr, intms, 1);
nvme_ctrlr_poll(ctrlr);
nvme_mmio_write_4(ctrlr, intmc, 1);
}
static void
nvme_pt_done(void *arg, const struct nvme_completion *cpl)
{
struct nvme_pt_command *pt = arg;
struct mtx *mtx = pt->driver_lock;
uint16_t status;
bzero(&pt->cpl, sizeof(pt->cpl));
pt->cpl.cdw0 = cpl->cdw0;
status = cpl->status;
status &= ~NVME_STATUS_P_MASK;
pt->cpl.status = status;
mtx_lock(mtx);
pt->driver_lock = NULL;
wakeup(pt);
mtx_unlock(mtx);
}
int
nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
int is_admin_cmd)
{
struct nvme_request *req;
struct mtx *mtx;
struct buf *buf = NULL;
int ret = 0;
vm_offset_t addr, end;
if (pt->len > 0) {
/*
* vmapbuf calls vm_fault_quick_hold_pages which only maps full
- * pages. Ensure this request has fewer than MAXPHYS bytes when
+ * pages. Ensure this request has fewer than maxphys bytes when
* extended to full pages.
*/
addr = (vm_offset_t)pt->buf;
end = round_page(addr + pt->len);
addr = trunc_page(addr);
- if (end - addr > MAXPHYS)
+ if (end - addr > maxphys)
return EIO;
if (pt->len > ctrlr->max_xfer_size) {
nvme_printf(ctrlr, "pt->len (%d) "
"exceeds max_xfer_size (%d)\n", pt->len,
ctrlr->max_xfer_size);
return EIO;
}
if (is_user_buffer) {
/*
* Ensure the user buffer is wired for the duration of
* this pass-through command.
*/
PHOLD(curproc);
buf = uma_zalloc(pbuf_zone, M_WAITOK);
buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
ret = EFAULT;
goto err;
}
req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
nvme_pt_done, pt);
} else
req = nvme_allocate_request_vaddr(pt->buf, pt->len,
nvme_pt_done, pt);
} else
req = nvme_allocate_request_null(nvme_pt_done, pt);
/* Assume user space already converted to little-endian */
req->cmd.opc = pt->cmd.opc;
req->cmd.fuse = pt->cmd.fuse;
req->cmd.rsvd2 = pt->cmd.rsvd2;
req->cmd.rsvd3 = pt->cmd.rsvd3;
req->cmd.cdw10 = pt->cmd.cdw10;
req->cmd.cdw11 = pt->cmd.cdw11;
req->cmd.cdw12 = pt->cmd.cdw12;
req->cmd.cdw13 = pt->cmd.cdw13;
req->cmd.cdw14 = pt->cmd.cdw14;
req->cmd.cdw15 = pt->cmd.cdw15;
req->cmd.nsid = htole32(nsid);
mtx = mtx_pool_find(mtxpool_sleep, pt);
pt->driver_lock = mtx;
if (is_admin_cmd)
nvme_ctrlr_submit_admin_request(ctrlr, req);
else
nvme_ctrlr_submit_io_request(ctrlr, req);
mtx_lock(mtx);
while (pt->driver_lock != NULL)
mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
mtx_unlock(mtx);
err:
if (buf != NULL) {
uma_zfree(pbuf_zone, buf);
PRELE(curproc);
}
return (ret);
}
static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct nvme_controller *ctrlr;
struct nvme_pt_command *pt;
ctrlr = cdev->si_drv1;
switch (cmd) {
case NVME_RESET_CONTROLLER:
nvme_ctrlr_reset(ctrlr);
break;
case NVME_PASSTHROUGH_CMD:
pt = (struct nvme_pt_command *)arg;
return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
1 /* is_user_buffer */, 1 /* is_admin_cmd */));
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
sizeof(gnsid->cdev));
gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = 0;
break;
}
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = ctrlr->max_xfer_size;
break;
default:
return (ENOTTY);
}
return (0);
}
static struct cdevsw nvme_ctrlr_cdevsw = {
.d_version = D_VERSION,
.d_flags = 0,
.d_ioctl = nvme_ctrlr_ioctl
};
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
struct make_dev_args md_args;
uint32_t cap_lo;
uint32_t cap_hi;
uint32_t to, vs, pmrcap;
uint8_t mpsmin;
int status, timeout_period;
ctrlr->dev = dev;
mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
if (bus_get_domain(dev, &ctrlr->domain) != 0)
ctrlr->domain = 0;
cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
if (bootverbose) {
device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n",
cap_lo, NVME_CAP_LO_MQES(cap_lo),
NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "",
NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "",
(NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "",
(NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "",
NVME_CAP_LO_TO(cap_lo));
}
cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
if (bootverbose) {
device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, "
"MPSMIN %u, MPSMAX %u%s%s\n", cap_hi,
NVME_CAP_HI_DSTRD(cap_hi),
NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "",
NVME_CAP_HI_CSS(cap_hi),
NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "",
NVME_CAP_HI_MPSMIN(cap_hi),
NVME_CAP_HI_MPSMAX(cap_hi),
NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "",
NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "");
}
if (bootverbose) {
vs = nvme_mmio_read_4(ctrlr, vs);
device_printf(dev, "Version: 0x%08x: %d.%d\n", vs,
NVME_MAJOR(vs), NVME_MINOR(vs));
}
if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) {
pmrcap = nvme_mmio_read_4(ctrlr, pmrcap);
device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, "
"PMRWBM %x, PMRTO %u%s\n", pmrcap,
NVME_PMRCAP_BIR(pmrcap),
NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "",
NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "",
NVME_PMRCAP_PMRTU(pmrcap),
NVME_PMRCAP_PMRWBM(pmrcap),
NVME_PMRCAP_PMRTO(pmrcap),
NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : "");
}
ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
mpsmin = NVME_CAP_HI_MPSMIN(cap_hi);
ctrlr->min_page_size = 1 << (12 + mpsmin);
/* Get ready timeout value from controller, in units of 500ms. */
to = NVME_CAP_LO_TO(cap_lo) + 1;
ctrlr->ready_timeout_in_ms = to * 500;
timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
ctrlr->timeout_period = timeout_period;
nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
ctrlr->enable_aborts = 0;
TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
return (ENXIO);
ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
taskqueue_thread_enqueue, &ctrlr->taskqueue);
taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
ctrlr->is_resetting = 0;
ctrlr->is_initialized = 0;
ctrlr->notification_sent = 0;
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
STAILQ_INIT(&ctrlr->fail_req);
ctrlr->is_failed = false;
make_dev_args_init(&md_args);
md_args.mda_devsw = &nvme_ctrlr_cdevsw;
md_args.mda_uid = UID_ROOT;
md_args.mda_gid = GID_WHEEL;
md_args.mda_mode = 0600;
md_args.mda_unit = device_get_unit(dev);
md_args.mda_si_drv1 = (void *)ctrlr;
status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
device_get_unit(dev));
if (status != 0)
return (ENXIO);
return (0);
}
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
int gone, i;
if (ctrlr->resource == NULL)
goto nores;
/*
* Check whether this is a hot unplug or a clean driver detach.
* If the device is no longer present, skip any shutdown commands.
*/
gone = (nvme_mmio_read_4(ctrlr, csts) == 0xffffffff);
if (gone)
nvme_ctrlr_fail(ctrlr);
else
nvme_notify_fail_consumers(ctrlr);
for (i = 0; i < NVME_MAX_NAMESPACES; i++)
nvme_ns_destruct(&ctrlr->ns[i]);
if (ctrlr->cdev)
destroy_dev(ctrlr->cdev);
if (ctrlr->is_initialized) {
if (!gone) {
if (ctrlr->hmb_nchunks > 0)
nvme_ctrlr_hmb_enable(ctrlr, false, false);
nvme_ctrlr_delete_qpairs(ctrlr);
}
nvme_ctrlr_hmb_free(ctrlr);
}
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_io_qpair_destroy(&ctrlr->ioq[i]);
free(ctrlr->ioq, M_NVME);
}
nvme_admin_qpair_destroy(&ctrlr->adminq);
/*
* Notify the controller of a shutdown, even though this is due to
* a driver unload, not a system shutdown (this path is not invoked
* during shutdown). This ensures the controller receives a
shutdown notification in case the system is shut down before
* reloading the driver.
*/
if (!gone)
nvme_ctrlr_shutdown(ctrlr);
if (!gone)
nvme_ctrlr_disable(ctrlr);
if (ctrlr->taskqueue)
taskqueue_free(ctrlr->taskqueue);
if (ctrlr->tag)
bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
if (ctrlr->res)
bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
rman_get_rid(ctrlr->res), ctrlr->res);
if (ctrlr->bar4_resource != NULL) {
bus_release_resource(dev, SYS_RES_MEMORY,
ctrlr->bar4_resource_id, ctrlr->bar4_resource);
}
bus_release_resource(dev, SYS_RES_MEMORY,
ctrlr->resource_id, ctrlr->resource);
nores:
mtx_destroy(&ctrlr->lock);
}
void
nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
{
uint32_t cc;
uint32_t csts;
int ticks = 0, timeout;
cc = nvme_mmio_read_4(ctrlr, cc);
cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
nvme_mmio_write_4(ctrlr, cc, cc);
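/*
* RTD3E is reported by the controller in microseconds; convert it to
* ticks, rounding up, and fall back to 5 seconds if it is not reported.
*/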
timeout = ctrlr->cdata.rtd3e == 0 ? 5 * hz :
((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000;
while (1) {
csts = nvme_mmio_read_4(ctrlr, csts);
if (csts == 0xffffffff) /* Hot unplug. */
break;
if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
break;
if (ticks++ > timeout) {
nvme_printf(ctrlr, "did not complete shutdown within"
" %d ticks of notification\n", timeout);
break;
}
pause("nvme shn", 1);
}
}
void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
struct nvme_request *req)
{
nvme_qpair_submit_request(&ctrlr->adminq, req);
}
void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
struct nvme_request *req)
{
struct nvme_qpair *qpair;
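/* Select the I/O queue assigned to the submitting CPU. */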
qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
nvme_qpair_submit_request(qpair, req);
}
device_t
nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
{
return (ctrlr->dev);
}
const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
{
return (&ctrlr->cdata);
}
int
nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
{
int to = hz;
/*
* Can't touch failed controllers, so treat them as already suspended.
*/
if (ctrlr->is_failed)
return (0);
/*
* We don't want the reset taskqueue running, since it does similar
* things, so prevent it from running after we start. Wait for any reset
* that may have been started to complete. The reset process we follow
* will ensure that any new I/O will queue and be given to the hardware
* after we resume (though there should be none).
*/
while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
pause("nvmesusp", 1);
if (to <= 0) {
nvme_printf(ctrlr,
"Competing reset task didn't finish. Try again later.\n");
return (EWOULDBLOCK);
}
if (ctrlr->hmb_nchunks > 0)
nvme_ctrlr_hmb_enable(ctrlr, false, false);
/*
* Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
* delete the hardware I/O queues and then shut down. This properly
* flushes any metadata the drive may have stored so it can survive
* having its power removed and prevents the unsafe shutdown count from
* incrementing. Once we delete the qpairs, we have to disable them
* before shutting down. The delay is out of paranoia in
* nvme_ctrlr_hw_reset, and is repeated here (though we should have no
* pending I/O that the delay copes with).
*/
nvme_ctrlr_delete_qpairs(ctrlr);
nvme_ctrlr_disable_qpairs(ctrlr);
DELAY(100*1000);
nvme_ctrlr_shutdown(ctrlr);
return (0);
}
int
nvme_ctrlr_resume(struct nvme_controller *ctrlr)
{
/*
* Can't touch failed controllers, so nothing to do to resume.
*/
if (ctrlr->is_failed)
return (0);
/*
* Have to reset the hardware twice, just like we do on attach. See
* nvme_attach() for why.
*/
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
/*
* Now that we've reset the hardware, we can restart the controller. Any
* I/O that was pending is requeued. Any admin commands are aborted with
* an error. Once we've restarted, take the controller out of reset.
*/
nvme_ctrlr_start(ctrlr, true);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
return (0);
fail:
/*
* Since we can't bring the controller out of reset, announce and fail
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
return (0);
}
diff --git a/sys/dev/pms/freebsd/driver/ini/src/agdef.h b/sys/dev/pms/freebsd/driver/ini/src/agdef.h
index a07d97357fd3..cabcafc1819c 100644
--- a/sys/dev/pms/freebsd/driver/ini/src/agdef.h
+++ b/sys/dev/pms/freebsd/driver/ini/src/agdef.h
@@ -1,204 +1,204 @@
/*******************************************************************************
**
**
*Copyright (c) 2014 PMC-Sierra, Inc. All rights reserved.
*
*Redistribution and use in source and binary forms, with or without modification, are permitted provided
*that the following conditions are met:
*1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
*2. Redistributions in binary form must reproduce the above copyright notice,
*this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
*
*THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
*
*INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
*SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
*OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
*WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
*THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
*
* $FreeBSD$
*
**
*******************************************************************************/
/******************************************************************************
Note:
*******************************************************************************
Module Name:
agdef.h
Abstract:
Linux iSCSI/FC Initiator driver module constant define header file
Authors:
EW - Yiding(Eddie) Wang
Environment:
Kernel or loadable module
Version Control Information:
$ver. 1.0.0
Revision History:
$Revision: 115514 $0.1.0
$Date: 2012-01-06 17:12:27 -0800 (Fri, 06 Jan 2012) $09-27-2001
$Modtime: 11/12/01 11:15a $15:56:00
Notes:
**************************** MODIFICATION HISTORY *****************************
NAME DATE Rev. DESCRIPTION
---- ---- ---- -----------
EW 09-17-2004 1.0.0 Constant definitions
******************************************************************************/
#ifndef __AGTIAPI_DEF_H__
#define __AGTIAPI_DEF_H__
/*
** Max device supported
*/
#define AGTIAPI_HW_LIMIT_DEVICE 4096
#define AGTIAPI_MAX_LUN 256 /* Max # luns per target */
#define AGTIAPI_MAX_DEVICE 128 //64 //2048//1024 /* Max # device per channel */
#define AGTIAPI_MAX_DEVICE_7H 256 /*Max devices per channel in 7H */
#define AGTIAPI_MAX_DEVICE_8H 512 /*Max devices per channel in 8H*/
#define AGTIAPI_MAX_CAM_Q_DEPTH 1024
-#define AGTIAPI_NSEGS (MAXPHYS / PAGE_SIZE)
+#define AGTIAPI_NSEGS (maxphys / PAGE_SIZE)
/*
** Adapter specific defines
*/
#define AGTIAPI_IO_RANGE 256 /* IO mapped address range */
/*
** Scatter/Gather DMA Segment Descriptor
** Note, MAX_Q_DEPTH could be set larger for iscsi "AcceptQueueSize"
** parameter matching. One thing to do is to make it to be an adjustable
** parameter. Currently suggest this value set to be same as
** "AcceptQueueSize" but not required.
*/
#define AGTIAPI_MAX_DMA_SEGS 128//256
#define AGTIAPI_DEFAULT_Q_DEPTH 4
#define AGTIAPI_MAX_Q_DEPTH AGSA_MAX_INBOUND_Q * 512 // *INBOUND_DEPTH_SIZE
/*
** CCB and device flags defines
*/
#define ACTIVE 0x00000001
#define TIMEDOUT 0x00000002
#define REQ_DONE 0x00000004
#define AGTIAPI_INQUIRY 0x00000008
#define AGTIAPI_ABORT 0x00000010
#define AGTIAPI_RETRY 0x00000020
#define TASK_SUCCESS 0x00000040
/* reserved for card flag
#define AGTIAPI_RESERVED 0x00000080
*/
#define AGTIAPI_CNX_UP 0x00000100
#define DEV_RESET 0x00000400 /* device reset */
#define DEV_SHIFT 0x00000800 /* device shift physical position */
#define AGTIAPI_YAM 0x00001000
#define TASK_TIMEOUT 0x00002000
#define ENCRYPTED_IO 0x00010000 /* encrypted IO */
#define SATA_DIF 0x00020000 /* SATA DIF */
#define EDC_DATA 0x00040000
#define EDC_DATA_CRC 0x00080000
#define TAG_SMP 0x40000000
#define TASK_MANAGEMENT 0x80000000
#define AGTIAPI_CCB_PER_DEVICE 64
#define AGTIAPI_CMD_PER_LUN 512
/*
** Max time to call agtiapi_GetDevHandle
** to make sure that no devices are attached
*/
#define AGTIAPI_GET_DEV_MAX 2
/*
** Device address mode
*/
#define AGTIAPI_ADDRMODE_SHIFT 6
#define AGTIAPI_PERIPHERAL 0x00
#define AGTIAPI_VOLUME_SET 0x01
#define AGTIAPI_LUN_ADDR 0x02
/*
** Device mapping method
*/
#define SOFT_MAPPED 0x0001
#define HARD_MAPPED 0x0002
/*
** bd_dev_type definitions
*/
#define DIRECT_DEVICE 0x00
#define TAPE_DEVICE 0x01
#define SLOW_DEVICE 0x02
#define ARRAY_DEVICE 0x04
/*
** SCSI CDB
*/
#define SCSI_CDB_SIZE 16
/*
** SCSI status
*/
#define SCSI_GOOD 0x00
#define SCSI_CHECK_CONDITION 0x02
#define SCSI_CONDITION_MET 0x04
#define SCSI_BUSY 0x08
#define SCSI_INTERMEDIATE 0x10
#define SCSI_INTERMEDIATE_COND_MET 0x14
#define SCSI_RESERVATION_CONFLICT 0x18
#define SCSI_TASK_ABORTED 0x40
#define SCSI_TASK_SET_FULL 0x28
#define SCSI_ACA_ACTIVE 0x30
/*
** Peripheral device types
*/
#define DTYPE_DIRECT 0x00
#define DTYPE_SEQUENTIAL 0x01
#define DTYPE_PRINTER 0x02
#define DTYPE_PROCESSOR 0x03
#define DTYPE_WORM 0x04
#define DTYPE_RODIRECT 0x05
#define DTYPE_SCANNER 0x06
#define DTYPE_OPTICAL 0x07
#define DTYPE_CHANGER 0x08
#define DTYPE_COMM 0x09
#define DTYPE_ARRAY_CTRL 0x0C
#define DTYPE_ESI 0x0D
/*
** Device types 0x0E-0x1E are reserved
*/
#define DTYPE_MASK 0x1F
/*
** Driver capability defines
*/
#define AGTIAPI_TIMEOUT_SECS 10 /* Default timer interval */
#define AGTIAPI_RESET_MAX 0x7FFFFFFF /* Default max. reset */
#define AGTIAPI_DEV_RESET_MAX 0x10 /* Default max. reset */
#define AGTIAPI_RETRY_MAX 10 /* Default ccb retry cnt */
#define AGTIAPI_MAX_CHANNEL_NUM 0 /* Max channel # per card */
#define AGTIAPI_PERIPHERAL_CHANNEL 0
#define AGTIAPI_VOLUMESET_CHANNEL 1
#define AGTIAPI_LUNADDR_CHANNEL 2
#define AGTIAPI_EXTRA_DELAY 10000 /* extra 10 seconds delay */
/*
** Scsi ioctl test case only
*/
#define AGTIAPI_TEST_ABORT 0xabcd
#define AGTIAPI_TEST_ABORT_DONE 0xabce
#define AGTIAPI_IOCTL_SIGNATURE "AGTIAPI_IOCTL"
#define AGTIAPI_HBA_SCSI_ID (AGTIAPI_MAX_DEVICE - 1)
#define AGTIAPI_NO_RESEND 0x01 /* Don't resend command */
#define AGTIAPI_RESEND 0x02 /* Resend command */
//#define AGTIAPI_UPPER 0x04 /* Call from upper layer */
#define AGTIAPI_CALLBACK 0x08 /* CMD call back required */
#endif /* __AGTIAPI_DEF_H__ */
diff --git a/sys/dev/pms/freebsd/driver/ini/src/agtiapi.c b/sys/dev/pms/freebsd/driver/ini/src/agtiapi.c
index 1144cc5c43b0..7738e1216907 100644
--- a/sys/dev/pms/freebsd/driver/ini/src/agtiapi.c
+++ b/sys/dev/pms/freebsd/driver/ini/src/agtiapi.c
@@ -1,6635 +1,6635 @@
/*******************************************************************************
**
*Copyright (c) 2014 PMC-Sierra, Inc. All rights reserved.
*
*Redistribution and use in source and binary forms, with or without modification, are permitted provided
*that the following conditions are met:
*1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
*2. Redistributions in binary form must reproduce the above copyright notice,
*this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
*
*THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
*
*INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
*SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
*OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
*WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
*THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
**
*******************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <dev/pms/config.h>
#define MAJOR_REVISION 1
#define MINOR_REVISION 3
#define BUILD_REVISION 10800
#include <sys/param.h> // defines used in kernel.h
#include <sys/ioccom.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h> // types used in module initialization
#include <sys/conf.h> // cdevsw struct
#include <sys/uio.h> // uio struct
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/bus.h> // structs, prototypes for pci bus stuff
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <vm/vm.h> // 1. for vtophys
#include <vm/pmap.h> // 2. for vtophys
#include <dev/pci/pcivar.h> // For pci_get macros
#include <dev/pci/pcireg.h>
#include <sys/endian.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sema.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <machine/atomic.h>
#include <sys/libkern.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_debug.h>
#include <cam/cam_periph.h> //
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <dev/pms/RefTisa/tisa/api/tiapi.h>
#include <dev/pms/freebsd/driver/ini/src/agtiapi.h>
#include <dev/pms/freebsd/driver/ini/src/agtiproto.h>
#include <dev/pms/RefTisa/tisa/api/ostiapi.h>
#include <dev/pms/RefTisa/tisa/sassata/common/tdsatypes.h>
#include <dev/pms/freebsd/driver/common/lxencrypt.h>
MALLOC_DEFINE( M_PMC_MCCB, "CCB List", "CCB List for PMCS driver" );
MALLOC_DEFINE( M_PMC_MSTL, "STLock malloc",
"allocated in agtiapi_attach as memory for lock use" );
MALLOC_DEFINE( M_PMC_MDVT, "ag_device_t malloc",
"allocated in agtiapi_attach as mem for ag_device_t pDevList" );
MALLOC_DEFINE( M_PMC_MPRT, "ag_portal_data_t malloc",
"allocated in agtiapi_attach as mem for *pPortalData" );
MALLOC_DEFINE( M_PMC_MDEV, "tiDeviceHandle_t * malloc",
"allocated in agtiapi_GetDevHandle as local mem for **agDev" );
MALLOC_DEFINE( M_PMC_MFLG, "lDevFlags * malloc",
"allocated in agtiapi_GetDevHandle as local mem for * flags" );
#ifdef LINUX_PERBI_SUPPORT
MALLOC_DEFINE( M_PMC_MSLR, "ag_slr_map_t malloc",
"mem allocated in agtiapi_attach for pSLRList" );
MALLOC_DEFINE( M_PMC_MTGT, "ag_tgt_map_t malloc",
"mem allocated in agtiapi_attach for pWWNList" );
#endif
MALLOC_DEFINE(TEMP,"tempbuff","buffer for payload");
MALLOC_DEFINE(TEMP2, "tempbuff", "buffer for agtiapi_getdevlist");
STATIC U32 agtiapi_intx_mode = 0;
STATIC U08 ag_Perbi = 0;
STATIC U32 agtiapi_polling_mode = 0;
STATIC U32 ag_card_good = 0; // * total card initialized
STATIC U32 ag_option_flag = 0; // * adjustable parameter flag
STATIC U32 agtiapi_1st_time = 1;
STATIC U32 ag_timeout_secs = 10; //Made timeout equivalent to linux
U32 gTiDebugLevel = 1;
S32 ag_encryption_enable = 0;
atomic_t outstanding_encrypted_io_count;
#define cache_line_size() CACHE_LINE_SIZE
#define PMCoffsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#define CPU_TO_LE32(dst, src) \
dst.lower = htole32(LOW_32_BITS(src)); \
dst.upper = htole32(HIGH_32_BITS(src))
#define CMND_TO_CHANNEL( ccb ) ( ccb->ccb_h.path_id )
#define CMND_TO_TARGET( ccb ) ( ccb->ccb_h.target_id )
#define CMND_TO_LUN( ccb ) ( ccb->ccb_h.target_lun )
STATIC U08 agtiapi_AddrModes[AGTIAPI_MAX_CHANNEL_NUM + 1] =
{ AGTIAPI_PERIPHERAL };
#ifdef LINUX_PERBI_SUPPORT
// Holding area for target-WWN mapping assignments on the boot line
static ag_mapping_t *agMappingList = NULL; // modified by agtiapi_Setup()
#endif
// * For Debugging Purpose
#ifdef AGTIAPI_DEBUG
#define AGTIAPI_WWN(name, len) wwnprintk(name, len)
#else
#define AGTIAPI_WWN(name, len)
#endif
#define AGTIAPI_WWNPRINTK(name, len, format, a...) \
AGTIAPI_PRINTK(format "name ", a); \
AGTIAPI_WWN((unsigned char*)name, len);
#define AGTIAPI_ERR_WWNPRINTK(name, len, format, a...) \
printk(KERN_DEBUG format "name ", ## a); \
wwnprintk((unsigned char*)name, len);
#define AGTIAPI_CPY_DEV_INFO(root, dev, pDev) \
tiINIGetDeviceInfo(root, dev, &pDev->devInfo); \
wwncpy(pDev);
#ifdef AGTIAPI_LOCAL_LOCK
#define AG_CARD_LOCAL_LOCK(lock) ,(lock)
#define AG_SPIN_LOCK_IRQ(lock, flags)
#define AG_SPIN_UNLOCK_IRQ(lock, flags)
#define AG_SPIN_LOCK(lock)
#define AG_SPIN_UNLOCK(lock)
#define AG_GLOBAL_ARG(arg)
#define AG_PERF_SPINLOCK(lock)
#define AG_PERF_SPINLOCK_IRQ(lock, flags)
#define AG_LOCAL_LOCK(lock) if (lock) \
mtx_lock(lock)
#define AG_LOCAL_UNLOCK(lock) if (lock) \
mtx_unlock(lock)
#define AG_LOCAL_FLAGS(_flags) unsigned long _flags = 0
#endif
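/*
* Detach the list of completed CCBs under the done lock, then process it
* with the lock released.
*/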
#define AG_GET_DONE_PCCB(pccb, pmcsc) \
{ \
AG_LOCAL_LOCK(&pmcsc->doneLock); \
pccb = pmcsc->ccbDoneHead; \
if (pccb != NULL) \
{ \
pmcsc->ccbDoneHead = NULL; \
pmcsc->ccbDoneTail = NULL; \
AG_LOCAL_UNLOCK(&pmcsc->doneLock); \
agtiapi_Done(pmcsc, pccb); \
} \
else \
AG_LOCAL_UNLOCK(&pmcsc->doneLock); \
}
#define AG_GET_DONE_SMP_PCCB(pccb, pmcsc) \
{ \
AG_LOCAL_LOCK(&pmcsc->doneSMPLock); \
pccb = pmcsc->smpDoneHead; \
if (pccb != NULL) \
{ \
pmcsc->smpDoneHead = NULL; \
pmcsc->smpDoneTail = NULL; \
AG_LOCAL_UNLOCK(&pmcsc->doneSMPLock); \
agtiapi_SMPDone(pmcsc, pccb); \
} \
else \
AG_LOCAL_UNLOCK(&pmcsc->doneSMPLock); \
}
#ifdef AGTIAPI_DUMP_IO_DEBUG
#define AG_IO_DUMPCCB(pccb) agtiapi_DumpCCB(pccb)
#else
#define AG_IO_DUMPCCB(pccb)
#endif
#define SCHED_DELAY_JIFFIES 4 /* in seconds */
#ifdef HOTPLUG_SUPPORT
#define AG_HOTPLUG_LOCK_INIT(lock) mxt_init(lock)
#define AG_LIST_LOCK(lock) mtx_lock(lock)
#define AG_LIST_UNLOCK(lock) mtx_unlock(lock)
#else
#define AG_HOTPLUG_LOCK_INIT(lock)
#define AG_LIST_LOCK(lock)
#define AG_LIST_UNLOCK(lock)
#endif
STATIC void agtiapi_CheckIOTimeout(void *data);
static ag_card_info_t agCardInfoList[ AGTIAPI_MAX_CARDS ]; // card info list
static void agtiapi_cam_action( struct cam_sim *, union ccb * );
static void agtiapi_cam_poll( struct cam_sim * );
// Function prototypes
static d_open_t agtiapi_open;
static d_close_t agtiapi_close;
static d_read_t agtiapi_read;
static d_write_t agtiapi_write;
static d_ioctl_t agtiapi_CharIoctl;
static void agtiapi_async(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
void agtiapi_adjust_queue_depth(struct cam_path *path, bit32 QueueDepth);
// Character device entry points
static struct cdevsw agtiapi_cdevsw = {
.d_version = D_VERSION,
.d_open = agtiapi_open,
.d_close = agtiapi_close,
.d_read = agtiapi_read,
.d_write = agtiapi_write,
.d_ioctl = agtiapi_CharIoctl,
.d_name = "pmspcv",
};
U32 maxTargets = 0;
U32 ag_portal_count = 0;
// In the cdevsw routines, we find our softc by using the si_drv1 member
// of struct cdev. We set this variable to point to our softc in our
// attach routine when we create the /dev entry.
int agtiapi_open( struct cdev *dev, int oflags, int devtype, struct thread *td )
{
struct agtiapi_softc *sc;
/* Look up our softc. */
sc = dev->si_drv1;
AGTIAPI_PRINTK("agtiapi_open\n");
AGTIAPI_PRINTK("Opened successfully. sc->my_dev %p\n", sc->my_dev);
return( 0 );
}
int agtiapi_close( struct cdev *dev, int fflag, int devtype, struct thread *td )
{
struct agtiapi_softc *sc;
// Look up our softc
sc = dev->si_drv1;
AGTIAPI_PRINTK("agtiapi_close\n");
AGTIAPI_PRINTK("Closed. sc->my_dev %p\n", sc->my_dev);
return( 0 );
}
int agtiapi_read( struct cdev *dev, struct uio *uio, int ioflag )
{
struct agtiapi_softc *sc;
// Look up our softc
sc = dev->si_drv1;
AGTIAPI_PRINTK( "agtiapi_read\n" );
AGTIAPI_PRINTK( "Asked to read %lu bytes. sc->my_dev %p\n",
uio->uio_resid, sc->my_dev );
return( 0 );
}
int agtiapi_write( struct cdev *dev, struct uio *uio, int ioflag )
{
struct agtiapi_softc *sc;
// Look up our softc
sc = dev->si_drv1;
AGTIAPI_PRINTK( "agtiapi_write\n" );
AGTIAPI_PRINTK( "Asked to write %lu bytes. sc->my_dev %p\n",
uio->uio_resid, sc->my_dev );
return( 0 );
}
int agtiapi_getdevlist( struct agtiapi_softc *pCard,
tiIOCTLPayload_t *agIOCTLPayload )
{
tdDeviceListPayload_t *pIoctlPayload =
(tdDeviceListPayload_t *) agIOCTLPayload->FunctionSpecificArea;
tdDeviceInfoIOCTL_t *pDeviceInfo = NULL;
bit8 *pDeviceInfoOrg;
tdsaDeviceData_t *pDeviceData = NULL;
tiDeviceHandle_t **devList = NULL;
tiDeviceHandle_t **devHandleArray = NULL;
tiDeviceHandle_t *pDeviceHandle = NULL;
bit32 x, memNeeded1;
bit32 count, total;
bit32 MaxDeviceCount;
bit32 ret_val=IOCTL_CALL_INVALID_CODE;
ag_portal_data_t *pPortalData;
bit8 *pDeviceHandleList = NULL;
AGTIAPI_PRINTK( "agtiapi_getdevlist: Enter\n" );
pDeviceInfoOrg = pIoctlPayload -> pDeviceInfo;
MaxDeviceCount = pCard->devDiscover;
if (MaxDeviceCount > pIoctlPayload->deviceLength )
{
AGTIAPI_PRINTK( "agtiapi_getdevlist: MaxDeviceCount: %d > Requested device length: %d\n", MaxDeviceCount, pIoctlPayload->deviceLength );
MaxDeviceCount = pIoctlPayload->deviceLength;
ret_val = IOCTL_CALL_FAIL;
}
AGTIAPI_PRINTK( "agtiapi_getdevlist: MaxDeviceCount: %d > Requested device length: %d\n", MaxDeviceCount, pIoctlPayload->deviceLength );
memNeeded1 = AG_ALIGNSIZE( MaxDeviceCount * sizeof(tiDeviceHandle_t *),
sizeof(void *) );
AGTIAPI_PRINTK("agtiapi_getdevlist: portCount %d\n", pCard->portCount);
devList = malloc(memNeeded1, TEMP2, M_WAITOK);
if (devList == NULL)
{
AGTIAPI_PRINTK("agtiapi_getdevlist: failed to allocate memory\n");
ret_val = IOCTL_CALL_FAIL;
agIOCTLPayload->Status = IOCTL_ERR_STATUS_INTERNAL_ERROR;
return ret_val;
}
osti_memset(devList, 0, memNeeded1);
pPortalData = &pCard->pPortalData[0];
pDeviceHandleList = (bit8*)devList;
for (total = x = 0; x < pCard->portCount; x++, pPortalData++)
{
count = tiINIGetDeviceHandlesForWinIOCTL(&pCard->tiRoot,
&pPortalData->portalInfo.tiPortalContext,
( tiDeviceHandle_t **)pDeviceHandleList ,MaxDeviceCount );
if (count == DISCOVERY_IN_PROGRESS)
{
AGTIAPI_PRINTK( "agtiapi_getdevlist: DISCOVERY_IN_PROGRESS on "
"portal %d\n", x );
free(devList, TEMP2);
ret_val = IOCTL_CALL_FAIL;
agIOCTLPayload->Status = IOCTL_ERR_STATUS_INTERNAL_ERROR;
return ret_val;
}
total += count;
pDeviceHandleList+= count*sizeof(tiDeviceHandle_t *);
MaxDeviceCount-= count;
}
if (total > pIoctlPayload->deviceLength)
{
total = pIoctlPayload->deviceLength;
}
// dump device information from device handle list
count = 0;
devHandleArray = devList;
for (x = 0; x < pCard->devDiscover; x++)
{
pDeviceHandle = (tiDeviceHandle_t*)devHandleArray[x];
if (devList[x] != agNULL)
{
pDeviceData = devList [x]->tdData;
pDeviceInfo = (tdDeviceInfoIOCTL_t*)(pDeviceInfoOrg + sizeof(tdDeviceInfoIOCTL_t) * count);
if (pDeviceData != agNULL && pDeviceInfo != agNULL)
{
osti_memcpy( &pDeviceInfo->sasAddressHi,
pDeviceData->agDeviceInfo.sasAddressHi,
sizeof(bit32) );
osti_memcpy( &pDeviceInfo->sasAddressLo,
pDeviceData->agDeviceInfo.sasAddressLo,
sizeof(bit32) );
#if 0
pDeviceInfo->sasAddressHi =
DMA_BEBIT32_TO_BIT32( pDeviceInfo->sasAddressHi );
pDeviceInfo->sasAddressLo =
DMA_BEBIT32_TO_BIT32( pDeviceInfo->sasAddressLo );
#endif
pDeviceInfo->deviceType =
( pDeviceData->agDeviceInfo.devType_S_Rate & 0x30 ) >> 4;
pDeviceInfo->linkRate =
pDeviceData->agDeviceInfo.devType_S_Rate & 0x0F;
pDeviceInfo->phyId = pDeviceData->phyID;
pDeviceInfo->ishost = pDeviceData->target_ssp_stp_smp;
pDeviceInfo->DeviceHandle= (unsigned long)pDeviceHandle;
if(pDeviceInfo->deviceType == 0x02)
{
bit8 *sasAddressHi;
bit8 *sasAddressLo;
tiIniGetDirectSataSasAddr(&pCard->tiRoot, pDeviceData->phyID, &sasAddressHi, &sasAddressLo);
pDeviceInfo->sasAddressHi = DMA_BEBIT32_TO_BIT32(*(bit32*)sasAddressHi);
pDeviceInfo->sasAddressLo = DMA_BEBIT32_TO_BIT32(*(bit32*)sasAddressLo) + pDeviceData->phyID + 16;
}
else
{
pDeviceInfo->sasAddressHi =
DMA_BEBIT32_TO_BIT32( pDeviceInfo->sasAddressHi );
pDeviceInfo->sasAddressLo =
DMA_BEBIT32_TO_BIT32( pDeviceInfo->sasAddressLo );
}
AGTIAPI_PRINTK( "agtiapi_getdevlist: devicetype %x\n",
pDeviceInfo->deviceType );
AGTIAPI_PRINTK( "agtiapi_getdevlist: linkrate %x\n",
pDeviceInfo->linkRate );
AGTIAPI_PRINTK( "agtiapi_getdevlist: phyID %x\n",
pDeviceInfo->phyId );
AGTIAPI_PRINTK( "agtiapi_getdevlist: addresshi %x\n",
pDeviceInfo->sasAddressHi );
AGTIAPI_PRINTK( "agtiapi_getdevlist: addresslo %x\n",
pDeviceInfo->sasAddressLo );
}
else
{
AGTIAPI_PRINTK( "agtiapi_getdevlist: pDeviceData %p or pDeviceInfo "
"%p is NULL %d\n", pDeviceData, pDeviceInfo, x );
}
count++;
}
}
pIoctlPayload->realDeviceCount = count;
AGTIAPI_PRINTK( "agtiapi_getdevlist: Exit RealDeviceCount = %d\n", count );
if (devList)
{
free(devList, TEMP2);
}
if(ret_val != IOCTL_CALL_FAIL)
{
ret_val = IOCTL_CALL_SUCCESS;
}
agIOCTLPayload->Status = IOCTL_ERR_STATUS_OK;
return ret_val;
}
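/*
Illustrative sketch (not part of the driver): how a userland caller might walk
the device array that agtiapi_getdevlist() fills in. The exact layouts of
tdDeviceListPayload_t and tdDeviceInfoIOCTL_t come from the shared IOCTL
headers; only the fields referenced above are assumed, and pDeviceInfo is
treated as the in-buffer entry array, just as the kernel side treats it.

  tdDeviceListPayload_t *pList =
      (tdDeviceListPayload_t *)payload->FunctionSpecificArea;
  tdDeviceInfoIOCTL_t *pInfo = (tdDeviceInfoIOCTL_t *)pList->pDeviceInfo;
  bit32 i;
  for (i = 0; i < pList->realDeviceCount; i++, pInfo++)
  {
    printf("dev %u: type %u rate %u phy %u addr %08x:%08x\n", i,
           pInfo->deviceType, pInfo->linkRate, pInfo->phyId,
           pInfo->sasAddressHi, pInfo->sasAddressLo);
  }
*/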
/******************************************************************************
agtiapi_getCardInfo()
Purpose:
This function retrieves the card information
Parameters:
Return:
A number - error
0 - HBA has been detected
Note:
******************************************************************************/
int agtiapi_getCardInfo ( struct agtiapi_softc *pCard,
U32_64 size,
void *buffer )
{
CardInfo_t *pCardInfo;
pCardInfo = (CardInfo_t *)buffer;
pCardInfo->deviceId = pci_get_device(pCard->my_dev);
pCardInfo->vendorId =pci_get_vendor(pCard->my_dev) ;
memcpy( pCardInfo->pciMemBaseSpc,
pCard->pCardInfo->pciMemBaseSpc,
((sizeof(U32_64))*PCI_NUMBER_BARS) );
pCardInfo->deviceNum = pci_get_slot(pCard->my_dev);
pCardInfo->pciMemBase = pCard->pCardInfo->pciMemBase;
pCardInfo->pciIOAddrLow = pCard->pCardInfo->pciIOAddrLow;
pCardInfo->pciIOAddrUp = pCard->pCardInfo->pciIOAddrUp;
pCardInfo->busNum =pci_get_bus(pCard->my_dev);
return 0;
}
void agtiapi_adjust_queue_depth(struct cam_path *path, bit32 QueueDepth)
{
struct ccb_relsim crs;
xpt_setup_ccb(&crs.ccb_h, path, 5);
crs.ccb_h.func_code = XPT_REL_SIMQ;
crs.ccb_h.flags = CAM_DEV_QFREEZE;
crs.release_flags = RELSIM_ADJUST_OPENINGS;
crs.openings = QueueDepth;
xpt_action((union ccb *)&crs);
if(crs.ccb_h.status != CAM_REQ_CMP) {
printf("XPT_REL_SIMQ failed\n");
}
}
static void
agtiapi_async(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct agtiapi_softc *pmsc;
U32 TID;
ag_device_t *targ;
pmsc = (struct agtiapi_softc*)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
break;
}
TID = cgd->ccb_h.target_id;
if (TID >= 0 && TID < maxTargets){
if (pmsc != NULL){
TID = INDEX(pmsc, TID);
targ = &pmsc->pDevList[TID];
agtiapi_adjust_queue_depth(path, targ->qdepth);
}
}
break;
}
default:
break;
}
}
/******************************************************************************
agtiapi_CharIoctl()
Purpose:
This function handles ioctl requests from the application layer
Parameters:
Return:
A number - error
0 - HBA has been detected
Note:
******************************************************************************/
static int agtiapi_CharIoctl( struct cdev *dev,
u_long cmd,
caddr_t data,
int fflag,
struct thread *td )
{
struct sema mx;
datatosend *load; // structure defined in lxcommon.h
tiIOCTLPayload_t *pIoctlPayload;
struct agtiapi_softc *pCard;
pCard=dev->si_drv1;
U32 status = 0;
U32 retValue;
int err = 0;
int error = 0;
tdDeviceListPayload_t *pDeviceList = NULL;
unsigned long flags;
switch (cmd)
{
case AGTIAPI_IOCTL:
load=(datatosend*)data;
pIoctlPayload = malloc(load->datasize,TEMP,M_WAITOK);
AGTIAPI_PRINTK( "agtiapi_CharIoctl: old load->datasize = %d\n", load->datasize );
//Copy payload to kernel buffer, on success it returns 0
err = copyin(load->data,pIoctlPayload,load->datasize);
if (err)
{
status = IOCTL_CALL_FAIL;
return status;
}
sema_init(&mx,0,"sem");
pCard->pIoctlSem =&mx;
pCard->up_count = pCard->down_count = 0;
if ( pIoctlPayload->MajorFunction == IOCTL_MJ_GET_DEVICE_LIST )
{
retValue = agtiapi_getdevlist(pCard, pIoctlPayload);
if (retValue == 0)
{
pIoctlPayload->Status = IOCTL_CALL_SUCCESS;
status = IOCTL_CALL_SUCCESS;
}
else
{
pIoctlPayload->Status = IOCTL_CALL_FAIL;
status = IOCTL_CALL_FAIL;
}
//update new device length
pDeviceList = (tdDeviceListPayload_t*)pIoctlPayload->FunctionSpecificArea;
load->datasize =load->datasize - sizeof(tdDeviceInfoIOCTL_t) * (pDeviceList->deviceLength - pDeviceList->realDeviceCount);
AGTIAPI_PRINTK( "agtiapi_CharIoctl: new load->datasize = %d\n", load->datasize );
}
else if (pIoctlPayload->MajorFunction == IOCTL_MN_GET_CARD_INFO)
{
retValue = agtiapi_getCardInfo( pCard,
pIoctlPayload->Length,
(pIoctlPayload->FunctionSpecificArea) );
if (retValue == 0)
{
pIoctlPayload->Status = IOCTL_CALL_SUCCESS;
status = IOCTL_CALL_SUCCESS;
}
else
{
pIoctlPayload->Status = IOCTL_CALL_FAIL;
status = IOCTL_CALL_FAIL;
}
}
else if ( pIoctlPayload->MajorFunction == IOCTL_MJ_CHECK_DPMC_EVENT )
{
if ( pCard->flags & AGTIAPI_PORT_PANIC )
{
strcpy ( pIoctlPayload->FunctionSpecificArea, "DPMC LEAN\n" );
}
else
{
strcpy ( pIoctlPayload->FunctionSpecificArea, "do not dpmc lean\n" );
}
pIoctlPayload->Status = IOCTL_CALL_SUCCESS;
status = IOCTL_CALL_SUCCESS;
}
else if (pIoctlPayload->MajorFunction == IOCTL_MJ_CHECK_FATAL_ERROR )
{
AGTIAPI_PRINTK("agtiapi_CharIoctl: IOCTL_MJ_CHECK_FATAL_ERROR call received for card %d\n", pCard->cardNo);
//read port status to see if there is a fatal event
if(pCard->flags & AGTIAPI_PORT_PANIC)
{
printf("agtiapi_CharIoctl: Port Panic Status For Card %d is True\n",pCard->cardNo);
pIoctlPayload->Status = IOCTL_MJ_FATAL_ERR_CHK_SEND_TRUE;
}
else
{
AGTIAPI_PRINTK("agtiapi_CharIoctl: Port Panic Status For Card %d is False\n",pCard->cardNo);
pIoctlPayload->Status = IOCTL_MJ_FATAL_ERR_CHK_SEND_FALSE;
}
status = IOCTL_CALL_SUCCESS;
}
else if (pIoctlPayload->MajorFunction == IOCTL_MJ_FATAL_ERROR_DUMP_COMPLETE)
{
AGTIAPI_PRINTK("agtiapi_CharIoctl: IOCTL_MJ_FATAL_ERROR_DUMP_COMPLETE call received for card %d\n", pCard->cardNo);
//set flags bit status to be a soft reset
pCard->flags |= AGTIAPI_SOFT_RESET;
//trigger soft reset for the card
retValue = agtiapi_ResetCard (pCard, &flags);
if(retValue == AGTIAPI_SUCCESS)
{
//clear port panic status
pCard->flags &= ~AGTIAPI_PORT_PANIC;
pIoctlPayload->Status = IOCTL_MJ_FATAL_ERROR_SOFT_RESET_TRIG;
status = IOCTL_CALL_SUCCESS;
}
else
{
pIoctlPayload->Status = IOCTL_CALL_FAIL;
status = IOCTL_CALL_FAIL;
}
}
else
{
status = tiCOMMgntIOCTL( &pCard->tiRoot,
pIoctlPayload,
pCard,
NULL,
NULL );
if (status == IOCTL_CALL_PENDING)
{
ostiIOCTLWaitForSignal(&pCard->tiRoot,NULL, NULL, NULL);
status = IOCTL_CALL_SUCCESS;
}
}
pCard->pIoctlSem = NULL;
err = 0;
//copy kernel buffer to userland buffer
err=copyout(pIoctlPayload,load->data,load->datasize);
if (err)
{
status = IOCTL_CALL_FAIL;
return status;
}
free(pIoctlPayload,TEMP);
pIoctlPayload=NULL;
break;
default:
error = ENOTTY;
break;
}
return(status);
}
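/*
Illustrative sketch (not part of the driver): the userland calling convention
expected by the AGTIAPI_IOCTL case above. datatosend and tiIOCTLPayload_t are
defined in the shared IOCTL headers (see lxcommon.h); only the members used by
agtiapi_CharIoctl() are assumed here, and the device node name follows the
make_dev("spcv%u", ...) call in agtiapi_attach() below.

  int fd = open("/dev/spcv0", O_RDWR);
  tiIOCTLPayload_t *payload = calloc(1, bufsize);
  payload->MajorFunction = IOCTL_MJ_GET_DEVICE_LIST;
  // ... fill payload->FunctionSpecificArea with a tdDeviceListPayload_t ...
  datatosend load;
  load.data = (void *)payload;
  load.datasize = bufsize;
  if (ioctl(fd, AGTIAPI_IOCTL, &load) == 0 &&
      payload->Status == IOCTL_CALL_SUCCESS)
  {
    // parse the returned device list
  }
*/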
/******************************************************************************
agtiapi_probe()
Purpose:
This function probes for and identifies a supported PMC-Sierra HBA.
It is the first driver entry point called for each candidate device, before agtiapi_attach().
Parameters:
device_t dev (IN) - device pointer
Return:
A number - error
0 - HBA has been detected
Note:
******************************************************************************/
static int agtiapi_probe( device_t dev )
{
int retVal;
int thisCard;
ag_card_info_t *thisCardInst;
thisCard = device_get_unit( dev );
if ( thisCard >= AGTIAPI_MAX_CARDS )
{
device_printf( dev, "Too many PMC-Sierra cards detected ERROR!\n" );
return (ENXIO); // maybe change to different return value?
}
thisCardInst = &agCardInfoList[ thisCard ];
retVal = agtiapi_ProbeCard( dev, thisCardInst, thisCard );
if ( retVal )
return (ENXIO); // maybe change to different return value?
return( BUS_PROBE_DEFAULT ); // successful probe
}
/******************************************************************************
agtiapi_attach()
Purpose:
This function initializes and registers a detected HBA.
It is the first driver function called after agtiapi_probe() succeeds.
Parameters:
device_t dev (IN) - device pointer
Return:
A number - error
0 - HBA has been detected
Note:
******************************************************************************/
static int agtiapi_attach( device_t devx )
{
// keeping get_unit call to once
int thisCard = device_get_unit( devx );
struct agtiapi_softc *pmsc;
ag_card_info_t *thisCardInst = &agCardInfoList[ thisCard ];
ag_resource_info_t *pRscInfo;
int idx;
int lenRecv;
char buffer [256], *pLastUsedChar;
union ccb *ccb;
int bus, tid, lun;
struct ccb_setasync csa;
AGTIAPI_PRINTK("agtiapi_attach: start dev %p thisCard %d\n", devx, thisCard);
// AGTIAPI_PRINTK( "agtiapi_attach: entry pointer values A %p / %p\n",
// thisCardInst->pPCIDev, thisCardInst );
AGTIAPI_PRINTK( "agtiapi_attach: deviceID: 0x%x\n", pci_get_devid( devx ) );
TUNABLE_INT_FETCH( "DPMC_TIMEOUT_SECS", &ag_timeout_secs );
TUNABLE_INT_FETCH( "DPMC_TIDEBUG_LEVEL", &gTiDebugLevel );
// printf( "agtiapi_attach: debugLevel %d, timeout %d\n",
// gTiDebugLevel, ag_timeout_secs );
if ( ag_timeout_secs < 1 )
{
ag_timeout_secs = 1; // set minimum timeout value of 1 second
}
ag_timeout_secs = (ag_timeout_secs * 1000); // convert to millisecond notation
// Look up our softc and initialize its fields.
pmsc = device_get_softc( devx );
pmsc->my_dev = devx;
/* Get NumberOfPortals */
if ((ostiGetTransportParam(
&pmsc->tiRoot,
"Global",
"CardDefault",
agNULL,
agNULL,
agNULL,
agNULL,
"NumberOfPortals",
buffer,
255,
&lenRecv
) == tiSuccess) && (lenRecv != 0))
{
if (osti_strncmp(buffer, "0x", 2) == 0)
{
ag_portal_count = osti_strtoul (buffer, &pLastUsedChar, 0);
}
else
{
ag_portal_count = osti_strtoul (buffer, &pLastUsedChar, 10);
}
if (ag_portal_count > AGTIAPI_MAX_PORTALS)
ag_portal_count = AGTIAPI_MAX_PORTALS;
}
else
{
ag_portal_count = AGTIAPI_MAX_PORTALS;
}
AGTIAPI_PRINTK( "agtiapi_attach: ag_portal_count=%d\n", ag_portal_count );
// initialize hostdata structure
pmsc->flags |= AGTIAPI_INIT_TIME | AGTIAPI_SCSI_REGISTERED |
AGTIAPI_INITIATOR;
pmsc->cardNo = thisCard;
pmsc->ccbTotal = 0;
pmsc->portCount = ag_portal_count;
pmsc->pCardInfo = thisCardInst;
pmsc->tiRoot.osData = pmsc;
pmsc->pCardInfo->pCard = (void *)pmsc;
pmsc->VidDid = ( pci_get_vendor(devx) << 16 ) | pci_get_device( devx );
pmsc->SimQFrozen = agFALSE;
pmsc->devq_flag = agFALSE;
pRscInfo = &thisCardInst->tiRscInfo;
osti_memset(buffer, 0, 256);
lenRecv = 0;
/* Get MaxTargets */
if ((ostiGetTransportParam(
&pmsc->tiRoot,
"Global",
"InitiatorParms",
agNULL,
agNULL,
agNULL,
agNULL,
"MaxTargets",
buffer,
sizeof(buffer),
&lenRecv
) == tiSuccess) && (lenRecv != 0))
{
if (osti_strncmp(buffer, "0x", 2) == 0)
{
maxTargets = osti_strtoul (buffer, &pLastUsedChar, 0);
AGTIAPI_PRINTK( "agtiapi_attach: maxTargets = osti_strtoul 0 \n" );
}
else
{
maxTargets = osti_strtoul (buffer, &pLastUsedChar, 10);
AGTIAPI_PRINTK( "agtiapi_attach: maxTargets = osti_strtoul 10\n" );
}
}
else
{
if(Is_ADP8H(pmsc))
maxTargets = AGTIAPI_MAX_DEVICE_8H;
else if(Is_ADP7H(pmsc))
maxTargets = AGTIAPI_MAX_DEVICE_7H;
else
maxTargets = AGTIAPI_MAX_DEVICE;
}
if (maxTargets > AGTIAPI_HW_LIMIT_DEVICE)
{
AGTIAPI_PRINTK( "agtiapi_attach: maxTargets: %d > AGTIAPI_HW_LIMIT_DEVICE: %d\n", maxTargets, AGTIAPI_HW_LIMIT_DEVICE );
AGTIAPI_PRINTK( "agtiapi_attach: change maxTargets = AGTIAPI_HW_LIMIT_DEVICE\n" );
maxTargets = AGTIAPI_HW_LIMIT_DEVICE;
}
pmsc->devDiscover = maxTargets ;
#ifdef HIALEAH_ENCRYPTION
ag_encryption_enable = 1;
if(ag_encryption_enable && pci_get_device(pmsc->pCardInfo->pPCIDev) ==
PCI_DEVICE_ID_HIALEAH_HBA_SPCVE)
{
pmsc->encrypt = 1;
pRscInfo->tiLoLevelResource.loLevelOption.encryption = agTRUE;
printf("agtiapi_attach: Encryption Enabled\n" );
}
#endif
// ## for now, skip calls to ostiGetTransportParam(...)
// ## for now, skip references to DIF & EDC
// Create a /dev entry for this device. The kernel will assign us
// a major number automatically. We use the unit number of this
// device as the minor number and name the character device
// "agtiapi<unit>".
pmsc->my_cdev = make_dev( &agtiapi_cdevsw, thisCard, UID_ROOT, GID_WHEEL,
0600, "spcv%u", thisCard );
pmsc->my_cdev->si_drv1 = pmsc;
mtx_init( &thisCardInst->pmIOLock, "pmc SAS I/O lock",
NULL, MTX_DEF|MTX_RECURSE );
struct cam_devq *devq;
/* set the maximum number of pending IOs */
devq = cam_simq_alloc( AGTIAPI_MAX_CAM_Q_DEPTH );
if (devq == NULL)
{
AGTIAPI_PRINTK("agtiapi_attach: cam_simq_alloc is NULL\n" );
return( EIO );
}
struct cam_sim *lsim;
lsim = cam_sim_alloc( agtiapi_cam_action,
agtiapi_cam_poll,
"pmspcbsd",
pmsc,
thisCard,
&thisCardInst->pmIOLock,
1, // queued per target
AGTIAPI_MAX_CAM_Q_DEPTH, // max tag depth
devq );
if ( lsim == NULL ) {
cam_simq_free( devq );
AGTIAPI_PRINTK("agtiapi_attach: cam_sim_alloc is NULL\n" );
return( EIO );
}
pmsc->dev_scan = agFALSE;
//one cam sim per scsi bus
mtx_lock( &thisCardInst->pmIOLock );
if ( xpt_bus_register( lsim, devx, 0 ) != CAM_SUCCESS )
{ // bus 0
cam_sim_free( lsim, TRUE );
mtx_unlock( &thisCardInst->pmIOLock );
AGTIAPI_PRINTK("agtiapi_attach: xpt_bus_register fails\n" );
return( EIO );
}
pmsc->sim = lsim;
bus = cam_sim_path(pmsc->sim);
tid = CAM_TARGET_WILDCARD;
lun = CAM_LUN_WILDCARD;
ccb = xpt_alloc_ccb_nowait();
if (ccb == agNULL)
{
mtx_unlock( &thisCardInst->pmIOLock );
cam_sim_free( lsim, TRUE );
cam_simq_free( devq );
return ( EIO );
}
if (xpt_create_path(&ccb->ccb_h.path, agNULL, bus, tid,
CAM_LUN_WILDCARD) != CAM_REQ_CMP)
{
mtx_unlock( &thisCardInst->pmIOLock );
cam_sim_free( lsim, TRUE );
cam_simq_free( devq );
xpt_free_ccb(ccb);
return( EIO );
}
pmsc->path = ccb->ccb_h.path;
xpt_setup_ccb(&csa.ccb_h, pmsc->path, 5);
csa.ccb_h.func_code = XPT_SASYNC_CB;
csa.event_enable = AC_FOUND_DEVICE;
csa.callback = agtiapi_async;
csa.callback_arg = pmsc;
xpt_action((union ccb *)&csa);
if (csa.ccb_h.status != CAM_REQ_CMP) {
AGTIAPI_PRINTK("agtiapi_attach: Unable to register AC_FOUND_DEVICE\n" );
}
lsim->devq = devq;
mtx_unlock( &thisCardInst->pmIOLock );
// get TD and lower layer memory requirements
tiCOMGetResource( &pmsc->tiRoot,
&pRscInfo->tiLoLevelResource,
&pRscInfo->tiInitiatorResource,
NULL,
&pRscInfo->tiSharedMem );
agtiapi_ScopeDMARes( thisCardInst );
AGTIAPI_PRINTK( "agtiapi_attach: size from the call agtiapi_ScopeDMARes"
" 0x%x \n", pmsc->typhn );
// initialize card information and get resource ready
if( agtiapi_InitResource( thisCardInst ) == AGTIAPI_FAIL ) {
AGTIAPI_PRINTK( "agtiapi_attach: Card %d initialize resource ERROR\n",
thisCard );
}
// begin: allocate and initialize card portal info resource
ag_portal_data_t *pPortalData;
if (pmsc->portCount == 0)
{
pmsc->pPortalData = NULL;
}
else
{
pmsc->pPortalData = (ag_portal_data_t *)
malloc( sizeof(ag_portal_data_t) * pmsc->portCount,
M_PMC_MPRT, M_ZERO | M_WAITOK );
if (pmsc->pPortalData == NULL)
{
AGTIAPI_PRINTK( "agtiapi_attach: Portal memory allocation ERROR\n" );
}
}
pPortalData = pmsc->pPortalData;
for( idx = 0; idx < pmsc->portCount; idx++ ) {
pPortalData->pCard = pmsc;
pPortalData->portalInfo.portID = idx;
pPortalData->portalInfo.tiPortalContext.osData = (void *)pPortalData;
pPortalData++;
}
// end: allocate and initialize card portal info resource
// begin: enable msix
// setup msix
// map to interrupt handler
int error = 0;
int mesgs = MAX_MSIX_NUM_VECTOR;
int i, cnt;
void (*intrHandler[MAX_MSIX_NUM_ISR])(void *arg) =
{
agtiapi_IntrHandler0,
agtiapi_IntrHandler1,
agtiapi_IntrHandler2,
agtiapi_IntrHandler3,
agtiapi_IntrHandler4,
agtiapi_IntrHandler5,
agtiapi_IntrHandler6,
agtiapi_IntrHandler7,
agtiapi_IntrHandler8,
agtiapi_IntrHandler9,
agtiapi_IntrHandler10,
agtiapi_IntrHandler11,
agtiapi_IntrHandler12,
agtiapi_IntrHandler13,
agtiapi_IntrHandler14,
agtiapi_IntrHandler15
};
cnt = pci_msix_count(devx);
AGTIAPI_PRINTK("supported MSIX %d\n", cnt); //this should be 64
mesgs = MIN(mesgs, cnt);
error = pci_alloc_msix(devx, &mesgs);
if (error != 0) {
printf( "pci_alloc_msix error %d\n", error );
AGTIAPI_PRINTK("error %d\n", error);
return( EIO );
}
for(i=0; i < mesgs; i++) {
pmsc->rscID[i] = i + 1;
pmsc->irq[i] = bus_alloc_resource_any( devx,
SYS_RES_IRQ,
&pmsc->rscID[i],
RF_ACTIVE );
if( pmsc->irq[i] == NULL ) {
printf( "RES_IRQ went terribly bad at %d\n", i );
return( EIO );
}
if ( (error = bus_setup_intr( devx, pmsc->irq[i],
INTR_TYPE_CAM | INTR_MPSAFE,
NULL,
intrHandler[i],
pmsc,
&pmsc->intrcookie[i] )
) != 0 ) {
device_printf( devx, "Failed to register handler" );
return( EIO );
}
}
pmsc->flags |= AGTIAPI_IRQ_REQUESTED;
pmsc->pCardInfo->maxInterruptVectors = MAX_MSIX_NUM_VECTOR;
// end: enable msix
int ret = 0;
ret = agtiapi_InitCardSW(pmsc);
if (ret == AGTIAPI_FAIL || ret == AGTIAPI_UNKNOWN)
{
AGTIAPI_PRINTK( "agtiapi_attach: agtiapi_InitCardSW failure %d\n",
ret );
return( EIO );
}
pmsc->ccbFreeList = NULL;
pmsc->ccbChainList = NULL;
pmsc->ccbAllocList = NULL;
pmsc->flags |= ( AGTIAPI_INSTALLED );
ret = agtiapi_alloc_requests( pmsc );
if( ret != 0 ) {
AGTIAPI_PRINTK( "agtiapi_attach: agtiapi_alloc_requests failure %d\n",
ret );
return( EIO );
}
ret = agtiapi_alloc_ostimem( pmsc );
if (ret != AGTIAPI_SUCCESS)
{
AGTIAPI_PRINTK( "agtiapi_attach: agtiapi_alloc_ostimem failure %d\n",
ret );
return( EIO );
}
ret = agtiapi_InitCardHW( pmsc );
if (ret != 0)
{
AGTIAPI_PRINTK( "agtiapi_attach: agtiapi_InitCardHW failure %d\n",
ret );
return( EIO );
}
#ifdef HIALEAH_ENCRYPTION
if(pmsc->encrypt)
{
if((agtiapi_SetupEncryption(pmsc)) < 0)
AGTIAPI_PRINTK("SetupEncryption returned less than 0\n");
}
#endif
pmsc->flags &= ~AGTIAPI_INIT_TIME;
return( 0 );
}
/******************************************************************************
agtiapi_InitCardSW()
Purpose:
Host Bus Adapter Initialization
Parameters:
struct agtiapi_softc *pmsc (IN) Pointer to the HBA data structure
Return:
AGTIAPI_SUCCESS - success
AGTIAPI_FAIL - fail
Note:
TBD, need chip register information
******************************************************************************/
STATIC agBOOLEAN agtiapi_InitCardSW( struct agtiapi_softc *pmsc )
{
ag_card_info_t *thisCardInst = pmsc->pCardInfo;
ag_resource_info_t *pRscInfo = &thisCardInst->tiRscInfo;
int initSWIdx;
// begin: agtiapi_InitCardSW()
// now init some essential locks n agtiapi_InitCardSW
mtx_init( &pmsc->sendLock, "local q send lock", NULL, MTX_DEF );
mtx_init( &pmsc->doneLock, "local q done lock", NULL, MTX_DEF );
mtx_init( &pmsc->sendSMPLock, "local q send lock", NULL, MTX_DEF );
mtx_init( &pmsc->doneSMPLock, "local q done lock", NULL, MTX_DEF );
mtx_init( &pmsc->ccbLock, "ccb list lock", NULL, MTX_DEF );
mtx_init( &pmsc->devListLock, "hotP devListLock", NULL, MTX_DEF );
mtx_init( &pmsc->memLock, "dynamic memory lock", NULL, MTX_DEF );
mtx_init( &pmsc->freezeLock, "sim freeze lock", NULL, MTX_DEF | MTX_RECURSE);
// initialize lower layer resources
//## if (pCard->flags & AGTIAPI_INIT_TIME) {
#ifdef HIALEAH_ENCRYPTION
/* Enable encryption if chip supports it */
if (pci_get_device(pmsc->pCardInfo->pPCIDev) ==
PCI_DEVICE_ID_HIALEAH_HBA_SPCVE)
pmsc->encrypt = 1;
if (pmsc->encrypt)
pRscInfo->tiLoLevelResource.loLevelOption.encryption = agTRUE;
#endif
pmsc->flags &= ~(AGTIAPI_PORT_INITIALIZED | AGTIAPI_SYS_INTR_ON);
// For now, up to 16 MSIX vectors are supported
thisCardInst->tiRscInfo.tiLoLevelResource.loLevelOption.
maxInterruptVectors = pmsc->pCardInfo->maxInterruptVectors;
AGTIAPI_PRINTK( "agtiapi_InitCardSW: maxInterruptVectors set to %d",
pmsc->pCardInfo->maxInterruptVectors );
thisCardInst->tiRscInfo.tiLoLevelResource.loLevelOption.max_MSI_InterruptVectors = 0;
thisCardInst->tiRscInfo.tiLoLevelResource.loLevelOption.flag = 0;
pRscInfo->tiLoLevelResource.loLevelOption.maxNumOSLocks = 0;
AGTIAPI_PRINTK( "agtiapi_InitCardSW: tiCOMInit root %p, dev %p, pmsc %p\n",
&pmsc->tiRoot, pmsc->my_dev, pmsc );
if( tiCOMInit( &pmsc->tiRoot,
&thisCardInst->tiRscInfo.tiLoLevelResource,
&thisCardInst->tiRscInfo.tiInitiatorResource,
NULL,
&thisCardInst->tiRscInfo.tiSharedMem ) != tiSuccess ) {
AGTIAPI_PRINTK( "agtiapi_InitCardSW: tiCOMInit ERROR\n" );
return AGTIAPI_FAIL;
}
int maxLocks;
maxLocks = pRscInfo->tiLoLevelResource.loLevelOption.numOfQueuesPerPort;
pmsc->STLock = malloc( ( maxLocks * sizeof(struct mtx) ), M_PMC_MSTL,
M_ZERO | M_WAITOK );
for( initSWIdx = 0; initSWIdx < maxLocks; initSWIdx++ )
{
// init all indexes
mtx_init( &pmsc->STLock[initSWIdx], "LL & TD lock", NULL, MTX_DEF );
}
if( tiCOMPortInit( &pmsc->tiRoot, agFALSE ) != tiSuccess ) {
printf( "agtiapi_InitCardSW: tiCOMPortInit ERROR -- AGTIAPI_FAIL\n" );
return AGTIAPI_FAIL;
}
AGTIAPI_PRINTK( "agtiapi_InitCardSW: tiCOMPortInit"
" root %p, dev %p, pmsc %p\n",
&pmsc->tiRoot, pmsc->my_dev, pmsc );
pmsc->flags |= AGTIAPI_PORT_INITIALIZED;
pmsc->freezeSim = agFALSE;
#ifdef HIALEAH_ENCRYPTION
atomic_set(&outstanding_encrypted_io_count, 0);
/*fix below*/
/*if(pmsc->encrypt && (pmsc->flags & AGTIAPI_INIT_TIME))
if((agtiapi_SetupEncryptionPools(pmsc)) != 0)
printf("SetupEncryptionPools failed\n"); */
#endif
return AGTIAPI_SUCCESS;
// end: agtiapi_InitCardSW()
}
/******************************************************************************
agtiapi_InitCardHW()
Purpose:
Host Bus Adapter Initialization
Parameters:
struct agtiapi_softc *pmsc (IN) Pointer to the HBA data structure
Return:
AGTIAPI_SUCCESS - success
AGTIAPI_FAIL - fail
Note:
TBD, need chip register information
******************************************************************************/
STATIC agBOOLEAN agtiapi_InitCardHW( struct agtiapi_softc *pmsc )
{
U32 numVal;
U32 count;
U32 loop;
// begin: agtiapi_InitCardHW()
ag_portal_info_t *pPortalInfo = NULL;
ag_portal_data_t *pPortalData;
// ISR is registered, enable chip interrupt.
tiCOMSystemInterruptsActive( &pmsc->tiRoot, agTRUE );
pmsc->flags |= AGTIAPI_SYS_INTR_ON;
numVal = sizeof(ag_device_t) * pmsc->devDiscover;
pmsc->pDevList =
(ag_device_t *)malloc( numVal, M_PMC_MDVT, M_ZERO | M_WAITOK );
if( !pmsc->pDevList ) {
AGTIAPI_PRINTK( "agtiapi_InitCardHW: kmalloc %d DevList ERROR\n", numVal );
panic( "agtiapi_InitCardHW\n" );
return AGTIAPI_FAIL;
}
#ifdef LINUX_PERBI_SUPPORT
numVal = sizeof(ag_slr_map_t) * pmsc->devDiscover;
pmsc->pSLRList =
(ag_slr_map_t *)malloc( numVal, M_PMC_MSLR, M_ZERO | M_WAITOK );
if( !pmsc->pSLRList ) {
AGTIAPI_PRINTK( "agtiapi_InitCardHW: kmalloc %d SLRList ERROR\n", numVal );
panic( "agtiapi_InitCardHW SLRL\n" );
return AGTIAPI_FAIL;
}
numVal = sizeof(ag_tgt_map_t) * pmsc->devDiscover;
pmsc->pWWNList =
(ag_tgt_map_t *)malloc( numVal, M_PMC_MTGT, M_ZERO | M_WAITOK );
if( !pmsc->pWWNList ) {
AGTIAPI_PRINTK( "agtiapi_InitCardHW: kmalloc %d WWNList ERROR\n", numVal );
panic( "agtiapi_InitCardHW WWNL\n" );
return AGTIAPI_FAIL;
}
// Get the WWN_to_target_ID mappings from the
// holding area which contains the input of the
// system configuration file.
if( ag_Perbi )
agtiapi_GetWWNMappings( pmsc, agMappingList );
else {
agtiapi_GetWWNMappings( pmsc, 0 );
if( agMappingList )
printf( "agtiapi_InitCardHW: WWN PERBI disabled WARN\n" );
}
#endif
//agtiapi_DelaySec(5);
DELAY( 500000 );
pmsc->tgtCount = 0;
pmsc->flags &= ~AGTIAPI_CB_DONE;
pPortalData = pmsc->pPortalData;
//start port
for (count = 0; count < pmsc->portCount; count++)
{
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, flags );
pPortalInfo = &pPortalData->portalInfo;
pPortalInfo->portStatus &= ~( AGTIAPI_PORT_START |
AGTIAPI_PORT_DISC_READY |
AGTIAPI_DISC_DONE |
AGTIAPI_DISC_COMPLETE );
for (loop = 0; loop < AGTIAPI_LOOP_MAX; loop++)
{
AGTIAPI_PRINTK( "tiCOMPortStart entry data %p / %d / %p\n",
&pmsc->tiRoot,
pPortalInfo->portID,
&pPortalInfo->tiPortalContext );
if( tiCOMPortStart( &pmsc->tiRoot,
pPortalInfo->portID,
&pPortalInfo->tiPortalContext,
0 )
!= tiSuccess ) {
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
agtiapi_DelayMSec( AGTIAPI_EXTRA_DELAY );
AG_SPIN_LOCK_IRQ(agtiapi_host_lock, flags);
AGTIAPI_PRINTK( "tiCOMPortStart failed -- no loop, portalData %p\n",
pPortalData );
}
else {
AGTIAPI_PRINTK( "tiCOMPortStart success no loop, portalData %p\n",
pPortalData );
break;
}
} // end of for loop
/* release lock */
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
if( loop >= AGTIAPI_LOOP_MAX ) {
return AGTIAPI_FAIL;
}
tiCOMGetPortInfo( &pmsc->tiRoot,
&pPortalInfo->tiPortalContext,
&pPortalInfo->tiPortInfo );
pPortalData++;
}
/* discover target device */
#ifndef HOTPLUG_SUPPORT
agtiapi_DiscoverTgt( pCard );
#endif
pmsc->flags |= AGTIAPI_INSTALLED;
if( pmsc->flags & AGTIAPI_INIT_TIME ) {
agtiapi_TITimer( (void *)pmsc );
pmsc->flags |= AGTIAPI_TIMER_ON;
}
return 0;
}
/******************************************************************************
agtiapi_IntrHandlerx_()
Purpose:
Interrupt service routine.
Parameters:
void arg (IN) Pointer to the HBA data structure
bit32 idx (IN) Vector index
******************************************************************************/
void agtiapi_IntrHandlerx_( void *arg, int index )
{
struct agtiapi_softc *pCard;
int rv;
pCard = (struct agtiapi_softc *)arg;
#ifndef AGTIAPI_DPC
ccb_t *pccb;
#endif
AG_LOCAL_LOCK(&(pCard->pCardInfo->pmIOLock));
AG_PERF_SPINLOCK(agtiapi_host_lock);
if (pCard->flags & AGTIAPI_SHUT_DOWN)
goto ext;
rv = tiCOMInterruptHandler(&pCard->tiRoot, index);
if (rv == agFALSE)
{
/* not our irq */
AG_SPIN_UNLOCK(agtiapi_host_lock);
AG_LOCAL_UNLOCK(&(pCard->pCardInfo->pmIOLock));
return;
}
#ifdef AGTIAPI_DPC
tasklet_hi_schedule(&pCard->tasklet_dpc[index]);
#else
/* consume all completed entries, 100 is random number to be big enough */
tiCOMDelayedInterruptHandler(&pCard->tiRoot, index, 100, tiInterruptContext);
AG_GET_DONE_PCCB(pccb, pCard);
AG_GET_DONE_SMP_PCCB(pccb, pCard);
#endif
ext:
AG_SPIN_UNLOCK(agtiapi_host_lock);
AG_LOCAL_UNLOCK(&(pCard->pCardInfo->pmIOLock));
return;
}
/******************************************************************************
agtiapi_IntrHandler0()
Purpose: Interrupt service routine for interrupt vector index 0.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler0( void *arg )
{
agtiapi_IntrHandlerx_( arg, 0 );
return;
}
/******************************************************************************
agtiapi_IntrHandler1()
Purpose: Interrupt service routine for interrupt vector index 1.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler1( void *arg )
{
agtiapi_IntrHandlerx_( arg, 1 );
return;
}
/******************************************************************************
agtiapi_IntrHandler2()
Purpose: Interrupt service routine for interrupt vector index 2.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler2( void *arg )
{
agtiapi_IntrHandlerx_( arg, 2 );
return;
}
/******************************************************************************
agtiapi_IntrHandler3()
Purpose: Interrupt service routine for interrupt vector index 3.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler3( void *arg )
{
agtiapi_IntrHandlerx_( arg, 3 );
return;
}
/******************************************************************************
agtiapi_IntrHandler4()
Purpose: Interrupt service routine for interrupt vector index 4.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler4( void *arg )
{
agtiapi_IntrHandlerx_( arg, 4 );
return;
}
/******************************************************************************
agtiapi_IntrHandler5()
Purpose: Interrupt service routine for interrupt vector index 5.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler5( void *arg )
{
agtiapi_IntrHandlerx_( arg, 5 );
return;
}
/******************************************************************************
agtiapi_IntrHandler6()
Purpose: Interrupt service routine for interrupt vector index 6.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler6( void *arg )
{
agtiapi_IntrHandlerx_( arg, 6 );
return;
}
/******************************************************************************
agtiapi_IntrHandler7()
Purpose: Interrupt service routine for interrupt vector index 7.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler7( void *arg )
{
agtiapi_IntrHandlerx_( arg, 7 );
return;
}
/******************************************************************************
agtiapi_IntrHandler8()
Purpose: Interrupt service routine for interrupt vector index 8.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler8( void *arg )
{
agtiapi_IntrHandlerx_( arg, 8 );
return;
}
/******************************************************************************
agtiapi_IntrHandler9()
Purpose: Interrupt service routine for interrupt vector index 9.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler9( void *arg )
{
agtiapi_IntrHandlerx_( arg, 9 );
return;
}
/******************************************************************************
agtiapi_IntrHandler10()
Purpose: Interrupt service routine for interrupt vector index 10.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler10( void *arg )
{
agtiapi_IntrHandlerx_( arg, 10 );
return;
}
/******************************************************************************
agtiapi_IntrHandler11()
Purpose: Interrupt service routine for interrupt vector index 11.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler11( void *arg )
{
agtiapi_IntrHandlerx_( arg, 11 );
return;
}
/******************************************************************************
agtiapi_IntrHandler12()
Purpose: Interrupt service routine for interrupt vector index 12.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler12( void *arg )
{
agtiapi_IntrHandlerx_( arg, 12 );
return;
}
/******************************************************************************
agtiapi_IntrHandler13()
Purpose: Interrupt service routine for interrupt vector index 13.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler13( void *arg )
{
agtiapi_IntrHandlerx_( arg, 13 );
return;
}
/******************************************************************************
agtiapi_IntrHandler14()
Purpose: Interrupt service routine for interrupt vector index 14.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler14( void *arg )
{
agtiapi_IntrHandlerx_( arg, 14 );
return;
}
/******************************************************************************
agtiapi_IntrHandler15()
Purpose: Interrupt service routine for interrupt vector index 15.
Parameters: void arg (IN) Pointer to the HBA data structure
******************************************************************************/
void agtiapi_IntrHandler15( void *arg )
{
agtiapi_IntrHandlerx_( arg, 15 );
return;
}
static void agtiapi_SglMemoryCB( void *arg,
bus_dma_segment_t *dm_segs,
int nseg,
int error )
{
bus_addr_t *addr;
AGTIAPI_PRINTK("agtiapi_SglMemoryCB: start\n");
if (error != 0)
{
AGTIAPI_PRINTK("agtiapi_SglMemoryCB: error %d\n", error);
panic("agtiapi_SglMemoryCB: error %d\n", error);
return;
}
addr = arg;
*addr = dm_segs[0].ds_addr;
return;
}
static void agtiapi_MemoryCB( void *arg,
bus_dma_segment_t *dm_segs,
int nseg,
int error )
{
bus_addr_t *addr;
AGTIAPI_PRINTK("agtiapi_MemoryCB: start\n");
if (error != 0)
{
AGTIAPI_PRINTK("agtiapi_MemoryCB: error %d\n", error);
panic("agtiapi_MemoryCB: error %d\n", error);
return;
}
addr = arg;
*addr = dm_segs[0].ds_addr;
return;
}
/******************************************************************************
agtiapi_alloc_requests()
Purpose:
Allocates resources such as dma tag and timer
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA data structure
Return:
AGTIAPI_SUCCESS - success
AGTIAPI_FAIL - fail
Note:
******************************************************************************/
int agtiapi_alloc_requests( struct agtiapi_softc *pmcsc )
{
int rsize, nsegs;
U32 next_tick;
nsegs = AGTIAPI_NSEGS;
rsize = AGTIAPI_MAX_DMA_SEGS; // 128
- AGTIAPI_PRINTK( "agtiapi_alloc_requests: MAXPHYS 0x%x PAGE_SIZE 0x%x \n",
- MAXPHYS, PAGE_SIZE );
+ AGTIAPI_PRINTK( "agtiapi_alloc_requests: maxphys 0x%lx PAGE_SIZE 0x%x \n",
+ maxphys, PAGE_SIZE );
AGTIAPI_PRINTK( "agtiapi_alloc_requests: nsegs %d rsize %d \n",
nsegs, rsize ); // 32, 128
// This is for csio->data_ptr
if( bus_dma_tag_create( agNULL, // parent
1, // alignment
0, // boundary
BUS_SPACE_MAXADDR, // lowaddr
BUS_SPACE_MAXADDR, // highaddr
NULL, // filter
NULL, // filterarg
BUS_SPACE_MAXSIZE_32BIT, // maxsize
nsegs, // nsegments
BUS_SPACE_MAXSIZE_32BIT, // maxsegsize
BUS_DMA_ALLOCNOW, // flags
busdma_lock_mutex, // lockfunc
&pmcsc->pCardInfo->pmIOLock, // lockarg
&pmcsc->buffer_dmat ) ) {
AGTIAPI_PRINTK( "agtiapi_alloc_requests: Cannot alloc request DMA tag\n" );
return( ENOMEM );
}
// This is for tiSgl_t of pccb in agtiapi_PrepCCBs()
rsize =
(sizeof(tiSgl_t) * AGTIAPI_NSEGS) *
AGTIAPI_CCB_PER_DEVICE * maxTargets;
AGTIAPI_PRINTK( "agtiapi_alloc_requests: rsize %d \n", rsize ); // 32, 128
if( bus_dma_tag_create( agNULL, // parent
32, // alignment
0, // boundary
BUS_SPACE_MAXADDR_32BIT, // lowaddr
BUS_SPACE_MAXADDR, // highaddr
NULL, // filter
NULL, // filterarg
rsize, // maxsize
1, // nsegments
rsize, // maxsegsize
BUS_DMA_ALLOCNOW, // flags
NULL, // lockfunc
NULL, // lockarg
&pmcsc->tisgl_dmat ) ) {
AGTIAPI_PRINTK( "agtiapi_alloc_requests: Cannot alloc request DMA tag\n" );
return( ENOMEM );
}
if( bus_dmamem_alloc( pmcsc->tisgl_dmat,
(void **)&pmcsc->tisgl_mem,
BUS_DMA_NOWAIT,
&pmcsc->tisgl_map ) ) {
AGTIAPI_PRINTK( "agtiapi_alloc_requests: Cannot allocate SGL memory\n" );
return( ENOMEM );
}
bzero( pmcsc->tisgl_mem, rsize );
bus_dmamap_load( pmcsc->tisgl_dmat,
pmcsc->tisgl_map,
pmcsc->tisgl_mem,
rsize,
agtiapi_SglMemoryCB,
&pmcsc->tisgl_busaddr,
BUS_DMA_NOWAIT /* 0 */ );
mtx_init( &pmcsc->OS_timer_lock, "OS timer lock", NULL, MTX_DEF );
mtx_init( &pmcsc->IO_timer_lock, "IO timer lock", NULL, MTX_DEF );
mtx_init( &pmcsc->devRmTimerLock, "targ rm timer lock", NULL, MTX_DEF );
callout_init_mtx( &pmcsc->OS_timer, &pmcsc->OS_timer_lock, 0 );
callout_init_mtx( &pmcsc->IO_timer, &pmcsc->IO_timer_lock, 0 );
callout_init_mtx( &pmcsc->devRmTimer,
&pmcsc->devRmTimerLock, 0);
next_tick = pmcsc->pCardInfo->tiRscInfo.tiLoLevelResource.
loLevelOption.usecsPerTick / USEC_PER_TICK;
AGTIAPI_PRINTK( "agtiapi_alloc_requests: before callout_reset, "
"next_tick 0x%x\n", next_tick );
callout_reset( &pmcsc->OS_timer, next_tick, agtiapi_TITimer, pmcsc );
return 0;
}
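/*
Worked example for the next_tick conversion above (numbers are illustrative
only): if the lower layer reports usecsPerTick = 1,000,000 us and the kernel
tick length USEC_PER_TICK is 10,000 us (i.e. hz = 100), then
next_tick = 1,000,000 / 10,000 = 100 callout ticks, so the TI timer armed by
callout_reset() fires roughly once per second.
*/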
/******************************************************************************
agtiapi_alloc_ostimem()
Purpose:
Allocates memory used later in ostiAllocMemory
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA data structure
Return:
AGTIAPI_SUCCESS - success
AGTIAPI_FAIL - fail
Note:
This is a pre-allocation for ostiAllocMemory() "non-cacheable" function calls
******************************************************************************/
int agtiapi_alloc_ostimem( struct agtiapi_softc *pmcsc ) {
int rsize, nomsize;
nomsize = 4096;
rsize = AGTIAPI_DYNAMIC_MAX * nomsize; // 8M
AGTIAPI_PRINTK("agtiapi_alloc_ostimem: rsize %d \n", rsize);
if( bus_dma_tag_create( agNULL, // parent
32, // alignment
0, // boundary
BUS_SPACE_MAXADDR, // lowaddr
BUS_SPACE_MAXADDR, // highaddr
NULL, // filter
NULL, // filterarg
rsize, // maxsize (size)
1, // number of segments
rsize, // maxsegsize
0, // flags
NULL, // lockfunc
NULL, // lockarg
&pmcsc->osti_dmat ) ) {
AGTIAPI_PRINTK( "agtiapi_alloc_ostimem: Can't create no-cache mem tag\n" );
return AGTIAPI_FAIL;
}
if( bus_dmamem_alloc( pmcsc->osti_dmat,
&pmcsc->osti_mem,
BUS_DMA_WAITOK | BUS_DMA_ZERO | BUS_DMA_NOCACHE,
&pmcsc->osti_mapp ) ) {
AGTIAPI_PRINTK( "agtiapi_alloc_ostimem: Cannot allocate cache mem %d\n",
rsize );
return AGTIAPI_FAIL;
}
bus_dmamap_load( pmcsc->osti_dmat,
pmcsc->osti_mapp,
pmcsc->osti_mem,
rsize,
agtiapi_MemoryCB, // try reuse of CB for same goal
&pmcsc->osti_busaddr,
BUS_DMA_NOWAIT );
// populate all the ag_dma_addr_t osti_busaddr/mem fields with addresses for
// handy reference when driver is in motion
int idx;
ag_card_info_t *pCardInfo = pmcsc->pCardInfo;
ag_dma_addr_t *pMem;
for( idx = 0; idx < AGTIAPI_DYNAMIC_MAX; idx++ ) {
pMem = &pCardInfo->dynamicMem[idx];
pMem->nocache_busaddr = pmcsc->osti_busaddr + ( idx * nomsize );
pMem->nocache_mem = (void*)((U64)pmcsc->osti_mem + ( idx * nomsize ));
pCardInfo->freeDynamicMem[idx] = &pCardInfo->dynamicMem[idx];
}
pCardInfo->topOfFreeDynamicMem = AGTIAPI_DYNAMIC_MAX;
return AGTIAPI_SUCCESS;
}
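/*
Illustrative sketch (not part of the driver): the freeDynamicMem[] array and
topOfFreeDynamicMem counter populated above form a simple stack of pre-carved
4K non-cacheable chunks, so a later "non-cacheable" ostiAllocMemory() request
can be satisfied without calling busdma again. The helper names below are
hypothetical and only restate that pop/push discipline (callers would hold the
card's memLock).

  static ag_dma_addr_t *dynmem_pop( ag_card_info_t *ci )
  {
    if (ci->topOfFreeDynamicMem == 0)
      return NULL;                               // pool exhausted
    return ci->freeDynamicMem[--ci->topOfFreeDynamicMem];
  }

  static void dynmem_push( ag_card_info_t *ci, ag_dma_addr_t *chunk )
  {
    ci->freeDynamicMem[ci->topOfFreeDynamicMem++] = chunk;
  }
*/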
/******************************************************************************
agtiapi_cam_action()
Purpose:
Parses CAM frames and triggers a corresponding action
Parameters:
struct cam_sim *sim (IN) Pointer to SIM data structure
union ccb * ccb (IN) Pointer to CAM ccb data structure
Return:
Note:
******************************************************************************/
static void agtiapi_cam_action( struct cam_sim *sim, union ccb * ccb )
{
struct agtiapi_softc *pmcsc;
tiDeviceHandle_t *pDevHandle = NULL; // acts as flag as well
tiDeviceInfo_t devInfo;
int pathID, targetID, lunID;
int lRetVal;
U32 TID;
U32 speed = 150000;
pmcsc = cam_sim_softc( sim );
AGTIAPI_IO( "agtiapi_cam_action: start pmcs %p\n", pmcsc );
if (pmcsc == agNULL)
{
AGTIAPI_PRINTK( "agtiapi_cam_action: start pmcs is NULL\n" );
return;
}
mtx_assert( &(pmcsc->pCardInfo->pmIOLock), MA_OWNED );
AGTIAPI_IO( "agtiapi_cam_action: cardNO %d func_code 0x%x\n", pmcsc->cardNo, ccb->ccb_h.func_code );
pathID = xpt_path_path_id( ccb->ccb_h.path );
targetID = xpt_path_target_id( ccb->ccb_h.path );
lunID = xpt_path_lun_id( ccb->ccb_h.path );
AGTIAPI_IO( "agtiapi_cam_action: P 0x%x T 0x%x L 0x%x\n",
pathID, targetID, lunID );
switch (ccb->ccb_h.func_code)
{
case XPT_PATH_INQ:
{
struct ccb_pathinq *cpi;
/* See architecture book p180 */
cpi = &ccb->cpi;
cpi->version_num = 1;
cpi->hba_inquiry = PI_SDTR_ABLE | PI_TAG_ABLE | PI_WIDE_16;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_NOBUSRESET | PIM_SEQSCAN;
cpi->hba_eng_cnt = 0;
cpi->max_target = maxTargets - 1;
cpi->max_lun = AGTIAPI_MAX_LUN;
cpi->maxio = 1024 *1024; /* Max supported I/O size, in bytes. */
cpi->initiator_id = 255;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "PMC", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = cam_sim_bus(sim);
// rate is set when XPT_GET_TRAN_SETTINGS is processed
cpi->base_transfer_speed = 150000;
cpi->transport = XPORT_SAS;
cpi->transport_version = 0;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_SPC3;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts;
struct ccb_trans_settings_sas *sas;
struct ccb_trans_settings_scsi *scsi;
if ( pmcsc->flags & AGTIAPI_SHUT_DOWN )
{
return;
}
cts = &ccb->cts;
sas = &ccb->cts.xport_specific.sas;
scsi = &cts->proto_specific.scsi;
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_SPC3;
cts->transport = XPORT_SAS;
cts->transport_version = 0;
sas->valid = CTS_SAS_VALID_SPEED;
/* this sets the "MB/s transfers" */
if (pmcsc != NULL && targetID >= 0 && targetID < maxTargets)
{
if (pmcsc->pWWNList != NULL)
{
TID = INDEX(pmcsc, targetID);
if (TID < maxTargets)
{
pDevHandle = pmcsc->pDevList[TID].pDevHandle;
}
}
}
if (pDevHandle)
{
tiINIGetDeviceInfo( &pmcsc->tiRoot, pDevHandle, &devInfo );
switch (devInfo.info.devType_S_Rate & 0xF)
{
case 0x8: speed = 150000;
break;
case 0x9: speed = 300000;
break;
case 0xA: speed = 600000;
break;
case 0xB: speed = 1200000;
break;
default: speed = 150000;
break;
}
}
sas->bitrate = speed;
scsi->valid = CTS_SCSI_VALID_TQ;
scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS:
{
lRetVal = agtiapi_eh_HostReset( pmcsc, ccb ); // usually works first time
if ( SUCCESS == lRetVal )
{
AGTIAPI_PRINTK( "agtiapi_cam_action: bus reset success.\n" );
}
else
{
AGTIAPI_PRINTK( "agtiapi_cam_action: bus reset failed.\n" );
}
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_DEV:
{
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_ABORT:
{
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
#if __FreeBSD_version >= 900026
case XPT_SMP_IO:
{
agtiapi_QueueSMP( pmcsc, ccb );
return;
}
#endif /* __FreeBSD_version >= 900026 */
case XPT_SCSI_IO:
{
if(pmcsc->dev_scan == agFALSE)
{
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
if (pmcsc->flags & AGTIAPI_SHUT_DOWN)
{
AGTIAPI_PRINTK( "agtiapi_cam_action: shutdown, XPT_SCSI_IO 0x%x\n",
XPT_SCSI_IO );
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
break;
}
else
{
AGTIAPI_IO( "agtiapi_cam_action: Zero XPT_SCSI_IO 0x%x, doing IOs\n",
XPT_SCSI_IO );
agtiapi_QueueCmnd_( pmcsc, ccb );
return;
}
}
case XPT_CALC_GEOMETRY:
{
cam_calc_geometry(&ccb->ccg, 1);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
{
/*
XPT_SET_TRAN_SETTINGS
*/
AGTIAPI_IO( "agtiapi_cam_action: default function code 0x%x\n",
ccb->ccb_h.func_code );
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
break;
}
} /* switch */
xpt_done(ccb);
}
/******************************************************************************
agtiapi_GetCCB()
Purpose:
Get a ccb from the free list, if one is available
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA structure
Return:
Pointer to a ccb structure, or NULL if not available
Note:
******************************************************************************/
STATIC pccb_t agtiapi_GetCCB( struct agtiapi_softc *pmcsc )
{
pccb_t pccb;
AGTIAPI_IO( "agtiapi_GetCCB: start\n" );
AG_LOCAL_LOCK( &pmcsc->ccbLock );
/* get the ccb from the head of the free list */
if ((pccb = (pccb_t)pmcsc->ccbFreeList) != NULL)
{
pmcsc->ccbFreeList = (caddr_t *)pccb->pccbNext;
pccb->pccbNext = NULL;
pccb->flags = ACTIVE;
pccb->startTime = 0;
pmcsc->activeCCB++;
AGTIAPI_IO( "agtiapi_GetCCB: re-allocated ccb %p\n", pccb );
}
else
{
AGTIAPI_PRINTK( "agtiapi_GetCCB: kmalloc ERROR - no ccb allocated\n" );
}
AG_LOCAL_UNLOCK( &pmcsc->ccbLock );
return pccb;
}
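/*
Illustrative sketch (not part of the driver): the complementary free-list push
that agtiapi_FreeCCB() is expected to perform elsewhere in this file, mirroring
the pop in agtiapi_GetCCB() above. This only restates the list discipline
implied by ccbFreeList, pccbNext and activeCCB under ccbLock; it is not the
actual agtiapi_FreeCCB() implementation.

  AG_LOCAL_LOCK( &pmcsc->ccbLock );
  pccb->flags = 0;
  pccb->pccbNext = (pccb_t)pmcsc->ccbFreeList;     // old head of free list
  pmcsc->ccbFreeList = (caddr_t *)pccb;            // CCB becomes new head
  pmcsc->activeCCB--;
  AG_LOCAL_UNLOCK( &pmcsc->ccbLock );
*/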
/******************************************************************************
agtiapi_QueueCmnd_()
Purpose:
Builds the driver CCB for a CAM SCSI I/O request and submits it to the HBA for execution.
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA data structure
union ccb * ccb (IN) Pointer to CAM ccb data structure
Return:
0 - Command is pending to execute
1 - Command returned without further process
Note:
******************************************************************************/
int agtiapi_QueueCmnd_(struct agtiapi_softc *pmcsc, union ccb * ccb)
{
struct ccb_scsiio *csio = &ccb->csio;
pccb_t pccb = agNULL; // call dequeue
int status = tiSuccess;
U32 Channel = CMND_TO_CHANNEL(ccb);
U32 TID = CMND_TO_TARGET(ccb);
U32 LUN = CMND_TO_LUN(ccb);
AGTIAPI_IO( "agtiapi_QueueCmnd_: start\n" );
/* no support for CDB > 16 */
if (csio->cdb_len > 16)
{
AGTIAPI_PRINTK( "agtiapi_QueueCmnd_: unsupported CDB length %d\n",
csio->cdb_len );
ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_REQ_INVALID;//CAM_REQ_CMP;
xpt_done(ccb);
return tiError;
}
if (TID < 0 || TID >= maxTargets)
{
AGTIAPI_PRINTK("agtiapi_QueueCmnd_: INVALID TID ERROR\n");
ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_DEV_NOT_THERE;//CAM_REQ_CMP;
xpt_done(ccb);
return tiError;
}
/* get a ccb */
if ((pccb = agtiapi_GetCCB(pmcsc)) == NULL)
{
AGTIAPI_PRINTK("agtiapi_QueueCmnd_: GetCCB ERROR\n");
if (pmcsc != NULL)
{
ag_device_t *targ;
TID = INDEX(pmcsc, TID);
targ = &pmcsc->pDevList[TID];
agtiapi_adjust_queue_depth(ccb->ccb_h.path,targ->qdepth);
}
ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
xpt_done(ccb);
return tiBusy;
}
pccb->pmcsc = pmcsc;
/* initialize Command Control Block (CCB) */
pccb->targetId = TID;
pccb->lun = LUN;
pccb->channel = Channel;
pccb->ccb = ccb; /* for struct scsi_cmnd */
pccb->senseLen = csio->sense_len;
pccb->startTime = ticks;
pccb->pSenseData = (caddr_t) &csio->sense_data;
pccb->tiSuperScsiRequest.flags = 0;
/* each channel is reserved for different addr modes */
pccb->addrMode = agtiapi_AddrModes[Channel];
status = agtiapi_PrepareSGList(pmcsc, pccb);
if (status != tiSuccess)
{
AGTIAPI_PRINTK("agtiapi_QueueCmnd_: agtiapi_PrepareSGList failure\n");
agtiapi_FreeCCB(pmcsc, pccb);
if (status == tiReject)
{
ccb->ccb_h.status = CAM_REQ_INVALID;
}
else
{
ccb->ccb_h.status = CAM_REQ_CMP;
}
xpt_done( ccb );
return tiError;
}
return status;
}
/******************************************************************************
agtiapi_DumpCDB()
Purpose:
Prints out CDB
Parameters:
const char *ptitle (IN) A string to be printed
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
Note:
******************************************************************************/
STATIC void agtiapi_DumpCDB(const char *ptitle, ccb_t *pccb)
{
union ccb *ccb;
struct ccb_scsiio *csio;
bit8 cdb[64];
int len;
if (pccb == NULL)
{
printf( "agtiapi_DumpCDB: no pccb here \n" );
panic("agtiapi_DumpCDB: pccb is NULL. called from %s\n", ptitle);
return;
}
ccb = pccb->ccb;
if (ccb == NULL)
{
printf( "agtiapi_DumpCDB: no ccb here \n" );
panic( "agtiapi_DumpCDB: pccb %p ccb %p flags %d ccb NULL! "
"called from %s\n",
pccb, pccb->ccb, pccb->flags, ptitle );
return;
}
csio = &ccb->csio;
if (csio == NULL)
{
printf( "agtiapi_DumpCDB: no csio here \n" );
panic( "agtiapi_DumpCDB: pccb%p ccb%p flags%d csio NULL! called from %s\n",
pccb, pccb->ccb, pccb->flags, ptitle );
return;
}
len = MIN(64, csio->cdb_len);
if (csio->ccb_h.flags & CAM_CDB_POINTER)
{
bcopy(csio->cdb_io.cdb_ptr, &cdb[0], len);
}
else
{
bcopy(csio->cdb_io.cdb_bytes, &cdb[0], len);
}
AGTIAPI_IO( "agtiapi_DumpCDB: pccb%p CDB0x%x csio->cdb_len %d"
" len %d from %s\n",
pccb, cdb[0],
csio->cdb_len,
len,
ptitle );
return;
}
/******************************************************************************
agtiapi_DoSoftReset()
Purpose:
Do card reset
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA data structure
Return:
Note:
******************************************************************************/
int agtiapi_DoSoftReset (struct agtiapi_softc *pmcsc)
{
int ret;
unsigned long flags;
pmcsc->flags |= AGTIAPI_SOFT_RESET;
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, flags );
ret = agtiapi_ResetCard( pmcsc, &flags );
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
if( ret != AGTIAPI_SUCCESS )
return tiError;
return SUCCESS;
}
/******************************************************************************
agtiapi_CheckIOTimeout()
Purpose:
Timeout function for SCSI IO or TM
Parameters:
*data (IN) points to the HBA data structure (struct agtiapi_softc *)
Return:
Note:
******************************************************************************/
STATIC void agtiapi_CheckIOTimeout(void *data)
{
U32 status = AGTIAPI_SUCCESS;
ccb_t *pccb;
struct agtiapi_softc *pmcsc;
pccb_t pccb_curr;
pccb_t pccb_next;
pmcsc = (struct agtiapi_softc *)data;
//AGTIAPI_PRINTK("agtiapi_CheckIOTimeout: Enter\n");
//AGTIAPI_PRINTK("agtiapi_CheckIOTimeout: Active CCB %d\n", pmcsc->activeCCB);
pccb = (pccb_t)pmcsc->ccbChainList;
/* if link is down, do nothing */
if ((pccb == NULL) || (pmcsc->activeCCB == 0))
{
//AGTIAPI_PRINTK("agtiapi_CheckIOTimeout: goto restart_timer\n");
goto restart_timer;
}
AG_SPIN_LOCK_IRQ(agtiapi_host_lock, flags);
if (pmcsc->flags & AGTIAPI_SHUT_DOWN)
goto ext;
pccb_curr = pccb;
/* Walk through the IO chain linked list to find the pending io */
/* Set the TM flag based on the pccb type, i.e SCSI IO or TM cmd */
while (pccb_curr != NULL)
{
/* start from 1st ccb in the chain */
pccb_next = pccb_curr->pccbChainNext;
if( (pccb_curr->flags == 0) || (pccb_curr->tiIORequest.tdData == NULL) ||
(pccb_curr->startTime == 0) /* && (pccb->startTime == 0) */)
{
//AGTIAPI_PRINTK("agtiapi_CheckIOTimeout: move to next element\n");
}
else if ( ( (ticks-pccb_curr->startTime) >= ag_timeout_secs ) &&
!(pccb_curr->flags & TIMEDOUT) )
{
AGTIAPI_PRINTK( "agtiapi_CheckIOTimeout: pccb %p timed out, call TM "
"function -- flags=%x startTime=%ld tdData = %p\n",
pccb_curr, pccb_curr->flags, pccb_curr->startTime,
pccb_curr->tiIORequest.tdData );
pccb_curr->flags |= TIMEDOUT;
status = agtiapi_StartTM(pmcsc, pccb_curr);
if (status == AGTIAPI_SUCCESS)
{
AGTIAPI_PRINTK( "agtiapi_CheckIOTimeout: TM Request sent with "
"success\n" );
goto restart_timer;
}
else
{
#ifdef AGTIAPI_LOCAL_RESET
/* abort request did not go through */
AGTIAPI_PRINTK("agtiapi_CheckIOTimeout: Abort request failed\n");
/* TODO: call Soft reset here */
AGTIAPI_PRINTK( "agtiapi_CheckIOTimeout:in agtiapi_CheckIOTimeout() "
"abort request did not go thru ==> soft reset#7, then "
"restart timer\n" );
agtiapi_DoSoftReset (pmcsc);
goto restart_timer;
#endif
}
}
pccb_curr = pccb_next;
}
restart_timer:
callout_reset(&pmcsc->IO_timer, 1*hz, agtiapi_CheckIOTimeout, pmcsc);
ext:
AG_SPIN_UNLOCK_IRQ(agtiapi_host_lock, flags);
return;
}
/******************************************************************************
agtiapi_StartTM()
Purpose:
DDI calls for aborting outstanding IO command
Parameters:
struct scsi_cmnd *pccb (IN) Pointer to the command to be aborted
unsigned long flags (IN/out) spinlock flags used in locking from
calling layers
Return:
AGTIAPI_SUCCESS - success
AGTIAPI_FAIL - fail
******************************************************************************/
int
agtiapi_StartTM(struct agtiapi_softc *pCard, ccb_t *pccb)
{
ccb_t *pTMccb = NULL;
U32 status = AGTIAPI_SUCCESS;
ag_device_t *pDevice = NULL;
U32 TMstatus = tiSuccess;
AGTIAPI_PRINTK( "agtiapi_StartTM: pccb %p, pccb->flags %x\n",
pccb, pccb->flags );
if (pccb == NULL)
{
AGTIAPI_PRINTK("agtiapi_StartTM: %p not found\n",pccb);
status = AGTIAPI_SUCCESS;
goto ext;
}
if (!pccb->tiIORequest.tdData)
{
/* should not be the case */
AGTIAPI_PRINTK("agtiapi_StartTM: ccb %p flag 0x%x tid %d no tdData "
"ERROR\n", pccb, pccb->flags, pccb->targetId);
status = AGTIAPI_FAIL;
}
else
{
/* If timedout CCB is TM_ABORT_TASK command, issue LocalAbort first to
clear pending TM_ABORT_TASK */
/* Else Device State will not be put back to Operational, (refer FW) */
if (pccb->flags & TASK_MANAGEMENT)
{
if (tiINIIOAbort(&pCard->tiRoot, &pccb->tiIORequest) != tiSuccess)
{
AGTIAPI_PRINTK( "agtiapi_StartTM: LocalAbort Request for Abort_TASK "
"TM failed\n" );
/* TODO: call Soft reset here */
AGTIAPI_PRINTK( "agtiapi_StartTM: in agtiapi_StartTM() abort "
"tiINIIOAbort() failed ==> soft reset#8\n" );
agtiapi_DoSoftReset( pCard );
}
else
{
AGTIAPI_PRINTK( "agtiapi_StartTM: LocalAbort for Abort_TASK TM "
"Request sent\n" );
status = AGTIAPI_SUCCESS;
}
}
else
{
/* get a ccb */
if ((pTMccb = agtiapi_GetCCB(pCard)) == NULL)
{
AGTIAPI_PRINTK("agtiapi_StartTM: TM resource unavailable!\n");
status = AGTIAPI_FAIL;
goto ext;
}
pTMccb->pmcsc = pCard;
pTMccb->targetId = pccb->targetId;
pTMccb->devHandle = pccb->devHandle;
if (pTMccb->targetId >= pCard->devDiscover)
{
AGTIAPI_PRINTK("agtiapi_StartTM: Incorrect dev Id in TM!\n");
status = AGTIAPI_FAIL;
goto ext;
}
if (pTMccb->targetId < 0 || pTMccb->targetId >= maxTargets)
{
return AGTIAPI_FAIL;
}
if (INDEX(pCard, pTMccb->targetId) >= maxTargets)
{
return AGTIAPI_FAIL;
}
pDevice = &pCard->pDevList[INDEX(pCard, pTMccb->targetId)];
if ((pDevice == NULL) || !(pDevice->flags & ACTIVE))
{
return AGTIAPI_FAIL;
}
/* save pending io to issue local abort at Task mgmt CB */
pTMccb->pccbIO = pccb;
AGTIAPI_PRINTK( "agtiapi_StartTM: pTMccb %p flag %x tid %d via TM "
"request !\n",
pTMccb, pTMccb->flags, pTMccb->targetId );
pTMccb->flags &= ~(TASK_SUCCESS | ACTIVE);
pTMccb->flags |= TASK_MANAGEMENT;
TMstatus = tiINITaskManagement(&pCard->tiRoot,
pccb->devHandle,
AG_ABORT_TASK,
&pccb->tiSuperScsiRequest.scsiCmnd.lun,
&pccb->tiIORequest,
&pTMccb->tiIORequest);
if (TMstatus == tiSuccess)
{
AGTIAPI_PRINTK( "agtiapi_StartTM: TM_ABORT_TASK request success ccb "
"%p, pTMccb %p\n",
pccb, pTMccb );
pTMccb->startTime = ticks;
status = AGTIAPI_SUCCESS;
}
else if (TMstatus == tiIONoDevice)
{
AGTIAPI_PRINTK( "agtiapi_StartTM: TM_ABORT_TASK request tiIONoDevice ccb "
"%p, pTMccb %p\n",
pccb, pTMccb );
status = AGTIAPI_SUCCESS;
}
else
{
AGTIAPI_PRINTK( "agtiapi_StartTM: TM_ABORT_TASK request failed ccb %p, "
"pTMccb %p\n",
pccb, pTMccb );
status = AGTIAPI_FAIL;
agtiapi_FreeTMCCB(pCard, pTMccb);
/* TODO */
/* call TM_TARGET_RESET */
}
}
}
ext:
AGTIAPI_PRINTK("agtiapi_StartTM: return %d flgs %x\n", status,
(pccb) ? pccb->flags : -1);
return status;
} /* agtiapi_StartTM */
#if __FreeBSD_version > 901000
/******************************************************************************
agtiapi_PrepareSGList()
Purpose:
This function prepares scatter-gather list for the given ccb
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA data structure
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
0 - success
1 - failure
Note:
******************************************************************************/
static int agtiapi_PrepareSGList(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
union ccb *ccb = pccb->ccb;
struct ccb_scsiio *csio = &ccb->csio;
struct ccb_hdr *ccbh = &ccb->ccb_h;
AGTIAPI_IO( "agtiapi_PrepareSGList: start\n" );
// agtiapi_DumpCDB("agtiapi_PrepareSGList", pccb);
AGTIAPI_IO( "agtiapi_PrepareSGList: dxfer_len %d\n", csio->dxfer_len );
if ((ccbh->flags & CAM_DIR_MASK) != CAM_DIR_NONE)
{
switch((ccbh->flags & CAM_DATA_MASK))
{
int error;
struct bus_dma_segment seg;
case CAM_DATA_VADDR:
/* Virtual address that needs to translated into one or more physical address ranges. */
// int error;
// AG_LOCAL_LOCK(&(pmcsc->pCardInfo->pmIOLock));
AGTIAPI_IO( "agtiapi_PrepareSGList: virtual address\n" );
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csio->data_ptr,
csio->dxfer_len,
agtiapi_PrepareSGListCB,
pccb,
BUS_DMA_NOWAIT/* 0 */ );
// AG_LOCAL_UNLOCK( &(pmcsc->pCardInfo->pmIOLock) );
if (error == EINPROGRESS)
{
/* So as to maintain ordering, freeze the controller queue until our mapping is returned. */
AGTIAPI_PRINTK("agtiapi_PrepareSGList: EINPROGRESS\n");
xpt_freeze_simq(pmcsc->sim, 1);
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
break;
case CAM_DATA_PADDR:
      /* We have been given a pointer to a single physical buffer. */
/* pccb->tiSuperScsiRequest.sglVirtualAddr = seg.ds_addr; */
//struct bus_dma_segment seg;
AGTIAPI_PRINTK("agtiapi_PrepareSGList: physical address\n");
seg.ds_addr =
(bus_addr_t)(vm_offset_t)csio->data_ptr;
seg.ds_len = csio->dxfer_len;
      // 0xAABBCCDD is a sentinel "no error" value; see agtiapi_PrepareSGListCB()
agtiapi_PrepareSGListCB(pccb, &seg, 1, 0xAABBCCDD);
break;
default:
AGTIAPI_PRINTK("agtiapi_PrepareSGList: unexpected case\n");
return tiReject;
}
}
else
{
agtiapi_PrepareSGListCB(pccb, NULL, 0, 0xAAAAAAAA);
}
return tiSuccess;
}
#else
/******************************************************************************
agtiapi_PrepareSGList()
Purpose:
  This function prepares the scatter-gather list for the given ccb
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to the HBA data structure
  ccb_t *pccb (IN)  A pointer to the driver's own CCB, not CAM's CCB
Return:
  tiSuccess - success
  tiReject  - failure
Note:
******************************************************************************/
static int agtiapi_PrepareSGList(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
union ccb *ccb = pccb->ccb;
struct ccb_scsiio *csio = &ccb->csio;
struct ccb_hdr *ccbh = &ccb->ccb_h;
AGTIAPI_IO( "agtiapi_PrepareSGList: start\n" );
// agtiapi_DumpCDB("agtiapi_PrepareSGList", pccb);
AGTIAPI_IO( "agtiapi_PrepareSGList: dxfer_len %d\n", csio->dxfer_len );
if ((ccbh->flags & CAM_DIR_MASK) != CAM_DIR_NONE)
{
if ((ccbh->flags & CAM_SCATTER_VALID) == 0)
{
/* We've been given a pointer to a single buffer. */
if ((ccbh->flags & CAM_DATA_PHYS) == 0)
{
        /* Virtual address that needs to be translated into one or more physical address ranges. */
int error;
// AG_LOCAL_LOCK(&(pmcsc->pCardInfo->pmIOLock));
AGTIAPI_IO( "agtiapi_PrepareSGList: virtual address\n" );
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csio->data_ptr,
csio->dxfer_len,
agtiapi_PrepareSGListCB,
pccb,
BUS_DMA_NOWAIT/* 0 */ );
// AG_LOCAL_UNLOCK( &(pmcsc->pCardInfo->pmIOLock) );
if (error == EINPROGRESS)
{
/* So as to maintain ordering, freeze the controller queue until our mapping is returned. */
AGTIAPI_PRINTK("agtiapi_PrepareSGList: EINPROGRESS\n");
xpt_freeze_simq(pmcsc->sim, 1);
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
}
else
{
        /* We have been given a pointer to a single physical buffer. */
/* pccb->tiSuperScsiRequest.sglVirtualAddr = seg.ds_addr; */
struct bus_dma_segment seg;
AGTIAPI_PRINTK("agtiapi_PrepareSGList: physical address\n");
seg.ds_addr =
(bus_addr_t)(vm_offset_t)csio->data_ptr;
seg.ds_len = csio->dxfer_len;
        // 0xAABBCCDD is a sentinel "no error" value; see agtiapi_PrepareSGListCB()
agtiapi_PrepareSGListCB(pccb, &seg, 1, 0xAABBCCDD);
}
}
else
{
AGTIAPI_PRINTK("agtiapi_PrepareSGList: unexpected case\n");
return tiReject;
}
}
else
{
agtiapi_PrepareSGListCB(pccb, NULL, 0, 0xAAAAAAAA);
}
return tiSuccess;
}
#endif
/******************************************************************************
agtiapi_PrepareSGListCB()
Purpose:
Callback function for bus_dmamap_load()
  This function sends the IO to the LL layer.
Parameters:
  void *arg (IN)                Pointer to the driver's CCB (ccb_t)
  bus_dma_segment_t *segs (IN)  Pointer to the DMA segment array
  int nsegs (IN)                Number of DMA segments
int error (IN) error
Return:
Note:
******************************************************************************/
static void agtiapi_PrepareSGListCB( void *arg,
bus_dma_segment_t *segs,
int nsegs,
int error )
{
pccb_t pccb = arg;
union ccb *ccb = pccb->ccb;
struct ccb_scsiio *csio = &ccb->csio;
struct agtiapi_softc *pmcsc;
tiIniScsiCmnd_t *pScsiCmnd;
bit32 i;
bus_dmasync_op_t op;
U32_64 phys_addr;
U08 *CDB;
int io_is_encryptable = 0;
unsigned long long start_lba = 0;
ag_device_t *pDev;
U32 TID = CMND_TO_TARGET(ccb);
AGTIAPI_IO( "agtiapi_PrepareSGListCB: start, nsegs %d error 0x%x\n",
nsegs, error );
pmcsc = pccb->pmcsc;
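  /*
   * The 'error' argument is overloaded: bus_dmamap_load() passes a real errno,
   * while the driver's own direct calls pass the sentinels 0xAABBCCDD (a single
   * caller-built physical segment) or 0xAAAAAAAA (no data transfer).  Both
   * sentinels are treated as success below; any other non-zero value is a real
   * DMA load error.
   */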
if (error != tiSuccess)
{
if (error == 0xAABBCCDD || error == 0xAAAAAAAA)
{
// do nothing
}
else
{
AGTIAPI_PRINTK("agtiapi_PrepareSGListCB: error status 0x%x\n", error);
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
bus_dmamap_destroy(pmcsc->buffer_dmat, pccb->CCB_dmamap);
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return;
}
}
if (nsegs > AGTIAPI_MAX_DMA_SEGS)
{
AGTIAPI_PRINTK( "agtiapi_PrepareSGListCB: over the limit. nsegs %d"
" AGTIAPI_MAX_DMA_SEGS %d\n",
nsegs, AGTIAPI_MAX_DMA_SEGS );
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
bus_dmamap_destroy(pmcsc->buffer_dmat, pccb->CCB_dmamap);
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return;
}
/* fill in IO information */
pccb->dataLen = csio->dxfer_len;
/* start fill in sgl structure */
if (nsegs == 1 && error == 0xAABBCCDD)
{
/* to be tested */
/* A single physical buffer */
AGTIAPI_PRINTK("agtiapi_PrepareSGListCB: nsegs is 1\n");
CPU_TO_LE32(pccb->tiSuperScsiRequest.agSgl1, segs[0].ds_addr);
pccb->tiSuperScsiRequest.agSgl1.len = htole32(pccb->dataLen);
pccb->tiSuperScsiRequest.agSgl1.type = htole32(tiSgl);
pccb->tiSuperScsiRequest.sglVirtualAddr = (void *)segs->ds_addr;
pccb->numSgElements = 1;
}
else if (nsegs == 0 && error == 0xAAAAAAAA)
{
/* no data transfer */
AGTIAPI_IO( "agtiapi_PrepareSGListCB: no data transfer\n" );
pccb->tiSuperScsiRequest.agSgl1.len = 0;
pccb->dataLen = 0;
pccb->numSgElements = 0;
}
else
{
/* virtual/logical buffer */
if (nsegs == 1)
{
pccb->dataLen = segs[0].ds_len;
CPU_TO_LE32(pccb->tiSuperScsiRequest.agSgl1, segs[0].ds_addr);
pccb->tiSuperScsiRequest.agSgl1.type = htole32(tiSgl);
pccb->tiSuperScsiRequest.agSgl1.len = htole32(segs[0].ds_len);
pccb->tiSuperScsiRequest.sglVirtualAddr = (void *)csio->data_ptr;
pccb->numSgElements = nsegs;
}
else
{
pccb->dataLen = 0;
/* loop */
for (i = 0; i < nsegs; i++)
{
pccb->sgList[i].len = htole32(segs[i].ds_len);
CPU_TO_LE32(pccb->sgList[i], segs[i].ds_addr);
pccb->sgList[i].type = htole32(tiSgl);
pccb->dataLen += segs[i].ds_len;
} /* for */
pccb->numSgElements = nsegs;
/* set up sgl buffer address */
CPU_TO_LE32(pccb->tiSuperScsiRequest.agSgl1, pccb->tisgl_busaddr);
pccb->tiSuperScsiRequest.agSgl1.type = htole32(tiSglList);
pccb->tiSuperScsiRequest.agSgl1.len = htole32(pccb->dataLen);
pccb->tiSuperScsiRequest.sglVirtualAddr = (void *)csio->data_ptr;
pccb->numSgElements = nsegs;
} /* else */
}
/* set data transfer direction */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
{
op = BUS_DMASYNC_PREWRITE;
pccb->tiSuperScsiRequest.dataDirection = tiDirectionOut;
}
else
{
op = BUS_DMASYNC_PREREAD;
pccb->tiSuperScsiRequest.dataDirection = tiDirectionIn;
}
pScsiCmnd = &pccb->tiSuperScsiRequest.scsiCmnd;
pScsiCmnd->expDataLength = pccb->dataLen;
if (csio->ccb_h.flags & CAM_CDB_POINTER)
{
bcopy(csio->cdb_io.cdb_ptr, &pScsiCmnd->cdb[0], csio->cdb_len);
}
else
{
bcopy(csio->cdb_io.cdb_bytes, &pScsiCmnd->cdb[0],csio->cdb_len);
}
CDB = &pScsiCmnd->cdb[0];
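  /*
   * The switch below recovers the starting LBA from the CDB so that
   * HIALEAH_ENCRYPTION builds can set up per-LBA encryption.  Standard CDB
   * layouts apply: 6-byte CDBs carry a 21-bit LBA in bytes 1-3, 10- and
   * 12-byte CDBs a 32-bit LBA in bytes 2-5, and 16-byte CDBs a 64-bit LBA in
   * bytes 2-9, all big-endian.
   */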
switch (CDB[0])
{
case REQUEST_SENSE: /* requires different buffer */
    /* This code should not be exercised because SAS supports autosense.
       For completeness, vtophys() is still used here. */
AGTIAPI_PRINTK("agtiapi_PrepareSGListCB: QueueCmnd - REQUEST SENSE new\n");
pccb->tiSuperScsiRequest.agSgl1.len = htole32(pccb->senseLen);
phys_addr = vtophys(&csio->sense_data);
CPU_TO_LE32(pccb->tiSuperScsiRequest.agSgl1, phys_addr);
pccb->tiSuperScsiRequest.agSgl1.type = htole32(tiSgl);
pccb->dataLen = pccb->senseLen;
pccb->numSgElements = 1;
break;
case INQUIRY:
/* only using lun 0 for device type detection */
pccb->flags |= AGTIAPI_INQUIRY;
break;
case TEST_UNIT_READY:
case RESERVE:
case RELEASE:
case START_STOP:
pccb->tiSuperScsiRequest.agSgl1.len = 0;
pccb->dataLen = 0;
break;
case READ_6:
case WRITE_6:
/* Extract LBA */
start_lba = ((CDB[1] & 0x1f) << 16) |
(CDB[2] << 8) |
(CDB[3]);
#ifdef HIALEAH_ENCRYPTION
io_is_encryptable = 1;
#endif
break;
case READ_10:
case WRITE_10:
case READ_12:
case WRITE_12:
/* Extract LBA */
start_lba = (CDB[2] << 24) |
(CDB[3] << 16) |
(CDB[4] << 8) |
(CDB[5]);
#ifdef HIALEAH_ENCRYPTION
io_is_encryptable = 1;
#endif
break;
case READ_16:
case WRITE_16:
/* Extract LBA */
start_lba = (CDB[2] << 24) |
(CDB[3] << 16) |
(CDB[4] << 8) |
(CDB[5]);
start_lba <<= 32;
start_lba |= ((CDB[6] << 24) |
(CDB[7] << 16) |
(CDB[8] << 8) |
(CDB[9]));
#ifdef HIALEAH_ENCRYPTION
io_is_encryptable = 1;
#endif
break;
default:
break;
}
  /* fill in the device lun based on the address mode */
agtiapi_SetLunField(pccb);
if (pccb->targetId < 0 || pccb->targetId >= maxTargets)
{
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_DEV_NOT_THERE; // ## v. CAM_FUNC_NOTAVAIL
xpt_done(ccb);
pccb->ccb = NULL;
return;
}
if (INDEX(pmcsc, pccb->targetId) >= maxTargets)
{
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_DEV_NOT_THERE; // ## v. CAM_FUNC_NOTAVAIL
xpt_done(ccb);
pccb->ccb = NULL;
return;
}
pDev = &pmcsc->pDevList[INDEX(pmcsc, pccb->targetId)];
#if 1
if ((pmcsc->flags & EDC_DATA) &&
(pDev->flags & EDC_DATA))
{
/*
* EDC support:
*
* Possible command supported -
* READ_6, READ_10, READ_12, READ_16, READ_LONG, READ_BUFFER,
* READ_DEFECT_DATA, etc.
* WRITE_6, WRITE_10, WRITE_12, WRITE_16, WRITE_LONG, WRITE_LONG2,
* WRITE_BUFFER, WRITE_VERIFY, WRITE_VERIFY_12, etc.
*
* Do some data length adjustment and set chip operation instruction.
*/
switch (CDB[0])
{
case READ_6:
case READ_10:
case READ_12:
case READ_16:
// BUG_ON(pccb->tiSuperScsiRequest.flags & TI_SCSI_INITIATOR_ENCRYPT);
#ifdef AGTIAPI_TEST_DIF
pccb->tiSuperScsiRequest.flags |= TI_SCSI_INITIATOR_DIF;
#endif
pccb->flags |= EDC_DATA;
#ifdef TEST_VERIFY_AND_FORWARD
pccb->tiSuperScsiRequest.Dif.flags =
DIF_VERIFY_FORWARD | DIF_UDT_REF_BLOCK_COUNT;
if(pDev->sector_size == 520) {
pScsiCmnd->expDataLength += (pccb->dataLen / 512) * 8;
} else if(pDev->sector_size == 4104) {
pScsiCmnd->expDataLength += (pccb->dataLen / 4096) * 8;
}
#else
#ifdef AGTIAPI_TEST_DIF
pccb->tiSuperScsiRequest.Dif.flags =
DIF_VERIFY_DELETE | DIF_UDT_REF_BLOCK_COUNT;
#endif
#endif
#ifdef AGTIAPI_TEST_DIF
switch(pDev->sector_size) {
case 528:
pccb->tiSuperScsiRequest.Dif.flags |=
( DIF_BLOCK_SIZE_520 << 16 );
break;
case 4104:
pccb->tiSuperScsiRequest.Dif.flags |=
( DIF_BLOCK_SIZE_4096 << 16 );
break;
case 4168:
pccb->tiSuperScsiRequest.Dif.flags |=
( DIF_BLOCK_SIZE_4160 << 16 );
break;
}
if(pCard->flags & EDC_DATA_CRC)
pccb->tiSuperScsiRequest.Dif.flags |= DIF_CRC_VERIFICATION;
/* Turn on upper 4 bits of UVM */
pccb->tiSuperScsiRequest.Dif.flags |= 0x03c00000;
#endif
#ifdef AGTIAPI_TEST_DPL
if(agtiapi_SetupDifPerLA(pCard, pccb, start_lba) < 0) {
printk(KERN_ERR "SetupDifPerLA Failed.\n");
cmnd->result = SCSI_HOST(DID_ERROR);
goto err;
}
pccb->tiSuperScsiRequest.Dif.enableDIFPerLA = TRUE;
#endif
#ifdef AGTIAPI_TEST_DIF
/* Set App Tag */
pccb->tiSuperScsiRequest.Dif.udtArray[0] = 0xaa;
pccb->tiSuperScsiRequest.Dif.udtArray[1] = 0xbb;
/* Set LBA in UDT array */
if(CDB[0] == READ_6) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[3];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[2];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[1] & 0x1f;
pccb->tiSuperScsiRequest.Dif.udtArray[5] = 0;
} else if(CDB[0] == READ_10 || CDB[0] == READ_12) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[5];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[4];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[3];
pccb->tiSuperScsiRequest.Dif.udtArray[5] = CDB[2];
} else if(CDB[0] == READ_16) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[9];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[8];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[7];
pccb->tiSuperScsiRequest.Dif.udtArray[5] = CDB[6];
/* Note: 32 bits lost */
}
#endif
break;
case WRITE_6:
case WRITE_10:
case WRITE_12:
case WRITE_16:
// BUG_ON(pccb->tiSuperScsiRequest.flags & TI_SCSI_INITIATOR_ENCRYPT);
pccb->flags |= EDC_DATA;
#ifdef AGTIAPI_TEST_DIF
pccb->tiSuperScsiRequest.flags |= TI_SCSI_INITIATOR_DIF;
pccb->tiSuperScsiRequest.Dif.flags =
DIF_INSERT | DIF_UDT_REF_BLOCK_COUNT;
switch(pDev->sector_size) {
case 528:
pccb->tiSuperScsiRequest.Dif.flags |=
(DIF_BLOCK_SIZE_520 << 16);
break;
case 4104:
pccb->tiSuperScsiRequest.Dif.flags |=
( DIF_BLOCK_SIZE_4096 << 16 );
break;
case 4168:
pccb->tiSuperScsiRequest.Dif.flags |=
( DIF_BLOCK_SIZE_4160 << 16 );
break;
}
/* Turn on upper 4 bits of UUM */
pccb->tiSuperScsiRequest.Dif.flags |= 0xf0000000;
#endif
#ifdef AGTIAPI_TEST_DPL
if(agtiapi_SetupDifPerLA(pCard, pccb, start_lba) < 0) {
printk(KERN_ERR "SetupDifPerLA Failed.\n");
cmnd->result = SCSI_HOST(DID_ERROR);
goto err;
}
pccb->tiSuperScsiRequest.Dif.enableDIFPerLA = TRUE;
#endif
#ifdef AGTIAPI_TEST_DIF
/* Set App Tag */
pccb->tiSuperScsiRequest.Dif.udtArray[0] = 0xaa;
pccb->tiSuperScsiRequest.Dif.udtArray[1] = 0xbb;
/* Set LBA in UDT array */
if(CDB[0] == WRITE_6) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[3];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[2];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[1] & 0x1f;
} else if(CDB[0] == WRITE_10 || CDB[0] == WRITE_12) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[5];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[4];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[3];
pccb->tiSuperScsiRequest.Dif.udtArray[5] = CDB[2];
} else if(CDB[0] == WRITE_16) {
pccb->tiSuperScsiRequest.Dif.udtArray[2] = CDB[5];
pccb->tiSuperScsiRequest.Dif.udtArray[3] = CDB[4];
pccb->tiSuperScsiRequest.Dif.udtArray[4] = CDB[3];
pccb->tiSuperScsiRequest.Dif.udtArray[5] = CDB[2];
/* Note: 32 bits lost */
}
#endif
break;
}
}
#endif /* end of DIF */
if ((ccb->ccb_h.flags & CAM_TAG_ACTION_VALID) != 0)
{
switch(csio->tag_action)
{
case MSG_HEAD_OF_Q_TAG:
pScsiCmnd->taskAttribute = TASK_HEAD_OF_QUEUE;
break;
case MSG_ACA_TASK:
pScsiCmnd->taskAttribute = TASK_ACA;
break;
case MSG_ORDERED_Q_TAG:
pScsiCmnd->taskAttribute = TASK_ORDERED;
break;
case MSG_SIMPLE_Q_TAG: /* fall through */
default:
pScsiCmnd->taskAttribute = TASK_SIMPLE;
break;
}
}
if (pccb->tiSuperScsiRequest.agSgl1.len != 0 && pccb->dataLen != 0)
{
/* should be just before start IO */
bus_dmamap_sync(pmcsc->buffer_dmat, pccb->CCB_dmamap, op);
}
/*
* If assigned pDevHandle is not available
* then there is no need to send it to StartIO()
*/
if (pccb->targetId < 0 || pccb->targetId >= maxTargets)
{
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_DEV_NOT_THERE; // ## v. CAM_FUNC_NOTAVAIL
xpt_done(ccb);
pccb->ccb = NULL;
return;
}
TID = INDEX(pmcsc, pccb->targetId);
if ((TID >= pmcsc->devDiscover) ||
!(pccb->devHandle = pmcsc->pDevList[TID].pDevHandle))
{
/*
AGTIAPI_PRINTK( "agtiapi_PrepareSGListCB: not sending ccb devH %p,"
" target %d tid %d/%d card %p ERROR pccb %p\n",
pccb->devHandle, pccb->targetId, TID,
pmcsc->devDiscover, pmcsc, pccb );
*/
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_FreeCCB(pmcsc, pccb);
ccb->ccb_h.status = CAM_DEV_NOT_THERE; // ## v. CAM_FUNC_NOTAVAIL
xpt_done(ccb);
pccb->ccb = NULL;
return;
}
AGTIAPI_IO( "agtiapi_PrepareSGListCB: send ccb pccb->devHandle %p, "
"pccb->targetId %d TID %d pmcsc->devDiscover %d card %p\n",
pccb->devHandle, pccb->targetId, TID, pmcsc->devDiscover,
pmcsc );
#ifdef HIALEAH_ENCRYPTION
if(pmcsc->encrypt && io_is_encryptable) {
agtiapi_SetupEncryptedIO(pmcsc, pccb, start_lba);
} else{
io_is_encryptable = 0;
pccb->tiSuperScsiRequest.flags = 0;
}
#endif
// put the request in send queue
agtiapi_QueueCCB( pmcsc, &pmcsc->ccbSendHead, &pmcsc->ccbSendTail
AG_CARD_LOCAL_LOCK(&pmcsc->sendLock), pccb );
agtiapi_StartIO(pmcsc);
return;
}
/******************************************************************************
agtiapi_StartIO()
Purpose:
Send IO request down for processing.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
Return:
Note:
******************************************************************************/
STATIC void agtiapi_StartIO( struct agtiapi_softc *pmcsc )
{
ccb_t *pccb;
int TID;
ag_device_t *targ;
AGTIAPI_IO( "agtiapi_StartIO: start\n" );
AG_LOCAL_LOCK( &pmcsc->sendLock );
pccb = pmcsc->ccbSendHead;
/* if link is down, do nothing */
if ((pccb == NULL) || pmcsc->flags & AGTIAPI_RESET)
{
AG_LOCAL_UNLOCK( &pmcsc->sendLock );
AGTIAPI_PRINTK( "agtiapi_StartIO: goto ext\n" );
goto ext;
}
if (pmcsc != NULL && pccb->targetId >= 0 && pccb->targetId < maxTargets)
{
TID = INDEX(pmcsc, pccb->targetId);
targ = &pmcsc->pDevList[TID];
}
/* clear send queue */
pmcsc->ccbSendHead = NULL;
pmcsc->ccbSendTail = NULL;
AG_LOCAL_UNLOCK( &pmcsc->sendLock );
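  /* The chain detached above is walked without holding sendLock; requests
     queued while we drain it start a new chain and are picked up by the next
     agtiapi_StartIO() call. */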
/* send all ccbs down */
while (pccb)
{
pccb_t pccb_next;
U32 status;
pccb_next = pccb->pccbNext;
pccb->pccbNext = NULL;
if (!pccb->ccb)
{
AGTIAPI_PRINTK( "agtiapi_StartIO: pccb->ccb is NULL ERROR!\n" );
pccb = pccb_next;
continue;
}
AG_IO_DUMPCCB( pccb );
if (!pccb->devHandle)
{
agtiapi_DumpCCB( pccb );
AGTIAPI_PRINTK( "agtiapi_StartIO: ccb NULL device ERROR!\n" );
pccb = pccb_next;
continue;
}
AGTIAPI_IO( "agtiapi_StartIO: ccb %p retry %d\n", pccb, pccb->retryCount );
#ifndef ABORT_TEST
if( !pccb->devHandle || !pccb->devHandle->osData || /* in rmmod case */
!(((ag_device_t *)(pccb->devHandle->osData))->flags & ACTIVE))
{
AGTIAPI_PRINTK( "agtiapi_StartIO: device %p not active! ERROR\n",
pccb->devHandle );
if( pccb->devHandle ) {
AGTIAPI_PRINTK( "agtiapi_StartIO: device not active detail"
" -- osData:%p\n",
pccb->devHandle->osData );
if( pccb->devHandle->osData ) {
AGTIAPI_PRINTK( "agtiapi_StartIO: more device not active detail"
" -- active flag:%d\n",
( (ag_device_t *)
(pccb->devHandle->osData))->flags & ACTIVE );
}
}
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_Done( pmcsc, pccb );
pccb = pccb_next;
continue;
}
#endif
#ifdef FAST_IO_TEST
status = agtiapi_FastIOTest( pmcsc, pccb );
#else
status = tiINISuperIOStart( &pmcsc->tiRoot,
&pccb->tiIORequest,
pccb->devHandle,
&pccb->tiSuperScsiRequest,
(void *)&pccb->tdIOReqBody,
tiInterruptContext );
#endif
switch( status )
{
case tiSuccess:
/*
static int squelchCount = 0;
if ( 200000 == squelchCount++ ) // squelch prints
{
AGTIAPI_PRINTK( "agtiapi_StartIO: tiINIIOStart stat tiSuccess %p\n",
pccb );
squelchCount = 0; // reset count
}
*/
break;
case tiDeviceBusy:
AGTIAPI_PRINTK( "agtiapi_StartIO: tiINIIOStart status tiDeviceBusy %p\n",
pccb->ccb );
#ifdef LOGEVENT
agtiapi_LogEvent( pmcsc,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"tiINIIOStart tiDeviceBusy " );
#endif
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDeviceBusy;
agtiapi_Done(pmcsc, pccb);
break;
case tiBusy:
AGTIAPI_PRINTK( "agtiapi_StartIO: tiINIIOStart status tiBusy %p\n",
pccb->ccb );
#ifdef LOGEVENT
agtiapi_LogEvent( pmcsc,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"tiINIIOStart tiBusy " );
#endif
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiBusy;
agtiapi_Done(pmcsc, pccb);
break;
case tiIONoDevice:
AGTIAPI_PRINTK( "agtiapi_StartIO: tiINIIOStart status tiNoDevice %p "
"ERROR\n", pccb->ccb );
#ifdef LOGEVENT
agtiapi_LogEvent( pmcsc,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"tiINIIOStart invalid device handle " );
#endif
#ifndef ABORT_TEST
/* return command back to OS due to no device available */
((ag_device_t *)(pccb->devHandle->osData))->flags &= ~ACTIVE;
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailNoLogin;
agtiapi_Done(pmcsc, pccb);
#else
/* for short cable pull, we want IO retried - 3-18-2005 */
agtiapi_QueueCCB(pmcsc, &pmcsc->ccbSendHead, &pmcsc->ccbSendTail
AG_CARD_LOCAL_LOCK(&pmcsc->sendLock), pccb);
#endif
break;
case tiError:
AGTIAPI_PRINTK("agtiapi_StartIO: tiINIIOStart status tiError %p\n",
pccb->ccb);
#ifdef LOGEVENT
agtiapi_LogEvent(pmcsc,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"tiINIIOStart tiError ");
#endif
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailOtherError;
agtiapi_Done(pmcsc, pccb);
break;
default:
AGTIAPI_PRINTK("agtiapi_StartIO: tiINIIOStart status default %x %p\n",
status, pccb->ccb);
#ifdef LOGEVENT
agtiapi_LogEvent(pmcsc,
IOCTL_EVT_SEV_ERROR,
0,
agNULL,
0,
"tiINIIOStart unexpected status ");
#endif
pccb->ccbStatus = tiIOFailed;
pccb->scsiStatus = tiDetailOtherError;
agtiapi_Done(pmcsc, pccb);
}
pccb = pccb_next;
}
ext:
/* some IO requests might have been completed */
AG_GET_DONE_PCCB(pccb, pmcsc);
return;
}
/******************************************************************************
agtiapi_StartSMP()
Purpose:
Send SMP request down for processing.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
Return:
Note:
******************************************************************************/
STATIC void agtiapi_StartSMP(struct agtiapi_softc *pmcsc)
{
ccb_t *pccb;
AGTIAPI_PRINTK("agtiapi_StartSMP: start\n");
AG_LOCAL_LOCK(&pmcsc->sendSMPLock);
pccb = pmcsc->smpSendHead;
/* if link is down, do nothing */
if ((pccb == NULL) || pmcsc->flags & AGTIAPI_RESET)
{
AG_LOCAL_UNLOCK(&pmcsc->sendSMPLock);
AGTIAPI_PRINTK("agtiapi_StartSMP: goto ext\n");
goto ext;
}
/* clear send queue */
pmcsc->smpSendHead = NULL;
pmcsc->smpSendTail = NULL;
AG_LOCAL_UNLOCK(&pmcsc->sendSMPLock);
/* send all ccbs down */
while (pccb)
{
pccb_t pccb_next;
U32 status;
pccb_next = pccb->pccbNext;
pccb->pccbNext = NULL;
if (!pccb->ccb)
{
AGTIAPI_PRINTK("agtiapi_StartSMP: pccb->ccb is NULL ERROR!\n");
pccb = pccb_next;
continue;
}
if (!pccb->devHandle)
{
AGTIAPI_PRINTK("agtiapi_StartSMP: ccb NULL device ERROR!\n");
pccb = pccb_next;
continue;
}
pccb->flags |= TAG_SMP; // mark as SMP for later tracking
AGTIAPI_PRINTK( "agtiapi_StartSMP: ccb %p retry %d\n",
pccb, pccb->retryCount );
status = tiINISMPStart( &pmcsc->tiRoot,
&pccb->tiIORequest,
pccb->devHandle,
&pccb->tiSMPFrame,
(void *)&pccb->tdIOReqBody,
tiInterruptContext);
switch (status)
{
case tiSuccess:
break;
case tiBusy:
AGTIAPI_PRINTK("agtiapi_StartSMP: tiINISMPStart status tiBusy %p\n",
pccb->ccb);
/* pending ccb back to send queue */
agtiapi_QueueCCB(pmcsc, &pmcsc->smpSendHead, &pmcsc->smpSendTail
AG_CARD_LOCAL_LOCK(&pmcsc->sendSMPLock), pccb);
break;
case tiError:
AGTIAPI_PRINTK("agtiapi_StartIO: tiINIIOStart status tiError %p\n",
pccb->ccb);
pccb->ccbStatus = tiSMPFailed;
agtiapi_SMPDone(pmcsc, pccb);
break;
default:
AGTIAPI_PRINTK("agtiapi_StartIO: tiINIIOStart status default %x %p\n",
status, pccb->ccb);
pccb->ccbStatus = tiSMPFailed;
agtiapi_SMPDone(pmcsc, pccb);
}
pccb = pccb_next;
}
ext:
/* some SMP requests might have been completed */
AG_GET_DONE_SMP_PCCB(pccb, pmcsc);
return;
}
#if __FreeBSD_version > 901000
/******************************************************************************
agtiapi_PrepareSMPSGList()
Purpose:
  This function prepares the scatter-gather list for the given ccb
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to the HBA data structure
  ccb_t *pccb (IN)  A pointer to the driver's own CCB, not CAM's CCB
Return:
  tiSuccess - success
  tiReject  - failure
Note:
******************************************************************************/
static int agtiapi_PrepareSMPSGList( struct agtiapi_softc *pmcsc, ccb_t *pccb )
{
/* Pointer to CAM's ccb */
union ccb *ccb = pccb->ccb;
struct ccb_smpio *csmpio = &ccb->smpio;
struct ccb_hdr *ccbh = &ccb->ccb_h;
AGTIAPI_PRINTK("agtiapi_PrepareSMPSGList: start\n");
switch((ccbh->flags & CAM_DATA_MASK))
{
case CAM_DATA_PADDR:
case CAM_DATA_SG_PADDR:
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: Physical Address not supported\n");
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
return tiReject;
case CAM_DATA_SG:
/*
* Currently we do not support Multiple SG list
* return error for now
*/
if ( (csmpio->smp_request_sglist_cnt > 1)
|| (csmpio->smp_response_sglist_cnt > 1) )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: Multiple SG list not supported\n");
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
return tiReject;
}
}
if ( csmpio->smp_request_sglist_cnt != 0 )
{
/*
     * Virtual address that needs to be translated into
* one or more physical address ranges.
*/
int error;
//AG_LOCAL_LOCK(&(pmcsc->pCardInfo->pmIOLock));
AGTIAPI_PRINTK("agtiapi_PrepareSGList: virtual address\n");
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csmpio->smp_request,
csmpio->smp_request_len,
agtiapi_PrepareSMPSGListCB,
pccb,
BUS_DMA_NOWAIT /* 0 */ );
//AG_LOCAL_UNLOCK(&(pmcsc->pCardInfo->pmIOLock));
if (error == EINPROGRESS)
{
/*
* So as to maintain ordering,
* freeze the controller queue
* until our mapping is
* returned.
*/
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: EINPROGRESS\n" );
xpt_freeze_simq( pmcsc->sim, 1 );
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
}
if( csmpio->smp_response_sglist_cnt != 0 )
{
/*
     * Virtual address that needs to be translated into
* one or more physical address ranges.
*/
int error;
//AG_LOCAL_LOCK( &(pmcsc->pCardInfo->pmIOLock) );
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: virtual address\n" );
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csmpio->smp_response,
csmpio->smp_response_len,
agtiapi_PrepareSMPSGListCB,
pccb,
BUS_DMA_NOWAIT /* 0 */ );
//AG_LOCAL_UNLOCK( &(pmcsc->pCardInfo->pmIOLock) );
if ( error == EINPROGRESS )
{
/*
* So as to maintain ordering,
* freeze the controller queue
* until our mapping is
* returned.
*/
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: EINPROGRESS\n" );
xpt_freeze_simq( pmcsc->sim, 1 );
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
}
else
{
if ( (csmpio->smp_request_sglist_cnt == 0) &&
(csmpio->smp_response_sglist_cnt == 0) )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: physical address\n" );
pccb->tiSMPFrame.outFrameBuf = (void *)csmpio->smp_request;
pccb->tiSMPFrame.outFrameLen = csmpio->smp_request_len;
pccb->tiSMPFrame.expectedRespLen = csmpio->smp_response_len;
      // 0xAABBCCDD is a sentinel "no error" value; see agtiapi_PrepareSMPSGListCB()
agtiapi_PrepareSMPSGListCB( pccb, NULL, 0, 0xAABBCCDD );
}
pccb->tiSMPFrame.flag = 0;
}
return tiSuccess;
}
#else
/******************************************************************************
agtiapi_PrepareSMPSGList()
Purpose:
  This function prepares the scatter-gather list for the given ccb
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to the HBA data structure
  ccb_t *pccb (IN)  A pointer to the driver's own CCB, not CAM's CCB
Return:
  tiSuccess - success
  tiReject  - failure
Note:
******************************************************************************/
static int agtiapi_PrepareSMPSGList( struct agtiapi_softc *pmcsc, ccb_t *pccb )
{
/* Pointer to CAM's ccb */
union ccb *ccb = pccb->ccb;
struct ccb_smpio *csmpio = &ccb->smpio;
struct ccb_hdr *ccbh = &ccb->ccb_h;
AGTIAPI_PRINTK("agtiapi_PrepareSMPSGList: start\n");
if (ccbh->flags & (CAM_DATA_PHYS|CAM_SG_LIST_PHYS))
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: Physical Address "
"not supported\n" );
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
return tiReject;
}
if (ccbh->flags & CAM_SCATTER_VALID)
{
/*
* Currently we do not support Multiple SG list
* return error for now
*/
if ( (csmpio->smp_request_sglist_cnt > 1)
|| (csmpio->smp_response_sglist_cnt > 1) )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: Multiple SG list "
"not supported\n" );
ccb->ccb_h.status = CAM_REQ_INVALID;
xpt_done(ccb);
return tiReject;
}
if ( csmpio->smp_request_sglist_cnt != 0 )
{
/*
       * Virtual address that needs to be translated into
* one or more physical address ranges.
*/
int error;
//AG_LOCAL_LOCK(&(pmcsc->pCardInfo->pmIOLock));
AGTIAPI_PRINTK("agtiapi_PrepareSGList: virtual address\n");
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csmpio->smp_request,
csmpio->smp_request_len,
agtiapi_PrepareSMPSGListCB,
pccb,
BUS_DMA_NOWAIT /* 0 */ );
//AG_LOCAL_UNLOCK(&(pmcsc->pCardInfo->pmIOLock));
if (error == EINPROGRESS)
{
/*
* So as to maintain ordering,
* freeze the controller queue
* until our mapping is
* returned.
*/
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: EINPROGRESS\n" );
xpt_freeze_simq( pmcsc->sim, 1 );
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
}
if( csmpio->smp_response_sglist_cnt != 0 )
{
/*
       * Virtual address that needs to be translated into
* one or more physical address ranges.
*/
int error;
//AG_LOCAL_LOCK( &(pmcsc->pCardInfo->pmIOLock) );
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: virtual address\n" );
error = bus_dmamap_load( pmcsc->buffer_dmat,
pccb->CCB_dmamap,
csmpio->smp_response,
csmpio->smp_response_len,
agtiapi_PrepareSMPSGListCB,
pccb,
BUS_DMA_NOWAIT /* 0 */ );
//AG_LOCAL_UNLOCK( &(pmcsc->pCardInfo->pmIOLock) );
if ( error == EINPROGRESS )
{
/*
* So as to maintain ordering,
* freeze the controller queue
* until our mapping is
* returned.
*/
AGTIAPI_PRINTK( "agtiapi_PrepareSGList: EINPROGRESS\n" );
xpt_freeze_simq( pmcsc->sim, 1 );
pmcsc->SimQFrozen = agTRUE;
ccbh->status |= CAM_RELEASE_SIMQ;
}
}
}
else
{
if ( (csmpio->smp_request_sglist_cnt == 0) &&
(csmpio->smp_response_sglist_cnt == 0) )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGList: physical address\n" );
pccb->tiSMPFrame.outFrameBuf = (void *)csmpio->smp_request;
pccb->tiSMPFrame.outFrameLen = csmpio->smp_request_len;
pccb->tiSMPFrame.expectedRespLen = csmpio->smp_response_len;
      // 0xAABBCCDD is a sentinel "no error" value; see agtiapi_PrepareSMPSGListCB()
agtiapi_PrepareSMPSGListCB( pccb, NULL, 0, 0xAABBCCDD );
}
pccb->tiSMPFrame.flag = 0;
}
return tiSuccess;
}
#endif
/******************************************************************************
agtiapi_PrepareSMPSGListCB()
Purpose:
Callback function for bus_dmamap_load()
  This function sends the IO to the LL layer.
Parameters:
  void *arg (IN)                Pointer to the driver's CCB (ccb_t)
  bus_dma_segment_t *segs (IN)  Pointer to the DMA segment array
  int nsegs (IN)                Number of DMA segments
int error (IN) error
Return:
Note:
******************************************************************************/
static void agtiapi_PrepareSMPSGListCB( void *arg,
bus_dma_segment_t *segs,
int nsegs,
int error )
{
pccb_t pccb = arg;
union ccb *ccb = pccb->ccb;
struct agtiapi_softc *pmcsc;
U32 TID = CMND_TO_TARGET(ccb);
int status;
tiDeviceHandle_t *tiExpDevHandle;
tiPortalContext_t *tiExpPortalContext;
ag_portal_info_t *tiExpPortalInfo;
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: start, nsegs %d error 0x%x\n",
nsegs, error );
pmcsc = pccb->pmcsc;
if ( error != tiSuccess )
{
if (error == 0xAABBCCDD)
{
// do nothing
}
else
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: error status 0x%x\n",
error );
bus_dmamap_unload( pmcsc->buffer_dmat, pccb->CCB_dmamap );
bus_dmamap_destroy( pmcsc->buffer_dmat, pccb->CCB_dmamap );
agtiapi_FreeCCB( pmcsc, pccb );
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done( ccb );
return;
}
}
if ( nsegs > AGTIAPI_MAX_DMA_SEGS )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: over the limit. nsegs %d "
"AGTIAPI_MAX_DMA_SEGS %d\n",
nsegs, AGTIAPI_MAX_DMA_SEGS );
bus_dmamap_unload( pmcsc->buffer_dmat, pccb->CCB_dmamap );
bus_dmamap_destroy( pmcsc->buffer_dmat, pccb->CCB_dmamap );
agtiapi_FreeCCB( pmcsc, pccb );
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done( ccb );
return;
}
/*
* If assigned pDevHandle is not available
* then there is no need to send it to StartIO()
*/
/* TODO: Add check for deviceType */
if ( pccb->targetId < 0 || pccb->targetId >= maxTargets )
{
agtiapi_FreeCCB( pmcsc, pccb );
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
pccb->ccb = NULL;
return;
}
TID = INDEX( pmcsc, pccb->targetId );
if ( (TID >= pmcsc->devDiscover) ||
!(pccb->devHandle = pmcsc->pDevList[TID].pDevHandle) )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: not sending ccb devH %p, "
"target %d tid %d/%d "
"card %p ERROR pccb %p\n",
pccb->devHandle,
pccb->targetId,
TID,
pmcsc->devDiscover,
pmcsc,
pccb );
agtiapi_FreeCCB( pmcsc, pccb );
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done( ccb );
pccb->ccb = NULL;
return;
}
/* TODO: add indirect handling */
  /* set the flag correctly based on indirect SMP request and response */
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: send ccb pccb->devHandle %p, "
"pccb->targetId %d TID %d pmcsc->devDiscover %d card %p\n",
pccb->devHandle,
pccb->targetId, TID,
pmcsc->devDiscover,
pmcsc );
tiExpDevHandle = pccb->devHandle;
tiExpPortalInfo = pmcsc->pDevList[TID].pPortalInfo;
tiExpPortalContext = &tiExpPortalInfo->tiPortalContext;
/* Look for the expander associated with the ses device */
status = tiINIGetExpander( &pmcsc->tiRoot,
tiExpPortalContext,
pccb->devHandle,
&tiExpDevHandle );
if ( status != tiSuccess )
{
AGTIAPI_PRINTK( "agtiapi_PrepareSMPSGListCB: Error getting Expander "
"device\n" );
agtiapi_FreeCCB( pmcsc, pccb );
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done( ccb );
pccb->ccb = NULL;
return;
}
/* this is expander device */
pccb->devHandle = tiExpDevHandle;
/* put the request in send queue */
agtiapi_QueueCCB( pmcsc, &pmcsc->smpSendHead, &pmcsc->smpSendTail
AG_CARD_LOCAL_LOCK(&pmcsc->sendSMPLock), pccb );
agtiapi_StartSMP( pmcsc );
return;
}
/******************************************************************************
agtiapi_Done()
Purpose:
Processing completed ccbs
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA data structure
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
Note:
******************************************************************************/
STATIC void agtiapi_Done(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
pccb_t pccb_curr = pccb;
pccb_t pccb_next;
tiIniScsiCmnd_t *cmnd;
union ccb * ccb;
AGTIAPI_IO("agtiapi_Done: start\n");
while (pccb_curr)
{
/* start from 1st ccb in the chain */
pccb_next = pccb_curr->pccbNext;
if (agtiapi_CheckError(pmcsc, pccb_curr) != 0)
{
/* send command back and release the ccb */
cmnd = &pccb_curr->tiSuperScsiRequest.scsiCmnd;
if (cmnd->cdb[0] == RECEIVE_DIAGNOSTIC)
{
AGTIAPI_PRINTK("agtiapi_Done: RECEIVE_DIAG pg %d id %d cmnd %p pccb "
"%p\n", cmnd->cdb[2], pccb_curr->targetId, cmnd,
pccb_curr);
}
      /* send the request back to the CAM */
      ccb = pccb_curr->ccb;
      CMND_DMA_UNMAP(pmcsc, ccb);
agtiapi_FreeCCB(pmcsc, pccb_curr);
xpt_done(ccb);
}
pccb_curr = pccb_next;
}
return;
}
/******************************************************************************
agtiapi_SMPDone()
Purpose:
Processing completed ccbs
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
ccb_t *pccb (IN) A pointer to the driver's own CCB, not
CAM's CCB
Return:
Note:
******************************************************************************/
STATIC void agtiapi_SMPDone(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
pccb_t pccb_curr = pccb;
pccb_t pccb_next;
union ccb * ccb;
AGTIAPI_PRINTK("agtiapi_SMPDone: start\n");
while (pccb_curr)
{
/* start from 1st ccb in the chain */
pccb_next = pccb_curr->pccbNext;
if (agtiapi_CheckSMPError(pmcsc, pccb_curr) != 0)
{
      /* send the request back to the CAM */
      ccb = pccb_curr->ccb;
      CMND_DMA_UNMAP(pmcsc, ccb);
agtiapi_FreeSMPCCB(pmcsc, pccb_curr);
xpt_done(ccb);
}
pccb_curr = pccb_next;
}
AGTIAPI_PRINTK("agtiapi_SMPDone: Done\n");
return;
}
/******************************************************************************
agtiapi_hexdump()
Purpose:
Utility function for dumping in hex
Parameters:
const char *ptitle (IN) A string to be printed
bit8 *pbuf (IN) A pointer to a buffer to be printed.
  int len (IN) The length of the buffer
Return:
Note:
******************************************************************************/
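/* Example usage (illustrative only):
 *   agtiapi_hexdump("CDB", (bit8 *)pccb->tiSuperScsiRequest.scsiCmnd.cdb, 12);
 * prints the first 12 CDB bytes in hex.
 */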
void agtiapi_hexdump(const char *ptitle, bit8 *pbuf, int len)
{
int i;
AGTIAPI_PRINTK("%s - hexdump(len=%d):\n", ptitle, (int)len);
if (!pbuf)
{
AGTIAPI_PRINTK("pbuf is NULL\n");
return;
}
for (i = 0; i < len; )
{
if (len - i > 4)
{
AGTIAPI_PRINTK( " 0x%02x, 0x%02x, 0x%02x, 0x%02x,\n", pbuf[i], pbuf[i+1],
pbuf[i+2], pbuf[i+3] );
i += 4;
}
else
{
AGTIAPI_PRINTK(" 0x%02x,", pbuf[i]);
i++;
}
}
AGTIAPI_PRINTK("\n");
}
/******************************************************************************
agtiapi_CheckError()
Purpose:
Processes status pertaining to the ccb -- whether it was
completed successfully, aborted, or error encountered.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
  ccb_t *pccb (IN)  A pointer to the driver's own CCB, not CAM's CCB
Return:
0 - the command retry is required
1 - the command process is completed
Note:
******************************************************************************/
STATIC U32 agtiapi_CheckError(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
ag_device_t *pDevice;
// union ccb * ccb = pccb->ccb;
union ccb * ccb;
int is_error, TID;
if (pccb == NULL) {
return 0;
}
ccb = pccb->ccb;
AGTIAPI_IO("agtiapi_CheckError: start\n");
if (ccb == NULL)
{
/* shouldn't be here but just in case we do */
AGTIAPI_PRINTK("agtiapi_CheckError: CCB orphan = %p ERROR\n", pccb);
agtiapi_FreeCCB(pmcsc, pccb);
return 0;
}
is_error = 1;
pDevice = NULL;
if (pmcsc != NULL && pccb->targetId >= 0 && pccb->targetId < maxTargets)
{
if (pmcsc->pWWNList != NULL)
{
TID = INDEX(pmcsc, pccb->targetId);
if (TID < maxTargets)
{
pDevice = &pmcsc->pDevList[TID];
if (pDevice != NULL)
{
is_error = 0;
}
}
}
}
if (is_error)
{
AGTIAPI_PRINTK("agtiapi_CheckError: pDevice == NULL\n");
agtiapi_FreeCCB(pmcsc, pccb);
return 0;
}
/* SCSI status */
ccb->csio.scsi_status = pccb->scsiStatus;
if(pDevice->CCBCount > 0){
atomic_subtract_int(&pDevice->CCBCount,1);
}
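  /*
   * If a previous request froze the SIM queue because the lower layer
   * returned tiBusy (see the tiIOFailed/tiBusy handling below), release it
   * now that a command has completed.
   */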
AG_LOCAL_LOCK(&pmcsc->freezeLock);
if(pmcsc->freezeSim == agTRUE)
{
pmcsc->freezeSim = agFALSE;
xpt_release_simq(pmcsc->sim, 1);
}
AG_LOCAL_UNLOCK(&pmcsc->freezeLock);
switch (pccb->ccbStatus)
{
case tiIOSuccess:
AGTIAPI_IO("agtiapi_CheckError: tiIOSuccess pccb %p\n", pccb);
/* CAM status */
if (pccb->scsiStatus == SCSI_STATUS_OK)
{
ccb->ccb_h.status = CAM_REQ_CMP;
}
else
if (pccb->scsiStatus == SCSI_TASK_ABORTED)
{
ccb->ccb_h.status = CAM_REQ_ABORTED;
}
else
{
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR;
}
if (ccb->csio.scsi_status == SCSI_CHECK_CONDITION)
{
ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
}
break;
case tiIOOverRun:
AGTIAPI_PRINTK("agtiapi_CheckError: tiIOOverRun pccb %p\n", pccb);
/* resid is ignored for this condition */
ccb->csio.resid = 0;
ccb->ccb_h.status = CAM_DATA_RUN_ERR;
break;
case tiIOUnderRun:
AGTIAPI_PRINTK("agtiapi_CheckError: tiIOUnderRun pccb %p\n", pccb);
ccb->csio.resid = pccb->scsiStatus;
ccb->ccb_h.status = CAM_REQ_CMP;
ccb->csio.scsi_status = SCSI_STATUS_OK;
break;
case tiIOFailed:
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed %d id %d ERROR\n",
pccb, pccb->scsiStatus, pccb->targetId );
if (pccb->scsiStatus == tiDeviceBusy)
{
AGTIAPI_IO( "agtiapi_CheckError: pccb %p tiIOFailed - tiDetailBusy\n",
pccb );
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0)
{
ccb->ccb_h.status |= CAM_DEV_QFRZN;
xpt_freeze_devq(ccb->ccb_h.path, /*count*/1);
}
}
else if(pccb->scsiStatus == tiBusy)
{
AG_LOCAL_LOCK(&pmcsc->freezeLock);
if(pmcsc->freezeSim == agFALSE)
{
pmcsc->freezeSim = agTRUE;
xpt_freeze_simq(pmcsc->sim, 1);
}
AG_LOCAL_UNLOCK(&pmcsc->freezeLock);
ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
}
else if (pccb->scsiStatus == tiDetailNoLogin)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailNoLogin ERROR\n", pccb );
ccb->ccb_h.status = CAM_DEV_NOT_THERE;
}
else if (pccb->scsiStatus == tiDetailNotValid)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailNotValid ERROR\n", pccb );
ccb->ccb_h.status = CAM_REQ_INVALID;
}
else if (pccb->scsiStatus == tiDetailAbortLogin)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailAbortLogin ERROR\n", pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
}
else if (pccb->scsiStatus == tiDetailAbortReset)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailAbortReset ERROR\n", pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
}
else if (pccb->scsiStatus == tiDetailAborted)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailAborted ERROR\n", pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
}
else if (pccb->scsiStatus == tiDetailOtherError)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailOtherError ERROR\n", pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
}
break;
case tiIODifError:
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed %d id %d ERROR\n",
pccb, pccb->scsiStatus, pccb->targetId );
if (pccb->scsiStatus == tiDetailDifAppTagMismatch)
{
AGTIAPI_IO( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailDifAppTagMismatch\n", pccb );
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
}
else if (pccb->scsiStatus == tiDetailDifRefTagMismatch)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailDifRefTagMismatch\n", pccb );
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
}
else if (pccb->scsiStatus == tiDetailDifCrcMismatch)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed - "
"tiDetailDifCrcMismatch\n", pccb );
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
}
break;
#ifdef HIALEAH_ENCRYPTION
case tiIOEncryptError:
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOFailed %d id %d ERROR\n",
pccb, pccb->scsiStatus, pccb->targetId );
if (pccb->scsiStatus == tiDetailDekKeyCacheMiss)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: %s: pccb %p tiIOFailed - "
"tiDetailDekKeyCacheMiss ERROR\n",
__FUNCTION__, pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
agtiapi_HandleEncryptedIOFailure(pDevice, pccb);
}
else if (pccb->scsiStatus == tiDetailDekIVMismatch)
{
AGTIAPI_PRINTK( "agtiapi_CheckError: %s: pccb %p tiIOFailed - "
"tiDetailDekIVMismatch ERROR\n", __FUNCTION__, pccb );
ccb->ccb_h.status = CAM_REQ_ABORTED;
agtiapi_HandleEncryptedIOFailure(pDevice, pccb);
}
break;
#endif
default:
AGTIAPI_PRINTK( "agtiapi_CheckError: pccb %p tiIOdefault %d id %d ERROR\n",
pccb, pccb->ccbStatus, pccb->targetId );
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
break;
}
return 1;
}
/******************************************************************************
agtiapi_CheckSMPError()
Purpose:
Processes status pertaining to the ccb -- whether it was
completed successfully, aborted, or error encountered.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
  ccb_t *pccb (IN)  A pointer to the driver's own CCB, not CAM's CCB
Return:
0 - the command retry is required
1 - the command process is completed
Note:
******************************************************************************/
STATIC U32 agtiapi_CheckSMPError( struct agtiapi_softc *pmcsc, ccb_t *pccb )
{
union ccb * ccb = pccb->ccb;
AGTIAPI_PRINTK("agtiapi_CheckSMPError: start\n");
if (!ccb)
{
/* shouldn't be here but just in case we do */
AGTIAPI_PRINTK( "agtiapi_CheckSMPError: CCB orphan = %p ERROR\n",
pccb );
agtiapi_FreeSMPCCB(pmcsc, pccb);
return 0;
}
switch (pccb->ccbStatus)
{
case tiSMPSuccess:
AGTIAPI_PRINTK( "agtiapi_CheckSMPError: tiSMPSuccess pccb %p\n",
pccb );
/* CAM status */
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case tiSMPFailed:
AGTIAPI_PRINTK( "agtiapi_CheckSMPError: tiSMPFailed pccb %p\n",
pccb );
/* CAM status */
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
break;
default:
AGTIAPI_PRINTK( "agtiapi_CheckSMPError: pccb %p tiSMPdefault %d "
"id %d ERROR\n",
pccb,
pccb->ccbStatus,
pccb->targetId );
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
break;
}
return 1;
}
/******************************************************************************
agtiapi_HandleEncryptedIOFailure():
Purpose:
Parameters:
Return:
Note:
  Only used when HIALEAH_ENCRYPTION is defined (see agtiapi_CheckError()).
******************************************************************************/
void agtiapi_HandleEncryptedIOFailure(ag_device_t *pDev, ccb_t *pccb)
{
AGTIAPI_PRINTK("agtiapi_HandleEncryptedIOFailure: start\n");
return;
}
/******************************************************************************
agtiapi_Retry()
Purpose:
Retry a ccb.
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to the HBA structure
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
Note:
Currently not used.
******************************************************************************/
STATIC void agtiapi_Retry(struct agtiapi_softc *pmcsc, ccb_t *pccb)
{
pccb->retryCount++;
pccb->flags = ACTIVE | AGTIAPI_RETRY;
pccb->ccbStatus = 0;
pccb->scsiStatus = 0;
pccb->startTime = ticks;
AGTIAPI_PRINTK( "agtiapi_Retry: start\n" );
AGTIAPI_PRINTK( "agtiapi_Retry: ccb %p retry %d flgs x%x\n", pccb,
pccb->retryCount, pccb->flags );
agtiapi_QueueCCB(pmcsc, &pmcsc->ccbSendHead, &pmcsc->ccbSendTail
AG_CARD_LOCAL_LOCK(&pmcsc->sendLock), pccb);
return;
}
/******************************************************************************
agtiapi_DumpCCB()
Purpose:
  Dump CCB for debugging
Parameters:
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
Note:
******************************************************************************/
STATIC void agtiapi_DumpCCB(ccb_t *pccb)
{
AGTIAPI_PRINTK("agtiapi_DumpCCB: pccb %p, devHandle %p, tid %d, lun %d\n",
pccb,
pccb->devHandle,
pccb->targetId,
pccb->lun);
AGTIAPI_PRINTK("flag 0x%x, add_mode 0x%x, ccbStatus 0x%x, scsiStatus 0x%x\n",
pccb->flags,
pccb->addrMode,
pccb->ccbStatus,
pccb->scsiStatus);
AGTIAPI_PRINTK("scsi comand = 0x%x, numSgElements = %d\n",
pccb->tiSuperScsiRequest.scsiCmnd.cdb[0],
pccb->numSgElements);
AGTIAPI_PRINTK("dataLen = 0x%x, sens_len = 0x%x\n",
pccb->dataLen,
pccb->senseLen);
AGTIAPI_PRINTK("tiSuperScsiRequest:\n");
AGTIAPI_PRINTK("scsiCmnd: expDataLength 0x%x, taskAttribute 0x%x\n",
pccb->tiSuperScsiRequest.scsiCmnd.expDataLength,
pccb->tiSuperScsiRequest.scsiCmnd.taskAttribute);
AGTIAPI_PRINTK("cdb[0] = 0x%x, cdb[1] = 0x%x, cdb[2] = 0x%x, cdb[3] = 0x%x\n",
pccb->tiSuperScsiRequest.scsiCmnd.cdb[0],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[1],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[2],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[3]);
AGTIAPI_PRINTK("cdb[4] = 0x%x, cdb[5] = 0x%x, cdb[6] = 0x%x, cdb[7] = 0x%x\n",
pccb->tiSuperScsiRequest.scsiCmnd.cdb[4],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[5],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[6],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[7]);
AGTIAPI_PRINTK( "cdb[8] = 0x%x, cdb[9] = 0x%x, cdb[10] = 0x%x, "
"cdb[11] = 0x%x\n",
pccb->tiSuperScsiRequest.scsiCmnd.cdb[8],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[9],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[10],
pccb->tiSuperScsiRequest.scsiCmnd.cdb[11] );
AGTIAPI_PRINTK("agSgl1: upper 0x%x, lower 0x%x, len 0x%x, type %d\n",
pccb->tiSuperScsiRequest.agSgl1.upper,
pccb->tiSuperScsiRequest.agSgl1.lower,
pccb->tiSuperScsiRequest.agSgl1.len,
pccb->tiSuperScsiRequest.agSgl1.type);
}
/******************************************************************************
agtiapi_eh_HostReset()
Purpose:
A new error handler of Host Reset command.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to the HBA data structure
  union ccb *cmnd (IN)              Pointer to the CCB that triggered the reset
Return:
SUCCESS - success
FAILED - fail
Note:
******************************************************************************/
int agtiapi_eh_HostReset( struct agtiapi_softc *pmcsc, union ccb *cmnd )
{
AGTIAPI_PRINTK( "agtiapi_eh_HostReset: ccb pointer %p\n",
cmnd );
if( cmnd == NULL )
{
printf( "agtiapi_eh_HostReset: null command, skipping reset.\n" );
return tiInvalidHandle;
}
#ifdef LOGEVENT
agtiapi_LogEvent( pmcsc,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"agtiapi_eh_HostReset! " );
#endif
return agtiapi_DoSoftReset( pmcsc );
}
/******************************************************************************
agtiapi_QueueCCB()
Purpose:
Put ccb in ccb queue at the tail
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA data structure
pccb_t *phead (IN) Double pointer to ccb queue head
pccb_t *ptail (IN) Double pointer to ccb queue tail
  ccb_t *pccb (IN) Pointer to a ccb to be queued
Return:
Note:
Put the ccb to the tail of queue
******************************************************************************/
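/*
 * When AGTIAPI_LOCAL_LOCK is defined this function takes an extra mutex
 * argument protecting the queue; callers supply it through the
 * AG_CARD_LOCAL_LOCK() macro, which presumably expands to ", &lock" (note
 * that call sites have no comma before the macro) and to nothing when local
 * locking is disabled.
 */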
STATIC void agtiapi_QueueCCB( struct agtiapi_softc *pmcsc,
pccb_t *phead,
pccb_t *ptail,
#ifdef AGTIAPI_LOCAL_LOCK
struct mtx *mutex,
#endif
ccb_t *pccb )
{
AGTIAPI_IO( "agtiapi_QueueCCB: start\n" );
AGTIAPI_IO( "agtiapi_QueueCCB: %p to %p\n", pccb, phead );
if (phead == NULL || ptail == NULL)
{
panic( "agtiapi_QueueCCB: phead %p ptail %p", phead, ptail );
}
pccb->pccbNext = NULL;
AG_LOCAL_LOCK( mutex );
if (*phead == NULL)
{
//WARN_ON(*ptail != NULL); /* critical, just get more logs */
*phead = pccb;
}
else
{
//WARN_ON(*ptail == NULL); /* critical, just get more logs */
if (*ptail)
(*ptail)->pccbNext = pccb;
}
*ptail = pccb;
AG_LOCAL_UNLOCK( mutex );
return;
}
/******************************************************************************
agtiapi_QueueSMP()
Purpose:
  Allocate a driver ccb for a CAM SMP request and prepare its
  scatter-gather list.
Parameters:
  struct agtiapi_softc *pmcsc (IN)  Pointer to HBA data structure
  union ccb *ccb (IN)               Pointer to CAM's CCB carrying the SMP request
Return:
  tiSuccess - request prepared and queued
  tiBusy    - no free ccb available
  tiError   - scatter-gather list preparation failed
Note:
******************************************************************************/
static int agtiapi_QueueSMP(struct agtiapi_softc *pmcsc, union ccb * ccb)
{
pccb_t pccb = agNULL; /* call dequeue */
int status = tiSuccess;
int targetID = xpt_path_target_id(ccb->ccb_h.path);
AGTIAPI_PRINTK("agtiapi_QueueSMP: start\n");
/* get a ccb */
if ((pccb = agtiapi_GetCCB(pmcsc)) == NULL)
{
AGTIAPI_PRINTK("agtiapi_QueueSMP: GetCCB ERROR\n");
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return tiBusy;
}
pccb->pmcsc = pmcsc;
/* initialize Command Control Block (CCB) */
pccb->targetId = targetID;
pccb->ccb = ccb; /* for struct scsi_cmnd */
status = agtiapi_PrepareSMPSGList(pmcsc, pccb);
if (status != tiSuccess)
{
AGTIAPI_PRINTK("agtiapi_QueueSMP: agtiapi_PrepareSMPSGList failure\n");
agtiapi_FreeCCB(pmcsc, pccb);
if (status == tiReject)
{
ccb->ccb_h.status = CAM_REQ_INVALID;
}
else
{
ccb->ccb_h.status = CAM_REQ_CMP;
}
xpt_done(ccb);
return tiError;
}
return status;
}
/******************************************************************************
agtiapi_SetLunField()
Purpose:
Set LUN field based on different address mode
Parameters:
ccb_t *pccb (IN) A pointer to the driver's own CCB, not CAM's CCB
Return:
Note:
******************************************************************************/
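/*
 * The two bytes written below form the first level of the 8-byte SCSI LUN
 * structure: the top two bits of byte 0 select the address method
 * (peripheral, volume set, or logical unit addressing, shifted in with
 * AGTIAPI_ADDRMODE_SHIFT) and the remaining bits carry the LUN/target bits.
 */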
void agtiapi_SetLunField(ccb_t *pccb)
{
U08 *pchar;
pchar = (U08 *)&pccb->tiSuperScsiRequest.scsiCmnd.lun;
// AGTIAPI_PRINTK("agtiapi_SetLunField: start\n");
switch (pccb->addrMode)
{
case AGTIAPI_PERIPHERAL:
*pchar++ = 0;
*pchar = (U08)pccb->lun;
break;
case AGTIAPI_VOLUME_SET:
*pchar++ = (AGTIAPI_VOLUME_SET << AGTIAPI_ADDRMODE_SHIFT) |
(U08)((pccb->lun >> 8) & 0x3F);
*pchar = (U08)pccb->lun;
break;
case AGTIAPI_LUN_ADDR:
*pchar++ = (AGTIAPI_LUN_ADDR << AGTIAPI_ADDRMODE_SHIFT) |
pccb->targetId;
*pchar = (U08)pccb->lun;
break;
}
}
/*****************************************************************************
agtiapi_FreeCCB()
Purpose:
Free a ccb and put it back to ccbFreeList.
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA data structure
pccb_t pccb (IN) A pointer to the driver's own CCB, not
CAM's CCB
Returns:
Note:
*****************************************************************************/
STATIC void agtiapi_FreeCCB(struct agtiapi_softc *pmcsc, pccb_t pccb)
{
union ccb *ccb = pccb->ccb;
bus_dmasync_op_t op;
AG_LOCAL_LOCK(&pmcsc->ccbLock);
AGTIAPI_IO( "agtiapi_FreeCCB: start %p\n", pccb );
#ifdef AGTIAPI_TEST_EPL
tiEncrypt_t *encrypt;
#endif
agtiapi_DumpCDB( "agtiapi_FreeCCB", pccb );
if (pccb->sgList != agNULL)
{
AGTIAPI_IO( "agtiapi_FreeCCB: pccb->sgList is NOT null\n" );
}
else
{
AGTIAPI_PRINTK( "agtiapi_FreeCCB: pccb->sgList is null\n" );
}
/* set data transfer direction */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
{
op = BUS_DMASYNC_POSTWRITE;
}
else
{
op = BUS_DMASYNC_POSTREAD;
}
if (pccb->numSgElements == 0)
{
// do nothing
AGTIAPI_IO( "agtiapi_FreeCCB: numSgElements zero\n" );
}
else if (pccb->numSgElements == 1)
{
AGTIAPI_IO( "agtiapi_FreeCCB: numSgElements is one\n" );
//op is either BUS_DMASYNC_POSTWRITE or BUS_DMASYNC_POSTREAD
bus_dmamap_sync(pmcsc->buffer_dmat, pccb->CCB_dmamap, op);
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
}
else
{
AGTIAPI_PRINTK( "agtiapi_FreeCCB: numSgElements 2 or higher \n" );
//op is either BUS_DMASYNC_POSTWRITE or BUS_DMASYNC_POSTREAD
bus_dmamap_sync(pmcsc->buffer_dmat, pccb->CCB_dmamap, op);
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
}
#ifdef AGTIAPI_TEST_DPL
if (pccb->tiSuperScsiRequest.Dif.enableDIFPerLA == TRUE) {
if(pccb->dplPtr)
memset( (char *) pccb->dplPtr,
0,
MAX_DPL_REGIONS * sizeof(dplaRegion_t) );
pccb->tiSuperScsiRequest.Dif.enableDIFPerLA = FALSE;
pccb->tiSuperScsiRequest.Dif.DIFPerLAAddrLo = 0;
pccb->tiSuperScsiRequest.Dif.DIFPerLAAddrHi = 0;
}
#endif
#ifdef AGTIAPI_TEST_EPL
encrypt = &pccb->tiSuperScsiRequest.Encrypt;
if (encrypt->enableEncryptionPerLA == TRUE) {
encrypt->enableEncryptionPerLA = FALSE;
encrypt->EncryptionPerLAAddrLo = 0;
encrypt->EncryptionPerLAAddrHi = 0;
}
#endif
#ifdef ENABLE_SATA_DIF
if (pccb->holePtr && pccb->dmaHandleHole)
pci_free_consistent( pmcsc->pCardInfo->pPCIDev,
512,
pccb->holePtr,
pccb->dmaHandleHole );
pccb->holePtr = 0;
pccb->dmaHandleHole = 0;
#endif
pccb->dataLen = 0;
pccb->retryCount = 0;
pccb->ccbStatus = 0;
pccb->scsiStatus = 0;
pccb->startTime = 0;
pccb->dmaHandle = 0;
pccb->numSgElements = 0;
pccb->tiIORequest.tdData = 0;
memset((void *)&pccb->tiSuperScsiRequest, 0, AGSCSI_INIT_XCHG_LEN);
#ifdef HIALEAH_ENCRYPTION
if (pmcsc->encrypt)
agtiapi_CleanupEncryptedIO(pmcsc, pccb);
#endif
pccb->flags = 0;
pccb->ccb = NULL;
pccb->pccbIO = NULL;
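  /* push the ccb back onto the head of the free list (LIFO), under ccbLock */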
pccb->pccbNext = (pccb_t)pmcsc->ccbFreeList;
pmcsc->ccbFreeList = (caddr_t *)pccb;
pmcsc->activeCCB--;
AG_LOCAL_UNLOCK(&pmcsc->ccbLock);
return;
}
/******************************************************************************
agtiapi_FlushCCBs()
Purpose:
  Flush all in-process ccbs.
Parameters:
  struct agtiapi_softc *pCard (IN)  Pointer to HBA data structure
  U32 flag (IN)  AGTIAPI_CALLBACK to complete the flushed ccbs back to CAM
Return:
Note:
******************************************************************************/
STATIC void agtiapi_FlushCCBs( struct agtiapi_softc *pCard, U32 flag )
{
union ccb *ccb;
ccb_t *pccb;
AGTIAPI_PRINTK( "agtiapi_FlushCCBs: enter \n" );
for( pccb = (pccb_t)pCard->ccbChainList;
pccb != NULL;
pccb = pccb->pccbChainNext ) {
if( pccb->flags == 0 )
{
// printf( "agtiapi_FlushCCBs: nothing, continue \n" );
continue;
}
ccb = pccb->ccb;
if ( pccb->flags & ( TASK_MANAGEMENT | DEV_RESET ) )
{
AGTIAPI_PRINTK( "agtiapi_FlushCCBs: agtiapi_FreeTMCCB \n" );
agtiapi_FreeTMCCB( pCard, pccb );
}
else
{
if ( pccb->flags & TAG_SMP )
{
AGTIAPI_PRINTK( "agtiapi_FlushCCBs: agtiapi_FreeSMPCCB \n" );
agtiapi_FreeSMPCCB( pCard, pccb );
}
else
{
AGTIAPI_PRINTK( "agtiapi_FlushCCBs: agtiapi_FreeCCB \n" );
agtiapi_FreeCCB( pCard, pccb );
}
if( ccb ) {
CMND_DMA_UNMAP( pCard, ccb );
if( flag == AGTIAPI_CALLBACK ) {
ccb->ccb_h.status = CAM_SCSI_BUS_RESET;
xpt_done( ccb );
}
}
}
}
}
/*****************************************************************************
agtiapi_FreeSMPCCB()
Purpose:
Free a ccb and put it back to ccbFreeList.
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA data structure
pccb_t pccb (IN) A pointer to the driver's own CCB, not
CAM's CCB
Returns:
Note:
*****************************************************************************/
STATIC void agtiapi_FreeSMPCCB(struct agtiapi_softc *pmcsc, pccb_t pccb)
{
union ccb *ccb = pccb->ccb;
bus_dmasync_op_t op;
AG_LOCAL_LOCK(&pmcsc->ccbLock);
AGTIAPI_PRINTK("agtiapi_FreeSMPCCB: start %p\n", pccb);
/* set data transfer direction */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
{
op = BUS_DMASYNC_POSTWRITE;
}
else
{
op = BUS_DMASYNC_POSTREAD;
}
if (pccb->numSgElements == 0)
{
// do nothing
AGTIAPI_PRINTK("agtiapi_FreeSMPCCB: numSgElements 0\n");
}
else if (pccb->numSgElements == 1)
{
AGTIAPI_PRINTK("agtiapi_FreeSMPCCB: numSgElements 1\n");
//op is either BUS_DMASYNC_POSTWRITE or BUS_DMASYNC_POSTREAD
bus_dmamap_sync(pmcsc->buffer_dmat, pccb->CCB_dmamap, op);
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
}
else
{
AGTIAPI_PRINTK("agtiapi_FreeSMPCCB: numSgElements 2 or higher \n");
//op is either BUS_DMASYNC_POSTWRITE or BUS_DMASYNC_POSTREAD
bus_dmamap_sync(pmcsc->buffer_dmat, pccb->CCB_dmamap, op);
bus_dmamap_unload(pmcsc->buffer_dmat, pccb->CCB_dmamap);
}
  /* dma api cleanup */
pccb->dataLen = 0;
pccb->retryCount = 0;
pccb->ccbStatus = 0;
pccb->startTime = 0;
pccb->dmaHandle = 0;
pccb->numSgElements = 0;
pccb->tiIORequest.tdData = 0;
memset((void *)&pccb->tiSMPFrame, 0, AGSMP_INIT_XCHG_LEN);
pccb->flags = 0;
pccb->ccb = NULL;
pccb->pccbNext = (pccb_t)pmcsc->ccbFreeList;
pmcsc->ccbFreeList = (caddr_t *)pccb;
pmcsc->activeCCB--;
AG_LOCAL_UNLOCK(&pmcsc->ccbLock);
return;
}
/*****************************************************************************
agtiapi_FreeTMCCB()
Purpose:
Free a ccb and put it back to ccbFreeList.
Parameters:
struct agtiapi_softc *pmcsc (IN) Pointer to HBA data structure
pccb_t pccb (IN) A pointer to the driver's own CCB, not
CAM's CCB
Returns:
Note:
*****************************************************************************/
STATIC void agtiapi_FreeTMCCB(struct agtiapi_softc *pmcsc, pccb_t pccb)
{
AG_LOCAL_LOCK(&pmcsc->ccbLock);
AGTIAPI_PRINTK("agtiapi_FreeTMCCB: start %p\n", pccb);
pccb->dataLen = 0;
pccb->retryCount = 0;
pccb->ccbStatus = 0;
pccb->scsiStatus = 0;
pccb->startTime = 0;
pccb->dmaHandle = 0;
pccb->numSgElements = 0;
pccb->tiIORequest.tdData = 0;
memset((void *)&pccb->tiSuperScsiRequest, 0, AGSCSI_INIT_XCHG_LEN);
pccb->flags = 0;
pccb->ccb = NULL;
pccb->pccbIO = NULL;
pccb->pccbNext = (pccb_t)pmcsc->ccbFreeList;
pmcsc->ccbFreeList = (caddr_t *)pccb;
pmcsc->activeCCB--;
AG_LOCAL_UNLOCK(&pmcsc->ccbLock);
return;
}
/******************************************************************************
agtiapi_CheckAllVectors():
Purpose:
Parameters:
Return:
Note:
Currently, not used.
******************************************************************************/
void agtiapi_CheckAllVectors( struct agtiapi_softc *pCard, bit32 context )
{
#ifdef SPC_MSIX_INTR
if (!agtiapi_intx_mode)
{
int i;
for (i = 0; i < pCard->pCardInfo->maxInterruptVectors; i++)
if (tiCOMInterruptHandler(&pCard->tiRoot, i) == agTRUE)
tiCOMDelayedInterruptHandler(&pCard->tiRoot, i, 100, context);
}
else
if (tiCOMInterruptHandler(&pCard->tiRoot, 0) == agTRUE)
tiCOMDelayedInterruptHandler(&pCard->tiRoot, 0, 100, context);
#else
if (tiCOMInterruptHandler(&pCard->tiRoot, 0) == agTRUE)
tiCOMDelayedInterruptHandler(&pCard->tiRoot, 0, 100, context);
#endif
}
/******************************************************************************
agtiapi_CheckCB()
Purpose:
Check the event returned by the callback function for process completion
Parameters:
struct agtiapi_softc *pCard Pointer to card data structure
U32 milisec (IN) Waiting time for expected event
U32 flag (IN) Flag of the event to check
U32 *pStatus (IN) Pointer to status of the card or port to check
Return:
AGTIAPI_SUCCESS - event arrived as expected
AGTIAPI_FAIL - event did not arrive within the wait period
Note:
Currently, not used
******************************************************************************/
agBOOLEAN agtiapi_CheckCB( struct agtiapi_softc *pCard,
U32 milisec,
U32 flag,
volatile U32 *pStatus )
{
U32 msecsPerTick = pCard->pCardInfo->tiRscInfo.tiInitiatorResource.
initiatorOption.usecsPerTick / 1000;
S32 i = milisec/msecsPerTick;
AG_GLOBAL_ARG( _flags );
AGTIAPI_PRINTK( "agtiapi_CheckCB: start\n" );
AGTIAPI_FLOW( "agtiapi_CheckCB: start\n" );
if( i <= 0 )
i = 1;
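/*
* Poll once per OS tick: check the status flags, then drive the
* common-layer timer and interrupt vectors so completions can still
* be processed while we wait.
*/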
while (i > 0)
{
if (*pStatus & TASK_MANAGEMENT)
{
if (*pStatus & AGTIAPI_CB_DONE)
{
if( flag == 0 || *pStatus & flag )
return AGTIAPI_SUCCESS;
else
return AGTIAPI_FAIL;
}
}
else if (pCard->flags & AGTIAPI_CB_DONE)
{
if( flag == 0 || *pStatus & flag )
return AGTIAPI_SUCCESS;
else
return AGTIAPI_FAIL;
}
agtiapi_DelayMSec( msecsPerTick );
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, _flags );
tiCOMTimerTick( &pCard->tiRoot );
agtiapi_CheckAllVectors( pCard, tiNonInterruptContext );
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, _flags );
i--;
}
if( *pStatus & TASK_MANAGEMENT )
*pStatus |= TASK_TIMEOUT;
return AGTIAPI_FAIL;
}
/******************************************************************************
agtiapi_DiscoverTgt()
Purpose:
Discover available devices
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the HBA data structure
Return:
Note:
******************************************************************************/
STATIC void agtiapi_DiscoverTgt(struct agtiapi_softc *pCard)
{
ag_portal_data_t *pPortalData;
U32 count;
AGTIAPI_PRINTK("agtiapi_DiscoverTgt: start\n");
AGTIAPI_FLOW("agtiapi_DiscoverTgt\n");
AGTIAPI_INIT("agtiapi_DiscoverTgt\n");
pPortalData = pCard->pPortalData;
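/*
* Pass 1: start discovery on every portal that is (or becomes) ready.
* Pass 2 below waits for discovery completion, and pass 3 collects
* the device handles.
*/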
for (count = 0; count < pCard->portCount; count++, pPortalData++)
{
pCard->flags &= ~AGTIAPI_CB_DONE;
if (!(PORTAL_STATUS(pPortalData) & AGTIAPI_PORT_DISC_READY))
{
if (pCard->flags & AGTIAPI_INIT_TIME)
{
if (agtiapi_CheckCB(pCard, 5000, AGTIAPI_PORT_DISC_READY,
&PORTAL_STATUS(pPortalData)) == AGTIAPI_FAIL)
{
AGTIAPI_PRINTK( "agtiapi_DiscoverTgt: Port %p / %d not ready for "
"discovery\n",
pPortalData, count );
/*
* There is no need to spend time discovering devices
* if the port is not ready to do so.
*/
continue;
}
}
else
continue;
}
AGTIAPI_FLOW( "agtiapi_DiscoverTgt: Portal %p DiscoverTargets starts\n",
pPortalData );
AGTIAPI_INIT_DELAY(1000);
pCard->flags &= ~AGTIAPI_CB_DONE;
if (tiINIDiscoverTargets(&pCard->tiRoot,
&pPortalData->portalInfo.tiPortalContext,
FORCE_PERSISTENT_ASSIGN_MASK)
!= tiSuccess)
AGTIAPI_PRINTK("agtiapi_DiscoverTgt: tiINIDiscoverTargets ERROR\n");
/*
* Ideally we should wait for discovery completion before starting
* the next portal. However, the lower layer has issues with the
* multi-portal case under Linux.
*/
}
pPortalData = pCard->pPortalData;
for (count = 0; count < pCard->portCount; count++, pPortalData++)
{
if ((PORTAL_STATUS(pPortalData) & AGTIAPI_PORT_DISC_READY))
{
if (agtiapi_CheckCB(pCard, 20000, AGTIAPI_DISC_COMPLETE,
&PORTAL_STATUS(pPortalData)) == AGTIAPI_FAIL)
{
if ((PORTAL_STATUS(pPortalData) & AGTIAPI_DISC_COMPLETE))
AGTIAPI_PRINTK( "agtiapi_DiscoverTgt: Portal %p discover complete, "
"status 0x%x\n",
pPortalData,
PORTAL_STATUS(pPortalData) );
else
AGTIAPI_PRINTK( "agtiapi_DiscoverTgt: Portal %p discover is not "
"completed, status 0x%x\n",
pPortalData, PORTAL_STATUS(pPortalData) );
continue;
}
AGTIAPI_PRINTK( "agtiapi_DiscoverTgt: Portal %d discover target "
"success\n",
count );
}
}
/*
* Getting device handles should be done per portal, ideally right
* after that portal's discovery completes. However, the lower
* layer may not return discovery completion in the correct sequence,
* or we may run out of time. So we get device handles for all
* portals together after discovery is done or has timed out.
*/
pPortalData = pCard->pPortalData;
for (count = 0; count < pCard->portCount; count++, pPortalData++)
{
/*
* We try to get device handles whether or not
* discovery has completed.
*/
if (PORTAL_STATUS(pPortalData) & AGTIAPI_PORT_DISC_READY)
{
U32 i;
for (i = 0; i < AGTIAPI_GET_DEV_MAX; i++)
{
if (agtiapi_GetDevHandle(pCard, &pPortalData->portalInfo, 0, 0) != 0)
break;
agtiapi_DelayMSec(AGTIAPI_EXTRA_DELAY);
}
if ((PORTAL_STATUS(pPortalData) & AGTIAPI_DISC_COMPLETE) ||
(pCard->tgtCount > 0))
PORTAL_STATUS(pPortalData) |= ( AGTIAPI_DISC_DONE |
AGTIAPI_PORT_LINK_UP );
}
}
return;
}
/******************************************************************************
agtiapi_PrepCCBs()
Purpose:
Prepares CCBs, including their DMA maps.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the HBA data structure
ccb_hdr_t *hdr (IN) Pointer to the CCB header
U32 size (IN) Size of the allocation, in bytes
U32 max_ccb (IN) Number of CCBs to prepare
int tid (IN) Target ID, used to index the per-device SGL area
Return:
Note:
******************************************************************************/
STATIC void agtiapi_PrepCCBs( struct agtiapi_softc *pCard,
ccb_hdr_t *hdr,
U32 size,
U32 max_ccb,
int tid )
{
int i;
U32 hdr_sz, ccb_sz;
ccb_t *pccb = NULL;
int offset = 0;
int nsegs = 0;
int sgl_sz = 0;
AGTIAPI_PRINTK("agtiapi_PrepCCBs: start\n");
offset = tid * AGTIAPI_CCB_PER_DEVICE;
nsegs = AGTIAPI_NSEGS;
sgl_sz = sizeof(tiSgl_t) * nsegs;
AGTIAPI_PRINTK( "agtiapi_PrepCCBs: tid %d offset %d nsegs %d sizeof(tiSgl_t) "
"%lu, max_ccb %d\n",
tid,
offset,
nsegs,
sizeof(tiSgl_t),
max_ccb );
ccb_sz = roundup2(AGTIAPI_CCB_SIZE, cache_line_size());
hdr_sz = roundup2(sizeof(*hdr), cache_line_size());
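/*
* The allocation is laid out as one cache-line-aligned header followed
* by max_ccb CCBs, each rounded up to the cache line size.
*/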
AGTIAPI_PRINTK("agtiapi_PrepCCBs: after cache line\n");
memset((void *)hdr, 0, size);
hdr->next = pCard->ccbAllocList;
pCard->ccbAllocList = hdr;
AGTIAPI_PRINTK("agtiapi_PrepCCBs: after memset\n");
pccb = (ccb_t*) ((char*)hdr + hdr_sz);
for (i = 0; i < max_ccb; i++, pccb = (ccb_t*)((char*)pccb + ccb_sz))
{
pccb->tiIORequest.osData = (void *)pccb;
/*
* Initially put all the CCBs on the free list as well as on the
* chain list.
* ccbChainList is a list of all CCBs that exist
* (free and active alike).
*/
pccb->pccbChainNext = (pccb_t)pCard->ccbChainList;
pccb->pccbNext = (pccb_t)pCard->ccbFreeList;
pCard->ccbChainList = (caddr_t *)pccb;
pCard->ccbFreeList = (caddr_t *)pccb;
pCard->ccbTotal++;
#ifdef AGTIAPI_ALIGN_CHECK
if (&pccb & 0x63)
AGTIAPI_PRINTK("pccb = %p\n", pccb);
if (pccb->devHandle & 0x63)
AGTIAPI_PRINTK("devHandle addr = %p\n", &pccb->devHandle);
if (&pccb->lun & 0x63)
AGTIAPI_PRINTK("lun addr = %p\n", &pccb->lun);
if (&pccb->targetId & 0x63)
AGTIAPI_PRINTK("tig addr = %p\n", &pccb->targetId);
if (&pccb->ccbStatus & 0x63)
AGTIAPI_PRINTK("ccbStatus addr = %p\n", &pccb->ccbStatus);
if (&pccb->scsiStatus & 0x63)
AGTIAPI_PRINTK("scsiStatus addr = %p\n", &pccb->scsiStatus);
if (&pccb->dataLen & 0x63)
AGTIAPI_PRINTK("dataLen addr = %p\n", &pccb->dataLen);
if (&pccb->senseLen & 0x63)
AGTIAPI_PRINTK("senseLen addr = %p\n", &pccb->senseLen);
if (&pccb->numSgElements & 0x63)
AGTIAPI_PRINTK("numSgElements addr = %p\n", &pccb->numSgElements);
if (&pccb->retryCount & 0x63)
AGTIAPI_PRINTK("retry cnt addr = %p\n", &pccb->retryCount);
if (&pccb->flags & 0x63)
AGTIAPI_PRINTK("flag addr = %p\n", &pccb->flags);
if (&pccb->pSenseData & 0x63)
AGTIAPI_PRINTK("senseData addr = %p\n", &pccb->pSenseData);
if (&pccb->sgList[0] & 0x63)
AGTIAPI_PRINTK("SgList 0 = %p\n", &pccb->sgList[0]);
if (&pccb->pccbNext & 0x63)
AGTIAPI_PRINTK("ccb next = %p\n", &pccb->pccbNext);
if (&pccb->pccbChainNext & 0x63)
AGTIAPI_PRINTK("ccbChainNext = %p\n", &pccb->pccbChainNext);
if (&pccb->cmd & 0x63)
AGTIAPI_PRINTK("command = %p\n", &pccb->cmd);
if( &pccb->startTime & 0x63 )
AGTIAPI_PRINTK( "startTime = %p\n", &pccb->startTime );
if (&pccb->tiIORequest & 0x63)
AGTIAPI_PRINTK("tiIOReq addr = %p\n", &pccb->tiIORequest);
if (&pccb->tdIOReqBody & 0x63)
AGTIAPI_PRINTK("tdIORequestBody addr = %p\n", &pccb->tdIOReqBody);
if (&pccb->tiSuperScsiRequest & 0x63)
AGTIAPI_PRINTK( "InitiatorExchange addr = %p\n",
&pccb->tiSuperScsiRequest );
#endif
if ( bus_dmamap_create( pCard->buffer_dmat, 0, &pccb->CCB_dmamap ) !=
tiSuccess)
{
AGTIAPI_PRINTK("agtiapi_PrepCCBs: can't create dma\n");
return;
}
/* assigns tiSgl_t memory to pccb */
pccb->sgList = (void*)((U64)pCard->tisgl_mem + ((i + offset) * sgl_sz));
pccb->tisgl_busaddr = pCard->tisgl_busaddr + ((i + offset) * sgl_sz);
pccb->ccb = NULL;
pccb->pccbIO = NULL;
pccb->startTime = 0;
}
#ifdef AGTIAPI_ALIGN_CHECK
AGTIAPI_PRINTK("ccb size = %d / %d\n", sizeof(ccb_t), ccb_sz);
#endif
return;
}
/******************************************************************************
agtiapi_InitCCBs()
Purpose:
Create and initialize the per-card CCB pool.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the HBA data structure
int tgtCount (IN) Number of targets
int tid (IN) Target ID, passed through to agtiapi_PrepCCBs()
Return:
Total number of ccb allocated
Note:
******************************************************************************/
STATIC U32 agtiapi_InitCCBs(struct agtiapi_softc *pCard, int tgtCount, int tid)
{
U32 max_ccb, size, ccb_sz, hdr_sz;
int no_allocs = 0, i;
ccb_hdr_t *hdr = NULL;
AGTIAPI_PRINTK("agtiapi_InitCCBs: start\n");
AGTIAPI_PRINTK("agtiapi_InitCCBs: tgtCount %d tid %d\n", tgtCount, tid);
AGTIAPI_FLOW("agtiapi_InitCCBs: tgtCount %d tid %d\n", tgtCount, tid);
#ifndef HOTPLUG_SUPPORT
if (pCard->tgtCount > AGSA_MAX_INBOUND_Q)
return 1;
#else
if (tgtCount > AGSA_MAX_INBOUND_Q)
tgtCount = AGSA_MAX_INBOUND_Q;
#endif
max_ccb = tgtCount * AGTIAPI_CCB_PER_DEVICE;// / 4; // TBR
ccb_sz = roundup2(AGTIAPI_CCB_SIZE, cache_line_size());
hdr_sz = roundup2(sizeof(*hdr), cache_line_size());
size = ccb_sz * max_ccb + hdr_sz;
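/* no_allocs is 0 here, so exactly one block of 'size' bytes is
* allocated and handed to agtiapi_PrepCCBs(). */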
for (i = 0; i < (1 << no_allocs); i++)
{
hdr = (ccb_hdr_t*)malloc( size, M_PMC_MCCB, M_NOWAIT );
if( !hdr )
{
panic( "agtiapi_InitCCBs: bug!!!\n" );
}
else
{
agtiapi_PrepCCBs( pCard, hdr, size, max_ccb, tid );
}
}
return 1;
}
#ifdef LINUX_PERBI_SUPPORT
/******************************************************************************
agtiapi_GetWWNMappings()
Purpose:
Get the mappings from target IDs to WWNs, if any.
Store them in the WWN_list array, indexed by target ID.
Leave the devListIndex field blank; this will be filled in later.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to HBA data structure
ag_mapping_t *pMapList (IN) Pointer to mapped device list
Return:
Note: The boot command line parameters are used to load the
mapping information, which is contained in the system
configuration file.
******************************************************************************/
STATIC void agtiapi_GetWWNMappings( struct agtiapi_softc *pCard,
ag_mapping_t *pMapList )
{
int devDisc;
int lIdx = 0;
ag_tgt_map_t *pWWNList;
ag_slr_map_t *pSLRList;
ag_device_t *pDevList;
if( !pCard )
panic( "agtiapi_GetWWNMappings: no pCard \n" );
AGTIAPI_PRINTK( "agtiapi_GetWWNMappings: start\n" );
pWWNList = pCard->pWWNList;
pSLRList = pCard->pSLRList;
pDevList = pCard->pDevList;
pCard->numTgtHardMapped = 0;
devDisc = pCard->devDiscover;
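/* Mark the final entry of each list with sentinel values. */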
pWWNList[devDisc-1].devListIndex = maxTargets;
pSLRList[devDisc-1].localeNameLen = -2;
pSLRList[devDisc-1].remoteNameLen = -2;
pDevList[devDisc-1].targetId = maxTargets;
/*
* Get the mappings from holding area which contains
* the input of the system file and store them
* in the WWN_list array, indexed by target ID.
*/
for ( lIdx = 0; lIdx < devDisc - 1; lIdx++) {
pWWNList[lIdx].flags = 0;
pWWNList[lIdx].devListIndex = maxTargets;
pSLRList[lIdx].localeNameLen = -1;
pSLRList[lIdx].remoteNameLen = -1;
}
// this is where we would propagate values fed to pMapList
} /* agtiapi_GetWWNMappings */
#endif
/******************************************************************************
agtiapi_FindWWNListNext()
Purpose:
Finds the first available (unused) WWN list entry.
Parameters:
ag_tgt_map_t *pWWNList Pointer to head of wwn list
int lstMax Number of entries in WWNList
Return:
Index into WWNList of the available entry;
a negative value if no available entry is found
******************************************************************************/
STATIC int agtiapi_FindWWNListNext( ag_tgt_map_t *pWWNList, int lstMax )
{
int lLstIdx;
for ( lLstIdx = 0; lLstIdx < lstMax; lLstIdx++ )
{
if ( pWWNList[lLstIdx].devListIndex == lstMax &&
pWWNList[lLstIdx].targetLen == 0 )
{
AGTIAPI_PRINTK( "agtiapi_FindWWNListNext: %d %d %d %d v. %d\n",
lLstIdx,
pWWNList[lLstIdx].devListIndex,
pWWNList[lLstIdx].targetLen,
pWWNList[lLstIdx].portId,
lstMax );
return lLstIdx;
}
}
return -1;
}
/******************************************************************************
agtiapi_GetDevHandle()
Purpose:
Get device handles. Handles are placed in the devlist array in the
same order as the target list provided, and will later be mapped to
SCSI target IDs and registered with the OS.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the HBA data structure
ag_portal_info_t *pPortalInfo (IN) Pointer to the portal data structure
U32 eType (IN) Port event
U32 eStatus (IN) Port event status
Return:
Number of device handle slot present
Note:
The sequence of device handles will match the sequence of the target list
******************************************************************************/
STATIC U32 agtiapi_GetDevHandle( struct agtiapi_softc *pCard,
ag_portal_info_t *pPortalInfo,
U32 eType,
U32 eStatus )
{
ag_device_t *pDevice;
// tiDeviceHandle_t *agDev[pCard->devDiscover];
tiDeviceHandle_t **agDev;
int devIdx, szdv, devTotal, cmpsetRtn;
int lDevIndex = 0, lRunScanFlag = FALSE;
int *lDevFlags;
tiPortInfo_t portInfT;
ag_device_t lTmpDevice;
ag_tgt_map_t *pWWNList;
ag_slr_map_t *pSLRList;
bit32 lReadRm;
bit16 lReadCt;
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: start\n" );
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: pCard->devDiscover %d / tgtCt %d\n",
pCard->devDiscover, pCard->tgtCount );
AGTIAPI_FLOW( "agtiapi_GetDevHandle: portalInfo %p\n", pPortalInfo );
AGTIAPI_INIT_DELAY( 1000 );
agDev = (tiDeviceHandle_t **) malloc( sizeof(tiDeviceHandle_t *) * pCard->devDiscover,
M_PMC_MDEV, M_ZERO | M_NOWAIT);
if (agDev == NULL)
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: failed to alloc agDev[]\n" );
return 0;
}
lDevFlags = (int *) malloc( sizeof(int) * pCard->devDiscover,
M_PMC_MFLG, M_ZERO | M_NOWAIT );
if (lDevFlags == NULL)
{
free((caddr_t)agDev, M_PMC_MDEV);
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: failed to alloc lDevFlags[]\n" );
return 0;
}
pWWNList = pCard->pWWNList;
pSLRList = pCard->pSLRList;
memset( (void *)agDev, 0, sizeof(void *) * pCard->devDiscover );
memset( lDevFlags, 0, sizeof(int) * pCard->devDiscover );
// get device handles
devTotal = tiINIGetDeviceHandles( &pCard->tiRoot,
&pPortalInfo->tiPortalContext,
(tiDeviceHandle_t **)agDev,
pCard->devDiscover );
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: portalInfo %p port id %d event %u "
"status %u card %p pCard->devDiscover %d devTotal %d "
"pPortalInfo->devTotal %d pPortalInfo->devPrev %d "
"AGTIAPI_INIT_TIME %x\n",
pPortalInfo, pPortalInfo->portID, eType, eStatus, pCard,
pCard->devDiscover, devTotal, pPortalInfo->devTotal,
pPortalInfo->devPrev,
pCard->flags & AGTIAPI_INIT_TIME );
// reset devTotal from any previous runs of this
pPortalInfo->devPrev = devTotal;
pPortalInfo->devTotal = devTotal;
AG_LIST_LOCK( &pCard->devListLock );
if ( tiCOMGetPortInfo( &pCard->tiRoot,
&pPortalInfo->tiPortalContext,
&portInfT )
!= tiSuccess)
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: tiCOMGetPortInfo did not succeed. \n" );
}
szdv = sizeof( pPortalInfo->pDevList ) / sizeof( pPortalInfo->pDevList[0] );
if (szdv > pCard->devDiscover)
{
szdv = pCard->devDiscover;
}
// reconstructing dev list via comparison of wwn
for ( devIdx = 0; devIdx < pCard->devDiscover; devIdx++ )
{
if ( agDev[devIdx] != NULL )
{
// AGTIAPI_PRINTK( "agtiapi_GetDevHandle: agDev %d not NULL %p\n",
// devIdx, agDev[devIdx] );
// pack temp device structure for tiINIGetDeviceInfo call
pDevice = &lTmpDevice;
pDevice->devType = DIRECT_DEVICE;
pDevice->pCard = (void *)pCard;
pDevice->flags = ACTIVE;
pDevice->pPortalInfo = pPortalInfo;
pDevice->pDevHandle = agDev[devIdx];
pDevice->qbusy = agFALSE;
//AGTIAPI_PRINTK( "agtiapi_GetDevHandle: idx %d / %d : %p \n",
// devIdx, pCard->devDiscover, agDev[devIdx] );
tiINIGetDeviceInfo( &pCard->tiRoot, agDev[devIdx],
&pDevice->devInfo );
//AGTIAPI_PRINTK( "agtiapi_GetDevHandle: wwn sizes %ld %d/%d ",
// sizeof(pDevice->targetName),
// pDevice->devInfo.osAddress1,
// pDevice->devInfo.osAddress2 );
wwncpy( pDevice );
wwnprintk( (unsigned char*)pDevice->targetName, pDevice->targetLen );
for ( lDevIndex = 0; lDevIndex < szdv; lDevIndex++ ) // match w/ wwn list
{
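/*
* A device matches an existing WWN list entry only when the portal ID,
* the target name, and the recorded local/remote SAS names (lengths and
* contents) all match what the port reports now.
*/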
if ( (pCard->pDevList[lDevIndex].portalId == pPortalInfo->portID) &&
pDevice->targetLen > 0 &&
portInfT.localNameLen > 0 &&
portInfT.remoteNameLen > 0 &&
pSLRList[pWWNList[lDevIndex].sasLrIdx].localeNameLen > 0 &&
pSLRList[pWWNList[lDevIndex].sasLrIdx].remoteNameLen > 0 &&
( portInfT.localNameLen ==
pSLRList[pWWNList[lDevIndex].sasLrIdx].localeNameLen ) &&
( portInfT.remoteNameLen ==
pSLRList[pWWNList[lDevIndex].sasLrIdx].remoteNameLen ) &&
memcmp( pWWNList[lDevIndex].targetName, pDevice->targetName,
pDevice->targetLen ) == 0 &&
memcmp( pSLRList[pWWNList[lDevIndex].sasLrIdx].localeName,
portInfT.localName,
portInfT.localNameLen ) == 0 &&
memcmp( pSLRList[pWWNList[lDevIndex].sasLrIdx].remoteName,
portInfT.remoteName,
portInfT.remoteNameLen ) == 0 )
{
AGTIAPI_PRINTK( " pWWNList match @ %d/%d/%d \n",
lDevIndex, devIdx, pPortalInfo->portID );
if ( (pCard->pDevList[lDevIndex].targetId == lDevIndex) &&
( pPortalInfo->pDevList[lDevIndex] ==
&pCard->pDevList[lDevIndex] ) ) // active
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: dev in use %d of %d/%d\n",
lDevIndex, devTotal, pPortalInfo->portID );
lDevFlags[devIdx] |= DPMC_LEANFLAG_AGDEVUSED; // agDev handle
lDevFlags[lDevIndex] |= DPMC_LEANFLAG_PDEVSUSED; // pDevice used
lReadRm = atomic_readandclear_32( &pWWNList[lDevIndex].devRemoved );
if ( lReadRm ) // cleared timeout, now remove count for timer
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: clear timer count for"
" %d of %d\n",
lDevIndex, pPortalInfo->portID );
atomic_subtract_16( &pCard->rmChkCt, 1 );
lReadCt = atomic_load_acq_16( &pCard->rmChkCt );
if ( 0 == lReadCt )
{
callout_stop( &pCard->devRmTimer );
}
}
break;
}
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: goin fresh on %d of %d/%d\n",
lDevIndex, // reactivate now
devTotal, pPortalInfo->portID );
// pDevice going fresh
lRunScanFlag = TRUE; // scan and clear outstanding removals
// pCard->tgtCount++; ##
pDevice->targetId = lDevIndex;
pDevice->portalId = pPortalInfo->portID;
memcpy ( &pCard->pDevList[lDevIndex], pDevice, sizeof(lTmpDevice) );
agDev[devIdx]->osData = (void *)&pCard->pDevList[lDevIndex];
if ( agtiapi_InitCCBs( pCard, 1, pDevice->targetId ) == 0 )
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: InitCCB "
"tgtCnt %d ERROR!\n", pCard->tgtCount );
AG_LIST_UNLOCK( &pCard->devListLock );
free((caddr_t)lDevFlags, M_PMC_MFLG);
free((caddr_t)agDev, M_PMC_MDEV);
return 0;
}
pPortalInfo->pDevList[lDevIndex] = &pCard->pDevList[lDevIndex]; // (ag_device_t *)
if ( 0 == lDevFlags[devIdx] )
{
pPortalInfo->devTotal++;
lDevFlags[devIdx] |= DPMC_LEANFLAG_AGDEVUSED; // agDev used
lDevFlags[lDevIndex] |= DPMC_LEANFLAG_PDEVSUSED; // pDevice used
}
else
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: odd dev handle "
"status inspect %d %d %d\n",
lDevFlags[devIdx], devIdx, lDevIndex );
pPortalInfo->devTotal++;
lDevFlags[devIdx] |= DPMC_LEANFLAG_AGDEVUSED; // agDev used
lDevFlags[lDevIndex] |= DPMC_LEANFLAG_PDEVSUSED; // pDevice used
}
break;
}
}
// end: match this wwn with previous wwn list
// we have an agDev entry, but no pWWNList target for it
if ( !(lDevFlags[devIdx] & DPMC_LEANFLAG_AGDEVUSED) )
{ // flag dev handle not accounted for yet
lDevFlags[devIdx] |= DPMC_LEANFLAG_NOWWNLIST;
// later, get an empty pDevice and map this agDev.
// AGTIAPI_PRINTK( "agtiapi_GetDevHandle: devIdx %d flags 0x%x, %d\n",
// devIdx, lDevFlags[devIdx], (lDevFlags[devIdx] & 8) );
}
}
else
{
lDevFlags[devIdx] |= DPMC_LEANFLAG_NOAGDEVYT; // known empty agDev handle
}
}
// AGTIAPI_PRINTK( "agtiapi_GetDevHandle: all WWN all the time, "
// "devLstIdx/flags/(WWNL)portId ... \n" );
// review device list for further action needed
for ( devIdx = 0; devIdx < pCard->devDiscover; devIdx++ )
{
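/*
* Second pass: register brand-new targets (agDev handles with no WWN
* list entry) and arm the removal timer for devices that dropped out.
*/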
if ( lDevFlags[devIdx] & DPMC_LEANFLAG_NOWWNLIST ) // new target, register
{
int lNextDyad; // find next available dyad entry
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: register new target, "
"devIdx %d -- %d \n", devIdx, pCard->devDiscover );
lRunScanFlag = TRUE; // scan and clear outstanding removals
for ( lNextDyad = 0; lNextDyad < pCard->devDiscover; lNextDyad++ )
{
if ( pSLRList[lNextDyad].localeNameLen < 0 &&
pSLRList[lNextDyad].remoteNameLen < 0 )
break;
}
if ( lNextDyad == pCard->devDiscover )
{
printf( "agtiapi_GetDevHandle: failed to find available SAS LR\n" );
AG_LIST_UNLOCK( &pCard->devListLock );
free( (caddr_t)lDevFlags, M_PMC_MFLG );
free( (caddr_t)agDev, M_PMC_MDEV );
return 0;
}
// index of new entry
lDevIndex = agtiapi_FindWWNListNext( pWWNList, pCard->devDiscover );
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: listIdx new target %d of %d/%d\n",
lDevIndex, devTotal, pPortalInfo->portID );
if ( 0 > lDevIndex )
{
printf( "agtiapi_GetDevHandle: WARNING -- WWNList exhausted.\n" );
continue;
}
pDevice = &pCard->pDevList[lDevIndex];
tiINIGetDeviceInfo( &pCard->tiRoot, agDev[devIdx], &pDevice->devInfo );
wwncpy( pDevice );
agtiapi_InitCCBs( pCard, 1, lDevIndex );
pDevice->pCard = (void *)pCard;
pDevice->devType = DIRECT_DEVICE;
// begin to populate new WWNList entry
memcpy( pWWNList[lDevIndex].targetName, pDevice->targetName, pDevice->targetLen );
pWWNList[lDevIndex].targetLen = pDevice->targetLen;
pWWNList[lDevIndex].flags = SOFT_MAPPED;
pWWNList[lDevIndex].portId = pPortalInfo->portID;
pWWNList[lDevIndex].devListIndex = lDevIndex;
pWWNList[lDevIndex].sasLrIdx = lNextDyad;
pSLRList[lNextDyad].localeNameLen = portInfT.localNameLen;
pSLRList[lNextDyad].remoteNameLen = portInfT.remoteNameLen;
memcpy( pSLRList[lNextDyad].localeName, portInfT.localName, portInfT.localNameLen );
memcpy( pSLRList[lNextDyad].remoteName, portInfT.remoteName, portInfT.remoteNameLen );
// end of populating new WWNList entry
pDevice->targetId = lDevIndex;
pDevice->flags = ACTIVE;
pDevice->CCBCount = 0;
pDevice->pDevHandle = agDev[devIdx];
agDev[devIdx]->osData = (void*)pDevice;
pDevice->pPortalInfo = pPortalInfo;
pDevice->portalId = pPortalInfo->portID;
pPortalInfo->pDevList[lDevIndex] = (void*)pDevice;
lDevFlags[lDevIndex] |= DPMC_LEANFLAG_PDEVSUSED; // mark pDevice slot used
}
if ( (pCard->pDevList[devIdx].portalId == pPortalInfo->portID) &&
!(lDevFlags[devIdx] & DPMC_LEANFLAG_PDEVSUSED) ) // pDevice not used
{
pDevice = &pCard->pDevList[devIdx];
//pDevice->flags &= ~ACTIVE;
if ( ( pDevice->pDevHandle != NULL ||
pPortalInfo->pDevList[devIdx] != NULL ) )
{
atomic_add_16( &pCard->rmChkCt, 1 ); // show count of lost device
if (FALSE == lRunScanFlag)
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: targ dropped out %d of %d/%d\n",
devIdx, devTotal, pPortalInfo->portID );
// if ( 0 == pWWNList[devIdx].devRemoved ) '.devRemoved = 5;
cmpsetRtn = atomic_cmpset_32( &pWWNList[devIdx].devRemoved, 0, 5 );
if ( 0 == cmpsetRtn )
{
AGTIAPI_PRINTK( "agtiapi_GetDevHandle: target %d timer already set\n",
devIdx );
}
else
{
callout_reset( &pCard->devRmTimer, 1 * hz, agtiapi_devRmCheck, pCard );
}
}
// else ... scan coming soon enough anyway, ignore timer for dropout
}
}
} // end of for ( devIdx = 0; ...
AG_LIST_UNLOCK( &pCard->devListLock );
free((caddr_t)lDevFlags, M_PMC_MFLG);
free((caddr_t)agDev, M_PMC_MDEV);
if ( TRUE == lRunScanFlag )
agtiapi_clrRmScan( pCard );
return devTotal;
} // end agtiapi_GetDevHandle
/******************************************************************************
agtiapi_scan()
Purpose:
Triggers CAM's scan
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the HBA data structure
Return:
Note:
******************************************************************************/
static void agtiapi_scan(struct agtiapi_softc *pmcsc)
{
union ccb *ccb;
int bus, tid, lun;
AGTIAPI_PRINTK("agtiapi_scan: start cardNO %d \n", pmcsc->cardNo);
bus = cam_sim_path(pmcsc->sim);
tid = CAM_TARGET_WILDCARD;
lun = CAM_LUN_WILDCARD;
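/* Queue a wildcard rescan of the whole SIM bus so CAM re-probes all
* targets and LUNs. */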
mtx_lock(&(pmcsc->pCardInfo->pmIOLock));
ccb = xpt_alloc_ccb_nowait();
if (ccb == agNULL)
{
mtx_unlock(&(pmcsc->pCardInfo->pmIOLock));
return;
}
if (xpt_create_path(&ccb->ccb_h.path, agNULL, bus, tid,
CAM_LUN_WILDCARD) != CAM_REQ_CMP)
{
mtx_unlock(&(pmcsc->pCardInfo->pmIOLock));
xpt_free_ccb(ccb);
return;
}
mtx_unlock(&(pmcsc->pCardInfo->pmIOLock));
pmcsc->dev_scan = agTRUE;
xpt_rescan(ccb);
return;
}
/******************************************************************************
agtiapi_DeQueueCCB()
Purpose:
Remove a ccb from a queue
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to the card structure
pccb_t *phead (IN) Pointer to the head of the ccb queue
pccb_t *ptail (IN) Pointer to the tail of the ccb queue
ccb_t *pccb (IN) Pointer to the ccb to be removed
Return:
AGTIAPI_SUCCESS - the ccb was removed from the queue
AGTIAPI_FAIL - the ccb was not found in the queue
Note:
******************************************************************************/
STATIC agBOOLEAN
agtiapi_DeQueueCCB(struct agtiapi_softc *pCard, pccb_t *phead, pccb_t *ptail,
#ifdef AGTIAPI_LOCAL_LOCK
struct mtx *lock,
#endif
ccb_t *pccb)
{
ccb_t *pccb_curr;
U32 status = AGTIAPI_FAIL;
AGTIAPI_PRINTK("agtiapi_DeQueueCCB: %p from %p\n", pccb, phead);
if (pccb == NULL || *phead == NULL)
{
return AGTIAPI_FAIL;
}
AGTIAPI_PRINTK("agtiapi_DeQueueCCB: %p from %p\n", pccb, phead);
AG_LOCAL_LOCK(lock);
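/* Unlink pccb: either it is the head of the queue, or walk the singly
* linked list until the node pointing at it is found. */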
if (pccb == *phead)
{
*phead = (*phead)->pccbNext;
if (pccb == *ptail)
{
*ptail = NULL;
}
else
pccb->pccbNext = NULL;
status = AGTIAPI_SUCCESS;
}
else
{
pccb_curr = *phead;
while (pccb_curr->pccbNext != NULL)
{
if (pccb_curr->pccbNext == pccb)
{
pccb_curr->pccbNext = pccb->pccbNext;
pccb->pccbNext = NULL;
if (pccb == *ptail)
{
*ptail = pccb_curr;
}
else
pccb->pccbNext = NULL;
status = AGTIAPI_SUCCESS;
break;
}
pccb_curr = pccb_curr->pccbNext;
}
}
AG_LOCAL_UNLOCK(lock);
return status;
}
STATIC void wwnprintk( unsigned char *name, int len )
{
int i;
for (i = 0; i < len; i++, name++)
AGTIAPI_PRINTK("%02x", *name);
AGTIAPI_PRINTK("\n");
}
/*
* SAS and SATA devices behind an expander have an 8-byte unique address.
* However, direct-connect SATA devices use a 512-byte unique device id.
* SPC uses remoteName to indicate the length of the ID and remoteAddress
* for the address of the memory holding the ID.
*/
STATIC int wwncpy( ag_device_t *pDevice )
{
int rc = 0;
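/*
* The target name is the concatenation of remoteName (osAddress1 bytes)
* and remoteAddress (osAddress2 bytes); fail if the combined length
* would not fit in targetName.
*/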
if (sizeof(pDevice->targetName) >= pDevice->devInfo.osAddress1 +
pDevice->devInfo.osAddress2)
{
memcpy(pDevice->targetName,
pDevice->devInfo.remoteName,
pDevice->devInfo.osAddress1);
memcpy(pDevice->targetName + pDevice->devInfo.osAddress1,
pDevice->devInfo.remoteAddress,
pDevice->devInfo.osAddress2);
pDevice->targetLen = pDevice->devInfo.osAddress1 +
pDevice->devInfo.osAddress2;
rc = pDevice->targetLen;
}
else
{
AGTIAPI_PRINTK("WWN wrong size: %d + %d ERROR\n",
pDevice->devInfo.osAddress1, pDevice->devInfo.osAddress2);
rc = -1;
}
return rc;
}
/******************************************************************************
agtiapi_ReleaseCCBs()
Purpose:
Free all allocated CCB memories for the Host Adapter.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to HBA data structure
Return:
Note:
******************************************************************************/
STATIC void agtiapi_ReleaseCCBs( struct agtiapi_softc *pCard )
{
ccb_hdr_t *hdr;
U32 hdr_sz;
ccb_t *pccb = NULL;
AGTIAPI_PRINTK( "agtiapi_ReleaseCCBs: start\n" );
#ifdef AGTIAPI_TEST_DPL
for (pccb = (pccb_t)pCard->ccbChainList; pccb != NULL;
pccb = pccb->pccbChainNext)
{
if(pccb->dplPtr && pccb->dplDma)
pci_pool_free(pCard->dpl_ctx_pool, pccb->dplPtr, pccb->dplDma);
}
#endif
#ifdef AGTIAPI_TEST_EPL
for (pccb = (pccb_t)pCard->ccbChainList; pccb != NULL;
pccb = pccb->pccbChainNext)
{
if(pccb->epl_ptr && pccb->epl_dma_ptr)
pci_pool_free(
pCard->epl_ctx_pool,
pccb->epl_ptr,
pccb->epl_dma_ptr
);
}
#endif
while ((hdr = pCard->ccbAllocList) != NULL)
{
pCard->ccbAllocList = hdr->next;
hdr_sz = roundup2(sizeof(*hdr), cache_line_size());
pccb = (ccb_t*) ((char*)hdr + hdr_sz);
if (pCard->buffer_dmat != NULL && pccb->CCB_dmamap != NULL)
{
bus_dmamap_destroy(pCard->buffer_dmat, pccb->CCB_dmamap);
}
free(hdr, M_PMC_MCCB);
}
pCard->ccbAllocList = NULL;
return;
}
/******************************************************************************
agtiapi_TITimer()
Purpose:
Timer tick for tisa common layer
Parameters:
void *data (IN) Pointer to the HBA data structure
Return:
Note:
******************************************************************************/
STATIC void agtiapi_TITimer( void *data )
{
U32 next_tick;
struct agtiapi_softc *pCard;
pCard = (struct agtiapi_softc *)data;
// AGTIAPI_PRINTK("agtiapi_TITimer: start\n");
AG_GLOBAL_ARG( flags );
next_tick = pCard->pCardInfo->tiRscInfo.tiLoLevelResource.
loLevelOption.usecsPerTick / USEC_PER_TICK;
if( next_tick == 0 ) /* no timer required */
return;
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, flags );
if( pCard->flags & AGTIAPI_SHUT_DOWN )
goto ext;
tiCOMTimerTick( &pCard->tiRoot ); /* tisa common layer timer tick */
//add for polling mode
#ifdef PMC_SPC
if( agtiapi_polling_mode )
agtiapi_CheckAllVectors( pCard, tiNonInterruptContext );
#endif
callout_reset( &pCard->OS_timer, next_tick, agtiapi_TITimer, pCard );
ext:
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
return;
}
/******************************************************************************
agtiapi_clrRmScan()
Purpose:
Clears device list entries scheduled for timeout and calls scan
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to HBA data structure
******************************************************************************/
STATIC void agtiapi_clrRmScan( struct agtiapi_softc *pCard )
{
ag_tgt_map_t *pWWNList;
ag_portal_info_t *pPortalInfo;
ag_portal_data_t *pPortalData;
int lIdx;
bit32 lReadRm;
bit16 lReadCt;
pWWNList = pCard->pWWNList;
AGTIAPI_PRINTK( "agtiapi_clrRmScan: start\n" );
AG_LIST_LOCK( &pCard->devListLock );
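/* Deactivate every device that still has a pending removal countdown,
* then trigger a CAM rescan so the stale targets are dropped. */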
for ( lIdx = 0; lIdx < pCard->devDiscover; lIdx++ )
{
lReadCt = atomic_load_acq_16( &pCard->rmChkCt );
if ( 0 == lReadCt )
{
break; // no removal counts remain, nothing more to do
}
lReadRm = atomic_readandclear_32( &pWWNList[lIdx].devRemoved );
if ( lReadRm > 0 )
{
pCard->pDevList[lIdx].flags &= ~ACTIVE;
pCard->pDevList[lIdx].pDevHandle = NULL;
pPortalData = &pCard->pPortalData[pWWNList[lIdx].portId];
pPortalInfo = &pPortalData->portalInfo;
pPortalInfo->pDevList[lIdx] = NULL;
AGTIAPI_PRINTK( "agtiapi_clrRmScan: cleared dev %d at port %d\n",
lIdx, pWWNList[lIdx].portId );
atomic_subtract_16( &pCard->rmChkCt, 1 );
}
}
AG_LIST_UNLOCK( &pCard->devListLock );
agtiapi_scan( pCard );
}
/******************************************************************************
agtiapi_devRmCheck()
Purpose:
Timer tick to check for timeout on missing targets
Removes device list entry when timeout is reached
Parameters:
void *data (IN) Pointer to the HBA data structure
******************************************************************************/
STATIC void agtiapi_devRmCheck( void *data )
{
struct agtiapi_softc *pCard;
ag_tgt_map_t *pWWNList;
int lIdx, cmpsetRtn, lRunScanFlag = FALSE;
bit16 lReadCt;
bit32 lReadRm;
pCard = ( struct agtiapi_softc * )data;
// routine overhead
if ( callout_pending( &pCard->devRmTimer ) ) // callout was reset
{
return;
}
if ( !callout_active( &pCard->devRmTimer ) ) // callout was stopped
{
return;
}
callout_deactivate( &pCard->devRmTimer );
if( pCard->flags & AGTIAPI_SHUT_DOWN )
{
return; // implicit timer clear
}
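/*
* Each missing target carries a devRemoved countdown. If any entry has
* reached 1, clean up via agtiapi_clrRmScan(); otherwise decrement the
* counters and leave the timer re-armed for the next tick.
*/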
pWWNList = pCard->pWWNList;
AG_LIST_LOCK( &pCard->devListLock );
lReadCt = atomic_load_acq_16( &pCard->rmChkCt );
if ( lReadCt )
{
if ( callout_pending(&pCard->devRmTimer) == FALSE )
{
callout_reset( &pCard->devRmTimer, 1 * hz, agtiapi_devRmCheck, pCard );
}
else
{
AG_LIST_UNLOCK( &pCard->devListLock );
return;
}
for ( lIdx = 0; lIdx < pCard->devDiscover; lIdx++ )
{
lReadCt = atomic_load_acq_16( &pCard->rmChkCt );
if ( 0 == lReadCt )
{
break; // if handled somewhere else, get out
}
lReadRm = atomic_load_acq_32( &pWWNList[lIdx].devRemoved );
if ( lReadRm > 0 )
{
if ( 1 == lReadRm ) // timed out
{ // no decrement of devRemoved as way to leave a clrRmScan marker
lRunScanFlag = TRUE; // other devRemoved values are about to get wiped
break; // ... so bail out
}
else
{
AGTIAPI_PRINTK( "agtiapi_devRmCheck: counting down dev %d @ %d; %d\n",
lIdx, lReadRm, lReadCt );
cmpsetRtn = atomic_cmpset_32( &pWWNList[lIdx].devRemoved,
lReadRm,
lReadRm-1 );
if ( 0 == cmpsetRtn )
{
printf( "agtiapi_devRmCheck: %d decrement already handled\n",
lIdx );
}
}
}
}
AG_LIST_UNLOCK( &pCard->devListLock );
if ( TRUE == lRunScanFlag )
agtiapi_clrRmScan( pCard );
}
else
{
AG_LIST_UNLOCK( &pCard->devListLock );
}
return;
}
static void agtiapi_cam_poll( struct cam_sim *asim )
{
return;
}
/*****************************************************************************
agtiapi_ResetCard()
Purpose:
Hard or soft reset on the controller and resend any
outstanding requests if needed.
Parameters:
struct agtiapi_softc *pCard (IN) Pointer to HBA data structure
unsigned long *flags (IN/OUT) Flags used in locking done from calling layers
Return:
AGTIAPI_SUCCESS - reset successful
AGTIAPI_FAIL - reset failed
Note:
*****************************************************************************/
U32 agtiapi_ResetCard( struct agtiapi_softc *pCard, unsigned long *flags )
{
ag_device_t *pDevice;
U32 lIdx = 0;
U32 lFlagVal;
agBOOLEAN ret;
ag_portal_info_t *pPortalInfo;
ag_portal_data_t *pPortalData;
U32 count, loop;
int szdv;
if( pCard->flags & AGTIAPI_RESET ) {
AGTIAPI_PRINTK( "agtiapi_ResetCard: reset card already in progress!\n" );
return AGTIAPI_FAIL;
}
AGTIAPI_PRINTK( "agtiapi_ResetCard: Enter cnt %d\n",
pCard->resetCount );
#ifdef LOGEVENT
agtiapi_LogEvent( pCard,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"Reset initiator time = %d!",
pCard->resetCount + 1 );
#endif
pCard->flags |= AGTIAPI_RESET;
pCard->flags &= ~(AGTIAPI_CB_DONE | AGTIAPI_RESET_SUCCESS);
tiCOMSystemInterruptsActive( &pCard->tiRoot, FALSE );
pCard->flags &= ~AGTIAPI_SYS_INTR_ON;
agtiapi_FlushCCBs( pCard, AGTIAPI_CALLBACK );
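/* Attempt the reset up to three times, waiting for the common layer to
* report AGTIAPI_RESET_SUCCESS after each try. */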
for ( lIdx = 1; 3 >= lIdx; lIdx++ ) // we try reset up to 3 times
{
if( pCard->flags & AGTIAPI_SOFT_RESET )
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: soft variant\n" );
tiCOMReset( &pCard->tiRoot, tiSoftReset );
}
else
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: no flag, no reset!\n" );
}
lFlagVal = AGTIAPI_RESET_SUCCESS;
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, *flags );
ret = agtiapi_CheckCB( pCard, 50000, lFlagVal, &pCard->flags );
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, *flags );
if( ret == AGTIAPI_FAIL )
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: CheckCB indicates failed reset call, "
"try again?\n" );
}
else
{
break;
}
}
if ( 1 < lIdx )
{
if ( AGTIAPI_FAIL == ret )
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: soft reset failed after try %d\n",
lIdx );
}
else
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: soft reset success at try %d\n",
lIdx );
}
}
if( AGTIAPI_FAIL == ret )
{
printf( "agtiapi_ResetCard: reset ERROR\n" );
pCard->flags &= ~AGTIAPI_INSTALLED;
return AGTIAPI_FAIL;
}
pCard->flags &= ~AGTIAPI_SOFT_RESET;
// disable all devices
pDevice = pCard->pDevList;
for( lIdx = 0; lIdx < maxTargets; lIdx++, pDevice++ )
{
/* if ( pDevice->flags & ACTIVE )
{
printf( "agtiapi_ResetCard: before ... active device %d\n", lIdx );
} */
pDevice->flags &= ~ACTIVE;
}
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, *flags );
if( tiCOMPortInit( &pCard->tiRoot, agFALSE ) != tiSuccess )
printf( "agtiapi_ResetCard: tiCOMPortInit FAILED \n" );
else
AGTIAPI_PRINTK( "agtiapi_ResetCard: tiCOMPortInit success\n" );
if( !pCard->pDevList ) { // try to get a little sanity here
AGTIAPI_PRINTK( "agtiapi_ResetCard: no pDevList ERROR %p\n",
pCard->pDevList );
return AGTIAPI_FAIL;
}
AGTIAPI_PRINTK( "agtiapi_ResetCard: pre target-count %d port-count %d\n",
pCard->tgtCount, pCard->portCount );
pCard->tgtCount = 0;
DELAY( 500000 );
pCard->flags &= ~AGTIAPI_CB_DONE;
pPortalData = pCard->pPortalData;
for( count = 0; count < pCard->portCount; count++ ) {
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, flags );
pPortalInfo = &pPortalData->portalInfo;
pPortalInfo->portStatus = 0;
pPortalInfo->portStatus &= ~( AGTIAPI_PORT_START |
AGTIAPI_PORT_DISC_READY |
AGTIAPI_DISC_DONE |
AGTIAPI_DISC_COMPLETE );
szdv =
sizeof( pPortalInfo->pDevList ) / sizeof( pPortalInfo->pDevList[0] );
if (szdv > pCard->devDiscover)
{
szdv = pCard->devDiscover;
}
for( lIdx = 0, loop = 0;
lIdx < szdv && loop < pPortalInfo->devTotal;
lIdx++ )
{
pDevice = (ag_device_t*)pPortalInfo->pDevList[lIdx];
if( pDevice )
{
loop++;
pDevice->pDevHandle = 0; // mark for availability in pCard->pDevList[]
// don't erase more as the device is scheduled for removal on DPC
}
AGTIAPI_PRINTK( "agtiapi_ResetCard: reset pDev %p pDevList %p idx %d\n",
pDevice, pPortalInfo->pDevList, lIdx );
pPortalInfo->devTotal = pPortalInfo->devPrev = 0;
}
for( lIdx = 0; lIdx < maxTargets; lIdx++ )
{ // we reconstruct dev list later in get dev handle
pPortalInfo->pDevList[lIdx] = NULL;
}
for( loop = 0; loop < AGTIAPI_LOOP_MAX; loop++ )
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: tiCOMPortStart entry data "
"%p / %d / %p\n",
&pCard->tiRoot,
pPortalInfo->portID,
&pPortalInfo->tiPortalContext );
if( tiCOMPortStart( &pCard->tiRoot,
pPortalInfo->portID,
&pPortalInfo->tiPortalContext,
0 )
!= tiSuccess )
{
printf( "agtiapi_ResetCard: tiCOMPortStart %d FAILED\n",
pPortalInfo->portID );
}
else
{
AGTIAPI_PRINTK( "agtiapi_ResetCard: tiCOMPortStart %d success\n",
pPortalInfo->portID );
break;
}
}
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
tiCOMGetPortInfo( &pCard->tiRoot,
&pPortalInfo->tiPortalContext,
&pPortalInfo->tiPortInfo );
pPortalData++;
}
// ## fail case: pCard->flags &= ~AGTIAPI_INSTALLED;
AG_SPIN_LOCK_IRQ(agtiapi_host_lock, *flags);
if( !(pCard->flags & AGTIAPI_INSTALLED) ) // driver not installed !
{
printf( "agtiapi_ResetCard: error, driver not intstalled? "
"!AGTIAPI_INSTALLED \n" );
return AGTIAPI_FAIL;
}
AGTIAPI_PRINTK( "agtiapi_ResetCard: total device %d\n", pCard->tgtCount );
#ifdef LOGEVENT
agtiapi_LogEvent( pCard,
IOCTL_EVT_SEV_INFORMATIONAL,
0,
agNULL,
0,
"Reset initiator total device = %d!",
pCard->tgtCount );
#endif
pCard->resetCount++;
AGTIAPI_PRINTK( "agtiapi_ResetCard: clear send and done queues\n" );
// clear send & done queue
AG_LOCAL_LOCK( &pCard->sendLock );
pCard->ccbSendHead = NULL;
pCard->ccbSendTail = NULL;
AG_LOCAL_UNLOCK( &pCard->sendLock );
AG_LOCAL_LOCK( &pCard->doneLock );
pCard->ccbDoneHead = NULL;
pCard->ccbDoneTail = NULL;
AG_LOCAL_UNLOCK( &pCard->doneLock );
// clear smp queues also
AG_LOCAL_LOCK( &pCard->sendSMPLock );
pCard->smpSendHead = NULL;
pCard->smpSendTail = NULL;
AG_LOCAL_UNLOCK( &pCard->sendSMPLock );
AG_LOCAL_LOCK( &pCard->doneSMPLock );
pCard->smpDoneHead = NULL;
pCard->smpDoneTail = NULL;
AG_LOCAL_UNLOCK( &pCard->doneSMPLock );
// finished with all reset stuff, now start things back up
tiCOMSystemInterruptsActive( &pCard->tiRoot, TRUE );
pCard->flags |= AGTIAPI_SYS_INTR_ON;
pCard->flags |= AGTIAPI_HAD_RESET;
pCard->flags &= ~AGTIAPI_RESET; // ##
agtiapi_StartIO( pCard );
AGTIAPI_PRINTK( "agtiapi_ResetCard: local return success\n" );
return AGTIAPI_SUCCESS;
} // agtiapi_ResetCard
/******************************************************************************
agtiapi_ReleaseHBA()
Purpose:
Releases all resources previously acquired to support
a specific Host Adapter, including the I/O Address range,
and unregisters the agtiapi Host Adapter.
Parameters:
device_t dev (IN) - device pointer
Return:
always return 0 - success
Note:
******************************************************************************/
int agtiapi_ReleaseHBA( device_t dev )
{
int thisCard = device_get_unit( dev ); // keeping get_unit call to once
int i;
ag_card_info_t *thisCardInst = &agCardInfoList[ thisCard ];
struct ccb_setasync csa;
struct agtiapi_softc *pCard;
pCard = device_get_softc( dev );
ag_card_info_t *pCardInfo = pCard->pCardInfo;
ag_resource_info_t *pRscInfo = &thisCardInst->tiRscInfo;
AG_GLOBAL_ARG(flags);
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: start\n" );
if (thisCardInst != pCardInfo)
{
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: Wrong ag_card_info_t thisCardInst %p "
"pCardInfo %p\n",
thisCardInst,
pCardInfo );
panic( "agtiapi_ReleaseHBA: Wrong ag_card_info_t thisCardInst %p pCardInfo "
"%p\n",
thisCardInst,
pCardInfo );
return( EIO );
}
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA card %p\n", pCard );
pCard->flags |= AGTIAPI_SHUT_DOWN;
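/*
* Teardown order: stop the timers, quiesce interrupts and the low-level
* layers, release IRQ resources, then free DMA, CCB, device-list and
* CAM resources before destroying the locks.
*/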
// remove timer
if (pCard->flags & AGTIAPI_TIMER_ON)
{
AG_SPIN_LOCK_IRQ( agtiapi_host_lock, flags );
callout_drain( &pCard->OS_timer );
callout_drain( &pCard->devRmTimer );
callout_drain(&pCard->IO_timer);
AG_SPIN_UNLOCK_IRQ( agtiapi_host_lock, flags );
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: timer released\n" );
}
#ifdef HIALEAH_ENCRYPTION
//Release encryption table memory - Fix it
//if(pCard->encrypt && (pCard->flags & AGTIAPI_INSTALLED))
//agtiapi_CleanupEncryption(pCard);
#endif
/*
* Shutdown the channel so that chip gets frozen
* and it does not do any more pci-bus accesses.
*/
if (pCard->flags & AGTIAPI_SYS_INTR_ON)
{
tiCOMSystemInterruptsActive( &pCard->tiRoot, FALSE );
pCard->flags &= ~AGTIAPI_SYS_INTR_ON;
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: card interrupt off\n" );
}
if (pCard->flags & AGTIAPI_INSTALLED)
{
tiCOMShutDown( &pCard->tiRoot );
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: low layers shutdown\n" );
}
/*
* first release IRQ, so that we do not get any more interrupts
* from this host
*/
if (pCard->flags & AGTIAPI_IRQ_REQUESTED)
{
if (!agtiapi_intx_mode)
{
int i;
for (i = 0; i< MAX_MSIX_NUM_VECTOR; i++)
{
if (pCard->irq[i] != agNULL && pCard->rscID[i] != 0)
{
bus_teardown_intr(dev, pCard->irq[i], pCard->intrcookie[i]);
bus_release_resource( dev,
SYS_RES_IRQ,
pCard->rscID[i],
pCard->irq[i] );
}
}
pci_release_msi(dev);
}
pCard->flags &= ~AGTIAPI_IRQ_REQUESTED;
#ifdef AGTIAPI_DPC
for (i = 0; i < MAX_MSIX_NUM_DPC; i++)
tasklet_kill(&pCard->tasklet_dpc[i]);
#endif
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: IRQ released\n");
}
// release memory vs. alloc in agtiapi_alloc_ostimem; used in ostiAllocMemory
if( pCard->osti_busaddr != 0 ) {
bus_dmamap_unload( pCard->osti_dmat, pCard->osti_mapp );
}
if( pCard->osti_mem != NULL ) {
bus_dmamem_free( pCard->osti_dmat, pCard->osti_mem, pCard->osti_mapp );
}
if( pCard->osti_dmat != NULL ) {
bus_dma_tag_destroy( pCard->osti_dmat );
}
/* unmap the mapped PCI memory */
/* calls bus_release_resource( ,SYS_RES_MEMORY, ..) */
agtiapi_ReleasePCIMem(thisCardInst);
/* release all ccbs */
if (pCard->ccbTotal)
{
//calls bus_dmamap_destroy() for all pccbs
agtiapi_ReleaseCCBs(pCard);
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: CCB released\n");
}
#ifdef HIALEAH_ENCRYPTION
/*release encryption resources - Fix it*/
if(pCard->encrypt)
{
/*Check that all IO's are completed */
if(atomic_read (&outstanding_encrypted_io_count) > 0)
{
printf("%s: WARNING: %d outstanding encrypted IOs !\n", __FUNCTION__, atomic_read(&outstanding_encrypted_io_count));
}
//agtiapi_CleanupEncryptionPools(pCard);
}
#endif
/* release device list */
if( pCard->pDevList ) {
free((caddr_t)pCard->pDevList, M_PMC_MDVT);
pCard->pDevList = NULL;
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: device list released\n");
}
#ifdef LINUX_PERBI_SUPPORT // ## review use of PERBI
AGTIAPI_PRINTK( "agtiapi_ReleaseHBA: WWN list %p \n", pCard->pWWNList );
if( pCard->pWWNList ) {
free( (caddr_t)pCard->pWWNList, M_PMC_MTGT );
pCard->pWWNList = NULL;
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: WWN list released\n");
}
if( pCard->pSLRList ) {
free( (caddr_t)pCard->pSLRList, M_PMC_MSLR );
pCard->pSLRList = NULL;
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: SAS Local Remote list released\n");
}
#endif
if (pCard->pPortalData)
{
free((caddr_t)pCard->pPortalData, M_PMC_MPRT);
pCard->pPortalData = NULL;
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: PortalData released\n");
}
//calls contigfree() or free()
agtiapi_MemFree(pCardInfo);
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: low level resource released\n");
#ifdef HOTPLUG_SUPPORT
if (pCard->flags & AGTIAPI_PORT_INITIALIZED)
{
// agtiapi_FreeDevWorkList(pCard);
AGTIAPI_PRINTK("agtiapi_ReleaseHBA: (HP dev) work resources released\n");
}
#endif
/*
* TBD: scsi_unregister may release the wrong host data structure,
* which causes a NULL pointer to show up.
*/
if (pCard->flags & AGTIAPI_SCSI_REGISTERED)
{
pCard->flags &= ~AGTIAPI_SCSI_REGISTERED;
#ifdef AGTIAPI_LOCAL_LOCK
if (pCard->STLock)
{
//destroy mtx
int maxLocks;
maxLocks = pRscInfo->tiLoLevelResource.loLevelOption.numOfQueuesPerPort;
for( i = 0; i < maxLocks; i++ )
{
mtx_destroy(&pCard->STLock[i]);
}
free(pCard->STLock, M_PMC_MSTL);
pCard->STLock = NULL;
}
#endif
}
ag_card_good--;
/* reset agtiapi_1st_time if this is the only card */
if (!ag_card_good && !agtiapi_1st_time)
{
agtiapi_1st_time = 1;
}
/* for tiSgl_t memory */
if (pCard->tisgl_busaddr != 0)
{
bus_dmamap_unload(pCard->tisgl_dmat, pCard->tisgl_map);
}
if (pCard->tisgl_mem != NULL)
{
bus_dmamem_free(pCard->tisgl_dmat, pCard->tisgl_mem, pCard->tisgl_map);
}
if (pCard->tisgl_dmat != NULL)
{
bus_dma_tag_destroy(pCard->tisgl_dmat);
}
if (pCard->buffer_dmat != agNULL)
{
bus_dma_tag_destroy(pCard->buffer_dmat);
}
if (pCard->sim != NULL)
{
mtx_lock(&thisCardInst->pmIOLock);
xpt_setup_ccb(&csa.ccb_h, pCard->path, 5);
csa.ccb_h.func_code = XPT_SASYNC_CB;
csa.event_enable = 0;
csa.callback = agtiapi_async;
csa.callback_arg = pCard;
xpt_action((union ccb *)&csa);
xpt_free_path(pCard->path);
// if (pCard->ccbTotal == 0)
if (pCard->ccbTotal <= thisCard)
{
/*
no link came up, so the simq has not been released.
In order to remove cam, we release the simq here.
*/
xpt_release_simq(pCard->sim, 1);
}
xpt_bus_deregister(cam_sim_path(pCard->sim));
cam_sim_free(pCard->sim, FALSE);
mtx_unlock(&thisCardInst->pmIOLock);
}
if (pCard->devq != NULL)
{
cam_simq_free(pCard->devq);
}
//destroy mtx
mtx_destroy( &thisCardInst->pmIOLock );
mtx_destroy( &pCard->sendLock );
mtx_destroy( &pCard->doneLock );
mtx_destroy( &pCard->sendSMPLock );
mtx_destroy( &pCard->doneSMPLock );
mtx_destroy( &pCard->ccbLock );
mtx_destroy( &pCard->devListLock );
mtx_destroy( &pCard->OS_timer_lock );
mtx_destroy( &pCard->devRmTimerLock );
mtx_destroy( &pCard->memLock );
mtx_destroy( &pCard->freezeLock );
destroy_dev( pCard->my_cdev );
memset((void *)pCardInfo, 0, sizeof(ag_card_info_t));
return 0;
}
// Called during system shutdown after sync
static int agtiapi_shutdown( device_t dev )
{
AGTIAPI_PRINTK( "agtiapi_shutdown\n" );
return( 0 );
}
static int agtiapi_suspend( device_t dev ) // Device suspend routine.
{
AGTIAPI_PRINTK( "agtiapi_suspend\n" );
return( 0 );
}
static int agtiapi_resume( device_t dev ) // Device resume routine.
{
AGTIAPI_PRINTK( "agtiapi_resume\n" );
return( 0 );
}
static device_method_t agtiapi_methods[] = { // Device interface
DEVMETHOD( device_probe, agtiapi_probe ),
DEVMETHOD( device_attach, agtiapi_attach ),
DEVMETHOD( device_detach, agtiapi_ReleaseHBA ),
DEVMETHOD( device_shutdown, agtiapi_shutdown ),
DEVMETHOD( device_suspend, agtiapi_suspend ),
DEVMETHOD( device_resume, agtiapi_resume ),
{ 0, 0 }
};
static devclass_t pmspcv_devclass;
static driver_t pmspcv_driver = {
"pmspcv",
agtiapi_methods,
sizeof( struct agtiapi_softc )
};
DRIVER_MODULE( pmspcv, pci, pmspcv_driver, pmspcv_devclass, 0, 0 );
MODULE_DEPEND( pmspcv, cam, 1, 1, 1 );
MODULE_DEPEND( pmspcv, pci, 1, 1, 1 );
#include <dev/pms/freebsd/driver/common/lxosapi.c>
#include <dev/pms/freebsd/driver/ini/src/osapi.c>
#include <dev/pms/freebsd/driver/common/lxutil.c>
#include <dev/pms/freebsd/driver/common/lxencrypt.c>
diff --git a/sys/dev/sdhci/sdhci.c b/sys/dev/sdhci/sdhci.c
index d81f26a6f41e..91474cabd2d3 100644
--- a/sys/dev/sdhci/sdhci.c
+++ b/sys/dev/sdhci/sdhci.c
@@ -1,2780 +1,2780 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/resource.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <machine/stdarg.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/sdhci/sdhci.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include "mmcbr_if.h"
#include "sdhci_if.h"
#include "opt_mmccam.h"
SYSCTL_NODE(_hw, OID_AUTO, sdhci, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"sdhci driver");
static int sdhci_debug = 0;
SYSCTL_INT(_hw_sdhci, OID_AUTO, debug, CTLFLAG_RWTUN, &sdhci_debug, 0,
"Debug level");
u_int sdhci_quirk_clear = 0;
SYSCTL_INT(_hw_sdhci, OID_AUTO, quirk_clear, CTLFLAG_RWTUN, &sdhci_quirk_clear,
0, "Mask of quirks to clear");
u_int sdhci_quirk_set = 0;
SYSCTL_INT(_hw_sdhci, OID_AUTO, quirk_set, CTLFLAG_RWTUN, &sdhci_quirk_set, 0,
"Mask of quirks to set");
#define RD1(slot, off) SDHCI_READ_1((slot)->bus, (slot), (off))
#define RD2(slot, off) SDHCI_READ_2((slot)->bus, (slot), (off))
#define RD4(slot, off) SDHCI_READ_4((slot)->bus, (slot), (off))
#define RD_MULTI_4(slot, off, ptr, count) \
SDHCI_READ_MULTI_4((slot)->bus, (slot), (off), (ptr), (count))
#define WR1(slot, off, val) SDHCI_WRITE_1((slot)->bus, (slot), (off), (val))
#define WR2(slot, off, val) SDHCI_WRITE_2((slot)->bus, (slot), (off), (val))
#define WR4(slot, off, val) SDHCI_WRITE_4((slot)->bus, (slot), (off), (val))
#define WR_MULTI_4(slot, off, ptr, count) \
SDHCI_WRITE_MULTI_4((slot)->bus, (slot), (off), (ptr), (count))
static void sdhci_acmd_irq(struct sdhci_slot *slot, uint16_t acmd_err);
static void sdhci_card_poll(void *arg);
static void sdhci_card_task(void *arg, int pending);
static void sdhci_cmd_irq(struct sdhci_slot *slot, uint32_t intmask);
static void sdhci_data_irq(struct sdhci_slot *slot, uint32_t intmask);
static int sdhci_exec_tuning(struct sdhci_slot *slot, bool reset);
static void sdhci_handle_card_present_locked(struct sdhci_slot *slot,
bool is_present);
static void sdhci_finish_command(struct sdhci_slot *slot);
static void sdhci_init(struct sdhci_slot *slot);
static void sdhci_read_block_pio(struct sdhci_slot *slot);
static void sdhci_req_done(struct sdhci_slot *slot);
static void sdhci_req_wakeup(struct mmc_request *req);
static void sdhci_reset(struct sdhci_slot *slot, uint8_t mask);
static void sdhci_retune(void *arg);
static void sdhci_set_clock(struct sdhci_slot *slot, uint32_t clock);
static void sdhci_set_power(struct sdhci_slot *slot, u_char power);
static void sdhci_set_transfer_mode(struct sdhci_slot *slot,
const struct mmc_data *data);
static void sdhci_start(struct sdhci_slot *slot);
static void sdhci_timeout(void *arg);
static void sdhci_start_command(struct sdhci_slot *slot,
struct mmc_command *cmd);
static void sdhci_start_data(struct sdhci_slot *slot,
const struct mmc_data *data);
static void sdhci_write_block_pio(struct sdhci_slot *slot);
static void sdhci_transfer_pio(struct sdhci_slot *slot);
#ifdef MMCCAM
/* CAM-related */
static void sdhci_cam_action(struct cam_sim *sim, union ccb *ccb);
static int sdhci_cam_get_possible_host_clock(const struct sdhci_slot *slot,
int proposed_clock);
static void sdhci_cam_poll(struct cam_sim *sim);
static int sdhci_cam_request(struct sdhci_slot *slot, union ccb *ccb);
static int sdhci_cam_settran_settings(struct sdhci_slot *slot, union ccb *ccb);
static int sdhci_cam_update_ios(struct sdhci_slot *slot);
#endif
/* helper routines */
static int sdhci_dma_alloc(struct sdhci_slot *slot);
static void sdhci_dma_free(struct sdhci_slot *slot);
static void sdhci_dumpregs(struct sdhci_slot *slot);
static void sdhci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs,
int error);
static int slot_printf(const struct sdhci_slot *slot, const char * fmt, ...)
__printflike(2, 3);
static uint32_t sdhci_tuning_intmask(const struct sdhci_slot *slot);
#define SDHCI_LOCK(_slot) mtx_lock(&(_slot)->mtx)
#define SDHCI_UNLOCK(_slot) mtx_unlock(&(_slot)->mtx)
#define SDHCI_LOCK_INIT(_slot) \
mtx_init(&_slot->mtx, "SD slot mtx", "sdhci", MTX_DEF)
#define SDHCI_LOCK_DESTROY(_slot) mtx_destroy(&_slot->mtx);
#define SDHCI_ASSERT_LOCKED(_slot) mtx_assert(&_slot->mtx, MA_OWNED);
#define SDHCI_ASSERT_UNLOCKED(_slot) mtx_assert(&_slot->mtx, MA_NOTOWNED);
#define SDHCI_DEFAULT_MAX_FREQ 50
#define SDHCI_200_MAX_DIVIDER 256
#define SDHCI_300_MAX_DIVIDER 2046
#define SDHCI_CARD_PRESENT_TICKS (hz / 5)
#define SDHCI_INSERT_DELAY_TICKS (hz / 2)
/*
* Broadcom BCM577xx Controller Constants
*/
/* Maximum divider supported by the default clock source. */
#define BCM577XX_DEFAULT_MAX_DIVIDER 256
/* Alternative clock's base frequency. */
#define BCM577XX_ALT_CLOCK_BASE 63000000
#define BCM577XX_HOST_CONTROL 0x198
#define BCM577XX_CTRL_CLKSEL_MASK 0xFFFFCFFF
#define BCM577XX_CTRL_CLKSEL_SHIFT 12
#define BCM577XX_CTRL_CLKSEL_DEFAULT 0x0
#define BCM577XX_CTRL_CLKSEL_64MHZ 0x3
static void
sdhci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
if (error != 0) {
printf("getaddr: error %d\n", error);
return;
}
*(bus_addr_t *)arg = segs[0].ds_addr;
}
static int
slot_printf(const struct sdhci_slot *slot, const char * fmt, ...)
{
char buf[128];
va_list ap;
int retval;
/*
* Make sure we print a single line all together rather than in two
* halves to avoid console gibberish bingo.
*/
va_start(ap, fmt);
retval = vsnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
retval += printf("%s-slot%d: %s",
device_get_nameunit(slot->bus), slot->num, buf);
return (retval);
}
static void
sdhci_dumpregs(struct sdhci_slot *slot)
{
slot_printf(slot,
"============== REGISTER DUMP ==============\n");
slot_printf(slot, "Sys addr: 0x%08x | Version: 0x%08x\n",
RD4(slot, SDHCI_DMA_ADDRESS), RD2(slot, SDHCI_HOST_VERSION));
slot_printf(slot, "Blk size: 0x%08x | Blk cnt: 0x%08x\n",
RD2(slot, SDHCI_BLOCK_SIZE), RD2(slot, SDHCI_BLOCK_COUNT));
slot_printf(slot, "Argument: 0x%08x | Trn mode: 0x%08x\n",
RD4(slot, SDHCI_ARGUMENT), RD2(slot, SDHCI_TRANSFER_MODE));
slot_printf(slot, "Present: 0x%08x | Host ctl: 0x%08x\n",
RD4(slot, SDHCI_PRESENT_STATE), RD1(slot, SDHCI_HOST_CONTROL));
slot_printf(slot, "Power: 0x%08x | Blk gap: 0x%08x\n",
RD1(slot, SDHCI_POWER_CONTROL), RD1(slot, SDHCI_BLOCK_GAP_CONTROL));
slot_printf(slot, "Wake-up: 0x%08x | Clock: 0x%08x\n",
RD1(slot, SDHCI_WAKE_UP_CONTROL), RD2(slot, SDHCI_CLOCK_CONTROL));
slot_printf(slot, "Timeout: 0x%08x | Int stat: 0x%08x\n",
RD1(slot, SDHCI_TIMEOUT_CONTROL), RD4(slot, SDHCI_INT_STATUS));
slot_printf(slot, "Int enab: 0x%08x | Sig enab: 0x%08x\n",
RD4(slot, SDHCI_INT_ENABLE), RD4(slot, SDHCI_SIGNAL_ENABLE));
slot_printf(slot, "AC12 err: 0x%08x | Host ctl2:0x%08x\n",
RD2(slot, SDHCI_ACMD12_ERR), RD2(slot, SDHCI_HOST_CONTROL2));
slot_printf(slot, "Caps: 0x%08x | Caps2: 0x%08x\n",
RD4(slot, SDHCI_CAPABILITIES), RD4(slot, SDHCI_CAPABILITIES2));
slot_printf(slot, "Max curr: 0x%08x | ADMA err: 0x%08x\n",
RD4(slot, SDHCI_MAX_CURRENT), RD1(slot, SDHCI_ADMA_ERR));
slot_printf(slot, "ADMA addr:0x%08x | Slot int: 0x%08x\n",
RD4(slot, SDHCI_ADMA_ADDRESS_LO), RD2(slot, SDHCI_SLOT_INT_STATUS));
slot_printf(slot,
"===========================================\n");
}
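/*
* Perform a controller software reset for the bits in 'mask'. Honours the
* NO_CARD_NO_RESET and CLOCK_BEFORE_RESET quirks, optionally waits for the
* reset bit to be observed asserted first (TI OMAP/AM335x workaround) and
* then polls for up to 100 ms for the controller to clear the requested bits.
*/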
static void
sdhci_reset(struct sdhci_slot *slot, uint8_t mask)
{
int timeout;
uint32_t clock;
if (slot->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) {
if (!SDHCI_GET_CARD_PRESENT(slot->bus, slot))
return;
}
/* Some controllers need this kick or reset won't work. */
if ((mask & SDHCI_RESET_ALL) == 0 &&
(slot->quirks & SDHCI_QUIRK_CLOCK_BEFORE_RESET)) {
/* This is to force an update */
clock = slot->clock;
slot->clock = 0;
sdhci_set_clock(slot, clock);
}
if (mask & SDHCI_RESET_ALL) {
slot->clock = 0;
slot->power = 0;
}
WR1(slot, SDHCI_SOFTWARE_RESET, mask);
if (slot->quirks & SDHCI_QUIRK_WAITFOR_RESET_ASSERTED) {
/*
* Resets on TI OMAPs and AM335x are incompatible with SDHCI
* specification. The reset bit has internal propagation delay,
* so a fast read after the write returns 0 even if the reset process
* is in progress. The workaround is to poll for 1 before polling
* for 0. In the worst case, if we miss seeing it asserted, the
* time we spent waiting is enough to ensure the reset finishes.
*/
timeout = 10000;
while ((RD1(slot, SDHCI_SOFTWARE_RESET) & mask) != mask) {
if (timeout <= 0)
break;
timeout--;
DELAY(1);
}
}
/* Wait max 100 ms */
timeout = 10000;
/* Controller clears the bits when it's done */
while (RD1(slot, SDHCI_SOFTWARE_RESET) & mask) {
if (timeout <= 0) {
slot_printf(slot, "Reset 0x%x never completed.\n",
mask);
sdhci_dumpregs(slot);
return;
}
timeout--;
DELAY(10);
}
}
static uint32_t
sdhci_tuning_intmask(const struct sdhci_slot *slot)
{
uint32_t intmask;
intmask = 0;
if (slot->opt & SDHCI_TUNING_ENABLED) {
intmask |= SDHCI_INT_TUNEERR;
if (slot->retune_mode == SDHCI_RETUNE_MODE_2 ||
slot->retune_mode == SDHCI_RETUNE_MODE_3)
intmask |= SDHCI_INT_RETUNE;
}
return (intmask);
}
static void
sdhci_init(struct sdhci_slot *slot)
{
sdhci_reset(slot, SDHCI_RESET_ALL);
/* Enable interrupts. */
slot->intmask = SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT |
SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_INDEX |
SDHCI_INT_END_BIT | SDHCI_INT_CRC | SDHCI_INT_TIMEOUT |
SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL |
SDHCI_INT_DMA_END | SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE |
SDHCI_INT_ACMD12ERR;
if (!(slot->quirks & SDHCI_QUIRK_POLL_CARD_PRESENT) &&
!(slot->opt & SDHCI_NON_REMOVABLE)) {
slot->intmask |= SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT;
}
WR4(slot, SDHCI_INT_ENABLE, slot->intmask);
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
}
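/*
* Program the SD clock for the requested frequency: gate the card clock,
* optionally switch the BCM577xx clock source, compute the divider for the
* controller's spec version, enable the internal clock and wait up to 10 ms
* for it to stabilize, then pass the clock on to the card. For example, a
* pre-3.0 controller with a 100 MHz base clock and a 25 MHz request ends up
* with divider value 0x02 (base/4).
*/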
static void
sdhci_set_clock(struct sdhci_slot *slot, uint32_t clock)
{
uint32_t clk_base;
uint32_t clk_sel;
uint32_t res;
uint16_t clk;
uint16_t div;
int timeout;
if (clock == slot->clock)
return;
slot->clock = clock;
/* Turn off the clock. */
clk = RD2(slot, SDHCI_CLOCK_CONTROL);
WR2(slot, SDHCI_CLOCK_CONTROL, clk & ~SDHCI_CLOCK_CARD_EN);
/* If no clock requested - leave it so. */
if (clock == 0)
return;
/* Determine the clock base frequency */
clk_base = slot->max_clk;
if (slot->quirks & SDHCI_QUIRK_BCM577XX_400KHZ_CLKSRC) {
clk_sel = RD2(slot, BCM577XX_HOST_CONTROL) &
BCM577XX_CTRL_CLKSEL_MASK;
/*
* Select clock source appropriate for the requested frequency.
*/
if ((clk_base / BCM577XX_DEFAULT_MAX_DIVIDER) > clock) {
clk_base = BCM577XX_ALT_CLOCK_BASE;
clk_sel |= (BCM577XX_CTRL_CLKSEL_64MHZ <<
BCM577XX_CTRL_CLKSEL_SHIFT);
} else {
clk_sel |= (BCM577XX_CTRL_CLKSEL_DEFAULT <<
BCM577XX_CTRL_CLKSEL_SHIFT);
}
WR2(slot, BCM577XX_HOST_CONTROL, clk_sel);
}
/* Recalculate timeout clock frequency based on the new sd clock. */
if (slot->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)
slot->timeout_clk = slot->clock / 1000;
if (slot->version < SDHCI_SPEC_300) {
/* Looking for highest freq <= clock. */
res = clk_base;
for (div = 1; div < SDHCI_200_MAX_DIVIDER; div <<= 1) {
if (res <= clock)
break;
res >>= 1;
}
/* Divider 1:1 is 0x00, 2:1 is 0x01, 256:1 is 0x80 ... */
div >>= 1;
} else {
/* Version 3.0 divisors are multiples of two up to 1023 * 2 */
if (clock >= clk_base)
div = 0;
else {
for (div = 2; div < SDHCI_300_MAX_DIVIDER; div += 2) {
if ((clk_base / div) <= clock)
break;
}
}
div >>= 1;
}
if (bootverbose || sdhci_debug)
slot_printf(slot, "Divider %d for freq %d (base %d)\n",
div, clock, clk_base);
/* Now we have got divider, set it. */
clk = (div & SDHCI_DIVIDER_MASK) << SDHCI_DIVIDER_SHIFT;
clk |= ((div >> SDHCI_DIVIDER_MASK_LEN) & SDHCI_DIVIDER_HI_MASK)
<< SDHCI_DIVIDER_HI_SHIFT;
WR2(slot, SDHCI_CLOCK_CONTROL, clk);
/* Enable clock. */
clk |= SDHCI_CLOCK_INT_EN;
WR2(slot, SDHCI_CLOCK_CONTROL, clk);
/* Wait up to 10 ms until it stabilizes. */
timeout = 10;
while (!((clk = RD2(slot, SDHCI_CLOCK_CONTROL))
& SDHCI_CLOCK_INT_STABLE)) {
if (timeout == 0) {
slot_printf(slot,
"Internal clock never stabilised.\n");
sdhci_dumpregs(slot);
return;
}
timeout--;
DELAY(1000);
}
/* Pass clock signal to the bus. */
clk |= SDHCI_CLOCK_CARD_EN;
WR2(slot, SDHCI_CLOCK_CONTROL, clk);
}
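/*
* Set the bus power for the given OCR voltage bit: power is first switched
* off, the matching SDHCI voltage (1.8/3.0/3.3 V) is selected, and bus power
* is then enabled with up to 2 ms of retries before applying the Intel
* power-up reset quirk where needed.
*/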
static void
sdhci_set_power(struct sdhci_slot *slot, u_char power)
{
int i;
uint8_t pwr;
if (slot->power == power)
return;
slot->power = power;
/* Turn off the power. */
pwr = 0;
WR1(slot, SDHCI_POWER_CONTROL, pwr);
/* If power down requested - leave it so. */
if (power == 0)
return;
/* Set voltage. */
switch (1 << power) {
case MMC_OCR_LOW_VOLTAGE:
pwr |= SDHCI_POWER_180;
break;
case MMC_OCR_290_300:
case MMC_OCR_300_310:
pwr |= SDHCI_POWER_300;
break;
case MMC_OCR_320_330:
case MMC_OCR_330_340:
pwr |= SDHCI_POWER_330;
break;
}
WR1(slot, SDHCI_POWER_CONTROL, pwr);
/*
* Turn on VDD1 power. Note that at least some Intel controllers can
* fail to enable bus power on the first try after transitioning from D3
* to D0, so we give them up to 2 ms.
*/
pwr |= SDHCI_POWER_ON;
for (i = 0; i < 20; i++) {
WR1(slot, SDHCI_POWER_CONTROL, pwr);
if (RD1(slot, SDHCI_POWER_CONTROL) & SDHCI_POWER_ON)
break;
DELAY(100);
}
if (!(RD1(slot, SDHCI_POWER_CONTROL) & SDHCI_POWER_ON))
slot_printf(slot, "Bus power failed to enable\n");
if (slot->quirks & SDHCI_QUIRK_INTEL_POWER_UP_RESET) {
WR1(slot, SDHCI_POWER_CONTROL, pwr | 0x10);
DELAY(10);
WR1(slot, SDHCI_POWER_CONTROL, pwr);
DELAY(300);
}
}
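/*
* PIO helpers: sdhci_read_block_pio() and sdhci_write_block_pio() move one
* block (512 bytes, or the MMCCAM-supplied block size) between the request
* buffer and the controller's buffer data port using 32-bit accesses, with
* byte-wise handling of unaligned buffers and trailing bytes.
*/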
static void
sdhci_read_block_pio(struct sdhci_slot *slot)
{
uint32_t data;
char *buffer;
size_t left;
buffer = slot->curcmd->data->data;
buffer += slot->offset;
/* Transfer one block at a time. */
#ifdef MMCCAM
if (slot->curcmd->data->flags & MMC_DATA_BLOCK_SIZE)
left = min(slot->curcmd->data->block_size,
slot->curcmd->data->len - slot->offset);
else
#endif
left = min(512, slot->curcmd->data->len - slot->offset);
slot->offset += left;
/* If we are too fast, broken controllers return zeroes. */
if (slot->quirks & SDHCI_QUIRK_BROKEN_TIMINGS)
DELAY(10);
/* Handle unaligned and aligned buffer cases. */
if ((intptr_t)buffer & 3) {
while (left > 3) {
data = RD4(slot, SDHCI_BUFFER);
buffer[0] = data;
buffer[1] = (data >> 8);
buffer[2] = (data >> 16);
buffer[3] = (data >> 24);
buffer += 4;
left -= 4;
}
} else {
RD_MULTI_4(slot, SDHCI_BUFFER,
(uint32_t *)buffer, left >> 2);
left &= 3;
}
/* Handle uneven size case. */
if (left > 0) {
data = RD4(slot, SDHCI_BUFFER);
while (left > 0) {
*(buffer++) = data;
data >>= 8;
left--;
}
}
}
static void
sdhci_write_block_pio(struct sdhci_slot *slot)
{
uint32_t data = 0;
char *buffer;
size_t left;
buffer = slot->curcmd->data->data;
buffer += slot->offset;
/* Transfer one block at a time. */
#ifdef MMCCAM
if (slot->curcmd->data->flags & MMC_DATA_BLOCK_SIZE) {
left = min(slot->curcmd->data->block_size,
slot->curcmd->data->len - slot->offset);
} else
#endif
left = min(512, slot->curcmd->data->len - slot->offset);
slot->offset += left;
/* Handle unaligned and aligned buffer cases. */
if ((intptr_t)buffer & 3) {
while (left > 3) {
data = buffer[0] +
(buffer[1] << 8) +
(buffer[2] << 16) +
(buffer[3] << 24);
left -= 4;
buffer += 4;
WR4(slot, SDHCI_BUFFER, data);
}
} else {
WR_MULTI_4(slot, SDHCI_BUFFER,
(uint32_t *)buffer, left >> 2);
left &= 3;
}
/* Handle uneven size case. */
if (left > 0) {
while (left > 0) {
data <<= 8;
data += *(buffer++);
left--;
}
WR4(slot, SDHCI_BUFFER, data);
}
}
static void
sdhci_transfer_pio(struct sdhci_slot *slot)
{
/* Read as many blocks as possible. */
if (slot->curcmd->data->flags & MMC_DATA_READ) {
while (RD4(slot, SDHCI_PRESENT_STATE) &
SDHCI_DATA_AVAILABLE) {
sdhci_read_block_pio(slot);
if (slot->offset >= slot->curcmd->data->len)
break;
}
} else {
while (RD4(slot, SDHCI_PRESENT_STATE) &
SDHCI_SPACE_AVAILABLE) {
sdhci_write_block_pio(slot);
if (slot->offset >= slot->curcmd->data->len)
break;
}
}
}
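/*
* Taskqueue handler for card presence changes: on insertion it attaches an
* mmc child device (or triggers MMCCAM discovery), on removal it tears the
* child down again and disables any active tuning state.
*/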
static void
sdhci_card_task(void *arg, int pending __unused)
{
struct sdhci_slot *slot = arg;
device_t d;
SDHCI_LOCK(slot);
if (SDHCI_GET_CARD_PRESENT(slot->bus, slot)) {
#ifdef MMCCAM
if (slot->card_present == 0) {
#else
if (slot->dev == NULL) {
#endif
/* If card is present - attach mmc bus. */
if (bootverbose || sdhci_debug)
slot_printf(slot, "Card inserted\n");
#ifdef MMCCAM
slot->card_present = 1;
mmccam_start_discovery(slot->sim);
SDHCI_UNLOCK(slot);
#else
d = slot->dev = device_add_child(slot->bus, "mmc", -1);
SDHCI_UNLOCK(slot);
if (d) {
device_set_ivars(d, slot);
(void)device_probe_and_attach(d);
}
#endif
} else
SDHCI_UNLOCK(slot);
} else {
#ifdef MMCCAM
if (slot->card_present == 1) {
#else
if (slot->dev != NULL) {
#endif
/* If no card present - detach mmc bus. */
if (bootverbose || sdhci_debug)
slot_printf(slot, "Card removed\n");
d = slot->dev;
slot->dev = NULL;
#ifdef MMCCAM
slot->card_present = 0;
mmccam_start_discovery(slot->sim);
SDHCI_UNLOCK(slot);
#else
slot->intmask &= ~sdhci_tuning_intmask(slot);
WR4(slot, SDHCI_INT_ENABLE, slot->intmask);
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
slot->opt &= ~SDHCI_TUNING_ENABLED;
SDHCI_UNLOCK(slot);
callout_drain(&slot->retune_callout);
device_delete_child(slot->bus, d);
#endif
} else
SDHCI_UNLOCK(slot);
}
}
static void
sdhci_handle_card_present_locked(struct sdhci_slot *slot, bool is_present)
{
bool was_present;
/*
* If there was no card and now there is one, schedule the task to
* create the child device after a short delay. The delay is to
* debounce the card insert (sometimes the card detect pin stabilizes
* before the other pins have made good contact).
*
* If there was a card present and now it's gone, immediately schedule
* the task to delete the child device. No debouncing -- gone is gone,
* because once power is removed, a full card re-init is needed, and
* that happens by deleting and recreating the child device.
*/
#ifdef MMCCAM
was_present = slot->card_present;
#else
was_present = slot->dev != NULL;
#endif
if (!was_present && is_present) {
taskqueue_enqueue_timeout(taskqueue_swi_giant,
&slot->card_delayed_task, -SDHCI_INSERT_DELAY_TICKS);
} else if (was_present && !is_present) {
taskqueue_enqueue(taskqueue_swi_giant, &slot->card_task);
}
}
void
sdhci_handle_card_present(struct sdhci_slot *slot, bool is_present)
{
SDHCI_LOCK(slot);
sdhci_handle_card_present_locked(slot, is_present);
SDHCI_UNLOCK(slot);
}
static void
sdhci_card_poll(void *arg)
{
struct sdhci_slot *slot = arg;
sdhci_handle_card_present(slot,
SDHCI_GET_CARD_PRESENT(slot->bus, slot));
callout_reset(&slot->card_poll_callout, SDHCI_CARD_PRESENT_TICKS,
sdhci_card_poll, slot);
}
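/*
* Allocate the SDMA bounce buffer: pick the largest SDMA boundary not
* exceeding maxphys (unless a quirk fixed the boundary already), create a
* DMA tag aligned to that boundary, then allocate and load the buffer to
* obtain its bus address in slot->paddr.
*/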
static int
sdhci_dma_alloc(struct sdhci_slot *slot)
{
int err;
if (!(slot->quirks & SDHCI_QUIRK_BROKEN_SDMA_BOUNDARY)) {
- if (MAXPHYS <= 1024 * 4)
+ if (maxphys <= 1024 * 4)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_4K;
- else if (MAXPHYS <= 1024 * 8)
+ else if (maxphys <= 1024 * 8)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_8K;
- else if (MAXPHYS <= 1024 * 16)
+ else if (maxphys <= 1024 * 16)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_16K;
- else if (MAXPHYS <= 1024 * 32)
+ else if (maxphys <= 1024 * 32)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_32K;
- else if (MAXPHYS <= 1024 * 64)
+ else if (maxphys <= 1024 * 64)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_64K;
- else if (MAXPHYS <= 1024 * 128)
+ else if (maxphys <= 1024 * 128)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_128K;
- else if (MAXPHYS <= 1024 * 256)
+ else if (maxphys <= 1024 * 256)
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_256K;
else
slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_512K;
}
slot->sdma_bbufsz = SDHCI_SDMA_BNDRY_TO_BBUFSZ(slot->sdma_boundary);
/*
* Allocate the DMA tag for an SDMA bounce buffer.
* Note that the SDHCI specification doesn't state any alignment
* constraint for the SDMA system address. However, controllers
* typically ignore the SDMA boundary bits in SDHCI_DMA_ADDRESS when
* forming the actual address of data, requiring the SDMA buffer to
* be aligned to the SDMA boundary.
*/
err = bus_dma_tag_create(bus_get_dma_tag(slot->bus), slot->sdma_bbufsz,
0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
slot->sdma_bbufsz, 1, slot->sdma_bbufsz, BUS_DMA_ALLOCNOW,
NULL, NULL, &slot->dmatag);
if (err != 0) {
slot_printf(slot, "Can't create DMA tag for SDMA\n");
return (err);
}
/* Allocate DMA memory for the SDMA bounce buffer. */
err = bus_dmamem_alloc(slot->dmatag, (void **)&slot->dmamem,
BUS_DMA_NOWAIT, &slot->dmamap);
if (err != 0) {
slot_printf(slot, "Can't alloc DMA memory for SDMA\n");
bus_dma_tag_destroy(slot->dmatag);
return (err);
}
/* Map the memory of the SDMA bounce buffer. */
err = bus_dmamap_load(slot->dmatag, slot->dmamap,
(void *)slot->dmamem, slot->sdma_bbufsz, sdhci_getaddr,
&slot->paddr, 0);
if (err != 0 || slot->paddr == 0) {
slot_printf(slot, "Can't load DMA memory for SDMA\n");
bus_dmamem_free(slot->dmatag, slot->dmamem, slot->dmamap);
bus_dma_tag_destroy(slot->dmatag);
if (err)
return (err);
else
return (EFAULT);
}
return (0);
}
static void
sdhci_dma_free(struct sdhci_slot *slot)
{
bus_dmamap_unload(slot->dmatag, slot->dmamap);
bus_dmamem_free(slot->dmatag, slot->dmamem, slot->dmamap);
bus_dma_tag_destroy(slot->dmatag);
}
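/*
* One-time slot setup: read the host version and capability registers (or
* quirk-provided caps), derive the base and timeout clock frequencies,
* supported voltages, bus widths, UHS-I/eMMC modes and tuning parameters,
* allocate SDMA resources, register the sysctl, tasks and callouts, and
* finally reset and enable interrupts via sdhci_init().
*/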
int
sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num)
{
kobjop_desc_t kobj_desc;
kobj_method_t *kobj_method;
uint32_t caps, caps2, freq, host_caps;
int err;
SDHCI_LOCK_INIT(slot);
slot->num = num;
slot->bus = dev;
slot->version = (RD2(slot, SDHCI_HOST_VERSION)
>> SDHCI_SPEC_VER_SHIFT) & SDHCI_SPEC_VER_MASK;
if (slot->quirks & SDHCI_QUIRK_MISSING_CAPS) {
caps = slot->caps;
caps2 = slot->caps2;
} else {
caps = RD4(slot, SDHCI_CAPABILITIES);
if (slot->version >= SDHCI_SPEC_300)
caps2 = RD4(slot, SDHCI_CAPABILITIES2);
else
caps2 = 0;
}
if (slot->version >= SDHCI_SPEC_300) {
if ((caps & SDHCI_SLOTTYPE_MASK) != SDHCI_SLOTTYPE_REMOVABLE &&
(caps & SDHCI_SLOTTYPE_MASK) != SDHCI_SLOTTYPE_EMBEDDED) {
slot_printf(slot,
"Driver doesn't support shared bus slots\n");
SDHCI_LOCK_DESTROY(slot);
return (ENXIO);
} else if ((caps & SDHCI_SLOTTYPE_MASK) ==
SDHCI_SLOTTYPE_EMBEDDED) {
slot->opt |= SDHCI_SLOT_EMBEDDED | SDHCI_NON_REMOVABLE;
}
}
/* Calculate base clock frequency. */
if (slot->version >= SDHCI_SPEC_300)
freq = (caps & SDHCI_CLOCK_V3_BASE_MASK) >>
SDHCI_CLOCK_BASE_SHIFT;
else
freq = (caps & SDHCI_CLOCK_BASE_MASK) >>
SDHCI_CLOCK_BASE_SHIFT;
if (freq != 0)
slot->max_clk = freq * 1000000;
/*
* If the frequency wasn't in the capabilities and the hardware driver
* hasn't already set max_clk, we're probably not going to work right
* with an assumed value, so complain about it.
*/
if (slot->max_clk == 0) {
slot->max_clk = SDHCI_DEFAULT_MAX_FREQ * 1000000;
slot_printf(slot, "Hardware doesn't specify base clock "
"frequency, using %dMHz as default.\n",
SDHCI_DEFAULT_MAX_FREQ);
}
/* Calculate/set timeout clock frequency. */
if (slot->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK) {
slot->timeout_clk = slot->max_clk / 1000;
} else if (slot->quirks & SDHCI_QUIRK_DATA_TIMEOUT_1MHZ) {
slot->timeout_clk = 1000;
} else {
slot->timeout_clk = (caps & SDHCI_TIMEOUT_CLK_MASK) >>
SDHCI_TIMEOUT_CLK_SHIFT;
if (caps & SDHCI_TIMEOUT_CLK_UNIT)
slot->timeout_clk *= 1000;
}
/*
* If the frequency wasn't in the capabilities and the hardware driver
* hasn't already set timeout_clk, we'll probably work okay using the
* max timeout, but still mention it.
*/
if (slot->timeout_clk == 0) {
slot_printf(slot, "Hardware doesn't specify timeout clock "
"frequency, setting BROKEN_TIMEOUT quirk.\n");
slot->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL;
}
slot->host.f_min = SDHCI_MIN_FREQ(slot->bus, slot);
slot->host.f_max = slot->max_clk;
slot->host.host_ocr = 0;
if (caps & SDHCI_CAN_VDD_330)
slot->host.host_ocr |= MMC_OCR_320_330 | MMC_OCR_330_340;
if (caps & SDHCI_CAN_VDD_300)
slot->host.host_ocr |= MMC_OCR_290_300 | MMC_OCR_300_310;
/*
* 1.8V VDD is not supposed to be used for removable cards. Hardware
* prior to v3.0 had no way to indicate embedded slots, but did
* sometimes support 1.8v for non-removable devices.
*/
if ((caps & SDHCI_CAN_VDD_180) && (slot->version < SDHCI_SPEC_300 ||
(slot->opt & SDHCI_SLOT_EMBEDDED)))
slot->host.host_ocr |= MMC_OCR_LOW_VOLTAGE;
if (slot->host.host_ocr == 0) {
slot_printf(slot, "Hardware doesn't report any "
"support voltages.\n");
}
host_caps = MMC_CAP_4_BIT_DATA;
if (caps & SDHCI_CAN_DO_8BITBUS)
host_caps |= MMC_CAP_8_BIT_DATA;
if (caps & SDHCI_CAN_DO_HISPD)
host_caps |= MMC_CAP_HSPEED;
if (slot->quirks & SDHCI_QUIRK_BOOT_NOACC)
host_caps |= MMC_CAP_BOOT_NOACC;
if (slot->quirks & SDHCI_QUIRK_WAIT_WHILE_BUSY)
host_caps |= MMC_CAP_WAIT_WHILE_BUSY;
/* Determine supported UHS-I and eMMC modes. */
if (caps2 & (SDHCI_CAN_SDR50 | SDHCI_CAN_SDR104 | SDHCI_CAN_DDR50))
host_caps |= MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25;
if (caps2 & SDHCI_CAN_SDR104) {
host_caps |= MMC_CAP_UHS_SDR104 | MMC_CAP_UHS_SDR50;
if (!(slot->quirks & SDHCI_QUIRK_BROKEN_MMC_HS200))
host_caps |= MMC_CAP_MMC_HS200;
} else if (caps2 & SDHCI_CAN_SDR50)
host_caps |= MMC_CAP_UHS_SDR50;
if (caps2 & SDHCI_CAN_DDR50 &&
!(slot->quirks & SDHCI_QUIRK_BROKEN_UHS_DDR50))
host_caps |= MMC_CAP_UHS_DDR50;
if (slot->quirks & SDHCI_QUIRK_MMC_DDR52)
host_caps |= MMC_CAP_MMC_DDR52;
if (slot->quirks & SDHCI_QUIRK_CAPS_BIT63_FOR_MMC_HS400 &&
caps2 & SDHCI_CAN_MMC_HS400)
host_caps |= MMC_CAP_MMC_HS400;
if (slot->quirks & SDHCI_QUIRK_MMC_HS400_IF_CAN_SDR104 &&
caps2 & SDHCI_CAN_SDR104)
host_caps |= MMC_CAP_MMC_HS400;
/*
* Disable UHS-I and eMMC modes if the set_uhs_timing method is the
* default NULL implementation.
*/
kobj_desc = &sdhci_set_uhs_timing_desc;
kobj_method = kobj_lookup_method(((kobj_t)dev)->ops->cls, NULL,
kobj_desc);
if (kobj_method == &kobj_desc->deflt)
host_caps &= ~(MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 |
MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_DDR50 | MMC_CAP_UHS_SDR104 |
MMC_CAP_MMC_DDR52 | MMC_CAP_MMC_HS200 | MMC_CAP_MMC_HS400);
#define SDHCI_CAP_MODES_TUNING(caps2) \
(((caps2) & SDHCI_TUNE_SDR50 ? MMC_CAP_UHS_SDR50 : 0) | \
MMC_CAP_UHS_DDR50 | MMC_CAP_UHS_SDR104 | MMC_CAP_MMC_HS200 | \
MMC_CAP_MMC_HS400)
/*
* Disable UHS-I and eMMC modes that require (re-)tuning if either
* the tune or re-tune method is the default NULL implementation.
*/
kobj_desc = &mmcbr_tune_desc;
kobj_method = kobj_lookup_method(((kobj_t)dev)->ops->cls, NULL,
kobj_desc);
if (kobj_method == &kobj_desc->deflt)
goto no_tuning;
kobj_desc = &mmcbr_retune_desc;
kobj_method = kobj_lookup_method(((kobj_t)dev)->ops->cls, NULL,
kobj_desc);
if (kobj_method == &kobj_desc->deflt) {
no_tuning:
host_caps &= ~(SDHCI_CAP_MODES_TUNING(caps2));
}
/* Allocate tuning structures and determine tuning parameters. */
if (host_caps & SDHCI_CAP_MODES_TUNING(caps2)) {
slot->opt |= SDHCI_TUNING_SUPPORTED;
slot->tune_req = malloc(sizeof(*slot->tune_req), M_DEVBUF,
M_WAITOK);
slot->tune_cmd = malloc(sizeof(*slot->tune_cmd), M_DEVBUF,
M_WAITOK);
slot->tune_data = malloc(sizeof(*slot->tune_data), M_DEVBUF,
M_WAITOK);
if (caps2 & SDHCI_TUNE_SDR50)
slot->opt |= SDHCI_SDR50_NEEDS_TUNING;
slot->retune_mode = (caps2 & SDHCI_RETUNE_MODES_MASK) >>
SDHCI_RETUNE_MODES_SHIFT;
if (slot->retune_mode == SDHCI_RETUNE_MODE_1) {
slot->retune_count = (caps2 & SDHCI_RETUNE_CNT_MASK) >>
SDHCI_RETUNE_CNT_SHIFT;
if (slot->retune_count > 0xb) {
slot_printf(slot, "Unknown re-tuning count "
"%x, using 1 sec\n", slot->retune_count);
slot->retune_count = 1;
} else if (slot->retune_count != 0)
slot->retune_count =
1 << (slot->retune_count - 1);
}
}
#undef SDHCI_CAP_MODES_TUNING
/* Determine supported VCCQ signaling levels. */
host_caps |= MMC_CAP_SIGNALING_330;
if (host_caps & (MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 |
MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_DDR50 | MMC_CAP_UHS_SDR104 |
MMC_CAP_MMC_DDR52_180 | MMC_CAP_MMC_HS200_180 |
MMC_CAP_MMC_HS400_180))
host_caps |= MMC_CAP_SIGNALING_120 | MMC_CAP_SIGNALING_180;
/*
* Disable 1.2 V and 1.8 V signaling if the switch_vccq method is the
* default NULL implementation. Disable 1.2 V support if it's the
* generic SDHCI implementation.
*/
kobj_desc = &mmcbr_switch_vccq_desc;
kobj_method = kobj_lookup_method(((kobj_t)dev)->ops->cls, NULL,
kobj_desc);
if (kobj_method == &kobj_desc->deflt)
host_caps &= ~(MMC_CAP_SIGNALING_120 | MMC_CAP_SIGNALING_180);
else if (kobj_method->func == (kobjop_t)sdhci_generic_switch_vccq)
host_caps &= ~MMC_CAP_SIGNALING_120;
/* Determine supported driver types (type B is always mandatory). */
if (caps2 & SDHCI_CAN_DRIVE_TYPE_A)
host_caps |= MMC_CAP_DRIVER_TYPE_A;
if (caps2 & SDHCI_CAN_DRIVE_TYPE_C)
host_caps |= MMC_CAP_DRIVER_TYPE_C;
if (caps2 & SDHCI_CAN_DRIVE_TYPE_D)
host_caps |= MMC_CAP_DRIVER_TYPE_D;
slot->host.caps = host_caps;
/* Decide if we have usable DMA. */
if (caps & SDHCI_CAN_DO_DMA)
slot->opt |= SDHCI_HAVE_DMA;
if (slot->quirks & SDHCI_QUIRK_BROKEN_DMA)
slot->opt &= ~SDHCI_HAVE_DMA;
if (slot->quirks & SDHCI_QUIRK_FORCE_DMA)
slot->opt |= SDHCI_HAVE_DMA;
if (slot->quirks & SDHCI_QUIRK_ALL_SLOTS_NON_REMOVABLE)
slot->opt |= SDHCI_NON_REMOVABLE;
/*
* Use platform-provided transfer backend
* with PIO as a fallback mechanism
*/
if (slot->opt & SDHCI_PLATFORM_TRANSFER)
slot->opt &= ~SDHCI_HAVE_DMA;
if (slot->opt & SDHCI_HAVE_DMA) {
err = sdhci_dma_alloc(slot);
if (err != 0) {
if (slot->opt & SDHCI_TUNING_SUPPORTED) {
free(slot->tune_req, M_DEVBUF);
free(slot->tune_cmd, M_DEVBUF);
free(slot->tune_data, M_DEVBUF);
}
SDHCI_LOCK_DESTROY(slot);
return (err);
}
}
if (bootverbose || sdhci_debug) {
slot_printf(slot,
"%uMHz%s %s VDD:%s%s%s VCCQ: 3.3V%s%s DRV: B%s%s%s %s %s\n",
slot->max_clk / 1000000,
(caps & SDHCI_CAN_DO_HISPD) ? " HS" : "",
(host_caps & MMC_CAP_8_BIT_DATA) ? "8bits" :
((host_caps & MMC_CAP_4_BIT_DATA) ? "4bits" : "1bit"),
(caps & SDHCI_CAN_VDD_330) ? " 3.3V" : "",
(caps & SDHCI_CAN_VDD_300) ? " 3.0V" : "",
((caps & SDHCI_CAN_VDD_180) &&
(slot->opt & SDHCI_SLOT_EMBEDDED)) ? " 1.8V" : "",
(host_caps & MMC_CAP_SIGNALING_180) ? " 1.8V" : "",
(host_caps & MMC_CAP_SIGNALING_120) ? " 1.2V" : "",
(host_caps & MMC_CAP_DRIVER_TYPE_A) ? "A" : "",
(host_caps & MMC_CAP_DRIVER_TYPE_C) ? "C" : "",
(host_caps & MMC_CAP_DRIVER_TYPE_D) ? "D" : "",
(slot->opt & SDHCI_HAVE_DMA) ? "DMA" : "PIO",
(slot->opt & SDHCI_SLOT_EMBEDDED) ? "embedded" :
(slot->opt & SDHCI_NON_REMOVABLE) ? "non-removable" :
"removable");
if (host_caps & (MMC_CAP_MMC_DDR52 | MMC_CAP_MMC_HS200 |
MMC_CAP_MMC_HS400 | MMC_CAP_MMC_ENH_STROBE))
slot_printf(slot, "eMMC:%s%s%s%s\n",
(host_caps & MMC_CAP_MMC_DDR52) ? " DDR52" : "",
(host_caps & MMC_CAP_MMC_HS200) ? " HS200" : "",
(host_caps & MMC_CAP_MMC_HS400) ? " HS400" : "",
((host_caps &
(MMC_CAP_MMC_HS400 | MMC_CAP_MMC_ENH_STROBE)) ==
(MMC_CAP_MMC_HS400 | MMC_CAP_MMC_ENH_STROBE)) ?
" HS400ES" : "");
if (host_caps & (MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 |
MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_SDR104))
slot_printf(slot, "UHS-I:%s%s%s%s%s\n",
(host_caps & MMC_CAP_UHS_SDR12) ? " SDR12" : "",
(host_caps & MMC_CAP_UHS_SDR25) ? " SDR25" : "",
(host_caps & MMC_CAP_UHS_SDR50) ? " SDR50" : "",
(host_caps & MMC_CAP_UHS_SDR104) ? " SDR104" : "",
(host_caps & MMC_CAP_UHS_DDR50) ? " DDR50" : "");
if (slot->opt & SDHCI_TUNING_SUPPORTED)
slot_printf(slot, "Re-tuning count %d secs, mode %d\n",
slot->retune_count, slot->retune_mode + 1);
sdhci_dumpregs(slot);
}
slot->timeout = 10;
SYSCTL_ADD_INT(device_get_sysctl_ctx(slot->bus),
SYSCTL_CHILDREN(device_get_sysctl_tree(slot->bus)), OID_AUTO,
"timeout", CTLFLAG_RWTUN, &slot->timeout, 0,
"Maximum timeout for SDHCI transfers (in secs)");
TASK_INIT(&slot->card_task, 0, sdhci_card_task, slot);
TIMEOUT_TASK_INIT(taskqueue_swi_giant, &slot->card_delayed_task, 0,
sdhci_card_task, slot);
callout_init(&slot->card_poll_callout, 1);
callout_init_mtx(&slot->timeout_callout, &slot->mtx, 0);
callout_init_mtx(&slot->retune_callout, &slot->mtx, 0);
if ((slot->quirks & SDHCI_QUIRK_POLL_CARD_PRESENT) &&
!(slot->opt & SDHCI_NON_REMOVABLE)) {
callout_reset(&slot->card_poll_callout,
SDHCI_CARD_PRESENT_TICKS, sdhci_card_poll, slot);
}
sdhci_init(slot);
return (0);
}
#ifndef MMCCAM
void
sdhci_start_slot(struct sdhci_slot *slot)
{
sdhci_card_task(slot, 0);
}
#endif
int
sdhci_cleanup_slot(struct sdhci_slot *slot)
{
device_t d;
callout_drain(&slot->timeout_callout);
callout_drain(&slot->card_poll_callout);
callout_drain(&slot->retune_callout);
taskqueue_drain(taskqueue_swi_giant, &slot->card_task);
taskqueue_drain_timeout(taskqueue_swi_giant, &slot->card_delayed_task);
SDHCI_LOCK(slot);
d = slot->dev;
slot->dev = NULL;
SDHCI_UNLOCK(slot);
if (d != NULL)
device_delete_child(slot->bus, d);
SDHCI_LOCK(slot);
sdhci_reset(slot, SDHCI_RESET_ALL);
SDHCI_UNLOCK(slot);
if (slot->opt & SDHCI_HAVE_DMA)
sdhci_dma_free(slot);
if (slot->opt & SDHCI_TUNING_SUPPORTED) {
free(slot->tune_req, M_DEVBUF);
free(slot->tune_cmd, M_DEVBUF);
free(slot->tune_data, M_DEVBUF);
}
SDHCI_LOCK_DESTROY(slot);
return (0);
}
int
sdhci_generic_suspend(struct sdhci_slot *slot)
{
/*
* We expect the MMC layer to issue initial tuning after resume.
* Otherwise, we'd need to indicate ourselves that re-tuning (including
* a circuit reset) is required, at least for re-tuning modes 1 and 2.
*/
callout_drain(&slot->retune_callout);
SDHCI_LOCK(slot);
slot->opt &= ~SDHCI_TUNING_ENABLED;
sdhci_reset(slot, SDHCI_RESET_ALL);
SDHCI_UNLOCK(slot);
return (0);
}
int
sdhci_generic_resume(struct sdhci_slot *slot)
{
SDHCI_LOCK(slot);
sdhci_init(slot);
SDHCI_UNLOCK(slot);
return (0);
}
uint32_t
sdhci_generic_min_freq(device_t brdev __unused, struct sdhci_slot *slot)
{
if (slot->version >= SDHCI_SPEC_300)
return (slot->max_clk / SDHCI_300_MAX_DIVIDER);
else
return (slot->max_clk / SDHCI_200_MAX_DIVIDER);
}
bool
sdhci_generic_get_card_present(device_t brdev __unused, struct sdhci_slot *slot)
{
if (slot->opt & SDHCI_NON_REMOVABLE)
return true;
return (RD4(slot, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT);
}
void
sdhci_generic_set_uhs_timing(device_t brdev __unused, struct sdhci_slot *slot)
{
const struct mmc_ios *ios;
uint16_t hostctrl2;
if (slot->version < SDHCI_SPEC_300)
return;
SDHCI_ASSERT_LOCKED(slot);
ios = &slot->host.ios;
sdhci_set_clock(slot, 0);
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
hostctrl2 &= ~SDHCI_CTRL2_UHS_MASK;
if (ios->clock > SD_SDR50_MAX) {
if (ios->timing == bus_timing_mmc_hs400 ||
ios->timing == bus_timing_mmc_hs400es)
hostctrl2 |= SDHCI_CTRL2_MMC_HS400;
else
hostctrl2 |= SDHCI_CTRL2_UHS_SDR104;
}
else if (ios->clock > SD_SDR25_MAX)
hostctrl2 |= SDHCI_CTRL2_UHS_SDR50;
else if (ios->clock > SD_SDR12_MAX) {
if (ios->timing == bus_timing_uhs_ddr50 ||
ios->timing == bus_timing_mmc_ddr52)
hostctrl2 |= SDHCI_CTRL2_UHS_DDR50;
else
hostctrl2 |= SDHCI_CTRL2_UHS_SDR25;
} else if (ios->clock > SD_MMC_CARD_ID_FREQUENCY)
hostctrl2 |= SDHCI_CTRL2_UHS_SDR12;
WR2(slot, SDHCI_HOST_CONTROL2, hostctrl2);
sdhci_set_clock(slot, ios->clock);
}
int
sdhci_generic_update_ios(device_t brdev, device_t reqdev)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
struct mmc_ios *ios = &slot->host.ios;
SDHCI_LOCK(slot);
/* Do full reset on bus power down to clear from any state. */
if (ios->power_mode == power_off) {
WR4(slot, SDHCI_SIGNAL_ENABLE, 0);
sdhci_init(slot);
}
/* Configure the bus. */
sdhci_set_clock(slot, ios->clock);
sdhci_set_power(slot, (ios->power_mode == power_off) ? 0 : ios->vdd);
if (ios->bus_width == bus_width_8) {
slot->hostctrl |= SDHCI_CTRL_8BITBUS;
slot->hostctrl &= ~SDHCI_CTRL_4BITBUS;
} else if (ios->bus_width == bus_width_4) {
slot->hostctrl &= ~SDHCI_CTRL_8BITBUS;
slot->hostctrl |= SDHCI_CTRL_4BITBUS;
} else if (ios->bus_width == bus_width_1) {
slot->hostctrl &= ~SDHCI_CTRL_8BITBUS;
slot->hostctrl &= ~SDHCI_CTRL_4BITBUS;
} else {
panic("Invalid bus width: %d", ios->bus_width);
}
if (ios->clock > SD_SDR12_MAX &&
!(slot->quirks & SDHCI_QUIRK_DONT_SET_HISPD_BIT))
slot->hostctrl |= SDHCI_CTRL_HISPD;
else
slot->hostctrl &= ~SDHCI_CTRL_HISPD;
WR1(slot, SDHCI_HOST_CONTROL, slot->hostctrl);
SDHCI_SET_UHS_TIMING(brdev, slot);
/* Some controllers like reset after bus changes. */
if (slot->quirks & SDHCI_QUIRK_RESET_ON_IOS)
sdhci_reset(slot, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
SDHCI_UNLOCK(slot);
return (0);
}
int
sdhci_generic_switch_vccq(device_t brdev __unused, device_t reqdev)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
enum mmc_vccq vccq;
int err;
uint16_t hostctrl2;
if (slot->version < SDHCI_SPEC_300)
return (0);
err = 0;
vccq = slot->host.ios.vccq;
SDHCI_LOCK(slot);
sdhci_set_clock(slot, 0);
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
switch (vccq) {
case vccq_330:
if (!(hostctrl2 & SDHCI_CTRL2_S18_ENABLE))
goto done;
hostctrl2 &= ~SDHCI_CTRL2_S18_ENABLE;
WR2(slot, SDHCI_HOST_CONTROL2, hostctrl2);
DELAY(5000);
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
if (!(hostctrl2 & SDHCI_CTRL2_S18_ENABLE))
goto done;
err = EAGAIN;
break;
case vccq_180:
if (!(slot->host.caps & MMC_CAP_SIGNALING_180)) {
err = EINVAL;
goto done;
}
if (hostctrl2 & SDHCI_CTRL2_S18_ENABLE)
goto done;
hostctrl2 |= SDHCI_CTRL2_S18_ENABLE;
WR2(slot, SDHCI_HOST_CONTROL2, hostctrl2);
DELAY(5000);
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
if (hostctrl2 & SDHCI_CTRL2_S18_ENABLE)
goto done;
err = EAGAIN;
break;
default:
slot_printf(slot,
"Attempt to set unsupported signaling voltage\n");
err = EINVAL;
break;
}
done:
sdhci_set_clock(slot, slot->host.ios.clock);
SDHCI_UNLOCK(slot);
return (err);
}
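/*
* Initial tuning entry point: select the tuning opcode and block length for
* the current timing (CMD19 for UHS-I, CMD21 for HS200), run the tuning
* procedure and, on success, enable the tuning interrupts and arm the
* periodic re-tune callout where the re-tuning mode requires it.
*/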
int
sdhci_generic_tune(device_t brdev __unused, device_t reqdev, bool hs400)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
const struct mmc_ios *ios = &slot->host.ios;
struct mmc_command *tune_cmd;
struct mmc_data *tune_data;
uint32_t opcode;
int err;
if (!(slot->opt & SDHCI_TUNING_SUPPORTED))
return (0);
slot->retune_ticks = slot->retune_count * hz;
opcode = MMC_SEND_TUNING_BLOCK;
SDHCI_LOCK(slot);
switch (ios->timing) {
case bus_timing_mmc_hs400:
slot_printf(slot, "HS400 must be tuned in HS200 mode\n");
SDHCI_UNLOCK(slot);
return (EINVAL);
case bus_timing_mmc_hs200:
/*
* In HS400 mode, controllers use the data strobe line to
* latch data from the devices, so periodic re-tuning isn't
* expected to be required.
*/
if (hs400)
slot->retune_ticks = 0;
opcode = MMC_SEND_TUNING_BLOCK_HS200;
break;
case bus_timing_uhs_ddr50:
case bus_timing_uhs_sdr104:
break;
case bus_timing_uhs_sdr50:
if (slot->opt & SDHCI_SDR50_NEEDS_TUNING)
break;
/* FALLTHROUGH */
default:
SDHCI_UNLOCK(slot);
return (0);
}
tune_cmd = slot->tune_cmd;
memset(tune_cmd, 0, sizeof(*tune_cmd));
tune_cmd->opcode = opcode;
tune_cmd->flags = MMC_RSP_R1 | MMC_CMD_ADTC;
tune_data = tune_cmd->data = slot->tune_data;
memset(tune_data, 0, sizeof(*tune_data));
tune_data->len = (opcode == MMC_SEND_TUNING_BLOCK_HS200 &&
ios->bus_width == bus_width_8) ? MMC_TUNING_LEN_HS200 :
MMC_TUNING_LEN;
tune_data->flags = MMC_DATA_READ;
tune_data->mrq = tune_cmd->mrq = slot->tune_req;
slot->opt &= ~SDHCI_TUNING_ENABLED;
err = sdhci_exec_tuning(slot, true);
if (err == 0) {
slot->opt |= SDHCI_TUNING_ENABLED;
slot->intmask |= sdhci_tuning_intmask(slot);
WR4(slot, SDHCI_INT_ENABLE, slot->intmask);
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
if (slot->retune_ticks) {
callout_reset(&slot->retune_callout, slot->retune_ticks,
sdhci_retune, slot);
}
}
SDHCI_UNLOCK(slot);
return (err);
}
int
sdhci_generic_retune(device_t brdev __unused, device_t reqdev, bool reset)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
int err;
if (!(slot->opt & SDHCI_TUNING_ENABLED))
return (0);
/* HS400 must be tuned in HS200 mode. */
if (slot->host.ios.timing == bus_timing_mmc_hs400)
return (EINVAL);
SDHCI_LOCK(slot);
err = sdhci_exec_tuning(slot, reset);
/*
* There are two ways sdhci_exec_tuning() can fail:
* EBUSY, which should not actually happen when requests are only
* issued with the host properly acquired, and
* EIO, meaning re-tuning failed (but it did work initially).
*
* In both cases, we should retry at later point if periodic re-tuning
* is enabled. Note that due to slot->retune_req not being cleared in
* these failure cases, the MMC layer should trigger another attempt at
* re-tuning with the next request anyway, though.
*/
if (slot->retune_ticks) {
callout_reset(&slot->retune_callout, slot->retune_ticks,
sdhci_retune, slot);
}
SDHCI_UNLOCK(slot);
return (err);
}
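/*
* Execute the tuning procedure: temporarily disable DMA and all interrupts
* except buffer-read-ready, set the Execute Tuning bit and issue up to
* MMC_TUNING_MAX tuning commands, then check whether the controller selected
* a sampling clock. On failure, fall back to the fixed sampling clock and
* reset the CMD and DATA lines.
*/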
static int
sdhci_exec_tuning(struct sdhci_slot *slot, bool reset)
{
struct mmc_request *tune_req;
struct mmc_command *tune_cmd;
int i;
uint32_t intmask;
uint16_t hostctrl2;
u_char opt;
SDHCI_ASSERT_LOCKED(slot);
if (slot->req != NULL)
return (EBUSY);
/* Tuning doesn't work with DMA enabled. */
opt = slot->opt;
slot->opt = opt & ~SDHCI_HAVE_DMA;
/*
* Ensure that as documented, SDHCI_INT_DATA_AVAIL is the only
* kind of interrupt we receive in response to a tuning request.
*/
intmask = slot->intmask;
slot->intmask = SDHCI_INT_DATA_AVAIL;
WR4(slot, SDHCI_INT_ENABLE, SDHCI_INT_DATA_AVAIL);
WR4(slot, SDHCI_SIGNAL_ENABLE, SDHCI_INT_DATA_AVAIL);
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
if (reset)
hostctrl2 &= ~SDHCI_CTRL2_SAMPLING_CLOCK;
else
hostctrl2 |= SDHCI_CTRL2_SAMPLING_CLOCK;
WR2(slot, SDHCI_HOST_CONTROL2, hostctrl2 | SDHCI_CTRL2_EXEC_TUNING);
tune_req = slot->tune_req;
tune_cmd = slot->tune_cmd;
for (i = 0; i < MMC_TUNING_MAX; i++) {
memset(tune_req, 0, sizeof(*tune_req));
tune_req->cmd = tune_cmd;
tune_req->done = sdhci_req_wakeup;
tune_req->done_data = slot;
slot->req = tune_req;
slot->flags = 0;
sdhci_start(slot);
while (!(tune_req->flags & MMC_REQ_DONE))
msleep(tune_req, &slot->mtx, 0, "sdhciet", 0);
if (!(tune_req->flags & MMC_TUNE_DONE))
break;
hostctrl2 = RD2(slot, SDHCI_HOST_CONTROL2);
if (!(hostctrl2 & SDHCI_CTRL2_EXEC_TUNING))
break;
if (tune_cmd->opcode == MMC_SEND_TUNING_BLOCK)
DELAY(1000);
}
/*
* Restore DMA usage and interrupts.
* Note that the interrupt aggregation code might have cleared
* SDHCI_INT_DMA_END and/or SDHCI_INT_RESPONSE in slot->intmask
* and SDHCI_SIGNAL_ENABLE respectively, so ensure SDHCI_INT_ENABLE
* doesn't lose these.
*/
slot->opt = opt;
slot->intmask = intmask;
WR4(slot, SDHCI_INT_ENABLE, intmask | SDHCI_INT_DMA_END |
SDHCI_INT_RESPONSE);
WR4(slot, SDHCI_SIGNAL_ENABLE, intmask);
if ((hostctrl2 & (SDHCI_CTRL2_EXEC_TUNING |
SDHCI_CTRL2_SAMPLING_CLOCK)) == SDHCI_CTRL2_SAMPLING_CLOCK) {
slot->retune_req = 0;
return (0);
}
slot_printf(slot, "Tuning failed, using fixed sampling clock\n");
WR2(slot, SDHCI_HOST_CONTROL2, hostctrl2 & ~(SDHCI_CTRL2_EXEC_TUNING |
SDHCI_CTRL2_SAMPLING_CLOCK));
sdhci_reset(slot, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
return (EIO);
}
static void
sdhci_retune(void *arg)
{
struct sdhci_slot *slot = arg;
slot->retune_req |= SDHCI_RETUNE_REQ_NEEDED;
}
#ifdef MMCCAM
static void
sdhci_req_done(struct sdhci_slot *slot)
{
union ccb *ccb;
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "%s\n", __func__);
if (slot->ccb != NULL && slot->curcmd != NULL) {
callout_stop(&slot->timeout_callout);
ccb = slot->ccb;
slot->ccb = NULL;
slot->curcmd = NULL;
/* Tell CAM the request is finished */
struct ccb_mmcio *mmcio;
mmcio = &ccb->mmcio;
ccb->ccb_h.status =
(mmcio->cmd.error == 0 ? CAM_REQ_CMP : CAM_REQ_CMP_ERR);
xpt_done(ccb);
}
}
#else
static void
sdhci_req_done(struct sdhci_slot *slot)
{
struct mmc_request *req;
if (slot->req != NULL && slot->curcmd != NULL) {
callout_stop(&slot->timeout_callout);
req = slot->req;
slot->req = NULL;
slot->curcmd = NULL;
req->done(req);
}
}
#endif
static void
sdhci_req_wakeup(struct mmc_request *req)
{
struct sdhci_slot *slot;
slot = req->done_data;
req->flags |= MMC_REQ_DONE;
wakeup(req);
}
static void
sdhci_timeout(void *arg)
{
struct sdhci_slot *slot = arg;
if (slot->curcmd != NULL) {
slot_printf(slot, "Controller timeout\n");
sdhci_dumpregs(slot);
sdhci_reset(slot, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
slot->curcmd->error = MMC_ERR_TIMEOUT;
sdhci_req_done(slot);
} else {
slot_printf(slot, "Spurious timeout - no active command\n");
}
}
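/*
* Build the transfer mode register for the current data phase: block count
* enable, multi-block and Auto CMD12 where applicable, the read direction
* bit and DMA enable.
*/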
static void
sdhci_set_transfer_mode(struct sdhci_slot *slot, const struct mmc_data *data)
{
uint16_t mode;
if (data == NULL)
return;
mode = SDHCI_TRNS_BLK_CNT_EN;
if (data->len > 512 || data->block_count > 1) {
mode |= SDHCI_TRNS_MULTI;
if (data->block_count == 0 && __predict_true(
#ifdef MMCCAM
slot->ccb->mmcio.stop.opcode == MMC_STOP_TRANSMISSION &&
#else
slot->req->stop != NULL &&
#endif
!(slot->quirks & SDHCI_QUIRK_BROKEN_AUTO_STOP)))
mode |= SDHCI_TRNS_ACMD12;
}
if (data->flags & MMC_DATA_READ)
mode |= SDHCI_TRNS_READ;
if (slot->flags & SDHCI_USE_DMA)
mode |= SDHCI_TRNS_DMA;
WR2(slot, SDHCI_TRANSFER_MODE, mode);
}
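/*
* Issue a single command: check that it can be sent at all (card, clock and
* power present, supported response type), wait for the CMD/DAT inhibit bits
* to clear, set up the data phase and the argument/transfer mode registers,
* write the command register and arm the timeout callout.
*/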
static void
sdhci_start_command(struct sdhci_slot *slot, struct mmc_command *cmd)
{
int flags, timeout;
uint32_t mask;
slot->curcmd = cmd;
slot->cmd_done = 0;
cmd->error = MMC_ERR_NONE;
/* This flags combination is not supported by controller. */
if ((cmd->flags & MMC_RSP_136) && (cmd->flags & MMC_RSP_BUSY)) {
slot_printf(slot, "Unsupported response type!\n");
cmd->error = MMC_ERR_FAILED;
sdhci_req_done(slot);
return;
}
/*
* Do not issue command if there is no card, clock or power.
* Controller will not detect timeout without clock active.
*/
if (!SDHCI_GET_CARD_PRESENT(slot->bus, slot) ||
slot->power == 0 ||
slot->clock == 0) {
slot_printf(slot,
"Cannot issue a command (power=%d clock=%d)",
slot->power, slot->clock);
cmd->error = MMC_ERR_FAILED;
sdhci_req_done(slot);
return;
}
/* Always wait for free CMD bus. */
mask = SDHCI_CMD_INHIBIT;
/* Wait for free DAT if we have data or busy signal. */
if (cmd->data != NULL || (cmd->flags & MMC_RSP_BUSY))
mask |= SDHCI_DAT_INHIBIT;
/*
* We shouldn't wait for DAT for stop commands or CMD19/CMD21. Note
* that the latter two are also special in that SDHCI_CMD_DATA should
* be set below but no actual data is ever read from the controller.
*/
#ifdef MMCCAM
if (cmd == &slot->ccb->mmcio.stop ||
#else
if (cmd == slot->req->stop ||
#endif
__predict_false(cmd->opcode == MMC_SEND_TUNING_BLOCK ||
cmd->opcode == MMC_SEND_TUNING_BLOCK_HS200))
mask &= ~SDHCI_DAT_INHIBIT;
/*
* Wait for the bus no more than 250 ms. Typically there will be no wait
* here at all, but when writing a crash dump we may be bypassing the
* host platform's interrupt handler, and in some cases that handler
* may be working around hardware quirks such as not respecting r1b
* busy indications. In those cases, this wait-loop serves the purpose
* of waiting for the prior command and data transfers to be done, and
* SD cards are allowed to take up to 250ms for write and erase ops.
* (It's usually more like 20-30ms in the real world.)
*/
timeout = 250;
while (mask & RD4(slot, SDHCI_PRESENT_STATE)) {
if (timeout == 0) {
slot_printf(slot, "Controller never released "
"inhibit bit(s).\n");
sdhci_dumpregs(slot);
cmd->error = MMC_ERR_FAILED;
sdhci_req_done(slot);
return;
}
timeout--;
DELAY(1000);
}
/* Prepare command flags. */
if (!(cmd->flags & MMC_RSP_PRESENT))
flags = SDHCI_CMD_RESP_NONE;
else if (cmd->flags & MMC_RSP_136)
flags = SDHCI_CMD_RESP_LONG;
else if (cmd->flags & MMC_RSP_BUSY)
flags = SDHCI_CMD_RESP_SHORT_BUSY;
else
flags = SDHCI_CMD_RESP_SHORT;
if (cmd->flags & MMC_RSP_CRC)
flags |= SDHCI_CMD_CRC;
if (cmd->flags & MMC_RSP_OPCODE)
flags |= SDHCI_CMD_INDEX;
if (cmd->data != NULL)
flags |= SDHCI_CMD_DATA;
if (cmd->opcode == MMC_STOP_TRANSMISSION)
flags |= SDHCI_CMD_TYPE_ABORT;
/* Prepare data. */
sdhci_start_data(slot, cmd->data);
/*
* Interrupt aggregation: To reduce the total number of interrupts,
* group the response interrupt with the data interrupt when possible.
* If there is going to be a data interrupt, mask the response one.
*/
if (slot->data_done == 0) {
WR4(slot, SDHCI_SIGNAL_ENABLE,
slot->intmask &= ~SDHCI_INT_RESPONSE);
}
/* Set command argument. */
WR4(slot, SDHCI_ARGUMENT, cmd->arg);
/* Set data transfer mode. */
sdhci_set_transfer_mode(slot, cmd->data);
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "Starting command opcode %#04x flags %#04x\n",
cmd->opcode, flags);
/* Start command. */
WR2(slot, SDHCI_COMMAND_FLAGS, (cmd->opcode << 8) | (flags & 0xff));
/* Start timeout callout. */
callout_reset(&slot->timeout_callout, slot->timeout * hz,
sdhci_timeout, slot);
}
static void
sdhci_finish_command(struct sdhci_slot *slot)
{
int i;
uint32_t val;
uint8_t extra;
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "%s: called, err %d flags %#04x\n",
__func__, slot->curcmd->error, slot->curcmd->flags);
slot->cmd_done = 1;
/*
* Interrupt aggregation: Restore command interrupt.
* Main restore point for the case when command interrupt
* happened first.
*/
if (__predict_true(slot->curcmd->opcode != MMC_SEND_TUNING_BLOCK &&
slot->curcmd->opcode != MMC_SEND_TUNING_BLOCK_HS200))
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask |=
SDHCI_INT_RESPONSE);
/* In case of error - reset host and return. */
if (slot->curcmd->error) {
if (slot->curcmd->error == MMC_ERR_BADCRC)
slot->retune_req |= SDHCI_RETUNE_REQ_RESET;
sdhci_reset(slot, SDHCI_RESET_CMD);
sdhci_reset(slot, SDHCI_RESET_DATA);
sdhci_start(slot);
return;
}
/* If command has response - fetch it. */
if (slot->curcmd->flags & MMC_RSP_PRESENT) {
if (slot->curcmd->flags & MMC_RSP_136) {
/* CRC is stripped so we need one byte shift. */
extra = 0;
for (i = 0; i < 4; i++) {
val = RD4(slot, SDHCI_RESPONSE + i * 4);
if (slot->quirks &
SDHCI_QUIRK_DONT_SHIFT_RESPONSE)
slot->curcmd->resp[3 - i] = val;
else {
slot->curcmd->resp[3 - i] =
(val << 8) | extra;
extra = val >> 24;
}
}
} else
slot->curcmd->resp[0] = RD4(slot, SDHCI_RESPONSE);
}
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "Resp: %#04x %#04x %#04x %#04x\n",
slot->curcmd->resp[0], slot->curcmd->resp[1],
slot->curcmd->resp[2], slot->curcmd->resp[3]);
/* If data ready - finish. */
if (slot->data_done)
sdhci_start(slot);
}
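/*
* Prepare the data phase: program the data timeout, choose between SDMA and
* PIO (falling back to PIO for small or oddly sized transfers on quirky
* hardware), preload the bounce buffer for writes and set the block size and
* block count registers.
*/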
static void
sdhci_start_data(struct sdhci_slot *slot, const struct mmc_data *data)
{
uint32_t blkcnt, blksz, current_timeout, sdma_bbufsz, target_timeout;
uint8_t div;
if (data == NULL && (slot->curcmd->flags & MMC_RSP_BUSY) == 0) {
slot->data_done = 1;
return;
}
slot->data_done = 0;
/* Calculate and set data timeout.*/
/* XXX: We should have this from mmc layer, now assume 1 sec. */
if (slot->quirks & SDHCI_QUIRK_BROKEN_TIMEOUT_VAL) {
div = 0xE;
} else {
target_timeout = 1000000;
div = 0;
current_timeout = (1 << 13) * 1000 / slot->timeout_clk;
while (current_timeout < target_timeout && div < 0xE) {
++div;
current_timeout <<= 1;
}
/* Compensate for an off-by-one error in the CaFe chip.*/
if (div < 0xE &&
(slot->quirks & SDHCI_QUIRK_INCR_TIMEOUT_CONTROL)) {
++div;
}
}
WR1(slot, SDHCI_TIMEOUT_CONTROL, div);
if (data == NULL)
return;
/* Use DMA if possible. */
if ((slot->opt & SDHCI_HAVE_DMA))
slot->flags |= SDHCI_USE_DMA;
/* If data is small, broken DMA may return zeroes instead of data. */
if ((slot->quirks & SDHCI_QUIRK_BROKEN_TIMINGS) &&
(data->len <= 512))
slot->flags &= ~SDHCI_USE_DMA;
/* Some controllers require even block sizes. */
if ((slot->quirks & SDHCI_QUIRK_32BIT_DMA_SIZE) &&
((data->len) & 0x3))
slot->flags &= ~SDHCI_USE_DMA;
/* Load DMA buffer. */
if (slot->flags & SDHCI_USE_DMA) {
sdma_bbufsz = slot->sdma_bbufsz;
if (data->flags & MMC_DATA_READ)
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_PREREAD);
else {
memcpy(slot->dmamem, data->data, ulmin(data->len,
sdma_bbufsz));
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_PREWRITE);
}
WR4(slot, SDHCI_DMA_ADDRESS, slot->paddr);
/*
* Interrupt aggregation: Mask border interrupt for the last
* bounce buffer and unmask otherwise.
*/
if (data->len == sdma_bbufsz)
slot->intmask &= ~SDHCI_INT_DMA_END;
else
slot->intmask |= SDHCI_INT_DMA_END;
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
}
/* Current data offset for both PIO and DMA. */
slot->offset = 0;
#ifdef MMCCAM
if (data->flags & MMC_DATA_BLOCK_SIZE) {
/* Set block size and request border interrupts on the SDMA boundary. */
blksz = SDHCI_MAKE_BLKSZ(slot->sdma_boundary, data->block_size);
blkcnt = data->block_count;
if (__predict_false(sdhci_debug > 0))
slot_printf(slot, "SDIO Custom block params: blksz: "
"%#10x, blk cnt: %#10x\n", blksz, blkcnt);
} else
#endif
{
/* Set block size and request border interrupts on the SDMA boundary. */
blksz = SDHCI_MAKE_BLKSZ(slot->sdma_boundary, ulmin(data->len, 512));
blkcnt = howmany(data->len, 512);
}
WR2(slot, SDHCI_BLOCK_SIZE, blksz);
WR2(slot, SDHCI_BLOCK_COUNT, blkcnt);
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "Blk size: 0x%08x | Blk cnt: 0x%08x\n",
blksz, blkcnt);
}
void
sdhci_finish_data(struct sdhci_slot *slot)
{
struct mmc_data *data = slot->curcmd->data;
size_t left;
/* Interrupt aggregation: Restore command interrupt.
* Auxiliary restore point for the case when data interrupt
* happened first. */
if (!slot->cmd_done) {
WR4(slot, SDHCI_SIGNAL_ENABLE,
slot->intmask |= SDHCI_INT_RESPONSE);
}
/* Unload rest of data from DMA buffer. */
if (!slot->data_done && (slot->flags & SDHCI_USE_DMA) &&
slot->curcmd->data != NULL) {
if (data->flags & MMC_DATA_READ) {
left = data->len - slot->offset;
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_POSTREAD);
memcpy((u_char*)data->data + slot->offset, slot->dmamem,
ulmin(left, slot->sdma_bbufsz));
} else
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_POSTWRITE);
}
slot->data_done = 1;
/* If there was error - reset the host. */
if (slot->curcmd->error) {
if (slot->curcmd->error == MMC_ERR_BADCRC)
slot->retune_req |= SDHCI_RETUNE_REQ_RESET;
sdhci_reset(slot, SDHCI_RESET_CMD);
sdhci_reset(slot, SDHCI_RESET_DATA);
sdhci_start(slot);
return;
}
/* If we already have command response - finish. */
if (slot->cmd_done)
sdhci_start(slot);
}
#ifdef MMCCAM
static void
sdhci_start(struct sdhci_slot *slot)
{
union ccb *ccb;
struct ccb_mmcio *mmcio;
ccb = slot->ccb;
if (ccb == NULL)
return;
mmcio = &ccb->mmcio;
if (!(slot->flags & CMD_STARTED)) {
slot->flags |= CMD_STARTED;
sdhci_start_command(slot, &mmcio->cmd);
return;
}
/*
* The old stack doesn't use this!
* Enabling this code causes significant performance degradation
* and IRQ storms on the BBB, while the Wandboard behaves fine.
* Not using this code does no harm...
if (!(slot->flags & STOP_STARTED) && mmcio->stop.opcode != 0) {
slot->flags |= STOP_STARTED;
sdhci_start_command(slot, &mmcio->stop);
return;
}
*/
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "result: %d\n", mmcio->cmd.error);
if (mmcio->cmd.error == 0 &&
(slot->quirks & SDHCI_QUIRK_RESET_AFTER_REQUEST)) {
sdhci_reset(slot, SDHCI_RESET_CMD);
sdhci_reset(slot, SDHCI_RESET_DATA);
}
sdhci_req_done(slot);
}
#else
static void
sdhci_start(struct sdhci_slot *slot)
{
const struct mmc_request *req;
req = slot->req;
if (req == NULL)
return;
if (!(slot->flags & CMD_STARTED)) {
slot->flags |= CMD_STARTED;
sdhci_start_command(slot, req->cmd);
return;
}
if ((slot->quirks & SDHCI_QUIRK_BROKEN_AUTO_STOP) &&
!(slot->flags & STOP_STARTED) && req->stop) {
slot->flags |= STOP_STARTED;
sdhci_start_command(slot, req->stop);
return;
}
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "result: %d\n", req->cmd->error);
if (!req->cmd->error &&
((slot->curcmd == req->stop &&
(slot->quirks & SDHCI_QUIRK_BROKEN_AUTO_STOP)) ||
(slot->quirks & SDHCI_QUIRK_RESET_AFTER_REQUEST))) {
sdhci_reset(slot, SDHCI_RESET_CMD);
sdhci_reset(slot, SDHCI_RESET_DATA);
}
sdhci_req_done(slot);
}
#endif
int
sdhci_generic_request(device_t brdev __unused, device_t reqdev,
struct mmc_request *req)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
SDHCI_LOCK(slot);
if (slot->req != NULL) {
SDHCI_UNLOCK(slot);
return (EBUSY);
}
if (__predict_false(sdhci_debug > 1)) {
slot_printf(slot,
"CMD%u arg %#x flags %#x dlen %u dflags %#x\n",
req->cmd->opcode, req->cmd->arg, req->cmd->flags,
(req->cmd->data)?(u_int)req->cmd->data->len:0,
(req->cmd->data)?req->cmd->data->flags:0);
}
slot->req = req;
slot->flags = 0;
sdhci_start(slot);
SDHCI_UNLOCK(slot);
if (dumping) {
while (slot->req != NULL) {
sdhci_generic_intr(slot);
DELAY(10);
}
}
return (0);
}
int
sdhci_generic_get_ro(device_t brdev __unused, device_t reqdev)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
uint32_t val;
SDHCI_LOCK(slot);
val = RD4(slot, SDHCI_PRESENT_STATE);
SDHCI_UNLOCK(slot);
return (!(val & SDHCI_WRITE_PROTECT));
}
int
sdhci_generic_acquire_host(device_t brdev __unused, device_t reqdev)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
int err = 0;
SDHCI_LOCK(slot);
while (slot->bus_busy)
msleep(slot, &slot->mtx, 0, "sdhciah", 0);
slot->bus_busy++;
/* Activate led. */
WR1(slot, SDHCI_HOST_CONTROL, slot->hostctrl |= SDHCI_CTRL_LED);
SDHCI_UNLOCK(slot);
return (err);
}
int
sdhci_generic_release_host(device_t brdev __unused, device_t reqdev)
{
struct sdhci_slot *slot = device_get_ivars(reqdev);
SDHCI_LOCK(slot);
/* Deactivate led. */
WR1(slot, SDHCI_HOST_CONTROL, slot->hostctrl &= ~SDHCI_CTRL_LED);
slot->bus_busy--;
SDHCI_UNLOCK(slot);
wakeup(slot);
return (0);
}
static void
sdhci_cmd_irq(struct sdhci_slot *slot, uint32_t intmask)
{
if (!slot->curcmd) {
slot_printf(slot, "Got command interrupt 0x%08x, but "
"there is no active command.\n", intmask);
sdhci_dumpregs(slot);
return;
}
if (intmask & SDHCI_INT_TIMEOUT)
slot->curcmd->error = MMC_ERR_TIMEOUT;
else if (intmask & SDHCI_INT_CRC)
slot->curcmd->error = MMC_ERR_BADCRC;
else if (intmask & (SDHCI_INT_END_BIT | SDHCI_INT_INDEX))
slot->curcmd->error = MMC_ERR_FIFO;
sdhci_finish_command(slot);
}
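/*
* Handle data-phase interrupts: record timeout/CRC errors, complete tuning
* requests, service PIO transfers, reload the SDMA bounce buffer on boundary
* interrupts and finish the data phase on DATA_END.
*/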
static void
sdhci_data_irq(struct sdhci_slot *slot, uint32_t intmask)
{
struct mmc_data *data;
size_t left;
uint32_t sdma_bbufsz;
if (!slot->curcmd) {
slot_printf(slot, "Got data interrupt 0x%08x, but "
"there is no active command.\n", intmask);
sdhci_dumpregs(slot);
return;
}
if (slot->curcmd->data == NULL &&
(slot->curcmd->flags & MMC_RSP_BUSY) == 0) {
slot_printf(slot, "Got data interrupt 0x%08x, but "
"there is no active data operation.\n",
intmask);
sdhci_dumpregs(slot);
return;
}
if (intmask & SDHCI_INT_DATA_TIMEOUT)
slot->curcmd->error = MMC_ERR_TIMEOUT;
else if (intmask & (SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_END_BIT))
slot->curcmd->error = MMC_ERR_BADCRC;
if (slot->curcmd->data == NULL &&
(intmask & (SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL |
SDHCI_INT_DMA_END))) {
slot_printf(slot, "Got data interrupt 0x%08x, but "
"there is busy-only command.\n", intmask);
sdhci_dumpregs(slot);
slot->curcmd->error = MMC_ERR_INVALID;
}
if (slot->curcmd->error) {
/* No need to continue after any error. */
goto done;
}
/* Handle tuning completion interrupt. */
if (__predict_false((intmask & SDHCI_INT_DATA_AVAIL) &&
(slot->curcmd->opcode == MMC_SEND_TUNING_BLOCK ||
slot->curcmd->opcode == MMC_SEND_TUNING_BLOCK_HS200))) {
slot->req->flags |= MMC_TUNE_DONE;
sdhci_finish_command(slot);
sdhci_finish_data(slot);
return;
}
/* Handle PIO interrupt. */
if (intmask & (SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL)) {
if ((slot->opt & SDHCI_PLATFORM_TRANSFER) &&
SDHCI_PLATFORM_WILL_HANDLE(slot->bus, slot)) {
SDHCI_PLATFORM_START_TRANSFER(slot->bus, slot,
&intmask);
slot->flags |= PLATFORM_DATA_STARTED;
} else
sdhci_transfer_pio(slot);
}
/* Handle DMA border. */
if (intmask & SDHCI_INT_DMA_END) {
data = slot->curcmd->data;
sdma_bbufsz = slot->sdma_bbufsz;
/* Unload DMA buffer ... */
left = data->len - slot->offset;
if (data->flags & MMC_DATA_READ) {
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_POSTREAD);
memcpy((u_char*)data->data + slot->offset, slot->dmamem,
ulmin(left, sdma_bbufsz));
} else {
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_POSTWRITE);
}
/* ... and reload it again. */
slot->offset += sdma_bbufsz;
left = data->len - slot->offset;
if (data->flags & MMC_DATA_READ) {
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_PREREAD);
} else {
memcpy(slot->dmamem, (u_char*)data->data + slot->offset,
ulmin(left, sdma_bbufsz));
bus_dmamap_sync(slot->dmatag, slot->dmamap,
BUS_DMASYNC_PREWRITE);
}
/*
* Interrupt aggregation: Mask border interrupt for the last
* bounce buffer.
*/
if (left == sdma_bbufsz) {
slot->intmask &= ~SDHCI_INT_DMA_END;
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
}
/* Restart DMA. */
WR4(slot, SDHCI_DMA_ADDRESS, slot->paddr);
}
/* We have got all data. */
if (intmask & SDHCI_INT_DATA_END) {
if (slot->flags & PLATFORM_DATA_STARTED) {
slot->flags &= ~PLATFORM_DATA_STARTED;
SDHCI_PLATFORM_FINISH_TRANSFER(slot->bus, slot);
} else
sdhci_finish_data(slot);
}
done:
if (slot->curcmd != NULL && slot->curcmd->error != 0) {
if (slot->flags & PLATFORM_DATA_STARTED) {
slot->flags &= ~PLATFORM_DATA_STARTED;
SDHCI_PLATFORM_FINISH_TRANSFER(slot->bus, slot);
} else
sdhci_finish_data(slot);
}
}
static void
sdhci_acmd_irq(struct sdhci_slot *slot, uint16_t acmd_err)
{
if (!slot->curcmd) {
slot_printf(slot, "Got AutoCMD12 error 0x%04x, but "
"there is no active command.\n", acmd_err);
sdhci_dumpregs(slot);
return;
}
slot_printf(slot, "Got AutoCMD12 error 0x%04x\n", acmd_err);
sdhci_reset(slot, SDHCI_RESET_CMD);
}
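/*
* Main interrupt handler: read and acknowledge SDHCI_INT_STATUS, then
* dispatch tuning, card presence, command, data, Auto CMD12 and bus power
* events, reporting anything left over as an unexpected interrupt.
*/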
void
sdhci_generic_intr(struct sdhci_slot *slot)
{
uint32_t intmask, present;
uint16_t val16;
SDHCI_LOCK(slot);
/* Read slot interrupt status. */
intmask = RD4(slot, SDHCI_INT_STATUS);
if (intmask == 0 || intmask == 0xffffffff) {
SDHCI_UNLOCK(slot);
return;
}
if (__predict_false(sdhci_debug > 2))
slot_printf(slot, "Interrupt %#x\n", intmask);
/* Handle tuning error interrupt. */
if (__predict_false(intmask & SDHCI_INT_TUNEERR)) {
WR4(slot, SDHCI_INT_STATUS, SDHCI_INT_TUNEERR);
slot_printf(slot, "Tuning error indicated\n");
slot->retune_req |= SDHCI_RETUNE_REQ_RESET;
if (slot->curcmd) {
slot->curcmd->error = MMC_ERR_BADCRC;
sdhci_finish_command(slot);
}
}
/* Handle re-tuning interrupt. */
if (__predict_false(intmask & SDHCI_INT_RETUNE))
slot->retune_req |= SDHCI_RETUNE_REQ_NEEDED;
/* Handle card presence interrupts. */
if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) {
present = (intmask & SDHCI_INT_CARD_INSERT) != 0;
slot->intmask &=
~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE);
slot->intmask |= present ? SDHCI_INT_CARD_REMOVE :
SDHCI_INT_CARD_INSERT;
WR4(slot, SDHCI_INT_ENABLE, slot->intmask);
WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask);
WR4(slot, SDHCI_INT_STATUS, intmask &
(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE));
sdhci_handle_card_present_locked(slot, present);
}
/* Handle command interrupts. */
if (intmask & SDHCI_INT_CMD_MASK) {
WR4(slot, SDHCI_INT_STATUS, intmask & SDHCI_INT_CMD_MASK);
sdhci_cmd_irq(slot, intmask & SDHCI_INT_CMD_MASK);
}
/* Handle data interrupts. */
if (intmask & SDHCI_INT_DATA_MASK) {
WR4(slot, SDHCI_INT_STATUS, intmask & SDHCI_INT_DATA_MASK);
/* Don't call data_irq in case of errored command. */
if ((intmask & SDHCI_INT_CMD_ERROR_MASK) == 0)
sdhci_data_irq(slot, intmask & SDHCI_INT_DATA_MASK);
}
/* Handle AutoCMD12 error interrupt. */
if (intmask & SDHCI_INT_ACMD12ERR) {
/* Clearing SDHCI_INT_ACMD12ERR may clear SDHCI_ACMD12_ERR. */
val16 = RD2(slot, SDHCI_ACMD12_ERR);
WR4(slot, SDHCI_INT_STATUS, SDHCI_INT_ACMD12ERR);
sdhci_acmd_irq(slot, val16);
}
/* Handle bus power interrupt. */
if (intmask & SDHCI_INT_BUS_POWER) {
WR4(slot, SDHCI_INT_STATUS, SDHCI_INT_BUS_POWER);
slot_printf(slot, "Card is consuming too much power!\n");
}
intmask &= ~(SDHCI_INT_ERROR | SDHCI_INT_TUNEERR | SDHCI_INT_RETUNE |
SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE | SDHCI_INT_CMD_MASK |
SDHCI_INT_DATA_MASK | SDHCI_INT_ACMD12ERR | SDHCI_INT_BUS_POWER);
/* The rest is unknown. */
if (intmask) {
WR4(slot, SDHCI_INT_STATUS, intmask);
slot_printf(slot, "Unexpected interrupt 0x%08x.\n",
intmask);
sdhci_dumpregs(slot);
}
SDHCI_UNLOCK(slot);
}
int
sdhci_generic_read_ivar(device_t bus, device_t child, int which,
uintptr_t *result)
{
const struct sdhci_slot *slot = device_get_ivars(child);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
*result = slot->host.ios.bus_mode;
break;
case MMCBR_IVAR_BUS_WIDTH:
*result = slot->host.ios.bus_width;
break;
case MMCBR_IVAR_CHIP_SELECT:
*result = slot->host.ios.chip_select;
break;
case MMCBR_IVAR_CLOCK:
*result = slot->host.ios.clock;
break;
case MMCBR_IVAR_F_MIN:
*result = slot->host.f_min;
break;
case MMCBR_IVAR_F_MAX:
*result = slot->host.f_max;
break;
case MMCBR_IVAR_HOST_OCR:
*result = slot->host.host_ocr;
break;
case MMCBR_IVAR_MODE:
*result = slot->host.mode;
break;
case MMCBR_IVAR_OCR:
*result = slot->host.ocr;
break;
case MMCBR_IVAR_POWER_MODE:
*result = slot->host.ios.power_mode;
break;
case MMCBR_IVAR_VDD:
*result = slot->host.ios.vdd;
break;
case MMCBR_IVAR_RETUNE_REQ:
if (slot->opt & SDHCI_TUNING_ENABLED) {
if (slot->retune_req & SDHCI_RETUNE_REQ_RESET) {
*result = retune_req_reset;
break;
}
if (slot->retune_req & SDHCI_RETUNE_REQ_NEEDED) {
*result = retune_req_normal;
break;
}
}
*result = retune_req_none;
break;
case MMCBR_IVAR_VCCQ:
*result = slot->host.ios.vccq;
break;
case MMCBR_IVAR_CAPS:
*result = slot->host.caps;
break;
case MMCBR_IVAR_TIMING:
*result = slot->host.ios.timing;
break;
case MMCBR_IVAR_MAX_DATA:
/*
* Re-tuning modes 1 and 2 restrict the maximum data length
* per read/write command to 4 MiB.
*/
if (slot->opt & SDHCI_TUNING_ENABLED &&
(slot->retune_mode == SDHCI_RETUNE_MODE_1 ||
slot->retune_mode == SDHCI_RETUNE_MODE_2)) {
*result = 4 * 1024 * 1024 / MMC_SECTOR_SIZE;
break;
}
*result = 65535;
break;
case MMCBR_IVAR_MAX_BUSY_TIMEOUT:
/*
* Currently, sdhci_start_data() hardcodes 1 s for all CMDs.
*/
*result = 1000000;
break;
}
return (0);
}
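/*
* Illustrative arithmetic for the two computed ivars above, assuming the
* standard 512-byte MMC_SECTOR_SIZE: the 4 MiB re-tuning cap works out to
* 4 * 1024 * 1024 / 512 = 8192 blocks per command, and the hardcoded 1 s
* busy limit is reported as 1000000 us.
*/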
int
sdhci_generic_write_ivar(device_t bus, device_t child, int which,
uintptr_t value)
{
struct sdhci_slot *slot = device_get_ivars(child);
uint32_t clock, max_clock;
int i;
if (sdhci_debug > 1)
slot_printf(slot, "%s: var=%d\n", __func__, which);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
slot->host.ios.bus_mode = value;
break;
case MMCBR_IVAR_BUS_WIDTH:
slot->host.ios.bus_width = value;
break;
case MMCBR_IVAR_CHIP_SELECT:
slot->host.ios.chip_select = value;
break;
case MMCBR_IVAR_CLOCK:
if (value > 0) {
max_clock = slot->max_clk;
clock = max_clock;
if (slot->version < SDHCI_SPEC_300) {
for (i = 0; i < SDHCI_200_MAX_DIVIDER;
i <<= 1) {
if (clock <= value)
break;
clock >>= 1;
}
} else {
for (i = 0; i < SDHCI_300_MAX_DIVIDER;
i += 2) {
if (clock <= value)
break;
clock = max_clock / (i + 2);
}
}
slot->host.ios.clock = clock;
} else
slot->host.ios.clock = 0;
break;
case MMCBR_IVAR_MODE:
slot->host.mode = value;
break;
case MMCBR_IVAR_OCR:
slot->host.ocr = value;
break;
case MMCBR_IVAR_POWER_MODE:
slot->host.ios.power_mode = value;
break;
case MMCBR_IVAR_VDD:
slot->host.ios.vdd = value;
break;
case MMCBR_IVAR_VCCQ:
slot->host.ios.vccq = value;
break;
case MMCBR_IVAR_TIMING:
slot->host.ios.timing = value;
break;
case MMCBR_IVAR_CAPS:
case MMCBR_IVAR_HOST_OCR:
case MMCBR_IVAR_F_MIN:
case MMCBR_IVAR_F_MAX:
case MMCBR_IVAR_MAX_DATA:
case MMCBR_IVAR_RETUNE_REQ:
return (EINVAL);
}
return (0);
}
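/*
* Worked example (illustrative, assuming max_clk = 100 MHz and a request
* for 25 MHz): a pre-3.00 host halves the clock per step, 100 -> 50 ->
* 25 MHz, while a 3.00 host walks the even divisors, 100/2 = 50 then
* 100/4 = 25 MHz, so both record 25 MHz in host.ios.clock.
*/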
#ifdef MMCCAM
void
sdhci_start_slot(struct sdhci_slot *slot)
{
if ((slot->devq = cam_simq_alloc(1)) == NULL)
goto fail;
mtx_init(&slot->sim_mtx, "sdhcisim", NULL, MTX_DEF);
slot->sim = cam_sim_alloc_dev(sdhci_cam_action, sdhci_cam_poll,
"sdhci_slot", slot, slot->bus,
&slot->sim_mtx, 1, 1, slot->devq);
if (slot->sim == NULL) {
cam_simq_free(slot->devq);
slot_printf(slot, "cannot allocate CAM SIM\n");
goto fail;
}
mtx_lock(&slot->sim_mtx);
if (xpt_bus_register(slot->sim, slot->bus, 0) != 0) {
slot_printf(slot, "cannot register SCSI pass-through bus\n");
cam_sim_free(slot->sim, FALSE);
cam_simq_free(slot->devq);
mtx_unlock(&slot->sim_mtx);
goto fail;
}
mtx_unlock(&slot->sim_mtx);
/* End CAM-specific init */
slot->card_present = 0;
sdhci_card_task(slot, 0);
return;
fail:
if (slot->sim != NULL) {
mtx_lock(&slot->sim_mtx);
xpt_bus_deregister(cam_sim_path(slot->sim));
cam_sim_free(slot->sim, FALSE);
mtx_unlock(&slot->sim_mtx);
}
if (slot->devq != NULL)
cam_simq_free(slot->devq);
}
void
sdhci_cam_action(struct cam_sim *sim, union ccb *ccb)
{
struct sdhci_slot *slot;
slot = cam_sim_softc(sim);
if (slot == NULL) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
xpt_done(ccb);
return;
}
mtx_assert(&slot->sim_mtx, MA_OWNED);
switch (ccb->ccb_h.func_code) {
case XPT_PATH_INQ:
- mmc_path_inq(&ccb->cpi, "Deglitch Networks", sim, MAXPHYS);
+ mmc_path_inq(&ccb->cpi, "Deglitch Networks", sim, maxphys);
break;
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
uint32_t max_data;
if (sdhci_debug > 1)
slot_printf(slot, "Got XPT_GET_TRAN_SETTINGS\n");
cts->protocol = PROTO_MMCSD;
cts->protocol_version = 1;
cts->transport = XPORT_MMCSD;
cts->transport_version = 1;
cts->xport_specific.valid = 0;
cts->proto_specific.mmc.host_ocr = slot->host.host_ocr;
cts->proto_specific.mmc.host_f_min = slot->host.f_min;
cts->proto_specific.mmc.host_f_max = slot->host.f_max;
cts->proto_specific.mmc.host_caps = slot->host.caps;
/*
* Re-tuning modes 1 and 2 restrict the maximum data length
* per read/write command to 4 MiB.
*/
if (slot->opt & SDHCI_TUNING_ENABLED &&
(slot->retune_mode == SDHCI_RETUNE_MODE_1 ||
slot->retune_mode == SDHCI_RETUNE_MODE_2)) {
max_data = 4 * 1024 * 1024 / MMC_SECTOR_SIZE;
} else {
max_data = 65535;
}
cts->proto_specific.mmc.host_max_data = max_data;
memcpy(&cts->proto_specific.mmc.ios, &slot->host.ios, sizeof(struct mmc_ios));
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_SET_TRAN_SETTINGS:
if (sdhci_debug > 1)
slot_printf(slot, "Got XPT_SET_TRAN_SETTINGS\n");
sdhci_cam_settran_settings(slot, ccb);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_RESET_BUS:
if (sdhci_debug > 1)
slot_printf(slot, "Got XPT_RESET_BUS, ACK it...\n");
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_MMC_IO:
/*
* Here is the HW-dependent part of sending the command to the
* underlying h/w. At some point in the future an interrupt comes
* and the request will be marked as completed.
*/
if (__predict_false(sdhci_debug > 1))
slot_printf(slot, "Got XPT_MMC_IO\n");
ccb->ccb_h.status = CAM_REQ_INPROG;
sdhci_cam_request(cam_sim_softc(sim), ccb);
return;
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
return;
}
void
sdhci_cam_poll(struct cam_sim *sim)
{
return;
}
static int
sdhci_cam_get_possible_host_clock(const struct sdhci_slot *slot,
int proposed_clock)
{
int max_clock, clock, i;
if (proposed_clock == 0)
return 0;
max_clock = slot->max_clk;
clock = max_clock;
if (slot->version < SDHCI_SPEC_300) {
for (i = 0; i < SDHCI_200_MAX_DIVIDER; i <<= 1) {
if (clock <= proposed_clock)
break;
clock >>= 1;
}
} else {
for (i = 0; i < SDHCI_300_MAX_DIVIDER; i += 2) {
if (clock <= proposed_clock)
break;
clock = max_clock / (i + 2);
}
}
return clock;
}
static int
sdhci_cam_settran_settings(struct sdhci_slot *slot, union ccb *ccb)
{
struct mmc_ios *ios;
const struct mmc_ios *new_ios;
const struct ccb_trans_settings_mmc *cts;
ios = &slot->host.ios;
cts = &ccb->cts.proto_specific.mmc;
new_ios = &cts->ios;
/* Update only requested fields */
if (cts->ios_valid & MMC_CLK) {
ios->clock = sdhci_cam_get_possible_host_clock(slot, new_ios->clock);
if (sdhci_debug > 1)
slot_printf(slot, "Clock => %d\n", ios->clock);
}
if (cts->ios_valid & MMC_VDD) {
ios->vdd = new_ios->vdd;
if (sdhci_debug > 1)
slot_printf(slot, "VDD => %d\n", ios->vdd);
}
if (cts->ios_valid & MMC_CS) {
ios->chip_select = new_ios->chip_select;
if (sdhci_debug > 1)
slot_printf(slot, "CS => %d\n", ios->chip_select);
}
if (cts->ios_valid & MMC_BW) {
ios->bus_width = new_ios->bus_width;
if (sdhci_debug > 1)
slot_printf(slot, "Bus width => %d\n", ios->bus_width);
}
if (cts->ios_valid & MMC_PM) {
ios->power_mode = new_ios->power_mode;
if (sdhci_debug > 1)
slot_printf(slot, "Power mode => %d\n", ios->power_mode);
}
if (cts->ios_valid & MMC_BT) {
ios->timing = new_ios->timing;
if (sdhci_debug > 1)
slot_printf(slot, "Timing => %d\n", ios->timing);
}
if (cts->ios_valid & MMC_BM) {
ios->bus_mode = new_ios->bus_mode;
if (sdhci_debug > 1)
slot_printf(slot, "Bus mode => %d\n", ios->bus_mode);
}
if (cts->ios_valid & MMC_VCCQ) {
ios->vccq = new_ios->vccq;
if (sdhci_debug > 1)
slot_printf(slot, "VCCQ => %d\n", ios->vccq);
}
/* XXX Provide a way to call a chip-specific IOS update, required for TI */
return (sdhci_cam_update_ios(slot));
}
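/*
* Hypothetical caller sketch (not part of this file): the transport layer
* changes individual IOS parameters by filling only the fields it cares
* about and setting the matching ios_valid bits, e.g.
*
* cts.ios.clock = 400000;
* cts.ios.bus_width = bus_width_4;
* cts.ios_valid = MMC_CLK | MMC_BW;
*
* everything not flagged in ios_valid is left untouched above.
*/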
static int
sdhci_cam_update_ios(struct sdhci_slot *slot)
{
struct mmc_ios *ios = &slot->host.ios;
if (sdhci_debug > 1)
slot_printf(slot, "%s: power_mode=%d, clk=%d, bus_width=%d, timing=%d\n",
__func__, ios->power_mode, ios->clock, ios->bus_width, ios->timing);
SDHCI_LOCK(slot);
/* Do full reset on bus power down to clear from any state. */
if (ios->power_mode == power_off) {
WR4(slot, SDHCI_SIGNAL_ENABLE, 0);
sdhci_init(slot);
}
/* Configure the bus. */
sdhci_set_clock(slot, ios->clock);
sdhci_set_power(slot, (ios->power_mode == power_off) ? 0 : ios->vdd);
if (ios->bus_width == bus_width_8) {
slot->hostctrl |= SDHCI_CTRL_8BITBUS;
slot->hostctrl &= ~SDHCI_CTRL_4BITBUS;
} else if (ios->bus_width == bus_width_4) {
slot->hostctrl &= ~SDHCI_CTRL_8BITBUS;
slot->hostctrl |= SDHCI_CTRL_4BITBUS;
} else if (ios->bus_width == bus_width_1) {
slot->hostctrl &= ~SDHCI_CTRL_8BITBUS;
slot->hostctrl &= ~SDHCI_CTRL_4BITBUS;
} else {
panic("Invalid bus width: %d", ios->bus_width);
}
if (ios->timing == bus_timing_hs &&
!(slot->quirks & SDHCI_QUIRK_DONT_SET_HISPD_BIT))
slot->hostctrl |= SDHCI_CTRL_HISPD;
else
slot->hostctrl &= ~SDHCI_CTRL_HISPD;
WR1(slot, SDHCI_HOST_CONTROL, slot->hostctrl);
/* Some controllers like reset after bus changes. */
if (slot->quirks & SDHCI_QUIRK_RESET_ON_IOS)
sdhci_reset(slot, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
SDHCI_UNLOCK(slot);
return (0);
}
static int
sdhci_cam_request(struct sdhci_slot *slot, union ccb *ccb)
{
const struct ccb_mmcio *mmcio;
mmcio = &ccb->mmcio;
SDHCI_LOCK(slot);
/* if (slot->req != NULL) {
SDHCI_UNLOCK(slot);
return (EBUSY);
}
*/
if (__predict_false(sdhci_debug > 1)) {
slot_printf(slot, "CMD%u arg %#x flags %#x dlen %u dflags %#x "
"blksz=%zu blkcnt=%zu\n",
mmcio->cmd.opcode, mmcio->cmd.arg, mmcio->cmd.flags,
mmcio->cmd.data != NULL ? (unsigned int) mmcio->cmd.data->len : 0,
mmcio->cmd.data != NULL ? mmcio->cmd.data->flags : 0,
mmcio->cmd.data != NULL ? mmcio->cmd.data->block_size : 0,
mmcio->cmd.data != NULL ? mmcio->cmd.data->block_count : 0);
}
if (mmcio->cmd.data != NULL) {
if (mmcio->cmd.data->len == 0 || mmcio->cmd.data->flags == 0)
panic("data->len = %d, data->flags = %d -- something is b0rked",
(int)mmcio->cmd.data->len, mmcio->cmd.data->flags);
}
slot->ccb = ccb;
slot->flags = 0;
sdhci_start(slot);
SDHCI_UNLOCK(slot);
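/*
* While a kernel dump is in progress interrupts may not be delivered, so
* poll the controller by hand until this request completes.
*/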
if (dumping) {
while (slot->ccb != NULL) {
sdhci_generic_intr(slot);
DELAY(10);
}
}
return (0);
}
#endif /* MMCCAM */
MODULE_VERSION(sdhci, SDHCI_VERSION);
diff --git a/sys/dev/siis/siis.c b/sys/dev/siis/siis.c
index 13e5447ed4ac..1c0000e89b07 100644
--- a/sys/dev/siis/siis.c
+++ b/sys/dev/siis/siis.c
@@ -1,1991 +1,1988 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ata.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sema.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <machine/stdarg.h>
#include <machine/resource.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <dev/led/led.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include "siis.h"
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
/* local prototypes */
static int siis_setup_interrupt(device_t dev);
static void siis_intr(void *data);
static int siis_suspend(device_t dev);
static int siis_resume(device_t dev);
static int siis_ch_init(device_t dev);
static int siis_ch_deinit(device_t dev);
static int siis_ch_suspend(device_t dev);
static int siis_ch_resume(device_t dev);
static void siis_ch_intr_locked(void *data);
static void siis_ch_intr(void *data);
static void siis_ch_led(void *priv, int onoff);
static void siis_begin_transaction(device_t dev, union ccb *ccb);
static void siis_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
static void siis_execute_transaction(struct siis_slot *slot);
static void siis_timeout(void *arg);
static void siis_end_transaction(struct siis_slot *slot, enum siis_err_type et);
static int siis_setup_fis(device_t dev, struct siis_cmd *ctp, union ccb *ccb, int tag);
static void siis_dmainit(device_t dev);
static void siis_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
static void siis_dmafini(device_t dev);
static void siis_slotsalloc(device_t dev);
static void siis_slotsfree(device_t dev);
static void siis_reset(device_t dev);
static void siis_portinit(device_t dev);
static int siis_wait_ready(device_t dev, int t);
static int siis_sata_connect(struct siis_channel *ch);
static void siis_issue_recovery(device_t dev);
static void siis_process_read_log(device_t dev, union ccb *ccb);
static void siis_process_request_sense(device_t dev, union ccb *ccb);
static void siisaction(struct cam_sim *sim, union ccb *ccb);
static void siispoll(struct cam_sim *sim);
static MALLOC_DEFINE(M_SIIS, "SIIS driver", "SIIS driver data buffers");
static struct {
uint32_t id;
const char *name;
int ports;
int quirks;
#define SIIS_Q_SNTF 1
#define SIIS_Q_NOMSI 2
} siis_ids[] = {
{0x31241095, "SiI3124", 4, 0},
{0x31248086, "SiI3124", 4, 0},
{0x31321095, "SiI3132", 2, SIIS_Q_SNTF|SIIS_Q_NOMSI},
{0x02421095, "SiI3132", 2, SIIS_Q_SNTF|SIIS_Q_NOMSI},
{0x02441095, "SiI3132", 2, SIIS_Q_SNTF|SIIS_Q_NOMSI},
{0x31311095, "SiI3131", 1, SIIS_Q_SNTF|SIIS_Q_NOMSI},
{0x35311095, "SiI3531", 1, SIIS_Q_SNTF|SIIS_Q_NOMSI},
{0, NULL, 0, 0}
};
#define recovery_type spriv_field0
#define RECOVERY_NONE 0
#define RECOVERY_READ_LOG 1
#define RECOVERY_REQUEST_SENSE 2
#define recovery_slot spriv_field1
static int
siis_probe(device_t dev)
{
char buf[64];
int i;
uint32_t devid = pci_get_devid(dev);
for (i = 0; siis_ids[i].id != 0; i++) {
if (siis_ids[i].id == devid) {
snprintf(buf, sizeof(buf), "%s SATA controller",
siis_ids[i].name);
device_set_desc_copy(dev, buf);
return (BUS_PROBE_DEFAULT);
}
}
return (ENXIO);
}
static int
siis_attach(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(dev);
uint32_t devid = pci_get_devid(dev);
device_t child;
int error, i, unit;
ctlr->dev = dev;
for (i = 0; siis_ids[i].id != 0; i++) {
if (siis_ids[i].id == devid)
break;
}
ctlr->quirks = siis_ids[i].quirks;
/* Global memory */
ctlr->r_grid = PCIR_BAR(0);
if (!(ctlr->r_gmem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&ctlr->r_grid, RF_ACTIVE)))
return (ENXIO);
ctlr->gctl = ATA_INL(ctlr->r_gmem, SIIS_GCTL);
/* Channels memory */
ctlr->r_rid = PCIR_BAR(2);
if (!(ctlr->r_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&ctlr->r_rid, RF_ACTIVE)))
return (ENXIO);
/* Setup our own memory management for channels. */
ctlr->sc_iomem.rm_start = rman_get_start(ctlr->r_mem);
ctlr->sc_iomem.rm_end = rman_get_end(ctlr->r_mem);
ctlr->sc_iomem.rm_type = RMAN_ARRAY;
ctlr->sc_iomem.rm_descr = "I/O memory addresses";
if ((error = rman_init(&ctlr->sc_iomem)) != 0) {
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_rid, ctlr->r_mem);
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_grid, ctlr->r_gmem);
return (error);
}
if ((error = rman_manage_region(&ctlr->sc_iomem,
rman_get_start(ctlr->r_mem), rman_get_end(ctlr->r_mem))) != 0) {
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_rid, ctlr->r_mem);
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_grid, ctlr->r_gmem);
rman_fini(&ctlr->sc_iomem);
return (error);
}
pci_enable_busmaster(dev);
/* Reset controller */
siis_resume(dev);
/* Number of HW channels */
ctlr->channels = siis_ids[i].ports;
/* Setup interrupts. */
if (siis_setup_interrupt(dev)) {
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_rid, ctlr->r_mem);
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_grid, ctlr->r_gmem);
rman_fini(&ctlr->sc_iomem);
return ENXIO;
}
/* Attach all channels on this controller */
for (unit = 0; unit < ctlr->channels; unit++) {
child = device_add_child(dev, "siisch", -1);
if (child == NULL)
device_printf(dev, "failed to add channel device\n");
else
device_set_ivars(child, (void *)(intptr_t)unit);
}
bus_generic_attach(dev);
return 0;
}
static int
siis_detach(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(dev);
/* Detach & delete all children */
device_delete_children(dev);
/* Free interrupts. */
if (ctlr->irq.r_irq) {
bus_teardown_intr(dev, ctlr->irq.r_irq,
ctlr->irq.handle);
bus_release_resource(dev, SYS_RES_IRQ,
ctlr->irq.r_irq_rid, ctlr->irq.r_irq);
}
pci_release_msi(dev);
/* Free memory. */
rman_fini(&ctlr->sc_iomem);
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_rid, ctlr->r_mem);
bus_release_resource(dev, SYS_RES_MEMORY, ctlr->r_grid, ctlr->r_gmem);
return (0);
}
static int
siis_suspend(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(dev);
bus_generic_suspend(dev);
/* Put controller into reset state. */
ctlr->gctl |= SIIS_GCTL_GRESET;
ATA_OUTL(ctlr->r_gmem, SIIS_GCTL, ctlr->gctl);
return 0;
}
static int
siis_resume(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(dev);
/* Set PCIe max read request size to at least 1024 bytes */
if (pci_get_max_read_req(dev) < 1024)
pci_set_max_read_req(dev, 1024);
/* Put controller into reset state. */
ctlr->gctl |= SIIS_GCTL_GRESET;
ATA_OUTL(ctlr->r_gmem, SIIS_GCTL, ctlr->gctl);
DELAY(10000);
/* Get controller out of reset state and enable port interrupts. */
ctlr->gctl &= ~(SIIS_GCTL_GRESET | SIIS_GCTL_I2C_IE);
ctlr->gctl |= 0x0000000f;
ATA_OUTL(ctlr->r_gmem, SIIS_GCTL, ctlr->gctl);
return (bus_generic_resume(dev));
}
static int
siis_setup_interrupt(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(dev);
int msi = ctlr->quirks & SIIS_Q_NOMSI ? 0 : 1;
/* Process hints. */
resource_int_value(device_get_name(dev),
device_get_unit(dev), "msi", &msi);
if (msi < 0)
msi = 0;
else if (msi > 0)
msi = min(1, pci_msi_count(dev));
/* Allocate MSI if needed/present. */
if (msi && pci_alloc_msi(dev, &msi) != 0)
msi = 0;
/* Allocate all IRQs. */
ctlr->irq.r_irq_rid = msi ? 1 : 0;
if (!(ctlr->irq.r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&ctlr->irq.r_irq_rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "unable to map interrupt\n");
return ENXIO;
}
if ((bus_setup_intr(dev, ctlr->irq.r_irq, ATA_INTR_FLAGS, NULL,
siis_intr, ctlr, &ctlr->irq.handle))) {
/* SOS XXX release r_irq */
device_printf(dev, "unable to setup interrupt\n");
return ENXIO;
}
return (0);
}
/*
* Common case interrupt handler.
*/
static void
siis_intr(void *data)
{
struct siis_controller *ctlr = (struct siis_controller *)data;
u_int32_t is;
void *arg;
int unit;
is = ATA_INL(ctlr->r_gmem, SIIS_IS);
for (unit = 0; unit < ctlr->channels; unit++) {
if ((is & SIIS_IS_PORT(unit)) != 0 &&
(arg = ctlr->interrupt[unit].argument)) {
ctlr->interrupt[unit].function(arg);
}
}
/* Acknowledge interrupt, if MSI enabled. */
if (ctlr->irq.r_irq_rid) {
ATA_OUTL(ctlr->r_gmem, SIIS_GCTL,
ctlr->gctl | SIIS_GCTL_MSIACK);
}
}
static struct resource *
siis_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct siis_controller *ctlr = device_get_softc(dev);
int unit = ((struct siis_channel *)device_get_softc(child))->unit;
struct resource *res = NULL;
int offset = unit << 13;
rman_res_t st;
switch (type) {
case SYS_RES_MEMORY:
st = rman_get_start(ctlr->r_mem);
res = rman_reserve_resource(&ctlr->sc_iomem, st + offset,
st + offset + 0x2000, 0x2000, RF_ACTIVE, child);
if (res) {
bus_space_handle_t bsh;
bus_space_tag_t bst;
bsh = rman_get_bushandle(ctlr->r_mem);
bst = rman_get_bustag(ctlr->r_mem);
bus_space_subregion(bst, bsh, offset, 0x2000, &bsh);
rman_set_bushandle(res, bsh);
rman_set_bustag(res, bst);
}
break;
case SYS_RES_IRQ:
if (*rid == ATA_IRQ_RID)
res = ctlr->irq.r_irq;
break;
}
return (res);
}
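/*
* Illustrative note: each channel is handed an 8 KiB (0x2000) window of
* the controller's BAR(2) space at offset unit << 13, so channel 0 covers
* 0x0000-0x1fff and channel 2 covers 0x4000-0x5fff.
*/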
static int
siis_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
switch (type) {
case SYS_RES_MEMORY:
rman_release_resource(r);
return (0);
case SYS_RES_IRQ:
if (rid != ATA_IRQ_RID)
return ENOENT;
return (0);
}
return (EINVAL);
}
static int
siis_setup_intr(device_t dev, device_t child, struct resource *irq,
int flags, driver_filter_t *filter, driver_intr_t *function,
void *argument, void **cookiep)
{
struct siis_controller *ctlr = device_get_softc(dev);
int unit = (intptr_t)device_get_ivars(child);
if (filter != NULL) {
printf("siis.c: we cannot use a filter here\n");
return (EINVAL);
}
ctlr->interrupt[unit].function = function;
ctlr->interrupt[unit].argument = argument;
return (0);
}
static int
siis_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
struct siis_controller *ctlr = device_get_softc(dev);
int unit = (intptr_t)device_get_ivars(child);
ctlr->interrupt[unit].function = NULL;
ctlr->interrupt[unit].argument = NULL;
return (0);
}
static int
siis_print_child(device_t dev, device_t child)
{
int retval;
retval = bus_print_child_header(dev, child);
retval += printf(" at channel %d",
(int)(intptr_t)device_get_ivars(child));
retval += bus_print_child_footer(dev, child);
return (retval);
}
static int
siis_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen)
{
snprintf(buf, buflen, "channel=%d",
(int)(intptr_t)device_get_ivars(child));
return (0);
}
static bus_dma_tag_t
siis_get_dma_tag(device_t bus, device_t child)
{
return (bus_get_dma_tag(bus));
}
devclass_t siis_devclass;
static device_method_t siis_methods[] = {
DEVMETHOD(device_probe, siis_probe),
DEVMETHOD(device_attach, siis_attach),
DEVMETHOD(device_detach, siis_detach),
DEVMETHOD(device_suspend, siis_suspend),
DEVMETHOD(device_resume, siis_resume),
DEVMETHOD(bus_print_child, siis_print_child),
DEVMETHOD(bus_alloc_resource, siis_alloc_resource),
DEVMETHOD(bus_release_resource, siis_release_resource),
DEVMETHOD(bus_setup_intr, siis_setup_intr),
DEVMETHOD(bus_teardown_intr,siis_teardown_intr),
DEVMETHOD(bus_child_location_str, siis_child_location_str),
DEVMETHOD(bus_get_dma_tag, siis_get_dma_tag),
{ 0, 0 }
};
static driver_t siis_driver = {
"siis",
siis_methods,
sizeof(struct siis_controller)
};
DRIVER_MODULE(siis, pci, siis_driver, siis_devclass, 0, 0);
MODULE_VERSION(siis, 1);
MODULE_DEPEND(siis, cam, 1, 1, 1);
static int
siis_ch_probe(device_t dev)
{
device_set_desc_copy(dev, "SIIS channel");
return (BUS_PROBE_DEFAULT);
}
static int
siis_ch_attach(device_t dev)
{
struct siis_controller *ctlr = device_get_softc(device_get_parent(dev));
struct siis_channel *ch = device_get_softc(dev);
struct cam_devq *devq;
int rid, error, i, sata_rev = 0;
ch->dev = dev;
ch->unit = (intptr_t)device_get_ivars(dev);
ch->quirks = ctlr->quirks;
ch->pm_level = 0;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "pm_level", &ch->pm_level);
resource_int_value(device_get_name(dev),
device_get_unit(dev), "sata_rev", &sata_rev);
for (i = 0; i < 16; i++) {
ch->user[i].revision = sata_rev;
ch->user[i].mode = 0;
ch->user[i].bytecount = 8192;
ch->user[i].tags = SIIS_MAX_SLOTS;
ch->curr[i] = ch->user[i];
if (ch->pm_level)
ch->user[i].caps = CTS_SATA_CAPS_H_PMREQ;
ch->user[i].caps |= CTS_SATA_CAPS_H_AN;
}
mtx_init(&ch->mtx, "SIIS channel lock", NULL, MTX_DEF);
rid = ch->unit;
if (!(ch->r_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE)))
return (ENXIO);
siis_dmainit(dev);
siis_slotsalloc(dev);
siis_ch_init(dev);
mtx_lock(&ch->mtx);
rid = ATA_IRQ_RID;
if (!(ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "Unable to map interrupt\n");
error = ENXIO;
goto err0;
}
if ((bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
siis_ch_intr_locked, dev, &ch->ih))) {
device_printf(dev, "Unable to setup interrupt\n");
error = ENXIO;
goto err1;
}
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(SIIS_MAX_SLOTS);
if (devq == NULL) {
device_printf(dev, "Unable to allocate simq\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
ch->sim = cam_sim_alloc(siisaction, siispoll, "siisch", ch,
device_get_unit(dev), &ch->mtx, 2, SIIS_MAX_SLOTS, devq);
if (ch->sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "unable to allocate sim\n");
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "unable to create path\n");
error = ENXIO;
goto err3;
}
mtx_unlock(&ch->mtx);
ch->led = led_create(siis_ch_led, dev, device_get_nameunit(dev));
return (0);
err3:
xpt_bus_deregister(cam_sim_path(ch->sim));
err2:
cam_sim_free(ch->sim, /*free_devq*/TRUE);
err1:
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
err0:
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_unlock(&ch->mtx);
mtx_destroy(&ch->mtx);
return (error);
}
static int
siis_ch_detach(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
led_destroy(ch->led);
mtx_lock(&ch->mtx);
xpt_async(AC_LOST_DEVICE, ch->path, NULL);
xpt_free_path(ch->path);
xpt_bus_deregister(cam_sim_path(ch->sim));
cam_sim_free(ch->sim, /*free_devq*/TRUE);
mtx_unlock(&ch->mtx);
bus_teardown_intr(dev, ch->r_irq, ch->ih);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
siis_ch_deinit(dev);
siis_slotsfree(dev);
siis_dmafini(dev);
bus_release_resource(dev, SYS_RES_MEMORY, ch->unit, ch->r_mem);
mtx_destroy(&ch->mtx);
return (0);
}
static int
siis_ch_init(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
/* Get port out of reset state. */
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_PORT_RESET);
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_32BIT);
if (ch->pm_present)
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PME);
else
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_PME);
/* Enable port interrupts */
ATA_OUTL(ch->r_mem, SIIS_P_IESET, SIIS_P_IX_ENABLED);
return (0);
}
static int
siis_ch_deinit(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
/* Put port into reset state. */
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PORT_RESET);
return (0);
}
static int
siis_ch_suspend(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_freeze_simq(ch->sim, 1);
while (ch->oslots)
msleep(ch, &ch->mtx, PRIBIO, "siissusp", hz/100);
siis_ch_deinit(dev);
mtx_unlock(&ch->mtx);
return (0);
}
static int
siis_ch_resume(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
siis_ch_init(dev);
siis_reset(dev);
xpt_release_simq(ch->sim, TRUE);
mtx_unlock(&ch->mtx);
return (0);
}
devclass_t siisch_devclass;
static device_method_t siisch_methods[] = {
DEVMETHOD(device_probe, siis_ch_probe),
DEVMETHOD(device_attach, siis_ch_attach),
DEVMETHOD(device_detach, siis_ch_detach),
DEVMETHOD(device_suspend, siis_ch_suspend),
DEVMETHOD(device_resume, siis_ch_resume),
{ 0, 0 }
};
static driver_t siisch_driver = {
"siisch",
siisch_methods,
sizeof(struct siis_channel)
};
DRIVER_MODULE(siisch, siis, siisch_driver, siis_devclass, 0, 0);
static void
siis_ch_led(void *priv, int onoff)
{
device_t dev;
struct siis_channel *ch;
dev = (device_t)priv;
ch = device_get_softc(dev);
if (onoff == 0)
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_LED_ON);
else
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_LED_ON);
}
struct siis_dc_cb_args {
bus_addr_t maddr;
int error;
};
static void
siis_dmainit(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
struct siis_dc_cb_args dcba;
/* Command area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 1024, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, SIIS_WORK_SIZE, 1, SIIS_WORK_SIZE,
0, NULL, NULL, &ch->dma.work_tag))
goto error;
if (bus_dmamem_alloc(ch->dma.work_tag, (void **)&ch->dma.work, 0,
&ch->dma.work_map))
goto error;
if (bus_dmamap_load(ch->dma.work_tag, ch->dma.work_map, ch->dma.work,
SIIS_WORK_SIZE, siis_dmasetupc_cb, &dcba, 0) || dcba.error) {
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
goto error;
}
ch->dma.work_bus = dcba.maddr;
/* Data area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL,
- SIIS_SG_ENTRIES * PAGE_SIZE * SIIS_MAX_SLOTS,
- SIIS_SG_ENTRIES, 0xFFFFFFFF,
+ SIIS_SG_ENTRIES * PAGE_SIZE, SIIS_SG_ENTRIES, 0xFFFFFFFF,
0, busdma_lock_mutex, &ch->mtx, &ch->dma.data_tag)) {
goto error;
}
return;
error:
device_printf(dev, "WARNING - DMA initialization failed\n");
siis_dmafini(dev);
}
static void
siis_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct siis_dc_cb_args *dcba = (struct siis_dc_cb_args *)xsc;
if (!(dcba->error = error))
dcba->maddr = segs[0].ds_addr;
}
static void
siis_dmafini(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
if (ch->dma.data_tag) {
bus_dma_tag_destroy(ch->dma.data_tag);
ch->dma.data_tag = NULL;
}
if (ch->dma.work_bus) {
bus_dmamap_unload(ch->dma.work_tag, ch->dma.work_map);
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
ch->dma.work_bus = 0;
ch->dma.work_map = NULL;
ch->dma.work = NULL;
}
if (ch->dma.work_tag) {
bus_dma_tag_destroy(ch->dma.work_tag);
ch->dma.work_tag = NULL;
}
}
static void
siis_slotsalloc(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
/* Alloc and setup command/dma slots */
bzero(ch->slot, sizeof(ch->slot));
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
struct siis_slot *slot = &ch->slot[i];
slot->dev = dev;
slot->slot = i;
slot->state = SIIS_SLOT_EMPTY;
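+ /* Offset of this slot's PRB within the channel's DMA work area. */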
+ slot->prb_offset = SIIS_PRB_SIZE * i;
slot->ccb = NULL;
callout_init_mtx(&slot->timeout, &ch->mtx, 0);
if (bus_dmamap_create(ch->dma.data_tag, 0, &slot->dma.data_map))
device_printf(ch->dev, "FAILURE - create data_map\n");
}
}
static void
siis_slotsfree(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
/* Free all dma slots */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
struct siis_slot *slot = &ch->slot[i];
callout_drain(&slot->timeout);
if (slot->dma.data_map) {
bus_dmamap_destroy(ch->dma.data_tag, slot->dma.data_map);
slot->dma.data_map = NULL;
}
}
}
static void
siis_notify_events(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
struct cam_path *dpath;
u_int32_t status;
int i;
if (ch->quirks & SIIS_Q_SNTF) {
status = ATA_INL(ch->r_mem, SIIS_P_SNTF);
ATA_OUTL(ch->r_mem, SIIS_P_SNTF, status);
} else {
/*
* Without SNTF we have no idea which device sent the notification.
* If a PMP is connected, assume it was the PMP; otherwise assume the
* directly attached device.
*/
status = (ch->pm_present) ? 0x8000 : 0x0001;
}
if (bootverbose)
device_printf(dev, "SNTF 0x%04x\n", status);
for (i = 0; i < 16; i++) {
if ((status & (1 << i)) == 0)
continue;
if (xpt_create_path(&dpath, NULL,
xpt_path_path_id(ch->path), i, 0) == CAM_REQ_CMP) {
xpt_async(AC_SCSI_AEN, dpath, NULL);
xpt_free_path(dpath);
}
}
}
static void
siis_phy_check_events(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
/* If we have a connection event, deal with it */
if (ch->pm_level == 0) {
u_int32_t status = ATA_INL(ch->r_mem, SIIS_P_SSTS);
union ccb *ccb;
if (bootverbose) {
if (((status & ATA_SS_DET_MASK) == ATA_SS_DET_PHY_ONLINE) &&
((status & ATA_SS_SPD_MASK) != ATA_SS_SPD_NO_SPEED) &&
((status & ATA_SS_IPM_MASK) == ATA_SS_IPM_ACTIVE)) {
device_printf(dev, "CONNECT requested\n");
} else
device_printf(dev, "DISCONNECT requested\n");
}
siis_reset(dev);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return;
if (xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
}
}
static void
siis_ch_intr_locked(void *data)
{
device_t dev = (device_t)data;
struct siis_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
siis_ch_intr(data);
mtx_unlock(&ch->mtx);
}
static void
siis_ch_intr(void *data)
{
device_t dev = (device_t)data;
struct siis_channel *ch = device_get_softc(dev);
uint32_t istatus, sstatus, ctx, estatus, ok, err = 0;
enum siis_err_type et;
int i, ccs, port, tslots;
mtx_assert(&ch->mtx, MA_OWNED);
/* Read command statuses. */
sstatus = ATA_INL(ch->r_mem, SIIS_P_SS);
ok = ch->rslots & ~sstatus;
/* Complete all successful commands. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if ((ok >> i) & 1)
siis_end_transaction(&ch->slot[i], SIIS_ERR_NONE);
}
/* Do we have any other events? */
if ((sstatus & SIIS_P_SS_ATTN) == 0)
return;
/* Read and clear interrupt statuses. */
istatus = ATA_INL(ch->r_mem, SIIS_P_IS) &
(0xFFFF & ~SIIS_P_IX_COMMCOMP);
ATA_OUTL(ch->r_mem, SIIS_P_IS, istatus);
/* Process PHY events */
if (istatus & SIIS_P_IX_PHYRDYCHG)
siis_phy_check_events(dev);
/* Process NOTIFY events */
if (istatus & SIIS_P_IX_SDBN)
siis_notify_events(dev);
/* Process command errors */
if (istatus & SIIS_P_IX_COMMERR) {
estatus = ATA_INL(ch->r_mem, SIIS_P_CMDERR);
ctx = ATA_INL(ch->r_mem, SIIS_P_CTX);
ccs = (ctx & SIIS_P_CTX_SLOT) >> SIIS_P_CTX_SLOT_SHIFT;
port = (ctx & SIIS_P_CTX_PMP) >> SIIS_P_CTX_PMP_SHIFT;
err = ch->rslots & sstatus;
//device_printf(dev, "%s ERROR ss %08x is %08x rs %08x es %d act %d port %d serr %08x\n",
// __func__, sstatus, istatus, ch->rslots, estatus, ccs, port,
// ATA_INL(ch->r_mem, SIIS_P_SERR));
if (!ch->recoverycmd && !ch->recovery) {
xpt_freeze_simq(ch->sim, ch->numrslots);
ch->recovery = 1;
}
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status &= ~CAM_STATUS_MASK;
fccb->ccb_h.status |= CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
xpt_done(fccb);
}
if (estatus == SIIS_P_CMDERR_DEV ||
estatus == SIIS_P_CMDERR_SDB ||
estatus == SIIS_P_CMDERR_DATAFIS) {
tslots = ch->numtslots[port];
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
/* XXX: requests in loading state. */
if (((ch->rslots >> i) & 1) == 0)
continue;
if (ch->slot[i].ccb->ccb_h.target_id != port)
continue;
if (tslots == 0) {
/* Untagged operation. */
if (i == ccs)
et = SIIS_ERR_TFE;
else
et = SIIS_ERR_INNOCENT;
} else {
/* Tagged operation. */
et = SIIS_ERR_NCQ;
}
siis_end_transaction(&ch->slot[i], et);
}
/*
* We can't reinit port if there are some other
* commands active, use resume to complete them.
*/
if (ch->rslots != 0 && !ch->recoverycmd)
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_RESUME);
} else {
if (estatus == SIIS_P_CMDERR_SENDFIS ||
estatus == SIIS_P_CMDERR_INCSTATE ||
estatus == SIIS_P_CMDERR_PPE ||
estatus == SIIS_P_CMDERR_SERVICE) {
et = SIIS_ERR_SATA;
} else
et = SIIS_ERR_INVALID;
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
/* XXX: requests in loading state. */
if (((ch->rslots >> i) & 1) == 0)
continue;
siis_end_transaction(&ch->slot[i], et);
}
}
}
}
/* Must be called with channel locked. */
static int
siis_check_collision(device_t dev, union ccb *ccb)
{
struct siis_channel *ch = device_get_softc(dev);
mtx_assert(&ch->mtx, MA_OWNED);
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
/* Tagged command while we have no supported tag free. */
if (((~ch->oslots) & (0x7fffffff >> (31 -
ch->curr[ccb->ccb_h.target_id].tags))) == 0)
return (1);
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT))) {
/* Atomic command while anything active. */
if (ch->numrslots != 0)
return (1);
}
/* We have some atomic command running. */
if (ch->aslots != 0)
return (1);
return (0);
}
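/*
* Worked example (illustrative): with tags = 5 the usable-slot mask
* 0x7fffffff >> (31 - 5) is 0x1f, so a tagged command collides once slots
* 0-4 are all busy in ch->oslots; with the full 31 tags the mask is
* 0x7fffffff.
*/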
/* Must be called with channel locked. */
static void
siis_begin_transaction(device_t dev, union ccb *ccb)
{
struct siis_channel *ch = device_get_softc(dev);
struct siis_slot *slot;
int tag, tags;
mtx_assert(&ch->mtx, MA_OWNED);
/* Choose empty slot. */
tags = SIIS_MAX_SLOTS;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA))
tags = ch->curr[ccb->ccb_h.target_id].tags;
tag = fls((~ch->oslots) & (0x7fffffff >> (31 - tags))) - 1;
/* Occupy chosen slot. */
slot = &ch->slot[tag];
slot->ccb = ccb;
/* Update channel stats. */
ch->oslots |= (1 << slot->slot);
ch->numrslots++;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots[ccb->ccb_h.target_id]++;
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT)))
ch->aslots |= (1 << slot->slot);
slot->dma.nsegs = 0;
/* If request moves data, setup and load SG list */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
slot->state = SIIS_SLOT_LOADING;
bus_dmamap_load_ccb(ch->dma.data_tag, slot->dma.data_map,
ccb, siis_dmasetprd, slot, 0);
} else
siis_execute_transaction(slot);
}
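/*
* Illustrative note: fls() returns the highest set bit, so with all 31
* tags usable and no slots occupied the first command lands in slot 30,
* and slot numbers count downward as the channel fills.
*/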
/* Locked by busdma engine. */
static void
siis_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct siis_slot *slot = arg;
struct siis_channel *ch = device_get_softc(slot->dev);
struct siis_cmd *ctp;
struct siis_dma_prd *prd;
int i;
mtx_assert(&ch->mtx, MA_OWNED);
if (error) {
device_printf(slot->dev, "DMA load error\n");
if (!ch->recoverycmd)
xpt_freeze_simq(ch->sim, 1);
siis_end_transaction(slot, SIIS_ERR_INVALID);
return;
}
KASSERT(nsegs <= SIIS_SG_ENTRIES, ("too many DMA segment entries\n"));
slot->dma.nsegs = nsegs;
if (nsegs != 0) {
/* Get a piece of the workspace for this request */
- ctp = (struct siis_cmd *)(ch->dma.work + SIIS_CT_OFFSET +
- (SIIS_CT_SIZE * slot->slot));
+ ctp = (struct siis_cmd *)(ch->dma.work + slot->prb_offset);
/* Fill S/G table */
if (slot->ccb->ccb_h.func_code == XPT_ATA_IO)
prd = &ctp->u.ata.prd[0];
else
prd = &ctp->u.atapi.prd[0];
for (i = 0; i < nsegs; i++) {
prd[i].dba = htole64(segs[i].ds_addr);
prd[i].dbc = htole32(segs[i].ds_len);
prd[i].control = 0;
}
prd[nsegs - 1].control = htole32(SIIS_PRD_TRM);
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
((slot->ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE));
}
siis_execute_transaction(slot);
}
/* Must be called with channel locked. */
static void
siis_execute_transaction(struct siis_slot *slot)
{
device_t dev = slot->dev;
struct siis_channel *ch = device_get_softc(dev);
struct siis_cmd *ctp;
union ccb *ccb = slot->ccb;
u_int64_t prb_bus;
mtx_assert(&ch->mtx, MA_OWNED);
/* Get a piece of the workspace for this request */
- ctp = (struct siis_cmd *)
- (ch->dma.work + SIIS_CT_OFFSET + (SIIS_CT_SIZE * slot->slot));
+ ctp = (struct siis_cmd *)(ch->dma.work + slot->prb_offset);
ctp->control = 0;
ctp->protocol_override = 0;
ctp->transfer_count = 0;
/* Special handling for Soft Reset command. */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
if (ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) {
ctp->control |= htole16(SIIS_PRB_SOFT_RESET);
} else {
ctp->control |= htole16(SIIS_PRB_PROTOCOL_OVERRIDE);
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
ctp->protocol_override |=
htole16(SIIS_PRB_PROTO_NCQ);
}
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
ctp->protocol_override |=
htole16(SIIS_PRB_PROTO_READ);
} else
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
ctp->protocol_override |=
htole16(SIIS_PRB_PROTO_WRITE);
}
}
} else if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
ctp->control |= htole16(SIIS_PRB_PACKET_READ);
else
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
ctp->control |= htole16(SIIS_PRB_PACKET_WRITE);
}
/* Special handling for Soft Reset command. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET)) {
/* Kick controller into sane state */
siis_portinit(dev);
}
/* Setup the FIS for this request */
if (!siis_setup_fis(dev, ctp, ccb, slot->slot)) {
device_printf(ch->dev, "Setting up SATA FIS failed\n");
if (!ch->recoverycmd)
xpt_freeze_simq(ch->sim, 1);
siis_end_transaction(slot, SIIS_ERR_INVALID);
return;
}
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_PREWRITE);
/* Issue command to the controller. */
slot->state = SIIS_SLOT_RUNNING;
ch->rslots |= (1 << slot->slot);
- prb_bus = ch->dma.work_bus +
- SIIS_CT_OFFSET + (SIIS_CT_SIZE * slot->slot);
+ prb_bus = ch->dma.work_bus + slot->prb_offset;
ATA_OUTL(ch->r_mem, SIIS_P_CACTL(slot->slot), prb_bus);
ATA_OUTL(ch->r_mem, SIIS_P_CACTH(slot->slot), prb_bus >> 32);
/* Start command execution timeout */
callout_reset_sbt(&slot->timeout, SBT_1MS * ccb->ccb_h.timeout, 0,
siis_timeout, slot, 0);
return;
}
/* Must be called with channel locked. */
static void
siis_process_timeout(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
mtx_assert(&ch->mtx, MA_OWNED);
if (!ch->recoverycmd && !ch->recovery) {
xpt_freeze_simq(ch->sim, ch->numrslots);
ch->recovery = 1;
}
/* Handle the rest of commands. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < SIIS_SLOT_RUNNING)
continue;
siis_end_transaction(&ch->slot[i], SIIS_ERR_TIMEOUT);
}
}
/* Must be called with channel locked. */
static void
siis_rearm_timeout(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
mtx_assert(&ch->mtx, MA_OWNED);
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
struct siis_slot *slot = &ch->slot[i];
/* Do we have a running request on slot? */
if (slot->state < SIIS_SLOT_RUNNING)
continue;
if ((ch->toslots & (1 << i)) == 0)
continue;
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout, 0,
siis_timeout, slot, 0);
}
}
/* Locked by callout mechanism. */
static void
siis_timeout(void *arg)
{
struct siis_slot *slot = arg;
device_t dev = slot->dev;
struct siis_channel *ch = device_get_softc(dev);
union ccb *ccb = slot->ccb;
mtx_assert(&ch->mtx, MA_OWNED);
/* Check for stale timeout. */
if (slot->state < SIIS_SLOT_RUNNING)
return;
/* Handle soft-reset timeouts without doing hard-reset. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET)) {
xpt_freeze_simq(ch->sim, ch->numrslots);
siis_end_transaction(slot, SIIS_ERR_TFE);
return;
}
device_printf(dev, "Timeout on slot %d\n", slot->slot);
device_printf(dev, "%s is %08x ss %08x rs %08x es %08x sts %08x serr %08x\n",
__func__, ATA_INL(ch->r_mem, SIIS_P_IS),
ATA_INL(ch->r_mem, SIIS_P_SS), ch->rslots,
ATA_INL(ch->r_mem, SIIS_P_CMDERR), ATA_INL(ch->r_mem, SIIS_P_STS),
ATA_INL(ch->r_mem, SIIS_P_SERR));
if (ch->toslots == 0)
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
if ((ch->rslots & ~ch->toslots) == 0)
siis_process_timeout(dev);
else
device_printf(dev, " ... waiting for slots %08x\n",
ch->rslots & ~ch->toslots);
}
/* Must be called with channel locked. */
static void
siis_end_transaction(struct siis_slot *slot, enum siis_err_type et)
{
device_t dev = slot->dev;
struct siis_channel *ch = device_get_softc(dev);
union ccb *ccb = slot->ccb;
int lastto;
mtx_assert(&ch->mtx, MA_OWNED);
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_POSTWRITE);
/*
* Read the result registers into the result struct. They may be
* incorrect if several commands finished at the same time, so read
* them only when we are sure or have to.
*/
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
struct ata_res *res = &ccb->ataio.res;
if ((et == SIIS_ERR_TFE) ||
(ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)) {
int offs = SIIS_P_LRAM_SLOT(slot->slot) + 8;
res->status = ATA_INB(ch->r_mem, offs + 2);
res->error = ATA_INB(ch->r_mem, offs + 3);
res->lba_low = ATA_INB(ch->r_mem, offs + 4);
res->lba_mid = ATA_INB(ch->r_mem, offs + 5);
res->lba_high = ATA_INB(ch->r_mem, offs + 6);
res->device = ATA_INB(ch->r_mem, offs + 7);
res->lba_low_exp = ATA_INB(ch->r_mem, offs + 8);
res->lba_mid_exp = ATA_INB(ch->r_mem, offs + 9);
res->lba_high_exp = ATA_INB(ch->r_mem, offs + 10);
res->sector_count = ATA_INB(ch->r_mem, offs + 12);
res->sector_count_exp = ATA_INB(ch->r_mem, offs + 13);
} else
bzero(res, sizeof(*res));
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN &&
ch->numrslots == 1) {
ccb->ataio.resid = ccb->ataio.dxfer_len -
ATA_INL(ch->r_mem, SIIS_P_LRAM_SLOT(slot->slot) + 4);
}
} else {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN &&
ch->numrslots == 1) {
ccb->csio.resid = ccb->csio.dxfer_len -
ATA_INL(ch->r_mem, SIIS_P_LRAM_SLOT(slot->slot) + 4);
}
}
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
(ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(ch->dma.data_tag, slot->dma.data_map);
}
/* Set proper result status. */
if (et != SIIS_ERR_NONE || ch->recovery) {
ch->eslots |= (1 << slot->slot);
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
/* In case of error, freeze device for proper recovery. */
if (et != SIIS_ERR_NONE && (!ch->recoverycmd) &&
!(ccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status |= CAM_DEV_QFRZN;
}
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
switch (et) {
case SIIS_ERR_NONE:
ccb->ccb_h.status |= CAM_REQ_CMP;
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.scsi_status = SCSI_STATUS_OK;
break;
case SIIS_ERR_INVALID:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_INVALID;
break;
case SIIS_ERR_INNOCENT:
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
break;
case SIIS_ERR_TFE:
case SIIS_ERR_NCQ:
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else {
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
}
break;
case SIIS_ERR_SATA:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_UNCOR_PARITY;
break;
case SIIS_ERR_TIMEOUT:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
break;
default:
ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
}
/* Free slot. */
ch->oslots &= ~(1 << slot->slot);
ch->rslots &= ~(1 << slot->slot);
ch->aslots &= ~(1 << slot->slot);
slot->state = SIIS_SLOT_EMPTY;
slot->ccb = NULL;
/* Update channel stats. */
ch->numrslots--;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots[ccb->ccb_h.target_id]--;
}
/* Cancel timeout state if request completed normally. */
if (et != SIIS_ERR_TIMEOUT) {
lastto = (ch->toslots == (1 << slot->slot));
ch->toslots &= ~(1 << slot->slot);
if (lastto)
xpt_release_simq(ch->sim, TRUE);
}
/* If it was our READ LOG command - process it. */
if (ccb->ccb_h.recovery_type == RECOVERY_READ_LOG) {
siis_process_read_log(dev, ccb);
/* If it was our REQUEST SENSE command - process it. */
} else if (ccb->ccb_h.recovery_type == RECOVERY_REQUEST_SENSE) {
siis_process_request_sense(dev, ccb);
/* If it was NCQ or ATAPI command error, put result on hold. */
} else if (et == SIIS_ERR_NCQ ||
((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)) {
ch->hold[slot->slot] = ccb;
ch->numhslots++;
} else
xpt_done(ccb);
/* If we have no other active commands, ... */
if (ch->rslots == 0) {
/* if there were timeouts or fatal error - reset port. */
if (ch->toslots != 0 || ch->fatalerr) {
siis_reset(dev);
} else {
/* if we have slots in error, we can reinit port. */
if (ch->eslots != 0)
siis_portinit(dev);
/* if there are commands on hold, we can do recovery. */
if (!ch->recoverycmd && ch->numhslots)
siis_issue_recovery(dev);
}
/* If all the rest of the commands are in timeout - rearm their timeouts. */
} else if ((ch->rslots & ~ch->toslots) == 0 &&
et != SIIS_ERR_TIMEOUT)
siis_rearm_timeout(dev);
/* Unfreeze frozen command. */
if (ch->frozen && !siis_check_collision(dev, ch->frozen)) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
siis_begin_transaction(dev, fccb);
xpt_release_simq(ch->sim, TRUE);
}
}
static void
siis_issue_recovery(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
union ccb *ccb;
struct ccb_ataio *ataio;
struct ccb_scsiio *csio;
int i;
/* Find some held command. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if (ch->hold[i])
break;
}
if (i == SIIS_MAX_SLOTS)
return;
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
device_printf(dev, "Unable to allocate recovery command\n");
completeall:
/* We can't do anything -- complete held commands. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if (ch->hold[i] == NULL)
continue;
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_RESRC_UNAVAIL;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
siis_reset(dev);
return;
}
ccb->ccb_h = ch->hold[i]->ccb_h; /* Reuse old header. */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
/* READ LOG */
ccb->ccb_h.recovery_type = RECOVERY_READ_LOG;
ccb->ccb_h.func_code = XPT_ATA_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
ataio = &ccb->ataio;
ataio->data_ptr = malloc(512, M_SIIS, M_NOWAIT);
if (ataio->data_ptr == NULL) {
xpt_free_ccb(ccb);
device_printf(dev,
"Unable to allocate memory for READ LOG command\n");
goto completeall;
}
ataio->dxfer_len = 512;
bzero(&ataio->cmd, sizeof(ataio->cmd));
ataio->cmd.flags = CAM_ATAIO_48BIT;
ataio->cmd.command = 0x2F; /* READ LOG EXT */
ataio->cmd.sector_count = 1;
ataio->cmd.sector_count_exp = 0;
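/* Log address 0x10 is the SATA NCQ Command Error log. */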
ataio->cmd.lba_low = 0x10;
ataio->cmd.lba_mid = 0;
ataio->cmd.lba_mid_exp = 0;
} else {
/* REQUEST SENSE */
ccb->ccb_h.recovery_type = RECOVERY_REQUEST_SENSE;
ccb->ccb_h.recovery_slot = i;
ccb->ccb_h.func_code = XPT_SCSI_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.status = 0;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
csio = &ccb->csio;
csio->data_ptr = (void *)&ch->hold[i]->csio.sense_data;
csio->dxfer_len = ch->hold[i]->csio.sense_len;
csio->cdb_len = 6;
bzero(&csio->cdb_io, sizeof(csio->cdb_io));
csio->cdb_io.cdb_bytes[0] = 0x03;
csio->cdb_io.cdb_bytes[4] = csio->dxfer_len;
}
ch->recoverycmd = 1;
siis_begin_transaction(dev, ccb);
}
static void
siis_process_read_log(device_t dev, union ccb *ccb)
{
struct siis_channel *ch = device_get_softc(dev);
uint8_t *data;
struct ata_res *res;
int i;
ch->recoverycmd = 0;
data = ccb->ataio.data_ptr;
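/*
* Byte 0 of the returned NCQ Command Error log: bit 7 set means the error
* was for a non-queued command, bits 4:0 hold the tag of the failed
* queued command.
*/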
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP &&
(data[0] & 0x80) == 0) {
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.target_id != ccb->ccb_h.target_id)
continue;
if ((data[0] & 0x1F) == i) {
res = &ch->hold[i]->ataio.res;
res->status = data[2];
res->error = data[3];
res->lba_low = data[4];
res->lba_mid = data[5];
res->lba_high = data[6];
res->device = data[7];
res->lba_low_exp = data[8];
res->lba_mid_exp = data[9];
res->lba_high_exp = data[10];
res->sector_count = data[12];
res->sector_count_exp = data[13];
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_REQUEUE_REQ;
}
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
} else {
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
device_printf(dev, "Error while READ LOG EXT\n");
else if ((data[0] & 0x80) == 0) {
device_printf(dev, "Non-queued command error in READ LOG EXT\n");
}
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.target_id != ccb->ccb_h.target_id)
continue;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
}
free(ccb->ataio.data_ptr, M_SIIS);
xpt_free_ccb(ccb);
}
static void
siis_process_request_sense(device_t dev, union ccb *ccb)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
ch->recoverycmd = 0;
i = ccb->ccb_h.recovery_slot;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
ch->hold[i]->ccb_h.status |= CAM_AUTOSNS_VALID;
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
xpt_free_ccb(ccb);
}
static void
siis_portinit(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i;
ch->eslots = 0;
ch->recovery = 0;
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_RESUME);
for (i = 0; i < 16; i++) {
ATA_OUTL(ch->r_mem, SIIS_P_PMPSTS(i), 0),
ATA_OUTL(ch->r_mem, SIIS_P_PMPQACT(i), 0);
}
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PORT_INIT);
siis_wait_ready(dev, 1000);
}
static int
siis_devreset(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int timeout = 0;
uint32_t val;
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_DEV_RESET);
while (((val = ATA_INL(ch->r_mem, SIIS_P_STS)) &
SIIS_P_CTL_DEV_RESET) != 0) {
DELAY(100);
if (timeout++ > 1000) {
device_printf(dev, "device reset stuck "
"(timeout 100ms) status = %08x\n", val);
return (EBUSY);
}
}
return (0);
}
static int
siis_wait_ready(device_t dev, int t)
{
struct siis_channel *ch = device_get_softc(dev);
int timeout = 0;
uint32_t val;
while (((val = ATA_INL(ch->r_mem, SIIS_P_STS)) &
SIIS_P_CTL_READY) == 0) {
DELAY(1000);
if (timeout++ > t) {
device_printf(dev, "port is not ready (timeout %dms) "
"status = %08x\n", t, val);
return (EBUSY);
}
}
return (0);
}
static void
siis_reset(device_t dev)
{
struct siis_channel *ch = device_get_softc(dev);
int i, retry = 0, sata_rev;
uint32_t val;
xpt_freeze_simq(ch->sim, 1);
if (bootverbose)
device_printf(dev, "SIIS reset...\n");
if (!ch->recoverycmd && !ch->recovery)
xpt_freeze_simq(ch->sim, ch->numrslots);
/* Requeue frozen command. */
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status &= ~CAM_STATUS_MASK;
fccb->ccb_h.status |= CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
xpt_done(fccb);
}
/* Requeue all running commands. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < SIIS_SLOT_RUNNING)
continue;
/* XXX: Commands in loading state. */
siis_end_transaction(&ch->slot[i], SIIS_ERR_INNOCENT);
}
/* Finish all held commands as-is. */
for (i = 0; i < SIIS_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
xpt_done(ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
if (ch->toslots != 0)
xpt_release_simq(ch->sim, TRUE);
ch->eslots = 0;
ch->recovery = 0;
ch->toslots = 0;
ch->fatalerr = 0;
/* Disable port interrupts */
ATA_OUTL(ch->r_mem, SIIS_P_IECLR, 0x0000FFFF);
/* Set speed limit. */
sata_rev = ch->user[ch->pm_present ? 15 : 0].revision;
if (sata_rev == 1)
val = ATA_SC_SPD_SPEED_GEN1;
else if (sata_rev == 2)
val = ATA_SC_SPD_SPEED_GEN2;
else if (sata_rev == 3)
val = ATA_SC_SPD_SPEED_GEN3;
else
val = 0;
ATA_OUTL(ch->r_mem, SIIS_P_SCTL,
ATA_SC_DET_IDLE | val | ((ch->pm_level > 0) ? 0 :
(ATA_SC_IPM_DIS_PARTIAL | ATA_SC_IPM_DIS_SLUMBER)));
retry:
siis_devreset(dev);
/* Reset and reconnect the PHY. */
if (!siis_sata_connect(ch)) {
ch->devices = 0;
/* Enable port interrupts */
ATA_OUTL(ch->r_mem, SIIS_P_IESET, SIIS_P_IX_ENABLED);
if (bootverbose)
device_printf(dev,
"SIIS reset done: phy reset found no device\n");
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
xpt_release_simq(ch->sim, TRUE);
return;
}
/* Wait for port ready status. */
if (siis_wait_ready(dev, 1000)) {
device_printf(dev, "port ready timeout\n");
if (!retry) {
device_printf(dev, "trying full port reset ...\n");
/* Get port to the reset state. */
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PORT_RESET);
DELAY(10000);
/* Get port out of reset state. */
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_PORT_RESET);
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_32BIT);
if (ch->pm_present)
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PME);
else
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_PME);
siis_wait_ready(dev, 5000);
retry = 1;
goto retry;
}
}
ch->devices = 1;
/* Enable port interrupts */
ATA_OUTL(ch->r_mem, SIIS_P_IS, 0xFFFFFFFF);
ATA_OUTL(ch->r_mem, SIIS_P_IESET, SIIS_P_IX_ENABLED);
if (bootverbose)
device_printf(dev, "SIIS reset done: devices=%08x\n", ch->devices);
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
xpt_release_simq(ch->sim, TRUE);
}
static int
siis_setup_fis(device_t dev, struct siis_cmd *ctp, union ccb *ccb, int tag)
{
struct siis_channel *ch = device_get_softc(dev);
u_int8_t *fis = &ctp->fis[0];
bzero(fis, 24);
fis[0] = 0x27; /* host to device */
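/*
 * Per the SATA spec: 0x27 is the Register Host-to-Device FIS type, the
 * low nibble of byte 1 selects the port multiplier port, and bit 7
 * (set below for command FISes) marks a command rather than a control
 * register update.
 */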
fis[1] = (ccb->ccb_h.target_id & 0x0f);
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
fis[1] |= 0x80;
fis[2] = ATA_PACKET_CMD;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
fis[3] = ATA_F_DMA;
else {
fis[5] = ccb->csio.dxfer_len;
fis[6] = ccb->csio.dxfer_len >> 8;
}
fis[7] = ATA_D_LBA;
fis[15] = ATA_A_4BIT;
bzero(ctp->u.atapi.ccb, 16);
bcopy((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes,
ctp->u.atapi.ccb, ccb->csio.cdb_len);
} else if ((ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) == 0) {
fis[1] |= 0x80;
fis[2] = ccb->ataio.cmd.command;
fis[3] = ccb->ataio.cmd.features;
fis[4] = ccb->ataio.cmd.lba_low;
fis[5] = ccb->ataio.cmd.lba_mid;
fis[6] = ccb->ataio.cmd.lba_high;
fis[7] = ccb->ataio.cmd.device;
fis[8] = ccb->ataio.cmd.lba_low_exp;
fis[9] = ccb->ataio.cmd.lba_mid_exp;
fis[10] = ccb->ataio.cmd.lba_high_exp;
fis[11] = ccb->ataio.cmd.features_exp;
fis[12] = ccb->ataio.cmd.sector_count;
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
fis[12] &= 0x07;
fis[12] |= tag << 3;
}
fis[13] = ccb->ataio.cmd.sector_count_exp;
if (ccb->ataio.ata_flags & ATA_FLAG_ICC)
fis[14] = ccb->ataio.icc;
fis[15] = ATA_A_4BIT;
if (ccb->ataio.ata_flags & ATA_FLAG_AUX) {
fis[16] = ccb->ataio.aux & 0xff;
fis[17] = (ccb->ataio.aux >> 8) & 0xff;
fis[18] = (ccb->ataio.aux >> 16) & 0xff;
fis[19] = (ccb->ataio.aux >> 24) & 0xff;
}
} else {
/* Soft reset. */
}
return (20);
}
static int
siis_sata_connect(struct siis_channel *ch)
{
u_int32_t status;
int timeout, found = 0;
/* Wait up to 100ms for "connect well" */
for (timeout = 0; timeout < 1000 ; timeout++) {
status = ATA_INL(ch->r_mem, SIIS_P_SSTS);
if ((status & ATA_SS_DET_MASK) != ATA_SS_DET_NO_DEVICE)
found = 1;
if (((status & ATA_SS_DET_MASK) == ATA_SS_DET_PHY_ONLINE) &&
((status & ATA_SS_SPD_MASK) != ATA_SS_SPD_NO_SPEED) &&
((status & ATA_SS_IPM_MASK) == ATA_SS_IPM_ACTIVE))
break;
if ((status & ATA_SS_DET_MASK) == ATA_SS_DET_PHY_OFFLINE) {
if (bootverbose) {
device_printf(ch->dev, "SATA offline status=%08x\n",
status);
}
return (0);
}
if (found == 0 && timeout >= 100)
break;
DELAY(100);
}
if (timeout >= 1000 || !found) {
if (bootverbose) {
device_printf(ch->dev,
"SATA connect timeout time=%dus status=%08x\n",
timeout * 100, status);
}
return (0);
}
if (bootverbose) {
device_printf(ch->dev, "SATA connect time=%dus status=%08x\n",
timeout * 100, status);
}
/* Clear SATA error register */
ATA_OUTL(ch->r_mem, SIIS_P_SERR, 0xffffffff);
return (1);
}
static int
siis_check_ids(device_t dev, union ccb *ccb)
{
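/*
 * Targets 0-14 address port multiplier ports, while target 15 is used for
 * the port multiplier itself (cf. the pm_present handling in siisaction()
 * below); SATA has no logical units, so only LUN 0 is accepted.
 */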
if (ccb->ccb_h.target_id > 15) {
ccb->ccb_h.status = CAM_TID_INVALID;
xpt_done(ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
xpt_done(ccb);
return (-1);
}
return (0);
}
static void
siisaction(struct cam_sim *sim, union ccb *ccb)
{
device_t dev, parent;
struct siis_channel *ch;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("siisaction func_code=%x\n",
ccb->ccb_h.func_code));
ch = (struct siis_channel *)cam_sim_softc(sim);
dev = ch->dev;
mtx_assert(&ch->mtx, MA_OWNED);
switch (ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_ATA_IO: /* Execute the requested I/O operation */
case XPT_SCSI_IO:
if (siis_check_ids(dev, ccb))
return;
if (ch->devices == 0 ||
(ch->pm_present == 0 &&
ccb->ccb_h.target_id > 0 && ccb->ccb_h.target_id < 15)) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
ccb->ccb_h.recovery_type = RECOVERY_NONE;
/* Check for command collision. */
if (siis_check_collision(dev, ccb)) {
/* Freeze command. */
ch->frozen = ccb;
/* We have only one frozen slot, so freeze simq also. */
xpt_freeze_simq(ch->sim, 1);
return;
}
siis_begin_transaction(dev, ccb);
return;
case XPT_ABORT: /* Abort the specified CCB */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_SET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct siis_device *d;
if (siis_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
d->revision = cts->xport_specific.sata.revision;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE)
d->mode = cts->xport_specific.sata.mode;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT)
d->bytecount = min(8192, cts->xport_specific.sata.bytecount);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_TAGS)
d->tags = min(SIIS_MAX_SLOTS, cts->xport_specific.sata.tags);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_PM) {
ch->pm_present = cts->xport_specific.sata.pm_present;
if (ch->pm_present)
ATA_OUTL(ch->r_mem, SIIS_P_CTLSET, SIIS_P_CTL_PME);
else
ATA_OUTL(ch->r_mem, SIIS_P_CTLCLR, SIIS_P_CTL_PME);
}
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_TAGS)
d->atapi = cts->xport_specific.sata.atapi;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
d->caps = cts->xport_specific.sata.caps;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
/* Get default/user set transfer settings for the target */
{
struct ccb_trans_settings *cts = &ccb->cts;
struct siis_device *d;
uint32_t status;
if (siis_check_ids(dev, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
cts->protocol = PROTO_UNSPECIFIED;
cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
cts->transport = XPORT_SATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->proto_specific.valid = 0;
cts->xport_specific.sata.valid = 0;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS &&
(ccb->ccb_h.target_id == 15 ||
(ccb->ccb_h.target_id == 0 && !ch->pm_present))) {
status = ATA_INL(ch->r_mem, SIIS_P_SSTS) & ATA_SS_SPD_MASK;
if (status & 0x0f0) {
cts->xport_specific.sata.revision =
(status & 0x0f0) >> 4;
cts->xport_specific.sata.valid |=
CTS_SATA_VALID_REVISION;
}
cts->xport_specific.sata.caps = d->caps & CTS_SATA_CAPS_D;
if (ch->pm_level)
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_PMREQ;
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
} else {
cts->xport_specific.sata.revision = d->revision;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
cts->xport_specific.sata.caps = d->caps;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS &&
(ch->quirks & SIIS_Q_SNTF) == 0)
cts->xport_specific.sata.caps &= ~CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
}
cts->xport_specific.sata.mode = d->mode;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
cts->xport_specific.sata.bytecount = d->bytecount;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
cts->xport_specific.sata.pm_present = ch->pm_present;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_PM;
cts->xport_specific.sata.tags = d->tags;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_TAGS;
cts->xport_specific.sata.atapi = d->atapi;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified SCSI device */
siis_reset(dev);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_TERM_IO: /* Terminate the I/O process */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
parent = device_get_parent(dev);
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE | PI_TAG_ABLE;
cpi->hba_inquiry |= PI_SATAPM;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED | PIM_ATA_EXT;
cpi->hba_eng_cnt = 0;
cpi->max_target = 15;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "SIIS", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->transport = XPORT_SATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
cpi->hba_vendor = pci_get_vendor(parent);
cpi->hba_device = pci_get_device(parent);
cpi->hba_subvendor = pci_get_subvendor(parent);
cpi->hba_subdevice = pci_get_subdevice(parent);
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
}
static void
siispoll(struct cam_sim *sim)
{
struct siis_channel *ch = (struct siis_channel *)cam_sim_softc(sim);
siis_ch_intr(ch->dev);
}
diff --git a/sys/dev/siis/siis.h b/sys/dev/siis/siis.h
index 383b0e0b98ac..dac43cf2dfd4 100644
--- a/sys/dev/siis/siis.h
+++ b/sys/dev/siis/siis.h
@@ -1,463 +1,458 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
/* ATA register defines */
#define ATA_DATA 0 /* (RW) data */
#define ATA_FEATURE 1 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_COUNT 2 /* (W) sector count */
#define ATA_SECTOR 3 /* (RW) sector # */
#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 7 /* (W) command */
#define ATA_ERROR 8 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_IREASON 9 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_STATUS 10 /* (R) status */
#define ATA_ALTSTAT 11 /* (R) alternate status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 12 /* (W) control */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#define ATA_A_4BIT 0x08 /* 4 head bits */
#define ATA_A_HOB 0x80 /* High Order Byte enable */
/* SATA register defines */
#define ATA_SSTATUS 13
#define ATA_SS_DET_MASK 0x0000000f
#define ATA_SS_DET_NO_DEVICE 0x00000000
#define ATA_SS_DET_DEV_PRESENT 0x00000001
#define ATA_SS_DET_PHY_ONLINE 0x00000003
#define ATA_SS_DET_PHY_OFFLINE 0x00000004
#define ATA_SS_SPD_MASK 0x000000f0
#define ATA_SS_SPD_NO_SPEED 0x00000000
#define ATA_SS_SPD_GEN1 0x00000010
#define ATA_SS_SPD_GEN2 0x00000020
#define ATA_SS_SPD_GEN3 0x00000030
#define ATA_SS_IPM_MASK 0x00000f00
#define ATA_SS_IPM_NO_DEVICE 0x00000000
#define ATA_SS_IPM_ACTIVE 0x00000100
#define ATA_SS_IPM_PARTIAL 0x00000200
#define ATA_SS_IPM_SLUMBER 0x00000600
#define ATA_SERROR 14
#define ATA_SE_DATA_CORRECTED 0x00000001
#define ATA_SE_COMM_CORRECTED 0x00000002
#define ATA_SE_DATA_ERR 0x00000100
#define ATA_SE_COMM_ERR 0x00000200
#define ATA_SE_PROT_ERR 0x00000400
#define ATA_SE_HOST_ERR 0x00000800
#define ATA_SE_PHY_CHANGED 0x00010000
#define ATA_SE_PHY_IERROR 0x00020000
#define ATA_SE_COMM_WAKE 0x00040000
#define ATA_SE_DECODE_ERR 0x00080000
#define ATA_SE_PARITY_ERR 0x00100000
#define ATA_SE_CRC_ERR 0x00200000
#define ATA_SE_HANDSHAKE_ERR 0x00400000
#define ATA_SE_LINKSEQ_ERR 0x00800000
#define ATA_SE_TRANSPORT_ERR 0x01000000
#define ATA_SE_UNKNOWN_FIS 0x02000000
#define ATA_SCONTROL 15
#define ATA_SC_DET_MASK 0x0000000f
#define ATA_SC_DET_IDLE 0x00000000
#define ATA_SC_DET_RESET 0x00000001
#define ATA_SC_DET_DISABLE 0x00000004
#define ATA_SC_SPD_MASK 0x000000f0
#define ATA_SC_SPD_NO_SPEED 0x00000000
#define ATA_SC_SPD_SPEED_GEN1 0x00000010
#define ATA_SC_SPD_SPEED_GEN2 0x00000020
#define ATA_SC_SPD_SPEED_GEN3 0x00000030
#define ATA_SC_IPM_MASK 0x00000f00
#define ATA_SC_IPM_NONE 0x00000000
#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
#define ATA_SACTIVE 16
/*
* Global registers
*/
#define SIIS_GCTL 0x0040 /* Global Control */
#define SIIS_GCTL_GRESET 0x80000000 /* Global Reset */
#define SIIS_GCTL_MSIACK 0x40000000 /* MSI Ack */
#define SIIS_GCTL_I2C_IE 0x20000000 /* I2C int enable */
#define SIIS_GCTL_300CAP 0x01000000 /* 3Gb/s capable (R) */
#define SIIS_GCTL_PIE(n) (1 << (n)) /* Port int enable */
#define SIIS_IS 0x0044 /* Interrupt Status */
#define SIIS_IS_I2C 0x20000000 /* I2C Int Status */
#define SIIS_IS_PORT(n) (1 << (n)) /* Port interrupt stat */
#define SIIS_PHYCONF 0x0048 /* PHY Configuration */
#define SIIS_BIST_CTL 0x0050
#define SIIS_BIST_PATTERN 0x0054 /* 32 bit pattern */
#define SIIS_BIST_STATUS 0x0058
#define SIIS_I2C_CTL 0x0060
#define SIIS_I2C_STS 0x0064
#define SIIS_I2C_SADDR 0x0068
#define SIIS_I2C_DATA 0x006C
#define SIIS_FLASH_ADDR 0x0070
#define SIIS_GPIO 0x0074
/*
* Port registers
*/
#define SIIS_P_LRAM 0x0000
#define SIIS_P_LRAM_SLOT(i) (SIIS_P_LRAM + i * 128)
#define SIIS_P_PMPSTS(i) (0x0F80 + i * 8)
#define SIIS_P_PMPQACT(i) (0x0F80 + i * 8 + 4)
#define SIIS_P_STS 0x1000
#define SIIS_P_CTLSET 0x1000
#define SIIS_P_CTLCLR 0x1004
#define SIIS_P_CTL_READY 0x80000000
#define SIIS_P_CTL_OOBB 0x02000000
#define SIIS_P_CTL_ACT 0x001F0000
#define SIIS_P_CTL_ACT_SHIFT 16
#define SIIS_P_CTL_LED_ON 0x00008000
#define SIIS_P_CTL_AIA 0x00004000
#define SIIS_P_CTL_PME 0x00002000
#define SIIS_P_CTL_IA 0x00001000
#define SIIS_P_CTL_IR 0x00000800
#define SIIS_P_CTL_32BIT 0x00000400
#define SIIS_P_CTL_SCR_DIS 0x00000200
#define SIIS_P_CTL_CONT_DIS 0x00000100
#define SIIS_P_CTL_TBIST 0x00000080
#define SIIS_P_CTL_RESUME 0x00000040
#define SIIS_P_CTL_PLENGTH 0x00000020
#define SIIS_P_CTL_LED_DIS 0x00000010
#define SIIS_P_CTL_INT_NCOR 0x00000008
#define SIIS_P_CTL_PORT_INIT 0x00000004
#define SIIS_P_CTL_DEV_RESET 0x00000002
#define SIIS_P_CTL_PORT_RESET 0x00000001
#define SIIS_P_IS 0x1008
#define SIIS_P_IX_SDBN 0x00000800
#define SIIS_P_IX_HS_ET 0x00000400
#define SIIS_P_IX_CRC_ET 0x00000200
#define SIIS_P_IX_8_10_ET 0x00000100
#define SIIS_P_IX_DEX 0x00000080
#define SIIS_P_IX_UNRECFIS 0x00000040
#define SIIS_P_IX_COMWAKE 0x00000020
#define SIIS_P_IX_PHYRDYCHG 0x00000010
#define SIIS_P_IX_PMCHG 0x00000008
#define SIIS_P_IX_READY 0x00000004
#define SIIS_P_IX_COMMERR 0x00000002
#define SIIS_P_IX_COMMCOMP 0x00000001
#define SIIS_P_IX_ENABLED SIIS_P_IX_COMMCOMP | SIIS_P_IX_COMMERR | \
SIIS_P_IX_PHYRDYCHG | SIIS_P_IX_SDBN
#define SIIS_P_IESET 0x1010
#define SIIS_P_IECLR 0x1014
#define SIIS_P_CACTU 0x101C
#define SIIS_P_CMDEFIFO 0x1020
#define SIIS_P_CMDERR 0x1024
#define SIIS_P_CMDERR_DEV 1
#define SIIS_P_CMDERR_SDB 2
#define SIIS_P_CMDERR_DATAFIS 3
#define SIIS_P_CMDERR_SENDFIS 4
#define SIIS_P_CMDERR_INCSTATE 5
#define SIIS_P_CMDERR_DIRECTION 6
#define SIIS_P_CMDERR_UNDERRUN 7
#define SIIS_P_CMDERR_OVERRUN 8
#define SIIS_P_CMDERR_LLOVERRUN 9
#define SIIS_P_CMDERR_PPE 11
#define SIIS_P_CMDERR_SGTALIGN 16
#define SIIS_P_CMDERR_PCITASGT 17
#define SIIS_P_CMDERR_OCIMASGT 18
#define SIIS_P_CMDERR_PCIPESGT 19
#define SIIS_P_CMDERR_PRBALIGN 24
#define SIIS_P_CMDERR_PCITAPRB 25
#define SIIS_P_CMDERR_PCIMAPRB 26
#define SIIS_P_CMDERR_PCIPEPRB 27
#define SIIS_P_CMDERR_PCITADATA 33
#define SIIS_P_CMDERR_PCIMADATA 34
#define SIIS_P_CMDERR_PCIPEDATA 35
#define SIIS_P_CMDERR_SERVICE 36
#define SIIS_P_FISCFG 0x1028
#define SIIS_P_PCIEFIFOTH 0x102C
#define SIIS_P_8_10_DEC_ERR 0x1040
#define SIIS_P_CRC_ERR 0x1044
#define SIIS_P_HS_ERR 0x1048
#define SIIS_P_PHYCFG 0x1050
#define SIIS_P_SS 0x1800
#define SIIS_P_SS_ATTN 0x80000000
#define SIIS_P_CACTL(i) (0x1C00 + i * 8)
#define SIIS_P_CACTH(i) (0x1C00 + i * 8 + 4)
#define SIIS_P_CTX 0x1E04
#define SIIS_P_CTX_SLOT 0x0000001F
#define SIIS_P_CTX_SLOT_SHIFT 0
#define SIIS_P_CTX_PMP 0x000001E0
#define SIIS_P_CTX_PMP_SHIFT 5
#define SIIS_P_SCTL 0x1F00
#define SIIS_P_SSTS 0x1F04
#define SIIS_P_SERR 0x1F08
#define SIIS_P_SACT 0x1F0C
#define SIIS_P_SNTF 0x1F10
#define SIIS_MAX_PORTS 4
#define SIIS_MAX_SLOTS 31
#define SIIS_OFFSET 0x100
#define SIIS_STEP 0x80
-/* Just to be sure, if building as module. */
-#if MAXPHYS < 512 * 1024
-#undef MAXPHYS
-#define MAXPHYS 512 * 1024
-#endif
/* Pessimistic prognosis on number of required S/G entries */
-#define SIIS_SG_ENTRIES (roundup(btoc(MAXPHYS), 4) + 1)
-/* Command tables. Up to 32 commands, Each, 128byte aligned. */
-#define SIIS_CT_OFFSET 0
-#define SIIS_CT_SIZE (32 + 16 + SIIS_SG_ENTRIES * 16)
+#define SIIS_SG_ENTRIES (roundup(btoc(maxphys), 4) + 1)
+/* Port Request Block + S/G entries. 128byte aligned. */
+#define SIIS_PRB_SIZE (32 + 16 + SIIS_SG_ENTRIES * 16)
/* Total main work area. */
-#define SIIS_WORK_SIZE (SIIS_CT_OFFSET + SIIS_CT_SIZE * SIIS_MAX_SLOTS)
+#define SIIS_WORK_SIZE (SIIS_PRB_SIZE * SIIS_MAX_SLOTS)
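/*
 * Illustrative sizing, not part of the driver: assuming maxphys = 1 MiB and
 * 4 KiB pages, btoc(maxphys) = 256, so SIIS_SG_ENTRIES = 257,
 * SIIS_PRB_SIZE = 32 + 16 + 257 * 16 = 4160 bytes and
 * SIIS_WORK_SIZE = 4160 * 31 = 128960 bytes of per-channel work space.
 */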
struct siis_dma_prd {
u_int64_t dba;
u_int32_t dbc;
u_int32_t control;
#define SIIS_PRD_TRM 0x80000000
#define SIIS_PRD_LNK 0x40000000
#define SIIS_PRD_DRD 0x20000000
#define SIIS_PRD_XCF 0x10000000
} __packed;
struct siis_cmd_ata {
- struct siis_dma_prd prd[1 + SIIS_SG_ENTRIES];
+ struct siis_dma_prd prd[2];
} __packed;
struct siis_cmd_atapi {
u_int8_t ccb[16];
- struct siis_dma_prd prd[SIIS_SG_ENTRIES];
+ struct siis_dma_prd prd[1];
} __packed;
struct siis_cmd {
u_int16_t control;
#define SIIS_PRB_PROTOCOL_OVERRIDE 0x0001
#define SIIS_PRB_RETRANSMIT 0x0002
#define SIIS_PRB_EXTERNAL_COMMAND 0x0004
#define SIIS_PRB_RECEIVE 0x0008
#define SIIS_PRB_PACKET_READ 0x0010
#define SIIS_PRB_PACKET_WRITE 0x0020
#define SIIS_PRB_INTERRUPT_MASK 0x0040
#define SIIS_PRB_SOFT_RESET 0x0080
u_int16_t protocol_override;
#define SIIS_PRB_PROTO_PACKET 0x0001
#define SIIS_PRB_PROTO_TCQ 0x0002
#define SIIS_PRB_PROTO_NCQ 0x0004
#define SIIS_PRB_PROTO_READ 0x0008
#define SIIS_PRB_PROTO_WRITE 0x0010
#define SIIS_PRB_PROTO_TRANSPARENT 0x0020
u_int32_t transfer_count;
u_int8_t fis[24];
union {
struct siis_cmd_ata ata;
struct siis_cmd_atapi atapi;
} u;
} __packed;
/* misc defines */
#define ATA_IRQ_RID 0
#define ATA_INTR_FLAGS (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
struct ata_dmaslot {
bus_dmamap_t data_map; /* data DMA map */
int nsegs; /* Number of segs loaded */
};
/* structure holding DMA related information */
struct ata_dma {
bus_dma_tag_t work_tag; /* workspace DMA tag */
bus_dmamap_t work_map; /* workspace DMA map */
uint8_t *work; /* workspace */
bus_addr_t work_bus; /* bus address of work */
bus_dma_tag_t data_tag; /* data DMA tag */
};
enum siis_slot_states {
SIIS_SLOT_EMPTY,
SIIS_SLOT_LOADING,
SIIS_SLOT_RUNNING,
SIIS_SLOT_WAITING
};
struct siis_slot {
device_t dev; /* Device handle */
u_int8_t slot; /* Number of this slot */
enum siis_slot_states state; /* Slot state */
+ u_int prb_offset; /* PRB offset */
union ccb *ccb; /* CCB occupying slot */
struct ata_dmaslot dma; /* DMA data of this slot */
struct callout timeout; /* Execution timeout */
};
struct siis_device {
int revision;
int mode;
u_int bytecount;
u_int atapi;
u_int tags;
u_int caps;
};
/* structure describing an ATA channel */
struct siis_channel {
device_t dev; /* Device handle */
int unit; /* Physical channel */
struct resource *r_mem; /* Memory of this channel */
struct resource *r_irq; /* Interrupt of this channel */
void *ih; /* Interrupt handle */
struct ata_dma dma; /* DMA data */
struct cam_sim *sim;
struct cam_path *path;
struct cdev *led; /* Activity LED led(4) cdev. */
int quirks;
int pm_level; /* power management level */
struct siis_slot slot[SIIS_MAX_SLOTS];
union ccb *hold[SIIS_MAX_SLOTS];
struct mtx mtx; /* state lock */
int devices; /* What is present */
int pm_present; /* PM presence reported */
uint32_t oslots; /* Occupied slots */
uint32_t rslots; /* Running slots */
uint32_t aslots; /* Slots with atomic commands */
uint32_t eslots; /* Slots in error */
uint32_t toslots; /* Slots in timeout */
int numrslots; /* Number of running slots */
int numtslots[SIIS_MAX_SLOTS]; /* Number of tagged slots */
int numhslots; /* Number of held slots */
int recoverycmd; /* Our READ LOG active */
int fatalerr; /* Fatal error happened */
int recovery; /* Some slots are in error */
union ccb *frozen; /* Frozen command */
struct siis_device user[16]; /* User-specified settings */
struct siis_device curr[16]; /* Current settings */
};
/* structure describing a SIIS controller */
struct siis_controller {
device_t dev;
int r_grid;
struct resource *r_gmem;
int r_rid;
struct resource *r_mem;
struct rman sc_iomem;
struct siis_controller_irq {
struct resource *r_irq;
void *handle;
int r_irq_rid;
} irq;
int quirks;
int channels;
uint32_t gctl;
struct {
void (*function)(void *);
void *argument;
} interrupt[SIIS_MAX_PORTS];
};
enum siis_err_type {
SIIS_ERR_NONE, /* No error */
SIIS_ERR_INVALID, /* Error detected by us before submitting. */
SIIS_ERR_INNOCENT, /* Innocent victim. */
SIIS_ERR_TFE, /* Task File Error. */
SIIS_ERR_SATA, /* SATA error. */
SIIS_ERR_TIMEOUT, /* Command execution timeout. */
SIIS_ERR_NCQ, /* NCQ command error. CCB should be put on hold
* until READ LOG executed to reveal error. */
};
/* macros to hide busspace uglyness */
#define ATA_INB(res, offset) \
bus_read_1((res), (offset))
#define ATA_INW(res, offset) \
bus_read_2((res), (offset))
#define ATA_INL(res, offset) \
bus_read_4((res), (offset))
#define ATA_INSW(res, offset, addr, count) \
bus_read_multi_2((res), (offset), (addr), (count))
#define ATA_INSW_STRM(res, offset, addr, count) \
bus_read_multi_stream_2((res), (offset), (addr), (count))
#define ATA_INSL(res, offset, addr, count) \
bus_read_multi_4((res), (offset), (addr), (count))
#define ATA_INSL_STRM(res, offset, addr, count) \
bus_read_multi_stream_4((res), (offset), (addr), (count))
#define ATA_OUTB(res, offset, value) \
bus_write_1((res), (offset), (value))
#define ATA_OUTW(res, offset, value) \
bus_write_2((res), (offset), (value))
#define ATA_OUTL(res, offset, value) \
bus_write_4((res), (offset), (value))
#define ATA_OUTSW(res, offset, addr, count) \
bus_write_multi_2((res), (offset), (addr), (count))
#define ATA_OUTSW_STRM(res, offset, addr, count) \
bus_write_multi_stream_2((res), (offset), (addr), (count))
#define ATA_OUTSL(res, offset, addr, count) \
bus_write_multi_4((res), (offset), (addr), (count))
#define ATA_OUTSL_STRM(res, offset, addr, count) \
bus_write_multi_stream_4((res), (offset), (addr), (count))
diff --git a/sys/dev/sym/sym_conf.h b/sys/dev/sym/sym_conf.h
index 135b0e85992f..9fd113cc1fc3 100644
--- a/sys/dev/sym/sym_conf.h
+++ b/sys/dev/sym/sym_conf.h
@@ -1,303 +1,303 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Device driver optimized for the Symbios/LSI 53C896/53C895A/53C1010
* PCI-SCSI controllers.
*
* Copyright (C) 1999-2001 Gerard Roudier <groudier@free.fr>
*
* This driver also supports the following Symbios/LSI PCI-SCSI chips:
* 53C810A, 53C825A, 53C860, 53C875, 53C876, 53C885, 53C895,
* 53C810, 53C815, 53C825 and the 53C1510D in 53C8XX mode.
*
*
* This driver for FreeBSD-CAM is derived from the Linux sym53c8xx driver.
* Copyright (C) 1998-1999 Gerard Roudier
*
* The sym53c8xx driver is derived from the ncr53c8xx driver that had been
* a port of the FreeBSD ncr driver to Linux-1.2.13.
*
* The original ncr driver has been written for 386bsd and FreeBSD by
* Wolfgang Stanglmeier <wolf@cologne.de>
* Stefan Esser <se@mi.Uni-Koeln.de>
* Copyright (C) 1994 Wolfgang Stanglmeier
*
* The initialisation code, and part of the code that addresses
* FreeBSD-CAM services is based on the aic7xxx driver for FreeBSD-CAM
* written by Justin T. Gibbs.
*
* Other major contributions:
*
* NVRAM detection and reading.
* Copyright (C) 1997 Richard Waltham <dormouse@farsrobt.demon.co.uk>
*
*-----------------------------------------------------------------------------
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#ifndef SYM_CONF_H
#define SYM_CONF_H
/*-------------------------------------------------------------------
* Static configuration.
*-------------------------------------------------------------------
*/
/*
* Also support early NCR 810, 815 and 825 chips.
*/
#define SYM_CONF_GENERIC_SUPPORT
/*
* Use Normal IO instead of MMIO.
*/
/* #define SYM_CONF_IOMAPPED */
/*
* Max tags for a device (logical unit)
* We use a power of 2: (7) means 1<<7 = 128
* Maximum is 8 -> 256 tags
*/
#define SYM_CONF_MAX_TAG_ORDER (6)
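/*
 * For illustration: with the default order of (6) above, SYM_SETUP_MAX_TAG
 * below evaluates to 1<<6 = 64 tags per logical unit.
 */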
/*
* DMA boundary
* We need to ensure that 16 MB boundaries are not crossed during DMA of
* each segment, because some chips are flawed.
*/
#define SYM_CONF_DMA_BOUNDARY (1UL << 24)
/*
* Max number of scatter/gather entries for an I/O.
* Each entry costs 8 bytes in the internal CCB data structure.
* We use at most 33 segments but also no more than required for handling
- * MAXPHYS.
+ * legacy MAXPHYS == 128 * 1024.
*/
-#define SYM_CONF_MAX_SG (MIN(33, (MAXPHYS / PAGE_SIZE) + 1))
+#define SYM_CONF_MAX_SG (MIN(33, (128 * 1024 / PAGE_SIZE) + 1))
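/*
 * For illustration: with 4 KiB pages this is MIN(33, 128 * 1024 / 4096 + 1)
 * = MIN(33, 33) = 33 segments, i.e. 33 * 8 = 264 bytes per CCB.
 */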
/*
* Max number of targets.
* Maximum is 16 and you are advised not to change this value.
*/
#define SYM_CONF_MAX_TARGET (16)
/*
* Max number of logical units.
* SPI-2 allows up to 64 logical units, but in real life, targets
* that implement more than 7 logical units are pretty rare.
* Anyway, the cost of accepting up to 64 logical units is low in
* this driver, thus going with the maximum is acceptable.
*/
#define SYM_CONF_MAX_LUN (64)
/*
* Max number of IO control blocks queued to the controller.
* Each entry needs 8 bytes and the queues are allocated contiguously.
* Since we do not want to allocate more than a page, the theoretical
* maximum is PAGE_SIZE/8. For safety, we announce a bit less to the
* access method. :)
* When not supplied, as is suggested, the driver computes some
* good value for this parameter.
*/
/* #define SYM_CONF_MAX_START (PAGE_SIZE/8 - 16) */
/*
* Support for NVRAM.
*/
#define SYM_CONF_NVRAM_SUPPORT
/* #define SYM_CONF_NVRAM_SUPPORT */
/*
* Support for Immediate Arbitration.
* Not advised.
*/
/* #define SYM_CONF_IARB_SUPPORT */
/*-------------------------------------------------------------------
* Configuration that could be dynamic if it was possible
* to pass arguments to the driver.
*-------------------------------------------------------------------
*/
/*
* HOST default scsi id.
*/
#define SYM_SETUP_HOST_ID 7
/*
* Max synchronous transfers.
*/
#define SYM_SETUP_MIN_SYNC (9)
/*
* Max wide order.
*/
#define SYM_SETUP_MAX_WIDE (1)
/*
* Max SCSI offset.
*/
#define SYM_SETUP_MAX_OFFS (63)
/*
* Default number of tags.
*/
#define SYM_SETUP_MAX_TAG (1<<SYM_CONF_MAX_TAG_ORDER)
/*
* SYMBIOS NVRAM format support.
*/
#define SYM_SETUP_SYMBIOS_NVRAM (1)
/*
* TEKRAM NVRAM format support.
*/
#define SYM_SETUP_TEKRAM_NVRAM (1)
/*
* PCI parity checking.
* It should not be an option, but some poor or broken
* PCI-HOST bridges have been reported to cause problems
* when this feature is enabled.
* Setting this option to 0 tells the driver not to
* enable the checking against PCI parity.
*/
#ifndef SYM_SETUP_PCI_PARITY
#define SYM_SETUP_PCI_PARITY (1)
#endif
/*
* SCSI parity checking.
*/
#define SYM_SETUP_SCSI_PARITY (1)
/*
* SCSI activity LED.
*/
#define SYM_SETUP_SCSI_LED (0)
/*
* SCSI High Voltage Differential support.
*
* HVD/LVD/SE capable controllers (895, 895A, 896, 1010)
* report the actual SCSI BUS mode from the STEST4 IO
* register.
*
* But for HVD/SE only capable chips (825a, 875, 885),
* the driver uses some heuristic to probe against HVD.
* Normally, the chip senses the DIFFSENS signal and
* should switch its BUS transceivers to high impedance
* in the situation where the driver has been wrong about
* the actual BUS mode. Maybe the BUS mode probing of
* the driver is safe, but, given that it may be partially
* based on some previous IO register settings, this
* cannot be stated with certainty. Thus, the decision has been
* taken to require a user option to be set for the DIFF probing
* to be applied to the 825a, 875 and 885 chips.
*
* This setup option works as follows:
*
* 0 -> HVD only supported for 895, 895A, 896, 1010.
* 1 -> HVD probed for 825A, 875, 885.
* 2 -> HVD assumed for 825A, 875, 885 (not advised).
*/
#ifndef SYM_SETUP_SCSI_DIFF
#define SYM_SETUP_SCSI_DIFF (0)
#endif
/*
* IRQ mode.
*/
#define SYM_SETUP_IRQ_MODE (0)
/*
* Check SCSI BUS signal on reset.
*/
#define SYM_SETUP_SCSI_BUS_CHECK (1)
/*
* Max burst for PCI (1<<value)
* 7 means: (1<<7) = 128 DWORDS.
*/
#define SYM_SETUP_BURST_ORDER (7)
/*
* Only relevant if IARB support configured.
* - Max number of successive settings of IARB hints.
* - Set IARB on arbitration lost.
*/
#define SYM_CONF_IARB_MAX 3
#define SYM_CONF_SET_IARB_ON_ARB_LOST 1
/*
* Returning wrong residuals may cause problems.
* When zero, this define tells the driver to
* always return 0 as the transfer residual.
* Btw, all my tests of residuals have succeeded.
*/
#define SYM_CONF_RESIDUAL_SUPPORT 1
/*
* Supported maximum number of LUNs to announce to
* the access method.
* The driver supports up to 64 LUNs per target as
* required by SPI-2/SPI-3. However, some SCSI devices
* designed prior to these specifications, or not fully
* conformant, may be highly confused when they are
* asked about a LUN > 7.
*/
#ifndef SYM_SETUP_MAX_LUN
#define SYM_SETUP_MAX_LUN (8)
#endif
/*
* Low priority probe map.
*
* This option is used as a bitmap to tell the driver
* about chips that are to be claimed with a low priority
* (-2000) by the probe method. This allows any other driver
* that may return some higher priority value for the same
* chips to take precedence over this driver (sym).
* This option may be used when both the ncr driver and this
* driver are configured.
*
* Bits are to be coded as follows:
* 0x01 -> 810a, 860
* 0x02 -> 825a, 875, 885, 895
* 0x04 -> 895a, 896, 1510d
* 0x08 -> 1010
* 0x40 -> 810, 815, 825
*
* For example, value 5 tells the driver to claim support
* for 810a, 860, 895a, 896 and 1510d with low priority,
* allowing the ncr driver to take precedence if configured.
*/
#ifndef SYM_SETUP_LP_PROBE_MAP
#define SYM_SETUP_LP_PROBE_MAP 0
#endif
#endif /* SYM_CONF_H */
diff --git a/sys/dev/usb/storage/umass.c b/sys/dev/usb/storage/umass.c
index cfec3fc91fc1..ee182d112e42 100644
--- a/sys/dev/usb/storage/umass.c
+++ b/sys/dev/usb/storage/umass.c
@@ -1,3015 +1,3015 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1999 MAEKAWA Masahide <bishop@rr.iij4u.or.jp>,
* Nick Hibma <n_hibma@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
* $NetBSD: umass.c,v 1.28 2000/04/02 23:46:53 augustss Exp $
*/
/* Also already merged from NetBSD:
* $NetBSD: umass.c,v 1.67 2001/11/25 19:05:22 augustss Exp $
* $NetBSD: umass.c,v 1.90 2002/11/04 19:17:33 pooka Exp $
* $NetBSD: umass.c,v 1.108 2003/11/07 17:03:25 wiz Exp $
* $NetBSD: umass.c,v 1.109 2003/12/04 13:57:31 keihan Exp $
*/
/*
* Universal Serial Bus Mass Storage Class specs:
* http://www.usb.org/developers/devclass_docs/usb_msc_overview_1.2.pdf
* http://www.usb.org/developers/devclass_docs/usbmassbulk_10.pdf
* http://www.usb.org/developers/devclass_docs/usb_msc_cbi_1.1.pdf
* http://www.usb.org/developers/devclass_docs/usbmass-ufi10.pdf
*/
/*
* Ported to NetBSD by Lennart Augustsson <augustss@NetBSD.org>.
* Parts of the code written by Jason R. Thorpe <thorpej@shagadelic.org>.
*/
/*
* The driver handles 3 Wire Protocols
* - Command/Bulk/Interrupt (CBI)
* - Command/Bulk/Interrupt with Command Completion Interrupt (CBI with CCI)
* - Mass Storage Bulk-Only (BBB)
* (BBB refers to Bulk/Bulk/Bulk for the Command/Data/Status phases)
*
* Over these wire protocols it handles the following command protocols
* - SCSI
* - UFI (floppy command set)
* - 8070i (ATAPI)
*
* UFI and 8070i (ATAPI) are transformed versions of the SCSI command set. The
* sc->sc_transform method is used to convert the commands into the appropriate
* format (if at all necessary). For example, UFI requires all commands to be
* 12 bytes in length amongst other things.
*
* The source code below is marked and can be split into a number of pieces
* (in this order):
*
* - probe/attach/detach
* - generic transfer routines
* - BBB
* - CBI
* - CBI_I (in addition to functions from CBI)
* - CAM (Common Access Method)
* - SCSI
* - UFI
* - 8070i (ATAPI)
*
* The protocols are implemented using a state machine, for the transfers as
* well as for the resets. The state machine is contained in umass_t_*_callback.
* The state machine is started through either umass_command_start() or
* umass_reset().
*
* The reason for doing this is a) CAM performs a lot better this way and b) it
* avoids using tsleep from interrupt context (for example after a failed
* transfer).
*/
/*
* The SCSI related part of this driver has been derived from the
* dev/ppbus/vpo.c driver, by Nicolas Souchu (nsouch@FreeBSD.org).
*
* The CAM layer uses so-called actions, which are messages sent to the host
* adapter for completion. The actions come in through umass_cam_action. The
* appropriate block of routines is called depending on the transport protocol
* in use. When the transfer has finished, these routines call
* umass_cam_cb again to complete the CAM command.
*/
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#include "usbdevs.h"
#include <dev/usb/quirk/usb_quirk.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_periph.h>
#ifdef USB_DEBUG
#define DIF(m, x) \
do { \
if (umass_debug & (m)) { x ; } \
} while (0)
#define DPRINTF(sc, m, fmt, ...) \
do { \
if (umass_debug & (m)) { \
printf("%s:%s: " fmt, \
(sc) ? (const char *)(sc)->sc_name : \
(const char *)"umassX", \
__FUNCTION__ ,## __VA_ARGS__); \
} \
} while (0)
#define UDMASS_GEN 0x00010000 /* general */
#define UDMASS_SCSI 0x00020000 /* scsi */
#define UDMASS_UFI 0x00040000 /* ufi command set */
#define UDMASS_ATAPI 0x00080000 /* 8070i command set */
#define UDMASS_CMD (UDMASS_SCSI|UDMASS_UFI|UDMASS_ATAPI)
#define UDMASS_USB 0x00100000 /* USB general */
#define UDMASS_BBB 0x00200000 /* Bulk-Only transfers */
#define UDMASS_CBI 0x00400000 /* CBI transfers */
#define UDMASS_WIRE (UDMASS_BBB|UDMASS_CBI)
#define UDMASS_ALL 0xffff0000 /* all of the above */
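/*
 * Illustrative use of the macros above (hypothetical call site):
 *
 *	DPRINTF(sc, UDMASS_BBB, "residue %u bytes\n", residue);
 *
 * which prints only when the UDMASS_BBB bit is set in the umass debug
 * sysctl (hw.usb.umass.debug) declared below.
 */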
static int umass_debug;
static int umass_throttle;
static SYSCTL_NODE(_hw_usb, OID_AUTO, umass, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"USB umass");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, debug, CTLFLAG_RWTUN,
&umass_debug, 0, "umass debug level");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, throttle, CTLFLAG_RWTUN,
&umass_throttle, 0, "Forced delay between commands in milliseconds");
#else
#define DIF(...) do { } while (0)
#define DPRINTF(...) do { } while (0)
#endif
#define UMASS_BULK_SIZE (1 << 17)
#define UMASS_CBI_DIAGNOSTIC_CMDLEN 12 /* bytes */
#define UMASS_MAX_CMDLEN MAX(12, CAM_MAX_CDBLEN) /* bytes */
/* USB transfer definitions */
#define UMASS_T_BBB_RESET1 0 /* Bulk-Only */
#define UMASS_T_BBB_RESET2 1
#define UMASS_T_BBB_RESET3 2
#define UMASS_T_BBB_COMMAND 3
#define UMASS_T_BBB_DATA_READ 4
#define UMASS_T_BBB_DATA_RD_CS 5
#define UMASS_T_BBB_DATA_WRITE 6
#define UMASS_T_BBB_DATA_WR_CS 7
#define UMASS_T_BBB_STATUS 8
#define UMASS_T_BBB_MAX 9
#define UMASS_T_CBI_RESET1 0 /* CBI */
#define UMASS_T_CBI_RESET2 1
#define UMASS_T_CBI_RESET3 2
#define UMASS_T_CBI_COMMAND 3
#define UMASS_T_CBI_DATA_READ 4
#define UMASS_T_CBI_DATA_RD_CS 5
#define UMASS_T_CBI_DATA_WRITE 6
#define UMASS_T_CBI_DATA_WR_CS 7
#define UMASS_T_CBI_STATUS 8
#define UMASS_T_CBI_RESET4 9
#define UMASS_T_CBI_MAX 10
#define UMASS_T_MAX MAX(UMASS_T_CBI_MAX, UMASS_T_BBB_MAX)
/* Generic definitions */
/* Direction for transfer */
#define DIR_NONE 0
#define DIR_IN 1
#define DIR_OUT 2
/* device name */
#define DEVNAME "umass"
#define DEVNAME_SIM "umass-sim"
/* Approximate maximum transfer speeds (assumes 33% overhead). */
#define UMASS_FULL_TRANSFER_SPEED 1000
#define UMASS_HIGH_TRANSFER_SPEED 40000
#define UMASS_SUPER_TRANSFER_SPEED 400000
#define UMASS_FLOPPY_TRANSFER_SPEED 20
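/*
 * These figures appear to be in KB/s, matching CAM's base_transfer_speed
 * (cf. the 150000 used for SATA 1.5Gb/s elsewhere in this diff); e.g. USB
 * high speed is 480 Mbit/s, roughly 60000 KB/s raw, and about two thirds
 * of that gives the 40000 above.
 */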
#define UMASS_TIMEOUT 5000 /* ms */
/* CAM specific definitions */
#define UMASS_SCSIID_MAX 1 /* maximum number of drives expected */
#define UMASS_SCSIID_HOST UMASS_SCSIID_MAX
/* Bulk-Only features */
#define UR_BBB_RESET 0xff /* Bulk-Only reset */
#define UR_BBB_GET_MAX_LUN 0xfe /* Get maximum lun */
/* Command Block Wrapper */
typedef struct {
uDWord dCBWSignature;
#define CBWSIGNATURE 0x43425355
uDWord dCBWTag;
uDWord dCBWDataTransferLength;
uByte bCBWFlags;
#define CBWFLAGS_OUT 0x00
#define CBWFLAGS_IN 0x80
uByte bCBWLUN;
uByte bCDBLength;
#define CBWCDBLENGTH 16
uByte CBWCDB[CBWCDBLENGTH];
} __packed umass_bbb_cbw_t;
#define UMASS_BBB_CBW_SIZE 31
/* Command Status Wrapper */
typedef struct {
uDWord dCSWSignature;
#define CSWSIGNATURE 0x53425355
#define CSWSIGNATURE_IMAGINATION_DBX1 0x43425355
#define CSWSIGNATURE_OLYMPUS_C1 0x55425355
uDWord dCSWTag;
uDWord dCSWDataResidue;
uByte bCSWStatus;
#define CSWSTATUS_GOOD 0x0
#define CSWSTATUS_FAILED 0x1
#define CSWSTATUS_PHASE 0x2
} __packed umass_bbb_csw_t;
#define UMASS_BBB_CSW_SIZE 13
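/*
 * Sanity check of the wire sizes above: the CBW packs 3 * 4 + 3 * 1 + 16 =
 * 31 bytes and the CSW packs 3 * 4 + 1 = 13 bytes, matching the
 * UMASS_BBB_CBW_SIZE and UMASS_BBB_CSW_SIZE constants.
 */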
/* CBI features */
#define UR_CBI_ADSC 0x00
typedef union {
struct {
uint8_t type;
#define IDB_TYPE_CCI 0x00
uint8_t value;
#define IDB_VALUE_PASS 0x00
#define IDB_VALUE_FAIL 0x01
#define IDB_VALUE_PHASE 0x02
#define IDB_VALUE_PERSISTENT 0x03
#define IDB_VALUE_STATUS_MASK 0x03
} __packed common;
struct {
uint8_t asc;
uint8_t ascq;
} __packed ufi;
} __packed umass_cbi_sbl_t;
struct umass_softc; /* see below */
typedef void (umass_callback_t)(struct umass_softc *sc, union ccb *ccb,
uint32_t residue, uint8_t status);
#define STATUS_CMD_OK 0 /* everything ok */
#define STATUS_CMD_UNKNOWN 1 /* will have to fetch sense */
#define STATUS_CMD_FAILED 2 /* transfer was ok, command failed */
#define STATUS_WIRE_FAILED 3 /* couldn't even get command across */
typedef uint8_t (umass_transform_t)(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len);
/* Wire and command protocol */
#define UMASS_PROTO_BBB 0x0001 /* USB wire protocol */
#define UMASS_PROTO_CBI 0x0002
#define UMASS_PROTO_CBI_I 0x0004
#define UMASS_PROTO_WIRE 0x00ff /* USB wire protocol mask */
#define UMASS_PROTO_SCSI 0x0100 /* command protocol */
#define UMASS_PROTO_ATAPI 0x0200
#define UMASS_PROTO_UFI 0x0400
#define UMASS_PROTO_RBC 0x0800
#define UMASS_PROTO_COMMAND 0xff00 /* command protocol mask */
/* Device specific quirks */
#define NO_QUIRKS 0x0000
/*
* The drive does not support Test Unit Ready. Convert to Start Unit
*/
#define NO_TEST_UNIT_READY 0x0001
/*
* The drive does not reset the Unit Attention state after REQUEST
* SENSE has been sent. The INQUIRY command does not reset the UA
* either, and so CAM runs in circles trying to retrieve the initial
* INQUIRY data.
*/
#define RS_NO_CLEAR_UA 0x0002
/* The drive does not support START STOP. */
#define NO_START_STOP 0x0004
/* Don't ask for full inquiry data (255b). */
#define FORCE_SHORT_INQUIRY 0x0008
/* Needs to be initialised the Shuttle way */
#define SHUTTLE_INIT 0x0010
/* Drive needs to be switched to alternate iface 1 */
#define ALT_IFACE_1 0x0020
/* Drive does not do 1Mb/s, but just floppy speeds (20kb/s) */
#define FLOPPY_SPEED 0x0040
/* The device can't count and gets the residue of transfers wrong */
#define IGNORE_RESIDUE 0x0080
/* No GetMaxLun call */
#define NO_GETMAXLUN 0x0100
/* The device uses a weird CSWSIGNATURE. */
#define WRONG_CSWSIG 0x0200
/* Device cannot handle INQUIRY so fake a generic response */
#define NO_INQUIRY 0x0400
/* Device cannot handle INQUIRY EVPD, return CHECK CONDITION */
#define NO_INQUIRY_EVPD 0x0800
/* Pad all RBC requests to 12 bytes. */
#define RBC_PAD_TO_12 0x1000
/*
* Device reports number of sectors from READ_CAPACITY, not max
* sector number.
*/
#define READ_CAPACITY_OFFBY1 0x2000
/*
* Device cannot handle a SCSI synchronize cache command. Normally
* this quirk would be handled in the cam layer, but for IDE bridges
* we need to associate the quirk with the bridge and not the
* underlying disk device. This is handled by faking a success
* result.
*/
#define NO_SYNCHRONIZE_CACHE 0x4000
/* Device does not support 'PREVENT/ALLOW MEDIUM REMOVAL'. */
#define NO_PREVENT_ALLOW 0x8000
struct umass_softc {
struct scsi_sense cam_scsi_sense;
struct scsi_test_unit_ready cam_scsi_test_unit_ready;
struct mtx sc_mtx;
struct {
uint8_t *data_ptr;
union ccb *ccb;
umass_callback_t *callback;
uint32_t data_len; /* bytes */
uint32_t data_rem; /* bytes */
uint32_t data_timeout; /* ms */
uint32_t actlen; /* bytes */
uint8_t cmd_data[UMASS_MAX_CMDLEN];
uint8_t cmd_len; /* bytes */
uint8_t dir;
uint8_t lun;
} sc_transfer;
/* Bulk specific variables for transfers in progress */
umass_bbb_cbw_t cbw; /* command block wrapper */
umass_bbb_csw_t csw; /* command status wrapper */
/* CBI specific variables for transfers in progress */
umass_cbi_sbl_t sbl; /* status block */
device_t sc_dev;
struct usb_device *sc_udev;
struct cam_sim *sc_sim; /* SCSI Interface Module */
struct usb_xfer *sc_xfer[UMASS_T_MAX];
/*
* The command transform function is used to convert the SCSI
* commands into their derivatives, like UFI, ATAPI, and friends.
*/
umass_transform_t *sc_transform;
uint32_t sc_unit;
uint32_t sc_quirks; /* they got it almost right */
uint32_t sc_proto; /* wire and cmd protocol */
uint8_t sc_name[16];
uint8_t sc_iface_no; /* interface number */
uint8_t sc_maxlun; /* maximum LUN number, inclusive */
uint8_t sc_last_xfer_index;
uint8_t sc_status_try;
};
struct umass_probe_proto {
uint32_t quirks;
uint32_t proto;
int error;
};
/* prototypes */
static device_probe_t umass_probe;
static device_attach_t umass_attach;
static device_detach_t umass_detach;
static usb_callback_t umass_tr_error;
static usb_callback_t umass_t_bbb_reset1_callback;
static usb_callback_t umass_t_bbb_reset2_callback;
static usb_callback_t umass_t_bbb_reset3_callback;
static usb_callback_t umass_t_bbb_command_callback;
static usb_callback_t umass_t_bbb_data_read_callback;
static usb_callback_t umass_t_bbb_data_rd_cs_callback;
static usb_callback_t umass_t_bbb_data_write_callback;
static usb_callback_t umass_t_bbb_data_wr_cs_callback;
static usb_callback_t umass_t_bbb_status_callback;
static usb_callback_t umass_t_cbi_reset1_callback;
static usb_callback_t umass_t_cbi_reset2_callback;
static usb_callback_t umass_t_cbi_reset3_callback;
static usb_callback_t umass_t_cbi_reset4_callback;
static usb_callback_t umass_t_cbi_command_callback;
static usb_callback_t umass_t_cbi_data_read_callback;
static usb_callback_t umass_t_cbi_data_rd_cs_callback;
static usb_callback_t umass_t_cbi_data_write_callback;
static usb_callback_t umass_t_cbi_data_wr_cs_callback;
static usb_callback_t umass_t_cbi_status_callback;
static void umass_cancel_ccb(struct umass_softc *);
static void umass_init_shuttle(struct umass_softc *);
static void umass_reset(struct umass_softc *);
static void umass_t_bbb_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static void umass_command_start(struct umass_softc *, uint8_t, void *,
uint32_t, uint32_t, umass_callback_t *, union ccb *);
static uint8_t umass_bbb_get_max_lun(struct umass_softc *);
static void umass_cbi_start_status(struct umass_softc *);
static void umass_t_cbi_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static int umass_cam_attach_sim(struct umass_softc *);
static void umass_cam_attach(struct umass_softc *);
static void umass_cam_detach_sim(struct umass_softc *);
static void umass_cam_action(struct cam_sim *, union ccb *);
static void umass_cam_poll(struct cam_sim *);
static void umass_cam_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_sense_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_quirk_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static uint8_t umass_scsi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_rbc_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_ufi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_atapi_transform(struct umass_softc *, uint8_t *,
uint8_t);
static uint8_t umass_no_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_std_transform(struct umass_softc *, union ccb *, uint8_t
*, uint8_t);
#ifdef USB_DEBUG
static void umass_bbb_dump_cbw(struct umass_softc *, umass_bbb_cbw_t *);
static void umass_bbb_dump_csw(struct umass_softc *, umass_bbb_csw_t *);
static void umass_cbi_dump_cmd(struct umass_softc *, void *, uint8_t);
static void umass_dump_buffer(struct umass_softc *, uint8_t *, uint32_t,
uint32_t);
#endif
static struct usb_config umass_bbb_config[UMASS_T_BBB_MAX] = {
[UMASS_T_BBB_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_BBB_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_COMMAND] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = sizeof(umass_bbb_cbw_t),
.callback = &umass_t_bbb_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_STATUS] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = sizeof(umass_bbb_csw_t),
.flags = {.short_xfer_ok = 1,},
.callback = &umass_t_bbb_status_callback,
.timeout = 5000, /* ms */
},
};
static struct usb_config umass_cbi_config[UMASS_T_CBI_MAX] = {
[UMASS_T_CBI_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_CBI_DIAGNOSTIC_CMDLEN),
.callback = &umass_t_cbi_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_CBI_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_COMMAND] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_MAX_CMDLEN),
.callback = &umass_t_cbi_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_STATUS] = {
.type = UE_INTERRUPT,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.flags = {.short_xfer_ok = 1,.no_pipe_ok = 1,},
.bufsize = sizeof(umass_cbi_sbl_t),
.callback = &umass_t_cbi_status_callback,
.timeout = 5000, /* ms */
},
[UMASS_T_CBI_RESET4] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset4_callback,
.timeout = 5000, /* ms */
},
};
/* If device cannot return valid inquiry data, fake it */
static const uint8_t fake_inq_data[SHORT_INQUIRY_LENGTH] = {
0, /* removable */ 0x80, SCSI_REV_2, SCSI_REV_2,
/* additional_length */ 31, 0, 0, 0
};
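/*
 * Field layout, per the standard SCSI INQUIRY format: byte 0 is the
 * peripheral type (0 = direct access), bit 7 of byte 1 is the RMB
 * (removable) flag, bytes 2-3 carry the ANSI version and response data
 * format, and byte 4 is the additional length (31).
 */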
#define UFI_COMMAND_LENGTH 12 /* UFI commands are always 12 bytes */
#define ATAPI_COMMAND_LENGTH 12 /* ATAPI commands are always 12 bytes */
static devclass_t umass_devclass;
static device_method_t umass_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, umass_probe),
DEVMETHOD(device_attach, umass_attach),
DEVMETHOD(device_detach, umass_detach),
DEVMETHOD_END
};
static driver_t umass_driver = {
.name = "umass",
.methods = umass_methods,
.size = sizeof(struct umass_softc),
};
static const STRUCT_USB_HOST_ID __used umass_devs[] = {
/* generic mass storage class */
{USB_IFACE_CLASS(UICLASS_MASS),},
};
DRIVER_MODULE(umass, uhub, umass_driver, umass_devclass, NULL, 0);
MODULE_DEPEND(umass, usb, 1, 1, 1);
MODULE_DEPEND(umass, cam, 1, 1, 1);
MODULE_VERSION(umass, 1);
USB_PNP_HOST_INFO(umass_devs);
/*
* USB device probe/attach/detach
*/
static uint16_t
umass_get_proto(struct usb_interface *iface)
{
struct usb_interface_descriptor *id;
uint16_t retval;
retval = 0;
/* Check for a standards compliant device */
id = usbd_get_interface_descriptor(iface);
if ((id == NULL) ||
(id->bInterfaceClass != UICLASS_MASS)) {
goto done;
}
switch (id->bInterfaceSubClass) {
case UISUBCLASS_SCSI:
retval |= UMASS_PROTO_SCSI;
break;
case UISUBCLASS_UFI:
retval |= UMASS_PROTO_UFI;
break;
case UISUBCLASS_RBC:
retval |= UMASS_PROTO_RBC;
break;
case UISUBCLASS_SFF8020I:
case UISUBCLASS_SFF8070I:
retval |= UMASS_PROTO_ATAPI;
break;
default:
goto done;
}
switch (id->bInterfaceProtocol) {
case UIPROTO_MASS_CBI:
retval |= UMASS_PROTO_CBI;
break;
case UIPROTO_MASS_CBI_I:
retval |= UMASS_PROTO_CBI_I;
break;
case UIPROTO_MASS_BBB_OLD:
case UIPROTO_MASS_BBB:
retval |= UMASS_PROTO_BBB;
break;
default:
goto done;
}
done:
return (retval);
}
/*
* Match the device we are seeing with the devices supported.
*/
static struct umass_probe_proto
umass_probe_proto(device_t dev, struct usb_attach_arg *uaa)
{
struct umass_probe_proto ret;
uint32_t quirks = NO_QUIRKS;
uint32_t proto = umass_get_proto(uaa->iface);
memset(&ret, 0, sizeof(ret));
ret.error = BUS_PROBE_GENERIC;
/* Search for protocol enforcement */
if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_BBB)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_BBB;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI_I)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI_I;
}
if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_SCSI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_SCSI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_ATAPI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_ATAPI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_UFI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_UFI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_RBC)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_RBC;
}
/* Check if the protocol is invalid */
if ((proto & UMASS_PROTO_COMMAND) == 0) {
ret.error = ENXIO;
goto done;
}
if ((proto & UMASS_PROTO_WIRE) == 0) {
ret.error = ENXIO;
goto done;
}
/* Search for quirks */
if (usb_test_quirk(uaa, UQ_MSC_NO_TEST_UNIT_READY))
quirks |= NO_TEST_UNIT_READY;
if (usb_test_quirk(uaa, UQ_MSC_NO_RS_CLEAR_UA))
quirks |= RS_NO_CLEAR_UA;
if (usb_test_quirk(uaa, UQ_MSC_NO_START_STOP))
quirks |= NO_START_STOP;
if (usb_test_quirk(uaa, UQ_MSC_NO_GETMAXLUN))
quirks |= NO_GETMAXLUN;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY))
quirks |= NO_INQUIRY;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY_EVPD))
quirks |= NO_INQUIRY_EVPD;
if (usb_test_quirk(uaa, UQ_MSC_NO_PREVENT_ALLOW))
quirks |= NO_PREVENT_ALLOW;
if (usb_test_quirk(uaa, UQ_MSC_NO_SYNC_CACHE))
quirks |= NO_SYNCHRONIZE_CACHE;
if (usb_test_quirk(uaa, UQ_MSC_SHUTTLE_INIT))
quirks |= SHUTTLE_INIT;
if (usb_test_quirk(uaa, UQ_MSC_ALT_IFACE_1))
quirks |= ALT_IFACE_1;
if (usb_test_quirk(uaa, UQ_MSC_FLOPPY_SPEED))
quirks |= FLOPPY_SPEED;
if (usb_test_quirk(uaa, UQ_MSC_IGNORE_RESIDUE))
quirks |= IGNORE_RESIDUE;
if (usb_test_quirk(uaa, UQ_MSC_WRONG_CSWSIG))
quirks |= WRONG_CSWSIG;
if (usb_test_quirk(uaa, UQ_MSC_RBC_PAD_TO_12))
quirks |= RBC_PAD_TO_12;
if (usb_test_quirk(uaa, UQ_MSC_READ_CAP_OFFBY1))
quirks |= READ_CAPACITY_OFFBY1;
if (usb_test_quirk(uaa, UQ_MSC_FORCE_SHORT_INQ))
quirks |= FORCE_SHORT_INQUIRY;
done:
ret.quirks = quirks;
ret.proto = proto;
return (ret);
}
static int
umass_probe(device_t dev)
{
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp;
if (uaa->usb_mode != USB_MODE_HOST) {
return (ENXIO);
}
temp = umass_probe_proto(dev, uaa);
return (temp.error);
}
static int
umass_attach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp = umass_probe_proto(dev, uaa);
struct usb_interface_descriptor *id;
int err;
/*
* NOTE: the softc struct is cleared in device_set_driver.
* We can safely call umass_detach without specifically
* initializing the struct.
*/
sc->sc_dev = dev;
sc->sc_udev = uaa->device;
sc->sc_proto = temp.proto;
sc->sc_quirks = temp.quirks;
sc->sc_unit = device_get_unit(dev);
snprintf(sc->sc_name, sizeof(sc->sc_name),
"%s", device_get_nameunit(dev));
device_set_usb_desc(dev);
mtx_init(&sc->sc_mtx, device_get_nameunit(dev),
NULL, MTX_DEF | MTX_RECURSE);
/* get interface index */
id = usbd_get_interface_descriptor(uaa->iface);
if (id == NULL) {
device_printf(dev, "failed to get "
"interface number\n");
goto detach;
}
sc->sc_iface_no = id->bInterfaceNumber;
#ifdef USB_DEBUG
device_printf(dev, " ");
switch (sc->sc_proto & UMASS_PROTO_COMMAND) {
case UMASS_PROTO_SCSI:
printf("SCSI");
break;
case UMASS_PROTO_ATAPI:
printf("8070i (ATAPI)");
break;
case UMASS_PROTO_UFI:
printf("UFI");
break;
case UMASS_PROTO_RBC:
printf("RBC");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_COMMAND);
break;
}
printf(" over ");
switch (sc->sc_proto & UMASS_PROTO_WIRE) {
case UMASS_PROTO_BBB:
printf("Bulk-Only");
break;
case UMASS_PROTO_CBI: /* uses Command/Bulk pipes */
printf("CBI");
break;
case UMASS_PROTO_CBI_I: /* uses Command/Bulk/Interrupt pipes */
printf("CBI with CCI");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_WIRE);
}
printf("; quirks = 0x%04x\n", sc->sc_quirks);
#endif
if (sc->sc_quirks & ALT_IFACE_1) {
err = usbd_set_alt_interface_index
(uaa->device, uaa->info.bIfaceIndex, 1);
if (err) {
DPRINTF(sc, UDMASS_USB, "could not switch to "
"Alt Interface 1\n");
goto detach;
}
}
/* allocate all required USB transfers */
if (sc->sc_proto & UMASS_PROTO_BBB) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_bbb_config,
UMASS_T_BBB_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
} else if (sc->sc_proto & (UMASS_PROTO_CBI | UMASS_PROTO_CBI_I)) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_cbi_config,
UMASS_T_CBI_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
} else {
err = USB_ERR_INVAL;
}
if (err) {
device_printf(dev, "could not setup required "
"transfers, %s\n", usbd_errstr(err));
goto detach;
}
#ifdef USB_DEBUG
if (umass_throttle > 0) {
uint8_t x;
int iv;
iv = umass_throttle;
if (iv < 1)
iv = 1;
else if (iv > 8000)
iv = 8000;
for (x = 0; x != UMASS_T_MAX; x++) {
if (sc->sc_xfer[x] != NULL)
usbd_xfer_set_interval(sc->sc_xfer[x], iv);
}
}
#endif
sc->sc_transform =
(sc->sc_proto & UMASS_PROTO_SCSI) ? &umass_scsi_transform :
(sc->sc_proto & UMASS_PROTO_UFI) ? &umass_ufi_transform :
(sc->sc_proto & UMASS_PROTO_ATAPI) ? &umass_atapi_transform :
(sc->sc_proto & UMASS_PROTO_RBC) ? &umass_rbc_transform :
&umass_no_transform;
/* from here onwards the device can be used. */
if (sc->sc_quirks & SHUTTLE_INIT) {
umass_init_shuttle(sc);
}
/* get the maximum LUN supported by the device */
if (((sc->sc_proto & UMASS_PROTO_WIRE) == UMASS_PROTO_BBB) &&
!(sc->sc_quirks & NO_GETMAXLUN))
sc->sc_maxlun = umass_bbb_get_max_lun(sc);
else
sc->sc_maxlun = 0;
/* Prepare the SCSI command block */
sc->cam_scsi_sense.opcode = REQUEST_SENSE;
sc->cam_scsi_test_unit_ready.opcode = TEST_UNIT_READY;
/* register the SIM */
err = umass_cam_attach_sim(sc);
if (err) {
goto detach;
}
/* scan the SIM */
umass_cam_attach(sc);
DPRINTF(sc, UDMASS_GEN, "Attach finished\n");
return (0); /* success */
detach:
umass_detach(dev);
return (ENXIO); /* failure */
}
static int
umass_detach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
DPRINTF(sc, UDMASS_USB, "\n");
/* teardown our state machine */
usbd_transfer_unsetup(sc->sc_xfer, UMASS_T_MAX);
mtx_lock(&sc->sc_mtx);
/* cancel any leftover CCB's */
umass_cancel_ccb(sc);
umass_cam_detach_sim(sc);
mtx_unlock(&sc->sc_mtx);
mtx_destroy(&sc->sc_mtx);
return (0); /* success */
}
static void
umass_init_shuttle(struct umass_softc *sc)
{
struct usb_device_request req;
uint8_t status[2] = {0, 0};
/*
* The Linux driver does this, but no one can tell us what the
* command does.
*/
req.bmRequestType = UT_READ_VENDOR_DEVICE;
req.bRequest = 1; /* XXX unknown command */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, sizeof(status));
usbd_do_request(sc->sc_udev, NULL, &req, &status);
DPRINTF(sc, UDMASS_GEN, "Shuttle init returned 0x%02x%02x\n",
status[0], status[1]);
}
/*
* Generic functions to handle transfers
*/
static void
umass_transfer_start(struct umass_softc *sc, uint8_t xfer_index)
{
DPRINTF(sc, UDMASS_GEN, "transfer index = "
"%d\n", xfer_index);
if (sc->sc_xfer[xfer_index]) {
sc->sc_last_xfer_index = xfer_index;
usbd_transfer_start(sc->sc_xfer[xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
static void
umass_reset(struct umass_softc *sc)
{
DPRINTF(sc, UDMASS_GEN, "resetting device\n");
/*
* stop the last transfer, if not already stopped:
*/
usbd_transfer_stop(sc->sc_xfer[sc->sc_last_xfer_index]);
umass_transfer_start(sc, 0);
}
static void
umass_cancel_ccb(struct umass_softc *sc)
{
union ccb *ccb;
USB_MTX_ASSERT(&sc->sc_mtx, MA_OWNED);
ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = 0;
if (ccb) {
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_WIRE_FAILED);
}
}
static void
umass_tr_error(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
if (error != USB_ERR_CANCELLED) {
DPRINTF(sc, UDMASS_GEN, "transfer error, %s -> "
"reset\n", usbd_errstr(error));
}
umass_cancel_ccb(sc);
}
/*
* BBB protocol specific functions
*/
static void
umass_t_bbb_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_BBB_RESET2);
return;
case USB_ST_SETUP:
/*
* Reset recovery (5.3.4 in Universal Serial Bus Mass Storage Class)
*
* For Reset Recovery the host shall issue in the following order:
* a) a Bulk-Only Mass Storage Reset
* b) a Clear Feature HALT to the Bulk-In endpoint
* c) a Clear Feature HALT to the Bulk-Out endpoint
*
* This is done in 3 steps, using 3 transfers:
* UMASS_T_BBB_RESET1
* UMASS_T_BBB_RESET2
* UMASS_T_BBB_RESET3
*/
DPRINTF(sc, UDMASS_BBB, "BBB reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_BBB_RESET; /* bulk only reset */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 0);
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frames(xfer, 1);
usbd_transfer_submit(xfer);
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_RESET3,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_COMMAND,
UMASS_T_BBB_DATA_WRITE, error);
}
static void
umass_t_bbb_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
umass_transfer_start(sc, next_xfer);
return;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred;
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
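/*
 * Explanatory note (not from the original source): the CBW assembled by
 * umass_t_bbb_command_callback() below is the 31-byte, little-endian
 * Command Block Wrapper of the Bulk-Only Transport, roughly:
 *
 *	dCBWSignature		4 bytes		CBWSIGNATURE ("USBC" on the wire)
 *	dCBWTag			4 bytes		echoed back in dCSWTag
 *	dCBWDataTransferLength	4 bytes		expected data-phase length
 *	bCBWFlags		1 byte		bit 7: 1 = data IN, 0 = data OUT
 *	bCBWLUN			1 byte		logical unit number
 *	bCDBLength		1 byte		valid CDB bytes (1..16)
 *	CBWCDB			16 bytes	SCSI CDB, zero padded
 */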
static void
umass_t_bbb_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t tag;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start
(sc, ((sc->sc_transfer.dir == DIR_IN) ? UMASS_T_BBB_DATA_READ :
(sc->sc_transfer.dir == DIR_OUT) ? UMASS_T_BBB_DATA_WRITE :
UMASS_T_BBB_STATUS));
return;
case USB_ST_SETUP:
sc->sc_status_try = 0;
if (ccb) {
/*
* the initial value is not important,
* as long as the values are unique:
*/
tag = UGETDW(sc->cbw.dCBWTag) + 1;
USETDW(sc->cbw.dCBWSignature, CBWSIGNATURE);
USETDW(sc->cbw.dCBWTag, tag);
/*
* dCBWDataTransferLength:
* This field indicates the number of bytes of data that the host
* intends to transfer on the IN or OUT Bulk endpoint(as indicated by
* the Direction bit) during the execution of this command. If this
* field is set to 0, the device will expect that no data will be
* transferred IN or OUT during this command, regardless of the value
* of the Direction bit defined in dCBWFlags.
*/
USETDW(sc->cbw.dCBWDataTransferLength, sc->sc_transfer.data_len);
/*
* dCBWFlags:
* The bits of the Flags field are defined as follows:
* Bits 0-6 reserved
* Bit 7 Direction - this bit shall be ignored if the
* dCBWDataTransferLength field is zero.
* 0 = data Out from host to device
* 1 = data In from device to host
*/
sc->cbw.bCBWFlags = ((sc->sc_transfer.dir == DIR_IN) ?
CBWFLAGS_IN : CBWFLAGS_OUT);
sc->cbw.bCBWLUN = sc->sc_transfer.lun;
if (sc->sc_transfer.cmd_len > sizeof(sc->cbw.CBWCDB)) {
sc->sc_transfer.cmd_len = sizeof(sc->cbw.CBWCDB);
DPRINTF(sc, UDMASS_BBB, "Truncating long command!\n");
}
sc->cbw.bCDBLength = sc->sc_transfer.cmd_len;
/* copy SCSI command data */
memcpy(sc->cbw.CBWCDB, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
/* clear remaining command area */
memset(sc->cbw.CBWCDB +
sc->sc_transfer.cmd_len, 0,
sizeof(sc->cbw.CBWCDB) -
sc->sc_transfer.cmd_len);
DIF(UDMASS_BBB, umass_bbb_dump_cbw(sc, &sc->cbw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &sc->cbw, sizeof(sc->cbw));
usbd_xfer_set_frame_len(xfer, 0, sizeof(sc->cbw));
usbd_transfer_submit(xfer);
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
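		/* FALLTHROUGH */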
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
static void
umass_t_bbb_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
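		/* FALLTHROUGH */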
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_WR_CS);
}
return;
}
}
static void
umass_t_bbb_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_WRITE, error);
}
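/*
 * Explanatory note (not from the original source): the CSW parsed by
 * umass_t_bbb_status_callback() below is the 13-byte Command Status
 * Wrapper of the Bulk-Only Transport, roughly:
 *
 *	dCSWSignature		4 bytes		CSWSIGNATURE ("USBS" on the wire)
 *	dCSWTag			4 bytes		must match dCBWTag
 *	dCSWDataResidue		4 bytes		bytes not transferred
 *	bCSWStatus		1 byte		CSWSTATUS_GOOD, CSWSTATUS_FAILED or
 *						CSWSTATUS_PHASE (triggers a reset)
 */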
static void
umass_t_bbb_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
/*
* Do a full reset if there is something wrong with the CSW:
*/
sc->sc_status_try = 1;
/* Zero missing parts of the CSW: */
if (actlen < (int)sizeof(sc->csw))
memset(&sc->csw, 0, sizeof(sc->csw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->csw, actlen);
DIF(UDMASS_BBB, umass_bbb_dump_csw(sc, &sc->csw));
residue = UGETDW(sc->csw.dCSWDataResidue);
if ((!residue) || (sc->sc_quirks & IGNORE_RESIDUE)) {
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
}
if (residue > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "truncating residue from %d "
"to %d bytes\n", residue, sc->sc_transfer.data_len);
residue = sc->sc_transfer.data_len;
}
/* translate weird command-status signatures: */
if (sc->sc_quirks & WRONG_CSWSIG) {
uint32_t temp = UGETDW(sc->csw.dCSWSignature);
if ((temp == CSWSIGNATURE_OLYMPUS_C1) ||
(temp == CSWSIGNATURE_IMAGINATION_DBX1)) {
USETDW(sc->csw.dCSWSignature, CSWSIGNATURE);
}
}
/* check CSW and handle eventual error */
if (UGETDW(sc->csw.dCSWSignature) != CSWSIGNATURE) {
DPRINTF(sc, UDMASS_BBB, "bad CSW signature 0x%08x != 0x%08x\n",
UGETDW(sc->csw.dCSWSignature), CSWSIGNATURE);
/*
* Invalid CSW: Wrong signature or wrong tag might
* indicate that we lost synchronization. Reset the
* device.
*/
goto tr_error;
} else if (UGETDW(sc->csw.dCSWTag) != UGETDW(sc->cbw.dCBWTag)) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: tag 0x%08x should be "
"0x%08x\n", UGETDW(sc->csw.dCSWTag),
UGETDW(sc->cbw.dCBWTag));
goto tr_error;
} else if (sc->csw.bCSWStatus > CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: status %d > %d\n",
sc->csw.bCSWStatus, CSWSTATUS_PHASE);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Phase error, residue = "
"%d\n", residue);
goto tr_error;
} else if (sc->sc_transfer.actlen > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "Buffer overrun %d > %d\n",
sc->sc_transfer.actlen, sc->sc_transfer.data_len);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_FAILED) {
DPRINTF(sc, UDMASS_BBB, "Command failed, residue = "
"%d\n", residue);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_FAILED);
} else {
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_OK);
}
return;
case USB_ST_SETUP:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
return;
default:
tr_error:
DPRINTF(sc, UDMASS_BBB, "Failed to read CSW: %s, try %d\n",
usbd_errstr(error), sc->sc_status_try);
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_status_try)) {
umass_tr_error(xfer, error);
} else {
sc->sc_status_try = 1;
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
static void
umass_command_start(struct umass_softc *sc, uint8_t dir,
void *data_ptr, uint32_t data_len,
uint32_t data_timeout, umass_callback_t *callback,
union ccb *ccb)
{
sc->sc_transfer.lun = ccb->ccb_h.target_lun;
/*
* NOTE: assumes that "sc->sc_transfer.cmd_data" and
* "sc->sc_transfer.cmd_len" has been properly
* initialized.
*/
sc->sc_transfer.dir = data_len ? dir : DIR_NONE;
sc->sc_transfer.data_ptr = data_ptr;
sc->sc_transfer.data_len = data_len;
sc->sc_transfer.data_rem = data_len;
sc->sc_transfer.data_timeout = (data_timeout + UMASS_TIMEOUT);
sc->sc_transfer.actlen = 0;
sc->sc_transfer.callback = callback;
sc->sc_transfer.ccb = ccb;
if (sc->sc_xfer[sc->sc_last_xfer_index]) {
usbd_transfer_start(sc->sc_xfer[sc->sc_last_xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
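/*
 * Explanatory note (not from the original source): GET MAX LUN is a
 * class-specific control request (UT_READ_CLASS_INTERFACE /
 * UR_BBB_GET_MAX_LUN, wIndex = interface number, wLength = 1) that
 * returns a single byte holding the highest LUN number, zero-based.
 * Devices that only implement LUN 0 typically STALL the request, which
 * is why a failure below is simply treated as "max LUN 0".
 */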
static uint8_t
umass_bbb_get_max_lun(struct umass_softc *sc)
{
struct usb_device_request req;
usb_error_t err;
uint8_t buf = 0;
/* The Get Max Lun command is a class-specific request. */
req.bmRequestType = UT_READ_CLASS_INTERFACE;
req.bRequest = UR_BBB_GET_MAX_LUN;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 1);
err = usbd_do_request(sc->sc_udev, NULL, &req, &buf);
if (err) {
buf = 0;
/* Device doesn't support Get Max Lun request. */
printf("%s: Get Max Lun not supported (%s)\n",
sc->sc_name, usbd_errstr(err));
}
return (buf);
}
/*
* Command/Bulk/Interrupt (CBI) specific functions
*/
static void
umass_cbi_start_status(struct umass_softc *sc)
{
if (sc->sc_xfer[UMASS_T_CBI_STATUS]) {
umass_transfer_start(sc, UMASS_T_CBI_STATUS);
} else {
union ccb *ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_CMD_UNKNOWN);
}
}
static void
umass_t_cbi_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
uint8_t buf[UMASS_CBI_DIAGNOSTIC_CMDLEN];
uint8_t i;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
case USB_ST_SETUP:
/*
* Command Block Reset Protocol
*
* First send a reset request to the device. Then clear
* any possibly stalled bulk endpoints.
*
* This is done in 3 steps, using 3 transfers:
* UMASS_T_CBI_RESET1
* UMASS_T_CBI_RESET2
* UMASS_T_CBI_RESET3
* UMASS_T_CBI_RESET4 (only if there is an interrupt endpoint)
*/
DPRINTF(sc, UDMASS_CBI, "CBI reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, UMASS_CBI_DIAGNOSTIC_CMDLEN);
/*
* The 0x1d opcode is also the SEND DIAGNOSTIC command. To
* distinguish the Command Block Reset from it, the last 10 bytes
* of the CBL are filled with 0xff (section 2.2 of the CBI
* specification).
*/
buf[0] = 0x1d; /* Command Block Reset */
buf[1] = 0x04;
for (i = 2; i < UMASS_CBI_DIAGNOSTIC_CMDLEN; i++) {
buf[i] = 0xff;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, buf, sizeof(buf));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sizeof(buf));
usbd_xfer_set_frames(xfer, 2);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if (error == USB_ERR_CANCELLED)
umass_tr_error(xfer, error);
else
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
}
}
static void
umass_t_cbi_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_RESET3,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
umass_t_cbi_data_clear_stall_callback
(xfer, (sc->sc_xfer[UMASS_T_CBI_RESET4] &&
sc->sc_xfer[UMASS_T_CBI_STATUS]) ?
UMASS_T_CBI_RESET4 : UMASS_T_CBI_COMMAND,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_reset4_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_COMMAND,
UMASS_T_CBI_STATUS, error);
}
static void
umass_t_cbi_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
if (next_xfer == UMASS_T_CBI_STATUS) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start(sc, next_xfer);
}
break;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred; /* should not happen */
}
break;
default: /* Error */
umass_tr_error(xfer, error);
break;
}
}
static void
umass_t_cbi_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (sc->sc_transfer.dir == DIR_NONE) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start
(sc, (sc->sc_transfer.dir == DIR_IN) ?
UMASS_T_CBI_DATA_READ : UMASS_T_CBI_DATA_WRITE);
}
break;
case USB_ST_SETUP:
if (ccb) {
/*
* do a CBI transfer with cmd_len bytes from
* cmd_data, possibly a data phase of data_len
* bytes from/to the device and finally a status
* read phase.
*/
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
req.wLength[0] = sc->sc_transfer.cmd_len;
req.wLength[1] = 0;
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sc->sc_transfer.cmd_len);
usbd_xfer_set_frames(xfer,
sc->sc_transfer.cmd_len ? 2 : 1);
DIF(UDMASS_CBI,
umass_cbi_dump_cmd(sc,
sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len));
usbd_transfer_submit(xfer);
}
break;
default: /* Error */
/*
* A STALL on the control pipe can be the result of a command
* error.  Attempting to clear this STALL, as is done for the
* bulk pipes, also results in a command completion interrupt,
* but the ASC/ASCQ reported there do not always look valid,
* so don't bother with it.
*/
if ((error == USB_ERR_STALLED) ||
(sc->sc_transfer.callback == &umass_cam_cb)) {
sc->sc_transfer.ccb = NULL;
(sc->sc_transfer.callback)
(sc, ccb, sc->sc_transfer.data_len,
STATUS_CMD_UNKNOWN);
} else {
umass_tr_error(xfer, error);
/* skip reset */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
}
break;
}
}
static void
umass_t_cbi_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
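		/* FALLTHROUGH */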
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_RD_CS);
}
break;
}
}
static void
umass_t_cbi_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
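		/* FALLTHROUGH */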
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_WR_CS);
}
break;
}
}
static void
umass_t_cbi_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
uint8_t status;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (actlen < (int)sizeof(sc->sbl)) {
goto tr_setup;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->sbl, sizeof(sc->sbl));
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
/* dissect the information in the buffer */
if (sc->sc_proto & UMASS_PROTO_UFI) {
/*
* Section 3.4.3.1.3 specifies that the UFI command
* protocol returns an ASC and ASCQ in the interrupt
* data block.
*/
DPRINTF(sc, UDMASS_CBI, "UFI CCI, ASC = 0x%02x, "
"ASCQ = 0x%02x\n", sc->sbl.ufi.asc,
sc->sbl.ufi.ascq);
status = (((sc->sbl.ufi.asc == 0) &&
(sc->sbl.ufi.ascq == 0)) ?
STATUS_CMD_OK : STATUS_CMD_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
} else {
/* Command Interrupt Data Block */
DPRINTF(sc, UDMASS_CBI, "type=0x%02x, value=0x%02x\n",
sc->sbl.common.type, sc->sbl.common.value);
if (sc->sbl.common.type == IDB_TYPE_CCI) {
status = (sc->sbl.common.value & IDB_VALUE_STATUS_MASK);
status = ((status == IDB_VALUE_PASS) ? STATUS_CMD_OK :
(status == IDB_VALUE_FAIL) ? STATUS_CMD_FAILED :
(status == IDB_VALUE_PERSISTENT) ? STATUS_CMD_FAILED :
STATUS_WIRE_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
}
}
/* fallthrough */
case USB_ST_SETUP:
tr_setup:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
break;
default: /* Error */
DPRINTF(sc, UDMASS_CBI, "Failed to read CSW: %s\n",
usbd_errstr(error));
umass_tr_error(xfer, error);
break;
}
}
/*
* CAM specific functions (used by SCSI, UFI, 8070i (ATAPI))
*/
static int
umass_cam_attach_sim(struct umass_softc *sc)
{
struct cam_devq *devq; /* Per device Queue */
cam_status status;
/*
* An HBA is attached to the CAM layer.
*
* The CAM layer will then, after a while, start probing for devices on
* the bus. The number of SIMs is limited to one.
*/
devq = cam_simq_alloc(1 /* maximum openings */ );
if (devq == NULL) {
return (ENOMEM);
}
sc->sc_sim = cam_sim_alloc
(&umass_cam_action, &umass_cam_poll,
DEVNAME_SIM,
sc /* priv */ ,
sc->sc_unit /* unit number */ ,
&sc->sc_mtx /* mutex */ ,
1 /* maximum device openings */ ,
0 /* maximum tagged device openings */ ,
devq);
if (sc->sc_sim == NULL) {
cam_simq_free(devq);
return (ENOMEM);
}
mtx_lock(&sc->sc_mtx);
status = xpt_bus_register(sc->sc_sim, sc->sc_dev, sc->sc_unit);
if (status != CAM_SUCCESS) {
cam_sim_free(sc->sc_sim, /* free_devq */ TRUE);
mtx_unlock(&sc->sc_mtx);
printf("%s: xpt_bus_register failed with status %#x\n",
__func__, status);
return (ENOMEM);
}
mtx_unlock(&sc->sc_mtx);
return (0);
}
static void
umass_cam_attach(struct umass_softc *sc)
{
#ifndef USB_DEBUG
if (bootverbose)
#endif
printf("%s:%d:%d: Attached to scbus%d\n",
sc->sc_name, cam_sim_path(sc->sc_sim),
sc->sc_unit, cam_sim_path(sc->sc_sim));
}
/* umass_cam_detach
* detach from the CAM layer
*/
static void
umass_cam_detach_sim(struct umass_softc *sc)
{
cam_status status;
if (sc->sc_sim != NULL) {
status = xpt_bus_deregister(cam_sim_path(sc->sc_sim));
if (status == CAM_REQ_CMP) {
/* accessing the softc is not possible after this */
sc->sc_sim->softc = NULL;
DPRINTF(sc, UDMASS_SCSI, "%s: %s:%d:%d caling "
"cam_sim_free sim %p refc %u mtx %p\n",
__func__, sc->sc_name, cam_sim_path(sc->sc_sim),
sc->sc_unit, sc->sc_sim,
sc->sc_sim->refcount, sc->sc_sim->mtx);
cam_sim_free(sc->sc_sim, /* free_devq */ TRUE);
} else {
panic("%s: %s: CAM layer is busy: %#x\n",
__func__, sc->sc_name, status);
}
sc->sc_sim = NULL;
}
}
/* umass_cam_action
* CAM requests for action come through here
*/
static void
umass_cam_action(struct cam_sim *sim, union ccb *ccb)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
xpt_done(ccb);
return;
}
/* Perform the requested action */
switch (ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
{
uint8_t *cmd;
uint8_t dir;
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"cmd: 0x%02x, flags: 0x%02x, "
"%db cmd/%db data/%db sense\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, cmd[0],
ccb->ccb_h.flags & CAM_DIR_MASK, ccb->csio.cdb_len,
ccb->csio.dxfer_len, ccb->csio.sense_len);
if (sc->sc_transfer.ccb) {
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"I/O in progress, deferring\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_SCSI_BUSY;
xpt_done(ccb);
goto done;
}
switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
case CAM_DIR_IN:
dir = DIR_IN;
break;
case CAM_DIR_OUT:
dir = DIR_OUT;
DIF(UDMASS_SCSI,
umass_dump_buffer(sc, ccb->csio.data_ptr,
ccb->csio.dxfer_len, 48));
break;
default:
dir = DIR_NONE;
}
ccb->ccb_h.status = CAM_REQ_INPROG | CAM_SIM_QUEUED;
/*
* sc->sc_transform will convert the command to the
* command format needed by the specific command set
* and return the converted command in
* "sc->sc_transfer.cmd_data"
*/
if (umass_std_transform(sc, ccb, cmd, ccb->csio.cdb_len)) {
if (sc->sc_transfer.cmd_data[0] == INQUIRY) {
const char *pserial;
pserial = usb_get_serial(sc->sc_udev);
/*
* Umass devices don't generally report their serial numbers
* in the usual SCSI way. Emulate it here.
*/
if ((sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
(sc->sc_transfer.cmd_data[2] == SVPD_UNIT_SERIAL_NUMBER) &&
(pserial[0] != '\0')) {
struct scsi_vpd_unit_serial_number *vpd_serial;
vpd_serial = (struct scsi_vpd_unit_serial_number *)ccb->csio.data_ptr;
vpd_serial->length = strlen(pserial);
if (vpd_serial->length > sizeof(vpd_serial->serial_num))
vpd_serial->length = sizeof(vpd_serial->serial_num);
memcpy(vpd_serial->serial_num, pserial, vpd_serial->length);
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
/*
* Handle EVPD inquiry for broken devices first
* NO_INQUIRY also implies NO_INQUIRY_EVPD
*/
if ((sc->sc_quirks & (NO_INQUIRY_EVPD | NO_INQUIRY)) &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD)) {
scsi_set_sense_data(&ccb->csio.sense_data,
/*sense_format*/ SSD_TYPE_NONE,
/*current_error*/ 1,
/*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
/*asc*/ 0x24,
/*ascq*/ 0x00,
/*extra args*/ SSD_ELEM_NONE);
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
ccb->ccb_h.status =
CAM_SCSI_STATUS_ERROR |
CAM_AUTOSNS_VALID |
CAM_DEV_QFRZN;
xpt_freeze_devq(ccb->ccb_h.path, 1);
xpt_done(ccb);
goto done;
}
/*
* Return fake inquiry data for
* broken devices
*/
if (sc->sc_quirks & NO_INQUIRY) {
memcpy(ccb->csio.data_ptr, &fake_inq_data,
sizeof(fake_inq_data));
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
ccb->csio.dxfer_len = SHORT_INQUIRY_LENGTH;
}
} else if (sc->sc_transfer.cmd_data[0] == PREVENT_ALLOW) {
if (sc->sc_quirks & NO_PREVENT_ALLOW) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
} else if (sc->sc_transfer.cmd_data[0] == SYNCHRONIZE_CACHE) {
if (sc->sc_quirks & NO_SYNCHRONIZE_CACHE) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
}
umass_command_start(sc, dir, ccb->csio.data_ptr,
ccb->csio.dxfer_len,
ccb->ccb_h.timeout,
&umass_cam_cb, ccb);
}
break;
}
case XPT_PATH_INQ:
{
struct ccb_pathinq *cpi = &ccb->cpi;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_PATH_INQ:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
/* host specific information */
cpi->version_num = 1;
cpi->hba_inquiry = 0;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_NO_6_BYTE;
cpi->hba_eng_cnt = 0;
cpi->max_target = UMASS_SCSIID_MAX; /* one target */
cpi->initiator_id = UMASS_SCSIID_HOST;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "USB SCSI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = sc->sc_unit;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_2;
cpi->transport = XPORT_USB;
cpi->transport_version = 0;
if (sc == NULL) {
cpi->base_transfer_speed = 0;
cpi->max_lun = 0;
} else {
if (sc->sc_quirks & FLOPPY_SPEED) {
cpi->base_transfer_speed =
UMASS_FLOPPY_TRANSFER_SPEED;
} else {
switch (usbd_get_speed(sc->sc_udev)) {
case USB_SPEED_SUPER:
cpi->base_transfer_speed =
UMASS_SUPER_TRANSFER_SPEED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
break;
case USB_SPEED_HIGH:
cpi->base_transfer_speed =
UMASS_HIGH_TRANSFER_SPEED;
break;
default:
cpi->base_transfer_speed =
UMASS_FULL_TRANSFER_SPEED;
break;
}
}
cpi->max_lun = sc->sc_maxlun;
}
cpi->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_RESET_DEV:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_RESET_DEV:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
umass_reset(sc);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_GET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_2;
cts->transport = XPORT_USB;
cts->transport_version = 0;
cts->xport_specific.valid = 0;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_SET_TRAN_SETTINGS:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
case XPT_CALC_GEOMETRY:
{
cam_calc_geometry(&ccb->ccg, /* extended */ 1);
xpt_done(ccb);
break;
}
case XPT_NOOP:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_NOOP:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:func_code 0x%04x: "
"Not implemented\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, ccb->ccb_h.func_code);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
done:
return;
}
static void
umass_cam_poll(struct cam_sim *sim)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL)
return;
DPRINTF(sc, UDMASS_SCSI, "CAM poll\n");
usbd_transfer_poll(sc->sc_xfer, UMASS_T_MAX);
}
/* umass_cam_cb
* finalise a completed CAM command
*/
static void
umass_cam_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
ccb->csio.resid = residue;
switch (status) {
case STATUS_CMD_OK:
ccb->ccb_h.status = CAM_REQ_CMP;
if ((sc->sc_quirks & READ_CAPACITY_OFFBY1) &&
(ccb->ccb_h.func_code == XPT_SCSI_IO) &&
(ccb->csio.cdb_io.cdb_bytes[0] == READ_CAPACITY)) {
struct scsi_read_capacity_data *rcap;
uint32_t maxsector;
rcap = (void *)(ccb->csio.data_ptr);
maxsector = scsi_4btoul(rcap->addr) - 1;
scsi_ulto4b(maxsector, rcap->addr);
}
/*
* We have to add SVPD_UNIT_SERIAL_NUMBER to the list
* of pages supported by the device - otherwise, CAM
* will never ask us for the serial number if the
* device cannot handle that by itself.
*/
if (ccb->ccb_h.func_code == XPT_SCSI_IO &&
sc->sc_transfer.cmd_data[0] == INQUIRY &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
sc->sc_transfer.cmd_data[2] == SVPD_SUPPORTED_PAGE_LIST &&
(usb_get_serial(sc->sc_udev)[0] != '\0')) {
struct ccb_scsiio *csio;
struct scsi_vpd_supported_page_list *page_list;
csio = &ccb->csio;
page_list = (struct scsi_vpd_supported_page_list *)csio->data_ptr;
if (page_list->length + 1 < SVPD_SUPPORTED_PAGES_SIZE) {
page_list->list[page_list->length] = SVPD_UNIT_SERIAL_NUMBER;
page_list->length++;
}
}
xpt_done(ccb);
break;
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED:
/* fetch sense data */
/* the rest of the command was filled in at attach */
sc->cam_scsi_sense.length = ccb->csio.sense_len;
DPRINTF(sc, UDMASS_SCSI, "Fetching %d bytes of "
"sense data\n", ccb->csio.sense_len);
if (umass_std_transform(sc, ccb, &sc->cam_scsi_sense.opcode,
sizeof(sc->cam_scsi_sense))) {
if ((sc->sc_quirks & FORCE_SHORT_INQUIRY) &&
(sc->sc_transfer.cmd_data[0] == INQUIRY)) {
ccb->csio.sense_len = SHORT_INQUIRY_LENGTH;
}
umass_command_start(sc, DIR_IN, &ccb->csio.sense_data.error_code,
ccb->csio.sense_len, ccb->ccb_h.timeout,
&umass_cam_sense_cb, ccb);
}
break;
default:
/*
* The wire protocol failed and will hopefully have
* recovered. We return an error to CAM and let CAM
* retry the command if necessary.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
xpt_done(ccb);
break;
}
}
/*
* Finalise a completed autosense operation
*/
static void
umass_cam_sense_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
uint8_t *cmd;
switch (status) {
case STATUS_CMD_OK:
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED: {
int key, sense_len;
ccb->csio.sense_resid = residue;
sense_len = ccb->csio.sense_len - ccb->csio.sense_resid;
key = scsi_get_sense_key(&ccb->csio.sense_data, sense_len,
/*show_errors*/ 1);
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
/*
* Getting sense data always succeeds (apart from wire
* failures):
*/
if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == INQUIRY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Ignore unit attention errors in the case where
* the Unit Attention state is not cleared on
* REQUEST SENSE. They will appear again at the next
* command.
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if (key == SSD_KEY_NO_SENSE) {
/*
* No problem after all (in the case of CBI without
* CCI)
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == READ_CAPACITY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Some devices do not clear the unit attention error
* on request sense. We insert a test unit ready
* command to make sure we clear the unit attention
* condition, then allow the retry to proceed as
* usual.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
#if 0
DELAY(300000);
#endif
DPRINTF(sc, UDMASS_SCSI, "Doing a sneaky"
"TEST_UNIT_READY\n");
/* the rest of the command was filled in at attach */
if ((sc->sc_transform)(sc,
&sc->cam_scsi_test_unit_ready.opcode,
sizeof(sc->cam_scsi_test_unit_ready)) == 1) {
umass_command_start(sc, DIR_NONE, NULL, 0,
ccb->ccb_h.timeout,
&umass_cam_quirk_cb, ccb);
break;
}
} else {
xpt_freeze_devq(ccb->ccb_h.path, 1);
if (key >= 0) {
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL
| CAM_DEV_QFRZN;
}
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "Autosense failed, "
"status %d\n", status);
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
}
}
/*
* This completion code just handles the fact that we sent a test-unit-ready
* after having previously failed a READ CAPACITY with CHECK_COND. The CCB
* status for CAM is already set earlier.
*/
static void
umass_cam_quirk_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
DPRINTF(sc, UDMASS_SCSI, "Test unit ready "
"returned status %d\n", status);
xpt_done(ccb);
}
/*
* SCSI specific functions
*/
static uint8_t
umass_scsi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
sc->sc_transfer.cmd_len = cmd_len;
switch (cmd_ptr[0]) {
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
memset(sc->sc_transfer.cmd_data, 0, cmd_len);
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1);
}
static uint8_t
umass_rbc_transform(struct umass_softc *sc, uint8_t *cmd_ptr, uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
switch (cmd_ptr[0]) {
/* these commands are defined in RBC: */
case READ_10:
case READ_CAPACITY:
case START_STOP_UNIT:
case SYNCHRONIZE_CACHE:
case WRITE_10:
case VERIFY_10:
case INQUIRY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case TEST_UNIT_READY:
case WRITE_BUFFER:
/*
* The following commands are not listed in my copy of the
* RBC specs. CAM however seems to want those, and at least
* the Sony DSC device appears to support those as well
*/
case REQUEST_SENSE:
case PREVENT_ALLOW:
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
if ((sc->sc_quirks & RBC_PAD_TO_12) && (cmd_len < 12)) {
memset(sc->sc_transfer.cmd_data + cmd_len,
0, 12 - cmd_len);
cmd_len = 12;
}
sc->sc_transfer.cmd_len = cmd_len;
return (1); /* success */
/* All other commands are not legal in RBC */
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported RBC "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
}
static uint8_t
umass_ufi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* A UFI command is always 12 bytes in length */
sc->sc_transfer.cmd_len = UFI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, UFI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands of which the format has been verified. They
* should work. Copy the command into the (zeroed out)
* destination buffer.
*/
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
/*
* Some devices do not support this command. Start
* Stop Unit should give the same results
*/
DPRINTF(sc, UDMASS_UFI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case FORMAT_UNIT:
case INQUIRY:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case WRITE_AND_VERIFY:
case VERIFY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_12:
case WRITE_12:
case READ_FORMAT_CAPACITIES:
break;
/*
* SYNCHRONIZE_CACHE isn't supported by UFI, nor should it be
* required for UFI devices, so it is appropriate to fake
* success.
*/
case SYNCHRONIZE_CACHE:
return (2);
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported UFI "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
/*
* 8070i (ATAPI) specific functions
*/
static uint8_t
umass_atapi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* An ATAPI command is always 12 bytes in length. */
sc->sc_transfer.cmd_len = ATAPI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, ATAPI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands of which the format has been verified. They
* should work. Copy the command into the destination
* buffer.
*/
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case SYNCHRONIZE_CACHE:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_BUFFER:
case 0x42: /* READ_SUBCHANNEL */
case 0x43: /* READ_TOC */
case 0x44: /* READ_HEADER */
case 0x47: /* PLAY_MSF (Play Minute/Second/Frame) */
case 0x48: /* PLAY_TRACK */
case 0x49: /* PLAY_TRACK_REL */
case 0x4b: /* PAUSE */
case 0x51: /* READ_DISK_INFO */
case 0x52: /* READ_TRACK_INFO */
case 0x54: /* SEND_OPC */
case 0x59: /* READ_MASTER_CUE */
case 0x5b: /* CLOSE_TR_SESSION */
case 0x5c: /* READ_BUFFER_CAP */
case 0x5d: /* SEND_CUE_SHEET */
case 0xa1: /* BLANK */
case 0xa5: /* PLAY_12 */
case 0xa6: /* EXCHANGE_MEDIUM */
case 0xad: /* READ_DVD_STRUCTURE */
case 0xbb: /* SET_CD_SPEED */
case 0xe5: /* READ_TRACK_INFO_PHILIPS */
break;
case READ_12:
case WRITE_12:
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported ATAPI "
"command 0x%02x - trying anyway\n",
cmd_ptr[0]);
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
static uint8_t
umass_no_transform(struct umass_softc *sc, uint8_t *cmd,
uint8_t cmdlen)
{
return (0); /* failure */
}
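/*
 * Explanatory note (not from the original source): the per-protocol
 * transform functions above share one return convention, which
 * umass_std_transform() below maps onto CAM:
 *
 *	0 - command not supported; the CCB is completed with
 *	    CAM_REQ_INVALID and the device queue is frozen
 *	1 - command transformed into "sc->sc_transfer.cmd_data";
 *	    execute it
 *	2 - command is a no-op for this protocol; complete the CCB
 *	    immediately with CAM_REQ_CMP (fake success)
 */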
static uint8_t
umass_std_transform(struct umass_softc *sc, union ccb *ccb,
uint8_t *cmd, uint8_t cmdlen)
{
uint8_t retval;
retval = (sc->sc_transform) (sc, cmd, cmdlen);
if (retval == 2) {
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return (0);
} else if (retval == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_INVALID | CAM_DEV_QFRZN;
xpt_done(ccb);
return (0);
}
/* Command should be executed */
return (1);
}
#ifdef USB_DEBUG
static void
umass_bbb_dump_cbw(struct umass_softc *sc, umass_bbb_cbw_t *cbw)
{
uint8_t *c = cbw->CBWCDB;
uint32_t dlen = UGETDW(cbw->dCBWDataTransferLength);
uint32_t tag = UGETDW(cbw->dCBWTag);
uint8_t clen = cbw->bCDBLength;
uint8_t flags = cbw->bCBWFlags;
uint8_t lun = cbw->bCBWLUN;
DPRINTF(sc, UDMASS_BBB, "CBW %d: cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, lun = %d, dir = %s\n",
tag, clen,
c[0], c[1], c[2], c[3], c[4], c[5], (clen > 6 ? "..." : ""),
dlen, lun, (flags == CBWFLAGS_IN ? "in" :
(flags == CBWFLAGS_OUT ? "out" : "<invalid>")));
}
static void
umass_bbb_dump_csw(struct umass_softc *sc, umass_bbb_csw_t *csw)
{
uint32_t sig = UGETDW(csw->dCSWSignature);
uint32_t tag = UGETDW(csw->dCSWTag);
uint32_t res = UGETDW(csw->dCSWDataResidue);
uint8_t status = csw->bCSWStatus;
DPRINTF(sc, UDMASS_BBB, "CSW %d: sig = 0x%08x (%s), tag = 0x%08x, "
"res = %d, status = 0x%02x (%s)\n",
tag, sig, (sig == CSWSIGNATURE ? "valid" : "invalid"),
tag, res,
status, (status == CSWSTATUS_GOOD ? "good" :
(status == CSWSTATUS_FAILED ? "failed" :
(status == CSWSTATUS_PHASE ? "phase" : "<invalid>"))));
}
static void
umass_cbi_dump_cmd(struct umass_softc *sc, void *cmd, uint8_t cmdlen)
{
uint8_t *c = cmd;
uint8_t dir = sc->sc_transfer.dir;
DPRINTF(sc, UDMASS_BBB, "cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, dir = %s\n",
cmdlen,
c[0], c[1], c[2], c[3], c[4], c[5], (cmdlen > 6 ? "..." : ""),
sc->sc_transfer.data_len,
(dir == DIR_IN ? "in" :
(dir == DIR_OUT ? "out" :
(dir == DIR_NONE ? "no data phase" : "<invalid>"))));
}
static void
umass_dump_buffer(struct umass_softc *sc, uint8_t *buffer, uint32_t buflen,
uint32_t printlen)
{
uint32_t i, j;
char s1[40];
char s2[40];
char s3[5];
s1[0] = '\0';
s3[0] = '\0';
sprintf(s2, " buffer=%p, buflen=%d", buffer, buflen);
for (i = 0; (i < buflen) && (i < printlen); i++) {
j = i % 16;
if (j == 0 && i != 0) {
DPRINTF(sc, UDMASS_GEN, "0x %s%s\n",
s1, s2);
s2[0] = '\0';
}
sprintf(&s1[j * 2], "%02x", buffer[i] & 0xff);
}
if (buflen > printlen)
sprintf(s3, " ...");
DPRINTF(sc, UDMASS_GEN, "0x %s%s%s\n",
s1, s2, s3);
}
#endif
diff --git a/sys/dev/virtio/block/virtio_blk.c b/sys/dev/virtio/block/virtio_blk.c
index ff7d573cfa7b..22005f52f4bc 100644
--- a/sys/dev/virtio/block/virtio_blk.c
+++ b/sys/dev/virtio/block/virtio_blk.c
@@ -1,1455 +1,1455 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Driver for VirtIO block devices. */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>
#include "virtio_if.h"
struct vtblk_request {
struct virtio_blk_outhdr vbr_hdr;
struct bio *vbr_bp;
uint8_t vbr_ack;
TAILQ_ENTRY(vtblk_request) vbr_link;
};
enum vtblk_cache_mode {
VTBLK_CACHE_WRITETHROUGH,
VTBLK_CACHE_WRITEBACK,
VTBLK_CACHE_MAX
};
struct vtblk_softc {
device_t vtblk_dev;
struct mtx vtblk_mtx;
uint64_t vtblk_features;
uint32_t vtblk_flags;
#define VTBLK_FLAG_INDIRECT 0x0001
#define VTBLK_FLAG_READONLY 0x0002
#define VTBLK_FLAG_DETACH 0x0004
#define VTBLK_FLAG_SUSPEND 0x0008
#define VTBLK_FLAG_BARRIER 0x0010
#define VTBLK_FLAG_WC_CONFIG 0x0020
#define VTBLK_FLAG_DISCARD 0x0040
struct virtqueue *vtblk_vq;
struct sglist *vtblk_sglist;
struct disk *vtblk_disk;
struct bio_queue_head vtblk_bioq;
TAILQ_HEAD(, vtblk_request)
vtblk_req_free;
TAILQ_HEAD(, vtblk_request)
vtblk_req_ready;
struct vtblk_request *vtblk_req_ordered;
int vtblk_max_nsegs;
int vtblk_request_count;
enum vtblk_cache_mode vtblk_write_cache;
struct bio_queue vtblk_dump_queue;
struct vtblk_request vtblk_dump_request;
};
static struct virtio_feature_desc vtblk_feature_desc[] = {
{ VIRTIO_BLK_F_BARRIER, "HostBarrier" },
{ VIRTIO_BLK_F_SIZE_MAX, "MaxSegSize" },
{ VIRTIO_BLK_F_SEG_MAX, "MaxNumSegs" },
{ VIRTIO_BLK_F_GEOMETRY, "DiskGeometry" },
{ VIRTIO_BLK_F_RO, "ReadOnly" },
{ VIRTIO_BLK_F_BLK_SIZE, "BlockSize" },
{ VIRTIO_BLK_F_SCSI, "SCSICmds" },
{ VIRTIO_BLK_F_WCE, "WriteCache" },
{ VIRTIO_BLK_F_TOPOLOGY, "Topology" },
{ VIRTIO_BLK_F_CONFIG_WCE, "ConfigWCE" },
{ VIRTIO_BLK_F_DISCARD, "Discard" },
{ 0, NULL }
};
static int vtblk_modevent(module_t, int, void *);
static int vtblk_probe(device_t);
static int vtblk_attach(device_t);
static int vtblk_detach(device_t);
static int vtblk_suspend(device_t);
static int vtblk_resume(device_t);
static int vtblk_shutdown(device_t);
static int vtblk_config_change(device_t);
static int vtblk_open(struct disk *);
static int vtblk_close(struct disk *);
static int vtblk_ioctl(struct disk *, u_long, void *, int,
struct thread *);
static int vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void vtblk_strategy(struct bio *);
static void vtblk_negotiate_features(struct vtblk_softc *);
static void vtblk_setup_features(struct vtblk_softc *);
static int vtblk_maximum_segments(struct vtblk_softc *,
struct virtio_blk_config *);
static int vtblk_alloc_virtqueue(struct vtblk_softc *);
static void vtblk_resize_disk(struct vtblk_softc *, uint64_t);
static void vtblk_alloc_disk(struct vtblk_softc *,
struct virtio_blk_config *);
static void vtblk_create_disk(struct vtblk_softc *);
static int vtblk_request_prealloc(struct vtblk_softc *);
static void vtblk_request_free(struct vtblk_softc *);
static struct vtblk_request *
vtblk_request_dequeue(struct vtblk_softc *);
static void vtblk_request_enqueue(struct vtblk_softc *,
struct vtblk_request *);
static struct vtblk_request *
vtblk_request_next_ready(struct vtblk_softc *);
static void vtblk_request_requeue_ready(struct vtblk_softc *,
struct vtblk_request *);
static struct vtblk_request *
vtblk_request_next(struct vtblk_softc *);
static struct vtblk_request *
vtblk_request_bio(struct vtblk_softc *);
static int vtblk_request_execute(struct vtblk_softc *,
struct vtblk_request *);
static int vtblk_request_error(struct vtblk_request *);
static void vtblk_queue_completed(struct vtblk_softc *,
struct bio_queue *);
static void vtblk_done_completed(struct vtblk_softc *,
struct bio_queue *);
static void vtblk_drain_vq(struct vtblk_softc *);
static void vtblk_drain(struct vtblk_softc *);
static void vtblk_startio(struct vtblk_softc *);
static void vtblk_bio_done(struct vtblk_softc *, struct bio *, int);
static void vtblk_read_config(struct vtblk_softc *,
struct virtio_blk_config *);
static void vtblk_ident(struct vtblk_softc *);
static int vtblk_poll_request(struct vtblk_softc *,
struct vtblk_request *);
static int vtblk_quiesce(struct vtblk_softc *);
static void vtblk_vq_intr(void *);
static void vtblk_stop(struct vtblk_softc *);
static void vtblk_dump_quiesce(struct vtblk_softc *);
static int vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t);
static int vtblk_dump_flush(struct vtblk_softc *);
static void vtblk_dump_complete(struct vtblk_softc *);
static void vtblk_set_write_cache(struct vtblk_softc *, int);
static int vtblk_write_cache_enabled(struct vtblk_softc *sc,
struct virtio_blk_config *);
static int vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
static void vtblk_setup_sysctl(struct vtblk_softc *);
static int vtblk_tunable_int(struct vtblk_softc *, const char *, int);
/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
static int vtblk_writecache_mode = -1;
TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
(VIRTIO_BLK_F_BARRIER | \
VIRTIO_BLK_F_SIZE_MAX | \
VIRTIO_BLK_F_SEG_MAX | \
VIRTIO_BLK_F_GEOMETRY | \
VIRTIO_BLK_F_RO | \
VIRTIO_BLK_F_BLK_SIZE | \
VIRTIO_BLK_F_WCE | \
VIRTIO_BLK_F_TOPOLOGY | \
VIRTIO_BLK_F_CONFIG_WCE | \
VIRTIO_BLK_F_DISCARD | \
VIRTIO_RING_F_INDIRECT_DESC)
#define VTBLK_MTX(_sc) &(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
mtx_init(VTBLK_MTX((_sc)), (_name), \
"VirtIO Block Lock", MTX_DEF)
#define VTBLK_LOCK(_sc) mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc) mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc) mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc) mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
#define VTBLK_DISK_NAME "vtbd"
#define VTBLK_QUIESCE_TIMEOUT (30 * hz)
/*
* Each block request uses at least two segments - one for the header
* and one for the status.
*/
#define VTBLK_MIN_SEGMENTS 2
static device_method_t vtblk_methods[] = {
/* Device methods. */
DEVMETHOD(device_probe, vtblk_probe),
DEVMETHOD(device_attach, vtblk_attach),
DEVMETHOD(device_detach, vtblk_detach),
DEVMETHOD(device_suspend, vtblk_suspend),
DEVMETHOD(device_resume, vtblk_resume),
DEVMETHOD(device_shutdown, vtblk_shutdown),
/* VirtIO methods. */
DEVMETHOD(virtio_config_change, vtblk_config_change),
DEVMETHOD_END
};
static driver_t vtblk_driver = {
"vtblk",
vtblk_methods,
sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;
DRIVER_MODULE(virtio_blk, virtio_mmio, vtblk_driver, vtblk_devclass,
vtblk_modevent, 0);
DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
VIRTIO_SIMPLE_PNPTABLE(virtio_blk, VIRTIO_ID_BLOCK, "VirtIO Block Adapter");
VIRTIO_SIMPLE_PNPINFO(virtio_mmio, virtio_blk);
VIRTIO_SIMPLE_PNPINFO(virtio_pci, virtio_blk);
static int
vtblk_modevent(module_t mod, int type, void *unused)
{
int error;
error = 0;
switch (type) {
case MOD_LOAD:
case MOD_QUIESCE:
case MOD_UNLOAD:
case MOD_SHUTDOWN:
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static int
vtblk_probe(device_t dev)
{
return (VIRTIO_SIMPLE_PROBE(dev, virtio_blk));
}
static int
vtblk_attach(device_t dev)
{
struct vtblk_softc *sc;
struct virtio_blk_config blkcfg;
int error;
virtio_set_feature_desc(dev, vtblk_feature_desc);
sc = device_get_softc(dev);
sc->vtblk_dev = dev;
VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
bioq_init(&sc->vtblk_bioq);
TAILQ_INIT(&sc->vtblk_dump_queue);
TAILQ_INIT(&sc->vtblk_req_free);
TAILQ_INIT(&sc->vtblk_req_ready);
vtblk_setup_sysctl(sc);
vtblk_setup_features(sc);
vtblk_read_config(sc, &blkcfg);
/*
* With the current sglist(9) implementation, it is not easy
* for us to support a maximum segment size as adjacent
* segments are coalesced. For now, just make sure the reported size
* is no smaller than the maximum supported transfer size.
*/
if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
- if (blkcfg.size_max < MAXPHYS) {
+ if (blkcfg.size_max < maxphys) {
error = ENOTSUP;
device_printf(dev, "host requires unsupported "
"maximum segment size feature\n");
goto fail;
}
}
sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
error = EINVAL;
device_printf(dev, "fewer than minimum number of segments "
"allowed: %d\n", sc->vtblk_max_nsegs);
goto fail;
}
sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
if (sc->vtblk_sglist == NULL) {
error = ENOMEM;
device_printf(dev, "cannot allocate sglist\n");
goto fail;
}
error = vtblk_alloc_virtqueue(sc);
if (error) {
device_printf(dev, "cannot allocate virtqueue\n");
goto fail;
}
error = vtblk_request_prealloc(sc);
if (error) {
device_printf(dev, "cannot preallocate requests\n");
goto fail;
}
vtblk_alloc_disk(sc, &blkcfg);
error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
if (error) {
device_printf(dev, "cannot setup virtqueue interrupt\n");
goto fail;
}
vtblk_create_disk(sc);
virtqueue_enable_intr(sc->vtblk_vq);
fail:
if (error)
vtblk_detach(dev);
return (error);
}
static int
vtblk_detach(device_t dev)
{
struct vtblk_softc *sc;
sc = device_get_softc(dev);
VTBLK_LOCK(sc);
sc->vtblk_flags |= VTBLK_FLAG_DETACH;
if (device_is_attached(dev))
vtblk_stop(sc);
VTBLK_UNLOCK(sc);
vtblk_drain(sc);
if (sc->vtblk_disk != NULL) {
disk_destroy(sc->vtblk_disk);
sc->vtblk_disk = NULL;
}
if (sc->vtblk_sglist != NULL) {
sglist_free(sc->vtblk_sglist);
sc->vtblk_sglist = NULL;
}
VTBLK_LOCK_DESTROY(sc);
return (0);
}
static int
vtblk_suspend(device_t dev)
{
struct vtblk_softc *sc;
int error;
sc = device_get_softc(dev);
VTBLK_LOCK(sc);
sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
/* XXX BMV: virtio_stop(), etc needed here? */
error = vtblk_quiesce(sc);
if (error)
sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
VTBLK_UNLOCK(sc);
return (error);
}
static int
vtblk_resume(device_t dev)
{
struct vtblk_softc *sc;
sc = device_get_softc(dev);
VTBLK_LOCK(sc);
/* XXX BMV: virtio_reinit(), etc needed here? */
sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
vtblk_startio(sc);
VTBLK_UNLOCK(sc);
return (0);
}
static int
vtblk_shutdown(device_t dev)
{
return (0);
}
static int
vtblk_config_change(device_t dev)
{
struct vtblk_softc *sc;
struct virtio_blk_config blkcfg;
uint64_t capacity;
sc = device_get_softc(dev);
vtblk_read_config(sc, &blkcfg);
/* Capacity is always in 512-byte units. */
capacity = blkcfg.capacity * VTBLK_BSIZE;
if (sc->vtblk_disk->d_mediasize != capacity)
vtblk_resize_disk(sc, capacity);
return (0);
}
static int
vtblk_open(struct disk *dp)
{
struct vtblk_softc *sc;
if ((sc = dp->d_drv1) == NULL)
return (ENXIO);
return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}
static int
vtblk_close(struct disk *dp)
{
struct vtblk_softc *sc;
if ((sc = dp->d_drv1) == NULL)
return (ENXIO);
return (0);
}
static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
struct thread *td)
{
struct vtblk_softc *sc;
if ((sc = dp->d_drv1) == NULL)
return (ENXIO);
return (ENOTTY);
}
static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
size_t length)
{
struct disk *dp;
struct vtblk_softc *sc;
int error;
dp = arg;
error = 0;
if ((sc = dp->d_drv1) == NULL)
return (ENXIO);
VTBLK_LOCK(sc);
vtblk_dump_quiesce(sc);
if (length > 0)
error = vtblk_dump_write(sc, virtual, offset, length);
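/*
* An error, or a final call with no data, ends the dump: flush the
* device and complete the bios queued while dumping.
*/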
if (error || (virtual == NULL && offset == 0))
vtblk_dump_complete(sc);
VTBLK_UNLOCK(sc);
return (error);
}
static void
vtblk_strategy(struct bio *bp)
{
struct vtblk_softc *sc;
if ((sc = bp->bio_disk->d_drv1) == NULL) {
vtblk_bio_done(NULL, bp, EINVAL);
return;
}
/*
* Fail any write if the device is read-only. Unfortunately, there does
* not seem to be a better way to report our read-only status to GEOM above.
*/
if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
(bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH ||
bp->bio_cmd == BIO_DELETE)) {
vtblk_bio_done(sc, bp, EROFS);
return;
}
if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
(bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
vtblk_bio_done(sc, bp, EOPNOTSUPP);
return;
}
VTBLK_LOCK(sc);
if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
VTBLK_UNLOCK(sc);
vtblk_bio_done(sc, bp, ENXIO);
return;
}
if ((bp->bio_cmd == BIO_DELETE) &&
!(sc->vtblk_flags & VTBLK_FLAG_DISCARD)) {
VTBLK_UNLOCK(sc);
vtblk_bio_done(sc, bp, EOPNOTSUPP);
return;
}
bioq_insert_tail(&sc->vtblk_bioq, bp);
vtblk_startio(sc);
VTBLK_UNLOCK(sc);
}
static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
device_t dev;
uint64_t features;
dev = sc->vtblk_dev;
features = VTBLK_FEATURES;
sc->vtblk_features = virtio_negotiate_features(dev, features);
}
static void
vtblk_setup_features(struct vtblk_softc *sc)
{
device_t dev;
dev = sc->vtblk_dev;
vtblk_negotiate_features(sc);
if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
sc->vtblk_flags |= VTBLK_FLAG_READONLY;
if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;
if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD))
sc->vtblk_flags |= VTBLK_FLAG_DISCARD;
}
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
struct virtio_blk_config *blkcfg)
{
device_t dev;
int nsegs;
dev = sc->vtblk_dev;
nsegs = VTBLK_MIN_SEGMENTS;
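/*
* The extra segment below accounts for a transfer that is not page
* aligned and therefore spans one additional page.
*/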
if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
- nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
+ nsegs += MIN(blkcfg->seg_max, maxphys / PAGE_SIZE + 1);
if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
} else
nsegs += 1;
return (nsegs);
}
static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
device_t dev;
struct vq_alloc_info vq_info;
dev = sc->vtblk_dev;
VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
vtblk_vq_intr, sc, &sc->vtblk_vq,
"%s request", device_get_nameunit(dev));
return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}
static void
vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity)
{
device_t dev;
struct disk *dp;
int error;
dev = sc->vtblk_dev;
dp = sc->vtblk_disk;
dp->d_mediasize = new_capacity;
if (bootverbose) {
device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n",
(uintmax_t) dp->d_mediasize >> 20,
(uintmax_t) dp->d_mediasize / dp->d_sectorsize,
dp->d_sectorsize);
}
error = disk_resize(dp, M_NOWAIT);
if (error) {
device_printf(dev,
"disk_resize(9) failed, error: %d\n", error);
}
}
static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
device_t dev;
struct disk *dp;
dev = sc->vtblk_dev;
sc->vtblk_disk = dp = disk_alloc();
dp->d_open = vtblk_open;
dp->d_close = vtblk_close;
dp->d_ioctl = vtblk_ioctl;
dp->d_strategy = vtblk_strategy;
dp->d_name = VTBLK_DISK_NAME;
dp->d_unit = device_get_unit(dev);
dp->d_drv1 = sc;
dp->d_flags = DISKFLAG_CANFLUSHCACHE | DISKFLAG_UNMAPPED_BIO |
DISKFLAG_DIRECT_COMPLETION;
dp->d_hba_vendor = virtio_get_vendor(dev);
dp->d_hba_device = virtio_get_device(dev);
dp->d_hba_subvendor = virtio_get_subvendor(dev);
dp->d_hba_subdevice = virtio_get_subdevice(dev);
if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
dp->d_dump = vtblk_dump;
/* Capacity is always in 512-byte units. */
dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
dp->d_sectorsize = blkcfg->blk_size;
else
dp->d_sectorsize = VTBLK_BSIZE;
/*
* The VirtIO maximum I/O size is given in terms of segments.
* However, FreeBSD limits I/O size by logical buffer size, not
* by physically contiguous pages. Therefore, we have to assume
* no pages are contiguous. This may impose an artificially low
* maximum I/O size. But in practice, since QEMU advertises 128
* segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
- * which is typically greater than MAXPHYS. Eventually we should
- * just advertise MAXPHYS and split buffers that are too big.
+ * which is typically greater than maxphys. Eventually we should
+ * just advertise maxphys and split buffers that are too big.
*
* Note we must subtract one additional segment in case of
* non-page-aligned buffers.
*/
dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
PAGE_SIZE;
if (dp->d_maxsize < PAGE_SIZE)
dp->d_maxsize = PAGE_SIZE; /* XXX */
if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
dp->d_fwsectors = blkcfg->geometry.sectors;
dp->d_fwheads = blkcfg->geometry.heads;
}
if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) &&
blkcfg->topology.physical_block_exp > 0) {
dp->d_stripesize = dp->d_sectorsize *
(1 << blkcfg->topology.physical_block_exp);
dp->d_stripeoffset = (dp->d_stripesize -
blkcfg->topology.alignment_offset * dp->d_sectorsize) %
dp->d_stripesize;
}
if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
dp->d_flags |= DISKFLAG_CANDELETE;
dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
}
if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
else
sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
}
static void
vtblk_create_disk(struct vtblk_softc *sc)
{
struct disk *dp;
dp = sc->vtblk_disk;
vtblk_ident(sc);
device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
(uintmax_t) dp->d_mediasize >> 20,
(uintmax_t) dp->d_mediasize / dp->d_sectorsize,
dp->d_sectorsize);
disk_create(dp, DISK_VERSION);
}
static int
vtblk_request_prealloc(struct vtblk_softc *sc)
{
struct vtblk_request *req;
int i, nreqs;
nreqs = virtqueue_size(sc->vtblk_vq);
/*
* Preallocate sufficient requests to keep the virtqueue full. Each
* request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
* the number allocated when indirect descriptors are not available.
*/
if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
nreqs /= VTBLK_MIN_SEGMENTS;
for (i = 0; i < nreqs; i++) {
req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
if (req == NULL)
return (ENOMEM);
MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1);
MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1);
sc->vtblk_request_count++;
vtblk_request_enqueue(sc, req);
}
return (0);
}
static void
vtblk_request_free(struct vtblk_softc *sc)
{
struct vtblk_request *req;
MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready));
while ((req = vtblk_request_dequeue(sc)) != NULL) {
sc->vtblk_request_count--;
free(req, M_DEVBUF);
}
KASSERT(sc->vtblk_request_count == 0,
("%s: leaked %d requests", __func__, sc->vtblk_request_count));
}
static struct vtblk_request *
vtblk_request_dequeue(struct vtblk_softc *sc)
{
struct vtblk_request *req;
req = TAILQ_FIRST(&sc->vtblk_req_free);
if (req != NULL) {
TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
bzero(req, sizeof(struct vtblk_request));
}
return (req);
}
static void
vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req)
{
TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}
static struct vtblk_request *
vtblk_request_next_ready(struct vtblk_softc *sc)
{
struct vtblk_request *req;
req = TAILQ_FIRST(&sc->vtblk_req_ready);
if (req != NULL)
TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
return (req);
}
static void
vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{
/* NOTE: Currently, there will be at most one request in the queue. */
TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}
static struct vtblk_request *
vtblk_request_next(struct vtblk_softc *sc)
{
struct vtblk_request *req;
req = vtblk_request_next_ready(sc);
if (req != NULL)
return (req);
return (vtblk_request_bio(sc));
}
static struct vtblk_request *
vtblk_request_bio(struct vtblk_softc *sc)
{
struct bio_queue_head *bioq;
struct vtblk_request *req;
struct bio *bp;
bioq = &sc->vtblk_bioq;
if (bioq_first(bioq) == NULL)
return (NULL);
req = vtblk_request_dequeue(sc);
if (req == NULL)
return (NULL);
bp = bioq_takefirst(bioq);
req->vbr_bp = bp;
req->vbr_ack = -1;
req->vbr_hdr.ioprio = 1;
switch (bp->bio_cmd) {
case BIO_FLUSH:
req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
break;
case BIO_READ:
req->vbr_hdr.type = VIRTIO_BLK_T_IN;
req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
break;
case BIO_WRITE:
req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
break;
case BIO_DELETE:
req->vbr_hdr.type = VIRTIO_BLK_T_DISCARD;
req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
break;
default:
panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
}
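/*
* Tag ordered bios as barriers; vtblk_request_execute() emulates the
* barrier when the host does not offer the feature.
*/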
if (bp->bio_flags & BIO_ORDERED)
req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
return (req);
}
static int
vtblk_request_execute(struct vtblk_softc *sc, struct vtblk_request *req)
{
struct virtqueue *vq;
struct sglist *sg;
struct bio *bp;
int ordered, readable, writable, error;
vq = sc->vtblk_vq;
sg = sc->vtblk_sglist;
bp = req->vbr_bp;
ordered = 0;
writable = 0;
/*
* Some hosts (such as bhyve) do not implement the barrier feature,
* so we emulate it in the driver by allowing the barrier request
* to be the only one in flight.
*/
if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
if (sc->vtblk_req_ordered != NULL)
return (EBUSY);
if (bp->bio_flags & BIO_ORDERED) {
if (!virtqueue_empty(vq))
return (EBUSY);
ordered = 1;
req->vbr_hdr.type &= ~VIRTIO_BLK_T_BARRIER;
}
}
sglist_reset(sg);
sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));
if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
error = sglist_append_bio(sg, bp);
if (error || sg->sg_nseg == sg->sg_maxseg) {
panic("%s: bio %p data buffer too big %d",
__func__, bp, error);
}
/* BIO_READ means the host writes into our buffer. */
if (bp->bio_cmd == BIO_READ)
writable = sg->sg_nseg - 1;
} else if (bp->bio_cmd == BIO_DELETE) {
struct virtio_blk_discard_write_zeroes *discard;
discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
if (discard == NULL)
return (ENOMEM);
discard->sector = bp->bio_offset / VTBLK_BSIZE;
discard->num_sectors = bp->bio_bcount / VTBLK_BSIZE;
bp->bio_driver1 = discard;
error = sglist_append(sg, discard, sizeof(*discard));
if (error || sg->sg_nseg == sg->sg_maxseg) {
panic("%s: bio %p data buffer too big %d",
__func__, bp, error);
}
}
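/* The ack/status byte is always written by the host. */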
writable++;
sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
readable = sg->sg_nseg - writable;
error = virtqueue_enqueue(vq, req, sg, readable, writable);
if (error == 0 && ordered)
sc->vtblk_req_ordered = req;
return (error);
}
static int
vtblk_request_error(struct vtblk_request *req)
{
int error;
switch (req->vbr_ack) {
case VIRTIO_BLK_S_OK:
error = 0;
break;
case VIRTIO_BLK_S_UNSUPP:
error = ENOTSUP;
break;
default:
error = EIO;
break;
}
return (error);
}
static void
vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue)
{
struct vtblk_request *req;
struct bio *bp;
while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
if (sc->vtblk_req_ordered != NULL) {
MPASS(sc->vtblk_req_ordered == req);
sc->vtblk_req_ordered = NULL;
}
bp = req->vbr_bp;
bp->bio_error = vtblk_request_error(req);
TAILQ_INSERT_TAIL(queue, bp, bio_queue);
vtblk_request_enqueue(sc, req);
}
}
static void
vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue)
{
struct bio *bp, *tmp;
TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) {
if (bp->bio_error != 0)
disk_err(bp, "hard error", -1, 1);
vtblk_bio_done(sc, bp, bp->bio_error);
}
}
static void
vtblk_drain_vq(struct vtblk_softc *sc)
{
struct virtqueue *vq;
struct vtblk_request *req;
int last;
vq = sc->vtblk_vq;
last = 0;
while ((req = virtqueue_drain(vq, &last)) != NULL) {
vtblk_bio_done(sc, req->vbr_bp, ENXIO);
vtblk_request_enqueue(sc, req);
}
sc->vtblk_req_ordered = NULL;
KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}
static void
vtblk_drain(struct vtblk_softc *sc)
{
struct bio_queue queue;
struct bio_queue_head *bioq;
struct vtblk_request *req;
struct bio *bp;
bioq = &sc->vtblk_bioq;
TAILQ_INIT(&queue);
if (sc->vtblk_vq != NULL) {
vtblk_queue_completed(sc, &queue);
vtblk_done_completed(sc, &queue);
vtblk_drain_vq(sc);
}
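/* Fail any remaining ready and queued bios; the device is going away. */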
while ((req = vtblk_request_next_ready(sc)) != NULL) {
vtblk_bio_done(sc, req->vbr_bp, ENXIO);
vtblk_request_enqueue(sc, req);
}
while (bioq_first(bioq) != NULL) {
bp = bioq_takefirst(bioq);
vtblk_bio_done(sc, bp, ENXIO);
}
vtblk_request_free(sc);
}
static void
vtblk_startio(struct vtblk_softc *sc)
{
struct virtqueue *vq;
struct vtblk_request *req;
int enq;
VTBLK_LOCK_ASSERT(sc);
vq = sc->vtblk_vq;
enq = 0;
if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
return;
while (!virtqueue_full(vq)) {
req = vtblk_request_next(sc);
if (req == NULL)
break;
if (vtblk_request_execute(sc, req) != 0) {
vtblk_request_requeue_ready(sc, req);
break;
}
enq++;
}
if (enq > 0)
virtqueue_notify(vq);
}
static void
vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error)
{
/* Because of GEOM direct dispatch, we cannot hold any locks. */
if (sc != NULL)
VTBLK_LOCK_ASSERT_NOTOWNED(sc);
if (error) {
bp->bio_resid = bp->bio_bcount;
bp->bio_error = error;
bp->bio_flags |= BIO_ERROR;
}
if (bp->bio_driver1 != NULL) {
free(bp->bio_driver1, M_DEVBUF);
bp->bio_driver1 = NULL;
}
biodone(bp);
}
#define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg) \
if (virtio_with_feature(_dev, _feature)) { \
virtio_read_device_config(_dev, \
offsetof(struct virtio_blk_config, _field), \
&(_cfg)->_field, sizeof((_cfg)->_field)); \
}
static void
vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
device_t dev;
dev = sc->vtblk_dev;
bzero(blkcfg, sizeof(struct virtio_blk_config));
/* The capacity is always available. */
virtio_read_device_config(dev, offsetof(struct virtio_blk_config,
capacity), &blkcfg->capacity, sizeof(blkcfg->capacity));
/* Read the configuration if the feature was negotiated. */
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY, geometry, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY, topology, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
blkcfg);
}
#undef VTBLK_GET_CONFIG
static void
vtblk_ident(struct vtblk_softc *sc)
{
struct bio buf;
struct disk *dp;
struct vtblk_request *req;
int len, error;
dp = sc->vtblk_disk;
len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0)
return;
req = vtblk_request_dequeue(sc);
if (req == NULL)
return;
req->vbr_ack = -1;
req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
req->vbr_hdr.ioprio = 1;
req->vbr_hdr.sector = 0;
req->vbr_bp = &buf;
g_reset_bio(&buf);
buf.bio_cmd = BIO_READ;
buf.bio_data = dp->d_ident;
buf.bio_bcount = len;
VTBLK_LOCK(sc);
error = vtblk_poll_request(sc, req);
VTBLK_UNLOCK(sc);
vtblk_request_enqueue(sc, req);
if (error) {
device_printf(sc->vtblk_dev,
"error getting device identifier: %d\n", error);
}
}
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
struct virtqueue *vq;
int error;
vq = sc->vtblk_vq;
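/* Polled requests expect the virtqueue to be otherwise idle. */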
if (!virtqueue_empty(vq))
return (EBUSY);
error = vtblk_request_execute(sc, req);
if (error)
return (error);
virtqueue_notify(vq);
virtqueue_poll(vq, NULL);
error = vtblk_request_error(req);
if (error && bootverbose) {
device_printf(sc->vtblk_dev,
"%s: IO error: %d\n", __func__, error);
}
return (error);
}
static int
vtblk_quiesce(struct vtblk_softc *sc)
{
int error;
VTBLK_LOCK_ASSERT(sc);
error = 0;
while (!virtqueue_empty(sc->vtblk_vq)) {
if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
error = EBUSY;
break;
}
}
return (error);
}
static void
vtblk_vq_intr(void *xsc)
{
struct vtblk_softc *sc;
struct virtqueue *vq;
struct bio_queue queue;
sc = xsc;
vq = sc->vtblk_vq;
TAILQ_INIT(&queue);
VTBLK_LOCK(sc);
again:
if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
goto out;
vtblk_queue_completed(sc, &queue);
vtblk_startio(sc);
if (virtqueue_enable_intr(vq) != 0) {
virtqueue_disable_intr(vq);
goto again;
}
if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
wakeup(&sc->vtblk_vq);
out:
VTBLK_UNLOCK(sc);
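/* Complete the collected bios without the lock held; see vtblk_bio_done(). */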
vtblk_done_completed(sc, &queue);
}
static void
vtblk_stop(struct vtblk_softc *sc)
{
virtqueue_disable_intr(sc->vtblk_vq);
virtio_stop(sc->vtblk_dev);
}
static void
vtblk_dump_quiesce(struct vtblk_softc *sc)
{
/*
* Spin here until all the requests in-flight at the time of the
* dump are completed and queued. The queued requests will be
* biodone'd once the dump is finished.
*/
while (!virtqueue_empty(sc->vtblk_vq))
vtblk_queue_completed(sc, &sc->vtblk_dump_queue);
}
static int
vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset,
size_t length)
{
struct bio buf;
struct vtblk_request *req;
req = &sc->vtblk_dump_request;
req->vbr_ack = -1;
req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
req->vbr_hdr.ioprio = 1;
req->vbr_hdr.sector = offset / VTBLK_BSIZE;
req->vbr_bp = &buf;
g_reset_bio(&buf);
buf.bio_cmd = BIO_WRITE;
buf.bio_data = virtual;
buf.bio_bcount = length;
return (vtblk_poll_request(sc, req));
}
static int
vtblk_dump_flush(struct vtblk_softc *sc)
{
struct bio buf;
struct vtblk_request *req;
req = &sc->vtblk_dump_request;
req->vbr_ack = -1;
req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
req->vbr_hdr.ioprio = 1;
req->vbr_hdr.sector = 0;
req->vbr_bp = &buf;
g_reset_bio(&buf);
buf.bio_cmd = BIO_FLUSH;
return (vtblk_poll_request(sc, req));
}
static void
vtblk_dump_complete(struct vtblk_softc *sc)
{
vtblk_dump_flush(sc);
VTBLK_UNLOCK(sc);
vtblk_done_completed(sc, &sc->vtblk_dump_queue);
VTBLK_LOCK(sc);
}
static void
vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
{
/* Set either writeback (1) or writethrough (0) mode. */
virtio_write_dev_config_1(sc->vtblk_dev,
offsetof(struct virtio_blk_config, wce), wc);
}
static int
vtblk_write_cache_enabled(struct vtblk_softc *sc,
struct virtio_blk_config *blkcfg)
{
int wc;
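/*
* When the cache mode is configurable, a valid writecache_mode tunable
* overrides the host's setting; an invalid tunable falls back to the
* current config value. Otherwise, writeback is assumed whenever the
* WCE feature was negotiated.
*/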
if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
wc = vtblk_tunable_int(sc, "writecache_mode",
vtblk_writecache_mode);
if (wc >= 0 && wc < VTBLK_CACHE_MAX)
vtblk_set_write_cache(sc, wc);
else
wc = blkcfg->wce;
} else
wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);
return (wc);
}
static int
vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
{
struct vtblk_softc *sc;
int wc, error;
sc = oidp->oid_arg1;
wc = sc->vtblk_write_cache;
error = sysctl_handle_int(oidp, &wc, 0, req);
if (error || req->newptr == NULL)
return (error);
if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
return (EPERM);
if (wc < 0 || wc >= VTBLK_CACHE_MAX)
return (EINVAL);
VTBLK_LOCK(sc);
sc->vtblk_write_cache = wc;
vtblk_set_write_cache(sc, sc->vtblk_write_cache);
VTBLK_UNLOCK(sc);
return (0);
}
static void
vtblk_setup_sysctl(struct vtblk_softc *sc)
{
device_t dev;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
struct sysctl_oid_list *child;
dev = sc->vtblk_dev;
ctx = device_get_sysctl_ctx(dev);
tree = device_get_sysctl_tree(dev);
child = SYSCTL_CHILDREN(tree);
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
vtblk_write_cache_sysctl, "I",
"Write cache mode (writethrough (0) or writeback (1))");
}
static int
vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
{
char path[64];
snprintf(path, sizeof(path),
"hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
TUNABLE_INT_FETCH(path, &def);
return (def);
}
diff --git a/sys/dev/virtio/scsi/virtio_scsi.c b/sys/dev/virtio/scsi/virtio_scsi.c
index 6412fedaadbc..332f71ef26bf 100644
--- a/sys/dev/virtio/scsi/virtio_scsi.c
+++ b/sys/dev/virtio/scsi/virtio_scsi.c
@@ -1,2340 +1,2340 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Driver for VirtIO SCSI devices. */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/scsi/virtio_scsi.h>
#include <dev/virtio/scsi/virtio_scsivar.h>
#include "virtio_if.h"
static int vtscsi_modevent(module_t, int, void *);
static int vtscsi_probe(device_t);
static int vtscsi_attach(device_t);
static int vtscsi_detach(device_t);
static int vtscsi_suspend(device_t);
static int vtscsi_resume(device_t);
static void vtscsi_negotiate_features(struct vtscsi_softc *);
static void vtscsi_read_config(struct vtscsi_softc *,
struct virtio_scsi_config *);
static int vtscsi_maximum_segments(struct vtscsi_softc *, int);
static int vtscsi_alloc_virtqueues(struct vtscsi_softc *);
static void vtscsi_check_sizes(struct vtscsi_softc *);
static void vtscsi_write_device_config(struct vtscsi_softc *);
static int vtscsi_reinit(struct vtscsi_softc *);
static int vtscsi_alloc_cam(struct vtscsi_softc *);
static int vtscsi_register_cam(struct vtscsi_softc *);
static void vtscsi_free_cam(struct vtscsi_softc *);
static void vtscsi_cam_async(void *, uint32_t, struct cam_path *, void *);
static int vtscsi_register_async(struct vtscsi_softc *);
static void vtscsi_deregister_async(struct vtscsi_softc *);
static void vtscsi_cam_action(struct cam_sim *, union ccb *);
static void vtscsi_cam_poll(struct cam_sim *);
static void vtscsi_cam_scsi_io(struct vtscsi_softc *, struct cam_sim *,
union ccb *);
static void vtscsi_cam_get_tran_settings(struct vtscsi_softc *,
union ccb *);
static void vtscsi_cam_reset_bus(struct vtscsi_softc *, union ccb *);
static void vtscsi_cam_reset_dev(struct vtscsi_softc *, union ccb *);
static void vtscsi_cam_abort(struct vtscsi_softc *, union ccb *);
static void vtscsi_cam_path_inquiry(struct vtscsi_softc *,
struct cam_sim *, union ccb *);
static int vtscsi_sg_append_scsi_buf(struct vtscsi_softc *,
struct sglist *, struct ccb_scsiio *);
static int vtscsi_fill_scsi_cmd_sglist(struct vtscsi_softc *,
struct vtscsi_request *, int *, int *);
static int vtscsi_execute_scsi_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static int vtscsi_start_scsi_cmd(struct vtscsi_softc *, union ccb *);
static void vtscsi_complete_abort_timedout_scsi_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static int vtscsi_abort_timedout_scsi_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static void vtscsi_timedout_scsi_cmd(void *);
static cam_status vtscsi_scsi_cmd_cam_status(struct virtio_scsi_cmd_resp *);
static cam_status vtscsi_complete_scsi_cmd_response(struct vtscsi_softc *,
struct ccb_scsiio *, struct virtio_scsi_cmd_resp *);
static void vtscsi_complete_scsi_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static void vtscsi_poll_ctrl_req(struct vtscsi_softc *,
struct vtscsi_request *);
static int vtscsi_execute_ctrl_req(struct vtscsi_softc *,
struct vtscsi_request *, struct sglist *, int, int, int);
static void vtscsi_complete_abort_task_cmd(struct vtscsi_softc *c,
struct vtscsi_request *);
static int vtscsi_execute_abort_task_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static int vtscsi_execute_reset_dev_cmd(struct vtscsi_softc *,
struct vtscsi_request *);
static void vtscsi_get_request_lun(uint8_t [], target_id_t *, lun_id_t *);
static void vtscsi_set_request_lun(struct ccb_hdr *, uint8_t []);
static void vtscsi_init_scsi_cmd_req(struct ccb_scsiio *,
struct virtio_scsi_cmd_req *);
static void vtscsi_init_ctrl_tmf_req(struct ccb_hdr *, uint32_t,
uintptr_t, struct virtio_scsi_ctrl_tmf_req *);
static void vtscsi_freeze_simq(struct vtscsi_softc *, int);
static int vtscsi_thaw_simq(struct vtscsi_softc *, int);
static void vtscsi_announce(struct vtscsi_softc *, uint32_t, target_id_t,
lun_id_t);
static void vtscsi_execute_rescan(struct vtscsi_softc *, target_id_t,
lun_id_t);
static void vtscsi_execute_rescan_bus(struct vtscsi_softc *);
static void vtscsi_handle_event(struct vtscsi_softc *,
struct virtio_scsi_event *);
static int vtscsi_enqueue_event_buf(struct vtscsi_softc *,
struct virtio_scsi_event *);
static int vtscsi_init_event_vq(struct vtscsi_softc *);
static void vtscsi_reinit_event_vq(struct vtscsi_softc *);
static void vtscsi_drain_event_vq(struct vtscsi_softc *);
static void vtscsi_complete_vqs_locked(struct vtscsi_softc *);
static void vtscsi_complete_vqs(struct vtscsi_softc *);
static void vtscsi_drain_vqs(struct vtscsi_softc *);
static void vtscsi_cancel_request(struct vtscsi_softc *,
struct vtscsi_request *);
static void vtscsi_drain_vq(struct vtscsi_softc *, struct virtqueue *);
static void vtscsi_stop(struct vtscsi_softc *);
static int vtscsi_reset_bus(struct vtscsi_softc *);
static void vtscsi_init_request(struct vtscsi_softc *,
struct vtscsi_request *);
static int vtscsi_alloc_requests(struct vtscsi_softc *);
static void vtscsi_free_requests(struct vtscsi_softc *);
static void vtscsi_enqueue_request(struct vtscsi_softc *,
struct vtscsi_request *);
static struct vtscsi_request * vtscsi_dequeue_request(struct vtscsi_softc *);
static void vtscsi_complete_request(struct vtscsi_request *);
static void vtscsi_complete_vq(struct vtscsi_softc *, struct virtqueue *);
static void vtscsi_control_vq_intr(void *);
static void vtscsi_event_vq_intr(void *);
static void vtscsi_request_vq_intr(void *);
static void vtscsi_disable_vqs_intr(struct vtscsi_softc *);
static void vtscsi_enable_vqs_intr(struct vtscsi_softc *);
static void vtscsi_get_tunables(struct vtscsi_softc *);
static void vtscsi_add_sysctl(struct vtscsi_softc *);
static void vtscsi_printf_req(struct vtscsi_request *, const char *,
const char *, ...);
/* Global tunables. */
/*
* The current QEMU VirtIO SCSI implementation does not cancel in-flight
* IO during virtio_stop(). So in-flight requests still complete after the
* device reset. We would have to wait for all the in-flight IO to complete,
* which defeats the typical purpose of a bus reset. We could simulate the
* bus reset with either I_T_NEXUS_RESET of all the targets, or with
* LOGICAL_UNIT_RESET of all the LUNs (assuming there is space in the
* control virtqueue). But this isn't very useful if things really go off
* the rails, so default to disabled for now.
*/
static int vtscsi_bus_reset_disable = 1;
TUNABLE_INT("hw.vtscsi.bus_reset_disable", &vtscsi_bus_reset_disable);
static struct virtio_feature_desc vtscsi_feature_desc[] = {
{ VIRTIO_SCSI_F_INOUT, "InOut" },
{ VIRTIO_SCSI_F_HOTPLUG, "Hotplug" },
{ 0, NULL }
};
static device_method_t vtscsi_methods[] = {
/* Device methods. */
DEVMETHOD(device_probe, vtscsi_probe),
DEVMETHOD(device_attach, vtscsi_attach),
DEVMETHOD(device_detach, vtscsi_detach),
DEVMETHOD(device_suspend, vtscsi_suspend),
DEVMETHOD(device_resume, vtscsi_resume),
DEVMETHOD_END
};
static driver_t vtscsi_driver = {
"vtscsi",
vtscsi_methods,
sizeof(struct vtscsi_softc)
};
static devclass_t vtscsi_devclass;
DRIVER_MODULE(virtio_scsi, virtio_mmio, vtscsi_driver, vtscsi_devclass,
vtscsi_modevent, 0);
DRIVER_MODULE(virtio_scsi, virtio_pci, vtscsi_driver, vtscsi_devclass,
vtscsi_modevent, 0);
MODULE_VERSION(virtio_scsi, 1);
MODULE_DEPEND(virtio_scsi, virtio, 1, 1, 1);
MODULE_DEPEND(virtio_scsi, cam, 1, 1, 1);
VIRTIO_SIMPLE_PNPTABLE(virtio_scsi, VIRTIO_ID_SCSI, "VirtIO SCSI Adapter");
VIRTIO_SIMPLE_PNPINFO(virtio_mmio, virtio_scsi);
VIRTIO_SIMPLE_PNPINFO(virtio_pci, virtio_scsi);
static int
vtscsi_modevent(module_t mod, int type, void *unused)
{
int error;
switch (type) {
case MOD_LOAD:
case MOD_QUIESCE:
case MOD_UNLOAD:
case MOD_SHUTDOWN:
error = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static int
vtscsi_probe(device_t dev)
{
return (VIRTIO_SIMPLE_PROBE(dev, virtio_scsi));
}
static int
vtscsi_attach(device_t dev)
{
struct vtscsi_softc *sc;
struct virtio_scsi_config scsicfg;
int error;
sc = device_get_softc(dev);
sc->vtscsi_dev = dev;
VTSCSI_LOCK_INIT(sc, device_get_nameunit(dev));
TAILQ_INIT(&sc->vtscsi_req_free);
vtscsi_get_tunables(sc);
vtscsi_add_sysctl(sc);
virtio_set_feature_desc(dev, vtscsi_feature_desc);
vtscsi_negotiate_features(sc);
if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
sc->vtscsi_flags |= VTSCSI_FLAG_INDIRECT;
if (virtio_with_feature(dev, VIRTIO_SCSI_F_INOUT))
sc->vtscsi_flags |= VTSCSI_FLAG_BIDIRECTIONAL;
if (virtio_with_feature(dev, VIRTIO_SCSI_F_HOTPLUG))
sc->vtscsi_flags |= VTSCSI_FLAG_HOTPLUG;
vtscsi_read_config(sc, &scsicfg);
sc->vtscsi_max_channel = scsicfg.max_channel;
sc->vtscsi_max_target = scsicfg.max_target;
sc->vtscsi_max_lun = scsicfg.max_lun;
sc->vtscsi_event_buf_size = scsicfg.event_info_size;
vtscsi_write_device_config(sc);
sc->vtscsi_max_nsegs = vtscsi_maximum_segments(sc, scsicfg.seg_max);
sc->vtscsi_sglist = sglist_alloc(sc->vtscsi_max_nsegs, M_NOWAIT);
if (sc->vtscsi_sglist == NULL) {
error = ENOMEM;
device_printf(dev, "cannot allocate sglist\n");
goto fail;
}
error = vtscsi_alloc_virtqueues(sc);
if (error) {
device_printf(dev, "cannot allocate virtqueues\n");
goto fail;
}
vtscsi_check_sizes(sc);
error = vtscsi_init_event_vq(sc);
if (error) {
device_printf(dev, "cannot populate the eventvq\n");
goto fail;
}
error = vtscsi_alloc_requests(sc);
if (error) {
device_printf(dev, "cannot allocate requests\n");
goto fail;
}
error = vtscsi_alloc_cam(sc);
if (error) {
device_printf(dev, "cannot allocate CAM structures\n");
goto fail;
}
error = virtio_setup_intr(dev, INTR_TYPE_CAM);
if (error) {
device_printf(dev, "cannot setup virtqueue interrupts\n");
goto fail;
}
vtscsi_enable_vqs_intr(sc);
/*
* Register with CAM after interrupts are enabled so we will get
* notified of the probe responses.
*/
error = vtscsi_register_cam(sc);
if (error) {
device_printf(dev, "cannot register with CAM\n");
goto fail;
}
fail:
if (error)
vtscsi_detach(dev);
return (error);
}
static int
vtscsi_detach(device_t dev)
{
struct vtscsi_softc *sc;
sc = device_get_softc(dev);
VTSCSI_LOCK(sc);
sc->vtscsi_flags |= VTSCSI_FLAG_DETACH;
if (device_is_attached(dev))
vtscsi_stop(sc);
VTSCSI_UNLOCK(sc);
vtscsi_complete_vqs(sc);
vtscsi_drain_vqs(sc);
vtscsi_free_cam(sc);
vtscsi_free_requests(sc);
if (sc->vtscsi_sglist != NULL) {
sglist_free(sc->vtscsi_sglist);
sc->vtscsi_sglist = NULL;
}
VTSCSI_LOCK_DESTROY(sc);
return (0);
}
static int
vtscsi_suspend(device_t dev)
{
return (0);
}
static int
vtscsi_resume(device_t dev)
{
return (0);
}
static void
vtscsi_negotiate_features(struct vtscsi_softc *sc)
{
device_t dev;
uint64_t features;
dev = sc->vtscsi_dev;
features = virtio_negotiate_features(dev, VTSCSI_FEATURES);
sc->vtscsi_features = features;
}
#define VTSCSI_GET_CONFIG(_dev, _field, _cfg) \
virtio_read_device_config(_dev, \
offsetof(struct virtio_scsi_config, _field), \
&(_cfg)->_field, sizeof((_cfg)->_field)) \
static void
vtscsi_read_config(struct vtscsi_softc *sc,
struct virtio_scsi_config *scsicfg)
{
device_t dev;
dev = sc->vtscsi_dev;
bzero(scsicfg, sizeof(struct virtio_scsi_config));
VTSCSI_GET_CONFIG(dev, num_queues, scsicfg);
VTSCSI_GET_CONFIG(dev, seg_max, scsicfg);
VTSCSI_GET_CONFIG(dev, max_sectors, scsicfg);
VTSCSI_GET_CONFIG(dev, cmd_per_lun, scsicfg);
VTSCSI_GET_CONFIG(dev, event_info_size, scsicfg);
VTSCSI_GET_CONFIG(dev, sense_size, scsicfg);
VTSCSI_GET_CONFIG(dev, cdb_size, scsicfg);
VTSCSI_GET_CONFIG(dev, max_channel, scsicfg);
VTSCSI_GET_CONFIG(dev, max_target, scsicfg);
VTSCSI_GET_CONFIG(dev, max_lun, scsicfg);
}
#undef VTSCSI_GET_CONFIG
static int
vtscsi_maximum_segments(struct vtscsi_softc *sc, int seg_max)
{
int nsegs;
nsegs = VTSCSI_MIN_SEGMENTS;
if (seg_max > 0) {
- nsegs += MIN(seg_max, MAXPHYS / PAGE_SIZE + 1);
+ nsegs += MIN(seg_max, maxphys / PAGE_SIZE + 1);
if (sc->vtscsi_flags & VTSCSI_FLAG_INDIRECT)
nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
} else
nsegs += 1;
return (nsegs);
}
static int
vtscsi_alloc_virtqueues(struct vtscsi_softc *sc)
{
device_t dev;
struct vq_alloc_info vq_info[3];
int nvqs;
dev = sc->vtscsi_dev;
nvqs = 3;
VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtscsi_control_vq_intr, sc,
&sc->vtscsi_control_vq, "%s control", device_get_nameunit(dev));
VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtscsi_event_vq_intr, sc,
&sc->vtscsi_event_vq, "%s event", device_get_nameunit(dev));
VQ_ALLOC_INFO_INIT(&vq_info[2], sc->vtscsi_max_nsegs,
vtscsi_request_vq_intr, sc, &sc->vtscsi_request_vq,
"%s request", device_get_nameunit(dev));
return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info));
}
static void
vtscsi_check_sizes(struct vtscsi_softc *sc)
{
int rqsize;
if ((sc->vtscsi_flags & VTSCSI_FLAG_INDIRECT) == 0) {
/*
* Ensure the assertions in virtqueue_enqueue() are not triggered,
* even if the hypervisor reports a bad seg_max.
*/
rqsize = virtqueue_size(sc->vtscsi_request_vq);
if (sc->vtscsi_max_nsegs > rqsize) {
device_printf(sc->vtscsi_dev,
"clamping seg_max (%d %d)\n", sc->vtscsi_max_nsegs,
rqsize);
sc->vtscsi_max_nsegs = rqsize;
}
}
}
static void
vtscsi_write_device_config(struct vtscsi_softc *sc)
{
virtio_write_dev_config_4(sc->vtscsi_dev,
offsetof(struct virtio_scsi_config, sense_size),
VIRTIO_SCSI_SENSE_SIZE);
/*
* This is the size in the virtio_scsi_cmd_req structure. Note
* this value (32) is larger than the maximum CAM CDB size (16).
*/
virtio_write_dev_config_4(sc->vtscsi_dev,
offsetof(struct virtio_scsi_config, cdb_size),
VIRTIO_SCSI_CDB_SIZE);
}
static int
vtscsi_reinit(struct vtscsi_softc *sc)
{
device_t dev;
int error;
dev = sc->vtscsi_dev;
error = virtio_reinit(dev, sc->vtscsi_features);
if (error == 0) {
vtscsi_write_device_config(sc);
vtscsi_reinit_event_vq(sc);
virtio_reinit_complete(dev);
vtscsi_enable_vqs_intr(sc);
}
vtscsi_dprintf(sc, VTSCSI_TRACE, "error=%d\n", error);
return (error);
}
static int
vtscsi_alloc_cam(struct vtscsi_softc *sc)
{
device_t dev;
struct cam_devq *devq;
int openings;
dev = sc->vtscsi_dev;
openings = sc->vtscsi_nrequests - VTSCSI_RESERVED_REQUESTS;
devq = cam_simq_alloc(openings);
if (devq == NULL) {
device_printf(dev, "cannot allocate SIM queue\n");
return (ENOMEM);
}
sc->vtscsi_sim = cam_sim_alloc(vtscsi_cam_action, vtscsi_cam_poll,
"vtscsi", sc, device_get_unit(dev), VTSCSI_MTX(sc), 1,
openings, devq);
if (sc->vtscsi_sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "cannot allocate SIM\n");
return (ENOMEM);
}
return (0);
}
static int
vtscsi_register_cam(struct vtscsi_softc *sc)
{
device_t dev;
int registered, error;
dev = sc->vtscsi_dev;
registered = 0;
VTSCSI_LOCK(sc);
if (xpt_bus_register(sc->vtscsi_sim, dev, 0) != CAM_SUCCESS) {
error = ENOMEM;
device_printf(dev, "cannot register XPT bus\n");
goto fail;
}
registered = 1;
if (xpt_create_path(&sc->vtscsi_path, NULL,
cam_sim_path(sc->vtscsi_sim), CAM_TARGET_WILDCARD,
CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
error = ENOMEM;
device_printf(dev, "cannot create bus path\n");
goto fail;
}
if (vtscsi_register_async(sc) != CAM_REQ_CMP) {
error = EIO;
device_printf(dev, "cannot register async callback\n");
goto fail;
}
VTSCSI_UNLOCK(sc);
return (0);
fail:
if (sc->vtscsi_path != NULL) {
xpt_free_path(sc->vtscsi_path);
sc->vtscsi_path = NULL;
}
if (registered != 0)
xpt_bus_deregister(cam_sim_path(sc->vtscsi_sim));
VTSCSI_UNLOCK(sc);
return (error);
}
static void
vtscsi_free_cam(struct vtscsi_softc *sc)
{
VTSCSI_LOCK(sc);
if (sc->vtscsi_path != NULL) {
vtscsi_deregister_async(sc);
xpt_free_path(sc->vtscsi_path);
sc->vtscsi_path = NULL;
xpt_bus_deregister(cam_sim_path(sc->vtscsi_sim));
}
if (sc->vtscsi_sim != NULL) {
cam_sim_free(sc->vtscsi_sim, 1);
sc->vtscsi_sim = NULL;
}
VTSCSI_UNLOCK(sc);
}
static void
vtscsi_cam_async(void *cb_arg, uint32_t code, struct cam_path *path, void *arg)
{
struct cam_sim *sim;
struct vtscsi_softc *sc;
sim = cb_arg;
sc = cam_sim_softc(sim);
vtscsi_dprintf(sc, VTSCSI_TRACE, "code=%u\n", code);
/*
* TODO Once QEMU supports event reporting, we should
* (un)subscribe to events here.
*/
switch (code) {
case AC_FOUND_DEVICE:
break;
case AC_LOST_DEVICE:
break;
}
}
static int
vtscsi_register_async(struct vtscsi_softc *sc)
{
struct ccb_setasync csa;
xpt_setup_ccb(&csa.ccb_h, sc->vtscsi_path, 5);
csa.ccb_h.func_code = XPT_SASYNC_CB;
csa.event_enable = AC_LOST_DEVICE | AC_FOUND_DEVICE;
csa.callback = vtscsi_cam_async;
csa.callback_arg = sc->vtscsi_sim;
xpt_action((union ccb *) &csa);
return (csa.ccb_h.status);
}
static void
vtscsi_deregister_async(struct vtscsi_softc *sc)
{
struct ccb_setasync csa;
xpt_setup_ccb(&csa.ccb_h, sc->vtscsi_path, 5);
csa.ccb_h.func_code = XPT_SASYNC_CB;
csa.event_enable = 0;
csa.callback = vtscsi_cam_async;
csa.callback_arg = sc->vtscsi_sim;
xpt_action((union ccb *) &csa);
}
static void
vtscsi_cam_action(struct cam_sim *sim, union ccb *ccb)
{
struct vtscsi_softc *sc;
struct ccb_hdr *ccbh;
sc = cam_sim_softc(sim);
ccbh = &ccb->ccb_h;
VTSCSI_LOCK_OWNED(sc);
if (sc->vtscsi_flags & VTSCSI_FLAG_DETACH) {
/*
* The VTSCSI_MTX is briefly dropped between setting
* VTSCSI_FLAG_DETACH and deregistering with CAM, so
* drop any CCBs that come in during that window.
*/
ccbh->status = CAM_NO_HBA;
xpt_done(ccb);
return;
}
switch (ccbh->func_code) {
case XPT_SCSI_IO:
vtscsi_cam_scsi_io(sc, sim, ccb);
break;
case XPT_SET_TRAN_SETTINGS:
ccbh->status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
case XPT_GET_TRAN_SETTINGS:
vtscsi_cam_get_tran_settings(sc, ccb);
break;
case XPT_RESET_BUS:
vtscsi_cam_reset_bus(sc, ccb);
break;
case XPT_RESET_DEV:
vtscsi_cam_reset_dev(sc, ccb);
break;
case XPT_ABORT:
vtscsi_cam_abort(sc, ccb);
break;
case XPT_CALC_GEOMETRY:
cam_calc_geometry(&ccb->ccg, 1);
xpt_done(ccb);
break;
case XPT_PATH_INQ:
vtscsi_cam_path_inquiry(sc, sim, ccb);
break;
default:
vtscsi_dprintf(sc, VTSCSI_ERROR,
"invalid ccb=%p func=%#x\n", ccb, ccbh->func_code);
ccbh->status = CAM_REQ_INVALID;
xpt_done(ccb);
break;
}
}
static void
vtscsi_cam_poll(struct cam_sim *sim)
{
struct vtscsi_softc *sc;
sc = cam_sim_softc(sim);
vtscsi_complete_vqs_locked(sc);
}
static void
vtscsi_cam_scsi_io(struct vtscsi_softc *sc, struct cam_sim *sim,
union ccb *ccb)
{
struct ccb_hdr *ccbh;
struct ccb_scsiio *csio;
int error;
ccbh = &ccb->ccb_h;
csio = &ccb->csio;
if (csio->cdb_len > VIRTIO_SCSI_CDB_SIZE) {
error = EINVAL;
ccbh->status = CAM_REQ_INVALID;
goto done;
}
if ((ccbh->flags & CAM_DIR_MASK) == CAM_DIR_BOTH &&
(sc->vtscsi_flags & VTSCSI_FLAG_BIDIRECTIONAL) == 0) {
error = EINVAL;
ccbh->status = CAM_REQ_INVALID;
goto done;
}
error = vtscsi_start_scsi_cmd(sc, ccb);
done:
if (error) {
vtscsi_dprintf(sc, VTSCSI_ERROR,
"error=%d ccb=%p status=%#x\n", error, ccb, ccbh->status);
xpt_done(ccb);
}
}
static void
vtscsi_cam_get_tran_settings(struct vtscsi_softc *sc, union ccb *ccb)
{
struct ccb_trans_settings *cts;
struct ccb_trans_settings_scsi *scsi;
cts = &ccb->cts;
scsi = &cts->proto_specific.scsi;
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_SPC3;
cts->transport = XPORT_SAS;
cts->transport_version = 0;
scsi->valid = CTS_SCSI_VALID_TQ;
scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
}
static void
vtscsi_cam_reset_bus(struct vtscsi_softc *sc, union ccb *ccb)
{
int error;
error = vtscsi_reset_bus(sc);
if (error == 0)
ccb->ccb_h.status = CAM_REQ_CMP;
else
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
vtscsi_dprintf(sc, VTSCSI_TRACE, "error=%d ccb=%p status=%#x\n",
error, ccb, ccb->ccb_h.status);
xpt_done(ccb);
}
static void
vtscsi_cam_reset_dev(struct vtscsi_softc *sc, union ccb *ccb)
{
struct ccb_hdr *ccbh;
struct vtscsi_request *req;
int error;
ccbh = &ccb->ccb_h;
req = vtscsi_dequeue_request(sc);
if (req == NULL) {
error = EAGAIN;
vtscsi_freeze_simq(sc, VTSCSI_REQUEST);
goto fail;
}
req->vsr_ccb = ccb;
error = vtscsi_execute_reset_dev_cmd(sc, req);
if (error == 0)
return;
vtscsi_enqueue_request(sc, req);
fail:
vtscsi_dprintf(sc, VTSCSI_ERROR, "error=%d req=%p ccb=%p\n",
error, req, ccb);
if (error == EAGAIN)
ccbh->status = CAM_RESRC_UNAVAIL;
else
ccbh->status = CAM_REQ_CMP_ERR;
xpt_done(ccb);
}
static void
vtscsi_cam_abort(struct vtscsi_softc *sc, union ccb *ccb)
{
struct vtscsi_request *req;
struct ccb_hdr *ccbh;
int error;
ccbh = &ccb->ccb_h;
req = vtscsi_dequeue_request(sc);
if (req == NULL) {
error = EAGAIN;
vtscsi_freeze_simq(sc, VTSCSI_REQUEST);
goto fail;
}
req->vsr_ccb = ccb;
error = vtscsi_execute_abort_task_cmd(sc, req);
if (error == 0)
return;
vtscsi_enqueue_request(sc, req);
fail:
vtscsi_dprintf(sc, VTSCSI_ERROR, "error=%d req=%p ccb=%p\n",
error, req, ccb);
if (error == EAGAIN)
ccbh->status = CAM_RESRC_UNAVAIL;
else
ccbh->status = CAM_REQ_CMP_ERR;
xpt_done(ccb);
}
static void
vtscsi_cam_path_inquiry(struct vtscsi_softc *sc, struct cam_sim *sim,
union ccb *ccb)
{
device_t dev;
struct ccb_pathinq *cpi;
dev = sc->vtscsi_dev;
cpi = &ccb->cpi;
vtscsi_dprintf(sc, VTSCSI_TRACE, "sim=%p ccb=%p\n", sim, ccb);
cpi->version_num = 1;
cpi->hba_inquiry = PI_TAG_ABLE;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED;
if (vtscsi_bus_reset_disable != 0)
cpi->hba_misc |= PIM_NOBUSRESET;
cpi->hba_eng_cnt = 0;
cpi->max_target = sc->vtscsi_max_target;
cpi->max_lun = sc->vtscsi_max_lun;
cpi->initiator_id = cpi->max_target + 1;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "VirtIO", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 300000;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_SPC3;
cpi->transport = XPORT_SAS;
cpi->transport_version = 0;
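/*
* As in the block driver, one segment appears to be reserved here for
* non-page-aligned buffers, in addition to the request and response
* headers.
*/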
cpi->maxio = (sc->vtscsi_max_nsegs - VTSCSI_MIN_SEGMENTS - 1) *
PAGE_SIZE;
cpi->hba_vendor = virtio_get_vendor(dev);
cpi->hba_device = virtio_get_device(dev);
cpi->hba_subvendor = virtio_get_subvendor(dev);
cpi->hba_subdevice = virtio_get_subdevice(dev);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
}
static int
vtscsi_sg_append_scsi_buf(struct vtscsi_softc *sc, struct sglist *sg,
struct ccb_scsiio *csio)
{
struct ccb_hdr *ccbh;
struct bus_dma_segment *dseg;
int i, error;
ccbh = &csio->ccb_h;
error = 0;
switch ((ccbh->flags & CAM_DATA_MASK)) {
case CAM_DATA_VADDR:
error = sglist_append(sg, csio->data_ptr, csio->dxfer_len);
break;
case CAM_DATA_PADDR:
error = sglist_append_phys(sg,
(vm_paddr_t)(vm_offset_t) csio->data_ptr, csio->dxfer_len);
break;
case CAM_DATA_SG:
for (i = 0; i < csio->sglist_cnt && error == 0; i++) {
dseg = &((struct bus_dma_segment *)csio->data_ptr)[i];
error = sglist_append(sg,
(void *)(vm_offset_t) dseg->ds_addr, dseg->ds_len);
}
break;
case CAM_DATA_SG_PADDR:
for (i = 0; i < csio->sglist_cnt && error == 0; i++) {
dseg = &((struct bus_dma_segment *)csio->data_ptr)[i];
error = sglist_append_phys(sg,
(vm_paddr_t) dseg->ds_addr, dseg->ds_len);
}
break;
case CAM_DATA_BIO:
error = sglist_append_bio(sg, (struct bio *) csio->data_ptr);
break;
default:
error = EINVAL;
break;
}
return (error);
}
static int
vtscsi_fill_scsi_cmd_sglist(struct vtscsi_softc *sc, struct vtscsi_request *req,
int *readable, int *writable)
{
struct sglist *sg;
struct ccb_hdr *ccbh;
struct ccb_scsiio *csio;
struct virtio_scsi_cmd_req *cmd_req;
struct virtio_scsi_cmd_resp *cmd_resp;
int error;
sg = sc->vtscsi_sglist;
csio = &req->vsr_ccb->csio;
ccbh = &csio->ccb_h;
cmd_req = &req->vsr_cmd_req;
cmd_resp = &req->vsr_cmd_resp;
sglist_reset(sg);
sglist_append(sg, cmd_req, sizeof(struct virtio_scsi_cmd_req));
if ((ccbh->flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
error = vtscsi_sg_append_scsi_buf(sc, sg, csio);
/* At least one segment must be left for the response. */
if (error || sg->sg_nseg == sg->sg_maxseg)
goto fail;
}
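/*
* Everything appended so far (the command header and any data-out
* buffer) is read by the host.
*/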
*readable = sg->sg_nseg;
sglist_append(sg, cmd_resp, sizeof(struct virtio_scsi_cmd_resp));
if ((ccbh->flags & CAM_DIR_MASK) == CAM_DIR_IN) {
error = vtscsi_sg_append_scsi_buf(sc, sg, csio);
if (error)
goto fail;
}
*writable = sg->sg_nseg - *readable;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p ccb=%p readable=%d "
"writable=%d\n", req, ccbh, *readable, *writable);
return (0);
fail:
/*
* This should never happen unless maxio was incorrectly set.
*/
vtscsi_set_ccb_status(ccbh, CAM_REQ_TOO_BIG, 0);
vtscsi_dprintf(sc, VTSCSI_ERROR, "error=%d req=%p ccb=%p "
"nseg=%d maxseg=%d\n",
error, req, ccbh, sg->sg_nseg, sg->sg_maxseg);
return (EFBIG);
}
static int
vtscsi_execute_scsi_cmd(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
struct sglist *sg;
struct virtqueue *vq;
struct ccb_scsiio *csio;
struct ccb_hdr *ccbh;
struct virtio_scsi_cmd_req *cmd_req;
struct virtio_scsi_cmd_resp *cmd_resp;
int readable, writable, error;
sg = sc->vtscsi_sglist;
vq = sc->vtscsi_request_vq;
csio = &req->vsr_ccb->csio;
ccbh = &csio->ccb_h;
cmd_req = &req->vsr_cmd_req;
cmd_resp = &req->vsr_cmd_resp;
vtscsi_init_scsi_cmd_req(csio, cmd_req);
error = vtscsi_fill_scsi_cmd_sglist(sc, req, &readable, &writable);
if (error)
return (error);
req->vsr_complete = vtscsi_complete_scsi_cmd;
cmd_resp->response = -1;
error = virtqueue_enqueue(vq, req, sg, readable, writable);
if (error) {
vtscsi_dprintf(sc, VTSCSI_ERROR,
"enqueue error=%d req=%p ccb=%p\n", error, req, ccbh);
ccbh->status = CAM_REQUEUE_REQ;
vtscsi_freeze_simq(sc, VTSCSI_REQUEST_VQ);
return (error);
}
ccbh->status |= CAM_SIM_QUEUED;
ccbh->ccbh_vtscsi_req = req;
virtqueue_notify(vq);
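/* Arm the command timeout unless CAM asked for an infinite timeout. */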
if (ccbh->timeout != CAM_TIME_INFINITY) {
req->vsr_flags |= VTSCSI_REQ_FLAG_TIMEOUT_SET;
callout_reset_sbt(&req->vsr_callout, SBT_1MS * ccbh->timeout,
0, vtscsi_timedout_scsi_cmd, req, 0);
}
vtscsi_dprintf_req(req, VTSCSI_TRACE, "enqueued req=%p ccb=%p\n",
req, ccbh);
return (0);
}
static int
vtscsi_start_scsi_cmd(struct vtscsi_softc *sc, union ccb *ccb)
{
struct vtscsi_request *req;
int error;
req = vtscsi_dequeue_request(sc);
if (req == NULL) {
ccb->ccb_h.status = CAM_REQUEUE_REQ;
vtscsi_freeze_simq(sc, VTSCSI_REQUEST);
return (ENOBUFS);
}
req->vsr_ccb = ccb;
error = vtscsi_execute_scsi_cmd(sc, req);
if (error)
vtscsi_enqueue_request(sc, req);
return (error);
}
static void
vtscsi_complete_abort_timedout_scsi_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *req)
{
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
struct vtscsi_request *to_req;
uint8_t response;
tmf_resp = &req->vsr_tmf_resp;
response = tmf_resp->response;
to_req = req->vsr_timedout_req;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p to_req=%p response=%d\n",
req, to_req, response);
vtscsi_enqueue_request(sc, req);
/*
* The timedout request could have completed between when the
* abort task was sent and when the host processed it.
*/
if (to_req->vsr_state != VTSCSI_REQ_STATE_TIMEDOUT)
return;
/* The timedout request was successfully aborted. */
if (response == VIRTIO_SCSI_S_FUNCTION_COMPLETE)
return;
/* Don't bother if the device is going away. */
if (sc->vtscsi_flags & VTSCSI_FLAG_DETACH)
return;
/* The timedout request will be aborted by the reset. */
if (sc->vtscsi_flags & VTSCSI_FLAG_RESET)
return;
vtscsi_reset_bus(sc);
}
static int
vtscsi_abort_timedout_scsi_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *to_req)
{
struct sglist *sg;
struct ccb_hdr *to_ccbh;
struct vtscsi_request *req;
struct virtio_scsi_ctrl_tmf_req *tmf_req;
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
int error;
sg = sc->vtscsi_sglist;
to_ccbh = &to_req->vsr_ccb->ccb_h;
req = vtscsi_dequeue_request(sc);
if (req == NULL) {
error = ENOBUFS;
goto fail;
}
tmf_req = &req->vsr_tmf_req;
tmf_resp = &req->vsr_tmf_resp;
vtscsi_init_ctrl_tmf_req(to_ccbh, VIRTIO_SCSI_T_TMF_ABORT_TASK,
(uintptr_t) to_ccbh, tmf_req);
sglist_reset(sg);
sglist_append(sg, tmf_req, sizeof(struct virtio_scsi_ctrl_tmf_req));
sglist_append(sg, tmf_resp, sizeof(struct virtio_scsi_ctrl_tmf_resp));
req->vsr_timedout_req = to_req;
req->vsr_complete = vtscsi_complete_abort_timedout_scsi_cmd;
tmf_resp->response = -1;
error = vtscsi_execute_ctrl_req(sc, req, sg, 1, 1,
VTSCSI_EXECUTE_ASYNC);
if (error == 0)
return (0);
vtscsi_enqueue_request(sc, req);
fail:
vtscsi_dprintf(sc, VTSCSI_ERROR, "error=%d req=%p "
"timedout req=%p ccb=%p\n", error, req, to_req, to_ccbh);
return (error);
}
static void
vtscsi_timedout_scsi_cmd(void *xreq)
{
struct vtscsi_softc *sc;
struct vtscsi_request *to_req;
to_req = xreq;
sc = to_req->vsr_softc;
vtscsi_dprintf(sc, VTSCSI_INFO, "timedout req=%p ccb=%p state=%#x\n",
to_req, to_req->vsr_ccb, to_req->vsr_state);
/* Don't bother if the device is going away. */
if (sc->vtscsi_flags & VTSCSI_FLAG_DETACH)
return;
/*
* Bail if the request is not in use. We likely raced when
* stopping the callout handler or it has already been aborted.
*/
if (to_req->vsr_state != VTSCSI_REQ_STATE_INUSE ||
(to_req->vsr_flags & VTSCSI_REQ_FLAG_TIMEOUT_SET) == 0)
return;
/*
* Complete the request queue in case the timedout request is
* actually just pending.
*/
vtscsi_complete_vq(sc, sc->vtscsi_request_vq);
if (to_req->vsr_state == VTSCSI_REQ_STATE_FREE)
return;
sc->vtscsi_stats.scsi_cmd_timeouts++;
to_req->vsr_state = VTSCSI_REQ_STATE_TIMEDOUT;
if (vtscsi_abort_timedout_scsi_cmd(sc, to_req) == 0)
return;
vtscsi_dprintf(sc, VTSCSI_ERROR, "resetting bus\n");
vtscsi_reset_bus(sc);
}
static cam_status
vtscsi_scsi_cmd_cam_status(struct virtio_scsi_cmd_resp *cmd_resp)
{
cam_status status;
switch (cmd_resp->response) {
case VIRTIO_SCSI_S_OK:
status = CAM_REQ_CMP;
break;
case VIRTIO_SCSI_S_OVERRUN:
status = CAM_DATA_RUN_ERR;
break;
case VIRTIO_SCSI_S_ABORTED:
status = CAM_REQ_ABORTED;
break;
case VIRTIO_SCSI_S_BAD_TARGET:
status = CAM_SEL_TIMEOUT;
break;
case VIRTIO_SCSI_S_RESET:
status = CAM_SCSI_BUS_RESET;
break;
case VIRTIO_SCSI_S_BUSY:
status = CAM_SCSI_BUSY;
break;
case VIRTIO_SCSI_S_TRANSPORT_FAILURE:
case VIRTIO_SCSI_S_TARGET_FAILURE:
case VIRTIO_SCSI_S_NEXUS_FAILURE:
status = CAM_SCSI_IT_NEXUS_LOST;
break;
default: /* VIRTIO_SCSI_S_FAILURE */
status = CAM_REQ_CMP_ERR;
break;
}
return (status);
}
static cam_status
vtscsi_complete_scsi_cmd_response(struct vtscsi_softc *sc,
struct ccb_scsiio *csio, struct virtio_scsi_cmd_resp *cmd_resp)
{
cam_status status;
csio->scsi_status = cmd_resp->status;
csio->resid = cmd_resp->resid;
if (csio->scsi_status == SCSI_STATUS_OK)
status = CAM_REQ_CMP;
else
status = CAM_SCSI_STATUS_ERROR;
if (cmd_resp->sense_len > 0) {
status |= CAM_AUTOSNS_VALID;
if (cmd_resp->sense_len < csio->sense_len)
csio->sense_resid = csio->sense_len -
cmd_resp->sense_len;
else
csio->sense_resid = 0;
memcpy(&csio->sense_data, cmd_resp->sense,
csio->sense_len - csio->sense_resid);
}
vtscsi_dprintf(sc, status == CAM_REQ_CMP ? VTSCSI_TRACE : VTSCSI_ERROR,
"ccb=%p scsi_status=%#x resid=%u sense_resid=%u\n",
csio, csio->scsi_status, csio->resid, csio->sense_resid);
return (status);
}
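/*
 * Example of the autosense accounting in
 * vtscsi_complete_scsi_cmd_response() above, assuming the CCB supplied a
 * 96-byte sense buffer and the host returned 18 bytes of sense data:
 * sense_resid = 96 - 18 = 78 and only the 18 valid bytes are copied into
 * csio->sense_data.
 */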
static void
vtscsi_complete_scsi_cmd(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
struct ccb_hdr *ccbh;
struct ccb_scsiio *csio;
struct virtio_scsi_cmd_resp *cmd_resp;
cam_status status;
csio = &req->vsr_ccb->csio;
ccbh = &csio->ccb_h;
cmd_resp = &req->vsr_cmd_resp;
KASSERT(ccbh->ccbh_vtscsi_req == req,
("ccb %p req mismatch %p/%p", ccbh, ccbh->ccbh_vtscsi_req, req));
if (req->vsr_flags & VTSCSI_REQ_FLAG_TIMEOUT_SET)
callout_stop(&req->vsr_callout);
status = vtscsi_scsi_cmd_cam_status(cmd_resp);
if (status == CAM_REQ_ABORTED) {
if (req->vsr_state == VTSCSI_REQ_STATE_TIMEDOUT)
status = CAM_CMD_TIMEOUT;
} else if (status == CAM_REQ_CMP)
status = vtscsi_complete_scsi_cmd_response(sc, csio, cmd_resp);
if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
status |= CAM_DEV_QFRZN;
xpt_freeze_devq(ccbh->path, 1);
}
if (vtscsi_thaw_simq(sc, VTSCSI_REQUEST | VTSCSI_REQUEST_VQ) != 0)
status |= CAM_RELEASE_SIMQ;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p ccb=%p status=%#x\n",
req, ccbh, status);
ccbh->status = status;
xpt_done(req->vsr_ccb);
vtscsi_enqueue_request(sc, req);
}
static void
vtscsi_poll_ctrl_req(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
/* XXX We probably shouldn't poll forever. */
req->vsr_flags |= VTSCSI_REQ_FLAG_POLLED;
do
vtscsi_complete_vq(sc, sc->vtscsi_control_vq);
while ((req->vsr_flags & VTSCSI_REQ_FLAG_COMPLETE) == 0);
req->vsr_flags &= ~VTSCSI_REQ_FLAG_POLLED;
}
static int
vtscsi_execute_ctrl_req(struct vtscsi_softc *sc, struct vtscsi_request *req,
struct sglist *sg, int readable, int writable, int flag)
{
struct virtqueue *vq;
int error;
vq = sc->vtscsi_control_vq;
MPASS(flag == VTSCSI_EXECUTE_POLL || req->vsr_complete != NULL);
error = virtqueue_enqueue(vq, req, sg, readable, writable);
if (error) {
/*
* Return EAGAIN when the virtqueue does not have enough
* descriptors available.
*/
if (error == ENOSPC || error == EMSGSIZE)
error = EAGAIN;
return (error);
}
virtqueue_notify(vq);
if (flag == VTSCSI_EXECUTE_POLL)
vtscsi_poll_ctrl_req(sc, req);
return (0);
}
static void
vtscsi_complete_abort_task_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *req)
{
union ccb *ccb;
struct ccb_hdr *ccbh;
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
ccb = req->vsr_ccb;
ccbh = &ccb->ccb_h;
tmf_resp = &req->vsr_tmf_resp;
switch (tmf_resp->response) {
case VIRTIO_SCSI_S_FUNCTION_COMPLETE:
ccbh->status = CAM_REQ_CMP;
break;
case VIRTIO_SCSI_S_FUNCTION_REJECTED:
ccbh->status = CAM_UA_ABORT;
break;
default:
ccbh->status = CAM_REQ_CMP_ERR;
break;
}
xpt_done(ccb);
vtscsi_enqueue_request(sc, req);
}
static int
vtscsi_execute_abort_task_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *req)
{
struct sglist *sg;
struct ccb_abort *cab;
struct ccb_hdr *ccbh;
struct ccb_hdr *abort_ccbh;
struct vtscsi_request *abort_req;
struct virtio_scsi_ctrl_tmf_req *tmf_req;
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
int error;
sg = sc->vtscsi_sglist;
cab = &req->vsr_ccb->cab;
ccbh = &cab->ccb_h;
tmf_req = &req->vsr_tmf_req;
tmf_resp = &req->vsr_tmf_resp;
/* CCB header and request that's to be aborted. */
abort_ccbh = &cab->abort_ccb->ccb_h;
abort_req = abort_ccbh->ccbh_vtscsi_req;
if (abort_ccbh->func_code != XPT_SCSI_IO || abort_req == NULL) {
error = EINVAL;
goto fail;
}
/* Only attempt to abort requests that could be in-flight. */
if (abort_req->vsr_state != VTSCSI_REQ_STATE_INUSE) {
error = EALREADY;
goto fail;
}
abort_req->vsr_state = VTSCSI_REQ_STATE_ABORTED;
if (abort_req->vsr_flags & VTSCSI_REQ_FLAG_TIMEOUT_SET)
callout_stop(&abort_req->vsr_callout);
vtscsi_init_ctrl_tmf_req(ccbh, VIRTIO_SCSI_T_TMF_ABORT_TASK,
(uintptr_t) abort_ccbh, tmf_req);
sglist_reset(sg);
sglist_append(sg, tmf_req, sizeof(struct virtio_scsi_ctrl_tmf_req));
sglist_append(sg, tmf_resp, sizeof(struct virtio_scsi_ctrl_tmf_resp));
req->vsr_complete = vtscsi_complete_abort_task_cmd;
tmf_resp->response = -1;
error = vtscsi_execute_ctrl_req(sc, req, sg, 1, 1,
VTSCSI_EXECUTE_ASYNC);
fail:
vtscsi_dprintf(sc, VTSCSI_TRACE, "error=%d req=%p abort_ccb=%p "
"abort_req=%p\n", error, req, abort_ccbh, abort_req);
return (error);
}
static void
vtscsi_complete_reset_dev_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *req)
{
union ccb *ccb;
struct ccb_hdr *ccbh;
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
ccb = req->vsr_ccb;
ccbh = &ccb->ccb_h;
tmf_resp = &req->vsr_tmf_resp;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p ccb=%p response=%d\n",
req, ccb, tmf_resp->response);
if (tmf_resp->response == VIRTIO_SCSI_S_FUNCTION_COMPLETE) {
ccbh->status = CAM_REQ_CMP;
vtscsi_announce(sc, AC_SENT_BDR, ccbh->target_id,
ccbh->target_lun);
} else
ccbh->status = CAM_REQ_CMP_ERR;
xpt_done(ccb);
vtscsi_enqueue_request(sc, req);
}
static int
vtscsi_execute_reset_dev_cmd(struct vtscsi_softc *sc,
struct vtscsi_request *req)
{
struct sglist *sg;
struct ccb_resetdev *crd;
struct ccb_hdr *ccbh;
struct virtio_scsi_ctrl_tmf_req *tmf_req;
struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
uint32_t subtype;
int error;
sg = sc->vtscsi_sglist;
crd = &req->vsr_ccb->crd;
ccbh = &crd->ccb_h;
tmf_req = &req->vsr_tmf_req;
tmf_resp = &req->vsr_tmf_resp;
if (ccbh->target_lun == CAM_LUN_WILDCARD)
subtype = VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET;
else
subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET;
vtscsi_init_ctrl_tmf_req(ccbh, subtype, 0, tmf_req);
sglist_reset(sg);
sglist_append(sg, tmf_req, sizeof(struct virtio_scsi_ctrl_tmf_req));
sglist_append(sg, tmf_resp, sizeof(struct virtio_scsi_ctrl_tmf_resp));
req->vsr_complete = vtscsi_complete_reset_dev_cmd;
tmf_resp->response = -1;
error = vtscsi_execute_ctrl_req(sc, req, sg, 1, 1,
VTSCSI_EXECUTE_ASYNC);
vtscsi_dprintf(sc, VTSCSI_TRACE, "error=%d req=%p ccb=%p\n",
error, req, ccbh);
return (error);
}
static void
vtscsi_get_request_lun(uint8_t lun[], target_id_t *target_id, lun_id_t *lun_id)
{
*target_id = lun[1];
*lun_id = (lun[2] << 8) | lun[3];
}
static void
vtscsi_set_request_lun(struct ccb_hdr *ccbh, uint8_t lun[])
{
lun[0] = 1;
lun[1] = ccbh->target_id;
lun[2] = 0x40 | ((ccbh->target_lun >> 8) & 0x3F);
lun[3] = ccbh->target_lun & 0xFF;
}
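/*
 * Worked example of the single-level LUN encoding performed by
 * vtscsi_set_request_lun() above, for an illustrative target_id of 2 and
 * target_lun of 5:
 *
 *   lun[] = { 0x01, 0x02, 0x40, 0x05 }
 *
 * Byte 2 carries 0x40 plus the high 6 bits of the LUN and byte 3 carries
 * the low 8 bits.
 */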
static void
vtscsi_init_scsi_cmd_req(struct ccb_scsiio *csio,
struct virtio_scsi_cmd_req *cmd_req)
{
uint8_t attr;
switch (csio->tag_action) {
case MSG_HEAD_OF_Q_TAG:
attr = VIRTIO_SCSI_S_HEAD;
break;
case MSG_ORDERED_Q_TAG:
attr = VIRTIO_SCSI_S_ORDERED;
break;
case MSG_ACA_TASK:
attr = VIRTIO_SCSI_S_ACA;
break;
default: /* MSG_SIMPLE_Q_TAG */
attr = VIRTIO_SCSI_S_SIMPLE;
break;
}
vtscsi_set_request_lun(&csio->ccb_h, cmd_req->lun);
cmd_req->tag = (uintptr_t) csio;
cmd_req->task_attr = attr;
memcpy(cmd_req->cdb,
csio->ccb_h.flags & CAM_CDB_POINTER ?
csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes,
csio->cdb_len);
}
static void
vtscsi_init_ctrl_tmf_req(struct ccb_hdr *ccbh, uint32_t subtype,
uintptr_t tag, struct virtio_scsi_ctrl_tmf_req *tmf_req)
{
vtscsi_set_request_lun(ccbh, tmf_req->lun);
tmf_req->type = VIRTIO_SCSI_T_TMF;
tmf_req->subtype = subtype;
tmf_req->tag = tag;
}
static void
vtscsi_freeze_simq(struct vtscsi_softc *sc, int reason)
{
int frozen;
frozen = sc->vtscsi_frozen;
if (reason & VTSCSI_REQUEST &&
(sc->vtscsi_frozen & VTSCSI_FROZEN_NO_REQUESTS) == 0)
sc->vtscsi_frozen |= VTSCSI_FROZEN_NO_REQUESTS;
if (reason & VTSCSI_REQUEST_VQ &&
(sc->vtscsi_frozen & VTSCSI_FROZEN_REQUEST_VQ_FULL) == 0)
sc->vtscsi_frozen |= VTSCSI_FROZEN_REQUEST_VQ_FULL;
/* Freeze the SIMQ if transitioned to frozen. */
if (frozen == 0 && sc->vtscsi_frozen != 0) {
vtscsi_dprintf(sc, VTSCSI_INFO, "SIMQ frozen\n");
xpt_freeze_simq(sc->vtscsi_sim, 1);
}
}
static int
vtscsi_thaw_simq(struct vtscsi_softc *sc, int reason)
{
int thawed;
if (sc->vtscsi_frozen == 0 || reason == 0)
return (0);
if (reason & VTSCSI_REQUEST &&
sc->vtscsi_frozen & VTSCSI_FROZEN_NO_REQUESTS)
sc->vtscsi_frozen &= ~VTSCSI_FROZEN_NO_REQUESTS;
if (reason & VTSCSI_REQUEST_VQ &&
sc->vtscsi_frozen & VTSCSI_FROZEN_REQUEST_VQ_FULL)
sc->vtscsi_frozen &= ~VTSCSI_FROZEN_REQUEST_VQ_FULL;
thawed = sc->vtscsi_frozen == 0;
if (thawed != 0)
vtscsi_dprintf(sc, VTSCSI_INFO, "SIMQ thawed\n");
return (thawed);
}
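/*
 * The freeze/thaw helpers above track two independent shortage reasons:
 * VTSCSI_REQUEST (no free request structures) and VTSCSI_REQUEST_VQ (the
 * request virtqueue is full). The SIMQ is frozen when the first reason is
 * raised, and vtscsi_thaw_simq() only reports a thaw once both reasons
 * have cleared.
 */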
static void
vtscsi_announce(struct vtscsi_softc *sc, uint32_t ac_code,
target_id_t target_id, lun_id_t lun_id)
{
struct cam_path *path;
/* Use the wildcard path from our softc for bus announcements. */
if (target_id == CAM_TARGET_WILDCARD && lun_id == CAM_LUN_WILDCARD) {
xpt_async(ac_code, sc->vtscsi_path, NULL);
return;
}
if (xpt_create_path(&path, NULL, cam_sim_path(sc->vtscsi_sim),
target_id, lun_id) != CAM_REQ_CMP) {
vtscsi_dprintf(sc, VTSCSI_ERROR, "cannot create path\n");
return;
}
xpt_async(ac_code, path, NULL);
xpt_free_path(path);
}
static void
vtscsi_execute_rescan(struct vtscsi_softc *sc, target_id_t target_id,
lun_id_t lun_id)
{
union ccb *ccb;
cam_status status;
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
vtscsi_dprintf(sc, VTSCSI_ERROR, "cannot allocate CCB\n");
return;
}
status = xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(sc->vtscsi_sim), target_id, lun_id);
if (status != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
}
static void
vtscsi_execute_rescan_bus(struct vtscsi_softc *sc)
{
vtscsi_execute_rescan(sc, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD);
}
static void
vtscsi_transport_reset_event(struct vtscsi_softc *sc,
struct virtio_scsi_event *event)
{
target_id_t target_id;
lun_id_t lun_id;
vtscsi_get_request_lun(event->lun, &target_id, &lun_id);
switch (event->reason) {
case VIRTIO_SCSI_EVT_RESET_RESCAN:
case VIRTIO_SCSI_EVT_RESET_REMOVED:
vtscsi_execute_rescan(sc, target_id, lun_id);
break;
default:
device_printf(sc->vtscsi_dev,
"unhandled transport event reason: %d\n", event->reason);
break;
}
}
static void
vtscsi_handle_event(struct vtscsi_softc *sc, struct virtio_scsi_event *event)
{
int error;
if ((event->event & VIRTIO_SCSI_T_EVENTS_MISSED) == 0) {
switch (event->event) {
case VIRTIO_SCSI_T_TRANSPORT_RESET:
vtscsi_transport_reset_event(sc, event);
break;
default:
device_printf(sc->vtscsi_dev,
"unhandled event: %d\n", event->event);
break;
}
} else
vtscsi_execute_rescan_bus(sc);
/*
* This should always be successful since the buffer
* was just dequeued.
*/
error = vtscsi_enqueue_event_buf(sc, event);
KASSERT(error == 0,
("cannot requeue event buffer: %d", error));
}
static int
vtscsi_enqueue_event_buf(struct vtscsi_softc *sc,
struct virtio_scsi_event *event)
{
struct sglist *sg;
struct virtqueue *vq;
int size, error;
sg = sc->vtscsi_sglist;
vq = sc->vtscsi_event_vq;
size = sc->vtscsi_event_buf_size;
bzero(event, size);
sglist_reset(sg);
error = sglist_append(sg, event, size);
if (error)
return (error);
error = virtqueue_enqueue(vq, event, sg, 0, sg->sg_nseg);
if (error)
return (error);
virtqueue_notify(vq);
return (0);
}
static int
vtscsi_init_event_vq(struct vtscsi_softc *sc)
{
struct virtio_scsi_event *event;
int i, size, error;
/*
* The first release of QEMU with VirtIO SCSI support would crash
* when attempting to notify the event virtqueue. This was fixed
* when hotplug support was added.
*/
if (sc->vtscsi_flags & VTSCSI_FLAG_HOTPLUG)
size = sc->vtscsi_event_buf_size;
else
size = 0;
if (size < sizeof(struct virtio_scsi_event))
return (0);
for (i = 0; i < VTSCSI_NUM_EVENT_BUFS; i++) {
event = &sc->vtscsi_event_bufs[i];
error = vtscsi_enqueue_event_buf(sc, event);
if (error)
break;
}
/*
* Even just one buffer is enough. Missed events are
* denoted with the VIRTIO_SCSI_T_EVENTS_MISSED flag.
*/
if (i > 0)
error = 0;
return (error);
}
static void
vtscsi_reinit_event_vq(struct vtscsi_softc *sc)
{
struct virtio_scsi_event *event;
int i, error;
if ((sc->vtscsi_flags & VTSCSI_FLAG_HOTPLUG) == 0 ||
sc->vtscsi_event_buf_size < sizeof(struct virtio_scsi_event))
return;
for (i = 0; i < VTSCSI_NUM_EVENT_BUFS; i++) {
event = &sc->vtscsi_event_bufs[i];
error = vtscsi_enqueue_event_buf(sc, event);
if (error)
break;
}
KASSERT(i > 0, ("cannot reinit event vq: %d", error));
}
static void
vtscsi_drain_event_vq(struct vtscsi_softc *sc)
{
struct virtqueue *vq;
int last;
vq = sc->vtscsi_event_vq;
last = 0;
while (virtqueue_drain(vq, &last) != NULL)
;
KASSERT(virtqueue_empty(vq), ("eventvq not empty"));
}
static void
vtscsi_complete_vqs_locked(struct vtscsi_softc *sc)
{
VTSCSI_LOCK_OWNED(sc);
if (sc->vtscsi_request_vq != NULL)
vtscsi_complete_vq(sc, sc->vtscsi_request_vq);
if (sc->vtscsi_control_vq != NULL)
vtscsi_complete_vq(sc, sc->vtscsi_control_vq);
}
static void
vtscsi_complete_vqs(struct vtscsi_softc *sc)
{
VTSCSI_LOCK(sc);
vtscsi_complete_vqs_locked(sc);
VTSCSI_UNLOCK(sc);
}
static void
vtscsi_cancel_request(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
union ccb *ccb;
int detach;
ccb = req->vsr_ccb;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p ccb=%p\n", req, ccb);
/*
* The callout must be drained when detaching since the request is
* about to be freed. The VTSCSI_MTX must not be held for this in
* case the callout is pending because there is a deadlock potential.
* Otherwise, the virtqueue is being drained because of a bus reset
* so we only need to attempt to stop the callouts.
*/
detach = (sc->vtscsi_flags & VTSCSI_FLAG_DETACH) != 0;
if (detach != 0)
VTSCSI_LOCK_NOTOWNED(sc);
else
VTSCSI_LOCK_OWNED(sc);
if (req->vsr_flags & VTSCSI_REQ_FLAG_TIMEOUT_SET) {
if (detach != 0)
callout_drain(&req->vsr_callout);
else
callout_stop(&req->vsr_callout);
}
if (ccb != NULL) {
if (detach != 0) {
VTSCSI_LOCK(sc);
ccb->ccb_h.status = CAM_NO_HBA;
} else
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
if (detach != 0)
VTSCSI_UNLOCK(sc);
}
vtscsi_enqueue_request(sc, req);
}
static void
vtscsi_drain_vq(struct vtscsi_softc *sc, struct virtqueue *vq)
{
struct vtscsi_request *req;
int last;
last = 0;
vtscsi_dprintf(sc, VTSCSI_TRACE, "vq=%p\n", vq);
while ((req = virtqueue_drain(vq, &last)) != NULL)
vtscsi_cancel_request(sc, req);
KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}
static void
vtscsi_drain_vqs(struct vtscsi_softc *sc)
{
if (sc->vtscsi_control_vq != NULL)
vtscsi_drain_vq(sc, sc->vtscsi_control_vq);
if (sc->vtscsi_request_vq != NULL)
vtscsi_drain_vq(sc, sc->vtscsi_request_vq);
if (sc->vtscsi_event_vq != NULL)
vtscsi_drain_event_vq(sc);
}
static void
vtscsi_stop(struct vtscsi_softc *sc)
{
vtscsi_disable_vqs_intr(sc);
virtio_stop(sc->vtscsi_dev);
}
static int
vtscsi_reset_bus(struct vtscsi_softc *sc)
{
int error;
VTSCSI_LOCK_OWNED(sc);
if (vtscsi_bus_reset_disable != 0) {
device_printf(sc->vtscsi_dev, "bus reset disabled\n");
return (0);
}
sc->vtscsi_flags |= VTSCSI_FLAG_RESET;
/*
* vtscsi_stop() will cause the in-flight requests to be canceled.
* Those requests are then completed here so CAM will retry them
* after the reset is complete.
*/
vtscsi_stop(sc);
vtscsi_complete_vqs_locked(sc);
/* Rid the virtqueues of any remaining requests. */
vtscsi_drain_vqs(sc);
/*
* Any resource shortage that froze the SIMQ cannot persist across
* a bus reset so ensure it gets thawed here.
*/
if (vtscsi_thaw_simq(sc, VTSCSI_REQUEST | VTSCSI_REQUEST_VQ) != 0)
xpt_release_simq(sc->vtscsi_sim, 0);
error = vtscsi_reinit(sc);
if (error) {
device_printf(sc->vtscsi_dev,
"reinitialization failed, stopping device...\n");
vtscsi_stop(sc);
} else
vtscsi_announce(sc, AC_BUS_RESET, CAM_TARGET_WILDCARD,
CAM_LUN_WILDCARD);
sc->vtscsi_flags &= ~VTSCSI_FLAG_RESET;
return (error);
}
static void
vtscsi_init_request(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
#ifdef INVARIANTS
int req_nsegs, resp_nsegs;
req_nsegs = sglist_count(&req->vsr_ureq, sizeof(req->vsr_ureq));
resp_nsegs = sglist_count(&req->vsr_uresp, sizeof(req->vsr_uresp));
KASSERT(req_nsegs == 1, ("request crossed page boundary"));
KASSERT(resp_nsegs == 1, ("response crossed page boundary"));
#endif
req->vsr_softc = sc;
callout_init_mtx(&req->vsr_callout, VTSCSI_MTX(sc), 0);
}
static int
vtscsi_alloc_requests(struct vtscsi_softc *sc)
{
struct vtscsi_request *req;
int i, nreqs;
/*
* Commands destined for either the request or control queues come
* from the same SIM queue. Use the size of the request virtqueue
* as it (should) be much more frequently used. Some additional
* requests are allocated for internal (TMF) use.
*/
nreqs = virtqueue_size(sc->vtscsi_request_vq);
if ((sc->vtscsi_flags & VTSCSI_FLAG_INDIRECT) == 0)
nreqs /= VTSCSI_MIN_SEGMENTS;
nreqs += VTSCSI_RESERVED_REQUESTS;
for (i = 0; i < nreqs; i++) {
req = malloc(sizeof(struct vtscsi_request), M_DEVBUF,
M_NOWAIT);
if (req == NULL)
return (ENOMEM);
vtscsi_init_request(sc, req);
sc->vtscsi_nrequests++;
vtscsi_enqueue_request(sc, req);
}
return (0);
}
static void
vtscsi_free_requests(struct vtscsi_softc *sc)
{
struct vtscsi_request *req;
while ((req = vtscsi_dequeue_request(sc)) != NULL) {
KASSERT(callout_active(&req->vsr_callout) == 0,
("request callout still active"));
sc->vtscsi_nrequests--;
free(req, M_DEVBUF);
}
KASSERT(sc->vtscsi_nrequests == 0, ("leaked requests: %d",
sc->vtscsi_nrequests));
}
static void
vtscsi_enqueue_request(struct vtscsi_softc *sc, struct vtscsi_request *req)
{
KASSERT(req->vsr_softc == sc,
("non-matching request vsr_softc %p/%p", req->vsr_softc, sc));
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p\n", req);
/* A request is available so the SIMQ could be released. */
if (vtscsi_thaw_simq(sc, VTSCSI_REQUEST) != 0)
xpt_release_simq(sc->vtscsi_sim, 1);
req->vsr_ccb = NULL;
req->vsr_complete = NULL;
req->vsr_ptr0 = NULL;
req->vsr_state = VTSCSI_REQ_STATE_FREE;
req->vsr_flags = 0;
bzero(&req->vsr_ureq, sizeof(req->vsr_ureq));
bzero(&req->vsr_uresp, sizeof(req->vsr_uresp));
/*
* We insert at the tail of the queue in order to make it
* very unlikely a request will be reused if we race with
* stopping its callout handler.
*/
TAILQ_INSERT_TAIL(&sc->vtscsi_req_free, req, vsr_link);
}
static struct vtscsi_request *
vtscsi_dequeue_request(struct vtscsi_softc *sc)
{
struct vtscsi_request *req;
req = TAILQ_FIRST(&sc->vtscsi_req_free);
if (req != NULL) {
req->vsr_state = VTSCSI_REQ_STATE_INUSE;
TAILQ_REMOVE(&sc->vtscsi_req_free, req, vsr_link);
} else
sc->vtscsi_stats.dequeue_no_requests++;
vtscsi_dprintf(sc, VTSCSI_TRACE, "req=%p\n", req);
return (req);
}
static void
vtscsi_complete_request(struct vtscsi_request *req)
{
if (req->vsr_flags & VTSCSI_REQ_FLAG_POLLED)
req->vsr_flags |= VTSCSI_REQ_FLAG_COMPLETE;
if (req->vsr_complete != NULL)
req->vsr_complete(req->vsr_softc, req);
}
static void
vtscsi_complete_vq(struct vtscsi_softc *sc, struct virtqueue *vq)
{
struct vtscsi_request *req;
VTSCSI_LOCK_OWNED(sc);
while ((req = virtqueue_dequeue(vq, NULL)) != NULL)
vtscsi_complete_request(req);
}
static void
vtscsi_control_vq_intr(void *xsc)
{
struct vtscsi_softc *sc;
struct virtqueue *vq;
sc = xsc;
vq = sc->vtscsi_control_vq;
again:
VTSCSI_LOCK(sc);
vtscsi_complete_vq(sc, sc->vtscsi_control_vq);
if (virtqueue_enable_intr(vq) != 0) {
virtqueue_disable_intr(vq);
VTSCSI_UNLOCK(sc);
goto again;
}
VTSCSI_UNLOCK(sc);
}
static void
vtscsi_event_vq_intr(void *xsc)
{
struct vtscsi_softc *sc;
struct virtqueue *vq;
struct virtio_scsi_event *event;
sc = xsc;
vq = sc->vtscsi_event_vq;
again:
VTSCSI_LOCK(sc);
while ((event = virtqueue_dequeue(vq, NULL)) != NULL)
vtscsi_handle_event(sc, event);
if (virtqueue_enable_intr(vq) != 0) {
virtqueue_disable_intr(vq);
VTSCSI_UNLOCK(sc);
goto again;
}
VTSCSI_UNLOCK(sc);
}
static void
vtscsi_request_vq_intr(void *xsc)
{
struct vtscsi_softc *sc;
struct virtqueue *vq;
sc = xsc;
vq = sc->vtscsi_request_vq;
again:
VTSCSI_LOCK(sc);
vtscsi_complete_vq(sc, sc->vtscsi_request_vq);
if (virtqueue_enable_intr(vq) != 0) {
virtqueue_disable_intr(vq);
VTSCSI_UNLOCK(sc);
goto again;
}
VTSCSI_UNLOCK(sc);
}
static void
vtscsi_disable_vqs_intr(struct vtscsi_softc *sc)
{
virtqueue_disable_intr(sc->vtscsi_control_vq);
virtqueue_disable_intr(sc->vtscsi_event_vq);
virtqueue_disable_intr(sc->vtscsi_request_vq);
}
static void
vtscsi_enable_vqs_intr(struct vtscsi_softc *sc)
{
virtqueue_enable_intr(sc->vtscsi_control_vq);
virtqueue_enable_intr(sc->vtscsi_event_vq);
virtqueue_enable_intr(sc->vtscsi_request_vq);
}
static void
vtscsi_get_tunables(struct vtscsi_softc *sc)
{
char tmpstr[64];
TUNABLE_INT_FETCH("hw.vtscsi.debug_level", &sc->vtscsi_debug);
snprintf(tmpstr, sizeof(tmpstr), "dev.vtscsi.%d.debug_level",
device_get_unit(sc->vtscsi_dev));
TUNABLE_INT_FETCH(tmpstr, &sc->vtscsi_debug);
}
static void
vtscsi_add_sysctl(struct vtscsi_softc *sc)
{
device_t dev;
struct vtscsi_statistics *stats;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
struct sysctl_oid_list *child;
dev = sc->vtscsi_dev;
stats = &sc->vtscsi_stats;
ctx = device_get_sysctl_ctx(dev);
tree = device_get_sysctl_tree(dev);
child = SYSCTL_CHILDREN(tree);
SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug_level",
CTLFLAG_RW, &sc->vtscsi_debug, 0,
"Debug level");
SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "scsi_cmd_timeouts",
CTLFLAG_RD, &stats->scsi_cmd_timeouts,
"SCSI command timeouts");
SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dequeue_no_requests",
CTLFLAG_RD, &stats->dequeue_no_requests,
"No available requests to dequeue");
}
static void
vtscsi_printf_req(struct vtscsi_request *req, const char *func,
const char *fmt, ...)
{
struct vtscsi_softc *sc;
union ccb *ccb;
struct sbuf sb;
va_list ap;
char str[192];
char path_str[64];
if (req == NULL)
return;
sc = req->vsr_softc;
ccb = req->vsr_ccb;
va_start(ap, fmt);
sbuf_new(&sb, str, sizeof(str), 0);
if (ccb == NULL) {
sbuf_printf(&sb, "(noperiph:%s%d:%u): ",
cam_sim_name(sc->vtscsi_sim), cam_sim_unit(sc->vtscsi_sim),
cam_sim_bus(sc->vtscsi_sim));
} else {
xpt_path_string(ccb->ccb_h.path, path_str, sizeof(path_str));
sbuf_cat(&sb, path_str);
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
scsi_command_string(&ccb->csio, &sb);
sbuf_printf(&sb, "length %d ", ccb->csio.dxfer_len);
}
}
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
printf("%s: %s: %s", device_get_nameunit(sc->vtscsi_dev), func,
sbuf_data(&sb));
}
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index db73bb315cd4..d60920c819b8 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,3920 +1,3921 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009-2012 Spectra Logic Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification.
* 2. Redistributions in binary form must reproduce at minimum a disclaimer
* substantially similar to the "NO WARRANTY" disclaimer below
* ("Disclaimer") and any redistribution must be conditioned upon
* including a substantially similar Disclaimer requirement for further
* binary redistribution.
*
* NO WARRANTY
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGES.
*
* Authors: Justin T. Gibbs (Spectra Logic Corporation)
* Ken Merry (Spectra Logic Corporation)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/**
* \file blkback.c
*
* \brief Device driver supporting the vending of block storage from
* a FreeBSD domain to other domains.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
#include <sys/sdt.h>
#include <geom/geom.h>
#include <machine/_inttypes.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <xen/xen-os.h>
#include <xen/blkif.h>
#include <xen/gnttab.h>
#include <xen/xen_intr.h>
#include <xen/interface/event_channel.h>
#include <xen/interface/grant_table.h>
#include <xen/xenbus/xenbusvar.h>
/*--------------------------- Compile-time Tunables --------------------------*/
/**
* The maximum number of shared memory ring pages we will allow in a
* negotiated block-front/back communication channel. Allow enough
* ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
*/
#define XBB_MAX_RING_PAGES 32
/**
* The maximum number of outstanding request blocks (request headers plus
* additional segment blocks) we will allow in a negotiated block-front/back
* communication channel.
*/
#define XBB_MAX_REQUESTS \
__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
/**
* \brief Define to force all I/O to be performed on memory owned by the
* backend device, with a copy-in/out to the remote domain's memory.
*
* \note This option is currently required when this driver's domain is
* operating in HVM mode on a system using an IOMMU.
*
* This driver uses Xen's grant table API to gain access to the memory of
* the remote domains it serves. When our domain is operating in PV mode,
* the grant table mechanism directly updates our domain's page table entries
* to point to the physical pages of the remote domain. This scheme guarantees
* that blkback and the backing devices it uses can safely perform DMA
* operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
* ensure that our domain cannot DMA to pages owned by another domain. As
* of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
* table API. For this reason, in HVM mode, we must bounce all requests into
* memory that is mapped into our domain at domain startup and thus has
* valid IOMMU mappings.
*/
#define XBB_USE_BOUNCE_BUFFERS
/**
* \brief Define to enable rudimentary request logging to the console.
*/
#undef XBB_DEBUG
/*---------------------------------- Macros ----------------------------------*/
/**
* Custom malloc type for all driver allocations.
*/
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...) \
printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif
/**
* The maximum mapped region size per request we will allow in a negotiated
* block-front/back communication channel.
+ * Use old default of MAXPHYS == 128K.
*/
#define XBB_MAX_REQUEST_SIZE \
- MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
+ MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
/**
* The maximum number of segments (within a request header and accompanying
* segment blocks) per request we will allow in a negotiated block-front/back
* communication channel.
*/
#define XBB_MAX_SEGMENTS_PER_REQUEST \
(MIN(UIO_MAXIOV, \
MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
(XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
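/*
 * Illustrative sizing, assuming PAGE_SIZE is 4096 and the canonical
 * BLKIF_MAX_SEGMENTS_PER_REQUEST of 11: XBB_MAX_REQUEST_SIZE evaluates to
 * MIN(128 KiB, 11 * 4 KiB) = 44 KiB, and XBB_MAX_SEGMENTS_PER_REQUEST to
 * MIN(UIO_MAXIOV, MIN(11, 44 KiB / 4 KiB + 1)) = 11.
 */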
/**
* The maximum number of ring pages that we can allow per request list.
* We limit this to the maximum number of segments per request, because
* that is already a reasonable number of segments to aggregate. This
* number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
* because that would leave situations where we can't dispatch even one
* large request.
*/
#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;
static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
...) __attribute__((format(printf, 3, 4)));
static int xbb_shutdown(struct xbb_softc *xbb);
/*------------------------------ Data Structures -----------------------------*/
STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
typedef enum {
XBB_REQLIST_NONE = 0x00,
XBB_REQLIST_MAPPED = 0x01
} xbb_reqlist_flags;
struct xbb_xen_reqlist {
/**
* Back reference to the parent block back instance for this
* request. Used during bio_done handling.
*/
struct xbb_softc *xbb;
/**
* BLKIF_OP code for this request.
*/
int operation;
/**
* Set to BLKIF_RSP_* to indicate request status.
*
* This field allows an error status to be recorded even if the
* delivery of this status must be deferred. Deferred reporting
* is necessary, for example, when an error is detected during
* completion processing of one bio when other bios for this
* request are still outstanding.
*/
int status;
/**
* Number of 512 byte sectors not transferred.
*/
int residual_512b_sectors;
/**
* Starting sector number of the first request in the list.
*/
off_t starting_sector_number;
/**
* If we're going to coalesce, the next contiguous sector would be
* this one.
*/
off_t next_contig_sector;
/**
* Number of child requests in the list.
*/
int num_children;
/**
* Number of I/O requests still pending on the backend.
*/
int pendcnt;
/**
* Total number of segments for requests in the list.
*/
int nr_segments;
/**
* Flags for this particular request list.
*/
xbb_reqlist_flags flags;
/**
* Kernel virtual address space reserved for this request
* list structure and used to map the remote domain's pages for
* this I/O, into our domain's address space.
*/
uint8_t *kva;
/**
* Base, pseudo-physical address, corresponding to the start
* of this request's kva region.
*/
uint64_t gnt_base;
#ifdef XBB_USE_BOUNCE_BUFFERS
/**
* Pre-allocated domain local memory used to proxy remote
* domain memory during I/O operations.
*/
uint8_t *bounce;
#endif
/**
* Array of grant handles (one per page) used to map this request.
*/
grant_handle_t *gnt_handles;
/**
* Device statistics request ordering type (ordered or simple).
*/
devstat_tag_type ds_tag_type;
/**
* Device statistics request type (read, write, no_data).
*/
devstat_trans_flags ds_trans_type;
/**
* The start time for this request.
*/
struct bintime ds_t0;
/**
* Linked list of contiguous requests with the same operation type.
*/
struct xbb_xen_req_list contig_req_list;
/**
* Linked list links used to aggregate idle requests in the
* request list free pool (xbb->reqlist_free_stailq) and pending
* requests waiting for execution (xbb->reqlist_pending_stailq).
*/
STAILQ_ENTRY(xbb_xen_reqlist) links;
};
STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
/**
* \brief Object tracking an in-flight I/O from a Xen VBD consumer.
*/
struct xbb_xen_req {
/**
* Linked list links used to aggregate requests into a reqlist
* and to store them in the request free pool.
*/
STAILQ_ENTRY(xbb_xen_req) links;
/**
* The remote domain's identifier for this I/O request.
*/
uint64_t id;
/**
* The number of pages currently mapped for this request.
*/
int nr_pages;
/**
* The number of 512 byte sectors comprising this request.
*/
int nr_512b_sectors;
/**
* BLKIF_OP code for this request.
*/
int operation;
/**
* Storage used for non-native ring requests.
*/
blkif_request_t ring_req_storage;
/**
* Pointer to the Xen request in the ring.
*/
blkif_request_t *ring_req;
/**
* Consumer index for this request.
*/
RING_IDX req_ring_idx;
/**
* The start time for this request.
*/
struct bintime ds_t0;
/**
* Pointer back to our parent request list.
*/
struct xbb_xen_reqlist *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
/**
* \brief Configuration data for the shared memory request ring
* used to communicate with the front-end client of this driver.
*/
struct xbb_ring_config {
/** KVA address where ring memory is mapped. */
vm_offset_t va;
/** The pseudo-physical address where ring memory is mapped.*/
uint64_t gnt_addr;
/**
* Grant table handles, one per-ring page, returned by the
* hypervisor upon mapping of the ring and required to
* unmap it when a connection is torn down.
*/
grant_handle_t handle[XBB_MAX_RING_PAGES];
/**
* The device bus address returned by the hypervisor when
* mapping the ring and required to unmap it when a connection
* is torn down.
*/
uint64_t bus_addr[XBB_MAX_RING_PAGES];
/** The number of ring pages mapped for the current connection. */
u_int ring_pages;
/**
* The grant references, one per-ring page, supplied by the
* front-end, allowing us to reference the ring pages in the
* front-end's domain and to map these pages into our own domain.
*/
grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
/** The interrupt driven event channel used to signal ring events. */
evtchn_port_t evtchn;
};
/**
* Per-instance connection state flags.
*/
typedef enum
{
/**
* The front-end requested a read-only mount of the
* back-end device/file.
*/
XBBF_READ_ONLY = 0x01,
/** Communication with the front-end has been established. */
XBBF_RING_CONNECTED = 0x02,
/**
* Front-end requests exist in the ring and are waiting for
* xbb_xen_req objects to free up.
*/
XBBF_RESOURCE_SHORTAGE = 0x04,
/** Connection teardown in progress. */
XBBF_SHUTDOWN = 0x08,
/** A thread is already performing shutdown processing. */
XBBF_IN_SHUTDOWN = 0x10
} xbb_flag_t;
/** Backend device type. */
typedef enum {
/** Backend type unknown. */
XBB_TYPE_NONE = 0x00,
/**
* Backend type disk (access via cdev switch
* strategy routine).
*/
XBB_TYPE_DISK = 0x01,
/** Backend type file (access via vnode operations). */
XBB_TYPE_FILE = 0x02
} xbb_type;
/**
* \brief Structure used to memoize information about a per-request
* scatter-gather list.
*
* The chief benefit of using this data structure is it avoids having
* to reparse the possibly discontiguous S/G list in the original
* request. Due to the way that the mapping of the memory backing an
* I/O transaction is handled by Xen, a second pass is unavoidable.
* At least this way the second walk is a simple array traversal.
*
* \note A single Scatter/Gather element in the block interface covers
* at most 1 machine page. In this context a sector (blkif
* nomenclature, not what I'd choose) is a 512b aligned unit
* of mapping within the machine page referenced by an S/G
* element.
*/
struct xbb_sg {
/** The number of 512b data chunks mapped in this S/G element. */
int16_t nsect;
/**
* The index (0 based) of the first 512b data chunk mapped
* in this S/G element.
*/
uint8_t first_sect;
/**
* The index (0 based) of the last 512b data chunk mapped
* in this S/G element.
*/
uint8_t last_sect;
};
/**
* Character device backend specific configuration data.
*/
struct xbb_dev_data {
/** Cdev used for device backend access. */
struct cdev *cdev;
/** Cdev switch used for device backend access. */
struct cdevsw *csw;
/** Used to hold a reference on opened cdev backend devices. */
int dev_ref;
};
/**
* File backend specific configuration data.
*/
struct xbb_file_data {
/** Credentials to use for vnode backed (file based) I/O. */
struct ucred *cred;
/**
* \brief Array of io vectors used to process file based I/O.
*
* Only a single file based request is outstanding per-xbb instance,
* so we only need one of these.
*/
struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS
/**
* \brief Array of io vectors used to handle bouncing of file reads.
*
* Vnode operations are free to modify uio data during their
* execution. In the case of a read with bounce buffering active,
* we need some of the data from the original uio in order to
* bounce-out the read data. This array serves as the temporary
* storage for this saved data.
*/
struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
/**
* \brief Array of memoized bounce buffer kva offsets used
* in the file based backend.
*
* Due to the way that the mapping of the memory backing an
* I/O transaction is handled by Xen, a second pass through
* the request sg elements is unavoidable. We memoize the computed
* bounce address here to reduce the cost of the second walk.
*/
void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
#endif /* XBB_USE_BOUNCE_BUFFERS */
};
/**
* Collection of backend type specific data.
*/
union xbb_backend_data {
struct xbb_dev_data dev;
struct xbb_file_data file;
};
/**
* Function signature of backend specific I/O handlers.
*/
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
struct xbb_xen_reqlist *reqlist, int operation,
int flags);
/**
* Per-instance configuration data.
*/
struct xbb_softc {
/**
* Task-queue used to process I/O requests.
*/
struct taskqueue *io_taskqueue;
/**
* Single "run the request queue" task enqueued
* on io_taskqueue.
*/
struct task io_task;
/** Device type for this instance. */
xbb_type device_type;
/** NewBus device corresponding to this instance. */
device_t dev;
/** Backend specific dispatch routine for this instance. */
xbb_dispatch_t dispatch_io;
/** The number of requests outstanding on the backend device/file. */
int active_request_count;
/** Free pool of request tracking structures. */
struct xbb_xen_req_list request_free_stailq;
/** Array, sized at connection time, of request tracking structures. */
struct xbb_xen_req *requests;
/** Free pool of request list structures. */
struct xbb_xen_reqlist_list reqlist_free_stailq;
/** List of pending request lists awaiting execution. */
struct xbb_xen_reqlist_list reqlist_pending_stailq;
/** Array, sized at connection time, of request list structures. */
struct xbb_xen_reqlist *request_lists;
/**
* Global pool of kva used for mapping remote domain ring
* and I/O transaction data.
*/
vm_offset_t kva;
/** Pseudo-physical address corresponding to kva. */
uint64_t gnt_base_addr;
/** The size of the global kva pool. */
int kva_size;
/** The size of the KVA area used for request lists. */
int reqlist_kva_size;
/** The number of pages of KVA used for request lists */
int reqlist_kva_pages;
/** Bitmap of free KVA pages */
bitstr_t *kva_free;
/**
* \brief Cached value of the front-end's domain id.
*
* This value is used once for each mapped page in
* a transaction. We cache it to avoid incurring the
* cost of an ivar access every time this is needed.
*/
domid_t otherend_id;
/**
* \brief The blkif protocol abi in effect.
*
* There are situations where the back and front ends can
* have a different, native abi (e.g. intel x86_64 and
* 32bit x86 domains on the same machine). The back-end
* always accommodates the front-end's native abi. That
* value is pulled from the XenStore and recorded here.
*/
int abi;
/**
* \brief The maximum number of requests and request lists allowed
* to be in flight at a time.
*
* This value is negotiated via the XenStore.
*/
u_int max_requests;
/**
* \brief The maximum number of segments (1 page per segment)
* that can be mapped by a request.
*
* This value is negotiated via the XenStore.
*/
u_int max_request_segments;
/**
* \brief Maximum number of segments per request list.
*
* This value is derived from and will generally be larger than
* max_request_segments.
*/
u_int max_reqlist_segments;
/**
* The maximum size of any request to this back-end
* device.
*
* This value is negotiated via the XenStore.
*/
u_int max_request_size;
/**
* The maximum size of any request list. This is derived directly
* from max_reqlist_segments.
*/
u_int max_reqlist_size;
/** Various configuration and state bit flags. */
xbb_flag_t flags;
/** Ring mapping and interrupt configuration data. */
struct xbb_ring_config ring_config;
/** Runtime, cross-abi safe, structures for ring access. */
blkif_back_rings_t rings;
/** IRQ mapping for the communication ring event channel. */
xen_intr_handle_t xen_intr_handle;
/**
* \brief Backend access mode flags (e.g. write, or read-only).
*
* This value is passed to us by the front-end via the XenStore.
*/
char *dev_mode;
/**
* \brief Backend device type (e.g. "disk", "cdrom", "floppy").
*
* This value is passed to us by the front-end via the XenStore.
* Currently unused.
*/
char *dev_type;
/**
* \brief Backend device/file identifier.
*
* This value is passed to us by the front-end via the XenStore.
* We expect this to be a POSIX path indicating the file or
* device to open.
*/
char *dev_name;
/**
* Vnode corresponding to the backend device node or file
* we are accessing.
*/
struct vnode *vn;
union xbb_backend_data backend;
/** The native sector size of the backend. */
u_int sector_size;
/** log2 of sector_size. */
u_int sector_size_shift;
/** Size in bytes of the backend device or file. */
off_t media_size;
/**
* \brief media_size expressed in terms of the backend native
* sector size.
*
* (e.g. xbb->media_size >> xbb->sector_size_shift).
*/
uint64_t media_num_sectors;
/**
* \brief Array of memoized scatter gather data computed during the
* conversion of blkif ring requests to internal xbb_xen_req
* structures.
*
* Ring processing is serialized so we only need one of these.
*/
struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
/**
* Temporary grant table map used in xbb_dispatch_io(). When
* XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
* stack could cause a stack overflow.
*/
struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];
/** Mutex protecting per-instance data. */
struct mtx lock;
/**
* Resource representing allocated physical address space
* associated with our per-instance kva region.
*/
struct resource *pseudo_phys_res;
/** Resource id for allocated physical address space. */
int pseudo_phys_res_id;
/**
* I/O statistics from BlockBack dispatch down. These are
* coalesced requests, and we start them right before execution.
*/
struct devstat *xbb_stats;
/**
* I/O statistics coming into BlockBack. These are the requests as
* we get them from BlockFront. They are started as soon as we
* receive a request, and completed when the I/O is complete.
*/
struct devstat *xbb_stats_in;
/** Disable sending flush to the backend */
int disable_flush;
/** Send a real flush for every N flush requests */
int flush_interval;
/** Count of flush requests in the interval */
int flush_count;
/** Don't coalesce requests if this is set */
int no_coalesce_reqs;
/** Number of requests we have received */
uint64_t reqs_received;
/** Number of requests we have completed*/
uint64_t reqs_completed;
/** Number of requests we queued but not pushed*/
uint64_t reqs_queued_for_completion;
/** Number of requests we completed with an error status*/
uint64_t reqs_completed_with_error;
/** How many forced dispatches (i.e. without coalescing) have happened */
uint64_t forced_dispatch;
/** How many normal dispatches have happened */
uint64_t normal_dispatch;
/** How many total dispatches have happened */
uint64_t total_dispatch;
/** How many times we have run out of KVA */
uint64_t kva_shortages;
/** How many times we have run out of request structures */
uint64_t request_shortages;
/** Watch to wait for hotplug script execution */
struct xs_watch hotplug_watch;
/** Got the needed data from hotplug scripts? */
bool hotplug_done;
};
/*---------------------------- Request Processing ----------------------------*/
/**
* Allocate an internal transaction tracking structure from the free pool.
*
* \param xbb Per-instance xbb configuration structure.
*
* \return On success, a pointer to the allocated xbb_xen_req structure.
* Otherwise NULL.
*/
static inline struct xbb_xen_req *
xbb_get_req(struct xbb_softc *xbb)
{
struct xbb_xen_req *req;
req = NULL;
mtx_assert(&xbb->lock, MA_OWNED);
if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
xbb->active_request_count++;
}
return (req);
}
/**
* Return an allocated transaction tracking structure to the free pool.
*
* \param xbb Per-instance xbb configuration structure.
* \param req The request structure to free.
*/
static inline void
xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
mtx_assert(&xbb->lock, MA_OWNED);
STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
xbb->active_request_count--;
KASSERT(xbb->active_request_count >= 0,
("xbb_release_req: negative active count"));
}
/**
* Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
*
* \param xbb Per-instance xbb configuration structure.
* \param req_list The list of requests to free.
* \param nreqs The number of items in the list.
*/
static inline void
xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
int nreqs)
{
mtx_assert(&xbb->lock, MA_OWNED);
STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
xbb->active_request_count -= nreqs;
KASSERT(xbb->active_request_count >= 0,
("xbb_release_reqs: negative active count"));
}
/**
* Given a page index and 512b sector offset within that page,
* calculate an offset into a request's kva region.
*
* \param reqlist The request structure whose kva region will be accessed.
* \param pagenr The page index used to compute the kva offset.
* \param sector The 512b sector index used to compute the page relative
* kva offset.
*
* \return The computed global KVA offset.
*/
static inline uint8_t *
xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}
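/*
 * Example of the offset computation in xbb_reqlist_vaddr() above, assuming
 * PAGE_SIZE is 4096: page index 2 and sector index 3 resolve to
 * reqlist->kva + 2 * 4096 + 3 * 512 = reqlist->kva + 9728.
 */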
#ifdef XBB_USE_BOUNCE_BUFFERS
/**
* Given a page index and 512b sector offset within that page,
* calculate an offset into a request's local bounce memory region.
*
* \param reqlist The request structure whose bounce region will be accessed.
* \param pagenr The page index used to compute the bounce offset.
* \param sector The 512b sector index used to compute the page relative
* bounce offset.
*
* \return The computed global bounce buffer address.
*/
static inline uint8_t *
xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
#endif
/**
* Given a page number and 512b sector offset within that page,
* calculate an offset into the request's memory region that the
* underlying backend device/file should use for I/O.
*
* \param reqlist The request structure whose I/O region will be accessed.
* \param pagenr The page index used to compute the I/O offset.
* \param sector The 512b sector index used to compute the page relative
* I/O offset.
*
* \return The computed global I/O address.
*
* Depending on configuration, this will either be a local bounce buffer
* or a pointer to the memory mapped in from the front-end domain for
* this request.
*/
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}
/**
* Given a page index and 512b sector offset within that page, calculate
* an offset into the local pseudo-physical address space used to map a
* front-end's request data into a request.
*
* \param reqlist The request list structure whose pseudo-physical region
* will be accessed.
* \param pagenr The page index used to compute the pseudo-physical offset.
* \param sector The 512b sector index used to compute the page relative
* pseudo-physical offset.
*
* \return The computed global pseudo-physical address.
*
* Depending on configuration, this will either be a local bounce buffer
* or a pointer to the memory mapped in from the front-end domain for
* this request.
*/
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
struct xbb_softc *xbb;
xbb = reqlist->xbb;
return ((uintptr_t)(xbb->gnt_base_addr +
(uintptr_t)(reqlist->kva - xbb->kva) +
(PAGE_SIZE * pagenr) + (sector << 9)));
}
/**
* Get Kernel Virtual Address space for mapping requests.
*
* \param xbb Per-instance xbb configuration structure.
* \param nr_pages Number of pages needed.
* \param check_only If set, check for free KVA but don't allocate it.
* \param have_lock If set, xbb lock is already held.
*
* \return On success, a pointer to the allocated KVA region. Otherwise NULL.
*
* Note: This should be unnecessary once we have either chaining or
* scatter/gather support for struct bio. At that point we'll be able to
* put multiple addresses and lengths in one bio/bio chain and won't need
* to map everything into one virtual segment.
*/
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
int first_clear;
int num_clear;
uint8_t *free_kva;
int i;
KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
first_clear = 0;
free_kva = NULL;
mtx_lock(&xbb->lock);
/*
* Look for the first available page. If there are none, we're done.
*/
bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
if (first_clear == -1)
goto bailout;
/*
* Starting at the first available page, look for consecutive free
* pages that will satisfy the user's request.
*/
for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
/*
* If this is true, the page is used, so we have to reset
* the number of clear pages and the first clear page
* (since it pointed to a region with an insufficient number
* of clear pages).
*/
if (bit_test(xbb->kva_free, i)) {
num_clear = 0;
first_clear = -1;
continue;
}
if (first_clear == -1)
first_clear = i;
/*
* If this is true, we've found a large enough free region
* to satisfy the request.
*/
if (++num_clear == nr_pages) {
bit_nset(xbb->kva_free, first_clear,
first_clear + nr_pages - 1);
free_kva = xbb->kva +
(uint8_t *)((intptr_t)first_clear * PAGE_SIZE);
KASSERT(free_kva >= (uint8_t *)xbb->kva &&
free_kva + (nr_pages * PAGE_SIZE) <=
(uint8_t *)xbb->ring_config.va,
("Free KVA %p len %d out of range, "
"kva = %#jx, ring VA = %#jx\n", free_kva,
nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
(uintmax_t)xbb->ring_config.va));
break;
}
}
bailout:
if (free_kva == NULL) {
xbb->flags |= XBBF_RESOURCE_SHORTAGE;
xbb->kva_shortages++;
}
mtx_unlock(&xbb->lock);
return (free_kva);
}
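/*
 * Example of the first-fit scan in xbb_get_kva() above: with
 * reqlist_kva_pages == 8, a free bitmap of 1 1 0 1 0 0 0 1 (1 == in use)
 * and nr_pages == 3, the single free page at index 2 is skipped and the
 * run at indices 4..6 is chosen and marked in use via bit_nset().
 */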
/**
* Free allocated KVA.
*
* \param xbb Per-instance xbb configuration structure.
* \param kva_ptr Pointer to allocated KVA region.
* \param nr_pages Number of pages in the KVA region.
*/
static void
xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
{
intptr_t start_page;
mtx_assert(&xbb->lock, MA_OWNED);
start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
}
/**
* Unmap the front-end pages associated with this I/O request.
*
* \param req The request structure to unmap.
*/
static void
xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
{
struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
u_int i;
u_int invcount;
int error;
invcount = 0;
for (i = 0; i < reqlist->nr_segments; i++) {
if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
continue;
unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
unmap[invcount].dev_bus_addr = 0;
unmap[invcount].handle = reqlist->gnt_handles[i];
reqlist->gnt_handles[i] = GRANT_REF_INVALID;
invcount++;
}
error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
unmap, invcount);
KASSERT(error == 0, ("Grant table operation failed"));
}
/**
* Allocate an internal transaction tracking structure from the free pool.
*
* \param xbb Per-instance xbb configuration structure.
*
* \return On success, a pointer to the allocated xbb_xen_reqlist structure.
* Otherwise NULL.
*/
static inline struct xbb_xen_reqlist *
xbb_get_reqlist(struct xbb_softc *xbb)
{
struct xbb_xen_reqlist *reqlist;
reqlist = NULL;
mtx_assert(&xbb->lock, MA_OWNED);
if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
reqlist->flags = XBB_REQLIST_NONE;
reqlist->kva = NULL;
reqlist->status = BLKIF_RSP_OKAY;
reqlist->residual_512b_sectors = 0;
reqlist->num_children = 0;
reqlist->nr_segments = 0;
STAILQ_INIT(&reqlist->contig_req_list);
}
return (reqlist);
}
/**
* Return an allocated transaction tracking structure to the free pool.
*
* \param xbb Per-instance xbb configuration structure.
* \param req The request list structure to free.
* \param wakeup If set, wakeup the work thread if freeing this reqlist
* during a resource shortage condition.
*/
static inline void
xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
int wakeup)
{
mtx_assert(&xbb->lock, MA_OWNED);
if (wakeup) {
wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
}
if (reqlist->kva != NULL)
xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
/*
* Shutdown is in progress. See if we can
* progress further now that one more request
* has completed and been returned to the
* free pool.
*/
xbb_shutdown(xbb);
}
if (wakeup != 0)
taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
/**
* Request resources and do basic request setup.
*
* \param xbb Per-instance xbb configuration structure.
* \param reqlist Pointer to reqlist pointer.
* \param ring_req Pointer to a block ring request.
* \param ring_index The ring index of this request.
*
* \return 0 for success, non-zero for failure.
*/
static int
xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
blkif_request_t *ring_req, RING_IDX ring_idx)
{
struct xbb_xen_reqlist *nreqlist;
struct xbb_xen_req *nreq;
nreqlist = NULL;
nreq = NULL;
mtx_lock(&xbb->lock);
/*
* We don't allow new resources to be allocated if we're in the
* process of shutting down.
*/
if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
mtx_unlock(&xbb->lock);
return (1);
}
/*
* Allocate a reqlist if the caller doesn't have one already.
*/
if (*reqlist == NULL) {
nreqlist = xbb_get_reqlist(xbb);
if (nreqlist == NULL)
goto bailout_error;
}
/* We always allocate a request. */
nreq = xbb_get_req(xbb);
if (nreq == NULL)
goto bailout_error;
mtx_unlock(&xbb->lock);
if (*reqlist == NULL) {
*reqlist = nreqlist;
nreqlist->operation = ring_req->operation;
nreqlist->starting_sector_number = ring_req->sector_number;
STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
links);
}
nreq->reqlist = *reqlist;
nreq->req_ring_idx = ring_idx;
nreq->id = ring_req->id;
nreq->operation = ring_req->operation;
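/*
* Compat-ABI requests arrive via a translated copy on the caller's
* stack, so preserve them in per-request storage; native requests
* can be referenced in place in the shared ring.
*/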
if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
nreq->ring_req = &nreq->ring_req_storage;
} else {
nreq->ring_req = ring_req;
}
binuptime(&nreq->ds_t0);
devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
(*reqlist)->num_children++;
(*reqlist)->nr_segments += ring_req->nr_segments;
return (0);
bailout_error:
/*
* We're out of resources, so set the shortage flag. The next time
* a request is released, we'll try waking up the work thread to
* see if we can allocate more resources.
*/
xbb->flags |= XBBF_RESOURCE_SHORTAGE;
xbb->request_shortages++;
if (nreq != NULL)
xbb_release_req(xbb, nreq);
if (nreqlist != NULL)
xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
mtx_unlock(&xbb->lock);
return (1);
}
/**
* Create and queue a response to a blkif request.
*
* \param xbb Per-instance xbb configuration structure.
* \param req The request structure to which to respond.
* \param status The status code to report. See BLKIF_RSP_*
* in sys/xen/interface/io/blkif.h.
*/
static void
xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
blkif_response_t *resp;
/*
* The mutex is required here, and should be held across this call
* until after the subsequent call to xbb_push_responses(). This
* is to guarantee that another context won't queue responses and
* push them while we're active.
*
* That could lead to the other end being notified of responses
* before the resources have been freed on this end. The other end
* would then be able to queue additional I/O, and we may run out
* of resources because we haven't freed them all yet.
*/
mtx_assert(&xbb->lock, MA_OWNED);
/*
* Place on the response ring for the relevant domain.
* For now, only the spacing between entries is different
* in the different ABIs, not the response entry layout.
*/
switch (xbb->abi) {
case BLKIF_PROTOCOL_NATIVE:
resp = RING_GET_RESPONSE(&xbb->rings.native,
xbb->rings.native.rsp_prod_pvt);
break;
case BLKIF_PROTOCOL_X86_32:
resp = (blkif_response_t *)
RING_GET_RESPONSE(&xbb->rings.x86_32,
xbb->rings.x86_32.rsp_prod_pvt);
break;
case BLKIF_PROTOCOL_X86_64:
resp = (blkif_response_t *)
RING_GET_RESPONSE(&xbb->rings.x86_64,
xbb->rings.x86_64.rsp_prod_pvt);
break;
default:
panic("Unexpected blkif protocol ABI.");
}
resp->id = req->id;
resp->operation = req->operation;
resp->status = status;
if (status != BLKIF_RSP_OKAY)
xbb->reqs_completed_with_error++;
xbb->rings.common.rsp_prod_pvt++;
xbb->reqs_queued_for_completion++;
}
/**
* Send queued responses to blkif requests.
*
* \param xbb Per-instance xbb configuration structure.
* \param run_taskqueue Flag that is set to 1 if the taskqueue
* should be run, 0 if it does not need to be run.
* \param notify Flag that is set to 1 if the other end should be
* notified via irq, 0 if the other end should not be
* notified.
*/
static void
xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
{
int more_to_do;
/*
* The mutex is required here.
*/
mtx_assert(&xbb->lock, MA_OWNED);
more_to_do = 0;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
/*
* Tail check for pending requests. Allows frontend to avoid
* notifications if requests are already in flight (lower
* overheads and promotes batching).
*/
RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
more_to_do = 1;
}
xbb->reqs_completed += xbb->reqs_queued_for_completion;
xbb->reqs_queued_for_completion = 0;
*run_taskqueue = more_to_do;
}
/**
* Complete a request list.
*
* \param xbb Per-instance xbb configuration structure.
* \param reqlist Allocated internal request list structure.
*/
static void
xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
struct xbb_xen_req *nreq;
off_t sectors_sent;
int notify, run_taskqueue;
sectors_sent = 0;
if (reqlist->flags & XBB_REQLIST_MAPPED)
xbb_unmap_reqlist(reqlist);
mtx_lock(&xbb->lock);
/*
* All I/O is done, send the response. A lock is not necessary
* to protect the request list, because all requests have
* completed. Therefore this is the only context accessing this
* reqlist right now. However, in order to make sure that no one
* else queues responses onto the queue or pushes them to the other
* side while we're active, we need to hold the lock across the
* calls to xbb_queue_response() and xbb_push_responses().
*/
STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
off_t cur_sectors_sent;
/* Put this response on the ring, but don't push yet */
xbb_queue_response(xbb, nreq, reqlist->status);
/* We don't report bytes sent if there is an error. */
if (reqlist->status == BLKIF_RSP_OKAY)
cur_sectors_sent = nreq->nr_512b_sectors;
else
cur_sectors_sent = 0;
sectors_sent += cur_sectors_sent;
devstat_end_transaction(xbb->xbb_stats_in,
/*bytes*/cur_sectors_sent << 9,
reqlist->ds_tag_type,
reqlist->ds_trans_type,
/*now*/NULL,
/*then*/&nreq->ds_t0);
}
/*
* Take out any sectors not sent. If we wind up negative (which
* might happen if an error is reported as well as a residual), just
* report 0 sectors sent.
*/
sectors_sent -= reqlist->residual_512b_sectors;
if (sectors_sent < 0)
sectors_sent = 0;
devstat_end_transaction(xbb->xbb_stats,
/*bytes*/ sectors_sent << 9,
reqlist->ds_tag_type,
reqlist->ds_trans_type,
/*now*/NULL,
/*then*/&reqlist->ds_t0);
xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
xbb_push_responses(xbb, &run_taskqueue, &notify);
mtx_unlock(&xbb->lock);
if (run_taskqueue)
taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
if (notify)
xen_intr_signal(xbb->xen_intr_handle);
}
/**
* Completion handler for buffer I/O requests issued by the device
* backend driver.
*
* \param bio The buffer I/O request on which to perform completion
* processing.
*/
static void
xbb_bio_done(struct bio *bio)
{
struct xbb_softc *xbb;
struct xbb_xen_reqlist *reqlist;
reqlist = bio->bio_caller1;
xbb = reqlist->xbb;
reqlist->residual_512b_sectors += bio->bio_resid >> 9;
/*
* This is a bit imprecise. With aggregated I/O a single
* request list can contain multiple front-end requests and
* multiple bios may point to a single request. By carefully
* walking the request list, we could map residuals and errors
* back to the original front-end request, but the interface
* isn't sufficiently rich for us to properly report the error.
* So, we just treat the entire request list as having failed if an
* error occurs on any part. And, if an error occurs, we treat
* the amount of data transferred as 0.
*
* Residuals are reported on the overall aggregated device, but not
* on the individual requests, since we don't currently do the work
* to determine which front-end request a given residual applies to.
*/
if (bio->bio_error) {
DPRINTF("BIO returned error %d for operation on device %s\n",
bio->bio_error, xbb->dev_name);
reqlist->status = BLKIF_RSP_ERROR;
if (bio->bio_error == ENXIO
&& xenbus_get_state(xbb->dev) == XenbusStateConnected) {
/*
* Backend device has disappeared. Signal the
* front-end that we (the device proxy) want to
* go away.
*/
xenbus_set_state(xbb->dev, XenbusStateClosing);
}
}
#ifdef XBB_USE_BOUNCE_BUFFERS
if (bio->bio_cmd == BIO_READ) {
vm_offset_t kva_offset;
kva_offset = (vm_offset_t)bio->bio_data
- (vm_offset_t)reqlist->bounce;
memcpy((uint8_t *)reqlist->kva + kva_offset,
bio->bio_data, bio->bio_bcount);
}
#endif /* XBB_USE_BOUNCE_BUFFERS */
/*
* Decrement the pending count for the request list. When we're
* done with the requests, send status back for all of them.
*/
if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
xbb_complete_reqlist(xbb, reqlist);
g_destroy_bio(bio);
}
/**
* Parse a blkif request into an internal request structure and send
* it to the backend for processing.
*
* \param xbb Per-instance xbb configuration structure.
* \param reqlist Allocated internal request list structure.
*
* \return On success, 0. For resource shortages, non-zero.
*
* This routine performs the backend-common aspects of request parsing:
* compiling an internal request structure, parsing the S/G list (and
* any secondary ring requests in which it may reside), and mapping the
* front-end's I/O pages into our domain.
*/
static int
xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
struct xbb_sg *xbb_sg;
struct gnttab_map_grant_ref *map;
struct blkif_request_segment *sg;
struct blkif_request_segment *last_block_sg;
struct xbb_xen_req *nreq;
u_int nseg;
u_int seg_idx;
u_int block_segs;
int nr_sects;
int total_sects;
int operation;
uint8_t bio_flags;
int error;
reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
bio_flags = 0;
total_sects = 0;
nr_sects = 0;
/*
* First determine whether we have enough free KVA to satisfy this
* request list. If not, tell xbb_run_queue() so it can go to
* sleep until we have more KVA.
*/
reqlist->kva = NULL;
if (reqlist->nr_segments != 0) {
reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
if (reqlist->kva == NULL) {
/*
* If we're out of KVA, return ENOMEM.
*/
return (ENOMEM);
}
}
binuptime(&reqlist->ds_t0);
devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
switch (reqlist->operation) {
case BLKIF_OP_WRITE_BARRIER:
bio_flags |= BIO_ORDERED;
reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
/* FALLTHROUGH */
case BLKIF_OP_WRITE:
operation = BIO_WRITE;
reqlist->ds_trans_type = DEVSTAT_WRITE;
if ((xbb->flags & XBBF_READ_ONLY) != 0) {
DPRINTF("Attempt to write to read only device %s\n",
xbb->dev_name);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
break;
case BLKIF_OP_READ:
operation = BIO_READ;
reqlist->ds_trans_type = DEVSTAT_READ;
break;
case BLKIF_OP_FLUSH_DISKCACHE:
/*
* If the user has requested that we disable flush support,
* just complete the requests successfully.
*/
if (xbb->disable_flush != 0) {
goto send_response;
}
/*
* The user has requested that we only send a real flush
* for every N flush requests. So keep count, and either
* complete the request immediately or queue it for the
* backend.
*/
if (xbb->flush_interval != 0) {
if (++(xbb->flush_count) < xbb->flush_interval) {
goto send_response;
} else
xbb->flush_count = 0;
}
operation = BIO_FLUSH;
reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
reqlist->ds_trans_type = DEVSTAT_NO_DATA;
goto do_dispatch;
/*NOTREACHED*/
default:
DPRINTF("error: unknown block io operation [%d]\n",
reqlist->operation);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
reqlist->xbb = xbb;
xbb_sg = xbb->xbb_sgs;
map = xbb->maps;
seg_idx = 0;
STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
blkif_request_t *ring_req;
RING_IDX req_ring_idx;
u_int req_seg_idx;
ring_req = nreq->ring_req;
req_ring_idx = nreq->req_ring_idx;
nr_sects = 0;
nseg = ring_req->nr_segments;
nreq->nr_pages = nseg;
nreq->nr_512b_sectors = 0;
req_seg_idx = 0;
sg = NULL;
/* Check that number of segments is sane. */
if (__predict_false(nseg == 0)
|| __predict_false(nseg > xbb->max_request_segments)) {
DPRINTF("Bad number of segments in request (%d)\n",
nseg);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
block_segs = nseg;
sg = ring_req->seg;
last_block_sg = sg + block_segs;
while (sg < last_block_sg) {
KASSERT(seg_idx <
XBB_MAX_SEGMENTS_PER_REQLIST,
("seg_idx %d is too large, max "
"segs %d\n", seg_idx,
XBB_MAX_SEGMENTS_PER_REQLIST));
xbb_sg->first_sect = sg->first_sect;
xbb_sg->last_sect = sg->last_sect;
xbb_sg->nsect =
(int8_t)(sg->last_sect -
sg->first_sect + 1);
if ((sg->last_sect >= (PAGE_SIZE >> 9))
|| (xbb_sg->nsect <= 0)) {
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
nr_sects += xbb_sg->nsect;
map->host_addr = xbb_get_gntaddr(reqlist,
seg_idx, /*sector*/0);
KASSERT(map->host_addr + PAGE_SIZE <=
xbb->ring_config.gnt_addr,
("Host address %#jx len %d overlaps "
"ring address %#jx\n",
(uintmax_t)map->host_addr, PAGE_SIZE,
(uintmax_t)xbb->ring_config.gnt_addr));
map->flags = GNTMAP_host_map;
map->ref = sg->gref;
map->dom = xbb->otherend_id;
if (operation == BIO_WRITE)
map->flags |= GNTMAP_readonly;
sg++;
map++;
xbb_sg++;
seg_idx++;
req_seg_idx++;
}
/* Convert to the disk's sector size */
nreq->nr_512b_sectors = nr_sects;
nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
total_sects += nr_sects;
if ((nreq->nr_512b_sectors &
((xbb->sector_size >> 9) - 1)) != 0) {
device_printf(xbb->dev, "%s: I/O size (%d) is not "
"a multiple of the backing store sector "
"size (%d)\n", __func__,
nreq->nr_512b_sectors << 9,
xbb->sector_size);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
}
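/*
* Map all of the front-end pages for this request list with a single
* batched grant-table hypercall.
*/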
error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
xbb->maps, reqlist->nr_segments);
if (error != 0)
panic("Grant table operation failed (%d)", error);
reqlist->flags |= XBB_REQLIST_MAPPED;
for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
seg_idx++, map++) {
if (__predict_false(map->status != 0)) {
DPRINTF("invalid buffer -- could not remap "
"it (%d)\n", map->status);
DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
"0x%x ref 0x%x, dom %d\n", seg_idx,
map->host_addr, map->flags, map->ref,
map->dom);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
reqlist->gnt_handles[seg_idx] = map->handle;
}
if (reqlist->starting_sector_number + total_sects >
xbb->media_num_sectors) {
DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
"extends past end of device %s\n",
operation == BIO_READ ? "read" : "write",
reqlist->starting_sector_number,
reqlist->starting_sector_number + total_sects,
xbb->dev_name);
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
do_dispatch:
error = xbb->dispatch_io(xbb,
reqlist,
operation,
bio_flags);
if (error != 0) {
reqlist->status = BLKIF_RSP_ERROR;
goto send_response;
}
return (0);
send_response:
xbb_complete_reqlist(xbb, reqlist);
return (0);
}
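/**
* Count the number of 512 byte sectors spanned by a ring request's
* segment list, stopping at the first malformed segment.
*
* \param ring_req The front-end request whose segments are summed.
*
* \return The total 512 byte sector count described by the request.
*/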
static __inline int
xbb_count_sects(blkif_request_t *ring_req)
{
int i;
int cur_size = 0;
for (i = 0; i < ring_req->nr_segments; i++) {
int nsect;
nsect = (int8_t)(ring_req->seg[i].last_sect -
ring_req->seg[i].first_sect + 1);
if (nsect <= 0)
break;
cur_size += nsect;
}
return (cur_size);
}
/**
* Process incoming requests from the shared communication ring in response
* to a signal on the ring's event channel.
*
* \param context Callback argument registered during task initialization -
* the xbb_softc for this instance.
* \param pending The number of taskqueue_enqueue events that have
* occurred since this handler was last run.
*/
static void
xbb_run_queue(void *context, int pending)
{
struct xbb_softc *xbb;
blkif_back_rings_t *rings;
RING_IDX rp;
uint64_t cur_sector;
int cur_operation;
struct xbb_xen_reqlist *reqlist;
xbb = (struct xbb_softc *)context;
rings = &xbb->rings;
/*
* Work gather and dispatch loop. Note that we have a bias here
* towards gathering I/O sent by blockfront. We first gather up
* everything in the ring, as long as we have resources. Then we
* dispatch one request, and then attempt to gather up any
* additional requests that have come in while we were dispatching
* the request.
*
* This allows us to get a clearer picture (via devstat) of how
* many requests blockfront is queueing to us at any given time.
*/
for (;;) {
int retval;
/*
* Initialize reqlist to the last element in the pending
* queue, if there is one. This allows us to add more
* requests to that request list, if we have room.
*/
reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
xbb_xen_reqlist, links);
if (reqlist != NULL) {
cur_sector = reqlist->next_contig_sector;
cur_operation = reqlist->operation;
} else {
cur_operation = 0;
cur_sector = 0;
}
/*
* Cache req_prod to avoid accessing a cache line shared
* with the frontend.
*/
rp = rings->common.sring->req_prod;
/* Ensure we see queued requests up to 'rp'. */
rmb();
/**
* Run so long as there is work to consume and the generation
* of a response will not overflow the ring.
*
* @note There's a 1 to 1 relationship between requests and
* responses, so an overflow should never occur. This
* test is to protect our domain from digesting bogus
* data. Shouldn't we log this?
*/
while (rings->common.req_cons != rp
&& RING_REQUEST_CONS_OVERFLOW(&rings->common,
rings->common.req_cons) == 0) {
blkif_request_t ring_req_storage;
blkif_request_t *ring_req;
int cur_size;
switch (xbb->abi) {
case BLKIF_PROTOCOL_NATIVE:
ring_req = RING_GET_REQUEST(&xbb->rings.native,
rings->common.req_cons);
break;
case BLKIF_PROTOCOL_X86_32:
{
struct blkif_x86_32_request *ring_req32;
ring_req32 = RING_GET_REQUEST(
&xbb->rings.x86_32, rings->common.req_cons);
blkif_get_x86_32_req(&ring_req_storage,
ring_req32);
ring_req = &ring_req_storage;
break;
}
case BLKIF_PROTOCOL_X86_64:
{
struct blkif_x86_64_request *ring_req64;
ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
rings->common.req_cons);
blkif_get_x86_64_req(&ring_req_storage,
ring_req64);
ring_req = &ring_req_storage;
break;
}
default:
panic("Unexpected blkif protocol ABI.");
/* NOTREACHED */
}
/*
* Check for situations that would require closing
* off this I/O for further coalescing:
* - Coalescing is turned off.
* - Current I/O is out of sequence with the previous
* I/O.
* - Coalesced I/O would be too large.
*/
if ((reqlist != NULL)
&& ((xbb->no_coalesce_reqs != 0)
|| ((xbb->no_coalesce_reqs == 0)
&& ((ring_req->sector_number != cur_sector)
|| (ring_req->operation != cur_operation)
|| ((ring_req->nr_segments + reqlist->nr_segments) >
xbb->max_reqlist_segments))))) {
reqlist = NULL;
}
/*
* Grab and check for all resources in one shot.
* If we can't get all of the resources we need,
* the shortage is noted and the thread will get
* woken up when more resources are available.
*/
retval = xbb_get_resources(xbb, &reqlist, ring_req,
xbb->rings.common.req_cons);
if (retval != 0) {
/*
* Resource shortage has been recorded.
* We'll be scheduled to run once a request
* object frees up due to a completion.
*/
break;
}
/*
* Signify that we can overwrite this request with
* a response by incrementing our consumer index.
* The response won't be generated until after
* we've already consumed all necessary data out
* of the version of the request in the ring buffer
* (for native mode). We must update the consumer
* index before issuing back-end I/O so there is
* no possibility that it will complete and a
* response be generated before we make room in
* the queue for that response.
*/
xbb->rings.common.req_cons++;
xbb->reqs_received++;
cur_size = xbb_count_sects(ring_req);
cur_sector = ring_req->sector_number + cur_size;
reqlist->next_contig_sector = cur_sector;
cur_operation = ring_req->operation;
}
/* Check for I/O to dispatch */
reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
if (reqlist == NULL) {
/*
* We're out of work to do, put the task queue to
* sleep.
*/
break;
}
/*
* Grab the first request off the queue and attempt
* to dispatch it.
*/
STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
retval = xbb_dispatch_io(xbb, reqlist);
if (retval != 0) {
/*
* xbb_dispatch_io() returns non-zero only when
* there is a resource shortage. If that's the
* case, re-queue this request on the head of the
* queue, and go to sleep until we have more
* resources.
*/
STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
reqlist, links);
break;
} else {
/*
* If we still have anything on the queue after
* removing the head entry, that is because we
* met one of the criteria to create a new
* request list (outlined above), and we'll call
* that a forced dispatch for statistical purposes.
*
* Otherwise, if there is only one element on the
* queue, we coalesced everything available on
* the ring and we'll call that a normal dispatch.
*/
reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
if (reqlist != NULL)
xbb->forced_dispatch++;
else
xbb->normal_dispatch++;
xbb->total_dispatch++;
}
}
}
/**
* Interrupt handler bound to the shared ring's event channel.
*
* \param arg Callback argument registered during event channel
* binding - the xbb_softc for this instance.
*/
static int
xbb_filter(void *arg)
{
struct xbb_softc *xbb;
/* Defer to taskqueue thread. */
xbb = (struct xbb_softc *)arg;
taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
return (FILTER_HANDLED);
}
SDT_PROVIDER_DEFINE(xbb);
SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
"uint64_t");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
"uint64_t", "uint64_t");
/*----------------------------- Backend Handlers -----------------------------*/
/**
* Backend handler for character device access.
*
* \param xbb Per-instance xbb configuration structure.
* \param reqlist Allocated internal request list structure.
* \param operation BIO_* I/O operation code.
* \param bio_flags Additional bio_flag data to pass to any generated
* bios (e.g. BIO_ORDERED).
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
int operation, int bio_flags)
{
struct xbb_dev_data *dev_data;
struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
off_t bio_offset;
struct bio *bio;
struct xbb_sg *xbb_sg;
u_int nbio;
u_int bio_idx;
u_int nseg;
u_int seg_idx;
int error;
dev_data = &xbb->backend.dev;
bio_offset = (off_t)reqlist->starting_sector_number
<< xbb->sector_size_shift;
error = 0;
nbio = 0;
bio_idx = 0;
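/*
* A cache flush carries no data, so it is issued as a single
* ordered BIO_FLUSH to the backing device.
*/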
if (operation == BIO_FLUSH) {
bio = g_new_bio();
if (__predict_false(bio == NULL)) {
DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
error = ENOMEM;
return (error);
}
bio->bio_cmd = BIO_FLUSH;
bio->bio_flags |= BIO_ORDERED;
bio->bio_dev = dev_data->cdev;
bio->bio_offset = 0;
bio->bio_data = 0;
bio->bio_done = xbb_bio_done;
bio->bio_caller1 = reqlist;
bio->bio_pblkno = 0;
reqlist->pendcnt = 1;
SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
device_get_unit(xbb->dev));
(*dev_data->csw->d_strategy)(bio);
return (0);
}
xbb_sg = xbb->xbb_sgs;
bio = NULL;
nseg = reqlist->nr_segments;
for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
/*
* KVA will not be contiguous, so any additional
* I/O will need to be represented in a new bio.
*/
if ((bio != NULL)
&& (xbb_sg->first_sect != 0)) {
if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
printf("%s: Discontiguous I/O request "
"from domain %d ends on "
"non-sector boundary\n",
__func__, xbb->otherend_id);
error = EINVAL;
goto fail_free_bios;
}
bio = NULL;
}
if (bio == NULL) {
/*
* Make sure that the start of this bio is
* aligned to a device sector.
*/
if ((bio_offset & (xbb->sector_size - 1)) != 0) {
printf("%s: Misaligned I/O request "
"from domain %d\n", __func__,
xbb->otherend_id);
error = EINVAL;
goto fail_free_bios;
}
bio = bios[nbio++] = g_new_bio();
if (__predict_false(bio == NULL)) {
error = ENOMEM;
goto fail_free_bios;
}
bio->bio_cmd = operation;
bio->bio_flags |= bio_flags;
bio->bio_dev = dev_data->cdev;
bio->bio_offset = bio_offset;
bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx,
xbb_sg->first_sect);
bio->bio_done = xbb_bio_done;
bio->bio_caller1 = reqlist;
bio->bio_pblkno = bio_offset >> xbb->sector_size_shift;
}
bio->bio_length += xbb_sg->nsect << 9;
bio->bio_bcount = bio->bio_length;
bio_offset += xbb_sg->nsect << 9;
if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
printf("%s: Discontiguous I/O request "
"from domain %d ends on "
"non-sector boundary\n",
__func__, xbb->otherend_id);
error = EINVAL;
goto fail_free_bios;
}
/*
* KVA will not be contiguous, so any additional
* I/O will need to be represented in a new bio.
*/
bio = NULL;
}
}
reqlist->pendcnt = nbio;
for (bio_idx = 0; bio_idx < nbio; bio_idx++)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
vm_offset_t kva_offset;
kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
- (vm_offset_t)reqlist->bounce;
if (operation == BIO_WRITE) {
memcpy(bios[bio_idx]->bio_data,
(uint8_t *)reqlist->kva + kva_offset,
bios[bio_idx]->bio_bcount);
}
#endif
if (operation == BIO_READ) {
SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
device_get_unit(xbb->dev),
bios[bio_idx]->bio_offset,
bios[bio_idx]->bio_length);
} else if (operation == BIO_WRITE) {
SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
device_get_unit(xbb->dev),
bios[bio_idx]->bio_offset,
bios[bio_idx]->bio_length);
}
(*dev_data->csw->d_strategy)(bios[bio_idx]);
}
return (error);
fail_free_bios:
for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
g_destroy_bio(bios[bio_idx]);
return (error);
}
SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
"uint64_t");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
"uint64_t", "uint64_t");
/**
* Backend handler for file access.
*
* \param xbb Per-instance xbb configuration structure.
* \param reqlist Allocated internal request list.
* \param operation BIO_* I/O operation code.
* \param flags Additional bio_flag data to pass to any generated bios
* (e.g. BIO_ORDERED).
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
int operation, int flags)
{
struct xbb_file_data *file_data;
u_int seg_idx;
u_int nseg;
struct uio xuio;
struct xbb_sg *xbb_sg;
struct iovec *xiovec;
#ifdef XBB_USE_BOUNCE_BUFFERS
void **p_vaddr;
int saved_uio_iovcnt;
#endif /* XBB_USE_BOUNCE_BUFFERS */
int error;
file_data = &xbb->backend.file;
error = 0;
bzero(&xuio, sizeof(xuio));
switch (operation) {
case BIO_READ:
xuio.uio_rw = UIO_READ;
break;
case BIO_WRITE:
xuio.uio_rw = UIO_WRITE;
break;
case BIO_FLUSH: {
struct mount *mountpoint;
SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
device_get_unit(xbb->dev));
(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
VOP_UNLOCK(xbb->vn);
vn_finished_write(mountpoint);
goto bailout_send_response;
/* NOTREACHED */
}
default:
panic("invalid operation %d", operation);
/* NOTREACHED */
}
xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
<< xbb->sector_size_shift;
xuio.uio_segflg = UIO_SYSSPACE;
xuio.uio_iov = file_data->xiovecs;
xuio.uio_iovcnt = 0;
xbb_sg = xbb->xbb_sgs;
nseg = reqlist->nr_segments;
for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
/*
* If the first sector is not 0, the KVA will
* not be contiguous and we'll need to start
* a new iovec.
*/
if (xbb_sg->first_sect != 0)
xiovec = NULL;
if (xiovec == NULL) {
xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
seg_idx, xbb_sg->first_sect);
#ifdef XBB_USE_BOUNCE_BUFFERS
/*
* Store the address of the incoming
* buffer at this particular offset
* as well, so we can do the copy
* later without having to do more
* work to recalculate this address.
*/
p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
*p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
xbb_sg->first_sect);
#endif /* XBB_USE_BOUNCE_BUFFERS */
xiovec->iov_len = 0;
xuio.uio_iovcnt++;
}
xiovec->iov_len += xbb_sg->nsect << 9;
xuio.uio_resid += xbb_sg->nsect << 9;
/*
* If the segment does not extend to the end of
* the page, the next segment will not be
* contiguous in KVA and we need a new iovec.
*/
if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
xiovec = NULL;
}
xuio.uio_td = curthread;
#ifdef XBB_USE_BOUNCE_BUFFERS
saved_uio_iovcnt = xuio.uio_iovcnt;
if (operation == BIO_WRITE) {
/* Copy the write data to the local buffer. */
for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
seg_idx++, xiovec++, p_vaddr++) {
memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
}
} else {
/*
* We only need to save off the iovecs in the case of a
* read, because the copy for the read happens after the
* VOP_READ(). (The uio will get modified in that call
* sequence.)
*/
memcpy(file_data->saved_xiovecs, xuio.uio_iov,
xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
}
#endif /* XBB_USE_BOUNCE_BUFFERS */
switch (operation) {
case BIO_READ:
SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
device_get_unit(xbb->dev), xuio.uio_offset,
xuio.uio_resid);
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
/*
* UFS pays attention to IO_DIRECT for reads. If the
* DIRECTIO option is configured into the kernel, it calls
* ffs_rawread(). But that only works for single-segment
* uios with user space addresses. In our case, with a
* kernel uio, it still reads into the buffer cache, but it
* will just try to release the buffer from the cache later
* on in ffs_read().
*
* ZFS does not pay attention to IO_DIRECT for reads.
*
* UFS does not pay attention to IO_SYNC for reads.
*
* ZFS pays attention to IO_SYNC (which translates into the
* Solaris define FRSYNC for zfs_read()) for reads. It
* attempts to sync the file before reading.
*
* So, to attempt to provide some barrier semantics in the
* BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
*/
error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
(IO_DIRECT|IO_SYNC) : 0, file_data->cred);
VOP_UNLOCK(xbb->vn);
break;
case BIO_WRITE: {
struct mount *mountpoint;
SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
device_get_unit(xbb->dev), xuio.uio_offset,
xuio.uio_resid);
(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
/*
* UFS pays attention to IO_DIRECT for writes. The write
* is done asynchronously. (Normally the write would just
* get put into the cache.)
*
* UFS pays attention to IO_SYNC for writes. It will
* attempt to write the buffer out synchronously if that
* flag is set.
*
* ZFS does not pay attention to IO_DIRECT for writes.
*
* ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
* for writes. It will flush the transaction from the
* cache before returning.
*
* So if we've got the BIO_ORDERED flag set, we want
* IO_SYNC in either the UFS or ZFS case.
*/
error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
IO_SYNC : 0, file_data->cred);
VOP_UNLOCK(xbb->vn);
vn_finished_write(mountpoint);
break;
}
default:
panic("invalid operation %d", operation);
/* NOTREACHED */
}
#ifdef XBB_USE_BOUNCE_BUFFERS
/* We only need to copy here for read operations */
if (operation == BIO_READ) {
for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
xiovec = file_data->saved_xiovecs;
seg_idx < saved_uio_iovcnt; seg_idx++,
xiovec++, p_vaddr++) {
/*
* Note that we have to use the copy of the
* io vector we made above. uiomove() modifies
* the uio and its referenced vector as uiomove
* performs the copy, so we can't rely on any
* state from the original uio.
*/
memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
}
}
#endif /* XBB_USE_BOUNCE_BUFFERS */
bailout_send_response:
if (error != 0)
reqlist->status = BLKIF_RSP_ERROR;
xbb_complete_reqlist(xbb, reqlist);
return (0);
}
/*--------------------------- Backend Configuration --------------------------*/
/**
* Close and cleanup any backend device/file specific state for this
* block back instance.
*
* \param xbb Per-instance xbb configuration structure.
*/
static void
xbb_close_backend(struct xbb_softc *xbb)
{
DROP_GIANT();
DPRINTF("closing dev=%s\n", xbb->dev_name);
if (xbb->vn) {
int flags = FREAD;
if ((xbb->flags & XBBF_READ_ONLY) == 0)
flags |= FWRITE;
switch (xbb->device_type) {
case XBB_TYPE_DISK:
if (xbb->backend.dev.csw) {
dev_relthread(xbb->backend.dev.cdev,
xbb->backend.dev.dev_ref);
xbb->backend.dev.csw = NULL;
xbb->backend.dev.cdev = NULL;
}
break;
case XBB_TYPE_FILE:
break;
case XBB_TYPE_NONE:
default:
panic("Unexpected backend type.");
break;
}
(void)vn_close(xbb->vn, flags, NOCRED, curthread);
xbb->vn = NULL;
switch (xbb->device_type) {
case XBB_TYPE_DISK:
break;
case XBB_TYPE_FILE:
if (xbb->backend.file.cred != NULL) {
crfree(xbb->backend.file.cred);
xbb->backend.file.cred = NULL;
}
break;
case XBB_TYPE_NONE:
default:
panic("Unexpected backend type.");
break;
}
}
PICKUP_GIANT();
}
/**
* Open a character device to be used for backend I/O.
*
* \param xbb Per-instance xbb configuration structure.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_open_dev(struct xbb_softc *xbb)
{
struct vattr vattr;
struct cdev *dev;
struct cdevsw *devsw;
int error;
xbb->device_type = XBB_TYPE_DISK;
xbb->dispatch_io = xbb_dispatch_dev;
xbb->backend.dev.cdev = xbb->vn->v_rdev;
xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
&xbb->backend.dev.dev_ref);
if (xbb->backend.dev.csw == NULL)
panic("Unable to retrieve device switch");
error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
if (error) {
xenbus_dev_fatal(xbb->dev, error, "error getting "
"vnode attributes for device %s",
xbb->dev_name);
return (error);
}
dev = xbb->vn->v_rdev;
devsw = dev->si_devsw;
if (!devsw->d_ioctl) {
xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
"device %s!", xbb->dev_name);
return (ENODEV);
}
error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
(caddr_t)&xbb->sector_size, FREAD,
curthread);
if (error) {
xenbus_dev_fatal(xbb->dev, error,
"error calling ioctl DIOCGSECTORSIZE "
"for device %s", xbb->dev_name);
return (error);
}
error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
(caddr_t)&xbb->media_size, FREAD,
curthread);
if (error) {
xenbus_dev_fatal(xbb->dev, error,
"error calling ioctl DIOCGMEDIASIZE "
"for device %s", xbb->dev_name);
return (error);
}
return (0);
}
/**
* Open a file to be used for backend I/O.
*
* \param xbb Per-instance xbb configuration structure.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_open_file(struct xbb_softc *xbb)
{
struct xbb_file_data *file_data;
struct vattr vattr;
int error;
file_data = &xbb->backend.file;
xbb->device_type = XBB_TYPE_FILE;
xbb->dispatch_io = xbb_dispatch_file;
error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"error calling VOP_GETATTR()"
"for file %s", xbb->dev_name);
return (error);
}
/*
* Verify that we have the ability to upgrade to exclusive
* access on this file so we can trap errors at open instead
* of reporting them during first access.
*/
if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
if (VN_IS_DOOMED(xbb->vn)) {
error = EBADF;
xenbus_dev_fatal(xbb->dev, error,
"error locking file %s",
xbb->dev_name);
return (error);
}
}
file_data->cred = crhold(curthread->td_ucred);
xbb->media_size = vattr.va_size;
/*
* XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
* With ZFS, it is 131072 bytes. Block sizes that large don't work
* with disklabel and UFS on FreeBSD at least. Large block sizes
* may not work with other OSes as well. So just export a sector
* size of 512 bytes, which should work with any OS or
* application. Since our backing is a file, any block size will
* work fine for the backing store.
*/
#if 0
xbb->sector_size = vattr.va_blocksize;
#endif
xbb->sector_size = 512;
/*
* Sanity check. The media size has to be at least one
* sector long.
*/
if (xbb->media_size < xbb->sector_size) {
error = EINVAL;
xenbus_dev_fatal(xbb->dev, error,
"file %s size %ju < block size %u",
xbb->dev_name,
(uintmax_t)xbb->media_size,
xbb->sector_size);
}
return (error);
}
/**
* Open the backend provider for this connection.
*
* \param xbb Per-instance xbb configuration structure.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_open_backend(struct xbb_softc *xbb)
{
struct nameidata nd;
int flags;
int error;
flags = FREAD;
error = 0;
DPRINTF("opening dev=%s\n", xbb->dev_name);
if (rootvnode == NULL) {
xenbus_dev_fatal(xbb->dev, ENOENT,
"Root file system not mounted");
return (ENOENT);
}
if ((xbb->flags & XBBF_READ_ONLY) == 0)
flags |= FWRITE;
pwd_ensure_dirs();
again:
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
error = vn_open(&nd, &flags, 0, NULL);
if (error) {
/*
* If the user doesn't give us a fully qualified path, prepending
* /dev/ is the only reasonable guess we can make.
* If they want to specify a file, they need to specify the
* full path.
*/
if (xbb->dev_name[0] != '/') {
char *dev_path = "/dev/";
char *dev_name;
/* Try adding device path at beginning of name */
dev_name = malloc(strlen(xbb->dev_name)
+ strlen(dev_path) + 1,
M_XENBLOCKBACK, M_NOWAIT);
if (dev_name) {
sprintf(dev_name, "%s%s", dev_path,
xbb->dev_name);
free(xbb->dev_name, M_XENBLOCKBACK);
xbb->dev_name = dev_name;
goto again;
}
}
xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
xbb->dev_name);
return (error);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
xbb->vn = nd.ni_vp;
/* We only support disks and files. */
if (vn_isdisk_error(xbb->vn, &error)) {
error = xbb_open_dev(xbb);
} else if (xbb->vn->v_type == VREG) {
error = xbb_open_file(xbb);
} else {
error = EINVAL;
xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
"or file", xbb->dev_name);
}
VOP_UNLOCK(xbb->vn);
if (error != 0) {
xbb_close_backend(xbb);
return (error);
}
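/*
* Derive the shift for the backend sector size; the sector size is
* assumed to be a power of two, so fls() - 1 yields its log2.
*/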
xbb->sector_size_shift = fls(xbb->sector_size) - 1;
xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
(xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
xbb->dev_name, xbb->sector_size, xbb->media_size);
return (0);
}
/*------------------------ Inter-Domain Communication ------------------------*/
/**
* Free dynamically allocated KVA or pseudo-physical address allocations.
*
* \param xbb Per-instance xbb configuration structure.
*/
static void
xbb_free_communication_mem(struct xbb_softc *xbb)
{
if (xbb->kva != 0) {
if (xbb->pseudo_phys_res != NULL) {
xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
xbb->pseudo_phys_res);
xbb->pseudo_phys_res = NULL;
}
}
xbb->kva = 0;
xbb->gnt_base_addr = 0;
if (xbb->kva_free != NULL) {
free(xbb->kva_free, M_XENBLOCKBACK);
xbb->kva_free = NULL;
}
}
/**
* Cleanup all inter-domain communication mechanisms.
*
* \param xbb Per-instance xbb configuration structure.
*/
static int
xbb_disconnect(struct xbb_softc *xbb)
{
struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
struct gnttab_unmap_grant_ref *op;
u_int ring_idx;
int error;
DPRINTF("\n");
if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
return (0);
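/*
* Drop the lock while unbinding the interrupt and draining the
* taskqueue; the I/O task acquires this same lock, so holding it
* across the drain could deadlock.
*/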
mtx_unlock(&xbb->lock);
xen_intr_unbind(&xbb->xen_intr_handle);
taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
mtx_lock(&xbb->lock);
/*
* No new interrupts can generate work, but we must wait
* for all currently active requests to drain.
*/
if (xbb->active_request_count != 0)
return (EAGAIN);
for (ring_idx = 0, op = ops;
ring_idx < xbb->ring_config.ring_pages;
ring_idx++, op++) {
op->host_addr = xbb->ring_config.gnt_addr
+ (ring_idx * PAGE_SIZE);
op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
op->handle = xbb->ring_config.handle[ring_idx];
}
error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
xbb->ring_config.ring_pages);
if (error != 0)
panic("Grant table op failed (%d)", error);
xbb_free_communication_mem(xbb);
if (xbb->requests != NULL) {
free(xbb->requests, M_XENBLOCKBACK);
xbb->requests = NULL;
}
if (xbb->request_lists != NULL) {
struct xbb_xen_reqlist *reqlist;
int i;
/* There is one request list for every allocated request. */
for (i = 0, reqlist = xbb->request_lists;
i < xbb->max_requests; i++, reqlist++) {
#ifdef XBB_USE_BOUNCE_BUFFERS
if (reqlist->bounce != NULL) {
free(reqlist->bounce, M_XENBLOCKBACK);
reqlist->bounce = NULL;
}
#endif
if (reqlist->gnt_handles != NULL) {
free(reqlist->gnt_handles, M_XENBLOCKBACK);
reqlist->gnt_handles = NULL;
}
}
free(xbb->request_lists, M_XENBLOCKBACK);
xbb->request_lists = NULL;
}
xbb->flags &= ~XBBF_RING_CONNECTED;
return (0);
}
/**
* Map shared memory ring into domain local address space, initialize
* ring control structures, and bind an interrupt to the event channel
* used to notify us of ring changes.
*
* \param xbb Per-instance xbb configuration structure.
*/
static int
xbb_connect_ring(struct xbb_softc *xbb)
{
struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
struct gnttab_map_grant_ref *gnt;
u_int ring_idx;
int error;
if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
return (0);
/*
* Kva for our ring is at the tail of the region of kva allocated
* by xbb_alloc_communication_mem().
*/
xbb->ring_config.va = xbb->kva
+ (xbb->kva_size
- (xbb->ring_config.ring_pages * PAGE_SIZE));
xbb->ring_config.gnt_addr = xbb->gnt_base_addr
+ (xbb->kva_size
- (xbb->ring_config.ring_pages * PAGE_SIZE));
for (ring_idx = 0, gnt = gnts;
ring_idx < xbb->ring_config.ring_pages;
ring_idx++, gnt++) {
gnt->host_addr = xbb->ring_config.gnt_addr
+ (ring_idx * PAGE_SIZE);
gnt->flags = GNTMAP_host_map;
gnt->ref = xbb->ring_config.ring_ref[ring_idx];
gnt->dom = xbb->otherend_id;
}
error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
xbb->ring_config.ring_pages);
if (error)
panic("blkback: Ring page grant table op failed (%d)", error);
for (ring_idx = 0, gnt = gnts;
ring_idx < xbb->ring_config.ring_pages;
ring_idx++, gnt++) {
if (gnt->status != 0) {
xbb->ring_config.va = 0;
xenbus_dev_fatal(xbb->dev, EACCES,
"Ring shared page mapping failed. "
"Status %d.", gnt->status);
return (EACCES);
}
xbb->ring_config.handle[ring_idx] = gnt->handle;
xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
}
/* Initialize the ring based on ABI. */
switch (xbb->abi) {
case BLKIF_PROTOCOL_NATIVE:
{
blkif_sring_t *sring;
sring = (blkif_sring_t *)xbb->ring_config.va;
BACK_RING_INIT(&xbb->rings.native, sring,
xbb->ring_config.ring_pages * PAGE_SIZE);
break;
}
case BLKIF_PROTOCOL_X86_32:
{
blkif_x86_32_sring_t *sring_x86_32;
sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
xbb->ring_config.ring_pages * PAGE_SIZE);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
blkif_x86_64_sring_t *sring_x86_64;
sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
xbb->ring_config.ring_pages * PAGE_SIZE);
break;
}
default:
panic("Unexpected blkif protocol ABI.");
}
xbb->flags |= XBBF_RING_CONNECTED;
error = xen_intr_bind_remote_port(xbb->dev,
xbb->otherend_id,
xbb->ring_config.evtchn,
xbb_filter,
/*ithread_handler*/NULL,
/*arg*/xbb,
INTR_TYPE_BIO | INTR_MPSAFE,
&xbb->xen_intr_handle);
if (error) {
(void)xbb_disconnect(xbb);
xenbus_dev_fatal(xbb->dev, error, "binding event channel");
return (error);
}
DPRINTF("rings connected!\n");
return (0);
}
/**
* Size KVA and pseudo-physical address allocations based on negotiated
* values for the size and number of I/O requests, and the size of our
* communication ring.
*
* \param xbb Per-instance xbb configuration structure.
*
* These address spaces are used to dynamically map pages in the
* front-end's domain into our own.
*/
static int
xbb_alloc_communication_mem(struct xbb_softc *xbb)
{
xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
xbb->kva_size = xbb->reqlist_kva_size +
(xbb->ring_config.ring_pages * PAGE_SIZE);
xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
if (xbb->kva_free == NULL)
return (ENOMEM);
DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
device_get_nameunit(xbb->dev), xbb->kva_size,
xbb->reqlist_kva_size);
/*
* Reserve a range of pseudo physical memory that we can map
* into kva. These pages will only be backed by machine
* pages ("real memory") during the lifetime of front-end requests
* via grant table operations.
*/
xbb->pseudo_phys_res_id = 0;
xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
xbb->kva_size);
if (xbb->pseudo_phys_res == NULL) {
xbb->kva = 0;
return (ENOMEM);
}
xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
(uintmax_t)xbb->gnt_base_addr);
return (0);
}
/**
* Collect front-end information from the XenStore.
*
* \param xbb Per-instance xbb configuration structure.
*/
static int
xbb_collect_frontend_info(struct xbb_softc *xbb)
{
char protocol_abi[64];
const char *otherend_path;
int error;
u_int ring_idx;
u_int ring_page_order;
size_t ring_size;
otherend_path = xenbus_get_otherend_path(xbb->dev);
/*
* Protocol defaults valid even if all negotiation fails.
*/
xbb->ring_config.ring_pages = 1;
xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
/*
* Mandatory data (used in all versions of the protocol) first.
*/
error = xs_scanf(XST_NIL, otherend_path,
"event-channel", NULL, "%" PRIu32,
&xbb->ring_config.evtchn);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"Unable to retrieve event-channel information "
"from frontend %s. Unable to connect.",
xenbus_get_otherend_path(xbb->dev));
return (error);
}
/*
* These fields are initialized to legacy protocol defaults
* so we only need to fail if reading the updated value succeeds
* and the new value is outside of its allowed range.
*
* \note xs_gather() returns on the first encountered error, so
* we must use independent calls in order to guarantee
* we don't miss information in a sparsely populated front-end
* tree.
*
* \note xs_scanf() does not update variables for unmatched
* fields.
*/
ring_page_order = 0;
xbb->max_requests = 32;
(void)xs_scanf(XST_NIL, otherend_path,
"ring-page-order", NULL, "%u",
&ring_page_order);
xbb->ring_config.ring_pages = 1 << ring_page_order;
ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
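/* Size the request pool to the number of slots the negotiated ring holds. */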
xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
xenbus_dev_fatal(xbb->dev, EINVAL,
"Front-end specified ring-pages of %u "
"exceeds backend limit of %u. "
"Unable to connect.",
xbb->ring_config.ring_pages,
XBB_MAX_RING_PAGES);
return (EINVAL);
}
if (xbb->ring_config.ring_pages == 1) {
error = xs_gather(XST_NIL, otherend_path,
"ring-ref", "%" PRIu32,
&xbb->ring_config.ring_ref[0],
NULL);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"Unable to retrieve ring information "
"from frontend %s. Unable to "
"connect.",
xenbus_get_otherend_path(xbb->dev));
return (error);
}
} else {
/* Multi-page ring format. */
for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
ring_idx++) {
char ring_ref_name[] = "ring_refXX";
snprintf(ring_ref_name, sizeof(ring_ref_name),
"ring-ref%u", ring_idx);
error = xs_scanf(XST_NIL, otherend_path,
ring_ref_name, NULL, "%" PRIu32,
&xbb->ring_config.ring_ref[ring_idx]);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"Failed to retrieve grant "
"reference for page %u of "
"shared ring. Unable "
"to connect.", ring_idx);
return (error);
}
}
}
error = xs_gather(XST_NIL, otherend_path,
"protocol", "%63s", protocol_abi,
NULL);
if (error != 0
|| !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
/*
* Assume native if the frontend has not published ABI
* data, or if the ABI it has published matches our own.
*/
xbb->abi = BLKIF_PROTOCOL_NATIVE;
} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
xbb->abi = BLKIF_PROTOCOL_X86_32;
} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
xbb->abi = BLKIF_PROTOCOL_X86_64;
} else {
xenbus_dev_fatal(xbb->dev, EINVAL,
"Unknown protocol ABI (%s) published by "
"frontend. Unable to connect.", protocol_abi);
return (EINVAL);
}
return (0);
}
/**
* Allocate per-request data structures given request size and number
* information negotiated with the front-end.
*
* \param xbb Per-instance xbb configuration structure.
*/
static int
xbb_alloc_requests(struct xbb_softc *xbb)
{
struct xbb_xen_req *req;
struct xbb_xen_req *last_req;
/*
* Allocate request bookkeeping data structures.
*/
xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
if (xbb->requests == NULL) {
xenbus_dev_fatal(xbb->dev, ENOMEM,
"Unable to allocate request structures");
return (ENOMEM);
}
req = xbb->requests;
last_req = &xbb->requests[xbb->max_requests - 1];
STAILQ_INIT(&xbb->request_free_stailq);
while (req <= last_req) {
STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
req++;
}
return (0);
}
static int
xbb_alloc_request_lists(struct xbb_softc *xbb)
{
struct xbb_xen_reqlist *reqlist;
int i;
/*
* If no requests can be merged, we need 1 request list per
* in-flight request.
*/
xbb->request_lists = malloc(xbb->max_requests *
sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
if (xbb->request_lists == NULL) {
xenbus_dev_fatal(xbb->dev, ENOMEM,
"Unable to allocate request list structures");
return (ENOMEM);
}
STAILQ_INIT(&xbb->reqlist_free_stailq);
STAILQ_INIT(&xbb->reqlist_pending_stailq);
for (i = 0; i < xbb->max_requests; i++) {
int seg;
reqlist = &xbb->request_lists[i];
reqlist->xbb = xbb;
#ifdef XBB_USE_BOUNCE_BUFFERS
reqlist->bounce = malloc(xbb->max_reqlist_size,
M_XENBLOCKBACK, M_NOWAIT);
if (reqlist->bounce == NULL) {
xenbus_dev_fatal(xbb->dev, ENOMEM,
"Unable to allocate request "
"bounce buffers");
return (ENOMEM);
}
#endif /* XBB_USE_BOUNCE_BUFFERS */
reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
sizeof(*reqlist->gnt_handles),
M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
if (reqlist->gnt_handles == NULL) {
xenbus_dev_fatal(xbb->dev, ENOMEM,
"Unable to allocate request "
"grant references");
return (ENOMEM);
}
for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
}
return (0);
}
/**
* Supply information about the physical device to the frontend
* via XenBus.
*
* \param xbb Per-instance xbb configuration structure.
*/
static int
xbb_publish_backend_info(struct xbb_softc *xbb)
{
struct xs_transaction xst;
const char *our_path;
const char *leaf;
int error;
our_path = xenbus_get_node(xbb->dev);
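/*
* Retry the XenStore transaction until it commits; xs_transaction_end()
* returns EAGAIN when the transaction must be restarted.
*/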
while (1) {
error = xs_transaction_start(&xst);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"Error publishing backend info "
"(start transaction)");
return (error);
}
leaf = "sectors";
error = xs_printf(xst, our_path, leaf,
"%"PRIu64, xbb->media_num_sectors);
if (error != 0)
break;
/* XXX Support all VBD attributes here. */
leaf = "info";
error = xs_printf(xst, our_path, leaf, "%u",
xbb->flags & XBBF_READ_ONLY
? VDISK_READONLY : 0);
if (error != 0)
break;
leaf = "sector-size";
error = xs_printf(xst, our_path, leaf, "%u",
xbb->sector_size);
if (error != 0)
break;
error = xs_transaction_end(xst, 0);
if (error == 0) {
return (0);
} else if (error != EAGAIN) {
xenbus_dev_fatal(xbb->dev, error, "ending transaction");
return (error);
}
}
xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
our_path, leaf);
xs_transaction_end(xst, 1);
return (error);
}
/**
* Connect to our blkfront peer now that it has completed publishing
* its configuration into the XenStore.
*
* \param xbb Per-instance xbb configuration structure.
*/
static void
xbb_connect(struct xbb_softc *xbb)
{
int error;
if (!xbb->hotplug_done ||
(xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
(xbb_collect_frontend_info(xbb) != 0))
return;
xbb->flags &= ~XBBF_SHUTDOWN;
/*
* We limit the maximum number of reqlist segments to the maximum
* number of segments in the ring, or our absolute maximum,
* whichever is smaller.
*/
xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
/*
* The maximum size is simply a function of the number of segments
* we can handle.
*/
xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
/* Allocate resources whose size depends on front-end configuration. */
error = xbb_alloc_communication_mem(xbb);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
"Unable to allocate communication memory");
return;
}
error = xbb_alloc_requests(xbb);
if (error != 0) {
/* Specific errors are reported by xbb_alloc_requests(). */
return;
}
error = xbb_alloc_request_lists(xbb);
if (error != 0) {
/* Specific errors are reported by xbb_alloc_request_lists(). */
return;
}
/*
* Connect communication channel.
*/
error = xbb_connect_ring(xbb);
if (error != 0) {
/* Specific errors are reported by xbb_connect_ring(). */
return;
}
if (xbb_publish_backend_info(xbb) != 0) {
/*
* If we can't publish our data, we cannot participate
* in this connection, and waiting for a front-end state
* change will not help the situation.
*/
(void)xbb_disconnect(xbb);
return;
}
/* Ready for I/O. */
xenbus_set_state(xbb->dev, XenbusStateConnected);
}
/*-------------------------- Device Teardown Support -------------------------*/
/**
* Perform device shutdown functions.
*
* \param xbb Per-instance xbb configuration structure.
*
* Mark this instance as shutting down, wait for any active I/O on the
* backend device/file to drain, disconnect from the front-end, and notify
* any waiters (e.g. a thread invoking our detach method) that detach can
* now proceed.
*/
static int
xbb_shutdown(struct xbb_softc *xbb)
{
XenbusState frontState;
int error;
DPRINTF("\n");
/*
* Due to the need to drop our mutex during some
* xenbus operations, it is possible for two threads
* to attempt to close out shutdown processing at
* the same time. Tell the caller that hits this
* race to try back later.
*/
if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
return (EAGAIN);
xbb->flags |= XBBF_IN_SHUTDOWN;
mtx_unlock(&xbb->lock);
if (xbb->hotplug_watch.node != NULL) {
xs_unregister_watch(&xbb->hotplug_watch);
free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
xbb->hotplug_watch.node = NULL;
}
xbb->hotplug_done = false;
if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
xenbus_set_state(xbb->dev, XenbusStateClosing);
frontState = xenbus_get_otherend_state(xbb->dev);
mtx_lock(&xbb->lock);
xbb->flags &= ~XBBF_IN_SHUTDOWN;
/* Wait for the frontend to disconnect (if it's connected). */
if (frontState == XenbusStateConnected)
return (EAGAIN);
DPRINTF("\n");
/* Indicate shutdown is in progress. */
xbb->flags |= XBBF_SHUTDOWN;
/* Disconnect from the front-end. */
error = xbb_disconnect(xbb);
if (error != 0) {
/*
* Requests still outstanding. We'll be called again
* once they complete.
*/
KASSERT(error == EAGAIN,
("%s: Unexpected xbb_disconnect() failure %d",
__func__, error));
return (error);
}
DPRINTF("\n");
/* Indicate to xbb_detach() that it is safe to proceed. */
wakeup(xbb);
return (0);
}
/**
* Report an attach time error to the console and Xen, and cleanup
* this instance by forcing immediate detach processing.
*
* \param xbb Per-instance xbb configuration structure.
* \param err Errno describing the error.
* \param fmt Printf style format and arguments
*/
static void
xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
{
va_list ap;
va_list ap_hotplug;
va_start(ap, fmt);
va_copy(ap_hotplug, ap);
xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
"hotplug-error", fmt, ap_hotplug);
va_end(ap_hotplug);
xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"hotplug-status", "error");
xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
va_end(ap);
xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"online", "0");
mtx_lock(&xbb->lock);
xbb_shutdown(xbb);
mtx_unlock(&xbb->lock);
}
/*---------------------------- NewBus Entrypoints ----------------------------*/
/**
* Inspect a XenBus device and claim it if is of the appropriate type.
*
* \param dev NewBus device object representing a candidate XenBus device.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_probe(device_t dev)
{
if (!strcmp(xenbus_get_type(dev), "vbd")) {
device_set_desc(dev, "Backend Virtual Block Device");
device_quiet(dev);
return (0);
}
return (ENXIO);
}
/**
* Setup sysctl variables to control various Block Back parameters.
*
* \param xbb Xen Block Back softc.
*
*/
static void
xbb_setup_sysctl(struct xbb_softc *xbb)
{
struct sysctl_ctx_list *sysctl_ctx = NULL;
struct sysctl_oid *sysctl_tree = NULL;
sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
if (sysctl_ctx == NULL)
return;
sysctl_tree = device_get_sysctl_tree(xbb->dev);
if (sysctl_tree == NULL)
return;
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
"fake the flush command");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
"send a real flush for N flush requests");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
"Don't coalesce contiguous requests");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"reqs_received", CTLFLAG_RW, &xbb->reqs_received,
"how many I/O requests we have received");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
"how many I/O requests have been completed");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"reqs_queued_for_completion", CTLFLAG_RW,
&xbb->reqs_queued_for_completion,
"how many I/O requests queued but not yet pushed");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"reqs_completed_with_error", CTLFLAG_RW,
&xbb->reqs_completed_with_error,
"how many I/O requests completed with error status");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
"how many I/O dispatches were forced");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
"how many I/O dispatches were normal");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
"total number of I/O dispatches");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
"how many times we have run out of KVA");
SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"request_shortages", CTLFLAG_RW,
&xbb->request_shortages,
"how many times we have run out of requests");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
"maximum outstanding requests (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"max_request_segments", CTLFLAG_RD,
&xbb->max_request_segments, 0,
"maximum number of pages per requests (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"max_request_size", CTLFLAG_RD,
&xbb->max_request_size, 0,
"maximum size in bytes of a request (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"ring_pages", CTLFLAG_RD,
&xbb->ring_config.ring_pages, 0,
"communication channel pages (negotiated)");
}
static void
xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len)
{
device_t dev;
struct xbb_softc *xbb;
int error;
dev = (device_t) watch->callback_data;
xbb = device_get_softc(dev);
error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path",
NULL, &xbb->dev_name, NULL);
if (error != 0)
return;
xs_unregister_watch(watch);
free(watch->node, M_XENBLOCKBACK);
watch->node = NULL;
/* Collect physical device information. */
error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
"device-type", NULL, &xbb->dev_type,
NULL);
if (error != 0)
xbb->dev_type = NULL;
error = xs_gather(XST_NIL, xenbus_get_node(dev),
"mode", NULL, &xbb->dev_mode,
NULL);
if (error != 0) {
xbb_attach_failed(xbb, error, "reading backend fields at %s",
xenbus_get_node(dev));
return;
}
/* Parse fopen style mode flags. */
if (strchr(xbb->dev_mode, 'w') == NULL)
xbb->flags |= XBBF_READ_ONLY;
/*
* Verify the physical device is present and can support
* the desired I/O mode.
*/
error = xbb_open_backend(xbb);
if (error != 0) {
xbb_attach_failed(xbb, error, "Unable to open %s",
xbb->dev_name);
return;
}
/* Use devstat(9) for recording statistics. */
xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
xbb->sector_size,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT
| DEVSTAT_TYPE_IF_OTHER,
DEVSTAT_PRIORITY_OTHER);
xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
xbb->sector_size,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT
| DEVSTAT_TYPE_IF_OTHER,
DEVSTAT_PRIORITY_OTHER);
/*
* Setup sysctl variables.
*/
xbb_setup_sysctl(xbb);
/*
* Create a taskqueue for doing work that must occur from a
* thread context.
*/
xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
M_NOWAIT,
taskqueue_thread_enqueue,
/*context*/&xbb->io_taskqueue);
if (xbb->io_taskqueue == NULL) {
xbb_attach_failed(xbb, error, "Unable to create taskqueue");
return;
}
taskqueue_start_threads(&xbb->io_taskqueue,
/*num threads*/1,
/*priority*/PWAIT,
/*thread name*/
"%s taskq", device_get_nameunit(dev));
/* Update hot-plug status to satisfy xend. */
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"hotplug-status", "connected");
if (error) {
xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
xenbus_get_node(xbb->dev));
return;
}
xbb->hotplug_done = true;
/* The front end might be waiting for the backend; connect if so. */
if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised)
xbb_connect(xbb);
}
/**
* Attach to a XenBus device that has been claimed by our probe routine.
*
* \param dev NewBus device object representing this Xen Block Back instance.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_attach(device_t dev)
{
struct xbb_softc *xbb;
int error;
u_int max_ring_page_order;
struct sbuf *watch_path;
DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
/*
* Basic initialization.
* After this block it is safe to call xbb_detach()
* to clean up any allocated data for this instance.
*/
xbb = device_get_softc(dev);
xbb->dev = dev;
xbb->otherend_id = xenbus_get_otherend_id(dev);
TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
/*
* Publish protocol capabilities for consumption by the
* front-end.
*/
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"feature-barrier", "1");
if (error) {
xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
xenbus_get_node(xbb->dev));
return (error);
}
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"feature-flush-cache", "1");
if (error) {
xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
xenbus_get_node(xbb->dev));
return (error);
}
max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"max-ring-page-order", "%u", max_ring_page_order);
if (error) {
xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
xenbus_get_node(xbb->dev));
return (error);
}
/*
* We need to wait for hotplug script execution before
* moving forward.
*/
KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed"));
watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path");
xbb->hotplug_watch.callback_data = (uintptr_t)dev;
xbb->hotplug_watch.callback = xbb_attach_disk;
KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup"));
xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK);
sbuf_delete(watch_path);
error = xs_register_watch(&xbb->hotplug_watch);
if (error != 0) {
xbb_attach_failed(xbb, error, "failed to create watch on %s",
xbb->hotplug_watch.node);
free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
return (error);
}
/* Tell the toolstack blkback has attached. */
xenbus_set_state(dev, XenbusStateInitWait);
return (0);
}
/**
* Detach from a block back device instance.
*
* \param dev NewBus device object representing this Xen Block Back instance.
*
* \return 0 for success, errno codes for failure.
*
* \note A block back device may be detached at any time in its life-cycle,
* including part way through the attach process. For this reason,
* initialization order and the initialization state checks in this
* routine must be carefully coupled so that attach time failures
* are gracefully handled.
*/
static int
xbb_detach(device_t dev)
{
struct xbb_softc *xbb;
DPRINTF("\n");
xbb = device_get_softc(dev);
mtx_lock(&xbb->lock);
while (xbb_shutdown(xbb) == EAGAIN) {
msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
"xbb_shutdown", 0);
}
mtx_unlock(&xbb->lock);
DPRINTF("\n");
if (xbb->io_taskqueue != NULL)
taskqueue_free(xbb->io_taskqueue);
if (xbb->xbb_stats != NULL)
devstat_remove_entry(xbb->xbb_stats);
if (xbb->xbb_stats_in != NULL)
devstat_remove_entry(xbb->xbb_stats_in);
xbb_close_backend(xbb);
if (xbb->dev_mode != NULL) {
free(xbb->dev_mode, M_XENSTORE);
xbb->dev_mode = NULL;
}
if (xbb->dev_type != NULL) {
free(xbb->dev_type, M_XENSTORE);
xbb->dev_type = NULL;
}
if (xbb->dev_name != NULL) {
free(xbb->dev_name, M_XENSTORE);
xbb->dev_name = NULL;
}
mtx_destroy(&xbb->lock);
return (0);
}
/**
* Prepare this block back device for suspension of this VM.
*
* \param dev NewBus device object representing this Xen Block Back instance.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_suspend(device_t dev)
{
#ifdef NOT_YET
struct xbb_softc *sc = device_get_softc(dev);
/* Prevent new requests being issued until we fix things up. */
mtx_lock(&sc->xb_io_lock);
sc->connected = BLKIF_STATE_SUSPENDED;
mtx_unlock(&sc->xb_io_lock);
#endif
return (0);
}
/**
* Perform any processing required to recover from a suspended state.
*
* \param dev NewBus device object representing this Xen Block Back instance.
*
* \return 0 for success, errno codes for failure.
*/
static int
xbb_resume(device_t dev)
{
return (0);
}
/**
* Handle state changes expressed via the XenStore by our front-end peer.
*
* \param dev NewBus device object representing this Xen
* Block Back instance.
* \param frontend_state The new state of the front-end.
*/
static void
xbb_frontend_changed(device_t dev, XenbusState frontend_state)
{
struct xbb_softc *xbb = device_get_softc(dev);
DPRINTF("frontend_state=%s, xbb_state=%s\n",
xenbus_strstate(frontend_state),
xenbus_strstate(xenbus_get_state(xbb->dev)));
switch (frontend_state) {
case XenbusStateInitialising:
break;
case XenbusStateInitialised:
case XenbusStateConnected:
xbb_connect(xbb);
break;
case XenbusStateClosing:
case XenbusStateClosed:
mtx_lock(&xbb->lock);
xbb_shutdown(xbb);
mtx_unlock(&xbb->lock);
if (frontend_state == XenbusStateClosed)
xenbus_set_state(xbb->dev, XenbusStateClosed);
break;
default:
xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
frontend_state);
break;
}
}
/*---------------------------- NewBus Registration ---------------------------*/
static device_method_t xbb_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, xbb_probe),
DEVMETHOD(device_attach, xbb_attach),
DEVMETHOD(device_detach, xbb_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, xbb_suspend),
DEVMETHOD(device_resume, xbb_resume),
/* Xenbus interface */
DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
{ 0, 0 }
};
static driver_t xbb_driver = {
"xbbd",
xbb_methods,
sizeof(struct xbb_softc),
};
devclass_t xbb_devclass;
DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index 5823fcdc564a..847644ca1e67 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -1,1652 +1,1652 @@
/*
* XenBSD block device driver
*
* Copyright (c) 2010-2013 Spectra Logic Corporation
* Copyright (c) 2009 Scott Long, Yahoo!
* Copyright (c) 2009 Frank Suchomel, Citrix
* Copyright (c) 2009 Doug F. Rabson, Citrix
* Copyright (c) 2005 Kip Macy
* Copyright (c) 2003-2004, Keir Fraser & Steve Hand
* Modifications by Mark A. Williamson are (c) Intel Research Cambridge
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/intr_machdep.h>
#include <machine/vmparam.h>
#include <xen/xen-os.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/gnttab.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus/xenbusvar.h>
#include <machine/_inttypes.h>
#include <geom/geom_disk.h>
#include <dev/xen/blkfront/block.h>
#include "xenbus_if.h"
/*--------------------------- Forward Declarations ---------------------------*/
static void xbd_closing(device_t);
static void xbd_startio(struct xbd_softc *sc);
/*---------------------------------- Macros ----------------------------------*/
#if 0
#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif
#define XBD_SECTOR_SHFT 9
/*---------------------------- Global Static Data ----------------------------*/
static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
static int xbd_enable_indirect = 1;
SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"xbd driver parameters");
SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN,
&xbd_enable_indirect, 0, "Enable xbd indirect segments");
/*---------------------------- Command Processing ----------------------------*/
static void
xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag)
{
if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) != 0)
return;
sc->xbd_flags |= xbd_flag;
sc->xbd_qfrozen_cnt++;
}
static void
xbd_thaw(struct xbd_softc *sc, xbd_flag_t xbd_flag)
{
if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) == 0)
return;
if (sc->xbd_qfrozen_cnt == 0)
panic("%s: Thaw with flag 0x%x while not frozen.",
__func__, xbd_flag);
sc->xbd_flags &= ~xbd_flag;
sc->xbd_qfrozen_cnt--;
}
static void
xbd_cm_freeze(struct xbd_softc *sc, struct xbd_command *cm, xbdc_flag_t cm_flag)
{
if ((cm->cm_flags & XBDCF_FROZEN) != 0)
return;
cm->cm_flags |= XBDCF_FROZEN|cm_flag;
xbd_freeze(sc, XBDF_NONE);
}
static void
xbd_cm_thaw(struct xbd_softc *sc, struct xbd_command *cm)
{
if ((cm->cm_flags & XBDCF_FROZEN) == 0)
return;
cm->cm_flags &= ~XBDCF_FROZEN;
xbd_thaw(sc, XBDF_NONE);
}
static inline void
xbd_flush_requests(struct xbd_softc *sc)
{
int notify;
RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->xbd_ring, notify);
if (notify)
xen_intr_signal(sc->xen_intr_handle);
}
static void
xbd_free_command(struct xbd_command *cm)
{
KASSERT((cm->cm_flags & XBDCF_Q_MASK) == XBD_Q_NONE,
("Freeing command that is still on queue %d.",
cm->cm_flags & XBDCF_Q_MASK));
cm->cm_flags = XBDCF_INITIALIZER;
cm->cm_bp = NULL;
cm->cm_complete = NULL;
xbd_enqueue_cm(cm, XBD_Q_FREE);
xbd_thaw(cm->cm_sc, XBDF_CM_SHORTAGE);
}
static void
xbd_mksegarray(bus_dma_segment_t *segs, int nsegs,
grant_ref_t * gref_head, int otherend_id, int readonly,
grant_ref_t * sg_ref, struct blkif_request_segment *sg)
{
struct blkif_request_segment *last_block_sg = sg + nsegs;
vm_paddr_t buffer_ma;
uint64_t fsect, lsect;
int ref;
while (sg < last_block_sg) {
KASSERT(segs->ds_addr % (1 << XBD_SECTOR_SHFT) == 0,
("XEN disk driver I/O must be sector aligned"));
KASSERT(segs->ds_len % (1 << XBD_SECTOR_SHFT) == 0,
("XEN disk driver I/Os must be a multiple of "
"the sector length"));
buffer_ma = segs->ds_addr;
fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
lsect = fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1;
KASSERT(lsect <= 7, ("XEN disk driver data cannot "
"cross a page boundary"));
/* install a grant reference. */
ref = gnttab_claim_grant_reference(gref_head);
/*
* GNTTAB_LIST_END == 0xffffffff, but it is private
* to gnttab.c.
*/
KASSERT(ref != ~0, ("grant_reference failed"));
gnttab_grant_foreign_access_ref(
ref,
otherend_id,
buffer_ma >> PAGE_SHIFT,
readonly);
*sg_ref = ref;
*sg = (struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
.last_sect = lsect
};
sg++;
sg_ref++;
segs++;
}
}
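/*
 * Editor's illustrative sketch (not part of this diff): xbd_mksegarray()
 * above converts each bus_dma segment into a (first_sect, last_sect) pair
 * within a single page before granting it to the backend.  The fragment
 * below repeats that arithmetic as a stand-alone userland program with
 * hypothetical values; the EX_* macro names are local to the example.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define EX_SECTOR_SHIFT	9		/* 512-byte sectors, as XBD_SECTOR_SHFT */
#define EX_PAGE_SIZE	4096u		/* hypothetical page size */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

int
main(void)
{
	uint64_t ds_addr = 0x12345000u + 1024;	/* sector aligned, mid page */
	uint64_t ds_len = 2048;			/* four 512-byte sectors */
	uint64_t fsect = (ds_addr & EX_PAGE_MASK) >> EX_SECTOR_SHIFT;
	uint64_t lsect = fsect + (ds_len >> EX_SECTOR_SHIFT) - 1;

	/* Prints "first_sect 2 last_sect 5"; lsect <= 7 as the KASSERT requires. */
	printf("first_sect %ju last_sect %ju\n",
	    (uintmax_t)fsect, (uintmax_t)lsect);
	return (0);
}
#endif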
static void
xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct xbd_softc *sc;
struct xbd_command *cm;
int op;
cm = arg;
sc = cm->cm_sc;
if (error) {
cm->cm_bp->bio_error = EIO;
biodone(cm->cm_bp);
xbd_free_command(cm);
return;
}
KASSERT(nsegs <= sc->xbd_max_request_segments,
("Too many segments in a blkfront I/O"));
if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) {
blkif_request_t *ring_req;
/* Fill out a blkif_request_t structure. */
ring_req = (blkif_request_t *)
RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
sc->xbd_ring.req_prod_pvt++;
ring_req->id = cm->cm_id;
ring_req->operation = cm->cm_operation;
ring_req->sector_number = cm->cm_sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
ring_req->nr_segments = nsegs;
cm->cm_nseg = nsegs;
xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
xenbus_get_otherend_id(sc->xbd_dev),
cm->cm_operation == BLKIF_OP_WRITE,
cm->cm_sg_refs, ring_req->seg);
} else {
blkif_request_indirect_t *ring_req;
/* Fill out a blkif_request_indirect_t structure. */
ring_req = (blkif_request_indirect_t *)
RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
sc->xbd_ring.req_prod_pvt++;
ring_req->id = cm->cm_id;
ring_req->operation = BLKIF_OP_INDIRECT;
ring_req->indirect_op = cm->cm_operation;
ring_req->sector_number = cm->cm_sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
ring_req->nr_segments = nsegs;
cm->cm_nseg = nsegs;
xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
xenbus_get_otherend_id(sc->xbd_dev),
cm->cm_operation == BLKIF_OP_WRITE,
cm->cm_sg_refs, cm->cm_indirectionpages);
memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs,
sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages);
}
if (cm->cm_operation == BLKIF_OP_READ)
op = BUS_DMASYNC_PREREAD;
else if (cm->cm_operation == BLKIF_OP_WRITE)
op = BUS_DMASYNC_PREWRITE;
else
op = 0;
bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op);
gnttab_free_grant_references(cm->cm_gref_head);
xbd_enqueue_cm(cm, XBD_Q_BUSY);
/*
* If bus dma had to asynchronously call us back to dispatch
* this command, we are no longer executing in the context of
* xbd_startio(). Thus we cannot rely on xbd_startio()'s call to
* xbd_flush_requests() to publish this command to the backend
* along with any other commands that it could batch.
*/
if ((cm->cm_flags & XBDCF_ASYNC_MAPPING) != 0)
xbd_flush_requests(sc);
return;
}
static int
xbd_queue_request(struct xbd_softc *sc, struct xbd_command *cm)
{
int error;
if (cm->cm_bp != NULL)
error = bus_dmamap_load_bio(sc->xbd_io_dmat, cm->cm_map,
cm->cm_bp, xbd_queue_cb, cm, 0);
else
error = bus_dmamap_load(sc->xbd_io_dmat, cm->cm_map,
cm->cm_data, cm->cm_datalen, xbd_queue_cb, cm, 0);
if (error == EINPROGRESS) {
/*
* Maintain queuing order by freezing the queue. The next
* command may not require as many resources as the command
* we just attempted to map, so we can't rely on bus dma
* blocking for it too.
*/
xbd_cm_freeze(sc, cm, XBDCF_ASYNC_MAPPING);
return (0);
}
return (error);
}
static void
xbd_restart_queue_callback(void *arg)
{
struct xbd_softc *sc = arg;
mtx_lock(&sc->xbd_io_lock);
xbd_thaw(sc, XBDF_GNT_SHORTAGE);
xbd_startio(sc);
mtx_unlock(&sc->xbd_io_lock);
}
static struct xbd_command *
xbd_bio_command(struct xbd_softc *sc)
{
struct xbd_command *cm;
struct bio *bp;
if (__predict_false(sc->xbd_state != XBD_STATE_CONNECTED))
return (NULL);
bp = xbd_dequeue_bio(sc);
if (bp == NULL)
return (NULL);
if ((cm = xbd_dequeue_cm(sc, XBD_Q_FREE)) == NULL) {
xbd_freeze(sc, XBDF_CM_SHORTAGE);
xbd_requeue_bio(sc, bp);
return (NULL);
}
if (gnttab_alloc_grant_references(sc->xbd_max_request_segments,
&cm->cm_gref_head) != 0) {
gnttab_request_free_callback(&sc->xbd_callback,
xbd_restart_queue_callback, sc,
sc->xbd_max_request_segments);
xbd_freeze(sc, XBDF_GNT_SHORTAGE);
xbd_requeue_bio(sc, bp);
xbd_enqueue_cm(cm, XBD_Q_FREE);
return (NULL);
}
cm->cm_bp = bp;
cm->cm_sector_number = (blkif_sector_t)bp->bio_pblkno;
switch (bp->bio_cmd) {
case BIO_READ:
cm->cm_operation = BLKIF_OP_READ;
break;
case BIO_WRITE:
cm->cm_operation = BLKIF_OP_WRITE;
if ((bp->bio_flags & BIO_ORDERED) != 0) {
if ((sc->xbd_flags & XBDF_BARRIER) != 0) {
cm->cm_operation = BLKIF_OP_WRITE_BARRIER;
} else {
/*
* Single step this command.
*/
cm->cm_flags |= XBDCF_Q_FREEZE;
if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) {
/*
* Wait for in-flight requests to
* finish.
*/
xbd_freeze(sc, XBDF_WAIT_IDLE);
xbd_requeue_cm(cm, XBD_Q_READY);
return (NULL);
}
}
}
break;
case BIO_FLUSH:
if ((sc->xbd_flags & XBDF_FLUSH) != 0)
cm->cm_operation = BLKIF_OP_FLUSH_DISKCACHE;
else if ((sc->xbd_flags & XBDF_BARRIER) != 0)
cm->cm_operation = BLKIF_OP_WRITE_BARRIER;
else
panic("flush request, but no flush support available");
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xbd_enqueue_cm(cm, XBD_Q_FREE);
return (NULL);
}
return (cm);
}
/*
* Dequeue buffers and place them in the shared communication ring.
* Return when no more requests can be accepted or all buffers have
* been queued.
*
* Signal XEN once the ring has been filled out.
*/
static void
xbd_startio(struct xbd_softc *sc)
{
struct xbd_command *cm;
int error, queued = 0;
mtx_assert(&sc->xbd_io_lock, MA_OWNED);
if (sc->xbd_state != XBD_STATE_CONNECTED)
return;
while (!RING_FULL(&sc->xbd_ring)) {
if (sc->xbd_qfrozen_cnt != 0)
break;
cm = xbd_dequeue_cm(sc, XBD_Q_READY);
if (cm == NULL)
cm = xbd_bio_command(sc);
if (cm == NULL)
break;
if ((cm->cm_flags & XBDCF_Q_FREEZE) != 0) {
/*
* Single step command. Future work is
* held off until this command completes.
*/
xbd_cm_freeze(sc, cm, XBDCF_Q_FREEZE);
}
if ((error = xbd_queue_request(sc, cm)) != 0) {
printf("xbd_queue_request returned %d\n", error);
break;
}
queued++;
}
if (queued != 0)
xbd_flush_requests(sc);
}
static void
xbd_bio_complete(struct xbd_softc *sc, struct xbd_command *cm)
{
struct bio *bp;
bp = cm->cm_bp;
if (__predict_false(cm->cm_status != BLKIF_RSP_OKAY)) {
disk_err(bp, "disk error", -1, 0);
printf(" status: %x\n", cm->cm_status);
bp->bio_flags |= BIO_ERROR;
}
if (bp->bio_flags & BIO_ERROR)
bp->bio_error = EIO;
else
bp->bio_resid = 0;
xbd_free_command(cm);
biodone(bp);
}
static void
xbd_int(void *xsc)
{
struct xbd_softc *sc = xsc;
struct xbd_command *cm;
blkif_response_t *bret;
RING_IDX i, rp;
int op;
mtx_lock(&sc->xbd_io_lock);
if (__predict_false(sc->xbd_state == XBD_STATE_DISCONNECTED)) {
mtx_unlock(&sc->xbd_io_lock);
return;
}
again:
rp = sc->xbd_ring.sring->rsp_prod;
rmb(); /* Ensure we see queued responses up to 'rp'. */
for (i = sc->xbd_ring.rsp_cons; i != rp;) {
bret = RING_GET_RESPONSE(&sc->xbd_ring, i);
cm = &sc->xbd_shadow[bret->id];
xbd_remove_cm(cm, XBD_Q_BUSY);
gnttab_end_foreign_access_references(cm->cm_nseg,
cm->cm_sg_refs);
i++;
if (cm->cm_operation == BLKIF_OP_READ)
op = BUS_DMASYNC_POSTREAD;
else if (cm->cm_operation == BLKIF_OP_WRITE ||
cm->cm_operation == BLKIF_OP_WRITE_BARRIER)
op = BUS_DMASYNC_POSTWRITE;
else
op = 0;
bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op);
bus_dmamap_unload(sc->xbd_io_dmat, cm->cm_map);
/*
* Release any hold this command has on future command
* dispatch.
*/
xbd_cm_thaw(sc, cm);
/*
* Directly call the I/O completion routine to save an
* indirection in the common case.
*/
cm->cm_status = bret->status;
if (cm->cm_bp)
xbd_bio_complete(sc, cm);
else if (cm->cm_complete != NULL)
cm->cm_complete(cm);
else
xbd_free_command(cm);
}
sc->xbd_ring.rsp_cons = i;
if (i != sc->xbd_ring.req_prod_pvt) {
int more_to_do;
RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, more_to_do);
if (more_to_do)
goto again;
} else {
sc->xbd_ring.sring->rsp_event = i + 1;
}
if (xbd_queue_length(sc, XBD_Q_BUSY) == 0)
xbd_thaw(sc, XBDF_WAIT_IDLE);
xbd_startio(sc);
if (__predict_false(sc->xbd_state == XBD_STATE_SUSPENDED))
wakeup(&sc->xbd_cm_q[XBD_Q_BUSY]);
mtx_unlock(&sc->xbd_io_lock);
}
/*------------------------------- Dump Support -------------------------------*/
/**
* Quiesce the disk writes for a dump file before allowing the next buffer.
*/
static void
xbd_quiesce(struct xbd_softc *sc)
{
int mtd;
/* While there are outstanding requests. */
while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) {
RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, mtd);
if (mtd) {
/* Received request completions, update queue. */
xbd_int(sc);
}
if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) {
/*
* Still pending requests, wait for the disk i/o
* to complete.
*/
HYPERVISOR_yield();
}
}
}
/* Kernel dump function for a paravirtualized disk device */
static void
xbd_dump_complete(struct xbd_command *cm)
{
xbd_enqueue_cm(cm, XBD_Q_COMPLETE);
}
static int
xbd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
size_t length)
{
struct disk *dp = arg;
struct xbd_softc *sc = dp->d_drv1;
struct xbd_command *cm;
size_t chunk;
int sbp;
int rc = 0;
if (length == 0)
return (0);
xbd_quiesce(sc); /* All quiet on the western front. */
/*
* If this lock is held, then this module is failing, and a
* successful kernel dump is highly unlikely anyway.
*/
mtx_lock(&sc->xbd_io_lock);
/* Split the I/O into chunks no larger than the negotiated maximum request size. */
for (sbp=0; length > 0; sbp++) {
cm = xbd_dequeue_cm(sc, XBD_Q_FREE);
if (cm == NULL) {
mtx_unlock(&sc->xbd_io_lock);
device_printf(sc->xbd_dev, "dump: no more commands?\n");
return (EBUSY);
}
if (gnttab_alloc_grant_references(sc->xbd_max_request_segments,
&cm->cm_gref_head) != 0) {
xbd_free_command(cm);
mtx_unlock(&sc->xbd_io_lock);
device_printf(sc->xbd_dev, "no more grant allocs?\n");
return (EBUSY);
}
chunk = length > sc->xbd_max_request_size ?
sc->xbd_max_request_size : length;
cm->cm_data = virtual;
cm->cm_datalen = chunk;
cm->cm_operation = BLKIF_OP_WRITE;
cm->cm_sector_number = offset / dp->d_sectorsize;
cm->cm_complete = xbd_dump_complete;
xbd_enqueue_cm(cm, XBD_Q_READY);
length -= chunk;
offset += chunk;
virtual = (char *) virtual + chunk;
}
/* Tell DOM0 to do the I/O */
xbd_startio(sc);
mtx_unlock(&sc->xbd_io_lock);
/* Poll for the completion. */
xbd_quiesce(sc); /* All quiet on the eastern front. */
/* If there were any errors, bail out... */
while ((cm = xbd_dequeue_cm(sc, XBD_Q_COMPLETE)) != NULL) {
if (cm->cm_status != BLKIF_RSP_OKAY) {
device_printf(sc->xbd_dev,
"Dump I/O failed at sector %jd\n",
cm->cm_sector_number);
rc = EIO;
}
xbd_free_command(cm);
}
return (rc);
}
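/*
 * Editor's illustrative sketch (not part of this diff): the xbd_dump() loop
 * above carves the dump buffer into requests of at most
 * xbd_max_request_size bytes.  The same chunking, stand-alone, using a
 * hypothetical 150 KB buffer and a hypothetical 64 KB negotiated maximum.
 */
#if 0
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	size_t length = 150 * 1024;		/* hypothetical dump buffer size */
	size_t max_request_size = 64 * 1024;	/* hypothetical negotiated max */
	off_t offset = 0;

	while (length > 0) {
		size_t chunk = length > max_request_size ?
		    max_request_size : length;

		/* Prints three requests: 64 KB, 64 KB, then the 22 KB tail. */
		printf("request: offset %jd, %zu bytes\n",
		    (intmax_t)offset, chunk);
		offset += chunk;
		length -= chunk;
	}
	return (0);
}
#endif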
/*----------------------------- Disk Entrypoints -----------------------------*/
static int
xbd_open(struct disk *dp)
{
struct xbd_softc *sc = dp->d_drv1;
if (sc == NULL) {
printf("xbd%d: not found", dp->d_unit);
return (ENXIO);
}
sc->xbd_flags |= XBDF_OPEN;
sc->xbd_users++;
return (0);
}
static int
xbd_close(struct disk *dp)
{
struct xbd_softc *sc = dp->d_drv1;
if (sc == NULL)
return (ENXIO);
sc->xbd_flags &= ~XBDF_OPEN;
if (--(sc->xbd_users) == 0) {
/*
* Check whether we have been instructed to close. We will
* have ignored this request initially, as the device was
* still mounted.
*/
if (xenbus_get_otherend_state(sc->xbd_dev) ==
XenbusStateClosing)
xbd_closing(sc->xbd_dev);
}
return (0);
}
static int
xbd_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
{
struct xbd_softc *sc = dp->d_drv1;
if (sc == NULL)
return (ENXIO);
return (ENOTTY);
}
/*
* Read/write routine for a buffer. Finds the proper unit, places it on
* the sort queue, and kicks the controller.
*/
static void
xbd_strategy(struct bio *bp)
{
struct xbd_softc *sc = bp->bio_disk->d_drv1;
/* bogus disk? */
if (sc == NULL) {
bp->bio_error = EINVAL;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
biodone(bp);
return;
}
/*
* Place it in the queue of disk activities for this disk
*/
mtx_lock(&sc->xbd_io_lock);
xbd_enqueue_bio(sc, bp);
xbd_startio(sc);
mtx_unlock(&sc->xbd_io_lock);
return;
}
/*------------------------------ Ring Management -----------------------------*/
static int
xbd_alloc_ring(struct xbd_softc *sc)
{
blkif_sring_t *sring;
uintptr_t sring_page_addr;
int error;
int i;
sring = malloc(sc->xbd_ring_pages * PAGE_SIZE, M_XENBLOCKFRONT,
M_NOWAIT|M_ZERO);
if (sring == NULL) {
xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "allocating shared ring");
return (ENOMEM);
}
SHARED_RING_INIT(sring);
FRONT_RING_INIT(&sc->xbd_ring, sring, sc->xbd_ring_pages * PAGE_SIZE);
for (i = 0, sring_page_addr = (uintptr_t)sring;
i < sc->xbd_ring_pages;
i++, sring_page_addr += PAGE_SIZE) {
error = xenbus_grant_ring(sc->xbd_dev,
(vtophys(sring_page_addr) >> PAGE_SHIFT),
&sc->xbd_ring_ref[i]);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"granting ring_ref(%d)", i);
return (error);
}
}
if (sc->xbd_ring_pages == 1) {
error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev),
"ring-ref", "%u", sc->xbd_ring_ref[0]);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/ring-ref",
xenbus_get_node(sc->xbd_dev));
return (error);
}
} else {
for (i = 0; i < sc->xbd_ring_pages; i++) {
char ring_ref_name[]= "ring_refXX";
snprintf(ring_ref_name, sizeof(ring_ref_name),
"ring-ref%u", i);
error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev),
ring_ref_name, "%u", sc->xbd_ring_ref[i]);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/%s",
xenbus_get_node(sc->xbd_dev),
ring_ref_name);
return (error);
}
}
}
error = xen_intr_alloc_and_bind_local_port(sc->xbd_dev,
xenbus_get_otherend_id(sc->xbd_dev), NULL, xbd_int, sc,
INTR_TYPE_BIO | INTR_MPSAFE, &sc->xen_intr_handle);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"xen_intr_alloc_and_bind_local_port failed");
return (error);
}
return (0);
}
static void
xbd_free_ring(struct xbd_softc *sc)
{
int i;
if (sc->xbd_ring.sring == NULL)
return;
for (i = 0; i < sc->xbd_ring_pages; i++) {
if (sc->xbd_ring_ref[i] != GRANT_REF_INVALID) {
gnttab_end_foreign_access_ref(sc->xbd_ring_ref[i]);
sc->xbd_ring_ref[i] = GRANT_REF_INVALID;
}
}
free(sc->xbd_ring.sring, M_XENBLOCKFRONT);
sc->xbd_ring.sring = NULL;
}
/*-------------------------- Initialization/Teardown -------------------------*/
static int
xbd_feature_string(struct xbd_softc *sc, char *features, size_t len)
{
struct sbuf sb;
int feature_cnt;
sbuf_new(&sb, features, len, SBUF_FIXEDLEN);
feature_cnt = 0;
if ((sc->xbd_flags & XBDF_FLUSH) != 0) {
sbuf_printf(&sb, "flush");
feature_cnt++;
}
if ((sc->xbd_flags & XBDF_BARRIER) != 0) {
if (feature_cnt != 0)
sbuf_printf(&sb, ", ");
sbuf_printf(&sb, "write_barrier");
feature_cnt++;
}
if ((sc->xbd_flags & XBDF_DISCARD) != 0) {
if (feature_cnt != 0)
sbuf_printf(&sb, ", ");
sbuf_printf(&sb, "discard");
feature_cnt++;
}
if ((sc->xbd_flags & XBDF_PERSISTENT) != 0) {
if (feature_cnt != 0)
sbuf_printf(&sb, ", ");
sbuf_printf(&sb, "persistent_grants");
feature_cnt++;
}
(void) sbuf_finish(&sb);
return (sbuf_len(&sb));
}
static int
xbd_sysctl_features(SYSCTL_HANDLER_ARGS)
{
char features[80];
struct xbd_softc *sc = arg1;
int error;
int len;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
len = xbd_feature_string(sc, features, sizeof(features));
/* len is -1 on error, which will make the SYSCTL_OUT a no-op. */
return (SYSCTL_OUT(req, features, len + 1/*NUL*/));
}
static void
xbd_setup_sysctl(struct xbd_softc *xbd)
{
struct sysctl_ctx_list *sysctl_ctx = NULL;
struct sysctl_oid *sysctl_tree = NULL;
struct sysctl_oid_list *children;
sysctl_ctx = device_get_sysctl_ctx(xbd->xbd_dev);
if (sysctl_ctx == NULL)
return;
sysctl_tree = device_get_sysctl_tree(xbd->xbd_dev);
if (sysctl_tree == NULL)
return;
children = SYSCTL_CHILDREN(sysctl_tree);
SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO,
"max_requests", CTLFLAG_RD, &xbd->xbd_max_requests, -1,
"maximum outstanding requests (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO,
"max_request_segments", CTLFLAG_RD,
&xbd->xbd_max_request_segments, 0,
"maximum number of pages per requests (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO,
"max_request_size", CTLFLAG_RD, &xbd->xbd_max_request_size, 0,
"maximum size in bytes of a request (negotiated)");
SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO,
"ring_pages", CTLFLAG_RD, &xbd->xbd_ring_pages, 0,
"communication channel pages (negotiated)");
SYSCTL_ADD_PROC(sysctl_ctx, children, OID_AUTO,
"features", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xbd,
0, xbd_sysctl_features, "A", "protocol features (negotiated)");
}
/*
* Translate Linux major/minor to an appropriate name and unit
* number. For HVM guests, this allows us to use the same drive names
* with blkfront as the emulated drives, easing transition slightly.
*/
static void
xbd_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name)
{
static struct vdev_info {
int major;
int shift;
int base;
const char *name;
} info[] = {
{3, 6, 0, "ada"}, /* ide0 */
{22, 6, 2, "ada"}, /* ide1 */
{33, 6, 4, "ada"}, /* ide2 */
{34, 6, 6, "ada"}, /* ide3 */
{56, 6, 8, "ada"}, /* ide4 */
{57, 6, 10, "ada"}, /* ide5 */
{88, 6, 12, "ada"}, /* ide6 */
{89, 6, 14, "ada"}, /* ide7 */
{90, 6, 16, "ada"}, /* ide8 */
{91, 6, 18, "ada"}, /* ide9 */
{8, 4, 0, "da"}, /* scsi disk0 */
{65, 4, 16, "da"}, /* scsi disk1 */
{66, 4, 32, "da"}, /* scsi disk2 */
{67, 4, 48, "da"}, /* scsi disk3 */
{68, 4, 64, "da"}, /* scsi disk4 */
{69, 4, 80, "da"}, /* scsi disk5 */
{70, 4, 96, "da"}, /* scsi disk6 */
{71, 4, 112, "da"}, /* scsi disk7 */
{128, 4, 128, "da"}, /* scsi disk8 */
{129, 4, 144, "da"}, /* scsi disk9 */
{130, 4, 160, "da"}, /* scsi disk10 */
{131, 4, 176, "da"}, /* scsi disk11 */
{132, 4, 192, "da"}, /* scsi disk12 */
{133, 4, 208, "da"}, /* scsi disk13 */
{134, 4, 224, "da"}, /* scsi disk14 */
{135, 4, 240, "da"}, /* scsi disk15 */
{202, 4, 0, "xbd"}, /* xbd */
{0, 0, 0, NULL},
};
int major = vdevice >> 8;
int minor = vdevice & 0xff;
int i;
if (vdevice & (1 << 28)) {
*unit = (vdevice & ((1 << 28) - 1)) >> 8;
*name = "xbd";
return;
}
for (i = 0; info[i].major; i++) {
if (info[i].major == major) {
*unit = info[i].base + (minor >> info[i].shift);
*name = info[i].name;
return;
}
}
*unit = minor >> 4;
*name = "xbd";
}
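/*
 * Editor's illustrative sketch (not part of this diff): xbd_vdevice_to_unit()
 * above decodes a Linux-style device number -- major in the high byte,
 * minor in the low byte, with bit 28 selecting the extended "xbd"
 * numbering.  The stand-alone example below walks two hypothetical values
 * through the same arithmetic.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned int vdevice = (8u << 8) | 16;	/* Linux "sd" major 8, minor 16 */
	int major = vdevice >> 8;
	int minor = vdevice & 0xff;

	/* The {8, 4, 0, "da"} table entry above maps this to da1. */
	printf("major %d minor %d -> da%d\n", major, minor, 0 + (minor >> 4));

	vdevice = (1u << 28) | (5u << 8);	/* extended encoding, unit 5 */
	if (vdevice & (1u << 28))
		printf("extended encoding -> xbd%u\n",
		    (vdevice & ((1u << 28) - 1)) >> 8);
	return (0);
}
#endif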
int
xbd_instance_create(struct xbd_softc *sc, blkif_sector_t sectors,
int vdevice, uint16_t vdisk_info, unsigned long sector_size,
unsigned long phys_sector_size)
{
char features[80];
int unit, error = 0;
const char *name;
xbd_vdevice_to_unit(vdevice, &unit, &name);
sc->xbd_unit = unit;
if (strcmp(name, "xbd") != 0)
device_printf(sc->xbd_dev, "attaching as %s%d\n", name, unit);
if (xbd_feature_string(sc, features, sizeof(features)) > 0) {
device_printf(sc->xbd_dev, "features: %s\n",
features);
}
sc->xbd_disk = disk_alloc();
sc->xbd_disk->d_unit = sc->xbd_unit;
sc->xbd_disk->d_open = xbd_open;
sc->xbd_disk->d_close = xbd_close;
sc->xbd_disk->d_ioctl = xbd_ioctl;
sc->xbd_disk->d_strategy = xbd_strategy;
sc->xbd_disk->d_dump = xbd_dump;
sc->xbd_disk->d_name = name;
sc->xbd_disk->d_drv1 = sc;
sc->xbd_disk->d_sectorsize = sector_size;
sc->xbd_disk->d_stripesize = phys_sector_size;
sc->xbd_disk->d_stripeoffset = 0;
sc->xbd_disk->d_mediasize = sectors * sector_size;
sc->xbd_disk->d_maxsize = sc->xbd_max_request_size;
sc->xbd_disk->d_flags = DISKFLAG_UNMAPPED_BIO;
if ((sc->xbd_flags & (XBDF_FLUSH|XBDF_BARRIER)) != 0) {
sc->xbd_disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
device_printf(sc->xbd_dev,
"synchronize cache commands enabled.\n");
}
disk_create(sc->xbd_disk, DISK_VERSION);
return error;
}
static void
xbd_free(struct xbd_softc *sc)
{
int i;
/* Prevent new requests being issued until we fix things up. */
mtx_lock(&sc->xbd_io_lock);
sc->xbd_state = XBD_STATE_DISCONNECTED;
mtx_unlock(&sc->xbd_io_lock);
/* Free resources associated with old device channel. */
xbd_free_ring(sc);
if (sc->xbd_shadow) {
for (i = 0; i < sc->xbd_max_requests; i++) {
struct xbd_command *cm;
cm = &sc->xbd_shadow[i];
if (cm->cm_sg_refs != NULL) {
free(cm->cm_sg_refs, M_XENBLOCKFRONT);
cm->cm_sg_refs = NULL;
}
if (cm->cm_indirectionpages != NULL) {
gnttab_end_foreign_access_references(
sc->xbd_max_request_indirectpages,
&cm->cm_indirectionrefs[0]);
contigfree(cm->cm_indirectionpages, PAGE_SIZE *
sc->xbd_max_request_indirectpages,
M_XENBLOCKFRONT);
cm->cm_indirectionpages = NULL;
}
bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map);
}
free(sc->xbd_shadow, M_XENBLOCKFRONT);
sc->xbd_shadow = NULL;
bus_dma_tag_destroy(sc->xbd_io_dmat);
xbd_initq_cm(sc, XBD_Q_FREE);
xbd_initq_cm(sc, XBD_Q_READY);
xbd_initq_cm(sc, XBD_Q_COMPLETE);
}
xen_intr_unbind(&sc->xen_intr_handle);
}
/*--------------------------- State Change Handlers --------------------------*/
static void
xbd_initialize(struct xbd_softc *sc)
{
const char *otherend_path;
const char *node_path;
uint32_t max_ring_page_order;
int error;
if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) {
/* Initialization has already been performed. */
return;
}
/*
* Protocol defaults valid even if negotiation for a
* setting fails.
*/
max_ring_page_order = 0;
sc->xbd_ring_pages = 1;
/*
* Protocol negotiation.
*
* \note xs_gather() returns on the first encountered error, so
* we must use independent calls in order to guarantee
* we don't miss information in a sparsely populated back-end
* tree.
*
* \note xs_scanf() does not update variables for unmatched
* fields.
*/
otherend_path = xenbus_get_otherend_path(sc->xbd_dev);
node_path = xenbus_get_node(sc->xbd_dev);
/* Support both backend schemes for relaying ring page limits. */
(void)xs_scanf(XST_NIL, otherend_path,
"max-ring-page-order", NULL, "%" PRIu32,
&max_ring_page_order);
sc->xbd_ring_pages = 1 << max_ring_page_order;
(void)xs_scanf(XST_NIL, otherend_path,
"max-ring-pages", NULL, "%" PRIu32,
&sc->xbd_ring_pages);
if (sc->xbd_ring_pages < 1)
sc->xbd_ring_pages = 1;
if (sc->xbd_ring_pages > XBD_MAX_RING_PAGES) {
device_printf(sc->xbd_dev,
"Back-end specified ring-pages of %u "
"limited to front-end limit of %u.\n",
sc->xbd_ring_pages, XBD_MAX_RING_PAGES);
sc->xbd_ring_pages = XBD_MAX_RING_PAGES;
}
if (powerof2(sc->xbd_ring_pages) == 0) {
uint32_t new_page_limit;
new_page_limit = 0x01 << (fls(sc->xbd_ring_pages) - 1);
device_printf(sc->xbd_dev,
"Back-end specified ring-pages of %u "
"is not a power of 2. Limited to %u.\n",
sc->xbd_ring_pages, new_page_limit);
sc->xbd_ring_pages = new_page_limit;
}
sc->xbd_max_requests =
BLKIF_MAX_RING_REQUESTS(sc->xbd_ring_pages * PAGE_SIZE);
if (sc->xbd_max_requests > XBD_MAX_REQUESTS) {
device_printf(sc->xbd_dev,
"Back-end specified max_requests of %u "
"limited to front-end limit of %zu.\n",
sc->xbd_max_requests, XBD_MAX_REQUESTS);
sc->xbd_max_requests = XBD_MAX_REQUESTS;
}
if (xbd_alloc_ring(sc) != 0)
return;
/* Support both backend schemes for relaying ring page limits. */
if (sc->xbd_ring_pages > 1) {
error = xs_printf(XST_NIL, node_path,
"num-ring-pages","%u",
sc->xbd_ring_pages);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/num-ring-pages",
node_path);
return;
}
error = xs_printf(XST_NIL, node_path,
"ring-page-order", "%u",
fls(sc->xbd_ring_pages) - 1);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/ring-page-order",
node_path);
return;
}
}
error = xs_printf(XST_NIL, node_path, "event-channel",
"%u", xen_intr_port(sc->xen_intr_handle));
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/event-channel",
node_path);
return;
}
error = xs_printf(XST_NIL, node_path, "protocol",
"%s", XEN_IO_PROTO_ABI_NATIVE);
if (error) {
xenbus_dev_fatal(sc->xbd_dev, error,
"writing %s/protocol",
node_path);
return;
}
xenbus_set_state(sc->xbd_dev, XenbusStateInitialised);
}
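/*
 * Editor's illustrative sketch (not part of this diff): the negotiation in
 * xbd_initialize() above clamps the backend's ring-pages value to the
 * front-end limit and rounds a non-power-of-2 value down with fls().  The
 * stand-alone example below shows that arithmetic with a hypothetical
 * backend value of 6 and a hypothetical front-end limit of 8; on FreeBSD,
 * fls(3) is declared in <strings.h>.
 */
#if 0
#include <stdio.h>
#include <strings.h>

int
main(void)
{
	unsigned int ring_pages = 6;		/* hypothetical backend value */
	const unsigned int front_limit = 8;	/* hypothetical front-end limit */

	if (ring_pages > front_limit)
		ring_pages = front_limit;
	if ((ring_pages & (ring_pages - 1)) != 0)	/* not a power of 2 */
		ring_pages = 1u << (fls(ring_pages) - 1);

	printf("negotiated ring pages: %u\n", ring_pages);	/* prints 4 */
	return (0);
}
#endif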
/*
* Invoked when the backend is finally 'ready' (and has published
* the details about the physical device - #sectors, size, etc).
*/
static void
xbd_connect(struct xbd_softc *sc)
{
device_t dev = sc->xbd_dev;
blkif_sector_t sectors;
unsigned long sector_size, phys_sector_size;
unsigned int binfo;
int err, feature_barrier, feature_flush;
int i, j;
DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
if (sc->xbd_state == XBD_STATE_SUSPENDED) {
return;
}
if (sc->xbd_state == XBD_STATE_CONNECTED) {
struct disk *disk;
disk = sc->xbd_disk;
if (disk == NULL) {
return;
}
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"sectors", "%"PRIu64, &sectors, NULL);
if (err != 0) {
xenbus_dev_error(dev, err,
"reading sectors at %s",
xenbus_get_otherend_path(dev));
return;
}
disk->d_mediasize = disk->d_sectorsize * sectors;
err = disk_resize(disk, M_NOWAIT);
if (err) {
xenbus_dev_error(dev, err,
"unable to resize disk %s%u",
disk->d_name, disk->d_unit);
return;
}
device_printf(sc->xbd_dev,
"changed capacity to %jd\n",
(intmax_t)disk->d_mediasize);
return;
}
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"sectors", "%"PRIu64, &sectors,
"info", "%u", &binfo,
"sector-size", "%lu", &sector_size,
NULL);
if (err) {
xenbus_dev_fatal(dev, err,
"reading backend fields at %s",
xenbus_get_otherend_path(dev));
return;
}
if ((sectors == 0) || (sector_size == 0)) {
xenbus_dev_fatal(dev, 0,
"invalid parameters from %s:"
" sectors = %"PRIu64", sector_size = %lu",
xenbus_get_otherend_path(dev),
sectors, sector_size);
return;
}
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"physical-sector-size", "%lu", &phys_sector_size,
NULL);
if (err || phys_sector_size <= sector_size)
phys_sector_size = 0;
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"feature-barrier", "%d", &feature_barrier,
NULL);
if (err == 0 && feature_barrier != 0)
sc->xbd_flags |= XBDF_BARRIER;
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"feature-flush-cache", "%d", &feature_flush,
NULL);
if (err == 0 && feature_flush != 0)
sc->xbd_flags |= XBDF_FLUSH;
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"feature-max-indirect-segments", "%" PRIu32,
&sc->xbd_max_request_segments, NULL);
if ((err != 0) || (xbd_enable_indirect == 0))
sc->xbd_max_request_segments = 0;
if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS)
sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS;
- if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS))
- sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS);
+ if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(maxphys))
+ sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(maxphys);
sc->xbd_max_request_indirectpages =
XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments);
if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
sc->xbd_max_request_size =
XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments);
/* Allocate datastructures based on negotiated values. */
err = bus_dma_tag_create(
bus_get_dma_tag(sc->xbd_dev), /* parent */
512, PAGE_SIZE, /* algnmnt, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sc->xbd_max_request_size,
sc->xbd_max_request_segments,
PAGE_SIZE, /* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
busdma_lock_mutex, /* lockfunc */
&sc->xbd_io_lock, /* lockarg */
&sc->xbd_io_dmat);
if (err != 0) {
xenbus_dev_fatal(sc->xbd_dev, err,
"Cannot allocate parent DMA tag\n");
return;
}
/* Per-transaction data allocation. */
sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests,
M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
if (sc->xbd_shadow == NULL) {
bus_dma_tag_destroy(sc->xbd_io_dmat);
xenbus_dev_fatal(sc->xbd_dev, ENOMEM,
"Cannot allocate request structures\n");
return;
}
for (i = 0; i < sc->xbd_max_requests; i++) {
struct xbd_command *cm;
void * indirectpages;
cm = &sc->xbd_shadow[i];
cm->cm_sg_refs = malloc(
sizeof(grant_ref_t) * sc->xbd_max_request_segments,
M_XENBLOCKFRONT, M_NOWAIT);
if (cm->cm_sg_refs == NULL)
break;
cm->cm_id = i;
cm->cm_flags = XBDCF_INITIALIZER;
cm->cm_sc = sc;
if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0)
break;
if (sc->xbd_max_request_indirectpages > 0) {
indirectpages = contigmalloc(
PAGE_SIZE * sc->xbd_max_request_indirectpages,
M_XENBLOCKFRONT, M_ZERO | M_NOWAIT, 0, ~0,
PAGE_SIZE, 0);
if (indirectpages == NULL)
sc->xbd_max_request_indirectpages = 0;
} else {
indirectpages = NULL;
}
for (j = 0; j < sc->xbd_max_request_indirectpages; j++) {
if (gnttab_grant_foreign_access(
xenbus_get_otherend_id(sc->xbd_dev),
(vtophys(indirectpages) >> PAGE_SHIFT) + j,
1 /* grant read-only access */,
&cm->cm_indirectionrefs[j]))
break;
}
if (j < sc->xbd_max_request_indirectpages) {
contigfree(indirectpages,
PAGE_SIZE * sc->xbd_max_request_indirectpages,
M_XENBLOCKFRONT);
break;
}
cm->cm_indirectionpages = indirectpages;
xbd_free_command(cm);
}
if (sc->xbd_disk == NULL) {
device_printf(dev, "%juMB <%s> at %s",
(uintmax_t) sectors / (1048576 / sector_size),
device_get_desc(dev),
xenbus_get_node(dev));
bus_print_child_footer(device_get_parent(dev), dev);
xbd_instance_create(sc, sectors, sc->xbd_vdevice, binfo,
sector_size, phys_sector_size);
}
(void)xenbus_set_state(dev, XenbusStateConnected);
/* Kick pending requests. */
mtx_lock(&sc->xbd_io_lock);
sc->xbd_state = XBD_STATE_CONNECTED;
xbd_startio(sc);
sc->xbd_flags |= XBDF_READY;
mtx_unlock(&sc->xbd_io_lock);
}
/**
* Handle the change of state of the backend to Closing. We must delete our
* device-layer structures now, to ensure that writes are flushed through to
* the backend. Once this is done, we can switch to Closed in
* acknowledgement.
*/
static void
xbd_closing(device_t dev)
{
struct xbd_softc *sc = device_get_softc(dev);
xenbus_set_state(dev, XenbusStateClosing);
DPRINTK("xbd_closing: %s removed\n", xenbus_get_node(dev));
if (sc->xbd_disk != NULL) {
disk_destroy(sc->xbd_disk);
sc->xbd_disk = NULL;
}
xenbus_set_state(dev, XenbusStateClosed);
}
/*---------------------------- NewBus Entrypoints ----------------------------*/
static int
xbd_probe(device_t dev)
{
if (strcmp(xenbus_get_type(dev), "vbd") != 0)
return (ENXIO);
if (xen_hvm_domain() && xen_disable_pv_disks != 0)
return (ENXIO);
if (xen_hvm_domain()) {
int error;
char *type;
/*
* When running in an HVM domain, IDE disk emulation is
* disabled early in boot so that native drivers will
* not see emulated hardware. However, CDROM device
* emulation cannot be disabled.
*
* Through use of FreeBSD's vm_guest and xen_hvm_domain()
* APIs, we could modify the native CDROM driver to fail its
* probe when running under Xen. Unfortunately, the PV
* CDROM support in XenServer (up through at least version
* 6.2) isn't functional, so we instead rely on the emulated
* CDROM instance, and fail to attach the PV one here in
* the blkfront driver.
*/
error = xs_read(XST_NIL, xenbus_get_node(dev),
"device-type", NULL, (void **) &type);
if (error)
return (ENXIO);
if (strncmp(type, "cdrom", 5) == 0) {
free(type, M_XENSTORE);
return (ENXIO);
}
free(type, M_XENSTORE);
}
device_set_desc(dev, "Virtual Block Device");
device_quiet(dev);
return (0);
}
/*
* Set up the connection to the backend: read the backend directory and
* virtual device number, publish an event channel and shared frame
* entries, and watch the backend until it indicates it is ready.
*/
static int
xbd_attach(device_t dev)
{
struct xbd_softc *sc;
const char *name;
uint32_t vdevice;
int error;
int i;
int unit;
/* FIXME: Use dynamic device id if this is not set. */
error = xs_scanf(XST_NIL, xenbus_get_node(dev),
"virtual-device", NULL, "%" PRIu32, &vdevice);
if (error)
error = xs_scanf(XST_NIL, xenbus_get_node(dev),
"virtual-device-ext", NULL, "%" PRIu32, &vdevice);
if (error) {
xenbus_dev_fatal(dev, error, "reading virtual-device");
device_printf(dev, "Couldn't determine virtual device.\n");
return (error);
}
xbd_vdevice_to_unit(vdevice, &unit, &name);
if (!strcmp(name, "xbd"))
device_set_unit(dev, unit);
sc = device_get_softc(dev);
mtx_init(&sc->xbd_io_lock, "blkfront i/o lock", NULL, MTX_DEF);
xbd_initqs(sc);
for (i = 0; i < XBD_MAX_RING_PAGES; i++)
sc->xbd_ring_ref[i] = GRANT_REF_INVALID;
sc->xbd_dev = dev;
sc->xbd_vdevice = vdevice;
sc->xbd_state = XBD_STATE_DISCONNECTED;
xbd_setup_sysctl(sc);
/* Wait for backend device to publish its protocol capabilities. */
xenbus_set_state(dev, XenbusStateInitialising);
return (0);
}
static int
xbd_detach(device_t dev)
{
struct xbd_softc *sc = device_get_softc(dev);
DPRINTK("%s: %s removed\n", __func__, xenbus_get_node(dev));
xbd_free(sc);
mtx_destroy(&sc->xbd_io_lock);
return 0;
}
static int
xbd_suspend(device_t dev)
{
struct xbd_softc *sc = device_get_softc(dev);
int retval;
int saved_state;
/* Prevent new requests being issued until we fix things up. */
mtx_lock(&sc->xbd_io_lock);
saved_state = sc->xbd_state;
sc->xbd_state = XBD_STATE_SUSPENDED;
/* Wait for outstanding I/O to drain. */
retval = 0;
while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) {
if (msleep(&sc->xbd_cm_q[XBD_Q_BUSY], &sc->xbd_io_lock,
PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) {
retval = EBUSY;
break;
}
}
mtx_unlock(&sc->xbd_io_lock);
if (retval != 0)
sc->xbd_state = saved_state;
return (retval);
}
static int
xbd_resume(device_t dev)
{
struct xbd_softc *sc = device_get_softc(dev);
if (xen_suspend_cancelled) {
sc->xbd_state = XBD_STATE_CONNECTED;
return (0);
}
DPRINTK("xbd_resume: %s\n", xenbus_get_node(dev));
xbd_free(sc);
xbd_initialize(sc);
return (0);
}
/**
* Callback received when the backend's state changes.
*/
static void
xbd_backend_changed(device_t dev, XenbusState backend_state)
{
struct xbd_softc *sc = device_get_softc(dev);
DPRINTK("backend_state=%d\n", backend_state);
switch (backend_state) {
case XenbusStateUnknown:
case XenbusStateInitialising:
case XenbusStateReconfigured:
case XenbusStateReconfiguring:
case XenbusStateClosed:
break;
case XenbusStateInitWait:
case XenbusStateInitialised:
xbd_initialize(sc);
break;
case XenbusStateConnected:
xbd_initialize(sc);
xbd_connect(sc);
break;
case XenbusStateClosing:
if (sc->xbd_users > 0) {
device_printf(dev, "detaching with pending users\n");
KASSERT(sc->xbd_disk != NULL,
("NULL disk with pending users\n"));
disk_gone(sc->xbd_disk);
} else {
xbd_closing(dev);
}
break;
}
}
/*---------------------------- NewBus Registration ---------------------------*/
static device_method_t xbd_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, xbd_probe),
DEVMETHOD(device_attach, xbd_attach),
DEVMETHOD(device_detach, xbd_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, xbd_suspend),
DEVMETHOD(device_resume, xbd_resume),
/* Xenbus interface */
DEVMETHOD(xenbus_otherend_changed, xbd_backend_changed),
{ 0, 0 }
};
static driver_t xbd_driver = {
"xbd",
xbd_methods,
sizeof(struct xbd_softc),
};
devclass_t xbd_devclass;
DRIVER_MODULE(xbd, xenbusb_front, xbd_driver, xbd_devclass, 0, 0);
diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c
index 4300b4468054..21d3c3e13a8f 100644
--- a/sys/fs/cd9660/cd9660_vfsops.c
+++ b/sys/fs/cd9660/cd9660_vfsops.c
@@ -1,852 +1,852 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley
* by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
* Support code is derived from software contributed to Berkeley
* by Atsushi Murai (amurai@spec.co.jp).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/cdio.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/iconv.h>
#include <fs/cd9660/iso.h>
#include <fs/cd9660/iso_rrip.h>
#include <fs/cd9660/cd9660_node.h>
#include <fs/cd9660/cd9660_mount.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure");
MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part");
struct iconv_functions *cd9660_iconv = NULL;
static vfs_mount_t cd9660_mount;
static vfs_cmount_t cd9660_cmount;
static vfs_unmount_t cd9660_unmount;
static vfs_root_t cd9660_root;
static vfs_statfs_t cd9660_statfs;
static vfs_vget_t cd9660_vget;
static vfs_fhtovp_t cd9660_fhtovp;
static struct vfsops cd9660_vfsops = {
.vfs_fhtovp = cd9660_fhtovp,
.vfs_mount = cd9660_mount,
.vfs_cmount = cd9660_cmount,
.vfs_root = cd9660_root,
.vfs_statfs = cd9660_statfs,
.vfs_unmount = cd9660_unmount,
.vfs_vget = cd9660_vget,
};
VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY);
MODULE_VERSION(cd9660, 1);
static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino);
static int iso_mountfs(struct vnode *devvp, struct mount *mp);
/*
* VFS Operations.
*/
static int
cd9660_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
struct iso_args args;
int error;
error = copyin(data, &args, sizeof args);
if (error)
return (error);
ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
ma = mount_arg(ma, "export", &args.export, sizeof(args.export));
ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64);
ma = mount_argsu(ma, "cs_local", args.cs_local, 64);
ma = mount_argf(ma, "ssector", "%u", args.ssector);
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip");
ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens");
ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt");
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet");
ma = mount_argb(ma,
args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet");
ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv");
error = kernel_mount(ma, flags);
return (error);
}
static int
cd9660_mount(struct mount *mp)
{
struct vnode *devvp;
struct thread *td;
char *fspec;
int error;
accmode_t accmode;
struct nameidata ndp;
struct iso_mnt *imp = NULL;
td = curthread;
/*
* Unconditionally mount as read-only.
*/
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
if (error)
return (error);
imp = VFSTOISOFS(mp);
if (mp->mnt_flag & MNT_UPDATE) {
if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
return (0);
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible block device.
*/
NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
if ((error = namei(&ndp)))
return (error);
NDFREE(&ndp, NDF_ONLY_PNBUF);
devvp = ndp.ni_vp;
if (!vn_isdisk_error(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* Verify that user has necessary permissions on the device,
* or has superuser abilities
*/
accmode = VREAD;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((mp->mnt_flag & MNT_UPDATE) == 0) {
error = iso_mountfs(devvp, mp);
if (error)
vrele(devvp);
} else {
if (devvp != imp->im_devvp)
error = EINVAL; /* needs translation */
vput(devvp);
}
if (error)
return (error);
vfs_mountedfrom(mp, fspec);
return (0);
}
/*
* Common code for mount and mountroot
*/
static int
iso_mountfs(struct vnode *devvp, struct mount *mp)
{
struct iso_mnt *isomp = NULL;
struct buf *bp = NULL;
struct buf *pribp = NULL, *supbp = NULL;
struct cdev *dev;
int error = EINVAL;
int high_sierra = 0;
int iso_bsize;
int iso_blknum;
int joliet_level;
int isverified = 0;
struct iso_volume_descriptor *vdp = NULL;
struct iso_primary_descriptor *pri = NULL;
struct iso_sierra_primary_descriptor *pri_sierra = NULL;
struct iso_supplementary_descriptor *sup = NULL;
struct iso_directory_record *rootp;
int logical_block_size, ssector;
struct g_consumer *cp;
struct bufobj *bo;
char *cs_local, *cs_disk;
dev = devvp->v_rdev;
dev_ref(dev);
g_topology_lock();
error = g_vfs_open(devvp, &cp, "cd9660", 0);
if (error == 0)
g_getattr("MNT::verified", cp, &isverified);
g_topology_unlock();
VOP_UNLOCK(devvp);
if (error)
goto out;
if (devvp->v_rdev->si_iosize_max != 0)
mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
- if (mp->mnt_iosize_max > MAXPHYS)
- mp->mnt_iosize_max = MAXPHYS;
+ if (mp->mnt_iosize_max > maxphys)
+ mp->mnt_iosize_max = maxphys;
bo = &devvp->v_bufobj;
/* This is the "logical sector size". The standard says this
* should be 2048 or the physical sector size on the device,
* whichever is greater.
*/
if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) {
error = EINVAL;
goto out;
}
iso_bsize = cp->provider->sectorsize;
joliet_level = 0;
if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector))
ssector = 0;
for (iso_blknum = 16 + ssector;
iso_blknum < 100 + ssector;
iso_blknum++) {
if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE),
iso_bsize, NOCRED, &bp)) != 0)
goto out;
vdp = (struct iso_volume_descriptor *)bp->b_data;
if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) {
if (bcmp (vdp->id_sierra, ISO_SIERRA_ID,
sizeof vdp->id_sierra) != 0) {
error = EINVAL;
goto out;
} else
high_sierra = 1;
}
switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){
case ISO_VD_PRIMARY:
if (pribp == NULL) {
pribp = bp;
bp = NULL;
pri = (struct iso_primary_descriptor *)vdp;
pri_sierra =
(struct iso_sierra_primary_descriptor *)vdp;
}
break;
case ISO_VD_SUPPLEMENTARY:
if (supbp == NULL) {
supbp = bp;
bp = NULL;
sup = (struct iso_supplementary_descriptor *)vdp;
if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) {
if (bcmp(sup->escape, "%/@", 3) == 0)
joliet_level = 1;
if (bcmp(sup->escape, "%/C", 3) == 0)
joliet_level = 2;
if (bcmp(sup->escape, "%/E", 3) == 0)
joliet_level = 3;
if ((isonum_711 (sup->flags) & 1) &&
!vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0))
joliet_level = 0;
}
}
break;
case ISO_VD_END:
goto vd_end;
default:
break;
}
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
}
vd_end:
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
if (pri == NULL) {
error = EINVAL;
goto out;
}
logical_block_size =
isonum_723 (high_sierra?
pri_sierra->logical_block_size:
pri->logical_block_size);
if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE
|| (logical_block_size & (logical_block_size - 1)) != 0) {
error = EINVAL;
goto out;
}
rootp = (struct iso_directory_record *)
(high_sierra?
pri_sierra->root_directory_record:
pri->root_directory_record);
isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO);
isomp->im_cp = cp;
isomp->im_bo = bo;
isomp->logical_block_size = logical_block_size;
isomp->volume_space_size =
isonum_733 (high_sierra?
pri_sierra->volume_space_size:
pri->volume_space_size);
isomp->joliet_level = 0;
/*
* Since an ISO9660 multi-session CD can also access previous
* sessions, we have to include them into the space consider-
* ations. This doesn't yield a very accurate number since
* parts of the old sessions might be inaccessible now, but we
* can't do much better. This is also important for the NFS
* filehandle validation.
*/
isomp->volume_space_size += ssector;
memcpy(isomp->root, rootp, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->im_bmask = logical_block_size - 1;
isomp->im_bshift = ffs(logical_block_size) - 1;
pribp->b_flags |= B_AGE;
brelse(pribp);
pribp = NULL;
rootp = NULL;
pri = NULL;
pri_sierra = NULL;
mp->mnt_data = isomp;
mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
mp->mnt_maxsymlinklen = 0;
MNT_ILOCK(mp);
if (isverified)
mp->mnt_flag |= MNT_VERIFIED;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED;
MNT_IUNLOCK(mp);
isomp->im_mountp = mp;
isomp->im_dev = dev;
isomp->im_devvp = devvp;
vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP);
vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS);
vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT);
vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET);
vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV);
/* Check the Rock Ridge Extension support */
if (!(isomp->im_flags & ISOFSMNT_NORRIP)) {
if ((error = bread(isomp->im_devvp, (isomp->root_extent +
isonum_711(((struct iso_directory_record *)isomp->root)->
ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT),
isomp->logical_block_size, NOCRED, &bp)) != 0)
goto out;
rootp = (struct iso_directory_record *)bp->b_data;
if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) {
isomp->im_flags |= ISOFSMNT_NORRIP;
} else {
isomp->im_flags &= ~ISOFSMNT_GENS;
}
/*
* The contents are valid,
* but they will get reread as part of another vnode, so...
*/
bp->b_flags |= B_AGE;
brelse(bp);
bp = NULL;
rootp = NULL;
}
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error);
if (error)
goto out;
cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error);
if (error)
goto out;
cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l);
cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d);
} else {
isomp->im_d2l = NULL;
isomp->im_l2d = NULL;
}
if (high_sierra) {
/* this effectively ignores all the mount flags */
if (bootverbose)
log(LOG_INFO, "cd9660: High Sierra Format\n");
isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA;
} else
switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) {
default:
isomp->iso_ftype = ISO_FTYPE_DEFAULT;
break;
case ISOFSMNT_GENS|ISOFSMNT_NORRIP:
isomp->iso_ftype = ISO_FTYPE_9660;
break;
case 0:
if (bootverbose)
log(LOG_INFO, "cd9660: RockRidge Extension\n");
isomp->iso_ftype = ISO_FTYPE_RRIP;
break;
}
/* Decide whether to use the Joliet descriptor */
if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) {
if (bootverbose)
log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n",
joliet_level);
rootp = (struct iso_directory_record *)
sup->root_directory_record;
memcpy(isomp->root, rootp, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->joliet_level = joliet_level;
supbp->b_flags |= B_AGE;
}
if (supbp) {
brelse(supbp);
supbp = NULL;
sup = NULL;
}
return 0;
out:
if (bp != NULL)
brelse(bp);
if (pribp != NULL)
brelse(pribp);
if (supbp != NULL)
brelse(supbp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (isomp) {
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
}
dev_rel(dev);
return error;
}
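/*
 * Standalone userland sketch of the volume descriptor scan performed by
 * iso_mountfs() above: read 2048-byte descriptors starting at sector 16,
 * check the ISO 9660 standard identifier ("CD001"), note a primary
 * descriptor (type 1) and stop at the terminator (type 255).  The device
 * path is an assumption.
 */
#include <sys/types.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define ISO_BLOCK	2048

int
main(void)
{
	unsigned char vd[ISO_BLOCK];
	int fd, blk;

	fd = open("/dev/cd0", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	for (blk = 16; blk < 100; blk++) {
		if (pread(fd, vd, sizeof(vd), (off_t)blk * ISO_BLOCK) !=
		    (ssize_t)sizeof(vd))
			err(1, "pread");
		if (memcmp(vd + 1, "CD001", 5) != 0)
			errx(1, "not an ISO 9660 volume");
		if (vd[0] == 1)
			printf("primary volume descriptor at block %d\n", blk);
		if (vd[0] == 255)
			break;		/* terminator, like ISO_VD_END above */
	}
	close(fd);
	return (0);
}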
/*
* unmount system call
*/
static int
cd9660_unmount(struct mount *mp, int mntflags)
{
struct iso_mnt *isomp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags, curthread)))
return (error);
isomp = VFSTOISOFS(mp);
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
if (isomp->im_d2l)
cd9660_iconv->close(isomp->im_d2l);
if (isomp->im_l2d)
cd9660_iconv->close(isomp->im_l2d);
}
g_topology_lock();
g_vfs_close(isomp->im_cp);
g_topology_unlock();
vrele(isomp->im_devvp);
dev_rel(isomp->im_dev);
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (error);
}
/*
* Return root of a filesystem
*/
static int
cd9660_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct iso_mnt *imp = VFSTOISOFS(mp);
struct iso_directory_record *dp =
(struct iso_directory_record *)imp->root;
cd_ino_t ino = isodirino(dp, imp);
/*
* With RRIP we must use the `.' entry of the root directory.
* Simply tell vget, that it's a relocated directory.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
imp->iso_ftype == ISO_FTYPE_RRIP, dp));
}
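/*
 * Standalone sketch (assumption: this mirrors the isodirino() helper used
 * by cd9660_root() above) of how a cd9660 inode number is formed: the
 * block address of the directory record's extent, shifted into a byte
 * address so it can later be split back into a block and an offset.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t
dirino(uint32_t extent, uint8_t ext_attr_len, int bshift)
{
	return ((uint64_t)(extent + ext_attr_len) << bshift);
}

int
main(void)
{
	/* extent block 20, no extended attributes, 2048-byte blocks */
	assert(dirino(20, 0, 11) == 20 * 2048);
	return (0);
}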
/*
* Get filesystem statistics.
*/
static int
cd9660_statfs(struct mount *mp, struct statfs *sbp)
{
struct iso_mnt *isomp;
isomp = VFSTOISOFS(mp);
sbp->f_bsize = isomp->logical_block_size;
sbp->f_iosize = sbp->f_bsize; /* XXX */
sbp->f_blocks = isomp->volume_space_size;
sbp->f_bfree = 0; /* total free blocks */
sbp->f_bavail = 0; /* blocks free for non superuser */
sbp->f_files = 0; /* total files */
sbp->f_ffree = 0; /* free file nodes */
return 0;
}
/*
* File handle to vnode
*
* Have to be really careful about stale file handles:
* - check that the inode number is in range
* - call iget() to get the locked inode
* - check for an unallocated inode (i_mode == 0)
* - check that the generation number matches
*/
/* ARGSUSED */
static int
cd9660_fhtovp(struct mount *mp, struct fid *fhp, int flags,
    struct vnode **vpp)
{
struct ifid ifh;
struct iso_node *ip;
struct vnode *nvp;
int error;
memcpy(&ifh, fhp, sizeof(ifh));
#ifdef ISOFS_DBG
printf("fhtovp: ino %d, start %ld\n",
ifh.ifid_ino, ifh.ifid_start);
#endif
if ((error = VFS_VGET(mp, ifh.ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = VTOI(nvp);
if (ip->inode.iso_mode == 0) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
vnode_create_vobject(*vpp, ip->i_size, curthread);
return (0);
}
/*
* Conform to standard VFS interface; can't vget arbitrary inodes beyond 4GB
* into media with current inode scheme and 32-bit ino_t. This shouldn't be
* needed for anything other than nfsd, and who exports a mounted DVD over NFS?
*/
static int
cd9660_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
/*
* XXXX
* It would be nice if we didn't always set the `relocated' flag
* and force the extra read, but I don't want to think about fixing
* that right now.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
#if 0
VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP,
#else
0,
#endif
(struct iso_directory_record *)0));
}
/* Use special comparator for full 64-bit ino comparison. */
static int
cd9660_vfs_hash_cmp(struct vnode *vp, void *pino)
{
struct iso_node *ip;
cd_ino_t ino;
ip = VTOI(vp);
ino = *(cd_ino_t *)pino;
return (ip->i_number != ino);
}
int
cd9660_vget_internal(struct mount *mp, cd_ino_t ino, int flags,
    struct vnode **vpp, int relocated, struct iso_directory_record *isodir)
{
struct iso_mnt *imp;
struct iso_node *ip;
struct buf *bp;
struct vnode *vp;
int error;
struct thread *td;
td = curthread;
error = vfs_hash_get(mp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
/*
* We must promote to an exclusive lock for vnode creation. This
* can happen if lookup is passed LOCKSHARED.
*/
if ((flags & LK_TYPE_MASK) == LK_SHARED) {
flags &= ~LK_TYPE_MASK;
flags |= LK_EXCLUSIVE;
}
/*
* We do not lock vnode creation as it is believed to be too
* expensive for such a rare case as the simultaneous creation of a
* vnode for the same ino by different processes. We just allow them
* to race and check later to decide who wins. Let the race begin!
*/
imp = VFSTOISOFS(mp);
/* Allocate a new vnode/iso_node. */
if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = malloc(sizeof(struct iso_node), M_ISOFSNODE,
M_WAITOK | M_ZERO);
vp->v_data = ip;
ip->i_vnode = vp;
ip->i_number = ino;
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
error = insmntque(vp, mp);
if (error != 0) {
free(ip, M_ISOFSNODE);
*vpp = NULLVP;
return (error);
}
error = vfs_hash_insert(vp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
if (isodir == NULL) {
int lbn, off;
lbn = lblkno(imp, ino);
if (lbn >= imp->volume_space_size) {
vput(vp);
printf("fhtovp: lbn exceeds volume space %d\n", lbn);
return (ESTALE);
}
off = blkoff(imp, ino);
if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
vput(vp);
printf("fhtovp: crosses block boundary %d\n",
off + ISO_DIRECTORY_RECORD_SIZE);
return (ESTALE);
}
error = bread(imp->im_devvp,
lbn << (imp->im_bshift - DEV_BSHIFT),
imp->logical_block_size, NOCRED, &bp);
if (error) {
vput(vp);
printf("fhtovp: bread error %d\n",error);
return (error);
}
isodir = (struct iso_directory_record *)(bp->b_data + off);
if (off + isonum_711(isodir->length) >
imp->logical_block_size) {
vput(vp);
brelse(bp);
printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
off +isonum_711(isodir->length), off,
isonum_711(isodir->length));
return (ESTALE);
}
#if 0
if (isonum_733(isodir->extent) +
isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) {
brelse(bp);
printf("fhtovp: file start miss %d vs %d\n",
isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length),
ifhp->ifid_start);
return (ESTALE);
}
#endif
} else
bp = NULL;
ip->i_mnt = imp;
if (relocated) {
/*
* On relocated directories we must
* read the `.' entry out of a dir.
*/
ip->iso_start = ino >> imp->im_bshift;
if (bp != NULL)
brelse(bp);
if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) {
vput(vp);
return (error);
}
isodir = (struct iso_directory_record *)bp->b_data;
}
ip->iso_extent = isonum_733(isodir->extent);
ip->i_size = isonum_733(isodir->size);
ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent;
/*
* Setup time stamp, attribute
*/
vp->v_type = VNON;
switch (imp->iso_ftype) {
default: /* ISO_FTYPE_9660 */
{
struct buf *bp2;
int off;
if ((imp->im_flags & ISOFSMNT_EXTATT)
&& (off = isonum_711(isodir->ext_attr_length)))
cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL,
&bp2);
else
bp2 = NULL;
cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660);
cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660);
if (bp2)
brelse(bp2);
break;
}
case ISO_FTYPE_RRIP:
cd9660_rrip_analyze(isodir, ip, imp);
break;
}
brelse(bp);
/*
* Initialize the associated vnode
*/
switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) {
case VFIFO:
vp->v_op = &cd9660_fifoops;
break;
default:
VN_LOCK_ASHARE(vp);
break;
}
if (ip->iso_extent == imp->root_extent)
vp->v_vflag |= VV_ROOT;
/*
* XXX need generation number?
*/
*vpp = vp;
return (0);
}
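/*
 * Standalone sketch of how cd9660_vget_internal() above splits an inode
 * number (the byte address of the directory record) into a logical block
 * and an offset using the im_bshift/im_bmask values computed at mount
 * time; the numbers assume 2048-byte logical blocks.
 */
#include <assert.h>
#include <stdint.h>
#include <strings.h>		/* ffs() */

int
main(void)
{
	int logical_block_size = 2048;
	int bshift = ffs(logical_block_size) - 1;	/* 11 */
	int bmask = logical_block_size - 1;		/* 0x7ff */
	uint64_t ino = 5000;				/* byte address */

	assert((ino >> bshift) == 2);	/* directory record lives in block 2 */
	assert((ino & bmask) == 904);	/* 904 bytes into that block */
	return (0);
}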
diff --git a/sys/fs/ext2fs/ext2_vfsops.c b/sys/fs/ext2fs/ext2_vfsops.c
index bfc5c074f6f0..a64e28186769 100644
--- a/sys/fs/ext2fs/ext2_vfsops.c
+++ b/sys/fs/ext2fs/ext2_vfsops.c
@@ -1,1443 +1,1443 @@
/*-
* modified for EXT2FS support in Lites 1.1
*
* Aug 1995, Godmar Back (gback@cs.utah.edu)
* University of Utah, Department of Computer Science
*/
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/mutex.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
#include <fs/ext2fs/fs.h>
#include <fs/ext2fs/ext2_mount.h>
#include <fs/ext2fs/inode.h>
#include <fs/ext2fs/ext2fs.h>
#include <fs/ext2fs/ext2_dinode.h>
#include <fs/ext2fs/ext2_extern.h>
#include <fs/ext2fs/ext2_extents.h>
SDT_PROVIDER_DECLARE(ext2fs);
/*
* ext2fs trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*");
SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int");
SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*");
static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td);
static int ext2_mountfs(struct vnode *, struct mount *);
static int ext2_reload(struct mount *mp, struct thread *td);
static int ext2_sbupdate(struct ext2mount *, int);
static int ext2_cgupdate(struct ext2mount *, int);
static vfs_unmount_t ext2_unmount;
static vfs_root_t ext2_root;
static vfs_statfs_t ext2_statfs;
static vfs_sync_t ext2_sync;
static vfs_vget_t ext2_vget;
static vfs_fhtovp_t ext2_fhtovp;
static vfs_mount_t ext2_mount;
MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part");
static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure");
static struct vfsops ext2fs_vfsops = {
.vfs_fhtovp = ext2_fhtovp,
.vfs_mount = ext2_mount,
.vfs_root = ext2_root, /* root inode via vget */
.vfs_statfs = ext2_statfs,
.vfs_sync = ext2_sync,
.vfs_unmount = ext2_unmount,
.vfs_vget = ext2_vget,
};
VFS_SET(ext2fs_vfsops, ext2fs, 0);
static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev,
int ronly);
static int ext2_compute_sb_data(struct vnode * devvp,
struct ext2fs * es, struct m_ext2fs * fs);
static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr",
"noclusterw", "noexec", "export", "force", "from", "multilabel",
"suiddir", "nosymfollow", "sync", "union", NULL };
/*
* VFS Operations.
*
* mount system call
*/
static int
ext2_mount(struct mount *mp)
{
struct vfsoptlist *opts;
struct vnode *devvp;
struct thread *td;
struct ext2mount *ump = NULL;
struct m_ext2fs *fs;
struct nameidata nd, *ndp = &nd;
accmode_t accmode;
char *path, *fspec;
int error, flags, len;
td = curthread;
opts = mp->mnt_optnew;
if (vfs_filteropt(opts, ext2_opts))
return (EINVAL);
vfs_getopt(opts, "fspath", (void **)&path, NULL);
/* Double-check the length of path.. */
if (strlen(path) >= MAXMNTLEN)
return (ENAMETOOLONG);
fspec = NULL;
error = vfs_getopt(opts, "from", (void **)&fspec, &len);
if (!error && fspec[len - 1] != '\0')
return (EINVAL);
/*
* If updating, check whether changing from read-only to
* read/write; if there is no device name, that's all we do.
*/
if (mp->mnt_flag & MNT_UPDATE) {
ump = VFSTOEXT2(mp);
fs = ump->um_e2fs;
error = 0;
if (fs->e2fs_ronly == 0 &&
vfs_flagopt(opts, "ro", NULL, 0)) {
error = VFS_SYNC(mp, MNT_WAIT);
if (error)
return (error);
flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = ext2_flushfiles(mp, flags, td);
if (error == 0 && fs->e2fs_wasvalid &&
ext2_cgupdate(ump, MNT_WAIT) == 0) {
fs->e2fs->e2fs_state =
htole16((le16toh(fs->e2fs->e2fs_state) |
E2FS_ISCLEAN));
ext2_sbupdate(ump, MNT_WAIT);
}
fs->e2fs_ronly = 1;
vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY);
g_topology_lock();
g_access(ump->um_cp, 0, -1, 0);
g_topology_unlock();
}
if (!error && (mp->mnt_flag & MNT_RELOAD))
error = ext2_reload(mp, td);
if (error)
return (error);
devvp = ump->um_devvp;
if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) {
if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0))
return (EPERM);
/*
* If upgrade to read-write by non-root, then verify
* that user has necessary permissions on the device.
*/
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(devvp, VREAD | VWRITE,
td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
VOP_UNLOCK(devvp);
return (error);
}
VOP_UNLOCK(devvp);
g_topology_lock();
error = g_access(ump->um_cp, 0, 1, 0);
g_topology_unlock();
if (error)
return (error);
if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 ||
(le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) {
if (mp->mnt_flag & MNT_FORCE) {
printf(
"WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt);
} else {
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->e2fs_fsmnt);
return (EPERM);
}
}
fs->e2fs->e2fs_state =
htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
(void)ext2_cgupdate(ump, MNT_WAIT);
fs->e2fs_ronly = 0;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
}
if (vfs_flagopt(opts, "export", NULL, 0)) {
/* Process export requests in vfs_mount.c. */
return (error);
}
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible disk device.
*/
if (fspec == NULL)
return (EINVAL);
NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
if ((error = namei(ndp)) != 0)
return (error);
NDFREE(ndp, NDF_ONLY_PNBUF);
devvp = ndp->ni_vp;
if (!vn_isdisk_error(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*
* XXXRW: VOP_ACCESS() enough?
*/
accmode = VREAD;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
accmode |= VWRITE;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((mp->mnt_flag & MNT_UPDATE) == 0) {
error = ext2_mountfs(devvp, mp);
} else {
if (devvp != ump->um_devvp) {
vput(devvp);
return (EINVAL); /* needs translation */
} else
vput(devvp);
}
if (error) {
vrele(devvp);
return (error);
}
ump = VFSTOEXT2(mp);
fs = ump->um_e2fs;
/*
* Note that this strncpy() is ok because of a check at the start
* of ext2_mount().
*/
strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN);
fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0';
vfs_mountedfrom(mp, fspec);
return (0);
}
static int
ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly)
{
uint32_t i, mask;
if (le16toh(es->e2fs_magic) != E2FS_MAGIC) {
printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n",
devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC);
return (1);
}
if (le32toh(es->e2fs_rev) > E2FS_REV0) {
mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP);
if (mask) {
printf("WARNING: mount of %s denied due to "
"unsupported optional features:\n", devtoname(dev));
for (i = 0;
i < sizeof(incompat)/sizeof(struct ext2_feature);
i++)
if (mask & incompat[i].mask)
printf("%s ", incompat[i].name);
printf("\n");
return (1);
}
mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP;
if (!ronly && mask) {
printf("WARNING: R/W mount of %s denied due to "
"unsupported optional features:\n", devtoname(dev));
for (i = 0;
i < sizeof(ro_compat)/sizeof(struct ext2_feature);
i++)
if (mask & ro_compat[i].mask)
printf("%s ", ro_compat[i].name);
printf("\n");
return (1);
}
}
return (0);
}
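/*
 * Standalone sketch of the policy implemented by ext2_check_sb_compat()
 * above: an unknown "incompat" feature bit makes the filesystem
 * unmountable, while an unknown "ro_compat" bit only forbids a read/write
 * mount.  The supported-feature masks below are illustrative values, not
 * the kernel's tables.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SUPP_INCOMPAT	0x0002u		/* assumed: filetype */
#define SUPP_ROCOMPAT	0x0001u		/* assumed: sparse superblock */

static bool
can_mount(uint32_t incompat, uint32_t rocompat, bool ronly)
{
	if (incompat & ~SUPP_INCOMPAT)
		return (false);		/* refuse outright */
	if (!ronly && (rocompat & ~SUPP_ROCOMPAT))
		return (false);		/* a read-only mount is still allowed */
	return (true);
}

int
main(void)
{
	printf("%d\n", can_mount(0x0002, 0x0008, true));	/* 1 */
	printf("%d\n", can_mount(0x0002, 0x0008, false));	/* 0 */
	return (0);
}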
static e4fs_daddr_t
ext2_cg_location(struct m_ext2fs *fs, int number)
{
int cg, descpb, logical_sb, has_super = 0;
/*
* Adjust logical superblock block number.
* Godmar thinks: if the blocksize is greater than 1024, then
* the superblock is logically part of block zero.
*/
logical_sb = fs->e2fs_bsize > SBSIZE ? 0 : 1;
if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) ||
number < le32toh(fs->e2fs->e3fs_first_meta_bg))
return (logical_sb + number + 1);
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT))
descpb = fs->e2fs_bsize / sizeof(struct ext2_gd);
else
descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
cg = descpb * number;
if (ext2_cg_has_sb(fs, cg))
has_super = 1;
return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) +
le32toh(fs->e2fs->e2fs_first_dblock));
}
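/*
 * Worked sketch of the common (non-META_BG) case in ext2_cg_location()
 * above: with 1024-byte blocks the superblock occupies logical block 1,
 * so group descriptor block N lives at block N + 2; with larger blocks
 * the superblock shares block 0 and descriptor blocks start at block 1.
 * The 1024 threshold stands in for SBSIZE here.
 */
#include <assert.h>

static int
gd_block(int bsize, int number)
{
	int logical_sb = (bsize > 1024) ? 0 : 1;

	return (logical_sb + number + 1);
}

int
main(void)
{
	assert(gd_block(1024, 0) == 2);	/* 1 KB blocks: sb in block 1 */
	assert(gd_block(4096, 0) == 1);	/* 4 KB blocks: sb shares block 0 */
	return (0);
}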
static int
ext2_cg_validate(struct m_ext2fs *fs)
{
uint64_t b_bitmap;
uint64_t i_bitmap;
uint64_t i_tables;
uint64_t first_block, last_block, last_cg_block;
struct ext2_gd *gd;
unsigned int i, cg_count;
first_block = le32toh(fs->e2fs->e2fs_first_dblock);
last_cg_block = ext2_cg_number_gdb(fs, 0);
cg_count = fs->e2fs_gcount;
for (i = 0; i < fs->e2fs_gcount; i++) {
gd = &fs->e2fs_gd[i];
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) ||
i == fs->e2fs_gcount - 1) {
last_block = fs->e2fs_bcount - 1;
} else {
last_block = first_block +
(EXT2_BLOCKS_PER_GROUP(fs) - 1);
}
if ((cg_count == fs->e2fs_gcount) &&
!(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED))
cg_count = i;
b_bitmap = e2fs_gd_get_b_bitmap(gd);
if (b_bitmap == 0) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"block bitmap is zero", i);
return (EINVAL);
}
if (b_bitmap <= last_cg_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"block bitmap overlaps gds", i);
return (EINVAL);
}
if (b_bitmap < first_block || b_bitmap > last_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"block bitmap not in group", i);
return (EINVAL);
}
i_bitmap = e2fs_gd_get_i_bitmap(gd);
if (i_bitmap == 0) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode bitmap is zero", i);
return (EINVAL);
}
if (i_bitmap <= last_cg_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode bitmap overlaps gds", i);
return (EINVAL);
}
if (i_bitmap < first_block || i_bitmap > last_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode bitmap not in group blk", i);
return (EINVAL);
}
i_tables = e2fs_gd_get_i_tables(gd);
if (i_tables == 0) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode table is zero", i);
return (EINVAL);
}
if (i_tables <= last_cg_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode tables overlap gds", i);
return (EINVAL);
}
if (i_tables < first_block ||
i_tables + fs->e2fs_itpg - 1 > last_block) {
SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
"inode tables not in group blk", i);
return (EINVAL);
}
if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG))
first_block += EXT2_BLOCKS_PER_GROUP(fs);
}
return (0);
}
/*
* This computes the fields of the m_ext2fs structure from the
* data in the ext2fs structure read in.
*/
static int
ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es,
struct m_ext2fs *fs)
{
struct buf *bp;
uint32_t e2fs_descpb, e2fs_gdbcount_alloc;
int i, j;
int g_count = 0;
int error;
/* Check checksum features */
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) &&
EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"incorrect checksum features combination");
return (EINVAL);
}
/* Precompute checksum seed for all metadata */
ext2_sb_csum_set_seed(fs);
/* Verify sb csum if possible */
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
error = ext2_sb_csum_verify(fs);
if (error) {
return (error);
}
}
/* Check for block size = 1K|2K|4K */
if (le32toh(es->e2fs_log_bsize) > 2) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"bad block size");
return (EINVAL);
}
fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize);
fs->e2fs_bsize = 1U << fs->e2fs_bshift;
fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1;
fs->e2fs_qbmask = fs->e2fs_bsize - 1;
/* Check for fragment size */
if (le32toh(es->e2fs_log_fsize) >
(EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid log cluster size");
return (EINVAL);
}
fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize);
if (fs->e2fs_fsize != fs->e2fs_bsize) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"fragment size != block size");
return (EINVAL);
}
fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize;
/* Check reserved gdt blocks for future filesystem expansion */
if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"number of reserved GDT blocks too large");
return (EINVAL);
}
if (le32toh(es->e2fs_rev) == E2FS_REV0) {
fs->e2fs_isize = E2FS_REV0_INODE_SIZE;
} else {
fs->e2fs_isize = le16toh(es->e2fs_inode_size);
/*
* Check first ino.
*/
if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid first ino");
return (EINVAL);
}
/*
* Simple sanity check for superblock inode size value.
*/
if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE ||
EXT2_INODE_SIZE(fs) > fs->e2fs_bsize ||
(fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid inode size");
return (EINVAL);
}
}
/* Check group descriptors */
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) &&
le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"unsupported 64bit descriptor size");
return (EINVAL);
}
fs->e2fs_bpg = le32toh(es->e2fs_bpg);
fs->e2fs_fpg = le32toh(es->e2fs_fpg);
if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"zero blocks/fragments per group");
return (EINVAL);
} else if (fs->e2fs_bpg != fs->e2fs_fpg) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"blocks per group not equal fragments per group");
return (EINVAL);
}
if (fs->e2fs_bpg != fs->e2fs_bsize * 8) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"non-standard group size unsupported");
return (EINVAL);
}
fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs);
if (fs->e2fs_ipb == 0 ||
fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"bad inodes per block size");
return (EINVAL);
}
fs->e2fs_ipg = le32toh(es->e2fs_ipg);
if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg > fs->e2fs_bsize * 8) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid inodes per group");
return (EINVAL);
}
fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb;
fs->e2fs_bcount = le32toh(es->e2fs_bcount);
fs->e2fs_rbcount = le32toh(es->e2fs_rbcount);
fs->e2fs_fbcount = le32toh(es->e2fs_fbcount);
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32;
fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32;
fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32;
}
if (fs->e2fs_rbcount > fs->e2fs_bcount ||
fs->e2fs_fbcount > fs->e2fs_bcount) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid block count");
return (EINVAL);
}
fs->e2fs_ficount = le32toh(es->e2fs_ficount);
if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"invalid number of free inodes");
return (EINVAL);
}
if (le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"first data block out of range");
return (EINVAL);
}
fs->e2fs_gcount = howmany(fs->e2fs_bcount -
le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs));
if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"groups count too large");
return (EINVAL);
}
/* Check for extra isize in big inodes. */
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) &&
EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) {
SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
"no space for extra inode timestamps");
return (EINVAL);
}
/* s_resuid / s_resgid ? */
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE;
e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb);
} else {
e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount,
fs->e2fs_bsize / sizeof(struct ext2_gd));
}
fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb);
fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize,
M_EXT2MNT, M_WAITOK | M_ZERO);
fs->e2fs_contigdirs = malloc(fs->e2fs_gcount *
sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO);
for (i = 0; i < fs->e2fs_gdbcount; i++) {
error = bread(devvp,
fsbtodb(fs, ext2_cg_location(fs, i)),
fs->e2fs_bsize, NOCRED, &bp);
if (error) {
/*
* fs->e2fs_gd and fs->e2fs_contigdirs
* will be freed later by the caller,
* because this function could be called from
* MNT_UPDATE path.
*/
return (error);
}
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
memcpy(&fs->e2fs_gd[
i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
bp->b_data, fs->e2fs_bsize);
} else {
for (j = 0; j < e2fs_descpb &&
g_count < fs->e2fs_gcount; j++, g_count++)
memcpy(&fs->e2fs_gd[g_count],
bp->b_data + j * E2FS_REV0_GD_SIZE,
E2FS_REV0_GD_SIZE);
}
brelse(bp);
bp = NULL;
}
/* Validate cgs consistency */
error = ext2_cg_validate(fs);
if (error)
return (error);
/* Verify cgs csum */
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
error = ext2_gd_csum_verify(fs, devvp->v_rdev);
if (error)
return (error);
}
/* Initialization for the ext2 Orlov allocator variant. */
fs->e2fs_total_dir = 0;
for (i = 0; i < fs->e2fs_gcount; i++)
fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]);
if (le32toh(es->e2fs_rev) == E2FS_REV0 ||
!EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE))
fs->e2fs_maxfilesize = 0x7fffffff;
else {
fs->e2fs_maxfilesize = 0xffffffffffff;
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE))
fs->e2fs_maxfilesize = 0x7fffffffffffffff;
}
if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) {
fs->e2fs_uhash = 3;
} else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) {
#ifdef __CHAR_UNSIGNED__
es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH);
fs->e2fs_uhash = 3;
#else
es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH);
#endif
}
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
error = ext2_sb_csum_verify(fs);
return (error);
}
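/*
 * Standalone sketch of the size fields derived near the top of
 * ext2_compute_sb_data() above: the on-disk superblock stores
 * log2(block size) - 10, so bshift = 10 + log_bsize, bsize = 1 << bshift,
 * and the fs-block to 512-byte-device-block conversion is a shift of
 * log_bsize + 1.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t log_bsize = 2;			/* 4096-byte blocks */
	int bshift = 10 + log_bsize;		/* EXT2_MIN_BLOCK_LOG_SIZE is 10 */
	uint32_t bsize = 1U << bshift;
	int fsbtodb_shift = log_bsize + 1;

	assert(bsize == 4096);
	assert((5 << fsbtodb_shift) == 40);	/* fs block 5 -> device block 40 */
	return (0);
}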
/*
* Reload all incore data for a filesystem (used after running fsck on
* the root filesystem and finding things to fix). The filesystem must
* be mounted read-only.
*
* Things to do to update the mount:
* 1) invalidate all cached meta-data.
* 2) re-read superblock from disk.
* 3) invalidate all cluster summary information.
* 4) invalidate all inactive vnodes.
* 5) invalidate all cached file data.
* 6) re-read inode data for all active vnodes.
* XXX we are missing some steps, in particular #3; this has to be reviewed.
*/
static int
ext2_reload(struct mount *mp, struct thread *td)
{
struct vnode *vp, *mvp, *devvp;
struct inode *ip;
struct buf *bp;
struct ext2fs *es;
struct m_ext2fs *fs;
struct csum *sump;
int error, i;
int32_t *lp;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
return (EINVAL);
/*
* Step 1: invalidate all cached meta-data.
*/
devvp = VFSTOEXT2(mp)->um_devvp;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
if (vinvalbuf(devvp, 0, 0, 0) != 0)
panic("ext2_reload: dirty1");
VOP_UNLOCK(devvp);
/*
* Step 2: re-read superblock from disk.
* constants have been adjusted for ext2
*/
if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
return (error);
es = (struct ext2fs *)bp->b_data;
if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) {
brelse(bp);
return (EIO); /* XXX needs translation */
}
fs = VFSTOEXT2(mp)->um_e2fs;
bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs));
if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) {
brelse(bp);
return (error);
}
#ifdef UNKLAR
if (fs->fs_sbsize < SBSIZE)
bp->b_flags |= B_INVAL;
#endif
brelse(bp);
/*
* Step 3: invalidate all cluster summary information.
*/
if (fs->e2fs_contigsumsize > 0) {
lp = fs->e2fs_maxcluster;
sump = fs->e2fs_clustersum;
for (i = 0; i < fs->e2fs_gcount; i++, sump++) {
*lp++ = fs->e2fs_contigsumsize;
sump->cs_init = 0;
bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1);
}
}
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
/*
* Step 4: invalidate all cached file data.
*/
if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
if (vinvalbuf(vp, 0, 0, 0))
panic("ext2_reload: dirty2");
/*
* Step 5: re-read inode data for all active vnodes.
*/
ip = VTOI(vp);
error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->e2fs_bsize, NOCRED, &bp);
if (error) {
VOP_UNLOCK(vp);
vrele(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip);
brelse(bp);
VOP_UNLOCK(vp);
vrele(vp);
if (error) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
}
return (0);
}
/*
* Common code for mount and mountroot.
*/
static int
ext2_mountfs(struct vnode *devvp, struct mount *mp)
{
struct ext2mount *ump;
struct buf *bp;
struct m_ext2fs *fs;
struct ext2fs *es;
struct cdev *dev = devvp->v_rdev;
struct g_consumer *cp;
struct bufobj *bo;
struct csum *sump;
int error;
int ronly;
int i;
u_long size;
int32_t *lp;
int32_t e2fs_maxcontig;
ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0);
/* XXX: use VOP_ACCESS to check FS perms */
g_topology_lock();
error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1);
g_topology_unlock();
VOP_UNLOCK(devvp);
if (error)
return (error);
/* XXX: should we check for some sectorsize or 512 instead? */
if (((SBSIZE % cp->provider->sectorsize) != 0) ||
(SBSIZE < cp->provider->sectorsize)) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
return (EINVAL);
}
bo = &devvp->v_bufobj;
bo->bo_private = cp;
bo->bo_ops = g_vfs_bufops;
if (devvp->v_rdev->si_iosize_max != 0)
mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
- if (mp->mnt_iosize_max > MAXPHYS)
- mp->mnt_iosize_max = MAXPHYS;
+ if (mp->mnt_iosize_max > maxphys)
+ mp->mnt_iosize_max = maxphys;
bp = NULL;
ump = NULL;
if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
goto out;
es = (struct ext2fs *)bp->b_data;
if (ext2_check_sb_compat(es, dev, ronly) != 0) {
error = EINVAL; /* XXX needs translation */
goto out;
}
if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 ||
(le16toh(es->e2fs_state) & E2FS_ERRORS)) {
if (ronly || (mp->mnt_flag & MNT_FORCE)) {
printf(
"WARNING: Filesystem was not properly dismounted\n");
} else {
printf(
"WARNING: R/W mount denied. Filesystem is not clean - run fsck\n");
error = EPERM;
goto out;
}
}
ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO);
/*
* I don't know whether this is the right strategy. Note that
* we dynamically allocate both an m_ext2fs and an ext2fs
* while Linux keeps the super block in a locked buffer.
*/
ump->um_e2fs = malloc(sizeof(struct m_ext2fs),
M_EXT2MNT, M_WAITOK | M_ZERO);
ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs),
M_EXT2MNT, M_WAITOK);
mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF);
bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs));
if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs)))
goto out;
/*
* Calculate the maximum contiguous blocks and size of cluster summary
* array. In FFS this is done by newfs; however, the superblock
* in ext2fs doesn't have these variables, so we can calculate
* them here.
*/
- e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize);
+ e2fs_maxcontig = MAX(1, maxphys / ump->um_e2fs->e2fs_bsize);
ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG);
if (ump->um_e2fs->e2fs_contigsumsize > 0) {
size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t);
ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK);
size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum);
ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK);
lp = ump->um_e2fs->e2fs_maxcluster;
sump = ump->um_e2fs->e2fs_clustersum;
for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) {
*lp++ = ump->um_e2fs->e2fs_contigsumsize;
sump->cs_init = 0;
sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) *
sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO);
}
}
brelse(bp);
bp = NULL;
fs = ump->um_e2fs;
fs->e2fs_ronly = ronly; /* ronly is set according to mnt_flags */
/*
* If the fs is not mounted read-only, make sure the super block is
* always written back on a sync().
*/
fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0;
if (ronly == 0) {
fs->e2fs_fmod = 1; /* mark it modified and set fs invalid */
fs->e2fs->e2fs_state =
htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
}
mp->mnt_data = ump;
mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
MNT_IUNLOCK(mp);
ump->um_mountp = mp;
ump->um_dev = dev;
ump->um_devvp = devvp;
ump->um_bo = &devvp->v_bufobj;
ump->um_cp = cp;
/*
* Setting these two parameters allows us to use
* ufs_bmap without changes!
*/
ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs);
ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1;
ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs);
if (ronly == 0)
ext2_sbupdate(ump, MNT_WAIT);
/*
* Initialize filesystem stat information in mount struct.
*/
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
MNTK_USES_BCACHE;
MNT_IUNLOCK(mp);
return (0);
out:
if (bp)
brelse(bp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (ump) {
mtx_destroy(EXT2_MTX(ump));
free(ump->um_e2fs->e2fs_gd, M_EXT2MNT);
free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT);
free(ump->um_e2fs->e2fs, M_EXT2MNT);
free(ump->um_e2fs, M_EXT2MNT);
free(ump, M_EXT2MNT);
mp->mnt_data = NULL;
}
return (error);
}
/*
* Unmount system call.
*/
static int
ext2_unmount(struct mount *mp, int mntflags)
{
struct ext2mount *ump;
struct m_ext2fs *fs;
struct csum *sump;
int error, flags, i, ronly;
flags = 0;
if (mntflags & MNT_FORCE) {
if (mp->mnt_flag & MNT_ROOTFS)
return (EINVAL);
flags |= FORCECLOSE;
}
if ((error = ext2_flushfiles(mp, flags, curthread)) != 0)
return (error);
ump = VFSTOEXT2(mp);
fs = ump->um_e2fs;
ronly = fs->e2fs_ronly;
if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) {
if (fs->e2fs_wasvalid)
fs->e2fs->e2fs_state =
htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN);
ext2_sbupdate(ump, MNT_WAIT);
}
g_topology_lock();
g_vfs_close(ump->um_cp);
g_topology_unlock();
vrele(ump->um_devvp);
sump = fs->e2fs_clustersum;
for (i = 0; i < fs->e2fs_gcount; i++, sump++)
free(sump->cs_sum, M_EXT2MNT);
free(fs->e2fs_clustersum, M_EXT2MNT);
free(fs->e2fs_maxcluster, M_EXT2MNT);
free(fs->e2fs_gd, M_EXT2MNT);
free(fs->e2fs_contigdirs, M_EXT2MNT);
free(fs->e2fs, M_EXT2MNT);
free(fs, M_EXT2MNT);
free(ump, M_EXT2MNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (error);
}
/*
* Flush out all the files in a filesystem.
*/
static int
ext2_flushfiles(struct mount *mp, int flags, struct thread *td)
{
int error;
error = vflush(mp, 0, flags, td);
return (error);
}
/*
* Get filesystem statistics.
*/
int
ext2_statfs(struct mount *mp, struct statfs *sbp)
{
struct ext2mount *ump;
struct m_ext2fs *fs;
uint32_t overhead, overhead_per_group, ngdb;
int i, ngroups;
ump = VFSTOEXT2(mp);
fs = ump->um_e2fs;
if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC)
panic("ext2_statfs");
/*
* Compute the overhead (FS structures)
*/
overhead_per_group =
1 /* block bitmap */ +
1 /* inode bitmap */ +
fs->e2fs_itpg;
overhead = le32toh(fs->e2fs->e2fs_first_dblock) +
fs->e2fs_gcount * overhead_per_group;
if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) {
for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) {
if (ext2_cg_has_sb(fs, i))
ngroups++;
}
} else {
ngroups = fs->e2fs_gcount;
}
ngdb = fs->e2fs_gdbcount;
if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE)
ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb);
overhead += ngroups * (1 /* superblock */ + ngdb);
sbp->f_bsize = EXT2_FRAG_SIZE(fs);
sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
sbp->f_blocks = fs->e2fs_bcount - overhead;
sbp->f_bfree = fs->e2fs_fbcount;
sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount;
sbp->f_files = le32toh(fs->e2fs->e2fs_icount);
sbp->f_ffree = fs->e2fs_ficount;
return (0);
}
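/*
 * Worked example (made-up geometry) of the overhead computation in
 * ext2_statfs() above: each group pays for its block bitmap, inode bitmap
 * and inode table, and groups carrying a superblock backup also pay for
 * the superblock plus the group descriptor blocks.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t first_dblock = 1;	/* 1024-byte blocks */
	uint32_t gcount = 8, itpg = 214, ngdb = 1;
	uint32_t groups_with_sb = 4;	/* e.g. with sparse_super enabled */

	uint32_t per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + itpg;
	uint32_t overhead = first_dblock + gcount * per_group +
	    groups_with_sb * (1 /* superblock */ + ngdb);

	printf("overhead = %u blocks\n", overhead);	/* 1 + 8*216 + 4*2 = 1737 */
	return (0);
}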
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
* initiate the writing of the super block if it has been modified.
*
* Note: we are always called with the filesystem marked `MPBUSY'.
*/
static int
ext2_sync(struct mount *mp, int waitfor)
{
struct vnode *mvp, *vp;
struct thread *td;
struct inode *ip;
struct ext2mount *ump = VFSTOEXT2(mp);
struct m_ext2fs *fs;
int error, allerror = 0;
td = curthread;
fs = ump->um_e2fs;
if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */
panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt);
}
/*
* Write back each (modified) inode.
*/
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
if (vp->v_type == VNON) {
VI_UNLOCK(vp);
continue;
}
ip = VTOI(vp);
if ((ip->i_flag &
(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
(vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
waitfor == MNT_LAZY)) {
VI_UNLOCK(vp);
continue;
}
error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK);
if (error) {
if (error == ENOENT) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
continue;
}
if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
allerror = error;
VOP_UNLOCK(vp);
vrele(vp);
}
/*
* Force stale filesystem control information to be flushed.
*/
if (waitfor != MNT_LAZY) {
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
allerror = error;
VOP_UNLOCK(ump->um_devvp);
}
/*
* Write back modified superblock.
*/
if (fs->e2fs_fmod != 0) {
fs->e2fs_fmod = 0;
fs->e2fs->e2fs_wtime = htole32(time_second);
if ((error = ext2_cgupdate(ump, waitfor)) != 0)
allerror = error;
}
return (allerror);
}
/*
* Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
* in from disk. If it is in core, wait for the lock bit to clear, then
* return the inode locked. Detection and handling of mount points must be
* done by the calling routine.
*/
static int
ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
struct m_ext2fs *fs;
struct inode *ip;
struct ext2mount *ump;
struct buf *bp;
struct vnode *vp;
struct thread *td;
unsigned int i, used_blocks;
int error;
td = curthread;
error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
if (error || *vpp != NULL)
return (error);
ump = VFSTOEXT2(mp);
ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO);
/* Allocate a new vnode/inode. */
if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) {
*vpp = NULL;
free(ip, M_EXT2NODE);
return (error);
}
vp->v_data = ip;
ip->i_vnode = vp;
ip->i_e2fs = fs = ump->um_e2fs;
ip->i_ump = ump;
ip->i_number = ino;
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
error = insmntque(vp, mp);
if (error != 0) {
free(ip, M_EXT2NODE);
*vpp = NULL;
return (error);
}
error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
if (error || *vpp != NULL)
return (error);
/* Read in the disk contents for the inode, copy into the inode. */
if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
(int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
/*
* The inode does not contain anything useful, so it would
* be misleading to leave it on its hash chain. With mode
* still zero, it will be unlinked and returned to the free
* list by vput().
*/
brelse(bp);
vput(vp);
*vpp = NULL;
return (error);
}
/* convert ext2 inode to dinode */
error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip);
if (error) {
brelse(bp);
vput(vp);
*vpp = NULL;
return (error);
}
ip->i_block_group = ino_to_cg(fs, ino);
ip->i_next_alloc_block = 0;
ip->i_next_alloc_goal = 0;
/*
* Now we want to make sure that the block pointers for unused
* blocks are zeroed out, since ext2_balloc depends on this
* (for regular files and directories only).
*
* If IN_E4EXTENTS is enabled, unused blocks are not zeroed
* out because we could corrupt the extent tree.
*/
if (!(ip->i_flag & IN_E4EXTENTS) &&
(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) {
used_blocks = howmany(ip->i_size, fs->e2fs_bsize);
for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++)
ip->i_db[i] = 0;
}
#ifdef EXT2FS_PRINT_EXTENTS
ext2_print_inode(ip);
ext4_ext_print_extent_tree_status(ip);
#endif
bqrelse(bp);
/*
* Initialize the vnode from the inode, check for aliases.
* Note that the underlying vnode may have changed.
*/
if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) {
vput(vp);
*vpp = NULL;
return (error);
}
/*
* Finish inode initialization.
*/
*vpp = vp;
return (0);
}
/*
* File handle to vnode
*
* Have to be really careful about stale file handles:
* - check that the inode number is valid
* - call ext2_vget() to get the locked inode
* - check for an unallocated inode (i_mode == 0)
* - check that the given client host has export rights and return
* those rights via exflagsp and credanonp
*/
static int
ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
{
struct inode *ip;
struct ufid *ufhp;
struct vnode *nvp;
struct m_ext2fs *fs;
int error;
ufhp = (struct ufid *)fhp;
fs = VFSTOEXT2(mp)->um_e2fs;
if (ufhp->ufid_ino < EXT2_ROOTINO ||
ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg)
return (ESTALE);
error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
if (error) {
*vpp = NULLVP;
return (error);
}
ip = VTOI(nvp);
if (ip->i_mode == 0 ||
ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
vnode_create_vobject(*vpp, 0, curthread);
return (0);
}
/*
* Write a superblock and associated information back to disk.
*/
static int
ext2_sbupdate(struct ext2mount *mp, int waitfor)
{
struct m_ext2fs *fs = mp->um_e2fs;
struct ext2fs *es = fs->e2fs;
struct buf *bp;
int error = 0;
es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff);
es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff);
es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff);
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32);
es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32);
es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32);
}
es->e2fs_ficount = htole32(fs->e2fs_ficount);
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
ext2_sb_csum_set(fs);
bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0);
bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs));
if (waitfor == MNT_WAIT)
error = bwrite(bp);
else
bawrite(bp);
/*
* The buffers for group descriptors, inode bitmaps and block bitmaps
* are not busy at this point and are (hopefully) written by the
* usual sync mechanism. No need to write them here.
*/
return (error);
}
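/*
 * Standalone sketch of the lo/hi split used in ext2_sbupdate() above when
 * the 64BIT feature is present: a 64-bit block count is stored as a
 * 32-bit low word plus a 32-bit high word, and reassembled the same way
 * at mount time in ext2_compute_sb_data().
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t bcount = 0x123456789abcULL;
	uint32_t lo = bcount & 0xffffffff;
	uint32_t hi = bcount >> 32;
	uint64_t again = ((uint64_t)hi << 32) | lo;

	assert(again == bcount);
	return (0);
}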
int
ext2_cgupdate(struct ext2mount *mp, int waitfor)
{
struct m_ext2fs *fs = mp->um_e2fs;
struct buf *bp;
int i, j, g_count = 0, error = 0, allerror = 0;
allerror = ext2_sbupdate(mp, waitfor);
/* Update gd csums */
if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
ext2_gd_csum_set(fs);
for (i = 0; i < fs->e2fs_gdbcount; i++) {
bp = getblk(mp->um_devvp, fsbtodb(fs,
ext2_cg_location(fs, i)),
fs->e2fs_bsize, 0, 0, 0);
if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
memcpy(bp->b_data, &fs->e2fs_gd[
i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
fs->e2fs_bsize);
} else {
for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE &&
g_count < fs->e2fs_gcount; j++, g_count++)
memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE,
&fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE);
}
if (waitfor == MNT_WAIT)
error = bwrite(bp);
else
bawrite(bp);
}
if (!allerror && error)
allerror = error;
return (allerror);
}
/*
* Return the root of a filesystem.
*/
static int
ext2_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct vnode *nvp;
int error;
error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp);
if (error)
return (error);
*vpp = nvp;
return (0);
}
diff --git a/sys/fs/fuse/fuse_vfsops.c b/sys/fs/fuse/fuse_vfsops.c
index 6cfdb6c3d801..04d273127ade 100644
--- a/sys/fs/fuse/fuse_vfsops.c
+++ b/sys/fs/fuse/fuse_vfsops.c
@@ -1,699 +1,699 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2007-2009 Google Inc. and Amit Singh
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
* Copyright (c) 2019 The FreeBSD Foundation
*
* Portions of this software were developed by BFF Storage Systems, LLC under
* sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/filedesc.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/fcntl.h>
#include "fuse.h"
#include "fuse_node.h"
#include "fuse_ipc.h"
#include "fuse_internal.h"
#include <sys/priv.h>
#include <security/mac/mac_framework.h>
SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*");
/* This will do for privilege types for now */
#ifndef PRIV_VFS_FUSE_ALLOWOTHER
#define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER
#endif
#ifndef PRIV_VFS_FUSE_MOUNT_NONUSER
#define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER
#endif
#ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT
#define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER
#endif
static vfs_fhtovp_t fuse_vfsop_fhtovp;
static vfs_mount_t fuse_vfsop_mount;
static vfs_unmount_t fuse_vfsop_unmount;
static vfs_root_t fuse_vfsop_root;
static vfs_statfs_t fuse_vfsop_statfs;
static vfs_vget_t fuse_vfsop_vget;
struct vfsops fuse_vfsops = {
.vfs_fhtovp = fuse_vfsop_fhtovp,
.vfs_mount = fuse_vfsop_mount,
.vfs_unmount = fuse_vfsop_unmount,
.vfs_root = fuse_vfsop_root,
.vfs_statfs = fuse_vfsop_statfs,
.vfs_vget = fuse_vfsop_vget,
};
static int fuse_enforce_dev_perms = 0;
SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW,
&fuse_enforce_dev_perms, 0,
"enforce fuse device permissions for secondary mounts");
MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer");
static int
fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp)
{
struct nameidata nd, *ndp = &nd;
struct vnode *devvp;
struct cdev *fdev;
int err;
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible disk device.
*/
NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td);
if ((err = namei(ndp)) != 0)
return err;
NDFREE(ndp, NDF_ONLY_PNBUF);
devvp = ndp->ni_vp;
if (devvp->v_type != VCHR) {
vrele(devvp);
return ENXIO;
}
fdev = devvp->v_rdev;
dev_ref(fdev);
if (fuse_enforce_dev_perms) {
/*
* Check if mounter can open the fuse device.
*
* This has significance only if we are doing a secondary mount
* which doesn't involve actually opening fuse devices, but we
* still want to enforce the permissions of the device (in
* order to keep control over the circle of fuse users).
*
* (In case of primary mounts, we are either the superuser so
* we can do anything anyway, or we can mount only if the
* device is already opened by us, i.e., we are permitted to open
* the device.)
*/
#if 0
#ifdef MAC
err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE);
if (!err)
#endif
#endif /* 0 */
err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td);
if (err) {
vrele(devvp);
dev_rel(fdev);
return err;
}
}
/*
* According to the coda code, no extra lock is needed here,
* although in sys/vnode.h this field is marked "v".
*/
vrele(devvp);
if (!fdev->si_devsw ||
strcmp("fuse", fdev->si_devsw->d_name)) {
dev_rel(fdev);
return ENXIO;
}
*fdevp = fdev;
return 0;
}
#define FUSE_FLAGOPT(fnam, fval) do { \
vfs_flagopt(opts, #fnam, &mntopts, fval); \
vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \
} while (0)
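/*
 * For example, FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY) expands
 * (roughly) to
 *
 *	do {
 *		vfs_flagopt(opts, "allow_other", &mntopts,
 *		    FSESS_DAEMON_CAN_SPY);
 *		vfs_flagopt(opts, "__allow_other", &__mntopts,
 *		    FSESS_DAEMON_CAN_SPY);
 *	} while (0);
 *
 * so the plain option name accumulates into mntopts, while the
 * "__"-prefixed variant set by the mount program itself accumulates
 * separately into __mntopts.
 */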
SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t");
SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*",
"struct mount*", "int");
static int
fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts,
uint32_t max_read, int daemon_timeout)
{
int err = 0;
struct fuse_data *data = fuse_get_mpdata(mp);
/* Don't allow these options to be changed */
const static unsigned long long cant_update_opts =
MNT_USER; /* Mount owner must be the user running the daemon */
FUSE_LOCK();
if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) {
err = EOPNOTSUPP;
SDT_PROBE4(fusefs, , vfsops, mount_err,
"Can't change these mount options during remount",
data, mp, err);
goto out;
}
if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) ||
(data->max_read != max_read) ||
(data->daemon_timeout != daemon_timeout)) {
// TODO: allow changing options where it makes sense
err = EOPNOTSUPP;
SDT_PROBE4(fusefs, , vfsops, mount_err,
"Can't change fuse mount options during remount",
data, mp, err);
goto out;
}
if (fdata_get_dead(data)) {
err = ENOTCONN;
SDT_PROBE4(fusefs, , vfsops, mount_err,
"device is dead during mount", data, mp, err);
goto out;
}
/* Sanity + permission checks */
if (!data->daemoncred)
panic("fuse daemon found, but identity unknown");
if (mntopts & FSESS_DAEMON_CAN_SPY)
err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
/* are we allowed to do the first mount? */
err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
out:
FUSE_UNLOCK();
return err;
}
static int
fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags,
struct vnode **vpp)
{
struct fuse_fid *ffhp = (struct fuse_fid *)fhp;
struct fuse_vnode_data *fvdat;
struct vnode *nvp;
int error;
if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT))
return EOPNOTSUPP;
error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp);
if (error) {
*vpp = NULLVP;
return (error);
}
fvdat = VTOFUD(nvp);
if (fvdat->generation != ffhp->gen ) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
vnode_create_vobject(*vpp, 0, curthread);
return (0);
}
static int
fuse_vfsop_mount(struct mount *mp)
{
int err;
uint64_t mntopts, __mntopts;
uint32_t max_read;
int linux_errnos;
int daemon_timeout;
int fd;
struct cdev *fdev;
struct fuse_data *data = NULL;
struct thread *td;
struct file *fp, *fptmp;
char *fspec, *subtype;
struct vfsoptlist *opts;
subtype = NULL;
max_read = ~0;
linux_errnos = 0;
err = 0;
mntopts = 0;
__mntopts = 0;
td = curthread;
/* Get the new options passed to mount */
opts = mp->mnt_optnew;
if (!opts)
return EINVAL;
/* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */
if (!vfs_getopts(opts, "fspath", &err))
return err;
/*
* With the help of the underscored options, the mount program
* can inform us of the flags it sets by default.
*/
FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY);
FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN);
FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS);
FUSE_FLAGOPT(intr, FSESS_INTR);
(void)vfs_scanopt(opts, "max_read=", "%u", &max_read);
(void)vfs_scanopt(opts, "linux_errnos", "%d", &linux_errnos);
if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) {
if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT)
daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT;
else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT)
daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT;
} else {
daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT;
}
subtype = vfs_getopts(opts, "subtype=", &err);
SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts);
if (mp->mnt_flag & MNT_UPDATE) {
return fuse_vfs_remount(mp, td, mntopts, max_read,
daemon_timeout);
}
/* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
fspec = vfs_getopts(opts, "from", &err);
if (!fspec)
return err;
/* `fd' contains the filedescriptor for this session; REQUIRED */
if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
return EINVAL;
err = fuse_getdevice(fspec, td, &fdev);
if (err != 0)
return err;
err = fget(td, fd, &cap_read_rights, &fp);
if (err != 0) {
SDT_PROBE2(fusefs, , vfsops, trace, 1,
"invalid or not opened device");
goto out;
}
fptmp = td->td_fpop;
td->td_fpop = fp;
err = devfs_get_cdevpriv((void **)&data);
td->td_fpop = fptmp;
fdrop(fp, td);
FUSE_LOCK();
if (err != 0 || data == NULL) {
err = ENXIO;
SDT_PROBE4(fusefs, , vfsops, mount_err,
"invalid or not opened device", data, mp, err);
FUSE_UNLOCK();
goto out;
}
if (fdata_get_dead(data)) {
err = ENOTCONN;
SDT_PROBE4(fusefs, , vfsops, mount_err,
"device is dead during mount", data, mp, err);
FUSE_UNLOCK();
goto out;
}
/* Sanity + permission checks */
if (!data->daemoncred)
panic("fuse daemon found, but identity unknown");
if (mntopts & FSESS_DAEMON_CAN_SPY)
err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
/* are we allowed to do the first mount? */
err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
if (err) {
FUSE_UNLOCK();
goto out;
}
data->ref++;
data->mp = mp;
data->dataflags |= mntopts;
data->max_read = max_read;
data->daemon_timeout = daemon_timeout;
data->linux_errnos = linux_errnos;
data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK;
FUSE_UNLOCK();
vfs_getnewfsid(mp);
MNT_ILOCK(mp);
mp->mnt_data = data;
/*
* FUSE file systems can be either local or remote, but the kernel
* can't tell the difference.
*/
mp->mnt_flag &= ~MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_USES_BCACHE;
/*
* Disable nullfs caching because it can consume too many resources in
* the FUSE server.
*/
mp->mnt_kern_flag |= MNTK_NULL_NOCACHE;
MNT_IUNLOCK(mp);
/* We need this here as this slot is used by getnewvnode() */
mp->mnt_stat.f_iosize = maxbcachebuf;
if (subtype) {
strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN);
strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN);
}
memset(mp->mnt_stat.f_mntfromname, 0, MNAMELEN);
strlcpy(mp->mnt_stat.f_mntfromname, fspec, MNAMELEN);
- mp->mnt_iosize_max = MAXPHYS;
+ mp->mnt_iosize_max = maxphys;
/* Now handshaking with daemon */
fuse_internal_send_init(data, td);
out:
if (err) {
FUSE_LOCK();
if (data != NULL && data->mp == mp) {
/*
* Destroy device only if we acquired reference to
* it
*/
SDT_PROBE4(fusefs, , vfsops, mount_err,
"mount failed, destroy device", data, mp, err);
data->mp = NULL;
mp->mnt_data = NULL;
fdata_trydestroy(data);
}
FUSE_UNLOCK();
dev_rel(fdev);
}
return err;
}
static int
fuse_vfsop_unmount(struct mount *mp, int mntflags)
{
int err = 0;
int flags = 0;
struct cdev *fdev;
struct fuse_data *data;
struct fuse_dispatcher fdi;
struct thread *td = curthread;
if (mntflags & MNT_FORCE) {
flags |= FORCECLOSE;
}
data = fuse_get_mpdata(mp);
if (!data) {
panic("no private data for mount point?");
}
/* There is 1 extra root vnode reference (mp->mnt_data). */
FUSE_LOCK();
if (data->vroot != NULL) {
struct vnode *vroot = data->vroot;
data->vroot = NULL;
FUSE_UNLOCK();
vrele(vroot);
} else
FUSE_UNLOCK();
err = vflush(mp, 0, flags, td);
if (err) {
return err;
}
if (fdata_get_dead(data)) {
goto alreadydead;
}
if (fsess_isimpl(mp, FUSE_DESTROY)) {
fdisp_init(&fdi, 0);
fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
(void)fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
}
fdata_set_dead(data);
alreadydead:
FUSE_LOCK();
data->mp = NULL;
fdev = data->fdev;
fdata_trydestroy(data);
FUSE_UNLOCK();
MNT_ILOCK(mp);
mp->mnt_data = NULL;
MNT_IUNLOCK(mp);
dev_rel(fdev);
return 0;
}
SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export,
"struct mount*");
static int
fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
uint64_t nodeid = ino;
struct thread *td = curthread;
struct fuse_dispatcher fdi;
struct fuse_entry_out *feo;
struct fuse_vnode_data *fvdat;
const char dot[] = ".";
off_t filesize;
enum vtype vtyp;
int error;
if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
/*
* Unreachable unless you do something stupid, like export a
* nullfs mount of a fusefs file system.
*/
SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp);
return (EOPNOTSUPP);
}
error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp);
if (error || *vpp != NULL)
return error;
/* Do a LOOKUP, using nodeid as the parent and "." as filename */
fdisp_init(&fdi, sizeof(dot));
fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred);
memcpy(fdi.indata, dot, sizeof(dot));
error = fdisp_wait_answ(&fdi);
if (error)
return error;
feo = (struct fuse_entry_out *)fdi.answ;
if (feo->nodeid == 0) {
/* zero nodeid means ENOENT and cache it */
error = ENOENT;
goto out;
}
vtyp = IFTOVT(feo->attr.mode);
error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp);
if (error)
goto out;
filesize = feo->attr.size;
/*
* In the case where we are looking up a FUSE node represented by an
* existing cached vnode, and the true size reported by FUSE_LOOKUP
* doesn't match the vnode's cached size, then any cached writes beyond
* the file's current size are lost.
*
* We can get here:
* * following attribute cache expiration, or
* * due to a bug in the daemon.
*/
fvdat = VTOFUD(*vpp);
if (vnode_isreg(*vpp) &&
filesize != fvdat->cached_attrs.va_size &&
fvdat->flag & FN_SIZECHANGE) {
printf("%s: WB cache incoherent on %s!\n", __func__,
vnode_mount(*vpp)->mnt_stat.f_mntonname);
fvdat->flag &= ~FN_SIZECHANGE;
}
fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
feo->attr_valid_nsec, NULL);
fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec,
&fvdat->entry_cache_timeout);
out:
fdisp_destroy(&fdi);
return error;
}
static int
fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
int err = 0;
if (data->vroot != NULL) {
err = vget(data->vroot, lkflags);
if (err == 0)
*vpp = data->vroot;
} else {
err = fuse_vnode_get(mp, NULL, FUSE_ROOT_ID, NULL, vpp, NULL,
VDIR);
if (err == 0) {
FUSE_LOCK();
MPASS(data->vroot == NULL || data->vroot == *vpp);
if (data->vroot == NULL) {
SDT_PROBE2(fusefs, , vfsops, trace, 1,
"new root vnode");
data->vroot = *vpp;
FUSE_UNLOCK();
vref(*vpp);
} else if (data->vroot != *vpp) {
SDT_PROBE2(fusefs, , vfsops, trace, 1,
"root vnode race");
FUSE_UNLOCK();
VOP_UNLOCK(*vpp);
vrele(*vpp);
vrecycle(*vpp);
*vpp = data->vroot;
} else
FUSE_UNLOCK();
}
}
return err;
}
static int
fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp)
{
struct fuse_dispatcher fdi;
int err = 0;
struct fuse_statfs_out *fsfo;
struct fuse_data *data;
data = fuse_get_mpdata(mp);
if (!(data->dataflags & FSESS_INITED))
goto fake;
fdisp_init(&fdi, 0);
fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL);
err = fdisp_wait_answ(&fdi);
if (err) {
fdisp_destroy(&fdi);
if (err == ENOTCONN) {
/*
* We want to appear to be a legitimate fs even if the daemon
* is stone dead (so that, e.g., we can still do path-based
* unmounting after the daemon dies).
*/
goto fake;
}
return err;
}
fsfo = fdi.answ;
sbp->f_blocks = fsfo->st.blocks;
sbp->f_bfree = fsfo->st.bfree;
sbp->f_bavail = fsfo->st.bavail;
sbp->f_files = fsfo->st.files;
sbp->f_ffree = fsfo->st.ffree; /* cast from uint64_t to int64_t */
sbp->f_namemax = fsfo->st.namelen;
sbp->f_bsize = fsfo->st.frsize; /* cast from uint32_t to uint64_t */
fdisp_destroy(&fdi);
return 0;
fake:
sbp->f_blocks = 0;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_namemax = 0;
sbp->f_bsize = S_BLKSIZE;
return 0;
}
diff --git a/sys/fs/msdosfs/msdosfs_vfsops.c b/sys/fs/msdosfs/msdosfs_vfsops.c
index 86d5b920643f..187463cc7f4d 100644
--- a/sys/fs/msdosfs/msdosfs_vfsops.c
+++ b/sys/fs/msdosfs/msdosfs_vfsops.c
@@ -1,1012 +1,1012 @@
/* $FreeBSD$ */
/* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
* Copyright (C) 1994, 1995, 1997 TooLs GmbH.
* All rights reserved.
* Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Written by Paul Popelka (paulp@uts.amdahl.com)
*
* You can do anything you want with this software, just don't say you wrote
* it, and don't remove this notice.
*
* This software is provided "as is".
*
* The author supplies this software to be publicly redistributed on the
* understanding that the author is not responsible for the correct
* functioning of this software in any circumstances and is not liable for
* any damages caused by this software.
*
* October 1992
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/iconv.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
#include <fs/msdosfs/bootsect.h>
#include <fs/msdosfs/bpb.h>
#include <fs/msdosfs/direntry.h>
#include <fs/msdosfs/denode.h>
#include <fs/msdosfs/fat.h>
#include <fs/msdosfs/msdosfsmount.h>
#ifdef MSDOSFS_DEBUG
#include <sys/rwlock.h>
#endif
static const char msdosfs_lock_msg[] = "fatlk";
/* Mount options that we support. */
static const char *msdosfs_opts[] = {
"async", "noatime", "noclusterr", "noclusterw",
"export", "force", "from", "sync",
"cs_dos", "cs_local", "cs_win", "dirmask",
"gid", "kiconv", "longname",
"longnames", "mask", "shortname", "shortnames",
"uid", "win95", "nowin95",
NULL
};
#if 1 /*def PC98*/
/*
* XXX - The boot signature written by NEC PC-98 DOS looks like
* garbage or a random value :-{
* If you want to use such broken-signature media, define the
* following symbol even on PC/AT.
* (e.g. to mount a PC-98 DOS formatted floppy on a PC/AT)
*/
#define MSDOSFS_NOCHECKSIG
#endif
MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure");
static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table");
struct iconv_functions *msdosfs_iconv;
static int update_mp(struct mount *mp, struct thread *td);
static int mountmsdosfs(struct vnode *devvp, struct mount *mp);
static vfs_fhtovp_t msdosfs_fhtovp;
static vfs_mount_t msdosfs_mount;
static vfs_root_t msdosfs_root;
static vfs_statfs_t msdosfs_statfs;
static vfs_sync_t msdosfs_sync;
static vfs_unmount_t msdosfs_unmount;
/* Maximum length of a character set name (arbitrary). */
#define MAXCSLEN 64
static int
update_mp(struct mount *mp, struct thread *td)
{
struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
void *dos, *win, *local;
int error, v;
if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) {
if (msdosfs_iconv != NULL) {
error = vfs_getopt(mp->mnt_optnew,
"cs_win", &win, NULL);
if (!error)
error = vfs_getopt(mp->mnt_optnew,
"cs_local", &local, NULL);
if (!error)
error = vfs_getopt(mp->mnt_optnew,
"cs_dos", &dos, NULL);
if (!error) {
msdosfs_iconv->open(win, local, &pmp->pm_u2w);
msdosfs_iconv->open(local, win, &pmp->pm_w2u);
msdosfs_iconv->open(dos, local, &pmp->pm_u2d);
msdosfs_iconv->open(local, dos, &pmp->pm_d2u);
}
if (error != 0)
return (error);
} else {
pmp->pm_w2u = NULL;
pmp->pm_u2w = NULL;
pmp->pm_d2u = NULL;
pmp->pm_u2d = NULL;
}
}
if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1)
pmp->pm_gid = v;
if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1)
pmp->pm_uid = v;
if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1)
pmp->pm_mask = v & ALLPERMS;
if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1)
pmp->pm_dirmask = v & ALLPERMS;
vfs_flagopt(mp->mnt_optnew, "shortname",
&pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
vfs_flagopt(mp->mnt_optnew, "shortnames",
&pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
vfs_flagopt(mp->mnt_optnew, "longname",
&pmp->pm_flags, MSDOSFSMNT_LONGNAME);
vfs_flagopt(mp->mnt_optnew, "longnames",
&pmp->pm_flags, MSDOSFSMNT_LONGNAME);
vfs_flagopt(mp->mnt_optnew, "kiconv",
&pmp->pm_flags, MSDOSFSMNT_KICONV);
if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0)
pmp->pm_flags |= MSDOSFSMNT_NOWIN95;
else
pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95;
if (pmp->pm_flags & MSDOSFSMNT_NOWIN95)
pmp->pm_flags |= MSDOSFSMNT_SHORTNAME;
else
pmp->pm_flags |= MSDOSFSMNT_LONGNAME;
return 0;
}
static int
msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
struct msdosfs_args args;
int error;
if (data == NULL)
return (EINVAL);
error = copyin(data, &args, sizeof args);
if (error)
return (error);
ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
ma = mount_arg(ma, "export", &args.export, sizeof(args.export));
ma = mount_argf(ma, "uid", "%d", args.uid);
ma = mount_argf(ma, "gid", "%d", args.gid);
ma = mount_argf(ma, "mask", "%d", args.mask);
ma = mount_argf(ma, "dirmask", "%d", args.dirmask);
ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname");
ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname");
ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95");
ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv");
ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN);
ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN);
ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN);
error = kernel_mount(ma, flags);
return (error);
}
/*
* mp - path - addr in user space of mount point (i.e. /usr or whatever)
* data - addr in user space of mount params including the name of the block
* special file to treat as a filesystem.
*/
static int
msdosfs_mount(struct mount *mp)
{
struct vnode *devvp; /* vnode for blk device to mount */
struct thread *td;
/* msdosfs specific mount control block */
struct msdosfsmount *pmp = NULL;
struct nameidata ndp;
int error, flags;
accmode_t accmode;
char *from;
td = curthread;
if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts))
return (EINVAL);
/*
* If updating, check whether changing from read-only to
* read/write; if there is no device name, that's all we do.
*/
if (mp->mnt_flag & MNT_UPDATE) {
pmp = VFSTOMSDOSFS(mp);
if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) &&
vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
return (error);
error = vfs_write_suspend_umnt(mp);
if (error != 0)
return (error);
flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = vflush(mp, 0, flags, td);
if (error != 0) {
vfs_write_resume(mp, 0);
return (error);
}
/*
* Now the volume is clean. Mark it so while the
* device is still rw.
*/
error = markvoldirty(pmp, 0);
if (error != 0) {
vfs_write_resume(mp, 0);
(void)markvoldirty(pmp, 1);
return (error);
}
/* Downgrade the device from rw to ro. */
g_topology_lock();
error = g_access(pmp->pm_cp, 0, -1, 0);
g_topology_unlock();
if (error) {
vfs_write_resume(mp, 0);
(void)markvoldirty(pmp, 1);
return (error);
}
/*
* Backing out after an error was painful in the
* above. Now we are committed to succeeding.
*/
pmp->pm_fmod = 0;
pmp->pm_flags |= MSDOSFSMNT_RONLY;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
vfs_write_resume(mp, 0);
} else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) &&
!vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
/*
* If upgrade to read-write by non-root, then verify
* that user has necessary permissions on the device.
*/
devvp = pmp->pm_devvp;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(devvp, VREAD | VWRITE,
td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
VOP_UNLOCK(devvp);
return (error);
}
VOP_UNLOCK(devvp);
g_topology_lock();
error = g_access(pmp->pm_cp, 0, 1, 0);
g_topology_unlock();
if (error)
return (error);
/* Now that the volume is modifiable, mark it dirty. */
error = markvoldirty_upgrade(pmp, true, true);
if (error) {
/*
* If dirtying the superblock failed, drop GEOM
* 'w' refs (we're still RO).
*/
g_topology_lock();
(void)g_access(pmp->pm_cp, 0, -1, 0);
g_topology_unlock();
return (error);
}
pmp->pm_fmod = 1;
pmp->pm_flags &= ~MSDOSFSMNT_RONLY;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
}
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible disk device.
*/
if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL))
return (EINVAL);
NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
error = namei(&ndp);
if (error)
return (error);
devvp = ndp.ni_vp;
NDFREE(&ndp, NDF_ONLY_PNBUF);
if (!vn_isdisk_error(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
accmode = VREAD;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
accmode |= VWRITE;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((mp->mnt_flag & MNT_UPDATE) == 0) {
error = mountmsdosfs(devvp, mp);
#ifdef MSDOSFS_DEBUG /* only needed for the printf below */
pmp = VFSTOMSDOSFS(mp);
#endif
} else {
vput(devvp);
if (devvp != pmp->pm_devvp)
return (EINVAL); /* XXX needs translation */
}
if (error) {
vrele(devvp);
return (error);
}
error = update_mp(mp, td);
if (error) {
if ((mp->mnt_flag & MNT_UPDATE) == 0)
msdosfs_unmount(mp, MNT_FORCE);
return error;
}
vfs_mountedfrom(mp, from);
#ifdef MSDOSFS_DEBUG
printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap);
#endif
return (0);
}
static int
mountmsdosfs(struct vnode *devvp, struct mount *mp)
{
struct msdosfsmount *pmp;
struct buf *bp;
struct cdev *dev;
union bootsector *bsp;
struct byte_bpb33 *b33;
struct byte_bpb50 *b50;
struct byte_bpb710 *b710;
uint8_t SecPerClust;
u_long clusters;
int ronly, error;
struct g_consumer *cp;
struct bufobj *bo;
bp = NULL; /* This and pmp both used in error_exit. */
pmp = NULL;
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
dev = devvp->v_rdev;
if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
(uintptr_t)mp) == 0) {
VOP_UNLOCK(devvp);
return (EBUSY);
}
g_topology_lock();
error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1);
g_topology_unlock();
if (error != 0) {
atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
VOP_UNLOCK(devvp);
return (error);
}
dev_ref(dev);
bo = &devvp->v_bufobj;
VOP_UNLOCK(devvp);
if (dev->si_iosize_max != 0)
mp->mnt_iosize_max = dev->si_iosize_max;
- if (mp->mnt_iosize_max > MAXPHYS)
- mp->mnt_iosize_max = MAXPHYS;
+ if (mp->mnt_iosize_max > maxphys)
+ mp->mnt_iosize_max = maxphys;
/*
* Read the boot sector of the filesystem, and then check the
* boot signature. If not a dos boot sector then error out.
*
* NOTE: 8192 is a magic size that works for ffs.
*/
error = bread(devvp, 0, 8192, NOCRED, &bp);
if (error)
goto error_exit;
bp->b_flags |= B_AGE;
bsp = (union bootsector *)bp->b_data;
b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB;
b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB;
b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB;
#ifndef MSDOSFS_NOCHECKSIG
if (bsp->bs50.bsBootSectSig0 != BOOTSIG0
|| bsp->bs50.bsBootSectSig1 != BOOTSIG1) {
error = EINVAL;
goto error_exit;
}
#endif
pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO);
pmp->pm_mountp = mp;
pmp->pm_cp = cp;
pmp->pm_bo = bo;
lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0);
/*
* Initialize ownerships and permissions, since nothing else will
* initialize them iff we are mounting root.
*/
pmp->pm_uid = UID_ROOT;
pmp->pm_gid = GID_WHEEL;
pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH |
S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR;
/*
* Compute several useful quantities from the bpb in the
* bootsector. Copy in the dos 5 variant of the bpb then fix up
* the fields that are different between dos 5 and dos 3.3.
*/
SecPerClust = b50->bpbSecPerClust;
pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec);
if (pmp->pm_BytesPerSec < DEV_BSIZE) {
error = EINVAL;
goto error_exit;
}
pmp->pm_ResSectors = getushort(b50->bpbResSectors);
pmp->pm_FATs = b50->bpbFATs;
pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts);
pmp->pm_Sectors = getushort(b50->bpbSectors);
pmp->pm_FATsecs = getushort(b50->bpbFATsecs);
pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack);
pmp->pm_Heads = getushort(b50->bpbHeads);
pmp->pm_Media = b50->bpbMedia;
/* calculate the ratio of sector size to DEV_BSIZE */
pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE;
/*
* We don't check pm_Heads nor pm_SecPerTrack, because
* these may not be set for EFI file systems. We don't
* use these anyway, so we're unaffected if they are
* invalid.
*/
if (!pmp->pm_BytesPerSec || !SecPerClust) {
error = EINVAL;
goto error_exit;
}
if (pmp->pm_Sectors == 0) {
pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs);
pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors);
} else {
pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs);
pmp->pm_HugeSectors = pmp->pm_Sectors;
}
if (pmp->pm_RootDirEnts == 0) {
if (pmp->pm_FATsecs
|| getushort(b710->bpbFSVers)) {
error = EINVAL;
#ifdef MSDOSFS_DEBUG
printf("mountmsdosfs(): bad FAT32 filesystem\n");
#endif
goto error_exit;
}
pmp->pm_fatmask = FAT32_MASK;
pmp->pm_fatmult = 4;
pmp->pm_fatdiv = 1;
pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs);
if (getushort(b710->bpbExtFlags) & FATMIRROR)
pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM;
else
pmp->pm_flags |= MSDOSFS_FATMIRROR;
} else
pmp->pm_flags |= MSDOSFS_FATMIRROR;
/*
* Check a few values (could do some more):
* - logical sector size: power of 2, >= block size
* - sectors per cluster: power of 2, >= 1
* - number of sectors: >= 1, <= size of partition
* - number of FAT sectors: >= 1
*/
if ( (SecPerClust == 0)
|| (SecPerClust & (SecPerClust - 1))
|| (pmp->pm_BytesPerSec < DEV_BSIZE)
|| (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1))
|| (pmp->pm_HugeSectors == 0)
|| (pmp->pm_FATsecs == 0)
|| (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE)
) {
error = EINVAL;
goto error_exit;
}
pmp->pm_HugeSectors *= pmp->pm_BlkPerSec;
pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */
pmp->pm_FATsecs *= pmp->pm_BlkPerSec;
SecPerClust *= pmp->pm_BlkPerSec;
pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec;
if (FAT32(pmp)) {
pmp->pm_rootdirblk = getulong(b710->bpbRootClust);
pmp->pm_firstcluster = pmp->pm_fatblk
+ (pmp->pm_FATs * pmp->pm_FATsecs);
pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec;
} else {
pmp->pm_rootdirblk = pmp->pm_fatblk +
(pmp->pm_FATs * pmp->pm_FATsecs);
pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts *
sizeof(struct direntry), DEV_BSIZE); /* in blocks */
pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize;
}
pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) /
SecPerClust + 1;
pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */
if (pmp->pm_fatmask == 0) {
if (pmp->pm_maxcluster
<= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) {
/*
* This will usually be a floppy disk. This size makes
* sure that one FAT entry will not be split across
* multiple blocks.
*/
pmp->pm_fatmask = FAT12_MASK;
pmp->pm_fatmult = 3;
pmp->pm_fatdiv = 2;
} else {
pmp->pm_fatmask = FAT16_MASK;
pmp->pm_fatmult = 2;
pmp->pm_fatdiv = 1;
}
}
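/*
 * As a quick sanity check on the FAT12 numbers above: with
 * pm_fatmult = 3 and pm_fatdiv = 2, a FAT12 entry occupies 3/2 = 1.5
 * bytes, so the byte offset of cluster cn within the FAT is
 * cn * 3 / 2.  A FAT12 block of 3 * 512 = 1536 bytes (set below as
 * pm_fatblocksize) therefore holds exactly 1024 whole entries, which
 * is why no entry is ever split across a block boundary.
 */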
clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv;
if (pmp->pm_maxcluster >= clusters) {
#ifdef MSDOSFS_DEBUG
printf("Warning: number of clusters (%ld) exceeds FAT "
"capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters);
#endif
pmp->pm_maxcluster = clusters - 1;
}
if (FAT12(pmp))
pmp->pm_fatblocksize = 3 * 512;
else
pmp->pm_fatblocksize = PAGE_SIZE;
pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize,
pmp->pm_BytesPerSec);
pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE;
pmp->pm_bnshift = ffs(DEV_BSIZE) - 1;
/*
* Compute mask and shift value for isolating cluster relative byte
* offsets and cluster numbers from a file offset.
*/
pmp->pm_bpcluster = SecPerClust * DEV_BSIZE;
pmp->pm_crbomask = pmp->pm_bpcluster - 1;
pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1;
/*
* Check for valid cluster size
* must be a power of 2
*/
if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) {
error = EINVAL;
goto error_exit;
}
/*
* Release the bootsector buffer.
*/
brelse(bp);
bp = NULL;
/*
* Check the fsinfo sector if we have one. Silently fix up our
* in-core copy of fp->fsinxtfree if it is unknown (0xffffffff)
* or too large. Ignore fp->fsinfree for now, since we need to
* read the entire FAT anyway to fill the inuse map.
*/
if (pmp->pm_fsinfo) {
struct fsinfo *fp;
if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
NOCRED, &bp)) != 0)
goto error_exit;
fp = (struct fsinfo *)bp->b_data;
if (!bcmp(fp->fsisig1, "RRaA", 4)
&& !bcmp(fp->fsisig2, "rrAa", 4)
&& !bcmp(fp->fsisig3, "\0\0\125\252", 4)) {
pmp->pm_nxtfree = getulong(fp->fsinxtfree);
if (pmp->pm_nxtfree > pmp->pm_maxcluster)
pmp->pm_nxtfree = CLUST_FIRST;
} else
pmp->pm_fsinfo = 0;
brelse(bp);
bp = NULL;
}
/*
* Finish initializing pmp->pm_nxtfree (just in case the first few
* sectors aren't properly reserved in the FAT). This completes
* the fixup for fp->fsinxtfree, and fixes up the zero-initialized
* value if there is no fsinfo. We will use pmp->pm_nxtfree
* internally even if there is no fsinfo.
*/
if (pmp->pm_nxtfree < CLUST_FIRST)
pmp->pm_nxtfree = CLUST_FIRST;
/*
* Allocate memory for the bitmap of allocated clusters, and then
* fill it in.
*/
pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS)
* sizeof(*pmp->pm_inusemap),
M_MSDOSFSFAT, M_WAITOK);
/*
* fillinusemap() needs pm_devvp.
*/
pmp->pm_devvp = devvp;
pmp->pm_dev = dev;
/*
* Have the inuse map filled in.
*/
MSDOSFS_LOCK_MP(pmp);
error = fillinusemap(pmp);
MSDOSFS_UNLOCK_MP(pmp);
if (error != 0)
goto error_exit;
/*
* If they want FAT updates to be synchronous then let them suffer
* the performance degradation in exchange for the on disk copy of
* the FAT being correct just about all the time. I suppose this
* would be a good thing to turn on if the kernel is still flaky.
*/
if (mp->mnt_flag & MNT_SYNCHRONOUS)
pmp->pm_flags |= MSDOSFSMNT_WAITONFAT;
/*
* Finish up.
*/
if (ronly)
pmp->pm_flags |= MSDOSFSMNT_RONLY;
else {
if ((error = markvoldirty(pmp, 1)) != 0)
goto error_exit;
pmp->pm_fmod = 1;
}
mp->mnt_data = pmp;
mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF;
MNT_IUNLOCK(mp);
return (0);
error_exit:
if (bp)
brelse(bp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (pmp) {
lockdestroy(&pmp->pm_fatlock);
free(pmp->pm_inusemap, M_MSDOSFSFAT);
free(pmp, M_MSDOSFSMNT);
mp->mnt_data = NULL;
}
atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
dev_rel(dev);
return (error);
}
/*
* Unmount the filesystem described by mp.
*/
static int
msdosfs_unmount(struct mount *mp, int mntflags)
{
struct msdosfsmount *pmp;
int error, flags;
bool susp;
error = flags = 0;
pmp = VFSTOMSDOSFS(mp);
susp = (pmp->pm_flags & MSDOSFSMNT_RONLY) == 0;
if (susp) {
error = vfs_write_suspend_umnt(mp);
if (error != 0)
return (error);
}
if ((mntflags & MNT_FORCE) != 0)
flags |= FORCECLOSE;
error = vflush(mp, 0, flags, curthread);
if (error != 0 && error != ENXIO) {
if (susp)
vfs_write_resume(mp, VR_START_WRITE);
return (error);
}
if (susp) {
error = markvoldirty(pmp, 0);
if (error != 0 && error != ENXIO) {
if (susp)
vfs_write_resume(mp, VR_START_WRITE);
(void)markvoldirty(pmp, 1);
return (error);
}
}
if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
if (pmp->pm_w2u)
msdosfs_iconv->close(pmp->pm_w2u);
if (pmp->pm_u2w)
msdosfs_iconv->close(pmp->pm_u2w);
if (pmp->pm_d2u)
msdosfs_iconv->close(pmp->pm_d2u);
if (pmp->pm_u2d)
msdosfs_iconv->close(pmp->pm_u2d);
}
#ifdef MSDOSFS_DEBUG
{
struct vnode *vp = pmp->pm_devvp;
struct bufobj *bo;
bo = &vp->v_bufobj;
BO_LOCK(bo);
VI_LOCK(vp);
vn_printf(vp,
"msdosfs_umount(): just before calling VOP_CLOSE()\n");
printf("freef %p, freeb %p, mount %p\n",
TAILQ_NEXT(vp, v_vnodelist), vp->v_vnodelist.tqe_prev,
vp->v_mount);
printf("cleanblkhd %p, dirtyblkhd %p, numoutput %d, type %d\n",
TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd),
TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd),
vp->v_bufobj.bo_numoutput, vp->v_type);
VI_UNLOCK(vp);
BO_UNLOCK(bo);
}
#endif
if (susp)
vfs_write_resume(mp, VR_START_WRITE);
g_topology_lock();
g_vfs_close(pmp->pm_cp);
g_topology_unlock();
atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0);
vrele(pmp->pm_devvp);
dev_rel(pmp->pm_dev);
free(pmp->pm_inusemap, M_MSDOSFSFAT);
lockdestroy(&pmp->pm_fatlock);
free(pmp, M_MSDOSFSMNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (error);
}
static int
msdosfs_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
struct denode *ndep;
int error;
#ifdef MSDOSFS_DEBUG
printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp);
#endif
error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep);
if (error)
return (error);
*vpp = DETOV(ndep);
return (0);
}
static int
msdosfs_statfs(struct mount *mp, struct statfs *sbp)
{
struct msdosfsmount *pmp;
pmp = VFSTOMSDOSFS(mp);
sbp->f_bsize = pmp->pm_bpcluster;
sbp->f_iosize = pmp->pm_bpcluster;
sbp->f_blocks = pmp->pm_maxcluster + 1;
sbp->f_bfree = pmp->pm_freeclustercount;
sbp->f_bavail = pmp->pm_freeclustercount;
sbp->f_files = pmp->pm_RootDirEnts; /* XXX */
sbp->f_ffree = 0; /* what to put in here? */
return (0);
}
/*
* If we have an FSInfo block, update it.
*/
static int
msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor)
{
struct fsinfo *fp;
struct buf *bp;
int error;
MSDOSFS_LOCK_MP(pmp);
if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) {
error = 0;
goto unlock;
}
error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
NOCRED, &bp);
if (error != 0) {
goto unlock;
}
fp = (struct fsinfo *)bp->b_data;
putulong(fp->fsinfree, pmp->pm_freeclustercount);
putulong(fp->fsinxtfree, pmp->pm_nxtfree);
pmp->pm_flags &= ~MSDOSFS_FSIMOD;
if (waitfor == MNT_WAIT)
error = bwrite(bp);
else
bawrite(bp);
unlock:
MSDOSFS_UNLOCK_MP(pmp);
return (error);
}
static int
msdosfs_sync(struct mount *mp, int waitfor)
{
struct vnode *vp, *nvp;
struct thread *td;
struct denode *dep;
struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
int error, allerror = 0;
td = curthread;
/*
* If we ever switch to not updating all of the FATs all the time,
* this would be the place to update them from the first one.
*/
if (pmp->pm_fmod != 0) {
if (pmp->pm_flags & MSDOSFSMNT_RONLY)
panic("msdosfs_sync: rofs mod");
else {
/* update FATs here */
}
}
/*
* Write back each (modified) denode.
*/
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, nvp) {
if (vp->v_type == VNON) {
VI_UNLOCK(vp);
continue;
}
dep = VTODE(vp);
if ((dep->de_flag &
(DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 &&
(vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
waitfor == MNT_LAZY)) {
VI_UNLOCK(vp);
continue;
}
error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK);
if (error) {
if (error == ENOENT) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, nvp);
goto loop;
}
continue;
}
error = VOP_FSYNC(vp, waitfor, td);
if (error)
allerror = error;
VOP_UNLOCK(vp);
vrele(vp);
}
/*
* Flush filesystem control info.
*/
if (waitfor != MNT_LAZY) {
vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(pmp->pm_devvp, waitfor, td);
if (error)
allerror = error;
VOP_UNLOCK(pmp->pm_devvp);
}
error = msdosfs_fsiflush(pmp, waitfor);
if (error != 0)
allerror = error;
if (allerror == 0 && waitfor == MNT_SUSPEND) {
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
MNT_IUNLOCK(mp);
}
return (allerror);
}
static int
msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
{
struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
struct defid *defhp = (struct defid *) fhp;
struct denode *dep;
int error;
error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep);
if (error) {
*vpp = NULLVP;
return (error);
}
*vpp = DETOV(dep);
vnode_create_vobject(*vpp, dep->de_FileSize, curthread);
return (0);
}
static struct vfsops msdosfs_vfsops = {
.vfs_fhtovp = msdosfs_fhtovp,
.vfs_mount = msdosfs_mount,
.vfs_cmount = msdosfs_cmount,
.vfs_root = msdosfs_root,
.vfs_statfs = msdosfs_statfs,
.vfs_sync = msdosfs_sync,
.vfs_unmount = msdosfs_unmount,
};
VFS_SET(msdosfs_vfsops, msdosfs, 0);
MODULE_VERSION(msdosfs, 1);
diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c
index 1cd6a8d16091..132f4e7703d7 100644
--- a/sys/fs/udf/udf_vfsops.c
+++ b/sys/fs/udf/udf_vfsops.c
@@ -1,838 +1,838 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2001, 2002 Scott Long <scottl@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/* udf_vfsops.c */
/* Implement the VFS side of things */
/*
* Ok, here's how it goes. The UDF specs are pretty clear on how each data
* structure is made up, but not very clear on how they relate to each other.
* Here is the skinny... This demonstrates a filesystem with one file in the
* root directory. Subdirectories are treated just as normal files, but they
* have File Id Descriptors of their children as their file data. As for the
* Anchor Volume Descriptor Pointer, it can exist in two of the following three
* places: sector 256, sector n (the max sector of the disk), or sector
* n - 256. It's a pretty good bet that one will exist at sector 256 though.
* One caveat is unclosed CD media. For that, sector 256 cannot be written,
* so the Anchor Volume Descriptor Pointer can exist at sector 512 until the
* media is closed.
*
* Sector:
* 256:
* n: Anchor Volume Descriptor Pointer
* n - 256: |
* |
* |-->Main Volume Descriptor Sequence
* | |
* | |
* | |-->Logical Volume Descriptor
* | |
* |-->Partition Descriptor |
* | |
* | |
* |-->Fileset Descriptor
* |
* |
* |-->Root Dir File Entry
* |
* |
* |-->File data:
* File Id Descriptor
* |
* |
* |-->File Entry
* |
* |
* |-->File data
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/iconv.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/endian.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
#include <vm/uma.h>
#include <fs/udf/ecma167-udf.h>
#include <fs/udf/osta.h>
#include <fs/udf/udf.h>
#include <fs/udf/udf_mount.h>
static MALLOC_DEFINE(M_UDFMOUNT, "udf_mount", "UDF mount structure");
MALLOC_DEFINE(M_UDFFENTRY, "udf_fentry", "UDF file entry structure");
struct iconv_functions *udf_iconv = NULL;
/* Zones */
uma_zone_t udf_zone_trans = NULL;
uma_zone_t udf_zone_node = NULL;
uma_zone_t udf_zone_ds = NULL;
static vfs_init_t udf_init;
static vfs_uninit_t udf_uninit;
static vfs_mount_t udf_mount;
static vfs_root_t udf_root;
static vfs_statfs_t udf_statfs;
static vfs_unmount_t udf_unmount;
static vfs_fhtovp_t udf_fhtovp;
static int udf_find_partmaps(struct udf_mnt *, struct logvol_desc *);
static struct vfsops udf_vfsops = {
.vfs_fhtovp = udf_fhtovp,
.vfs_init = udf_init,
.vfs_mount = udf_mount,
.vfs_root = udf_root,
.vfs_statfs = udf_statfs,
.vfs_uninit = udf_uninit,
.vfs_unmount = udf_unmount,
.vfs_vget = udf_vget,
};
VFS_SET(udf_vfsops, udf, VFCF_READONLY);
MODULE_VERSION(udf, 1);
static int udf_mountfs(struct vnode *, struct mount *);
static int
udf_init(struct vfsconf *foo)
{
/*
* This code used to pre-allocate a certain number of pages for each
* pool, reducing the need to grow the zones later on. UMA doesn't
* advertise any such functionality, unfortunately =-<
*/
udf_zone_trans = uma_zcreate("UDF translation buffer, zone", MAXNAMLEN *
sizeof(unicode_t), NULL, NULL, NULL, NULL, 0, 0);
udf_zone_node = uma_zcreate("UDF Node zone", sizeof(struct udf_node),
NULL, NULL, NULL, NULL, 0, 0);
udf_zone_ds = uma_zcreate("UDF Dirstream zone",
sizeof(struct udf_dirstream), NULL, NULL, NULL, NULL, 0, 0);
if ((udf_zone_node == NULL) || (udf_zone_trans == NULL) ||
(udf_zone_ds == NULL)) {
printf("Cannot create allocation zones.\n");
return (ENOMEM);
}
return 0;
}
static int
udf_uninit(struct vfsconf *foo)
{
if (udf_zone_trans != NULL) {
uma_zdestroy(udf_zone_trans);
udf_zone_trans = NULL;
}
if (udf_zone_node != NULL) {
uma_zdestroy(udf_zone_node);
udf_zone_node = NULL;
}
if (udf_zone_ds != NULL) {
uma_zdestroy(udf_zone_ds);
udf_zone_ds = NULL;
}
return (0);
}
static int
udf_mount(struct mount *mp)
{
struct vnode *devvp; /* vnode of the mount device */
struct thread *td;
struct udf_mnt *imp = NULL;
struct vfsoptlist *opts;
char *fspec, *cs_disk, *cs_local;
int error, len, *udf_flags;
struct nameidata nd, *ndp = &nd;
td = curthread;
opts = mp->mnt_optnew;
/*
* Unconditionally mount as read-only.
*/
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
/*
* No root filesystem support. Probably not a big deal, since the
* bootloader doesn't understand UDF.
*/
if (mp->mnt_flag & MNT_ROOTFS)
return (ENOTSUP);
fspec = NULL;
error = vfs_getopt(opts, "from", (void **)&fspec, &len);
if (!error && fspec[len - 1] != '\0')
return (EINVAL);
if (mp->mnt_flag & MNT_UPDATE) {
return (0);
}
/* Check that the mount device exists */
if (fspec == NULL)
return (EINVAL);
NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
if ((error = namei(ndp)))
return (error);
NDFREE(ndp, NDF_ONLY_PNBUF);
devvp = ndp->ni_vp;
if (!vn_isdisk_error(devvp, &error)) {
vput(devvp);
return (error);
}
/* Check the access rights on the mount device */
error = VOP_ACCESS(devvp, VREAD, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((error = udf_mountfs(devvp, mp))) {
vrele(devvp);
return (error);
}
imp = VFSTOUDFFS(mp);
udf_flags = NULL;
error = vfs_getopt(opts, "flags", (void **)&udf_flags, &len);
if (error || len != sizeof(int))
return (EINVAL);
imp->im_flags = *udf_flags;
if (imp->im_flags & UDFMNT_KICONV && udf_iconv) {
cs_disk = NULL;
error = vfs_getopt(opts, "cs_disk", (void **)&cs_disk, &len);
if (!error && cs_disk[len - 1] != '\0')
return (EINVAL);
cs_local = NULL;
error = vfs_getopt(opts, "cs_local", (void **)&cs_local, &len);
if (!error && cs_local[len - 1] != '\0')
return (EINVAL);
udf_iconv->open(cs_local, cs_disk, &imp->im_d2l);
#if 0
udf_iconv->open(cs_disk, cs_local, &imp->im_l2d);
#endif
}
vfs_mountedfrom(mp, fspec);
return 0;
};
/*
* Check the descriptor tag for both the correct id and correct checksum.
* Return zero if all is good, EINVAL if not.
*/
int
udf_checktag(struct desc_tag *tag, uint16_t id)
{
uint8_t *itag;
uint8_t i, cksum = 0;
itag = (uint8_t *)tag;
if (le16toh(tag->id) != id)
return (EINVAL);
for (i = 0; i < 16; i++)
cksum = cksum + itag[i];
cksum = cksum - itag[4];
if (cksum == tag->cksum)
return (0);
return (EINVAL);
}
static int
udf_mountfs(struct vnode *devvp, struct mount *mp)
{
struct buf *bp = NULL;
struct cdev *dev;
struct anchor_vdp avdp;
struct udf_mnt *udfmp = NULL;
struct part_desc *pd;
struct logvol_desc *lvd;
struct fileset_desc *fsd;
struct file_entry *root_fentry;
uint32_t sector, size, mvds_start, mvds_end;
uint32_t logical_secsize;
uint32_t fsd_offset = 0;
uint16_t part_num = 0, fsd_part = 0;
int error = EINVAL;
int logvol_found = 0, part_found = 0, fsd_found = 0;
int bsize;
struct g_consumer *cp;
struct bufobj *bo;
dev = devvp->v_rdev;
dev_ref(dev);
g_topology_lock();
error = g_vfs_open(devvp, &cp, "udf", 0);
g_topology_unlock();
VOP_UNLOCK(devvp);
if (error)
goto bail;
bo = &devvp->v_bufobj;
if (devvp->v_rdev->si_iosize_max != 0)
mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
- if (mp->mnt_iosize_max > MAXPHYS)
- mp->mnt_iosize_max = MAXPHYS;
+ if (mp->mnt_iosize_max > maxphys)
+ mp->mnt_iosize_max = maxphys;
/* XXX: should be M_WAITOK */
udfmp = malloc(sizeof(struct udf_mnt), M_UDFMOUNT,
M_NOWAIT | M_ZERO);
if (udfmp == NULL) {
printf("Cannot allocate UDF mount struct\n");
error = ENOMEM;
goto bail;
}
mp->mnt_data = udfmp;
mp->mnt_stat.f_fsid.val[0] = dev2udev(devvp->v_rdev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED;
MNT_IUNLOCK(mp);
udfmp->im_mountp = mp;
udfmp->im_dev = dev;
udfmp->im_devvp = devvp;
udfmp->im_d2l = NULL;
udfmp->im_cp = cp;
udfmp->im_bo = bo;
#if 0
udfmp->im_l2d = NULL;
#endif
/*
* The UDF specification defines a logical sectorsize of 2048
* for DVD media.
*/
logical_secsize = 2048;
if (((logical_secsize % cp->provider->sectorsize) != 0) ||
(logical_secsize < cp->provider->sectorsize)) {
error = EINVAL;
goto bail;
}
bsize = cp->provider->sectorsize;
/*
* Get the Anchor Volume Descriptor Pointer from sector 256.
* XXX Should also check sector n - 256, n, and 512.
*/
sector = 256;
if ((error = bread(devvp, sector * btodb(logical_secsize), bsize,
NOCRED, &bp)) != 0)
goto bail;
if ((error = udf_checktag((struct desc_tag *)bp->b_data, TAGID_ANCHOR)))
goto bail;
bcopy(bp->b_data, &avdp, sizeof(struct anchor_vdp));
brelse(bp);
bp = NULL;
/*
* Extract the Partition Descriptor and Logical Volume Descriptor
* from the Volume Descriptor Sequence.
* XXX Should we care about the partition type right now?
* XXX What about multiple partitions?
*/
mvds_start = le32toh(avdp.main_vds_ex.loc);
mvds_end = mvds_start + (le32toh(avdp.main_vds_ex.len) - 1) / bsize;
for (sector = mvds_start; sector < mvds_end; sector++) {
if ((error = bread(devvp, sector * btodb(logical_secsize),
bsize, NOCRED, &bp)) != 0) {
printf("Can't read sector %d of VDS\n", sector);
goto bail;
}
lvd = (struct logvol_desc *)bp->b_data;
if (!udf_checktag(&lvd->tag, TAGID_LOGVOL)) {
udfmp->bsize = le32toh(lvd->lb_size);
udfmp->bmask = udfmp->bsize - 1;
udfmp->bshift = ffs(udfmp->bsize) - 1;
fsd_part = le16toh(lvd->_lvd_use.fsd_loc.loc.part_num);
fsd_offset = le32toh(lvd->_lvd_use.fsd_loc.loc.lb_num);
if (udf_find_partmaps(udfmp, lvd))
break;
logvol_found = 1;
}
pd = (struct part_desc *)bp->b_data;
if (!udf_checktag(&pd->tag, TAGID_PARTITION)) {
part_found = 1;
part_num = le16toh(pd->part_num);
udfmp->part_len = le32toh(pd->part_len);
udfmp->part_start = le32toh(pd->start_loc);
}
brelse(bp);
bp = NULL;
if ((part_found) && (logvol_found))
break;
}
if (!part_found || !logvol_found) {
error = EINVAL;
goto bail;
}
if (fsd_part != part_num) {
printf("FSD does not lie within the partition!\n");
error = EINVAL;
goto bail;
}
/*
* Grab the Fileset Descriptor
* Thanks to Chuck McCrobie <mccrobie@cablespeed.com> for pointing
* me in the right direction here.
*/
sector = udfmp->part_start + fsd_offset;
if ((error = RDSECTOR(devvp, sector, udfmp->bsize, &bp)) != 0) {
printf("Cannot read sector %d of FSD\n", sector);
goto bail;
}
fsd = (struct fileset_desc *)bp->b_data;
if (!udf_checktag(&fsd->tag, TAGID_FSD)) {
fsd_found = 1;
bcopy(&fsd->rootdir_icb, &udfmp->root_icb,
sizeof(struct long_ad));
}
brelse(bp);
bp = NULL;
if (!fsd_found) {
printf("Couldn't find the fsd\n");
error = EINVAL;
goto bail;
}
/*
* Find the file entry for the root directory.
*/
sector = le32toh(udfmp->root_icb.loc.lb_num) + udfmp->part_start;
size = le32toh(udfmp->root_icb.len);
if ((error = udf_readdevblks(udfmp, sector, size, &bp)) != 0) {
printf("Cannot read sector %d\n", sector);
goto bail;
}
root_fentry = (struct file_entry *)bp->b_data;
if ((error = udf_checktag(&root_fentry->tag, TAGID_FENTRY))) {
printf("Invalid root file entry!\n");
goto bail;
}
brelse(bp);
bp = NULL;
return 0;
bail:
if (udfmp != NULL)
free(udfmp, M_UDFMOUNT);
if (bp != NULL)
brelse(bp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
dev_rel(dev);
return error;
};
static int
udf_unmount(struct mount *mp, int mntflags)
{
struct udf_mnt *udfmp;
int error, flags = 0;
udfmp = VFSTOUDFFS(mp);
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags, curthread)))
return (error);
if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) {
if (udfmp->im_d2l)
udf_iconv->close(udfmp->im_d2l);
#if 0
if (udfmp->im_l2d)
udf_iconv->close(udfmp->im_l2d);
#endif
}
g_topology_lock();
g_vfs_close(udfmp->im_cp);
g_topology_unlock();
vrele(udfmp->im_devvp);
dev_rel(udfmp->im_dev);
if (udfmp->s_table != NULL)
free(udfmp->s_table, M_UDFMOUNT);
free(udfmp, M_UDFMOUNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (0);
}
static int
udf_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct udf_mnt *udfmp;
ino_t id;
udfmp = VFSTOUDFFS(mp);
id = udf_getid(&udfmp->root_icb);
return (udf_vget(mp, id, flags, vpp));
}
static int
udf_statfs(struct mount *mp, struct statfs *sbp)
{
struct udf_mnt *udfmp;
udfmp = VFSTOUDFFS(mp);
sbp->f_bsize = udfmp->bsize;
sbp->f_iosize = udfmp->bsize;
sbp->f_blocks = udfmp->part_len;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
return 0;
}
int
udf_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
struct buf *bp;
struct vnode *devvp;
struct udf_mnt *udfmp;
struct thread *td;
struct vnode *vp;
struct udf_node *unode;
struct file_entry *fe;
uint32_t lea, lad;
int error, sector, size;
error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
if (error || *vpp != NULL)
return (error);
/*
* We must promote to an exclusive lock for vnode creation. This
* can happen if lookup is passed LOCKSHARED.
*/
if ((flags & LK_TYPE_MASK) == LK_SHARED) {
flags &= ~LK_TYPE_MASK;
flags |= LK_EXCLUSIVE;
}
/*
* We do not lock vnode creation as it is believed to be too
* expensive for such a rare case as simultaneous creation of a vnode
* for the same ino by different processes. We just allow them to race
* and check later to decide who wins. Let the race begin!
*/
td = curthread;
udfmp = VFSTOUDFFS(mp);
unode = uma_zalloc(udf_zone_node, M_WAITOK | M_ZERO);
if ((error = udf_allocv(mp, &vp, td))) {
printf("Error from udf_allocv\n");
uma_zfree(udf_zone_node, unode);
return (error);
}
unode->i_vnode = vp;
unode->hash_id = ino;
unode->udfmp = udfmp;
vp->v_data = unode;
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
error = insmntque(vp, mp);
if (error != 0) {
uma_zfree(udf_zone_node, unode);
return (error);
}
error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
if (error || *vpp != NULL)
return (error);
/*
* Copy in the file entry. Per the spec, the size can only be 1 block.
*/
sector = ino + udfmp->part_start;
devvp = udfmp->im_devvp;
if ((error = RDSECTOR(devvp, sector, udfmp->bsize, &bp)) != 0) {
printf("Cannot read sector %d\n", sector);
goto error;
}
/*
* File entry length validation.
*/
fe = (struct file_entry *)bp->b_data;
if (udf_checktag(&fe->tag, TAGID_FENTRY)) {
printf("Invalid file entry!\n");
error = ENOMEM;
goto error;
}
lea = le32toh(fe->l_ea);
lad = le32toh(fe->l_ad);
if (lea > udfmp->bsize || lad > udfmp->bsize) {
printf("Invalid EA and AD lengths %u, %u\n", lea, lad);
error = EIO;
goto error;
}
size = UDF_FENTRY_SIZE + lea + lad;
if (size > udfmp->bsize) {
printf("Invalid file entry size %u\n", size);
error = EIO;
goto error;
}
unode->fentry = malloc(size, M_UDFFENTRY, M_NOWAIT | M_ZERO);
if (unode->fentry == NULL) {
printf("Cannot allocate file entry block\n");
error = ENOMEM;
goto error;
}
bcopy(bp->b_data, unode->fentry, size);
brelse(bp);
bp = NULL;
switch (unode->fentry->icbtag.file_type) {
default:
vp->v_type = VBAD;
break;
case 4:
vp->v_type = VDIR;
break;
case 5:
vp->v_type = VREG;
break;
case 6:
vp->v_type = VBLK;
break;
case 7:
vp->v_type = VCHR;
break;
case 9:
vp->v_type = VFIFO;
vp->v_op = &udf_fifoops;
break;
case 10:
vp->v_type = VSOCK;
break;
case 12:
vp->v_type = VLNK;
break;
}
if (vp->v_type != VFIFO)
VN_LOCK_ASHARE(vp);
if (ino == udf_getid(&udfmp->root_icb))
vp->v_vflag |= VV_ROOT;
*vpp = vp;
return (0);
error:
vgone(vp);
vput(vp);
brelse(bp);
*vpp = NULL;
return (error);
}
static int
udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
{
struct ifid *ifhp;
struct vnode *nvp;
struct udf_node *np;
off_t fsize;
int error;
ifhp = (struct ifid *)fhp;
if ((error = VFS_VGET(mp, ifhp->ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
*vpp = NULLVP;
return (error);
}
np = VTON(nvp);
fsize = le64toh(np->fentry->inf_len);
*vpp = nvp;
vnode_create_vobject(*vpp, fsize, curthread);
return (0);
}
static int
udf_find_partmaps(struct udf_mnt *udfmp, struct logvol_desc *lvd)
{
struct part_map_spare *pms;
struct regid *pmap_id;
struct buf *bp;
unsigned char regid_id[UDF_REGID_ID_SIZE + 1];
int i, k, ptype, psize, error;
uint8_t *pmap = (uint8_t *) &lvd->maps[0];
for (i = 0; i < le32toh(lvd->n_pm); i++) {
ptype = pmap[0];
psize = pmap[1];
if (((ptype != 1) && (ptype != 2)) ||
((psize != UDF_PMAP_TYPE1_SIZE) &&
(psize != UDF_PMAP_TYPE2_SIZE))) {
printf("Invalid partition map found\n");
return (1);
}
if (ptype == 1) {
/* Type 1 map. We don't care */
pmap += UDF_PMAP_TYPE1_SIZE;
continue;
}
/* Type 2 map. Gotta find out the details */
pmap_id = (struct regid *)&pmap[4];
bzero(&regid_id[0], UDF_REGID_ID_SIZE);
bcopy(&pmap_id->id[0], &regid_id[0], UDF_REGID_ID_SIZE);
if (bcmp(&regid_id[0], "*UDF Sparable Partition",
UDF_REGID_ID_SIZE)) {
printf("Unsupported partition map: %s\n", &regid_id[0]);
return (1);
}
pms = (struct part_map_spare *)pmap;
pmap += UDF_PMAP_TYPE2_SIZE;
udfmp->s_table = malloc(le32toh(pms->st_size),
M_UDFMOUNT, M_NOWAIT | M_ZERO);
if (udfmp->s_table == NULL)
return (ENOMEM);
/* Calculate the number of sectors per packet. */
/* XXX Logical or physical? */
udfmp->p_sectors = le16toh(pms->packet_len) / udfmp->bsize;
/*
* XXX If reading the first Sparing Table fails, should look
* for another table.
*/
if ((error = udf_readdevblks(udfmp, le32toh(pms->st_loc[0]),
le32toh(pms->st_size), &bp)) != 0) {
if (bp != NULL)
brelse(bp);
printf("Failed to read Sparing Table at sector %d\n",
le32toh(pms->st_loc[0]));
free(udfmp->s_table, M_UDFMOUNT);
return (error);
}
bcopy(bp->b_data, udfmp->s_table, le32toh(pms->st_size));
brelse(bp);
if (udf_checktag(&udfmp->s_table->tag, 0)) {
printf("Invalid sparing table found\n");
free(udfmp->s_table, M_UDFMOUNT);
return (EINVAL);
}
/* See how many valid entries there are here. The list is
 * supposed to be sorted; 0xfffffff0 and higher are not valid.
 */
for (k = 0; k < le16toh(udfmp->s_table->rt_l); k++) {
udfmp->s_table_entries = k;
if (le32toh(udfmp->s_table->entries[k].org) >=
0xfffffff0)
break;
}
}
return (0);
}
diff --git a/sys/geom/cache/g_cache.c b/sys/geom/cache/g_cache.c
index d28c2ad62bad..05e45e6ea452 100644
--- a/sys/geom/cache/g_cache.c
+++ b/sys/geom/cache/g_cache.c
@@ -1,1014 +1,1014 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Ruslan Ermilov <ru@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/time.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/cache/g_cache.h>
FEATURE(geom_cache, "GEOM cache module");
static MALLOC_DEFINE(M_GCACHE, "gcache_data", "GEOM_CACHE Data");
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_CACHE stuff");
static u_int g_cache_debug = 0;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, debug, CTLFLAG_RW, &g_cache_debug, 0,
"Debug level");
static u_int g_cache_enable = 1;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, enable, CTLFLAG_RW, &g_cache_enable, 0,
"");
static u_int g_cache_timeout = 10;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, timeout, CTLFLAG_RW, &g_cache_timeout,
0, "");
static u_int g_cache_idletime = 5;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, idletime, CTLFLAG_RW, &g_cache_idletime,
0, "");
static u_int g_cache_used_lo = 5;
static u_int g_cache_used_hi = 20;
static int
sysctl_handle_pct(SYSCTL_HANDLER_ARGS)
{
u_int val = *(u_int *)arg1;
int error;
error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr)
return (error);
if (val > 100)
return (EINVAL);
if ((arg1 == &g_cache_used_lo && val > g_cache_used_hi) ||
(arg1 == &g_cache_used_hi && g_cache_used_lo > val))
return (EINVAL);
*(u_int *)arg1 = val;
return (0);
}
SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_lo,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &g_cache_used_lo, 0,
sysctl_handle_pct, "IU",
"");
SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_hi,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &g_cache_used_hi, 0,
sysctl_handle_pct, "IU",
"");
static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force);
static g_ctl_destroy_geom_t g_cache_destroy_geom;
static g_taste_t g_cache_taste;
static g_ctl_req_t g_cache_config;
static g_dumpconf_t g_cache_dumpconf;
struct g_class g_cache_class = {
.name = G_CACHE_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_cache_config,
.taste = g_cache_taste,
.destroy_geom = g_cache_destroy_geom
};
#define OFF2BNO(off, sc) ((off) >> (sc)->sc_bshift)
#define BNO2OFF(bno, sc) ((bno) << (sc)->sc_bshift)
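/*
 * Editor's note (illustrative, not part of the original source): with
 * sc_bshift == 16, i.e. a 64 kB cache block, OFF2BNO(0x12345, sc) == 1 and
 * BNO2OFF(1, sc) == 0x10000. An offset is truncated down to the number of
 * the cache block containing it, and a block number maps back to that
 * block's starting offset.
 */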
static struct g_cache_desc *
g_cache_alloc(struct g_cache_softc *sc)
{
struct g_cache_desc *dp;
mtx_assert(&sc->sc_mtx, MA_OWNED);
if (!TAILQ_EMPTY(&sc->sc_usedlist)) {
dp = TAILQ_FIRST(&sc->sc_usedlist);
TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
sc->sc_nused--;
dp->d_flags = 0;
LIST_REMOVE(dp, d_next);
return (dp);
}
if (sc->sc_nent > sc->sc_maxent) {
sc->sc_cachefull++;
return (NULL);
}
dp = malloc(sizeof(*dp), M_GCACHE, M_NOWAIT | M_ZERO);
if (dp == NULL)
return (NULL);
dp->d_data = uma_zalloc(sc->sc_zone, M_NOWAIT);
if (dp->d_data == NULL) {
free(dp, M_GCACHE);
return (NULL);
}
sc->sc_nent++;
return (dp);
}
static void
g_cache_free(struct g_cache_softc *sc, struct g_cache_desc *dp)
{
mtx_assert(&sc->sc_mtx, MA_OWNED);
uma_zfree(sc->sc_zone, dp->d_data);
free(dp, M_GCACHE);
sc->sc_nent--;
}
static void
g_cache_free_used(struct g_cache_softc *sc)
{
struct g_cache_desc *dp;
u_int n;
mtx_assert(&sc->sc_mtx, MA_OWNED);
n = g_cache_used_lo * sc->sc_maxent / 100;
while (sc->sc_nused > n) {
KASSERT(!TAILQ_EMPTY(&sc->sc_usedlist), ("used list empty"));
dp = TAILQ_FIRST(&sc->sc_usedlist);
TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
sc->sc_nused--;
LIST_REMOVE(dp, d_next);
g_cache_free(sc, dp);
}
}
static void
g_cache_deliver(struct g_cache_softc *sc, struct bio *bp,
struct g_cache_desc *dp, int error)
{
off_t off1, off, len;
mtx_assert(&sc->sc_mtx, MA_OWNED);
KASSERT(OFF2BNO(bp->bio_offset, sc) <= dp->d_bno, ("wrong entry"));
KASSERT(OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc) >=
dp->d_bno, ("wrong entry"));
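/*
 * Editor's note: [off, off + len) below is the intersection of the caller's
 * request [bio_offset, bio_offset + bio_length) with this cache block
 * [off1, off1 + sc_bsize).
 */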
off1 = BNO2OFF(dp->d_bno, sc);
off = MAX(bp->bio_offset, off1);
len = MIN(bp->bio_offset + bp->bio_length, off1 + sc->sc_bsize) - off;
if (bp->bio_error == 0)
bp->bio_error = error;
if (bp->bio_error == 0) {
bcopy(dp->d_data + (off - off1),
bp->bio_data + (off - bp->bio_offset), len);
}
bp->bio_completed += len;
KASSERT(bp->bio_completed <= bp->bio_length, ("extra data"));
if (bp->bio_completed == bp->bio_length) {
if (bp->bio_error != 0)
bp->bio_completed = 0;
g_io_deliver(bp, bp->bio_error);
}
if (dp->d_flags & D_FLAG_USED) {
TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used);
} else if (OFF2BNO(off + len, sc) > dp->d_bno) {
TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used);
sc->sc_nused++;
dp->d_flags |= D_FLAG_USED;
}
dp->d_atime = time_uptime;
}
static void
g_cache_done(struct bio *bp)
{
struct g_cache_softc *sc;
struct g_cache_desc *dp;
struct bio *bp2, *tmpbp;
sc = bp->bio_from->geom->softc;
KASSERT(G_CACHE_DESC1(bp) == sc, ("corrupt bio_caller in g_cache_done()"));
dp = G_CACHE_DESC2(bp);
mtx_lock(&sc->sc_mtx);
bp2 = dp->d_biolist;
while (bp2 != NULL) {
KASSERT(G_CACHE_NEXT_BIO1(bp2) == sc, ("corrupt bio_driver in g_cache_done()"));
tmpbp = G_CACHE_NEXT_BIO2(bp2);
g_cache_deliver(sc, bp2, dp, bp->bio_error);
bp2 = tmpbp;
}
dp->d_biolist = NULL;
if (dp->d_flags & D_FLAG_INVALID) {
sc->sc_invalid--;
g_cache_free(sc, dp);
} else if (bp->bio_error) {
LIST_REMOVE(dp, d_next);
if (dp->d_flags & D_FLAG_USED) {
TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
sc->sc_nused--;
}
g_cache_free(sc, dp);
}
mtx_unlock(&sc->sc_mtx);
g_destroy_bio(bp);
}
static struct g_cache_desc *
g_cache_lookup(struct g_cache_softc *sc, off_t bno)
{
struct g_cache_desc *dp;
mtx_assert(&sc->sc_mtx, MA_OWNED);
LIST_FOREACH(dp, &sc->sc_desclist[G_CACHE_BUCKET(bno)], d_next)
if (dp->d_bno == bno)
return (dp);
return (NULL);
}
static int
g_cache_read(struct g_cache_softc *sc, struct bio *bp)
{
struct bio *cbp;
struct g_cache_desc *dp;
mtx_lock(&sc->sc_mtx);
dp = g_cache_lookup(sc,
OFF2BNO(bp->bio_offset + bp->bio_completed, sc));
if (dp != NULL) {
/* Add to waiters list or deliver. */
sc->sc_cachehits++;
if (dp->d_biolist != NULL) {
G_CACHE_NEXT_BIO1(bp) = sc;
G_CACHE_NEXT_BIO2(bp) = dp->d_biolist;
dp->d_biolist = bp;
} else
g_cache_deliver(sc, bp, dp, 0);
mtx_unlock(&sc->sc_mtx);
return (0);
}
/* Cache miss. Allocate entry and schedule bio. */
sc->sc_cachemisses++;
dp = g_cache_alloc(sc);
if (dp == NULL) {
mtx_unlock(&sc->sc_mtx);
return (ENOMEM);
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_cache_free(sc, dp);
mtx_unlock(&sc->sc_mtx);
return (ENOMEM);
}
dp->d_bno = OFF2BNO(bp->bio_offset + bp->bio_completed, sc);
G_CACHE_NEXT_BIO1(bp) = sc;
G_CACHE_NEXT_BIO2(bp) = NULL;
dp->d_biolist = bp;
LIST_INSERT_HEAD(&sc->sc_desclist[G_CACHE_BUCKET(dp->d_bno)],
dp, d_next);
mtx_unlock(&sc->sc_mtx);
G_CACHE_DESC1(cbp) = sc;
G_CACHE_DESC2(cbp) = dp;
cbp->bio_done = g_cache_done;
cbp->bio_offset = BNO2OFF(dp->d_bno, sc);
cbp->bio_data = dp->d_data;
cbp->bio_length = sc->sc_bsize;
g_io_request(cbp, LIST_FIRST(&bp->bio_to->geom->consumer));
return (0);
}
static void
g_cache_invalidate(struct g_cache_softc *sc, struct bio *bp)
{
struct g_cache_desc *dp;
off_t bno, lim;
mtx_lock(&sc->sc_mtx);
bno = OFF2BNO(bp->bio_offset, sc);
lim = OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc);
do {
if ((dp = g_cache_lookup(sc, bno)) != NULL) {
LIST_REMOVE(dp, d_next);
if (dp->d_flags & D_FLAG_USED) {
TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
sc->sc_nused--;
}
if (dp->d_biolist == NULL)
g_cache_free(sc, dp);
else {
dp->d_flags = D_FLAG_INVALID;
sc->sc_invalid++;
}
}
bno++;
} while (bno <= lim);
mtx_unlock(&sc->sc_mtx);
}
static void
g_cache_start(struct bio *bp)
{
struct g_cache_softc *sc;
struct g_geom *gp;
struct g_cache_desc *dp;
struct bio *cbp;
gp = bp->bio_to->geom;
sc = gp->softc;
G_CACHE_LOGREQ(bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
sc->sc_reads++;
sc->sc_readbytes += bp->bio_length;
if (!g_cache_enable)
break;
if (bp->bio_offset + bp->bio_length > sc->sc_tail)
break;
if (OFF2BNO(bp->bio_offset, sc) ==
OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) {
sc->sc_cachereads++;
sc->sc_cachereadbytes += bp->bio_length;
if (g_cache_read(sc, bp) == 0)
return;
sc->sc_cachereads--;
sc->sc_cachereadbytes -= bp->bio_length;
break;
} else if (OFF2BNO(bp->bio_offset, sc) + 1 ==
OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) {
mtx_lock(&sc->sc_mtx);
dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset, sc));
if (dp == NULL || dp->d_biolist != NULL) {
mtx_unlock(&sc->sc_mtx);
break;
}
sc->sc_cachereads++;
sc->sc_cachereadbytes += bp->bio_length;
g_cache_deliver(sc, bp, dp, 0);
mtx_unlock(&sc->sc_mtx);
if (g_cache_read(sc, bp) == 0)
return;
sc->sc_cachereads--;
sc->sc_cachereadbytes -= bp->bio_length;
break;
}
break;
case BIO_WRITE:
sc->sc_writes++;
sc->sc_wrotebytes += bp->bio_length;
g_cache_invalidate(sc, bp);
break;
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
cbp->bio_done = g_std_done;
G_CACHE_LOGREQ(cbp, "Sending request.");
g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
static void
g_cache_go(void *arg)
{
struct g_cache_softc *sc = arg;
struct g_cache_desc *dp;
int i;
mtx_assert(&sc->sc_mtx, MA_OWNED);
/* Forcibly mark ready entries that have been idle for a while as used. */
for (i = 0; i < G_CACHE_BUCKETS; i++) {
LIST_FOREACH(dp, &sc->sc_desclist[i], d_next) {
if (dp->d_flags & D_FLAG_USED ||
dp->d_biolist != NULL ||
time_uptime - dp->d_atime < g_cache_idletime)
continue;
TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used);
sc->sc_nused++;
dp->d_flags |= D_FLAG_USED;
}
}
/* Keep the number of used entries low. */
if (sc->sc_nused > g_cache_used_hi * sc->sc_maxent / 100)
g_cache_free_used(sc);
callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc);
}
static int
g_cache_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_geom *gp;
struct g_consumer *cp;
int error;
gp = pp->geom;
cp = LIST_FIRST(&gp->consumer);
error = g_access(cp, dr, dw, de);
return (error);
}
static void
g_cache_orphan(struct g_consumer *cp)
{
g_topology_assert();
g_cache_destroy(cp->geom->softc, 1);
}
static struct g_cache_softc *
g_cache_find_device(struct g_class *mp, const char *name)
{
struct g_geom *gp;
LIST_FOREACH(gp, &mp->geom, geom) {
if (strcmp(gp->name, name) == 0)
return (gp->softc);
}
return (NULL);
}
static struct g_geom *
g_cache_create(struct g_class *mp, struct g_provider *pp,
const struct g_cache_metadata *md, u_int type)
{
struct g_cache_softc *sc;
struct g_geom *gp;
struct g_provider *newpp;
struct g_consumer *cp;
u_int bshift;
int i;
g_topology_assert();
gp = NULL;
newpp = NULL;
cp = NULL;
G_CACHE_DEBUG(1, "Creating device %s.", md->md_name);
/* Cache size is minimum 100. */
if (md->md_size < 100) {
G_CACHE_DEBUG(0, "Invalid size for device %s.", md->md_name);
return (NULL);
}
/* Block size restrictions. */
bshift = ffs(md->md_bsize) - 1;
- if (md->md_bsize == 0 || md->md_bsize > MAXPHYS ||
+ if (md->md_bsize == 0 || md->md_bsize > maxphys ||
md->md_bsize != 1 << bshift ||
(md->md_bsize % pp->sectorsize) != 0) {
G_CACHE_DEBUG(0, "Invalid blocksize for provider %s.", pp->name);
return (NULL);
}
/* Check for duplicate unit. */
if (g_cache_find_device(mp, (const char *)&md->md_name) != NULL) {
G_CACHE_DEBUG(0, "Provider %s already exists.", md->md_name);
return (NULL);
}
gp = g_new_geomf(mp, "%s", md->md_name);
sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
sc->sc_type = type;
sc->sc_bshift = bshift;
sc->sc_bsize = 1 << bshift;
sc->sc_zone = uma_zcreate("gcache", sc->sc_bsize, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
mtx_init(&sc->sc_mtx, "GEOM CACHE mutex", NULL, MTX_DEF);
for (i = 0; i < G_CACHE_BUCKETS; i++)
LIST_INIT(&sc->sc_desclist[i]);
TAILQ_INIT(&sc->sc_usedlist);
sc->sc_maxent = md->md_size;
callout_init_mtx(&sc->sc_callout, &sc->sc_mtx, 0);
gp->softc = sc;
sc->sc_geom = gp;
gp->start = g_cache_start;
gp->orphan = g_cache_orphan;
gp->access = g_cache_access;
gp->dumpconf = g_cache_dumpconf;
newpp = g_new_providerf(gp, "cache/%s", gp->name);
newpp->sectorsize = pp->sectorsize;
newpp->mediasize = pp->mediasize;
if (type == G_CACHE_TYPE_AUTOMATIC)
newpp->mediasize -= pp->sectorsize;
sc->sc_tail = BNO2OFF(OFF2BNO(newpp->mediasize, sc), sc);
cp = g_new_consumer(gp);
if (g_attach(cp, pp) != 0) {
G_CACHE_DEBUG(0, "Cannot attach to provider %s.", pp->name);
g_destroy_consumer(cp);
g_destroy_provider(newpp);
mtx_destroy(&sc->sc_mtx);
g_free(sc);
g_destroy_geom(gp);
return (NULL);
}
g_error_provider(newpp, 0);
G_CACHE_DEBUG(0, "Device %s created.", gp->name);
callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc);
return (gp);
}
static int
g_cache_destroy(struct g_cache_softc *sc, boolean_t force)
{
struct g_geom *gp;
struct g_provider *pp;
struct g_cache_desc *dp, *dp2;
int i;
g_topology_assert();
if (sc == NULL)
return (ENXIO);
gp = sc->sc_geom;
pp = LIST_FIRST(&gp->provider);
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
if (force) {
G_CACHE_DEBUG(0, "Device %s is still open, so it "
"can't be definitely removed.", pp->name);
} else {
G_CACHE_DEBUG(1, "Device %s is still open (r%dw%de%d).",
pp->name, pp->acr, pp->acw, pp->ace);
return (EBUSY);
}
} else {
G_CACHE_DEBUG(0, "Device %s removed.", gp->name);
}
callout_drain(&sc->sc_callout);
mtx_lock(&sc->sc_mtx);
for (i = 0; i < G_CACHE_BUCKETS; i++) {
dp = LIST_FIRST(&sc->sc_desclist[i]);
while (dp != NULL) {
dp2 = LIST_NEXT(dp, d_next);
g_cache_free(sc, dp);
dp = dp2;
}
}
mtx_unlock(&sc->sc_mtx);
mtx_destroy(&sc->sc_mtx);
uma_zdestroy(sc->sc_zone);
g_free(sc);
gp->softc = NULL;
g_wither_geom(gp, ENXIO);
return (0);
}
static int
g_cache_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
{
return (g_cache_destroy(gp->softc, 0));
}
static int
g_cache_read_metadata(struct g_consumer *cp, struct g_cache_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL)
return (error);
/* Decode metadata. */
cache_metadata_decode(buf, md);
g_free(buf);
return (0);
}
static int
g_cache_write_metadata(struct g_consumer *cp, struct g_cache_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 0, 1, 0);
if (error != 0)
return (error);
pp = cp->provider;
buf = malloc((size_t)pp->sectorsize, M_GCACHE, M_WAITOK | M_ZERO);
cache_metadata_encode(md, buf);
g_topology_unlock();
error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize);
g_topology_lock();
g_access(cp, 0, -1, 0);
free(buf, M_GCACHE);
return (error);
}
static struct g_geom *
g_cache_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_cache_metadata md;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
g_topology_assert();
G_CACHE_DEBUG(3, "Tasting %s.", pp->name);
gp = g_new_geomf(mp, "cache:taste");
gp->start = g_cache_start;
gp->orphan = g_cache_orphan;
gp->access = g_cache_access;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_cache_read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
if (strcmp(md.md_magic, G_CACHE_MAGIC) != 0)
return (NULL);
if (md.md_version > G_CACHE_VERSION) {
printf("geom_cache.ko module is too old to handle %s.\n",
pp->name);
return (NULL);
}
if (md.md_provsize != pp->mediasize)
return (NULL);
gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_AUTOMATIC);
if (gp == NULL) {
G_CACHE_DEBUG(0, "Can't create %s.", md.md_name);
return (NULL);
}
return (gp);
}
static void
g_cache_ctl_create(struct gctl_req *req, struct g_class *mp)
{
struct g_cache_metadata md;
struct g_provider *pp;
struct g_geom *gp;
intmax_t *bsize, *size;
const char *name;
int *nargs;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs != 2) {
gctl_error(req, "Invalid number of arguments.");
return;
}
strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic));
md.md_version = G_CACHE_VERSION;
name = gctl_get_asciiparam(req, "arg0");
if (name == NULL) {
gctl_error(req, "No 'arg0' argument");
return;
}
strlcpy(md.md_name, name, sizeof(md.md_name));
size = gctl_get_paraml(req, "size", sizeof(*size));
if (size == NULL) {
gctl_error(req, "No '%s' argument", "size");
return;
}
if ((u_int)*size < 100) {
gctl_error(req, "Invalid '%s' argument", "size");
return;
}
md.md_size = (u_int)*size;
bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize));
if (bsize == NULL) {
gctl_error(req, "No '%s' argument", "blocksize");
return;
}
if (*bsize < 0) {
gctl_error(req, "Invalid '%s' argument", "blocksize");
return;
}
md.md_bsize = (u_int)*bsize;
/* This field is not important here. */
md.md_provsize = 0;
pp = gctl_get_provider(req, "arg1");
if (pp == NULL)
return;
gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_MANUAL);
if (gp == NULL) {
gctl_error(req, "Can't create %s.", md.md_name);
return;
}
}
static void
g_cache_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
struct g_cache_metadata md;
struct g_cache_softc *sc;
struct g_consumer *cp;
intmax_t *bsize, *size;
const char *name;
int error, *nargs;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs != 1) {
gctl_error(req, "Missing device.");
return;
}
name = gctl_get_asciiparam(req, "arg0");
if (name == NULL) {
gctl_error(req, "No 'arg0' argument");
return;
}
sc = g_cache_find_device(mp, name);
if (sc == NULL) {
G_CACHE_DEBUG(1, "Device %s is invalid.", name);
gctl_error(req, "Device %s is invalid.", name);
return;
}
size = gctl_get_paraml(req, "size", sizeof(*size));
if (size == NULL) {
gctl_error(req, "No '%s' argument", "size");
return;
}
if ((u_int)*size != 0 && (u_int)*size < 100) {
gctl_error(req, "Invalid '%s' argument", "size");
return;
}
if ((u_int)*size != 0)
sc->sc_maxent = (u_int)*size;
bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize));
if (bsize == NULL) {
gctl_error(req, "No '%s' argument", "blocksize");
return;
}
if (*bsize < 0) {
gctl_error(req, "Invalid '%s' argument", "blocksize");
return;
}
if (sc->sc_type != G_CACHE_TYPE_AUTOMATIC)
return;
strlcpy(md.md_name, name, sizeof(md.md_name));
strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic));
md.md_version = G_CACHE_VERSION;
if ((u_int)*size != 0)
md.md_size = (u_int)*size;
else
md.md_size = sc->sc_maxent;
if ((u_int)*bsize != 0)
md.md_bsize = (u_int)*bsize;
else
md.md_bsize = sc->sc_bsize;
cp = LIST_FIRST(&sc->sc_geom->consumer);
md.md_provsize = cp->provider->mediasize;
error = g_cache_write_metadata(cp, &md);
if (error == 0)
G_CACHE_DEBUG(2, "Metadata on %s updated.", cp->provider->name);
else
G_CACHE_DEBUG(0, "Cannot update metadata on %s (error=%d).",
cp->provider->name, error);
}
static void
g_cache_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
int *nargs, *force, error, i;
struct g_cache_softc *sc;
const char *name;
char param[16];
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force == NULL) {
gctl_error(req, "No 'force' argument");
return;
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%d' argument", i);
return;
}
sc = g_cache_find_device(mp, name);
if (sc == NULL) {
G_CACHE_DEBUG(1, "Device %s is invalid.", name);
gctl_error(req, "Device %s is invalid.", name);
return;
}
error = g_cache_destroy(sc, *force);
if (error != 0) {
gctl_error(req, "Cannot destroy device %s (error=%d).",
sc->sc_name, error);
return;
}
}
}
static void
g_cache_ctl_reset(struct gctl_req *req, struct g_class *mp)
{
struct g_cache_softc *sc;
const char *name;
char param[16];
int i, *nargs;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%d' argument", i);
return;
}
sc = g_cache_find_device(mp, name);
if (sc == NULL) {
G_CACHE_DEBUG(1, "Device %s is invalid.", name);
gctl_error(req, "Device %s is invalid.", name);
return;
}
sc->sc_reads = 0;
sc->sc_readbytes = 0;
sc->sc_cachereads = 0;
sc->sc_cachereadbytes = 0;
sc->sc_cachehits = 0;
sc->sc_cachemisses = 0;
sc->sc_cachefull = 0;
sc->sc_writes = 0;
sc->sc_wrotebytes = 0;
}
}
static void
g_cache_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "No '%s' argument.", "version");
return;
}
if (*version != G_CACHE_VERSION) {
gctl_error(req, "Userland and kernel parts are out of sync.");
return;
}
if (strcmp(verb, "create") == 0) {
g_cache_ctl_create(req, mp);
return;
} else if (strcmp(verb, "configure") == 0) {
g_cache_ctl_configure(req, mp);
return;
} else if (strcmp(verb, "destroy") == 0 ||
strcmp(verb, "stop") == 0) {
g_cache_ctl_destroy(req, mp);
return;
} else if (strcmp(verb, "reset") == 0) {
g_cache_ctl_reset(req, mp);
return;
}
gctl_error(req, "Unknown verb.");
}
static void
g_cache_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_cache_softc *sc;
if (pp != NULL || cp != NULL)
return;
sc = gp->softc;
sbuf_printf(sb, "%s<Size>%u</Size>\n", indent, sc->sc_maxent);
sbuf_printf(sb, "%s<BlockSize>%u</BlockSize>\n", indent, sc->sc_bsize);
sbuf_printf(sb, "%s<TailOffset>%ju</TailOffset>\n", indent,
(uintmax_t)sc->sc_tail);
sbuf_printf(sb, "%s<Entries>%u</Entries>\n", indent, sc->sc_nent);
sbuf_printf(sb, "%s<UsedEntries>%u</UsedEntries>\n", indent,
sc->sc_nused);
sbuf_printf(sb, "%s<InvalidEntries>%u</InvalidEntries>\n", indent,
sc->sc_invalid);
sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent, sc->sc_reads);
sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
sc->sc_readbytes);
sbuf_printf(sb, "%s<CacheReads>%ju</CacheReads>\n", indent,
sc->sc_cachereads);
sbuf_printf(sb, "%s<CacheReadBytes>%ju</CacheReadBytes>\n", indent,
sc->sc_cachereadbytes);
sbuf_printf(sb, "%s<CacheHits>%ju</CacheHits>\n", indent,
sc->sc_cachehits);
sbuf_printf(sb, "%s<CacheMisses>%ju</CacheMisses>\n", indent,
sc->sc_cachemisses);
sbuf_printf(sb, "%s<CacheFull>%ju</CacheFull>\n", indent,
sc->sc_cachefull);
sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent, sc->sc_writes);
sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
sc->sc_wrotebytes);
}
DECLARE_GEOM_CLASS(g_cache_class, g_cache);
MODULE_VERSION(geom_cache, 0);
diff --git a/sys/geom/eli/g_eli_integrity.c b/sys/geom/eli/g_eli_integrity.c
index d20b753256dd..4cf982e3ddfa 100644
--- a/sys/geom/eli/g_eli_integrity.c
+++ b/sys/geom/eli/g_eli_integrity.c
@@ -1,556 +1,556 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2005-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/vnode.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/eli/g_eli.h>
#include <geom/eli/pkcs5v2.h>
/*
* The data layout description when integrity verification is configured.
*
* One of the most important assumptions here is that authenticated data and its
* HMAC have to be stored in the same place (namely in the same sector) to make
* it work reliably.
* The problem is that file systems work only with sectors whose size is a
* multiple of 512 bytes and a power of two.
* My idea to implement it is as follows.
* Let's store the HMAC in the sector. This is a must. This leaves us 480 bytes
* for data. We can't use that directly (i.e. we can't create a provider with a
* 480-byte sector size). We need another sector from which we take only 32
* bytes of data and store the HMAC of this data as well. This takes two sectors
* from the original provider at the input and leaves us one sector of
* authenticated data at the output. Not very efficient, but you get the idea.
* Now, let's assume we want to create a provider with a 4096-byte sector.
* To output 4096 bytes of authenticated data we need 8x480 plus 1x256, so we
* need nine 512-byte sectors at the input to get one 4096-byte sector at the
* output. That's better. With a 4096-byte sector we can use 89% of the size of
* the original provider. I find that an acceptable cost.
* The reliability comes from the fact that every HMAC stored inside a sector
* is calculated only for the data in that same sector, so it's impossible to
* write new data and leave the old HMAC, or vice versa.
*
* And here is the picture:
*
* da0: +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+
* |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |256b |
* |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data |
* +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+
* |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |288 bytes |
* +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ |224 unused|
* +----------+
* da0.eli: +----+----+----+----+----+----+----+----+----+
* |480b|480b|480b|480b|480b|480b|480b|480b|256b|
* +----+----+----+----+----+----+----+----+----+
* | 4096 bytes |
* +--------------------------------------------+
*
* PS. You can use any sector size with geli(8). My example is using 4kB,
* because it's most efficient. For 8kB sectors you need 2 extra sectors,
* so the cost is the same as for 4kB sectors.
*/
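/*
 * Editor's worked example (illustrative only), using the softc field names
 * that appear below: for a 4096-byte decrypted sector on a 512-byte
 * encrypted provider with a 32-byte HMAC, sc_alen = 32,
 * sc_data_per_sector = 480 and sc_bytes_per_sector = 9 * 512 = 4608.
 * An 8192-byte read (two decrypted sectors) then touches
 * (8192 / 4096) * 4608 / 512 = 18 encrypted sectors, and every ninth of
 * them carries only 4096 % 480 = 256 data bytes - exactly the "eg." values
 * quoted in the comments of g_eli_auth_read_done() and g_eli_auth_run().
 */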
/*
* Code paths:
* BIO_READ:
* g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> g_eli_auth_read_done -> g_io_deliver
* BIO_WRITE:
* g_eli_start -> g_eli_auth_run -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
MALLOC_DECLARE(M_ELI);
/*
* Here we generate the key for the HMAC. Every sector has its own HMAC key, so
* it is not possible to copy sectors.
* We cannot depend on the fact that every sector has its own IV, because a
* different IV doesn't change the HMAC when we use the
* encrypt-then-authenticate method.
*/
static void
g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key)
{
SHA256_CTX ctx;
/* Copy precalculated SHA256 context. */
bcopy(&sc->sc_akeyctx, &ctx, sizeof(ctx));
SHA256_Update(&ctx, (uint8_t *)&offset, sizeof(offset));
SHA256_Final(key, &ctx);
}
/*
* The function is called after we read and decrypt data.
*
* g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> G_ELI_AUTH_READ_DONE -> g_io_deliver
*/
static int
g_eli_auth_read_done(struct cryptop *crp)
{
struct g_eli_softc *sc;
struct bio *bp;
if (crp->crp_etype == EAGAIN) {
if (g_eli_crypto_rerun(crp) == 0)
return (0);
}
bp = (struct bio *)crp->crp_opaque;
bp->bio_inbed++;
sc = bp->bio_to->geom->softc;
if (crp->crp_etype == 0) {
bp->bio_completed += crp->crp_payload_length;
G_ELI_DEBUG(3, "Crypto READ request done (%d/%d) (add=%d completed=%jd).",
bp->bio_inbed, bp->bio_children, crp->crp_payload_length, (intmax_t)bp->bio_completed);
} else {
u_int nsec, decr_secsize, encr_secsize, rel_sec;
int *errorp;
/* Sectorsize of decrypted provider eg. 4096. */
decr_secsize = bp->bio_to->sectorsize;
/* The real sectorsize of encrypted provider, eg. 512. */
encr_secsize =
LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize;
/* Number of sectors from decrypted provider, eg. 2. */
nsec = bp->bio_length / decr_secsize;
/* Number of sectors from encrypted provider, eg. 18. */
nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize;
/* Which relative sector this request decrypted. */
rel_sec = ((crp->crp_buf.cb_buf + crp->crp_payload_start) -
(char *)bp->bio_driver2) / encr_secsize;
errorp = (int *)((char *)bp->bio_driver2 + encr_secsize * nsec +
sizeof(int) * rel_sec);
*errorp = crp->crp_etype;
G_ELI_DEBUG(1,
"Crypto READ request failed (%d/%d) error=%d.",
bp->bio_inbed, bp->bio_children, crp->crp_etype);
if (bp->bio_error == 0 || bp->bio_error == EINTEGRITY)
bp->bio_error = crp->crp_etype == EBADMSG ?
EINTEGRITY : crp->crp_etype;
}
if (crp->crp_cipher_key != NULL)
g_eli_key_drop(sc, __DECONST(void *, crp->crp_cipher_key));
crypto_freereq(crp);
/*
* Do we have all sectors already?
*/
if (bp->bio_inbed < bp->bio_children)
return (0);
if (bp->bio_error == 0) {
u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
u_char *srcdata, *dstdata;
/* Sectorsize of decrypted provider eg. 4096. */
decr_secsize = bp->bio_to->sectorsize;
/* The real sectorsize of encrypted provider, eg. 512. */
encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize;
/* Number of data bytes in one encrypted sector, eg. 480. */
data_secsize = sc->sc_data_per_sector;
/* Number of sectors from decrypted provider, eg. 2. */
nsec = bp->bio_length / decr_secsize;
/* Number of sectors from encrypted provider, eg. 18. */
nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize;
/* Last sector number in every big sector, eg. 9. */
lsec = sc->sc_bytes_per_sector / encr_secsize;
srcdata = bp->bio_driver2;
dstdata = bp->bio_data;
for (i = 1; i <= nsec; i++) {
data_secsize = sc->sc_data_per_sector;
if ((i % lsec) == 0)
data_secsize = decr_secsize % data_secsize;
bcopy(srcdata + sc->sc_alen, dstdata, data_secsize);
srcdata += encr_secsize;
dstdata += data_secsize;
}
} else if (bp->bio_error == EINTEGRITY) {
u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
int *errorp;
off_t coroff, corsize, dstoff;
/* Sectorsize of decrypted provider eg. 4096. */
decr_secsize = bp->bio_to->sectorsize;
/* The real sectorsize of encrypted provider, eg. 512. */
encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize;
/* Number of data bytes in one encrypted sector, eg. 480. */
data_secsize = sc->sc_data_per_sector;
/* Number of sectors from decrypted provider, eg. 2. */
nsec = bp->bio_length / decr_secsize;
/* Number of sectors from encrypted provider, eg. 18. */
nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize;
/* Last sector number in every big sector, eg. 9. */
lsec = sc->sc_bytes_per_sector / encr_secsize;
errorp = (int *)((char *)bp->bio_driver2 + encr_secsize * nsec);
coroff = -1;
corsize = 0;
dstoff = bp->bio_offset;
for (i = 1; i <= nsec; i++) {
data_secsize = sc->sc_data_per_sector;
if ((i % lsec) == 0)
data_secsize = decr_secsize % data_secsize;
if (errorp[i - 1] == EBADMSG) {
/*
* Corruption detected, remember the offset if
* this is the first corrupted sector and
* increase size.
*/
if (coroff == -1)
coroff = dstoff;
corsize += data_secsize;
} else {
/*
* No corruption, good.
* Report previous corruption if there was one.
*/
if (coroff != -1) {
G_ELI_DEBUG(0, "%s: Failed to authenticate %jd "
"bytes of data at offset %jd.",
sc->sc_name, (intmax_t)corsize,
(intmax_t)coroff);
coroff = -1;
corsize = 0;
}
}
dstoff += data_secsize;
}
/* Report previous corruption if there was one. */
if (coroff != -1) {
G_ELI_DEBUG(0, "%s: Failed to authenticate %jd "
"bytes of data at offset %jd.",
sc->sc_name, (intmax_t)corsize, (intmax_t)coroff);
}
}
free(bp->bio_driver2, M_ELI);
bp->bio_driver2 = NULL;
if (bp->bio_error != 0) {
if (bp->bio_error != EINTEGRITY) {
G_ELI_LOGREQ(0, bp,
"Crypto READ request failed (error=%d).",
bp->bio_error);
}
bp->bio_completed = 0;
}
/*
* Read is finished, send it up.
*/
g_io_deliver(bp, bp->bio_error);
atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
/*
* The function is called after data encryption.
*
* g_eli_start -> g_eli_auth_run -> G_ELI_AUTH_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
static int
g_eli_auth_write_done(struct cryptop *crp)
{
struct g_eli_softc *sc;
struct g_consumer *cp;
struct bio *bp, *cbp, *cbp2;
u_int nsec;
if (crp->crp_etype == EAGAIN) {
if (g_eli_crypto_rerun(crp) == 0)
return (0);
}
bp = (struct bio *)crp->crp_opaque;
bp->bio_inbed++;
if (crp->crp_etype == 0) {
G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).",
bp->bio_inbed, bp->bio_children);
} else {
G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.",
bp->bio_inbed, bp->bio_children, crp->crp_etype);
if (bp->bio_error == 0)
bp->bio_error = crp->crp_etype;
}
sc = bp->bio_to->geom->softc;
if (crp->crp_cipher_key != NULL)
g_eli_key_drop(sc, __DECONST(void *, crp->crp_cipher_key));
crypto_freereq(crp);
/*
* All sectors are already encrypted?
*/
if (bp->bio_inbed < bp->bio_children)
return (0);
if (bp->bio_error != 0) {
G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
bp->bio_error);
free(bp->bio_driver2, M_ELI);
bp->bio_driver2 = NULL;
cbp = bp->bio_driver1;
bp->bio_driver1 = NULL;
g_destroy_bio(cbp);
g_io_deliver(bp, bp->bio_error);
atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
cp = LIST_FIRST(&sc->sc_geom->consumer);
cbp = bp->bio_driver1;
bp->bio_driver1 = NULL;
cbp->bio_to = cp->provider;
cbp->bio_done = g_eli_write_done;
/* Number of sectors from decrypted provider, eg. 1. */
nsec = bp->bio_length / bp->bio_to->sectorsize;
/* Number of sectors from encrypted provider, eg. 9. */
nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize;
cbp->bio_length = cp->provider->sectorsize * nsec;
cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector;
cbp->bio_data = bp->bio_driver2;
/*
* We write more than what is requested, so we have to be ready to write
- * more than MAXPHYS.
+ * more than maxphys.
*/
cbp2 = NULL;
- if (cbp->bio_length > MAXPHYS) {
+ if (cbp->bio_length > maxphys) {
cbp2 = g_duplicate_bio(bp);
- cbp2->bio_length = cbp->bio_length - MAXPHYS;
- cbp2->bio_data = cbp->bio_data + MAXPHYS;
- cbp2->bio_offset = cbp->bio_offset + MAXPHYS;
+ cbp2->bio_length = cbp->bio_length - maxphys;
+ cbp2->bio_data = cbp->bio_data + maxphys;
+ cbp2->bio_offset = cbp->bio_offset + maxphys;
cbp2->bio_to = cp->provider;
cbp2->bio_done = g_eli_write_done;
- cbp->bio_length = MAXPHYS;
+ cbp->bio_length = maxphys;
}
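/*
 * Editor's note: e.g. if the encrypted length exceeds maxphys by 4608 bytes,
 * cbp carries the first maxphys bytes and cbp2 the remaining 4608 bytes of
 * the same data buffer.
 */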
/*
* Send encrypted data to the provider.
*/
G_ELI_LOGREQ(2, cbp, "Sending request.");
bp->bio_inbed = 0;
bp->bio_children = (cbp2 != NULL ? 2 : 1);
g_io_request(cbp, cp);
if (cbp2 != NULL) {
G_ELI_LOGREQ(2, cbp2, "Sending request.");
g_io_request(cbp2, cp);
}
return (0);
}
void
g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp)
{
struct g_consumer *cp;
struct bio *cbp, *cbp2;
size_t size;
off_t nsec;
bp->bio_pflags = 0;
cp = LIST_FIRST(&sc->sc_geom->consumer);
cbp = bp->bio_driver1;
bp->bio_driver1 = NULL;
cbp->bio_to = cp->provider;
cbp->bio_done = g_eli_read_done;
/* Number of sectors from decrypted provider, eg. 1. */
nsec = bp->bio_length / bp->bio_to->sectorsize;
/* Number of sectors from encrypted provider, eg. 9. */
nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize;
cbp->bio_length = cp->provider->sectorsize * nsec;
size = cbp->bio_length;
size += sizeof(int) * nsec;
size += G_ELI_AUTH_SECKEYLEN * nsec;
cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector;
bp->bio_driver2 = malloc(size, M_ELI, M_WAITOK);
cbp->bio_data = bp->bio_driver2;
/* Clear the error array. */
memset((char *)bp->bio_driver2 + cbp->bio_length, 0,
sizeof(int) * nsec);
/*
* We read more than what is requested, so we have to be ready to read
- * more than MAXPHYS.
+ * more than maxphys.
*/
cbp2 = NULL;
- if (cbp->bio_length > MAXPHYS) {
+ if (cbp->bio_length > maxphys) {
cbp2 = g_duplicate_bio(bp);
- cbp2->bio_length = cbp->bio_length - MAXPHYS;
- cbp2->bio_data = cbp->bio_data + MAXPHYS;
- cbp2->bio_offset = cbp->bio_offset + MAXPHYS;
+ cbp2->bio_length = cbp->bio_length - maxphys;
+ cbp2->bio_data = cbp->bio_data + maxphys;
+ cbp2->bio_offset = cbp->bio_offset + maxphys;
cbp2->bio_to = cp->provider;
cbp2->bio_done = g_eli_read_done;
- cbp->bio_length = MAXPHYS;
+ cbp->bio_length = maxphys;
}
/*
* Read encrypted data from provider.
*/
G_ELI_LOGREQ(2, cbp, "Sending request.");
g_io_request(cbp, cp);
if (cbp2 != NULL) {
G_ELI_LOGREQ(2, cbp2, "Sending request.");
g_io_request(cbp2, cp);
}
}
/*
* This is the main function responsible for cryptography (ie. communication
* with crypto(9) subsystem).
*
* BIO_READ:
* g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver
* BIO_WRITE:
* g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
void
g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp)
{
struct g_eli_softc *sc;
struct cryptop *crp;
u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
off_t dstoff;
u_char *p, *data, *authkey, *plaindata;
int error;
G_ELI_LOGREQ(3, bp, "%s", __func__);
bp->bio_pflags = wr->w_number;
sc = wr->w_softc;
/* Sectorsize of decrypted provider eg. 4096. */
decr_secsize = bp->bio_to->sectorsize;
/* The real sectorsize of encrypted provider, eg. 512. */
encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize;
/* Number of data bytes in one encrypted sector, eg. 480. */
data_secsize = sc->sc_data_per_sector;
/* Number of sectors from decrypted provider, eg. 2. */
nsec = bp->bio_length / decr_secsize;
/* Number of sectors from encrypted provider, eg. 18. */
nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize;
/* Last sector number in every big sector, eg. 9. */
lsec = sc->sc_bytes_per_sector / encr_secsize;
/* Destination offset, used for IV generation. */
dstoff = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector;
plaindata = bp->bio_data;
if (bp->bio_cmd == BIO_READ) {
data = bp->bio_driver2;
p = data + encr_secsize * nsec;
p += sizeof(int) * nsec;
} else {
size_t size;
size = encr_secsize * nsec;
size += G_ELI_AUTH_SECKEYLEN * nsec;
size += sizeof(uintptr_t); /* Space for alignment. */
data = malloc(size, M_ELI, M_WAITOK);
bp->bio_driver2 = data;
p = data + encr_secsize * nsec;
}
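/*
 * Editor's note: bp->bio_driver2 thus holds the encrypted sector data
 * (encr_secsize * nsec bytes), followed on reads by a per-sector error
 * array (sizeof(int) * nsec, filled in by g_eli_auth_read_done()) and then
 * the per-sector HMAC keys (G_ELI_AUTH_SECKEYLEN bytes each); on writes the
 * error array is omitted and p points straight at the key area.
 */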
bp->bio_inbed = 0;
bp->bio_children = nsec;
#if defined(__mips_n64) || defined(__mips_o64)
p = (char *)roundup((uintptr_t)p, sizeof(uintptr_t));
#endif
for (i = 1; i <= nsec; i++, dstoff += encr_secsize) {
crp = crypto_getreq(wr->w_sid, M_WAITOK);
authkey = (u_char *)p; p += G_ELI_AUTH_SECKEYLEN;
data_secsize = sc->sc_data_per_sector;
if ((i % lsec) == 0) {
data_secsize = decr_secsize % data_secsize;
/*
* Last encrypted sector of each decrypted sector is
* only partially filled.
*/
if (bp->bio_cmd == BIO_WRITE)
memset(data + sc->sc_alen + data_secsize, 0,
encr_secsize - sc->sc_alen - data_secsize);
}
if (bp->bio_cmd == BIO_WRITE) {
bcopy(plaindata, data + sc->sc_alen, data_secsize);
plaindata += data_secsize;
}
crypto_use_buf(crp, data, sc->sc_alen + data_secsize);
crp->crp_opaque = (void *)bp;
data += encr_secsize;
crp->crp_flags = CRYPTO_F_CBIFSYNC;
if (g_eli_batch)
crp->crp_flags |= CRYPTO_F_BATCH;
if (bp->bio_cmd == BIO_WRITE) {
crp->crp_callback = g_eli_auth_write_done;
crp->crp_op = CRYPTO_OP_ENCRYPT |
CRYPTO_OP_COMPUTE_DIGEST;
} else {
crp->crp_callback = g_eli_auth_read_done;
crp->crp_op = CRYPTO_OP_DECRYPT |
CRYPTO_OP_VERIFY_DIGEST;
}
crp->crp_digest_start = 0;
crp->crp_payload_start = sc->sc_alen;
crp->crp_payload_length = data_secsize;
if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) == 0) {
crp->crp_cipher_key = g_eli_key_hold(sc, dstoff,
encr_secsize);
}
if (g_eli_ivlen(sc->sc_ealgo) != 0) {
crp->crp_flags |= CRYPTO_F_IV_SEPARATE;
g_eli_crypto_ivgen(sc, dstoff, crp->crp_iv,
sizeof(crp->crp_iv));
}
g_eli_auth_keygen(sc, dstoff, authkey);
crp->crp_auth_key = authkey;
error = crypto_dispatch(crp);
KASSERT(error == 0, ("crypto_dispatch() failed (error=%d)",
error));
}
}
diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c
index 3dcf8a38a1e2..9c33ab71e6c8 100644
--- a/sys/geom/geom_dev.c
+++ b/sys/geom/geom_dev.c
@@ -1,899 +1,899 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
* and NAI Labs, the Security Research Division of Network Associates, Inc.
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The names of the authors may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/ctype.h>
#include <sys/bio.h>
#include <sys/devctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/sysctl.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <machine/stdarg.h>
struct g_dev_softc {
struct mtx sc_mtx;
struct cdev *sc_dev;
struct cdev *sc_alias;
int sc_open;
u_int sc_active;
#define SC_A_DESTROY (1 << 31)
#define SC_A_OPEN (1 << 30)
#define SC_A_ACTIVE (SC_A_OPEN - 1)
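/*
 * Editor's note: sc_active packs a count of in-flight requests in its low
 * 30 bits (the SC_A_ACTIVE mask); bit 30 (SC_A_OPEN) mirrors whether
 * sc_open is non-zero, as maintained in g_dev_open()/g_dev_close() below,
 * and bit 31 (SC_A_DESTROY) presumably flags a pending destroy.
 */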
};
static d_open_t g_dev_open;
static d_close_t g_dev_close;
static d_strategy_t g_dev_strategy;
static d_ioctl_t g_dev_ioctl;
static struct cdevsw g_dev_cdevsw = {
.d_version = D_VERSION,
.d_open = g_dev_open,
.d_close = g_dev_close,
.d_read = physread,
.d_write = physwrite,
.d_ioctl = g_dev_ioctl,
.d_strategy = g_dev_strategy,
.d_name = "g_dev",
.d_flags = D_DISK | D_TRACKCLOSE,
};
static g_init_t g_dev_init;
static g_fini_t g_dev_fini;
static g_taste_t g_dev_taste;
static g_orphan_t g_dev_orphan;
static g_attrchanged_t g_dev_attrchanged;
static g_resize_t g_dev_resize;
static struct g_class g_dev_class = {
.name = "DEV",
.version = G_VERSION,
.init = g_dev_init,
.fini = g_dev_fini,
.taste = g_dev_taste,
.orphan = g_dev_orphan,
.attrchanged = g_dev_attrchanged,
.resize = g_dev_resize
};
/*
* We target 262144 (8 x 32768) sectors by default as this significantly
* increases the throughput on commonly used SSDs with a marginal
* increase in non-interruptible request latency.
*/
static uint64_t g_dev_del_max_sectors = 262144;
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_DEV stuff");
SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW,
&g_dev_del_max_sectors, 0, "Maximum number of sectors in a single "
"delete request sent to the provider. Larger requests are chunked "
"so they can be interrupted. (0 = disable chunking)");
static char *dumpdev = NULL;
static void
g_dev_init(struct g_class *mp)
{
dumpdev = kern_getenv("dumpdev");
}
static void
g_dev_fini(struct g_class *mp)
{
freeenv(dumpdev);
dumpdev = NULL;
}
static int
g_dev_setdumpdev(struct cdev *dev, struct diocskerneldump_arg *kda)
{
struct g_kerneldump kd;
struct g_consumer *cp;
int error, len;
MPASS(dev != NULL && kda != NULL);
MPASS(kda->kda_index != KDA_REMOVE);
cp = dev->si_drv2;
len = sizeof(kd);
memset(&kd, 0, len);
kd.offset = 0;
kd.length = OFF_MAX;
error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd);
if (error != 0)
return (error);
error = dumper_insert(&kd.di, devtoname(dev), kda);
if (error == 0)
dev->si_flags |= SI_DUMPDEV;
return (error);
}
static int
init_dumpdev(struct cdev *dev)
{
struct diocskerneldump_arg kda;
struct g_consumer *cp;
const char *devprefix = _PATH_DEV, *devname;
int error;
size_t len;
bzero(&kda, sizeof(kda));
kda.kda_index = KDA_APPEND;
if (dumpdev == NULL)
return (0);
len = strlen(devprefix);
devname = devtoname(dev);
if (strcmp(devname, dumpdev) != 0 &&
(strncmp(dumpdev, devprefix, len) != 0 ||
strcmp(devname, dumpdev + len) != 0))
return (0);
cp = (struct g_consumer *)dev->si_drv2;
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
error = g_dev_setdumpdev(dev, &kda);
if (error == 0) {
freeenv(dumpdev);
dumpdev = NULL;
}
(void)g_access(cp, -1, 0, 0);
return (error);
}
static void
g_dev_destroy(void *arg, int flags __unused)
{
struct g_consumer *cp;
struct g_geom *gp;
struct g_dev_softc *sc;
char buf[SPECNAMELEN + 6];
g_topology_assert();
cp = arg;
gp = cp->geom;
sc = cp->private;
g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name);
snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
devctl_notify("GEOM", "DEV", "DESTROY", buf);
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
g_detach(cp);
g_destroy_consumer(cp);
g_destroy_geom(gp);
mtx_destroy(&sc->sc_mtx);
g_free(sc);
}
void
g_dev_print(void)
{
struct g_geom *gp;
char const *p = "";
LIST_FOREACH(gp, &g_dev_class.geom, geom) {
printf("%s%s", p, gp->name);
p = " ";
}
printf("\n");
}
static void
g_dev_set_physpath(struct g_consumer *cp)
{
struct g_dev_softc *sc;
char *physpath;
int error, physpath_len;
if (g_access(cp, 1, 0, 0) != 0)
return;
sc = cp->private;
physpath_len = MAXPATHLEN;
physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
g_access(cp, -1, 0, 0);
if (error == 0 && strlen(physpath) != 0) {
struct cdev *dev, *old_alias_dev;
struct cdev **alias_devp;
dev = sc->sc_dev;
old_alias_dev = sc->sc_alias;
alias_devp = (struct cdev **)&sc->sc_alias;
make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev,
old_alias_dev, physpath);
} else if (sc->sc_alias) {
destroy_dev((struct cdev *)sc->sc_alias);
sc->sc_alias = NULL;
}
g_free(physpath);
}
static void
g_dev_set_media(struct g_consumer *cp)
{
struct g_dev_softc *sc;
struct cdev *dev;
char buf[SPECNAMELEN + 6];
sc = cp->private;
dev = sc->sc_dev;
snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name);
devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf);
devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf);
dev = sc->sc_alias;
if (dev != NULL) {
snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name);
devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf);
devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf);
}
}
static void
g_dev_attrchanged(struct g_consumer *cp, const char *attr)
{
if (strcmp(attr, "GEOM::media") == 0) {
g_dev_set_media(cp);
return;
}
if (strcmp(attr, "GEOM::physpath") == 0) {
g_dev_set_physpath(cp);
return;
}
}
static void
g_dev_resize(struct g_consumer *cp)
{
char buf[SPECNAMELEN + 6];
snprintf(buf, sizeof(buf), "cdev=%s", cp->provider->name);
devctl_notify("GEOM", "DEV", "SIZECHANGE", buf);
}
struct g_provider *
g_dev_getprovider(struct cdev *dev)
{
struct g_consumer *cp;
g_topology_assert();
if (dev == NULL)
return (NULL);
if (dev->si_devsw != &g_dev_cdevsw)
return (NULL);
cp = dev->si_drv2;
return (cp->provider);
}
static struct g_geom *
g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused)
{
struct g_geom *gp;
struct g_geom_alias *gap;
struct g_consumer *cp;
struct g_dev_softc *sc;
int error;
struct cdev *dev, *adev;
char buf[SPECNAMELEN + 6];
struct make_dev_args args;
g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name);
g_topology_assert();
gp = g_new_geomf(mp, "%s", pp->name);
sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF);
cp = g_new_consumer(gp);
cp->private = sc;
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
error = g_attach(cp, pp);
if (error != 0) {
printf("%s: g_dev_taste(%s) failed to g_attach, error=%d\n",
__func__, pp->name, error);
g_destroy_consumer(cp);
g_destroy_geom(gp);
mtx_destroy(&sc->sc_mtx);
g_free(sc);
return (NULL);
}
make_dev_args_init(&args);
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
args.mda_devsw = &g_dev_cdevsw;
args.mda_cr = NULL;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv1 = sc;
args.mda_si_drv2 = cp;
error = make_dev_s(&args, &sc->sc_dev, "%s", gp->name);
if (error != 0) {
printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n",
__func__, gp->name, error);
g_detach(cp);
g_destroy_consumer(cp);
g_destroy_geom(gp);
mtx_destroy(&sc->sc_mtx);
g_free(sc);
return (NULL);
}
dev = sc->sc_dev;
dev->si_flags |= SI_UNMAPPED;
- dev->si_iosize_max = MAXPHYS;
+ dev->si_iosize_max = maxphys;
error = init_dumpdev(dev);
if (error != 0)
printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n",
__func__, gp->name, error);
g_dev_attrchanged(cp, "GEOM::physpath");
snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
devctl_notify("GEOM", "DEV", "CREATE", buf);
/*
* Now add all the aliases for this drive
*/
LIST_FOREACH(gap, &pp->aliases, ga_next) {
error = make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &adev, dev,
"%s", gap->ga_alias);
if (error) {
printf("%s: make_dev_alias_p() failed (name=%s, error=%d)\n",
__func__, gap->ga_alias, error);
continue;
}
snprintf(buf, sizeof(buf), "cdev=%s", gap->ga_alias);
devctl_notify("GEOM", "DEV", "CREATE", buf);
}
return (gp);
}
static int
g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct g_consumer *cp;
struct g_dev_softc *sc;
int error, r, w, e;
cp = dev->si_drv2;
g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)",
cp->geom->name, flags, fmt, td);
r = flags & FREAD ? 1 : 0;
w = flags & FWRITE ? 1 : 0;
#ifdef notyet
e = flags & O_EXCL ? 1 : 0;
#else
e = 0;
#endif
/*
* This happens on attempt to open a device node with O_EXEC.
*/
if (r + w + e == 0)
return (EINVAL);
if (w) {
/*
* When running in very secure mode, do not allow
* opens for writing of any disks.
*/
error = securelevel_ge(td->td_ucred, 2);
if (error)
return (error);
}
g_topology_lock();
error = g_access(cp, r, w, e);
g_topology_unlock();
if (error == 0) {
sc = dev->si_drv1;
mtx_lock(&sc->sc_mtx);
if (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0)
wakeup(&sc->sc_active);
sc->sc_open += r + w + e;
if (sc->sc_open == 0)
atomic_clear_int(&sc->sc_active, SC_A_OPEN);
else
atomic_set_int(&sc->sc_active, SC_A_OPEN);
mtx_unlock(&sc->sc_mtx);
}
return (error);
}
static int
g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct g_consumer *cp;
struct g_dev_softc *sc;
int error, r, w, e;
cp = dev->si_drv2;
g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)",
cp->geom->name, flags, fmt, td);
r = flags & FREAD ? -1 : 0;
w = flags & FWRITE ? -1 : 0;
#ifdef notyet
e = flags & O_EXCL ? -1 : 0;
#else
e = 0;
#endif
/*
* vgonel(9) - caused by e.g. a forced unmount of devfs - calls
* VOP_CLOSE(9) on the devfs vnode without any FREAD or FWRITE flags,
* which would result in zero deltas, which in turn would cause a
* panic in g_access(9).
*
* Note that we cannot zero the counters (i.e. do "r = cp->acr"
* etc.) instead, because the consumer might be opened in another
* devfs instance.
*/
if (r + w + e == 0)
return (EINVAL);
sc = dev->si_drv1;
mtx_lock(&sc->sc_mtx);
sc->sc_open += r + w + e;
if (sc->sc_open == 0)
atomic_clear_int(&sc->sc_active, SC_A_OPEN);
else
atomic_set_int(&sc->sc_active, SC_A_OPEN);
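/*
 * If this was the last close, wait (polling every hz/10 ticks) for
 * the outstanding requests to drain before dropping the access
 * counts below.
 */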
while (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0)
msleep(&sc->sc_active, &sc->sc_mtx, 0, "g_dev_close", hz / 10);
mtx_unlock(&sc->sc_mtx);
g_topology_lock();
error = g_access(cp, r, w, e);
g_topology_unlock();
return (error);
}
static int
g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
struct g_consumer *cp;
struct g_provider *pp;
off_t offset, length, chunk, odd;
int i, error;
#ifdef COMPAT_FREEBSD12
struct diocskerneldump_arg kda_copy;
#endif
cp = dev->si_drv2;
pp = cp->provider;
/* If consumer or provider is dying, don't disturb. */
if (cp->flags & G_CF_ORPHAN)
return (ENXIO);
if (pp->error)
return (pp->error);
error = 0;
KASSERT(cp->acr || cp->acw,
("Consumer with zero access count in g_dev_ioctl"));
i = IOCPARM_LEN(cmd);
switch (cmd) {
case DIOCGSECTORSIZE:
*(u_int *)data = pp->sectorsize;
if (*(u_int *)data == 0)
error = ENOENT;
break;
case DIOCGMEDIASIZE:
*(off_t *)data = pp->mediasize;
if (*(off_t *)data == 0)
error = ENOENT;
break;
case DIOCGFWSECTORS:
error = g_io_getattr("GEOM::fwsectors", cp, &i, data);
if (error == 0 && *(u_int *)data == 0)
error = ENOENT;
break;
case DIOCGFWHEADS:
error = g_io_getattr("GEOM::fwheads", cp, &i, data);
if (error == 0 && *(u_int *)data == 0)
error = ENOENT;
break;
#ifdef COMPAT_FREEBSD11
case DIOCSKERNELDUMP_FREEBSD11:
{
struct diocskerneldump_arg kda;
gone_in(13, "FreeBSD 11.x ABI compat");
bzero(&kda, sizeof(kda));
kda.kda_encryption = KERNELDUMP_ENC_NONE;
kda.kda_index = (*(u_int *)data ? 0 : KDA_REMOVE_ALL);
if (kda.kda_index == KDA_REMOVE_ALL)
error = dumper_remove(devtoname(dev), &kda);
else
error = g_dev_setdumpdev(dev, &kda);
break;
}
#endif
#ifdef COMPAT_FREEBSD12
case DIOCSKERNELDUMP_FREEBSD12:
{
struct diocskerneldump_arg_freebsd12 *kda12;
gone_in(14, "FreeBSD 12.x ABI compat");
kda12 = (void *)data;
memcpy(&kda_copy, kda12, sizeof(kda_copy));
kda_copy.kda_index = (kda12->kda12_enable ?
0 : KDA_REMOVE_ALL);
explicit_bzero(kda12, sizeof(*kda12));
/* Kludge to pass kda_copy to kda in fallthrough. */
data = (void *)&kda_copy;
}
/* FALLTHROUGH */
#endif
case DIOCSKERNELDUMP:
{
struct diocskerneldump_arg *kda;
uint8_t *encryptedkey;
kda = (struct diocskerneldump_arg *)data;
if (kda->kda_index == KDA_REMOVE_ALL ||
kda->kda_index == KDA_REMOVE_DEV ||
kda->kda_index == KDA_REMOVE) {
error = dumper_remove(devtoname(dev), kda);
explicit_bzero(kda, sizeof(*kda));
break;
}
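/*
 * For encrypted dumps, copy the caller-supplied key into a temporary
 * kernel buffer before configuring the dump device; both the key and
 * the argument structure are scrubbed once they are no longer needed.
 */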
if (kda->kda_encryption != KERNELDUMP_ENC_NONE) {
if (kda->kda_encryptedkeysize == 0 ||
kda->kda_encryptedkeysize >
KERNELDUMP_ENCKEY_MAX_SIZE) {
explicit_bzero(kda, sizeof(*kda));
return (EINVAL);
}
encryptedkey = malloc(kda->kda_encryptedkeysize, M_TEMP,
M_WAITOK);
error = copyin(kda->kda_encryptedkey, encryptedkey,
kda->kda_encryptedkeysize);
} else {
encryptedkey = NULL;
}
if (error == 0) {
kda->kda_encryptedkey = encryptedkey;
error = g_dev_setdumpdev(dev, kda);
}
zfree(encryptedkey, M_TEMP);
explicit_bzero(kda, sizeof(*kda));
break;
}
case DIOCGFLUSH:
error = g_io_flush(cp);
break;
case DIOCGDELETE:
offset = ((off_t *)data)[0];
length = ((off_t *)data)[1];
if ((offset % pp->sectorsize) != 0 ||
(length % pp->sectorsize) != 0 || length <= 0) {
printf("%s: offset=%jd length=%jd\n", __func__, offset,
length);
error = EINVAL;
break;
}
if ((pp->mediasize > 0) && (offset >= pp->mediasize)) {
/*
* Catch out-of-bounds requests here. The problem is
* that due to historical GEOM I/O implementation
* peculiarities, g_delete_data() would always return
* success for requests starting at the byte just past
* the provider's media boundary. Only check when the
* media size is non-zero, since a zero media size would
* (most likely) cause ENXIO instead.
*/
error = EIO;
break;
}
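/*
 * Issue the delete in chunks of at most g_dev_del_max_sectors,
 * trimming each chunk so that it ends on a stripe boundary when the
 * provider reports a stripe size.
 */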
while (length > 0) {
chunk = length;
if (g_dev_del_max_sectors != 0 &&
chunk > g_dev_del_max_sectors * pp->sectorsize) {
chunk = g_dev_del_max_sectors * pp->sectorsize;
if (pp->stripesize > 0) {
odd = (offset + chunk +
pp->stripeoffset) % pp->stripesize;
if (chunk > odd)
chunk -= odd;
}
}
error = g_delete_data(cp, offset, chunk);
length -= chunk;
offset += chunk;
if (error)
break;
/*
* Since the request size can be large, the service
* time can likewise be long. We make this ioctl
* interruptible by checking for signals after each bio.
*/
if (SIGPENDING(td))
break;
}
break;
case DIOCGIDENT:
error = g_io_getattr("GEOM::ident", cp, &i, data);
break;
case DIOCGPROVIDERNAME:
strlcpy(data, pp->name, i);
break;
case DIOCGSTRIPESIZE:
*(off_t *)data = pp->stripesize;
break;
case DIOCGSTRIPEOFFSET:
*(off_t *)data = pp->stripeoffset;
break;
case DIOCGPHYSPATH:
error = g_io_getattr("GEOM::physpath", cp, &i, data);
if (error == 0 && *(char *)data == '\0')
error = ENOENT;
break;
case DIOCGATTR: {
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
if (arg->len > sizeof(arg->value)) {
error = EINVAL;
break;
}
error = g_io_getattr(arg->name, cp, &arg->len, &arg->value);
break;
}
case DIOCZONECMD: {
struct disk_zone_args *zone_args = (struct disk_zone_args *)data;
struct disk_zone_rep_entry *new_entries, *old_entries;
struct disk_zone_report *rep;
size_t alloc_size;
old_entries = NULL;
new_entries = NULL;
rep = NULL;
alloc_size = 0;
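/*
 * For REPORT ZONES, temporarily substitute a bounded kernel buffer
 * for the caller's entries pointer, run the command, and copy the
 * results back out to user space before restoring the original
 * pointer.
 */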
if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) {
rep = &zone_args->zone_params.report;
-#define MAXENTRIES (MAXPHYS / sizeof(struct disk_zone_rep_entry))
+#define MAXENTRIES (maxphys / sizeof(struct disk_zone_rep_entry))
if (rep->entries_allocated > MAXENTRIES)
rep->entries_allocated = MAXENTRIES;
alloc_size = rep->entries_allocated *
sizeof(struct disk_zone_rep_entry);
if (alloc_size != 0)
new_entries = g_malloc(alloc_size,
- M_WAITOK| M_ZERO);
+ M_WAITOK | M_ZERO);
old_entries = rep->entries;
rep->entries = new_entries;
}
error = g_io_zonecmd(zone_args, cp);
if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES &&
alloc_size != 0 && error == 0)
error = copyout(new_entries, old_entries, alloc_size);
if (old_entries != NULL && rep != NULL)
rep->entries = old_entries;
if (new_entries != NULL)
g_free(new_entries);
break;
}
default:
if (pp->geom->ioctl != NULL) {
error = pp->geom->ioctl(pp, cmd, data, fflag, td);
} else {
error = ENOIOCTL;
}
}
return (error);
}
static void
g_dev_done(struct bio *bp2)
{
struct g_consumer *cp;
struct g_dev_softc *sc;
struct bio *bp;
int active;
cp = bp2->bio_from;
sc = cp->private;
bp = bp2->bio_parent;
bp->bio_error = bp2->bio_error;
bp->bio_completed = bp2->bio_completed;
bp->bio_resid = bp->bio_length - bp2->bio_completed;
if (bp2->bio_cmd == BIO_ZONE)
bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone));
if (bp2->bio_error != 0) {
g_trace(G_T_BIO, "g_dev_done(%p) had error %d",
bp2, bp2->bio_error);
bp->bio_flags |= BIO_ERROR;
} else {
g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd",
bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed);
}
g_destroy_bio(bp2);
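/*
 * Drop our reference on the in-flight count. If this was the last
 * outstanding request, wake up a g_dev_close() that may be waiting
 * for I/O to drain and, if the cdev has already been destroyed,
 * schedule destruction of the geom.
 */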
active = atomic_fetchadd_int(&sc->sc_active, -1) - 1;
if ((active & SC_A_ACTIVE) == 0) {
if ((active & SC_A_OPEN) == 0)
wakeup(&sc->sc_active);
if (active & SC_A_DESTROY)
g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL);
}
biodone(bp);
}
static void
g_dev_strategy(struct bio *bp)
{
struct g_consumer *cp;
struct bio *bp2;
struct cdev *dev;
struct g_dev_softc *sc;
KASSERT(bp->bio_cmd == BIO_READ ||
bp->bio_cmd == BIO_WRITE ||
bp->bio_cmd == BIO_DELETE ||
bp->bio_cmd == BIO_FLUSH ||
bp->bio_cmd == BIO_ZONE,
("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd));
dev = bp->bio_dev;
cp = dev->si_drv2;
KASSERT(cp->acr || cp->acw,
("Consumer with zero access count in g_dev_strategy"));
biotrack(bp, __func__);
#ifdef INVARIANTS
if ((bp->bio_offset % cp->provider->sectorsize) != 0 ||
(bp->bio_bcount % cp->provider->sectorsize) != 0) {
bp->bio_resid = bp->bio_bcount;
biofinish(bp, NULL, EINVAL);
return;
}
#endif
sc = dev->si_drv1;
KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy"));
atomic_add_int(&sc->sc_active, 1);
for (;;) {
/*
* XXX: This is not an ideal solution, but I believe it to
* XXX: deadlock safely, all things considered.
*/
bp2 = g_clone_bio(bp);
if (bp2 != NULL)
break;
pause("gdstrat", hz / 10);
}
KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place"));
bp2->bio_done = g_dev_done;
g_trace(G_T_BIO,
"g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d",
bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length,
bp2->bio_data, bp2->bio_cmd);
g_io_request(bp2, cp);
KASSERT(cp->acr || cp->acw,
("g_dev_strategy raced with g_dev_close and lost"));
}
/*
* g_dev_callback()
*
* Called by devfs when asynchronous device destruction is completed.
* - Mark that we have no attached device any more.
* - If there are no outstanding requests, schedule geom destruction.
* Otherwise destruction will be scheduled later by g_dev_done().
*/
static void
g_dev_callback(void *arg)
{
struct g_consumer *cp;
struct g_dev_softc *sc;
int active;
cp = arg;
sc = cp->private;
g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name);
sc->sc_dev = NULL;
sc->sc_alias = NULL;
active = atomic_fetchadd_int(&sc->sc_active, SC_A_DESTROY);
if ((active & SC_A_ACTIVE) == 0)
g_post_event(g_dev_destroy, cp, M_WAITOK, NULL);
}
/*
* g_dev_orphan()
*
* Called from below when the provider orphaned us.
* - Clear any dump settings.
* - Request asynchronous device destruction to prevent any more requests
* from coming in. The provider is already marked with an error, so
* anything which comes in the interim will be returned immediately.
*/
static void
g_dev_orphan(struct g_consumer *cp)
{
struct cdev *dev;
struct g_dev_softc *sc;
g_topology_assert();
sc = cp->private;
dev = sc->sc_dev;
g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name);
/* Reset any dump-area set on this device */
if (dev->si_flags & SI_DUMPDEV) {
struct diocskerneldump_arg kda;
bzero(&kda, sizeof(kda));
kda.kda_index = KDA_REMOVE_DEV;
(void)dumper_remove(devtoname(dev), &kda);
}
/* Destroy the struct cdev * so we get no more requests. */
delist_dev(dev);
destroy_dev_sched_cb(dev, g_dev_callback, cp);
}
DECLARE_GEOM_CLASS(g_dev_class, g_dev);
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index 0e857aa3ce10..31213f0f2b22 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -1,1080 +1,1080 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
* and NAI Labs, the Security Research Division of Network Associates, Inc.
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The names of the authors may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
#include <machine/stdarg.h>
#include <sys/errno.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <sys/devicestat.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
static int g_io_transient_map_bio(struct bio *bp);
static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
/*
* Pace is a hint that we've had some trouble recently allocating
* bios, so we should back off trying to send I/O down the stack
* a bit to let the problem resolve. When pacing, we also turn
* off direct dispatch to also reduce memory pressure from I/Os
* there, at the expense of some added latency while the memory
* pressure exists. See g_io_schedule_down() for more details
* and limitations.
*/
static volatile u_int __read_mostly pace;
static uma_zone_t __read_mostly biozone;
#include <machine/atomic.h>
static void
g_bioq_lock(struct g_bioq *bq)
{
mtx_lock(&bq->bio_queue_lock);
}
static void
g_bioq_unlock(struct g_bioq *bq)
{
mtx_unlock(&bq->bio_queue_lock);
}
#if 0
static void
g_bioq_destroy(struct g_bioq *bq)
{
mtx_destroy(&bq->bio_queue_lock);
}
#endif
static void
g_bioq_init(struct g_bioq *bq)
{
TAILQ_INIT(&bq->bio_queue);
mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
}
static struct bio *
g_bioq_first(struct g_bioq *bq)
{
struct bio *bp;
bp = TAILQ_FIRST(&bq->bio_queue);
if (bp != NULL) {
KASSERT((bp->bio_flags & BIO_ONQUEUE),
("Bio not on queue bp=%p target %p", bp, bq));
bp->bio_flags &= ~BIO_ONQUEUE;
TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
bq->bio_queue_length--;
}
return (bp);
}
struct bio *
g_new_bio(void)
{
struct bio *bp;
bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
#ifdef KTR
if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
struct stack st;
CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
stack_save(&st);
CTRSTACK(KTR_GEOM, &st, 3);
}
#endif
return (bp);
}
struct bio *
g_alloc_bio(void)
{
struct bio *bp;
bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
#ifdef KTR
if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
struct stack st;
CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
stack_save(&st);
CTRSTACK(KTR_GEOM, &st, 3);
}
#endif
return (bp);
}
void
g_destroy_bio(struct bio *bp)
{
#ifdef KTR
if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
struct stack st;
CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
stack_save(&st);
CTRSTACK(KTR_GEOM, &st, 3);
}
#endif
uma_zfree(biozone, bp);
}
struct bio *
g_clone_bio(struct bio *bp)
{
struct bio *bp2;
bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
if (bp2 != NULL) {
bp2->bio_parent = bp;
bp2->bio_cmd = bp->bio_cmd;
/*
* BIO_ORDERED flag may be used by disk drivers to enforce
* ordering restrictions, so this flag needs to be cloned.
* BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
* indicate which way the buffer is passed.
* Other bio flags are not suitable for cloning.
*/
bp2->bio_flags = bp->bio_flags &
(BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
bp2->bio_ma = bp->bio_ma;
bp2->bio_ma_n = bp->bio_ma_n;
bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
if (bp->bio_cmd == BIO_ZONE)
bcopy(&bp->bio_zone, &bp2->bio_zone,
sizeof(bp->bio_zone));
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
bp2->bio_track_bp = bp->bio_track_bp;
#endif
bp->bio_children++;
}
#ifdef KTR
if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
struct stack st;
CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
stack_save(&st);
CTRSTACK(KTR_GEOM, &st, 3);
}
#endif
return (bp2);
}
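/*
 * Like g_clone_bio(), but the clone is allocated with M_WAITOK and
 * therefore cannot fail. Note that only the BIO_UNMAPPED and
 * BIO_VLIST flags are inherited from the parent here.
 */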
struct bio *
g_duplicate_bio(struct bio *bp)
{
struct bio *bp2;
bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
bp2->bio_parent = bp;
bp2->bio_cmd = bp->bio_cmd;
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
bp2->bio_ma = bp->bio_ma;
bp2->bio_ma_n = bp->bio_ma_n;
bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
bp->bio_children++;
#ifdef KTR
if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
struct stack st;
CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
stack_save(&st);
CTRSTACK(KTR_GEOM, &st, 3);
}
#endif
return (bp2);
}
void
g_reset_bio(struct bio *bp)
{
bzero(bp, sizeof(*bp));
}
void
g_io_init()
{
g_bioq_init(&g_bio_run_down);
g_bioq_init(&g_bio_run_up);
biozone = uma_zcreate("g_bio", sizeof (struct bio),
NULL, NULL,
NULL, NULL,
0, 0);
}
int
g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
{
struct bio *bp;
int error;
g_trace(G_T_BIO, "bio_getattr(%s)", attr);
bp = g_alloc_bio();
bp->bio_cmd = BIO_GETATTR;
bp->bio_done = NULL;
bp->bio_attribute = attr;
bp->bio_length = *len;
bp->bio_data = ptr;
g_io_request(bp, cp);
error = biowait(bp, "ggetattr");
*len = bp->bio_completed;
g_destroy_bio(bp);
return (error);
}
int
g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
{
struct bio *bp;
int error;
g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
bp = g_alloc_bio();
bp->bio_cmd = BIO_ZONE;
bp->bio_done = NULL;
/*
* XXX KDM need to handle report zone data.
*/
bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
bp->bio_length =
zone_args->zone_params.report.entries_allocated *
sizeof(struct disk_zone_rep_entry);
else
bp->bio_length = 0;
g_io_request(bp, cp);
error = biowait(bp, "gzone");
bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
g_destroy_bio(bp);
return (error);
}
/*
* Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that
* the upper layers have detected a resource shortage. The lower layers are
* advised to stop delaying I/O that they might be holding for performance
* reasons and to schedule it (non-trims) or complete it successfully (trims) as
* quickly as they can. bio_length is the amount of the shortage. This call
* should be non-blocking. bio_resid is used to communicate back if the lower
* layers couldn't find bio_length worth of I/O to schedule or discard. A length
* of 0 means to do as much as you can (schedule the h/w queues full, discard
* all trims). flags are a hint from the upper layers to the lower layers what
* operation should be done.
*/
int
g_io_speedup(off_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
{
struct bio *bp;
int error;
KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
("Invalid flags passed to g_io_speedup: %#x", flags));
g_trace(G_T_BIO, "bio_speedup(%s, %jd, %#x)", cp->provider->name,
(intmax_t)shortage, flags);
bp = g_new_bio();
if (bp == NULL)
return (ENOMEM);
bp->bio_cmd = BIO_SPEEDUP;
bp->bio_length = shortage;
bp->bio_done = NULL;
bp->bio_flags |= flags;
g_io_request(bp, cp);
error = biowait(bp, "gflush");
*resid = bp->bio_resid;
g_destroy_bio(bp);
return (error);
}
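/*
 * Illustrative sketch only (not part of this change): a hypothetical
 * upper layer that has detected, say, a 16 MB shortage of write-buffer
 * space might nudge the lower layers roughly like this, assuming it
 * already holds an opened consumer "cp":
 *
 *	size_t resid;
 *
 *	(void)g_io_speedup(16 * 1024 * 1024, BIO_SPEEDUP_WRITE, &resid, cp);
 */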
int
g_io_flush(struct g_consumer *cp)
{
struct bio *bp;
int error;
g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
bp = g_alloc_bio();
bp->bio_cmd = BIO_FLUSH;
bp->bio_flags |= BIO_ORDERED;
bp->bio_done = NULL;
bp->bio_attribute = NULL;
bp->bio_offset = cp->provider->mediasize;
bp->bio_length = 0;
bp->bio_data = NULL;
g_io_request(bp, cp);
error = biowait(bp, "gflush");
g_destroy_bio(bp);
return (error);
}
static int
g_io_check(struct bio *bp)
{
struct g_consumer *cp;
struct g_provider *pp;
off_t excess;
int error;
biotrack(bp, __func__);
cp = bp->bio_from;
pp = bp->bio_to;
/* Fail if the access counters don't allow the operation. */
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_GETATTR:
if (cp->acr == 0)
return (EPERM);
break;
case BIO_WRITE:
case BIO_DELETE:
case BIO_SPEEDUP:
case BIO_FLUSH:
if (cp->acw == 0)
return (EPERM);
break;
case BIO_ZONE:
if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
(bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
if (cp->acr == 0)
return (EPERM);
} else if (cp->acw == 0)
return (EPERM);
break;
default:
return (EPERM);
}
/* if provider is marked for error, don't disturb. */
if (pp->error)
return (pp->error);
if (cp->flags & G_CF_ORPHAN)
return (ENXIO);
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
/* Zero sectorsize or mediasize is probably a lack of media. */
if (pp->sectorsize == 0 || pp->mediasize == 0)
return (ENXIO);
/* Reject I/O not on sector boundary */
if (bp->bio_offset % pp->sectorsize)
return (EINVAL);
/* Reject I/O not integral sector long */
if (bp->bio_length % pp->sectorsize)
return (EINVAL);
/* Reject requests before or past the end of media. */
if (bp->bio_offset < 0)
return (EIO);
if (bp->bio_offset > pp->mediasize)
return (EIO);
/* Truncate requests to the end of providers media. */
excess = bp->bio_offset + bp->bio_length;
if (excess > bp->bio_to->mediasize) {
KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
round_page(bp->bio_ma_offset +
bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
("excess bio %p too short", bp));
excess -= bp->bio_to->mediasize;
bp->bio_length -= excess;
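/* For unmapped bios, shrink the page count to match the new length. */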
if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
bp->bio_ma_n = round_page(bp->bio_ma_offset +
bp->bio_length) / PAGE_SIZE;
}
if (excess > 0)
CTR3(KTR_GEOM, "g_down truncated bio "
"%p provider %s by %d", bp,
bp->bio_to->name, excess);
}
/* Deliver zero length transfers right here. */
if (bp->bio_length == 0) {
CTR2(KTR_GEOM, "g_down terminated 0-length "
"bp %p provider %s", bp, bp->bio_to->name);
return (0);
}
if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
(bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
if ((error = g_io_transient_map_bio(bp)) >= 0)
return (error);
}
break;
default:
break;
}
return (EJUSTRETURN);
}
void
g_io_request(struct bio *bp, struct g_consumer *cp)
{
struct g_provider *pp;
int direct, error, first;
uint8_t cmd;
biotrack(bp, __func__);
KASSERT(cp != NULL, ("NULL cp in g_io_request"));
KASSERT(bp != NULL, ("NULL bp in g_io_request"));
pp = cp->provider;
KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
#ifdef DIAGNOSTIC
KASSERT(bp->bio_driver1 == NULL,
("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
KASSERT(bp->bio_driver2 == NULL,
("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
KASSERT(bp->bio_pflags == 0,
("bio_pflags used by the consumer (geom %s)", cp->geom->name));
/*
* Remember consumer's private fields, so we can detect if they were
* modified by the provider.
*/
bp->_bio_caller1 = bp->bio_caller1;
bp->_bio_caller2 = bp->bio_caller2;
bp->_bio_cflags = bp->bio_cflags;
#endif
cmd = bp->bio_cmd;
if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
KASSERT(bp->bio_data != NULL,
("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
}
if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
KASSERT(bp->bio_data == NULL,
("non-NULL bp->data in g_io_request(cmd=%hu)",
bp->bio_cmd));
}
if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
("wrong offset %jd for sectorsize %u",
bp->bio_offset, cp->provider->sectorsize));
KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
("wrong length %jd for sectorsize %u",
bp->bio_length, cp->provider->sectorsize));
}
g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
bp->bio_from = cp;
bp->bio_to = pp;
bp->bio_error = 0;
bp->bio_completed = 0;
KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
("Bio already on queue bp=%p", bp));
if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
binuptime(&bp->bio_t0);
else
getbinuptime(&bp->bio_t0);
if (g_collectstats & G_STATS_CONSUMERS)
devstat_start_transaction_bio_t0(cp->stat, bp);
if (g_collectstats & G_STATS_PROVIDERS)
devstat_start_transaction_bio_t0(pp->stat, bp);
#ifdef INVARIANTS
atomic_add_int(&cp->nstart, 1);
#endif
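/*
 * Dispatch the bio directly to the provider's start method when both
 * ends support direct dispatch, we are not already running in a GEOM
 * thread, the bio can be handled (or safely mapped) in this context,
 * and no pacing is in effect; otherwise queue it for the g_down
 * thread.
 */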
#ifdef GET_STACK_USAGE
direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
(pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
!g_is_geom_thread(curthread) &&
((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
(bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
pace == 0;
if (direct) {
/* Block direct execution if less than half of the stack is left. */
size_t st, su;
GET_STACK_USAGE(st, su);
if (su * 2 > st)
direct = 0;
}
#else
direct = 0;
#endif
if (direct) {
error = g_io_check(bp);
if (error >= 0) {
CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
"provider %s returned %d", bp, bp->bio_to->name,
error);
g_io_deliver(bp, error);
return;
}
bp->bio_to->geom->start(bp);
} else {
g_bioq_lock(&g_bio_run_down);
first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
bp->bio_flags |= BIO_ONQUEUE;
g_bio_run_down.bio_queue_length++;
g_bioq_unlock(&g_bio_run_down);
/* Pass it on down. */
if (first)
wakeup(&g_wait_down);
}
}
void
g_io_deliver(struct bio *bp, int error)
{
struct bintime now;
struct g_consumer *cp;
struct g_provider *pp;
struct mtx *mtxp;
int direct, first;
biotrack(bp, __func__);
KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
pp = bp->bio_to;
KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
cp = bp->bio_from;
if (cp == NULL) {
bp->bio_error = error;
bp->bio_done(bp);
return;
}
KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
#ifdef DIAGNOSTIC
/*
* Some classes - GJournal in particular - can modify the bio's
* private fields while the bio is in transit; the G_GEOM_VOLATILE_BIO
* flag means this is expected behaviour for that particular geom.
*/
if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
KASSERT(bp->bio_caller1 == bp->_bio_caller1,
("bio_caller1 used by the provider %s", pp->name));
KASSERT(bp->bio_caller2 == bp->_bio_caller2,
("bio_caller2 used by the provider %s", pp->name));
KASSERT(bp->bio_cflags == bp->_bio_cflags,
("bio_cflags used by the provider %s", pp->name));
}
#endif
KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
KASSERT(bp->bio_completed <= bp->bio_length,
("bio_completed can't be greater than bio_length"));
g_trace(G_T_BIO,
"g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
(intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
("Bio already on queue bp=%p", bp));
/*
* XXX: the next two don't belong here
*/
bp->bio_bcount = bp->bio_length;
bp->bio_resid = bp->bio_bcount - bp->bio_completed;
#ifdef GET_STACK_USAGE
direct = (pp->flags & G_PF_DIRECT_SEND) &&
(cp->flags & G_CF_DIRECT_RECEIVE) &&
!g_is_geom_thread(curthread);
if (direct) {
/* Block direct execution if less than half of the stack is left. */
size_t st, su;
GET_STACK_USAGE(st, su);
if (su * 2 > st)
direct = 0;
}
#else
direct = 0;
#endif
/*
* The statistics collection itself is lockless, but we
* cannot update one instance of the statistics from more
* than one thread at a time, so grab the lock first.
*/
if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
binuptime(&now);
mtxp = mtx_pool_find(mtxpool_sleep, cp);
mtx_lock(mtxp);
if (g_collectstats & G_STATS_PROVIDERS)
devstat_end_transaction_bio_bt(pp->stat, bp, &now);
if (g_collectstats & G_STATS_CONSUMERS)
devstat_end_transaction_bio_bt(cp->stat, bp, &now);
#ifdef INVARIANTS
cp->nend++;
#endif
mtx_unlock(mtxp);
if (error != ENOMEM) {
bp->bio_error = error;
if (direct) {
biodone(bp);
} else {
g_bioq_lock(&g_bio_run_up);
first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
bp->bio_flags |= BIO_ONQUEUE;
g_bio_run_up.bio_queue_length++;
g_bioq_unlock(&g_bio_run_up);
if (first)
wakeup(&g_wait_up);
}
return;
}
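/*
 * ENOMEM from below is treated as a transient condition: reset the
 * bio's private state, resubmit it, and set "pace" so that g_down
 * backs off direct dispatch until the memory pressure eases.
 */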
if (bootverbose)
printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
bp->bio_children = 0;
bp->bio_inbed = 0;
bp->bio_driver1 = NULL;
bp->bio_driver2 = NULL;
bp->bio_pflags = 0;
g_io_request(bp, cp);
pace = 1;
return;
}
SYSCTL_DECL(_kern_geom);
static long transient_maps;
SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
&transient_maps, 0,
"Total count of the transient mapping requests");
u_int transient_map_retries = 10;
SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
&transient_map_retries, 0,
"Max count of retries used before giving up on creating transient map");
int transient_map_hard_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
&transient_map_hard_failures, 0,
"Failures to establish the transient mapping due to retry attempts "
"exhausted");
int transient_map_soft_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
&transient_map_soft_failures, 0,
"Count of retried failures to establish the transient mapping");
int inflight_transient_maps;
SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
&inflight_transient_maps, 0,
"Current count of the active transient maps");
static int
g_io_transient_map_bio(struct bio *bp)
{
vm_offset_t addr;
long size;
u_int retried;
KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
size = round_page(bp->bio_ma_offset + bp->bio_length);
KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
addr = 0;
retried = 0;
atomic_add_long(&transient_maps, 1);
retry:
if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
if (transient_map_retries != 0 &&
retried >= transient_map_retries) {
CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
bp, bp->bio_to->name);
atomic_add_int(&transient_map_hard_failures, 1);
return (EDEADLK/* XXXKIB */);
} else {
/*
* Naive attempt to quiesce the I/O to get more
* in-flight requests completed and defragment
* the transient_arena.
*/
CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
bp, bp->bio_to->name, retried);
pause("g_d_tra", hz / 10);
retried++;
atomic_add_int(&transient_map_soft_failures, 1);
goto retry;
}
}
atomic_add_int(&inflight_transient_maps, 1);
pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
bp->bio_flags |= BIO_TRANSIENT_MAPPING;
bp->bio_flags &= ~BIO_UNMAPPED;
return (EJUSTRETURN);
}
void
g_io_schedule_down(struct thread *tp __unused)
{
struct bio *bp;
int error;
for(;;) {
g_bioq_lock(&g_bio_run_down);
bp = g_bioq_first(&g_bio_run_down);
if (bp == NULL) {
CTR0(KTR_GEOM, "g_down going to sleep");
msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
PRIBIO | PDROP, "-", 0);
continue;
}
CTR0(KTR_GEOM, "g_down has work to do");
g_bioq_unlock(&g_bio_run_down);
biotrack(bp, __func__);
if (pace != 0) {
/*
* There has been at least one memory allocation
* failure since the last I/O completed. Pause 1ms to
* give the system a chance to free up memory. We only
* do this once because a large number of allocations
* can fail in the direct dispatch case and there's no
* relationship between the number of these failures and
* the length of the outage. If there's still an outage,
* we'll pause again and again until it's
* resolved. Older versions paused longer and once per
* allocation failure. This was OK for a single threaded
* g_down, but with direct dispatch would lead to max of
* 10 IOPs for minutes at a time when transient memory
* issues prevented allocation for a batch of requests
* from the upper layers.
*
* XXX This pacing is really lame. It needs to be solved
* by other methods. This is OK only because the worst
* case scenario is so rare. In the worst case scenario
* all memory is tied up waiting for I/O to complete
* which can never happen since we can't allocate bios
* for that I/O.
*/
CTR0(KTR_GEOM, "g_down pacing self");
pause("g_down", min(hz/1000, 1));
pace = 0;
}
CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
bp->bio_to->name);
error = g_io_check(bp);
if (error >= 0) {
CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
"%s returned %d", bp, bp->bio_to->name, error);
g_io_deliver(bp, error);
continue;
}
THREAD_NO_SLEEPING();
CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
"len %ld", bp, bp->bio_to->name, bp->bio_offset,
bp->bio_length);
bp->bio_to->geom->start(bp);
THREAD_SLEEPING_OK();
}
}
void
g_io_schedule_up(struct thread *tp __unused)
{
struct bio *bp;
for(;;) {
g_bioq_lock(&g_bio_run_up);
bp = g_bioq_first(&g_bio_run_up);
if (bp == NULL) {
CTR0(KTR_GEOM, "g_up going to sleep");
msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
PRIBIO | PDROP, "-", 0);
continue;
}
g_bioq_unlock(&g_bio_run_up);
THREAD_NO_SLEEPING();
CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
"%jd len %ld", bp, bp->bio_to->name,
bp->bio_offset, bp->bio_length);
biodone(bp);
THREAD_SLEEPING_OK();
}
}
void *
g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
{
struct bio *bp;
void *ptr;
int errorc;
KASSERT(length > 0 && length >= cp->provider->sectorsize &&
- length <= MAXPHYS, ("g_read_data(): invalid length %jd",
+ length <= maxphys, ("g_read_data(): invalid length %jd",
(intmax_t)length));
bp = g_alloc_bio();
bp->bio_cmd = BIO_READ;
bp->bio_done = NULL;
bp->bio_offset = offset;
bp->bio_length = length;
ptr = g_malloc(length, M_WAITOK);
bp->bio_data = ptr;
g_io_request(bp, cp);
errorc = biowait(bp, "gread");
if (error != NULL)
*error = errorc;
g_destroy_bio(bp);
if (errorc) {
g_free(ptr);
ptr = NULL;
}
return (ptr);
}
/*
* A read function for use by ffs_sbget when used by GEOM-layer routines.
*/
int
g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
{
struct g_consumer *cp;
KASSERT(*bufp == NULL,
("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
cp = (struct g_consumer *)devfd;
/*
* Take care not to issue an invalid I/O request. The offset of
* the superblock candidate must be multiples of the provider's
* sector size, otherwise an FFS can't exist on the provider
* anyway.
*/
if (loc % cp->provider->sectorsize != 0)
return (ENOENT);
*bufp = g_read_data(cp, loc, size, NULL);
if (*bufp == NULL)
return (ENOENT);
return (0);
}
int
g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
{
struct bio *bp;
int error;
KASSERT(length > 0 && length >= cp->provider->sectorsize &&
- length <= MAXPHYS, ("g_write_data(): invalid length %jd",
+ length <= maxphys, ("g_write_data(): invalid length %jd",
(intmax_t)length));
bp = g_alloc_bio();
bp->bio_cmd = BIO_WRITE;
bp->bio_done = NULL;
bp->bio_offset = offset;
bp->bio_length = length;
bp->bio_data = ptr;
g_io_request(bp, cp);
error = biowait(bp, "gwrite");
g_destroy_bio(bp);
return (error);
}
/*
* A write function for use by ffs_sbput when used by GEOM-layer routines.
*/
int
g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
{
return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
}
int
g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
{
struct bio *bp;
int error;
KASSERT(length > 0 && length >= cp->provider->sectorsize,
("g_delete_data(): invalid length %jd", (intmax_t)length));
bp = g_alloc_bio();
bp->bio_cmd = BIO_DELETE;
bp->bio_done = NULL;
bp->bio_offset = offset;
bp->bio_length = length;
bp->bio_data = NULL;
g_io_request(bp, cp);
error = biowait(bp, "gdelete");
g_destroy_bio(bp);
return (error);
}
void
g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
...)
{
#ifndef PRINTF_BUFR_SIZE
#define PRINTF_BUFR_SIZE 64
#endif
char bufr[PRINTF_BUFR_SIZE];
struct sbuf sb, *sbp __unused;
va_list ap;
sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
KASSERT(sbp != NULL, ("sbuf_new misused?"));
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
sbuf_cat(&sb, prefix);
g_format_bio(&sb, bp);
va_start(ap, fmtsuffix);
sbuf_vprintf(&sb, fmtsuffix, ap);
va_end(ap);
sbuf_nl_terminate(&sb);
sbuf_finish(&sb);
sbuf_delete(&sb);
}
void
g_format_bio(struct sbuf *sb, const struct bio *bp)
{
const char *pname, *cmd = NULL;
if (bp->bio_to != NULL)
pname = bp->bio_to->name;
else
pname = "[unknown]";
switch (bp->bio_cmd) {
case BIO_GETATTR:
cmd = "GETATTR";
sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd,
bp->bio_attribute);
return;
case BIO_FLUSH:
cmd = "FLUSH";
sbuf_printf(sb, "%s[%s]", pname, cmd);
return;
case BIO_ZONE: {
char *subcmd = NULL;
cmd = "ZONE";
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
subcmd = "OPEN";
break;
case DISK_ZONE_CLOSE:
subcmd = "CLOSE";
break;
case DISK_ZONE_FINISH:
subcmd = "FINISH";
break;
case DISK_ZONE_RWP:
subcmd = "RWP";
break;
case DISK_ZONE_REPORT_ZONES:
subcmd = "REPORT ZONES";
break;
case DISK_ZONE_GET_PARAMS:
subcmd = "GET PARAMS";
break;
default:
subcmd = "UNKNOWN";
break;
}
sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd);
return;
}
case BIO_READ:
cmd = "READ";
break;
case BIO_WRITE:
cmd = "WRITE";
break;
case BIO_DELETE:
cmd = "DELETE";
break;
default:
cmd = "UNKNOWN";
sbuf_printf(sb, "%s[%s()]", pname, cmd);
return;
}
sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd,
(intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
}
diff --git a/sys/geom/journal/g_journal.c b/sys/geom/journal/g_journal.c
index 43c105496879..0b518d172b5a 100644
--- a/sys/geom/journal/g_journal.c
+++ b/sys/geom/journal/g_journal.c
@@ -1,3024 +1,3024 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/eventhandler.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/sbuf.h>
#ifdef GJ_MEMDEBUG
#include <sys/stack.h>
#include <sys/kdb.h>
#endif
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/journal/g_journal.h>
FEATURE(geom_journal, "GEOM journaling support");
/*
* On-disk journal format:
*
* JH - Journal header
* RH - Record header
*
* %%%%%% ****** +------+ +------+ ****** +------+ %%%%%%
* % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
* %%%%%% ****** +------+ +------+ ****** +------+ %%%%%%
*
*/
CTASSERT(sizeof(struct g_journal_header) <= 512);
CTASSERT(sizeof(struct g_journal_record_header) <= 512);
static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
static struct mtx g_journal_cache_mtx;
MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
const struct g_journal_desc *g_journal_filesystems[] = {
&g_journal_ufs,
NULL
};
SYSCTL_DECL(_kern_geom);
int g_journal_debug = 0;
static u_int g_journal_switch_time = 10;
static u_int g_journal_force_switch = 70;
static u_int g_journal_parallel_flushes = 16;
static u_int g_journal_parallel_copies = 16;
static u_int g_journal_accept_immediately = 64;
static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
static u_int g_journal_do_optimize = 1;
static SYSCTL_NODE(_kern_geom, OID_AUTO, journal,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_JOURNAL stuff");
SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
"Debug level");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
&g_journal_switch_time, 0, "Switch journals every N seconds");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
&g_journal_force_switch, 0, "Force switch when journal is N% full");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
&g_journal_parallel_flushes, 0,
"Number of flush I/O requests to send in parallel");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
&g_journal_accept_immediately, 0,
"Number of I/O requests accepted immediately");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
&g_journal_parallel_copies, 0,
"Number of copy I/O requests to send in parallel");
static int
g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
{
u_int entries;
int error;
entries = g_journal_record_entries;
error = sysctl_handle_int(oidp, &entries, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
return (EINVAL);
g_journal_record_entries = entries;
return (0);
}
SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
g_journal_record_entries_sysctl, "I",
"Maximum number of entires in one journal record");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
&g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
static u_long g_journal_cache_used = 0;
static u_long g_journal_cache_limit = 64 * 1024 * 1024;
static u_int g_journal_cache_divisor = 2;
static u_int g_journal_cache_switch = 90;
static u_int g_journal_cache_misses = 0;
static u_int g_journal_cache_alloc_failures = 0;
static u_long g_journal_cache_low = 0;
static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_JOURNAL cache");
SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
&g_journal_cache_used, 0, "Number of allocated bytes");
static int
g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
{
u_long limit;
int error;
limit = g_journal_cache_limit;
error = sysctl_handle_long(oidp, &limit, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
g_journal_cache_limit = limit;
g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
return (0);
}
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, NULL, 0,
g_journal_cache_limit_sysctl, "I",
"Maximum number of allocated bytes");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
&g_journal_cache_divisor, 0,
"(kmem_size / kern.geom.journal.cache.divisor) == cache size");
static int
g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
{
u_int cswitch;
int error;
cswitch = g_journal_cache_switch;
error = sysctl_handle_int(oidp, &cswitch, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (cswitch > 100)
return (EINVAL);
g_journal_cache_switch = cswitch;
g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
return (0);
}
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
g_journal_cache_switch_sysctl, "I",
"Force switch when we hit this percent of cache use");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
&g_journal_cache_misses, 0, "Number of cache misses");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
&g_journal_cache_alloc_failures, 0, "Memory allocation failures");
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;
static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
&g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
&g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
&g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
&g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
&g_journal_stats_journal_full, 0,
"Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
&g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;
struct g_class g_journal_class = {
.name = G_JOURNAL_CLASS_NAME,
.version = G_VERSION,
.taste = g_journal_taste,
.ctlreq = g_journal_config,
.dumpconf = g_journal_dumpconf,
.init = g_journal_init,
.fini = g_journal_fini
};
static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_start_switcher(struct g_class *mp);
static void g_journal_stop_switcher(void);
static void g_journal_switch_wait(struct g_journal_softc *sc);
#define GJ_SWITCHER_WORKING 0
#define GJ_SWITCHER_DIE 1
#define GJ_SWITCHER_DIED 2
static struct proc *g_journal_switcher_proc = NULL;
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;
#ifdef GJ_MEMDEBUG
struct meminfo {
size_t mi_size;
struct stack mi_stack;
};
#endif
/*
* We use our own malloc/realloc/free functions, so we can collect statistics
* and force a journal switch when we're running out of cache.
*/
static void *
gj_malloc(size_t size, int flags)
{
void *p;
#ifdef GJ_MEMDEBUG
struct meminfo *mi;
#endif
mtx_lock(&g_journal_cache_mtx);
if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
g_journal_cache_used + size > g_journal_cache_low) {
GJ_DEBUG(1, "No cache, waking up the switcher.");
g_journal_switcher_wokenup = 1;
wakeup(&g_journal_switcher_state);
}
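/*
 * Refuse M_NOWAIT allocations that would push the cache above its
 * hard limit; any other request is charged to the cache and then
 * satisfied with a blocking allocation (M_NOWAIT is stripped below).
 */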
if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
g_journal_cache_used + size > g_journal_cache_limit) {
mtx_unlock(&g_journal_cache_mtx);
g_journal_cache_alloc_failures++;
return (NULL);
}
g_journal_cache_used += size;
mtx_unlock(&g_journal_cache_mtx);
flags &= ~M_NOWAIT;
#ifndef GJ_MEMDEBUG
p = malloc(size, M_JOURNAL, flags | M_WAITOK);
#else
mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
p = (u_char *)mi + sizeof(*mi);
mi->mi_size = size;
stack_save(&mi->mi_stack);
#endif
return (p);
}
static void
gj_free(void *p, size_t size)
{
#ifdef GJ_MEMDEBUG
struct meminfo *mi;
#endif
KASSERT(p != NULL, ("p=NULL"));
KASSERT(size > 0, ("size=0"));
mtx_lock(&g_journal_cache_mtx);
KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
g_journal_cache_used -= size;
mtx_unlock(&g_journal_cache_mtx);
#ifdef GJ_MEMDEBUG
mi = p = (void *)((u_char *)p - sizeof(*mi));
if (mi->mi_size != size) {
printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
mi->mi_size);
printf("GJOURNAL: Alloc backtrace:\n");
stack_print(&mi->mi_stack);
printf("GJOURNAL: Free backtrace:\n");
kdb_backtrace();
}
#endif
free(p, M_JOURNAL);
}
static void *
gj_realloc(void *p, size_t size, size_t oldsize)
{
void *np;
#ifndef GJ_MEMDEBUG
mtx_lock(&g_journal_cache_mtx);
g_journal_cache_used -= oldsize;
g_journal_cache_used += size;
mtx_unlock(&g_journal_cache_mtx);
np = realloc(p, size, M_JOURNAL, M_WAITOK);
#else
np = gj_malloc(size, M_WAITOK);
bcopy(p, np, MIN(oldsize, size));
gj_free(p, oldsize);
#endif
return (np);
}
static void
g_journal_check_overflow(struct g_journal_softc *sc)
{
off_t length, used;
if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
(sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
sc->sc_journal_offset < sc->sc_active.jj_offset)) {
panic("Journal overflow "
"(id = %u joffset=%jd active=%jd inactive=%jd)",
(unsigned)sc->sc_id,
(intmax_t)sc->sc_journal_offset,
(intmax_t)sc->sc_active.jj_offset,
(intmax_t)sc->sc_inactive.jj_offset);
}
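/*
 * Compute the space available to the active journal ("length", i.e.
 * up to the start of the inactive journal) and how much of it has
 * been consumed so far ("used"), handling the case where the journal
 * area wraps around.
 */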
if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
used = sc->sc_journal_offset - sc->sc_active.jj_offset;
} else {
length = sc->sc_jend - sc->sc_active.jj_offset;
length += sc->sc_inactive.jj_offset - sc->sc_jstart;
if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
used = sc->sc_journal_offset - sc->sc_active.jj_offset;
else {
used = sc->sc_jend - sc->sc_active.jj_offset;
used += sc->sc_journal_offset - sc->sc_jstart;
}
}
/* Already woken up? */
if (g_journal_switcher_wokenup)
return;
/*
* If the active journal takes more than g_journal_force_switch percent
* of the free journal space, we force a journal switch.
*/
KASSERT(length > 0,
("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
(intmax_t)length, (intmax_t)used,
(intmax_t)sc->sc_active.jj_offset,
(intmax_t)sc->sc_inactive.jj_offset,
(intmax_t)sc->sc_journal_offset));
if ((used * 100) / length > g_journal_force_switch) {
g_journal_stats_journal_full++;
GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
sc->sc_name, (used * 100) / length);
mtx_lock(&g_journal_cache_mtx);
g_journal_switcher_wokenup = 1;
wakeup(&g_journal_switcher_state);
mtx_unlock(&g_journal_cache_mtx);
}
}
static void
g_journal_orphan(struct g_consumer *cp)
{
struct g_journal_softc *sc;
char name[256];
int error;
g_topology_assert();
sc = cp->geom->softc;
strlcpy(name, cp->provider->name, sizeof(name));
GJ_DEBUG(0, "Lost provider %s.", name);
if (sc == NULL)
return;
error = g_journal_destroy(sc);
if (error == 0)
GJ_DEBUG(0, "Journal %s destroyed.", name);
else {
GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
"Destroy it manually after last close.", sc->sc_name,
error);
}
}
static int
g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
{
struct g_journal_softc *sc;
int dcr, dcw, dce;
g_topology_assert();
GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
acr, acw, ace);
dcr = pp->acr + acr;
dcw = pp->acw + acw;
dce = pp->ace + ace;
sc = pp->geom->softc;
if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
if (acr <= 0 && acw <= 0 && ace <= 0)
return (0);
else
return (ENXIO);
}
if (pp->acw == 0 && dcw > 0) {
GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
sc->sc_flags &= ~GJF_DEVICE_CLEAN;
g_topology_unlock();
g_journal_metadata_update(sc);
g_topology_lock();
} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
sc->sc_flags |= GJF_DEVICE_CLEAN;
g_topology_unlock();
g_journal_metadata_update(sc);
g_topology_lock();
} */
return (0);
}
static void
g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
{
bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
data += sizeof(GJ_HEADER_MAGIC);
le32enc(data, hdr->jh_journal_id);
data += 4;
le32enc(data, hdr->jh_journal_next_id);
}
static int
g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
{
bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
data += sizeof(hdr->jh_magic);
if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
return (EINVAL);
hdr->jh_journal_id = le32dec(data);
data += 4;
hdr->jh_journal_next_id = le32dec(data);
return (0);
}
static void
g_journal_flush_cache(struct g_journal_softc *sc)
{
struct bintime bt;
int error;
if (sc->sc_bio_flush == 0)
return;
GJ_TIMER_START(1, &bt);
if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
error = g_io_flush(sc->sc_jconsumer);
GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
sc->sc_jconsumer->provider->name, error);
}
if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
/*
* TODO: This could be called in parallel with the
* previous call.
*/
error = g_io_flush(sc->sc_dconsumer);
GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
sc->sc_dconsumer->provider->name, error);
}
GJ_TIMER_STOP(1, &bt, "Cache flush time");
}
static int
g_journal_write_header(struct g_journal_softc *sc)
{
struct g_journal_header hdr;
struct g_consumer *cp;
u_char *buf;
int error;
cp = sc->sc_jconsumer;
buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
hdr.jh_journal_id = sc->sc_journal_id;
hdr.jh_journal_next_id = sc->sc_journal_next_id;
g_journal_header_encode(&hdr, buf);
error = g_write_data(cp, sc->sc_journal_offset, buf,
cp->provider->sectorsize);
/* if (error == 0) */
sc->sc_journal_offset += cp->provider->sectorsize;
gj_free(buf, cp->provider->sectorsize);
return (error);
}
/*
* Every journal record has a header and data following it.
* The functions below are used to encode the header to little endian
* before storing it and to decode it back to system endianness after
* reading it.
*/
static void
g_journal_record_header_encode(struct g_journal_record_header *hdr,
u_char *data)
{
struct g_journal_entry *ent;
u_int i;
bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
data += sizeof(GJ_RECORD_HEADER_MAGIC);
le32enc(data, hdr->jrh_journal_id);
data += 8;
le16enc(data, hdr->jrh_nentries);
data += 2;
bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
data += 8;
for (i = 0; i < hdr->jrh_nentries; i++) {
ent = &hdr->jrh_entries[i];
le64enc(data, ent->je_joffset);
data += 8;
le64enc(data, ent->je_offset);
data += 8;
le64enc(data, ent->je_length);
data += 8;
}
}
static int
g_journal_record_header_decode(const u_char *data,
struct g_journal_record_header *hdr)
{
struct g_journal_entry *ent;
u_int i;
bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
data += sizeof(hdr->jrh_magic);
if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
return (EINVAL);
hdr->jrh_journal_id = le32dec(data);
data += 8;
hdr->jrh_nentries = le16dec(data);
data += 2;
if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
return (EINVAL);
bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
data += 8;
for (i = 0; i < hdr->jrh_nentries; i++) {
ent = &hdr->jrh_entries[i];
ent->je_joffset = le64dec(data);
data += 8;
ent->je_offset = le64dec(data);
data += 8;
ent->je_length = le64dec(data);
data += 8;
}
return (0);
}
/*
* Function reads metadata from a provider (via the given consumer), decodes
* it to system endianness and verifies its correctness.
*/
static int
g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
/* Metadata is stored in last sector. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL) {
GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
cp->provider->name, error);
return (error);
}
/* Decode metadata. */
error = journal_metadata_decode(buf, md);
g_free(buf);
/* Is this a gjournal provider at all? */
if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
return (EINVAL);
/*
* Are we able to handle this version of metadata?
* We only maintain backward compatibility.
*/
if (md->md_version > G_JOURNAL_VERSION) {
GJ_DEBUG(0,
"Kernel module is too old to handle metadata from %s.",
cp->provider->name);
return (EINVAL);
}
/* Is checksum correct? */
if (error != 0) {
GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
cp->provider->name);
return (error);
}
return (0);
}
/*
* The two functions below are responsible for updating the metadata.
* Only metadata on the data provider is updated (we need to update
* the information about the active journal there).
*/
static void
g_journal_metadata_done(struct bio *bp)
{
/*
* There is not much we can do on error except reporting it.
*/
if (bp->bio_error != 0) {
GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
bp->bio_error);
} else {
GJ_LOGREQ(2, bp, "Metadata updated.");
}
gj_free(bp->bio_data, bp->bio_length);
g_destroy_bio(bp);
}
static void
g_journal_metadata_update(struct g_journal_softc *sc)
{
struct g_journal_metadata md;
struct g_consumer *cp;
struct bio *bp;
u_char *sector;
cp = sc->sc_dconsumer;
sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
md.md_version = G_JOURNAL_VERSION;
md.md_id = sc->sc_id;
md.md_type = sc->sc_orig_type;
md.md_jstart = sc->sc_jstart;
md.md_jend = sc->sc_jend;
md.md_joffset = sc->sc_inactive.jj_offset;
md.md_jid = sc->sc_journal_previous_id;
md.md_flags = 0;
if (sc->sc_flags & GJF_DEVICE_CLEAN)
md.md_flags |= GJ_FLAG_CLEAN;
if (sc->sc_flags & GJF_DEVICE_HARDCODED)
strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
else
bzero(md.md_provider, sizeof(md.md_provider));
md.md_provsize = cp->provider->mediasize;
journal_metadata_encode(&md, sector);
/*
* Flush the cache, so we know all data is on disk.
* We write information like "journal is consistent" here, so we need
* to be sure it is. Without BIO_FLUSH here, we can end up in a situation
* where the metadata is stored on disk, but not all of the data.
*/
g_journal_flush_cache(sc);
bp = g_alloc_bio();
bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
bp->bio_length = cp->provider->sectorsize;
bp->bio_data = sector;
bp->bio_cmd = BIO_WRITE;
if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
bp->bio_done = g_journal_metadata_done;
g_io_request(bp, cp);
} else {
bp->bio_done = NULL;
g_io_request(bp, cp);
biowait(bp, "gjmdu");
g_journal_metadata_done(bp);
}
/*
* Be sure metadata reached the disk.
*/
g_journal_flush_cache(sc);
}
/*
* This is where the I/O request comes from the GEOM.
*/
static void
g_journal_start(struct bio *bp)
{
struct g_journal_softc *sc;
sc = bp->bio_to->geom->softc;
GJ_LOGREQ(3, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
mtx_lock(&sc->sc_mtx);
bioq_insert_tail(&sc->sc_regular_queue, bp);
wakeup(sc);
mtx_unlock(&sc->sc_mtx);
return;
case BIO_GETATTR:
if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
bp->bio_completed = strlen(bp->bio_to->name) + 1;
g_io_deliver(bp, 0);
return;
}
/* FALLTHROUGH */
case BIO_SPEEDUP:
case BIO_DELETE:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
}
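/*
* Generic completion handler: queue the finished bio on the back queue and
* wake up the worker thread, which dispatches it based on its GJ_BIO_* type.
*/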
static void
g_journal_std_done(struct bio *bp)
{
struct g_journal_softc *sc;
sc = bp->bio_from->geom->softc;
mtx_lock(&sc->sc_mtx);
bioq_insert_tail(&sc->sc_back_queue, bp);
wakeup(sc);
mtx_unlock(&sc->sc_mtx);
}
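/*
* Allocate a bio describing a write of the [start, end) range at the given
* journal offset. If data is provided, it is copied into a newly allocated
* buffer.
*/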
static struct bio *
g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
int flags)
{
struct bio *bp;
bp = g_alloc_bio();
bp->bio_offset = start;
bp->bio_joffset = joffset;
bp->bio_length = end - start;
bp->bio_cmd = BIO_WRITE;
bp->bio_done = g_journal_std_done;
if (data == NULL)
bp->bio_data = NULL;
else {
bp->bio_data = gj_malloc(bp->bio_length, flags);
if (bp->bio_data != NULL)
bcopy(data, bp->bio_data, bp->bio_length);
}
return (bp);
}
#define g_journal_insert_bio(head, bp, flags) \
g_journal_insert((head), (bp)->bio_offset, \
(bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
(bp)->bio_data, flags)
/*
* The function below does a lot more than just insert a bio into the queue.
* It keeps the queue sorted by offset and ensures that there is no duplicated
* data (it combines bios whose ranges overlap).
*
* The function returns the number of bios inserted (a bio can be split).
*/
static int
g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
u_char *data, int flags)
{
struct bio *nbp, *cbp, *pbp;
off_t cstart, cend;
u_char *tmpdata;
int n;
GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
joffset);
n = 0;
pbp = NULL;
GJQ_FOREACH(*head, cbp) {
cstart = cbp->bio_offset;
cend = cbp->bio_offset + cbp->bio_length;
if (nstart >= cend) {
/*
* +-------------+
* | |
* | current | +-------------+
* | bio | | |
* | | | new |
* +-------------+ | bio |
* | |
* +-------------+
*/
GJ_DEBUG(3, "INSERT(%p): 1", *head);
} else if (nend <= cstart) {
/*
* +-------------+
* | |
* +-------------+ | current |
* | | | bio |
* | new | | |
* | bio | +-------------+
* | |
* +-------------+
*/
nbp = g_journal_new_bio(nstart, nend, joffset, data,
flags);
if (pbp == NULL)
*head = nbp;
else
pbp->bio_next = nbp;
nbp->bio_next = cbp;
n++;
GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
pbp);
goto end;
} else if (nstart <= cstart && nend >= cend) {
/*
* +-------------+ +-------------+
* | current bio | | current bio |
* +---+-------------+---+ +-------------+---+
* | | | | | | |
* | | | | | | |
* | +-------------+ | +-------------+ |
* | new bio | | new bio |
* +---------------------+ +-----------------+
*
* +-------------+ +-------------+
* | current bio | | current bio |
* +---+-------------+ +-------------+
* | | | | |
* | | | | |
* | +-------------+ +-------------+
* | new bio | | new bio |
* +-----------------+ +-------------+
*/
g_journal_stats_bytes_skipped += cbp->bio_length;
cbp->bio_offset = nstart;
cbp->bio_joffset = joffset;
cbp->bio_length = cend - nstart;
if (cbp->bio_data != NULL) {
gj_free(cbp->bio_data, cend - cstart);
cbp->bio_data = NULL;
}
if (data != NULL) {
cbp->bio_data = gj_malloc(cbp->bio_length,
flags);
if (cbp->bio_data != NULL) {
bcopy(data, cbp->bio_data,
cbp->bio_length);
}
data += cend - nstart;
}
joffset += cend - nstart;
nstart = cend;
GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
} else if (nstart > cstart && nend >= cend) {
/*
* +-----------------+ +-------------+
* | current bio | | current bio |
* | +-------------+ | +---------+---+
* | | | | | | |
* | | | | | | |
* +---+-------------+ +---+---------+ |
* | new bio | | new bio |
* +-------------+ +-------------+
*/
g_journal_stats_bytes_skipped += cend - nstart;
nbp = g_journal_new_bio(nstart, cend, joffset, data,
flags);
nbp->bio_next = cbp->bio_next;
cbp->bio_next = nbp;
cbp->bio_length = nstart - cstart;
if (cbp->bio_data != NULL) {
cbp->bio_data = gj_realloc(cbp->bio_data,
cbp->bio_length, cend - cstart);
}
if (data != NULL)
data += cend - nstart;
joffset += cend - nstart;
nstart = cend;
n++;
GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
} else if (nstart > cstart && nend < cend) {
/*
* +---------------------+
* | current bio |
* | +-------------+ |
* | | | |
* | | | |
* +---+-------------+---+
* | new bio |
* +-------------+
*/
g_journal_stats_bytes_skipped += nend - nstart;
nbp = g_journal_new_bio(nstart, nend, joffset, data,
flags);
nbp->bio_next = cbp->bio_next;
cbp->bio_next = nbp;
if (cbp->bio_data == NULL)
tmpdata = NULL;
else
tmpdata = cbp->bio_data + nend - cstart;
nbp = g_journal_new_bio(nend, cend,
cbp->bio_joffset + nend - cstart, tmpdata, flags);
nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
((struct bio *)cbp->bio_next)->bio_next = nbp;
cbp->bio_length = nstart - cstart;
if (cbp->bio_data != NULL) {
cbp->bio_data = gj_realloc(cbp->bio_data,
cbp->bio_length, cend - cstart);
}
n += 2;
GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
goto end;
} else if (nstart <= cstart && nend < cend) {
/*
* +-----------------+ +-------------+
* | current bio | | current bio |
* +-------------+ | +---+---------+ |
* | | | | | | |
* | | | | | | |
* +-------------+---+ | +---------+---+
* | new bio | | new bio |
* +-------------+ +-------------+
*/
g_journal_stats_bytes_skipped += nend - nstart;
nbp = g_journal_new_bio(nstart, nend, joffset, data,
flags);
if (pbp == NULL)
*head = nbp;
else
pbp->bio_next = nbp;
nbp->bio_next = cbp;
cbp->bio_offset = nend;
cbp->bio_length = cend - nend;
cbp->bio_joffset += nend - cstart;
tmpdata = cbp->bio_data;
if (tmpdata != NULL) {
cbp->bio_data = gj_malloc(cbp->bio_length,
flags);
if (cbp->bio_data != NULL) {
bcopy(tmpdata + nend - cstart,
cbp->bio_data, cbp->bio_length);
}
gj_free(tmpdata, cend - cstart);
}
n++;
GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
goto end;
}
if (nstart == nend)
goto end;
pbp = cbp;
}
nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
if (pbp == NULL)
*head = nbp;
else
pbp->bio_next = nbp;
nbp->bio_next = NULL;
n++;
GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
end:
if (g_journal_debug >= 3) {
GJQ_FOREACH(*head, cbp) {
GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
(intmax_t)cbp->bio_offset,
(intmax_t)cbp->bio_length,
(intmax_t)cbp->bio_joffset, cbp->bio_data);
}
GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
}
return (n);
}
/*
* The function combines neighbouring bios, trying to squeeze as much data as
* possible into one bio.
*
* The function returns the number of bios combined (as a negative value).
*/
static int
g_journal_optimize(struct bio *head)
{
struct bio *cbp, *pbp;
int n;
n = 0;
pbp = NULL;
GJQ_FOREACH(head, cbp) {
/* Skip bios which have to be read first. */
if (cbp->bio_data == NULL) {
pbp = NULL;
continue;
}
/* There is no previous bio yet. */
if (pbp == NULL) {
pbp = cbp;
continue;
}
/* Is this a neighbour bio? */
if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
/* Be sure that the bio queue is sorted. */
KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
("poffset=%jd plength=%jd coffset=%jd",
(intmax_t)pbp->bio_offset,
(intmax_t)pbp->bio_length,
(intmax_t)cbp->bio_offset));
pbp = cbp;
continue;
}
/* Be sure we don't end up with a bio that is too big. */
- if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
+ if (pbp->bio_length + cbp->bio_length > maxphys) {
pbp = cbp;
continue;
}
/* Ok, we can join bios. */
GJ_LOGREQ(4, pbp, "Join: ");
GJ_LOGREQ(4, cbp, "and: ");
pbp->bio_data = gj_realloc(pbp->bio_data,
pbp->bio_length + cbp->bio_length, pbp->bio_length);
bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
cbp->bio_length);
gj_free(cbp->bio_data, cbp->bio_length);
pbp->bio_length += cbp->bio_length;
pbp->bio_next = cbp->bio_next;
g_destroy_bio(cbp);
cbp = pbp;
g_journal_stats_combined_ios++;
n--;
GJ_LOGREQ(4, pbp, "Got: ");
}
return (n);
}
/*
* TODO: Update comment.
* These are the functions responsible for copying one portion of data from
* the journal to the destination provider.
* The order goes like this:
* 1. Read the header, which contains information about the data blocks
* following it.
* 2. Read the data blocks from the journal.
* 3. Write the data blocks to the data provider.
*
* g_journal_copy_start()
* g_journal_copy_done() - handles a finished write request, logs potential errors.
*/
/*
* When there is no data in cache, this function is used to read it.
*/
static void
g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
{
struct bio *cbp;
/*
* We were short on memory, so the data was freed.
* In that case we need to read it back from the journal.
*/
cbp = g_alloc_bio();
cbp->bio_cflags = bp->bio_cflags;
cbp->bio_parent = bp;
cbp->bio_offset = bp->bio_joffset;
cbp->bio_length = bp->bio_length;
cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
cbp->bio_cmd = BIO_READ;
cbp->bio_done = g_journal_std_done;
GJ_LOGREQ(4, cbp, "READ FIRST");
g_io_request(cbp, sc->sc_jconsumer);
g_journal_cache_misses++;
}
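/*
* Dispatch up to g_journal_parallel_copies requests from the inactive
* journal queue to the data provider. Requests whose data was freed due to
* memory pressure are first re-read from the journal.
*/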
static void
g_journal_copy_send(struct g_journal_softc *sc)
{
struct bio *bioq, *bp, *lbp;
bioq = lbp = NULL;
mtx_lock(&sc->sc_mtx);
for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
if (bp == NULL)
break;
GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
sc->sc_copy_in_progress++;
GJQ_INSERT_AFTER(bioq, bp, lbp);
lbp = bp;
}
mtx_unlock(&sc->sc_mtx);
if (g_journal_do_optimize)
sc->sc_copy_in_progress += g_journal_optimize(bioq);
while ((bp = GJQ_FIRST(bioq)) != NULL) {
GJQ_REMOVE(bioq, bp);
GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
bp->bio_cflags = GJ_BIO_COPY;
if (bp->bio_data == NULL)
g_journal_read_first(sc, bp);
else {
bp->bio_joffset = 0;
GJ_LOGREQ(4, bp, "SEND");
g_io_request(bp, sc->sc_dconsumer);
}
}
}
static void
g_journal_copy_start(struct g_journal_softc *sc)
{
/*
* Remember in metadata that we're starting to copy journaled data
* to the data provider.
* In case of a power failure, we will copy this data once again on boot.
*/
if (!sc->sc_journal_copying) {
sc->sc_journal_copying = 1;
GJ_DEBUG(1, "Starting copy of journal.");
g_journal_metadata_update(sc);
}
g_journal_copy_send(sc);
}
/*
* Data block has been read from the journal provider.
*/
static int
g_journal_copy_read_done(struct bio *bp)
{
struct g_journal_softc *sc;
struct g_consumer *cp;
struct bio *pbp;
KASSERT(bp->bio_cflags == GJ_BIO_COPY,
("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
sc = bp->bio_from->geom->softc;
pbp = bp->bio_parent;
if (bp->bio_error != 0) {
GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
bp->bio_to->name, bp->bio_error);
/*
* We will not be able to deliver WRITE request as well.
*/
gj_free(bp->bio_data, bp->bio_length);
g_destroy_bio(pbp);
g_destroy_bio(bp);
sc->sc_copy_in_progress--;
return (1);
}
pbp->bio_data = bp->bio_data;
cp = sc->sc_dconsumer;
g_io_request(pbp, cp);
GJ_LOGREQ(4, bp, "READ DONE");
g_destroy_bio(bp);
return (0);
}
/*
* Data block has been written to the data provider.
*/
static void
g_journal_copy_write_done(struct bio *bp)
{
struct g_journal_softc *sc;
KASSERT(bp->bio_cflags == GJ_BIO_COPY,
("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
sc = bp->bio_from->geom->softc;
sc->sc_copy_in_progress--;
if (bp->bio_error != 0) {
GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
bp->bio_error);
}
GJQ_REMOVE(sc->sc_copy_queue, bp);
gj_free(bp->bio_data, bp->bio_length);
GJ_LOGREQ(4, bp, "DONE");
g_destroy_bio(bp);
if (sc->sc_copy_in_progress == 0) {
/*
* This was the last write request for this journal.
*/
GJ_DEBUG(1, "Data has been copied.");
sc->sc_journal_copying = 0;
}
}
static void g_journal_flush_done(struct bio *bp);
/*
* Flush one record onto the active journal provider.
*/
static void
g_journal_flush(struct g_journal_softc *sc)
{
struct g_journal_record_header hdr;
struct g_journal_entry *ent;
struct g_provider *pp;
struct bio **bioq;
struct bio *bp, *fbp, *pbp;
off_t joffset;
u_char *data, hash[16];
MD5_CTX ctx;
u_int i;
if (sc->sc_current_count == 0)
return;
pp = sc->sc_jprovider;
GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
joffset = sc->sc_journal_offset;
GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
sc->sc_current_count, pp->name, (intmax_t)joffset);
/*
* Store 'journal id', so we know to which journal this record belongs.
*/
hdr.jrh_journal_id = sc->sc_journal_id;
/* Could be less than g_journal_record_entries if called due to a timeout. */
hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
bioq = &sc->sc_active.jj_queue;
GJQ_LAST(sc->sc_flush_queue, pbp);
fbp = g_alloc_bio();
fbp->bio_parent = NULL;
fbp->bio_cflags = GJ_BIO_JOURNAL;
fbp->bio_offset = -1;
fbp->bio_joffset = joffset;
fbp->bio_length = pp->sectorsize;
fbp->bio_cmd = BIO_WRITE;
fbp->bio_done = g_journal_std_done;
GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
pbp = fbp;
fbp->bio_to = pp;
GJ_LOGREQ(4, fbp, "FLUSH_OUT");
joffset += pp->sectorsize;
sc->sc_flush_count++;
if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
MD5Init(&ctx);
for (i = 0; i < hdr.jrh_nentries; i++) {
bp = sc->sc_current_queue;
KASSERT(bp != NULL, ("NULL bp"));
bp->bio_to = pp;
GJ_LOGREQ(4, bp, "FLUSHED");
sc->sc_current_queue = bp->bio_next;
bp->bio_next = NULL;
sc->sc_current_count--;
/* Add to the header. */
ent = &hdr.jrh_entries[i];
ent->je_offset = bp->bio_offset;
ent->je_joffset = joffset;
ent->je_length = bp->bio_length;
data = bp->bio_data;
if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
MD5Update(&ctx, data, ent->je_length);
g_reset_bio(bp);
bp->bio_cflags = GJ_BIO_JOURNAL;
bp->bio_offset = ent->je_offset;
bp->bio_joffset = ent->je_joffset;
bp->bio_length = ent->je_length;
bp->bio_data = data;
bp->bio_cmd = BIO_WRITE;
bp->bio_done = g_journal_std_done;
GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
pbp = bp;
bp->bio_to = pp;
GJ_LOGREQ(4, bp, "FLUSH_OUT");
joffset += bp->bio_length;
sc->sc_flush_count++;
/*
* Add the request to the active journal queue.
* This is our cache. After a journal switch we don't have to
* read the data from the inactive journal, because we keep
* it in memory.
*/
g_journal_insert(bioq, ent->je_offset,
ent->je_offset + ent->je_length, ent->je_joffset, data,
M_NOWAIT);
}
/*
* After all requests, store valid header.
*/
data = gj_malloc(pp->sectorsize, M_WAITOK);
if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
MD5Final(hash, &ctx);
bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
}
g_journal_record_header_encode(&hdr, data);
fbp->bio_data = data;
sc->sc_journal_offset = joffset;
g_journal_check_overflow(sc);
}
/*
* Flush request finished.
*/
static void
g_journal_flush_done(struct bio *bp)
{
struct g_journal_softc *sc;
struct g_consumer *cp;
KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
cp = bp->bio_from;
sc = cp->geom->softc;
sc->sc_flush_in_progress--;
if (bp->bio_error != 0) {
GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
bp->bio_error);
}
gj_free(bp->bio_data, bp->bio_length);
GJ_LOGREQ(4, bp, "DONE");
g_destroy_bio(bp);
}
static void g_journal_release_delayed(struct g_journal_softc *sc);
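/*
* Send queued journal records (the flush queue) to the active journal
* provider, keeping at most g_journal_parallel_flushes requests in flight.
*/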
static void
g_journal_flush_send(struct g_journal_softc *sc)
{
struct g_consumer *cp;
struct bio *bioq, *bp, *lbp;
cp = sc->sc_jconsumer;
bioq = lbp = NULL;
while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
/* Send one flush request to the active journal. */
bp = GJQ_FIRST(sc->sc_flush_queue);
if (bp != NULL) {
GJQ_REMOVE(sc->sc_flush_queue, bp);
sc->sc_flush_count--;
bp->bio_offset = bp->bio_joffset;
bp->bio_joffset = 0;
sc->sc_flush_in_progress++;
GJQ_INSERT_AFTER(bioq, bp, lbp);
lbp = bp;
}
/* Try to release delayed requests. */
g_journal_release_delayed(sc);
/* If there are no requests to flush, leave. */
if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
break;
}
if (g_journal_do_optimize)
sc->sc_flush_in_progress += g_journal_optimize(bioq);
while ((bp = GJQ_FIRST(bioq)) != NULL) {
GJQ_REMOVE(bioq, bp);
GJ_LOGREQ(3, bp, "Flush request send");
g_io_request(bp, cp);
}
}
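/*
* Insert a write request into the current (not yet flushed) queue and
* acknowledge it to the caller. Once enough entries for one journal record
* have accumulated, flush them onto the active journal provider.
*/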
static void
g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
{
int n;
GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
sc->sc_current_count += n;
n = g_journal_optimize(sc->sc_current_queue);
sc->sc_current_count += n;
/*
* For requests which are added to the current queue we deliver the
* response immediately.
*/
bp->bio_completed = bp->bio_length;
g_io_deliver(bp, 0);
if (sc->sc_current_count >= g_journal_record_entries) {
/*
* Let's flush one record onto the active journal provider.
*/
g_journal_flush(sc);
}
}
static void
g_journal_release_delayed(struct g_journal_softc *sc)
{
struct bio *bp;
for (;;) {
/* The flush queue is full, exit. */
if (sc->sc_flush_count >= g_journal_accept_immediately)
return;
bp = bioq_takefirst(&sc->sc_delayed_queue);
if (bp == NULL)
return;
sc->sc_delayed_count--;
g_journal_add_current(sc, bp);
}
}
/*
* Add I/O request to the current queue. If we have enough requests for one
* journal record we flush them onto active journal provider.
*/
static void
g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
{
/*
* The flush queue is full, we need to delay the request.
*/
if (sc->sc_delayed_count > 0 ||
sc->sc_flush_count >= g_journal_accept_immediately) {
GJ_LOGREQ(4, bp, "DELAYED");
bioq_insert_tail(&sc->sc_delayed_queue, bp);
sc->sc_delayed_count++;
return;
}
KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
("DELAYED queue not empty."));
g_journal_add_current(sc, bp);
}
static void g_journal_read_done(struct bio *bp);
/*
* Try to find the requested data in the cache.
*/
static struct bio *
g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
off_t oend)
{
off_t cstart, cend;
struct bio *bp;
GJQ_FOREACH(head, bp) {
if (bp->bio_offset == -1)
continue;
cstart = MAX(ostart, bp->bio_offset);
cend = MIN(oend, bp->bio_offset + bp->bio_length);
if (cend <= ostart)
continue;
else if (cstart >= oend) {
if (!sorted)
continue;
else {
bp = NULL;
break;
}
}
if (bp->bio_data == NULL)
break;
GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
bp);
bcopy(bp->bio_data + cstart - bp->bio_offset,
pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
pbp->bio_completed += cend - cstart;
if (pbp->bio_completed == pbp->bio_length) {
/*
* Cool, the whole request was in the cache; deliver the
* response.
*/
g_io_deliver(pbp, 0);
return (pbp);
}
break;
}
return (bp);
}
/*
* This function is used for collecting data on read.
* The complexity comes from the fact that parts of the data can be stored in
* four different places:
* - in memory - the data not yet sent to the active journal provider
* - in the active journal
* - in the inactive journal
* - in the data provider
*/
static void
g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
off_t oend)
{
struct bio *bp, *nbp, *head;
off_t cstart, cend;
u_int i, sorted = 0;
GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
cstart = cend = -1;
bp = NULL;
head = NULL;
for (i = 1; i <= 5; i++) {
switch (i) {
case 1: /* Not-yet-sent data. */
head = sc->sc_current_queue;
sorted = 1;
break;
case 2: /* Skip the flush queue; its requests are also in the active queue. */
continue;
case 3: /* Active journal. */
head = sc->sc_active.jj_queue;
sorted = 1;
break;
case 4: /* Inactive journal. */
/*
* XXX: Here could be a race with g_journal_lowmem().
*/
head = sc->sc_inactive.jj_queue;
sorted = 1;
break;
case 5: /* In-flight to the data provider. */
head = sc->sc_copy_queue;
sorted = 0;
break;
default:
panic("gjournal %s: i=%d", __func__, i);
}
bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
if (bp == pbp) { /* Got the whole request. */
GJ_DEBUG(2, "Got the whole request from %u.", i);
return;
} else if (bp != NULL) {
cstart = MAX(ostart, bp->bio_offset);
cend = MIN(oend, bp->bio_offset + bp->bio_length);
GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
i, (intmax_t)cstart, (intmax_t)cend);
break;
}
}
if (bp != NULL) {
if (bp->bio_data == NULL) {
nbp = g_duplicate_bio(pbp);
nbp->bio_cflags = GJ_BIO_READ;
nbp->bio_data =
pbp->bio_data + cstart - pbp->bio_offset;
nbp->bio_offset =
bp->bio_joffset + cstart - bp->bio_offset;
nbp->bio_length = cend - cstart;
nbp->bio_done = g_journal_read_done;
g_io_request(nbp, sc->sc_jconsumer);
}
/*
* If we don't have the whole request yet, call g_journal_read()
* recursively.
*/
if (ostart < cstart)
g_journal_read(sc, pbp, ostart, cstart);
if (oend > cend)
g_journal_read(sc, pbp, cend, oend);
} else {
/*
* No data in memory, no data in the journal.
* It's time to ask the data provider.
*/
GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
nbp = g_duplicate_bio(pbp);
nbp->bio_cflags = GJ_BIO_READ;
nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
nbp->bio_offset = ostart;
nbp->bio_length = oend - ostart;
nbp->bio_done = g_journal_read_done;
g_io_request(nbp, sc->sc_dconsumer);
/* We have the whole request, return here. */
return;
}
}
/*
* Function responsible for handling finished READ requests.
* Actually, g_std_done() could be used here; the only difference is that we
* log the error.
*/
static void
g_journal_read_done(struct bio *bp)
{
struct bio *pbp;
KASSERT(bp->bio_cflags == GJ_BIO_READ,
("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
pbp = bp->bio_parent;
pbp->bio_inbed++;
pbp->bio_completed += bp->bio_length;
if (bp->bio_error != 0) {
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
bp->bio_to->name, bp->bio_error);
}
g_destroy_bio(bp);
if (pbp->bio_children == pbp->bio_inbed &&
pbp->bio_completed == pbp->bio_length) {
/* We're done. */
g_io_deliver(pbp, 0);
}
}
/*
* Deactivate the current journal and activate the next one.
*/
static void
g_journal_switch(struct g_journal_softc *sc)
{
struct g_provider *pp;
if (JEMPTY(sc)) {
GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
pp = LIST_FIRST(&sc->sc_geom->provider);
if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
sc->sc_flags |= GJF_DEVICE_CLEAN;
GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
g_journal_metadata_update(sc);
}
} else {
GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
pp = sc->sc_jprovider;
sc->sc_journal_previous_id = sc->sc_journal_id;
sc->sc_journal_id = sc->sc_journal_next_id;
sc->sc_journal_next_id = arc4random();
GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
g_journal_write_header(sc);
sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
sc->sc_active.jj_offset =
sc->sc_journal_offset - pp->sectorsize;
sc->sc_active.jj_queue = NULL;
/*
* Switch is done, start copying data from the (now) inactive
* journal to the data provider.
*/
g_journal_copy_start(sc);
}
mtx_lock(&sc->sc_mtx);
sc->sc_flags &= ~GJF_DEVICE_SWITCH;
mtx_unlock(&sc->sc_mtx);
}
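/*
* Start a brand new journal: generate fresh journal IDs, reset the journal
* offsets to the beginning of the journal area and write the first header.
*/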
static void
g_journal_initialize(struct g_journal_softc *sc)
{
sc->sc_journal_id = arc4random();
sc->sc_journal_next_id = arc4random();
sc->sc_journal_previous_id = sc->sc_journal_id;
sc->sc_journal_offset = sc->sc_jstart;
sc->sc_inactive.jj_offset = sc->sc_jstart;
g_journal_write_header(sc);
sc->sc_active.jj_offset = sc->sc_jstart;
}
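/*
* Invoke the jd_dirty callback of every registered file system on the data
* consumer.
*/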
static void
g_journal_mark_as_dirty(struct g_journal_softc *sc)
{
const struct g_journal_desc *desc;
int i;
GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
desc->jd_dirty(sc->sc_dconsumer);
}
/*
* Read a record header from the given journal.
* It is very similar to g_read_data(9), but it doesn't allocate memory for
* the bio and data on every call.
*/
static int
g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
void *data)
{
int error;
g_reset_bio(bp);
bp->bio_cmd = BIO_READ;
bp->bio_done = NULL;
bp->bio_offset = offset;
bp->bio_length = cp->provider->sectorsize;
bp->bio_data = data;
g_io_request(bp, cp);
error = biowait(bp, "gjs_read");
return (error);
}
#if 0
/*
* This function is called when we start the journal device and detect that
* one of the journals was not fully copied.
* The purpose of this function is to read all record headers from the journal
* and place them in the inactive queue, so we can start the journal
* synchronization process and the journal provider itself.
* The design decision was made not to synchronize the whole journal here, as it
* can take too much time. Reading only the headers and delaying the
* synchronization process until after the journal provider is started should
* be the best choice.
*/
#endif
static void
g_journal_sync(struct g_journal_softc *sc)
{
struct g_journal_record_header rhdr;
struct g_journal_entry *ent;
struct g_journal_header jhdr;
struct g_consumer *cp;
struct bio *bp, *fbp, *tbp;
off_t joffset, offset;
u_char *buf, sum[16];
uint64_t id;
MD5_CTX ctx;
int error, found, i;
found = 0;
fbp = NULL;
cp = sc->sc_jconsumer;
bp = g_alloc_bio();
buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
/*
* Read and decode first journal header.
*/
error = g_journal_sync_read(cp, bp, offset, buf);
if (error != 0) {
GJ_DEBUG(0, "Error while reading journal header from %s.",
cp->provider->name);
goto end;
}
error = g_journal_header_decode(buf, &jhdr);
if (error != 0) {
GJ_DEBUG(0, "Cannot decode journal header from %s.",
cp->provider->name);
goto end;
}
id = sc->sc_journal_id;
if (jhdr.jh_journal_id != sc->sc_journal_id) {
GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
(intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
goto end;
}
offset += cp->provider->sectorsize;
id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
for (;;) {
/*
* If the biggest record won't fit, look for a record header or
* journal header from the beginning.
*/
GJ_VALIDATE_OFFSET(offset, sc);
error = g_journal_sync_read(cp, bp, offset, buf);
if (error != 0) {
/*
* Not good. Having an error while reading a header
* means that we cannot read the next headers and, in
* consequence, we cannot find the termination.
*/
GJ_DEBUG(0,
"Error while reading record header from %s.",
cp->provider->name);
break;
}
error = g_journal_record_header_decode(buf, &rhdr);
if (error != 0) {
GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
(intmax_t)offset, error);
/*
* This is not a record header.
* If we are lucky, this is the next journal header.
*/
error = g_journal_header_decode(buf, &jhdr);
if (error != 0) {
GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
(intmax_t)offset, error);
/*
* Nope, this is not a journal header, which
* basically means that the journal is not
* terminated properly.
*/
error = ENOENT;
break;
}
/*
* Ok. This is the header of _some_ journal. Now we need to
* verify whether it is the header of the _next_ journal.
*/
if (jhdr.jh_journal_id != id) {
GJ_DEBUG(1, "Journal ID mismatch at %jd "
"(0x%08x != 0x%08x).", (intmax_t)offset,
(u_int)jhdr.jh_journal_id, (u_int)id);
error = ENOENT;
break;
}
/* Found termination. */
found++;
GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
(intmax_t)offset, (u_int)id);
sc->sc_active.jj_offset = offset;
sc->sc_journal_offset =
offset + cp->provider->sectorsize;
sc->sc_journal_id = id;
id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
while ((tbp = fbp) != NULL) {
fbp = tbp->bio_next;
GJ_LOGREQ(3, tbp, "Adding request.");
g_journal_insert_bio(&sc->sc_inactive.jj_queue,
tbp, M_WAITOK);
}
/* Skip journal's header. */
offset += cp->provider->sectorsize;
continue;
}
/* Skip record's header. */
offset += cp->provider->sectorsize;
/*
* Add information about every record entry to the inactive
* queue.
*/
if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
MD5Init(&ctx);
for (i = 0; i < rhdr.jrh_nentries; i++) {
ent = &rhdr.jrh_entries[i];
GJ_DEBUG(3, "Insert entry: %jd %jd.",
(intmax_t)ent->je_offset, (intmax_t)ent->je_length);
g_journal_insert(&fbp, ent->je_offset,
ent->je_offset + ent->je_length, ent->je_joffset,
NULL, M_WAITOK);
if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
u_char *buf2;
/*
* TODO: Should use a faster function (like
* g_journal_sync_read()).
*/
buf2 = g_read_data(cp, offset, ent->je_length,
NULL);
if (buf2 == NULL)
GJ_DEBUG(0, "Cannot read data at %jd.",
(intmax_t)offset);
else {
MD5Update(&ctx, buf2, ent->je_length);
g_free(buf2);
}
}
/* Skip entry's data. */
offset += ent->je_length;
}
if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
MD5Final(sum, &ctx);
if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
(intmax_t)offset);
}
}
}
end:
gj_free(bp->bio_data, cp->provider->sectorsize);
g_destroy_bio(bp);
/* Remove bios from unterminated journal. */
while ((tbp = fbp) != NULL) {
fbp = tbp->bio_next;
g_destroy_bio(tbp);
}
if (found < 1 && joffset > 0) {
GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
sc->sc_name);
while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
sc->sc_inactive.jj_queue = tbp->bio_next;
g_destroy_bio(tbp);
}
g_journal_initialize(sc);
g_journal_mark_as_dirty(sc);
} else {
GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
g_journal_copy_start(sc);
}
}
/*
* Wait for requests.
* If we have requests in the current queue, flush them 3 seconds after the
* last flush. This way we don't wait forever (or for a journal switch) before
* storing partial records in the journal.
*/
static void
g_journal_wait(struct g_journal_softc *sc, time_t last_write)
{
int error, timeout;
GJ_DEBUG(3, "%s: enter", __func__);
if (sc->sc_current_count == 0) {
if (g_journal_debug < 2)
msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
else {
/*
* If we have debugging turned on, show the number of elements
* in various queues.
*/
for (;;) {
error = msleep(sc, &sc->sc_mtx, PRIBIO,
"gj:work", hz * 3);
if (error == 0) {
mtx_unlock(&sc->sc_mtx);
break;
}
GJ_DEBUG(3, "Report: current count=%d",
sc->sc_current_count);
GJ_DEBUG(3, "Report: flush count=%d",
sc->sc_flush_count);
GJ_DEBUG(3, "Report: flush in progress=%d",
sc->sc_flush_in_progress);
GJ_DEBUG(3, "Report: copy in progress=%d",
sc->sc_copy_in_progress);
GJ_DEBUG(3, "Report: delayed=%d",
sc->sc_delayed_count);
}
}
GJ_DEBUG(3, "%s: exit 1", __func__);
return;
}
/*
* Flush even partial records every 3 seconds.
*/
timeout = (last_write + 3 - time_second) * hz;
if (timeout <= 0) {
mtx_unlock(&sc->sc_mtx);
g_journal_flush(sc);
g_journal_flush_send(sc);
GJ_DEBUG(3, "%s: exit 2", __func__);
return;
}
error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
if (error == EWOULDBLOCK)
g_journal_flush_send(sc);
GJ_DEBUG(3, "%s: exit 3", __func__);
}
/*
* Worker thread.
*/
static void
g_journal_worker(void *arg)
{
struct g_journal_softc *sc;
struct g_geom *gp;
struct g_provider *pp;
struct bio *bp;
time_t last_write;
int type;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
sc = arg;
type = 0; /* gcc */
if (sc->sc_flags & GJF_DEVICE_CLEAN) {
GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
g_journal_initialize(sc);
} else {
g_journal_sync(sc);
}
/*
* Check if we can use BIO_FLUSH.
*/
sc->sc_bio_flush = 0;
if (g_io_flush(sc->sc_jconsumer) == 0) {
sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
sc->sc_jconsumer->provider->name);
} else {
GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
sc->sc_jconsumer->provider->name);
}
if (sc->sc_jconsumer != sc->sc_dconsumer) {
if (g_io_flush(sc->sc_dconsumer) == 0) {
sc->sc_bio_flush |= GJ_FLUSH_DATA;
GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
sc->sc_dconsumer->provider->name);
} else {
GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
sc->sc_dconsumer->provider->name);
}
}
gp = sc->sc_geom;
g_topology_lock();
pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
pp->mediasize = sc->sc_mediasize;
/*
* There could be a problem when the data provider and the journal provider
* have different sector sizes, but such a scenario is prevented at journal
* creation time.
*/
pp->sectorsize = sc->sc_sectorsize;
g_error_provider(pp, 0);
g_topology_unlock();
last_write = time_second;
if (sc->sc_rootmount != NULL) {
GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
for (;;) {
/* Get first request from the queue. */
mtx_lock(&sc->sc_mtx);
bp = bioq_first(&sc->sc_back_queue);
if (bp != NULL)
type = (bp->bio_cflags & GJ_BIO_MASK);
if (bp == NULL) {
bp = bioq_first(&sc->sc_regular_queue);
if (bp != NULL)
type = GJ_BIO_REGULAR;
}
if (bp == NULL) {
try_switch:
if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
(sc->sc_flags & GJF_DEVICE_DESTROY)) {
if (sc->sc_current_count > 0) {
mtx_unlock(&sc->sc_mtx);
g_journal_flush(sc);
g_journal_flush_send(sc);
continue;
}
if (sc->sc_flush_in_progress > 0)
goto sleep;
if (sc->sc_copy_in_progress > 0)
goto sleep;
}
if (sc->sc_flags & GJF_DEVICE_SWITCH) {
mtx_unlock(&sc->sc_mtx);
g_journal_switch(sc);
wakeup(&sc->sc_journal_copying);
continue;
}
if (sc->sc_flags & GJF_DEVICE_DESTROY) {
GJ_DEBUG(1, "Shutting down worker "
"thread for %s.", gp->name);
sc->sc_worker = NULL;
wakeup(&sc->sc_worker);
mtx_unlock(&sc->sc_mtx);
kproc_exit(0);
}
sleep:
g_journal_wait(sc, last_write);
continue;
}
/*
* If we're in the middle of the switch process, we need to delay all new
* write requests until it is done.
*/
if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
GJ_LOGREQ(2, bp, "WRITE on SWITCH");
goto try_switch;
}
if (type == GJ_BIO_REGULAR)
bioq_remove(&sc->sc_regular_queue, bp);
else
bioq_remove(&sc->sc_back_queue, bp);
mtx_unlock(&sc->sc_mtx);
switch (type) {
case GJ_BIO_REGULAR:
/* Regular request. */
switch (bp->bio_cmd) {
case BIO_READ:
g_journal_read(sc, bp, bp->bio_offset,
bp->bio_offset + bp->bio_length);
break;
case BIO_WRITE:
last_write = time_second;
g_journal_add_request(sc, bp);
g_journal_flush_send(sc);
break;
default:
panic("Invalid bio_cmd (%d).", bp->bio_cmd);
}
break;
case GJ_BIO_COPY:
switch (bp->bio_cmd) {
case BIO_READ:
if (g_journal_copy_read_done(bp))
g_journal_copy_send(sc);
break;
case BIO_WRITE:
g_journal_copy_write_done(bp);
g_journal_copy_send(sc);
break;
default:
panic("Invalid bio_cmd (%d).", bp->bio_cmd);
}
break;
case GJ_BIO_JOURNAL:
g_journal_flush_done(bp);
g_journal_flush_send(sc);
break;
case GJ_BIO_READ:
default:
panic("Invalid bio (%d).", type);
}
}
}
static void
g_journal_destroy_event(void *arg, int flags __unused)
{
struct g_journal_softc *sc;
g_topology_assert();
sc = arg;
g_journal_destroy(sc);
}
static void
g_journal_timeout(void *arg)
{
struct g_journal_softc *sc;
sc = arg;
GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
sc->sc_geom->name);
g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
}
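/*
* Create a journal geom for the given metadata, or attach this provider to
* an existing, not yet complete geom with the same ID. Once both the data
* and journal parts are present, the worker thread is started.
*/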
static struct g_geom *
g_journal_create(struct g_class *mp, struct g_provider *pp,
const struct g_journal_metadata *md)
{
struct g_journal_softc *sc;
struct g_geom *gp;
struct g_consumer *cp;
int error;
sc = NULL; /* gcc */
g_topology_assert();
/*
* There are two possibilities:
* 1. Data and both journals are on the same provider.
* 2. Data and journals are all on separate providers.
*/
/* Look for journal device with the same ID. */
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_id == md->md_id)
break;
}
if (gp == NULL)
sc = NULL;
else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
return (NULL);
}
if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
GJ_DEBUG(0, "Invalid type on %s.", pp->name);
return (NULL);
}
if (md->md_type & GJ_TYPE_DATA) {
GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
pp->name);
}
if (md->md_type & GJ_TYPE_JOURNAL) {
GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
pp->name);
}
if (sc == NULL) {
/* Action geom. */
sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
sc->sc_id = md->md_id;
sc->sc_type = 0;
sc->sc_flags = 0;
sc->sc_worker = NULL;
gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
gp->start = g_journal_start;
gp->orphan = g_journal_orphan;
gp->access = g_journal_access;
gp->softc = sc;
gp->flags |= G_GEOM_VOLATILE_BIO;
sc->sc_geom = gp;
mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
bioq_init(&sc->sc_back_queue);
bioq_init(&sc->sc_regular_queue);
bioq_init(&sc->sc_delayed_queue);
sc->sc_delayed_count = 0;
sc->sc_current_queue = NULL;
sc->sc_current_count = 0;
sc->sc_flush_queue = NULL;
sc->sc_flush_count = 0;
sc->sc_flush_in_progress = 0;
sc->sc_copy_queue = NULL;
sc->sc_copy_in_progress = 0;
sc->sc_inactive.jj_queue = NULL;
sc->sc_active.jj_queue = NULL;
sc->sc_rootmount = root_mount_hold("GJOURNAL");
GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
callout_init(&sc->sc_callout, 1);
if (md->md_type != GJ_TYPE_COMPLETE) {
/*
* Journal and data are on separate providers.
* At this point we have only one of them.
* We set up a timeout in case the other part does not
* appear, so we won't wait forever.
*/
callout_reset(&sc->sc_callout, 5 * hz,
g_journal_timeout, sc);
}
}
/* Remember type of the data provider. */
if (md->md_type & GJ_TYPE_DATA)
sc->sc_orig_type = md->md_type;
sc->sc_type |= md->md_type;
cp = NULL;
if (md->md_type & GJ_TYPE_DATA) {
if (md->md_flags & GJ_FLAG_CLEAN)
sc->sc_flags |= GJF_DEVICE_CLEAN;
if (md->md_flags & GJ_FLAG_CHECKSUM)
sc->sc_flags |= GJF_DEVICE_CHECKSUM;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
pp->name, error));
error = g_access(cp, 1, 1, 1);
if (error != 0) {
GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
error);
g_journal_destroy(sc);
return (NULL);
}
sc->sc_dconsumer = cp;
sc->sc_mediasize = pp->mediasize - pp->sectorsize;
sc->sc_sectorsize = pp->sectorsize;
sc->sc_jstart = md->md_jstart;
sc->sc_jend = md->md_jend;
if (md->md_provider[0] != '\0')
sc->sc_flags |= GJF_DEVICE_HARDCODED;
sc->sc_journal_offset = md->md_joffset;
sc->sc_journal_id = md->md_jid;
sc->sc_journal_previous_id = md->md_jid;
}
if (md->md_type & GJ_TYPE_JOURNAL) {
if (cp == NULL) {
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
pp->name, error));
error = g_access(cp, 1, 1, 1);
if (error != 0) {
GJ_DEBUG(0, "Cannot access %s (error=%d).",
pp->name, error);
g_journal_destroy(sc);
return (NULL);
}
} else {
/*
* The journal is on the same provider as the data, which means
* that the data provider ends where the journal starts.
*/
sc->sc_mediasize = md->md_jstart;
}
sc->sc_jconsumer = cp;
}
/* Start switcher kproc if needed. */
if (g_journal_switcher_proc == NULL)
g_journal_start_switcher(mp);
if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
/* Journal is not complete yet. */
return (gp);
} else {
/* Journal complete, cancel timeout. */
callout_drain(&sc->sc_callout);
}
error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
"g_journal %s", sc->sc_name);
if (error != 0) {
GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
sc->sc_name);
g_journal_destroy(sc);
return (NULL);
}
return (gp);
}
static void
g_journal_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *cp;
g_topology_assert();
cp = arg;
g_detach(cp);
g_destroy_consumer(cp);
}
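/*
* Tear down a journal geom: flush and switch the journal, wait for the
* worker thread to exit, mark the metadata as clean and detach the
* consumers via an event.
*/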
static int
g_journal_destroy(struct g_journal_softc *sc)
{
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp;
g_topology_assert();
if (sc == NULL)
return (ENXIO);
gp = sc->sc_geom;
pp = LIST_FIRST(&gp->provider);
if (pp != NULL) {
if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
pp->name, pp->acr, pp->acw, pp->ace);
return (EBUSY);
}
g_error_provider(pp, ENXIO);
g_journal_flush(sc);
g_journal_flush_send(sc);
g_journal_switch(sc);
}
sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
g_topology_unlock();
if (sc->sc_rootmount != NULL) {
GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
callout_drain(&sc->sc_callout);
mtx_lock(&sc->sc_mtx);
wakeup(sc);
while (sc->sc_worker != NULL)
msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
mtx_unlock(&sc->sc_mtx);
if (pp != NULL) {
GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
g_journal_metadata_update(sc);
g_topology_lock();
g_wither_provider(pp, ENXIO);
} else {
g_topology_lock();
}
mtx_destroy(&sc->sc_mtx);
if (sc->sc_current_count != 0) {
GJ_DEBUG(0, "Warning! Number of current requests %d.",
sc->sc_current_count);
}
gp->softc = NULL;
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (cp->acr + cp->acw + cp->ace > 0)
g_access(cp, -1, -1, -1);
/*
* We keep all consumers open for writing, so if we detached
* and destroyed the consumer here, we would get the providers for
* tasting and the journal would be started again.
* Sending an event here prevents this from happening.
*/
g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
}
g_wither_geom(gp, ENXIO);
free(sc, M_JOURNAL);
return (0);
}
static void
g_journal_taste_orphan(struct g_consumer *cp)
{
KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
cp->provider->name));
}
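/*
* Taste a provider: read and verify gjournal metadata and, if it matches,
* create (or complete) the corresponding journal geom.
*/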
static struct g_geom *
g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_journal_metadata md;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
GJ_DEBUG(2, "Tasting %s.", pp->name);
if (pp->geom->class == mp)
return (NULL);
gp = g_new_geomf(mp, "journal:taste");
/* This orphan function should never be called. */
gp->orphan = g_journal_taste_orphan;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_journal_metadata_read(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
gp = NULL;
if (md.md_provider[0] != '\0' &&
!g_compare_names(md.md_provider, pp->name))
return (NULL);
if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
return (NULL);
if (g_journal_debug >= 2)
journal_metadata_dump(&md);
gp = g_journal_create(mp, pp, &md);
return (gp);
}
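/*
* Look up a fully configured journal device by name; both the data provider
* name and the .journal provider name are accepted.
*/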
static struct g_journal_softc *
g_journal_find_device(struct g_class *mp, const char *name)
{
struct g_journal_softc *sc;
struct g_geom *gp;
struct g_provider *pp;
if (strncmp(name, _PATH_DEV, 5) == 0)
name += 5;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_flags & GJF_DEVICE_DESTROY)
continue;
if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
continue;
pp = LIST_FIRST(&gp->provider);
if (strcmp(sc->sc_name, name) == 0)
return (sc);
if (pp != NULL && strcmp(pp->name, name) == 0)
return (sc);
}
return (NULL);
}
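/*
* Handle the "destroy"/"stop" control verb: tear down every journal device
* named in the request.
*/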
static void
g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
struct g_journal_softc *sc;
const char *name;
char param[16];
int *nargs;
int error, i;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument.", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%d' argument.", i);
return;
}
sc = g_journal_find_device(mp, name);
if (sc == NULL) {
gctl_error(req, "No such device: %s.", name);
return;
}
error = g_journal_destroy(sc);
if (error != 0) {
gctl_error(req, "Cannot destroy device %s (error=%d).",
LIST_FIRST(&sc->sc_geom->provider)->name, error);
return;
}
}
}
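/*
* Handle the "sync" control verb: wake up the switcher thread and wait
* until the requested journal switch completes.
*/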
static void
g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
{
g_topology_assert();
g_topology_unlock();
g_journal_sync_requested++;
wakeup(&g_journal_switcher_state);
while (g_journal_sync_requested > 0)
tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
g_topology_lock();
}
static void
g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "No '%s' argument.", "version");
return;
}
if (*version != G_JOURNAL_VERSION) {
gctl_error(req, "Userland and kernel parts are out of sync.");
return;
}
if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
g_journal_ctl_destroy(req, mp);
return;
} else if (strcmp(verb, "sync") == 0) {
g_journal_ctl_sync(req, mp);
return;
}
gctl_error(req, "Unknown verb.");
}
static void
g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_journal_softc *sc;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return;
if (pp != NULL) {
/* Nothing here. */
} else if (cp != NULL) {
int first = 1;
sbuf_printf(sb, "%s<Role>", indent);
if (cp == sc->sc_dconsumer) {
sbuf_cat(sb, "Data");
first = 0;
}
if (cp == sc->sc_jconsumer) {
if (!first)
sbuf_cat(sb, ",");
sbuf_cat(sb, "Journal");
}
sbuf_cat(sb, "</Role>\n");
if (cp == sc->sc_jconsumer) {
sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
(intmax_t)sc->sc_jstart);
sbuf_printf(sb, "<Jend>%jd</Jend>\n",
(intmax_t)sc->sc_jend);
}
} else {
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
}
}
static eventhandler_tag g_journal_event_shutdown = NULL;
static eventhandler_tag g_journal_event_lowmem = NULL;
static void
g_journal_shutdown(void *arg, int howto __unused)
{
struct g_class *mp;
struct g_geom *gp, *gp2;
if (KERNEL_PANICKED())
return;
mp = arg;
g_topology_lock();
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
if (gp->softc == NULL)
continue;
GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
g_journal_destroy(gp->softc);
}
g_topology_unlock();
}
/*
* Free cached requests from the inactive queue in case of low memory.
* We free GJ_FREE_AT_ONCE elements at once.
*/
#define GJ_FREE_AT_ONCE 4
static void
g_journal_lowmem(void *arg, int howto __unused)
{
struct g_journal_softc *sc;
struct g_class *mp;
struct g_geom *gp;
struct bio *bp;
u_int nfree = GJ_FREE_AT_ONCE;
g_journal_stats_low_mem++;
mp = arg;
g_topology_lock();
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
continue;
mtx_lock(&sc->sc_mtx);
for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
nfree--, bp = bp->bio_next) {
/*
* It is safe to free the bio_data, because:
* 1. If bio_data is NULL it will be read from the
* inactive journal.
* 2. If bp is sent down, it is first removed from the
* inactive queue, so it's impossible to free the
* data out from under an in-flight bio.
* On the other hand, freeing elements from the active
* queue is not safe.
*/
if (bp->bio_data != NULL) {
GJ_DEBUG(2, "Freeing data from %s.",
sc->sc_name);
gj_free(bp->bio_data, bp->bio_length);
bp->bio_data = NULL;
}
}
mtx_unlock(&sc->sc_mtx);
if (nfree == 0)
break;
}
g_topology_unlock();
}
static void g_journal_switcher(void *arg);
static void
g_journal_init(struct g_class *mp)
{
/* Pick a conservative value if the provided value is unusable. */
if (g_journal_cache_divisor <= 0 ||
(vm_kmem_size / g_journal_cache_divisor == 0)) {
g_journal_cache_divisor = 5;
}
if (g_journal_cache_limit > 0) {
g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
g_journal_cache_low =
(g_journal_cache_limit / 100) * g_journal_cache_switch;
}
g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
if (g_journal_event_shutdown == NULL)
GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
if (g_journal_event_lowmem == NULL)
GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
}
static void
g_journal_fini(struct g_class *mp)
{
if (g_journal_event_shutdown != NULL) {
EVENTHANDLER_DEREGISTER(shutdown_post_sync,
g_journal_event_shutdown);
}
if (g_journal_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
if (g_journal_switcher_proc != NULL)
g_journal_stop_switcher();
}
DECLARE_GEOM_CLASS(g_journal_class, g_journal);
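/*
* Find the registered file system descriptor matching the given fstype name.
*/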
static const struct g_journal_desc *
g_journal_find_desc(const char *fstype)
{
const struct g_journal_desc *desc;
int i;
for (desc = g_journal_filesystems[i = 0]; desc != NULL;
desc = g_journal_filesystems[++i]) {
if (strcmp(desc->jd_fstype, fstype) == 0)
break;
}
return (desc);
}
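/*
* Request a journal switch from the worker thread and wait, with the softc
* mutex held on entry, until the switch is finished.
*/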
static void
g_journal_switch_wait(struct g_journal_softc *sc)
{
struct bintime bt;
mtx_assert(&sc->sc_mtx, MA_OWNED);
if (g_journal_debug >= 2) {
if (sc->sc_flush_in_progress > 0) {
GJ_DEBUG(2, "%d requests flushing.",
sc->sc_flush_in_progress);
}
if (sc->sc_copy_in_progress > 0) {
GJ_DEBUG(2, "%d requests copying.",
sc->sc_copy_in_progress);
}
if (sc->sc_flush_count > 0) {
GJ_DEBUG(2, "%d requests to flush.",
sc->sc_flush_count);
}
if (sc->sc_delayed_count > 0) {
GJ_DEBUG(2, "%d requests delayed.",
sc->sc_delayed_count);
}
}
g_journal_stats_switches++;
if (sc->sc_copy_in_progress > 0)
g_journal_stats_wait_for_copy++;
GJ_TIMER_START(1, &bt);
sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
sc->sc_flags |= GJF_DEVICE_SWITCH;
wakeup(sc);
while (sc->sc_flags & GJF_DEVICE_SWITCH) {
msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
"gj:switch", 0);
}
GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
}
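/*
* Sync and suspend every mounted gjournal-backed file system, let it mark
* itself clean, perform the journal switch, and finally switch any
* remaining journal devices that have no mounted file system on top.
*/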
static void
g_journal_do_switch(struct g_class *classp)
{
struct g_journal_softc *sc;
const struct g_journal_desc *desc;
struct g_geom *gp;
struct mount *mp;
struct bintime bt;
char *mountpoint;
int error, save;
g_topology_lock();
LIST_FOREACH(gp, &classp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_flags & GJF_DEVICE_DESTROY)
continue;
if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
continue;
mtx_lock(&sc->sc_mtx);
sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
mtx_unlock(&sc->sc_mtx);
}
g_topology_unlock();
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_gjprovider == NULL)
continue;
if (mp->mnt_flag & MNT_RDONLY)
continue;
desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
if (desc == NULL)
continue;
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
continue;
/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
g_topology_lock();
sc = g_journal_find_device(classp, mp->mnt_gjprovider);
g_topology_unlock();
if (sc == NULL) {
GJ_DEBUG(0, "Cannot find journal geom for %s.",
mp->mnt_gjprovider);
goto next;
} else if (JEMPTY(sc)) {
mtx_lock(&sc->sc_mtx);
sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
mtx_unlock(&sc->sc_mtx);
GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
goto next;
}
mountpoint = mp->mnt_stat.f_mntonname;
error = vn_start_write(NULL, &mp, V_WAIT);
if (error != 0) {
GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
mountpoint, error);
goto next;
}
save = curthread_pflags_set(TDP_SYNCIO);
GJ_TIMER_START(1, &bt);
vfs_periodic(mp, MNT_NOWAIT);
GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
GJ_TIMER_START(1, &bt);
error = VFS_SYNC(mp, MNT_NOWAIT);
if (error == 0)
GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
else {
GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
mountpoint, error);
}
curthread_pflags_restore(save);
vn_finished_write(mp);
if (error != 0)
goto next;
/*
* Send BIO_FLUSH before freezing the file system, so it can be
* faster after the freeze.
*/
GJ_TIMER_START(1, &bt);
g_journal_flush_cache(sc);
GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
GJ_TIMER_START(1, &bt);
error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
if (error != 0) {
GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
mountpoint, error);
goto next;
}
error = desc->jd_clean(mp);
if (error != 0)
goto next;
mtx_lock(&sc->sc_mtx);
g_journal_switch_wait(sc);
mtx_unlock(&sc->sc_mtx);
vfs_write_resume(mp, 0);
next:
mtx_lock(&mountlist_mtx);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
sc = NULL;
for (;;) {
g_topology_lock();
LIST_FOREACH(gp, &g_journal_class.geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
mtx_lock(&sc->sc_mtx);
if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
!(sc->sc_flags & GJF_DEVICE_DESTROY) &&
(sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
break;
}
mtx_unlock(&sc->sc_mtx);
sc = NULL;
}
g_topology_unlock();
if (sc == NULL)
break;
mtx_assert(&sc->sc_mtx, MA_OWNED);
g_journal_switch_wait(sc);
mtx_unlock(&sc->sc_mtx);
}
}
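/*
* Start and stop the global journal switcher kernel process.
*/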
static void
g_journal_start_switcher(struct g_class *mp)
{
int error;
g_topology_assert();
MPASS(g_journal_switcher_proc == NULL);
g_journal_switcher_state = GJ_SWITCHER_WORKING;
error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc,
0, 0, "g_journal switcher");
KASSERT(error == 0, ("Cannot create switcher thread."));
}
static void
g_journal_stop_switcher(void)
{
g_topology_assert();
MPASS(g_journal_switcher_proc != NULL);
g_journal_switcher_state = GJ_SWITCHER_DIE;
wakeup(&g_journal_switcher_state);
while (g_journal_switcher_state != GJ_SWITCHER_DIED)
tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
GJ_DEBUG(1, "Switcher died.");
g_journal_switcher_proc = NULL;
}
/*
* TODO: Kill switcher thread on last geom destruction?
*/
static void
g_journal_switcher(void *arg)
{
struct g_class *mp;
struct bintime bt;
int error;
mp = arg;
curthread->td_pflags |= TDP_NORUNNINGBUF;
for (;;) {
g_journal_switcher_wokenup = 0;
error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
g_journal_switch_time * hz);
if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
g_journal_switcher_state = GJ_SWITCHER_DIED;
GJ_DEBUG(1, "Switcher exiting.");
wakeup(&g_journal_switcher_state);
kproc_exit(0);
}
if (error == 0 && g_journal_sync_requested == 0) {
GJ_DEBUG(1, "Out of cache, force switch (used=%jd "
"limit=%jd).", (intmax_t)g_journal_cache_used,
(intmax_t)g_journal_cache_limit);
}
GJ_TIMER_START(1, &bt);
g_journal_do_switch(mp);
GJ_TIMER_STOP(1, &bt, "Entire switch time");
if (g_journal_sync_requested > 0) {
g_journal_sync_requested = 0;
wakeup(&g_journal_sync_requested);
}
}
}
diff --git a/sys/geom/journal/g_journal.h b/sys/geom/journal/g_journal.h
index fa6127537c36..45a6f39f0e53 100644
--- a/sys/geom/journal/g_journal.h
+++ b/sys/geom/journal/g_journal.h
@@ -1,376 +1,376 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _G_JOURNAL_H_
#define _G_JOURNAL_H_
#include <sys/endian.h>
#include <sys/md5.h>
#ifdef _KERNEL
#include <sys/bio.h>
#endif
#define G_JOURNAL_CLASS_NAME "JOURNAL"
#define G_JOURNAL_MAGIC "GEOM::JOURNAL"
/*
* Version history:
* 0 - Initial version number.
*/
#define G_JOURNAL_VERSION 0
#ifdef _KERNEL
extern int g_journal_debug;
#define GJ_DEBUG(lvl, ...) \
_GEOM_DEBUG("GEOM_JOURNAL", g_journal_debug, (lvl), NULL, __VA_ARGS__)
#define GJ_LOGREQ(lvl, bp, ...) \
_GEOM_DEBUG("GEOM_JOURNAL", g_journal_debug, (lvl), (bp), __VA_ARGS__)
#define JEMPTY(sc) ((sc)->sc_journal_offset - \
(sc)->sc_jprovider->sectorsize == \
(sc)->sc_active.jj_offset && \
(sc)->sc_current_count == 0)
#define GJ_BIO_REGULAR 0x00
#define GJ_BIO_READ 0x01
#define GJ_BIO_JOURNAL 0x02
#define GJ_BIO_COPY 0x03
#define GJ_BIO_MASK 0x0f
#if 0
#define GJF_BIO_DONT_FREE 0x10
#define GJF_BIO_MASK 0xf0
#endif
#define GJF_DEVICE_HARDCODED 0x0001
#define GJF_DEVICE_DESTROY 0x0010
#define GJF_DEVICE_SWITCH 0x0020
#define GJF_DEVICE_BEFORE_SWITCH 0x0040
#define GJF_DEVICE_CLEAN 0x0080
#define GJF_DEVICE_CHECKSUM 0x0100
#define GJ_HARD_LIMIT 64
/*
* We keep pointers to journaled data in the bio structure, and because we
* need to store two off_t values (the offset in the data provider and the
* offset in the journal), we borrow the bio_completed field for this.
*/
#define bio_joffset bio_completed
/*
* Use the bio_caller1 field as the next pointer in the queue.
*/
#define bio_next bio_caller1
/*
* There are two such structures maintained inside each journaled device.
* One describes the active part of the journal, where recent requests are
* stored. The second describes the last consistent part of the journal,
* with requests that are copied to the destination provider.
*/
struct g_journal_journal {
struct bio *jj_queue; /* Cached journal entries. */
off_t jj_offset; /* Journal's start offset. */
};
struct g_journal_softc {
uint32_t sc_id;
uint8_t sc_type;
uint8_t sc_orig_type;
struct g_geom *sc_geom;
u_int sc_flags;
struct mtx sc_mtx;
off_t sc_mediasize;
u_int sc_sectorsize;
#define GJ_FLUSH_DATA 0x01
#define GJ_FLUSH_JOURNAL 0x02
u_int sc_bio_flush;
uint32_t sc_journal_id;
uint32_t sc_journal_next_id;
int sc_journal_copying;
off_t sc_journal_offset;
off_t sc_journal_previous_id;
struct bio_queue_head sc_back_queue;
struct bio_queue_head sc_regular_queue;
struct bio_queue_head sc_delayed_queue;
int sc_delayed_count;
struct bio *sc_current_queue;
int sc_current_count;
struct bio *sc_flush_queue;
int sc_flush_count;
int sc_flush_in_progress;
struct bio *sc_copy_queue;
int sc_copy_in_progress;
struct g_consumer *sc_dconsumer;
struct g_consumer *sc_jconsumer;
struct g_journal_journal sc_inactive;
struct g_journal_journal sc_active;
off_t sc_jstart; /* Journal space start offset. */
off_t sc_jend; /* Journal space end offset. */
struct callout sc_callout;
struct proc *sc_worker;
struct root_hold_token *sc_rootmount;
};
#define sc_dprovider sc_dconsumer->provider
#define sc_jprovider sc_jconsumer->provider
#define sc_name sc_dprovider->name
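/*
 * Macros implementing a simple singly-linked queue of bios chained through
 * the bio_next (bio_caller1) pointer.
 */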
#define GJQ_INSERT_HEAD(head, bp) do { \
(bp)->bio_next = (head); \
(head) = (bp); \
} while (0)
#define GJQ_INSERT_AFTER(head, bp, pbp) do { \
if ((pbp) == NULL) \
GJQ_INSERT_HEAD(head, bp); \
else { \
(bp)->bio_next = (pbp)->bio_next; \
(pbp)->bio_next = (bp); \
} \
} while (0)
#define GJQ_LAST(head, bp) do { \
struct bio *_bp; \
\
if ((head) == NULL) { \
(bp) = (head); \
break; \
} \
for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) \
continue; \
(bp) = (_bp); \
} while (0)
#define GJQ_FIRST(head) (head)
#define GJQ_REMOVE(head, bp) do { \
struct bio *_bp; \
\
if ((head) == (bp)) { \
(head) = (bp)->bio_next; \
(bp)->bio_next = NULL; \
break; \
} \
for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) {\
if (_bp->bio_next == (bp)) \
break; \
} \
KASSERT(_bp->bio_next != NULL, ("NULL bio_next")); \
KASSERT(_bp->bio_next == (bp), ("bio_next != bp")); \
_bp->bio_next = (bp)->bio_next; \
(bp)->bio_next = NULL; \
} while (0)
#define GJQ_FOREACH(head, bp) \
for ((bp) = (head); (bp) != NULL; (bp) = (bp)->bio_next)
#define GJ_HEADER_MAGIC "GJHDR"
struct g_journal_header {
char jh_magic[sizeof(GJ_HEADER_MAGIC)];
uint32_t jh_journal_id;
uint32_t jh_journal_next_id;
} __packed;
struct g_journal_entry {
uint64_t je_joffset;
uint64_t je_offset;
uint64_t je_length;
} __packed;
#define GJ_RECORD_HEADER_MAGIC "GJRHDR"
#define GJ_RECORD_HEADER_NENTRIES (20)
#define GJ_RECORD_MAX_SIZE(sc) \
- ((sc)->sc_jprovider->sectorsize + GJ_RECORD_HEADER_NENTRIES * MAXPHYS)
+ ((sc)->sc_jprovider->sectorsize + GJ_RECORD_HEADER_NENTRIES * maxphys)
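/*
 * If a maximum-sized record starting at the given offset would not fit before
 * the end of the journal space, wrap the offset back to the journal's start.
 */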
#define GJ_VALIDATE_OFFSET(offset, sc) do { \
if ((offset) + GJ_RECORD_MAX_SIZE(sc) >= (sc)->sc_jend) { \
(offset) = (sc)->sc_jstart; \
GJ_DEBUG(2, "Starting from the beginning (%s).", \
(sc)->sc_name); \
} \
} while (0)
struct g_journal_record_header {
char jrh_magic[sizeof(GJ_RECORD_HEADER_MAGIC)];
uint32_t jrh_journal_id;
uint16_t jrh_nentries;
u_char jrh_sum[8];
struct g_journal_entry jrh_entries[GJ_RECORD_HEADER_NENTRIES];
} __packed;
typedef int (g_journal_clean_t)(struct mount *mp);
typedef void (g_journal_dirty_t)(struct g_consumer *cp);
struct g_journal_desc {
const char *jd_fstype;
g_journal_clean_t *jd_clean;
g_journal_dirty_t *jd_dirty;
};
/* Supported file systems. */
extern const struct g_journal_desc g_journal_ufs;
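/*
 * Timing helpers: take a timestamp with binuptime() and print the elapsed
 * interval when the debug level is at least 'lvl'.
 */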
#define GJ_TIMER_START(lvl, bt) do { \
if (g_journal_debug >= (lvl)) \
binuptime(bt); \
} while (0)
#define GJ_TIMER_STOP(lvl, bt, ...) do { \
if (g_journal_debug >= (lvl)) { \
struct bintime _bt2; \
struct timeval _tv; \
\
binuptime(&_bt2); \
bintime_sub(&_bt2, bt); \
bintime2timeval(&_bt2, &_tv); \
printf("GEOM_JOURNAL"); \
if (g_journal_debug > 0) \
printf("[%u]", lvl); \
printf(": "); \
printf(__VA_ARGS__); \
printf(": %jd.%06jds\n", (intmax_t)_tv.tv_sec, \
(intmax_t)_tv.tv_usec); \
} \
} while (0)
#endif /* _KERNEL */
#define GJ_TYPE_DATA 0x01
#define GJ_TYPE_JOURNAL 0x02
#define GJ_TYPE_COMPLETE (GJ_TYPE_DATA|GJ_TYPE_JOURNAL)
#define GJ_FLAG_CLEAN 0x01
#define GJ_FLAG_CHECKSUM 0x02
struct g_journal_metadata {
char md_magic[16]; /* Magic value. */
uint32_t md_version; /* Version number. */
uint32_t md_id; /* Journal unique ID. */
uint8_t md_type; /* Provider type. */
uint64_t md_jstart; /* Journal space start offset. */
uint64_t md_jend; /* Journal space end offset. */
uint64_t md_joffset; /* Last known consistent journal offset. */
uint32_t md_jid; /* Last known consistent journal ID. */
uint64_t md_flags; /* Journal flags. */
char md_provider[16]; /* Hardcoded provider. */
uint64_t md_provsize; /* Provider's size. */
u_char md_hash[16]; /* MD5 hash. */
};
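/*
 * Serialize the metadata: fields are stored little-endian at fixed offsets,
 * followed at offset 85 by an MD5 hash of the preceding 85 bytes.
 */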
static __inline void
journal_metadata_encode(struct g_journal_metadata *md, u_char *data)
{
MD5_CTX ctx;
bcopy(md->md_magic, data, 16);
le32enc(data + 16, md->md_version);
le32enc(data + 20, md->md_id);
*(data + 24) = md->md_type;
le64enc(data + 25, md->md_jstart);
le64enc(data + 33, md->md_jend);
le64enc(data + 41, md->md_joffset);
le32enc(data + 49, md->md_jid);
le64enc(data + 53, md->md_flags);
bcopy(md->md_provider, data + 61, 16);
le64enc(data + 77, md->md_provsize);
MD5Init(&ctx);
MD5Update(&ctx, data, 85);
MD5Final(md->md_hash, &ctx);
bcopy(md->md_hash, data + 85, 16);
}
static __inline int
journal_metadata_decode_v0(const u_char *data, struct g_journal_metadata *md)
{
MD5_CTX ctx;
md->md_id = le32dec(data + 20);
md->md_type = *(data + 24);
md->md_jstart = le64dec(data + 25);
md->md_jend = le64dec(data + 33);
md->md_joffset = le64dec(data + 41);
md->md_jid = le32dec(data + 49);
md->md_flags = le64dec(data + 53);
bcopy(data + 61, md->md_provider, 16);
md->md_provsize = le64dec(data + 77);
MD5Init(&ctx);
MD5Update(&ctx, data, 85);
MD5Final(md->md_hash, &ctx);
if (bcmp(md->md_hash, data + 85, 16) != 0)
return (EINVAL);
return (0);
}
static __inline int
journal_metadata_decode(const u_char *data, struct g_journal_metadata *md)
{
int error;
bcopy(data, md->md_magic, 16);
md->md_version = le32dec(data + 16);
switch (md->md_version) {
case 0:
error = journal_metadata_decode_v0(data, md);
break;
default:
error = EINVAL;
break;
}
return (error);
}
static __inline void
journal_metadata_dump(const struct g_journal_metadata *md)
{
static const char hex[] = "0123456789abcdef";
char hash[16 * 2 + 1];
u_int i;
printf(" magic: %s\n", md->md_magic);
printf(" version: %u\n", (u_int)md->md_version);
printf(" id: %u\n", (u_int)md->md_id);
printf(" type: %u\n", (u_int)md->md_type);
printf(" start: %ju\n", (uintmax_t)md->md_jstart);
printf(" end: %ju\n", (uintmax_t)md->md_jend);
printf(" joffset: %ju\n", (uintmax_t)md->md_joffset);
printf(" jid: %u\n", (u_int)md->md_jid);
printf(" flags: %u\n", (u_int)md->md_flags);
printf("hcprovider: %s\n", md->md_provider);
printf(" provsize: %ju\n", (uintmax_t)md->md_provsize);
bzero(hash, sizeof(hash));
for (i = 0; i < 16; i++) {
hash[i * 2] = hex[md->md_hash[i] >> 4];
hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f];
}
printf(" MD5 hash: %s\n", hash);
}
#endif /* !_G_JOURNAL_H_ */
diff --git a/sys/geom/mirror/g_mirror.c b/sys/geom/mirror/g_mirror.c
index 26962cf17551..350845205485 100644
--- a/sys/geom/mirror/g_mirror.c
+++ b/sys/geom/mirror/g_mirror.c
@@ -1,3582 +1,3582 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/eventhandler.h>
#include <sys/fail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/mirror/g_mirror.h>
FEATURE(geom_mirror, "GEOM mirroring support");
static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_MIRROR stuff");
int g_mirror_debug = 0;
SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
"Debug level");
bool g_launch_mirror_before_timeout = true;
SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout,
CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0,
"If false, force gmirror to wait out the full kern.geom.mirror.timeout "
"before launching mirrors");
static u_int g_mirror_timeout = 4;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout,
0, "Time to wait on all mirror components");
static u_int g_mirror_idletime = 5;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN,
&g_mirror_idletime, 0, "Mark components as clean when idling");
static u_int g_mirror_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
&g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_mirror_syncreqs = 2;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
&g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
static u_int g_mirror_sync_period = 5;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_update_period, CTLFLAG_RWTUN,
&g_mirror_sync_period, 0,
"Metadata update period during synchronization, in seconds");
#define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \
G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \
msleep((ident), (mtx), (priority), (wmesg), (timeout)); \
G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \
} while (0)
static eventhandler_tag g_mirror_post_sync = NULL;
static int g_mirror_shutdown = 0;
static g_ctl_destroy_geom_t g_mirror_destroy_geom;
static g_taste_t g_mirror_taste;
static g_init_t g_mirror_init;
static g_fini_t g_mirror_fini;
static g_provgone_t g_mirror_providergone;
static g_resize_t g_mirror_resize;
struct g_class g_mirror_class = {
.name = G_MIRROR_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_mirror_config,
.taste = g_mirror_taste,
.destroy_geom = g_mirror_destroy_geom,
.init = g_mirror_init,
.fini = g_mirror_fini,
.providergone = g_mirror_providergone,
.resize = g_mirror_resize
};
static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
static void g_mirror_update_device(struct g_mirror_softc *sc, bool force);
static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static int g_mirror_refresh_device(struct g_mirror_softc *sc,
const struct g_provider *pp, const struct g_mirror_metadata *md);
static void g_mirror_sync_reinit(const struct g_mirror_disk *disk,
struct bio *bp, off_t offset);
static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
static void g_mirror_register_request(struct g_mirror_softc *sc,
struct bio *bp);
static void g_mirror_sync_release(struct g_mirror_softc *sc);
static const char *
g_mirror_disk_state2str(int state)
{
switch (state) {
case G_MIRROR_DISK_STATE_NONE:
return ("NONE");
case G_MIRROR_DISK_STATE_NEW:
return ("NEW");
case G_MIRROR_DISK_STATE_ACTIVE:
return ("ACTIVE");
case G_MIRROR_DISK_STATE_STALE:
return ("STALE");
case G_MIRROR_DISK_STATE_SYNCHRONIZING:
return ("SYNCHRONIZING");
case G_MIRROR_DISK_STATE_DISCONNECTED:
return ("DISCONNECTED");
case G_MIRROR_DISK_STATE_DESTROY:
return ("DESTROY");
default:
return ("INVALID");
}
}
static const char *
g_mirror_device_state2str(int state)
{
switch (state) {
case G_MIRROR_DEVICE_STATE_STARTING:
return ("STARTING");
case G_MIRROR_DEVICE_STATE_RUNNING:
return ("RUNNING");
default:
return ("INVALID");
}
}
static const char *
g_mirror_get_diskname(struct g_mirror_disk *disk)
{
if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
return ("[unknown]");
return (disk->d_name);
}
/*
* --- Event handling functions ---
* Events in geom_mirror are used to maintain disk and device state
* from a single thread, which simplifies locking.
*/
static void
g_mirror_event_free(struct g_mirror_event *ep)
{
free(ep, M_MIRROR);
}
int
g_mirror_event_send(void *arg, int state, int flags)
{
struct g_mirror_softc *sc;
struct g_mirror_disk *disk;
struct g_mirror_event *ep;
int error;
ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
disk = NULL;
sc = arg;
} else {
disk = arg;
sc = disk->d_softc;
}
ep->e_disk = disk;
ep->e_state = state;
ep->e_flags = flags;
ep->e_error = 0;
mtx_lock(&sc->sc_events_mtx);
TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_events_mtx);
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
mtx_lock(&sc->sc_queue_mtx);
wakeup(sc);
mtx_unlock(&sc->sc_queue_mtx);
if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
return (0);
G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
sx_xunlock(&sc->sc_lock);
while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
mtx_lock(&sc->sc_events_mtx);
MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
hz * 5);
}
error = ep->e_error;
g_mirror_event_free(ep);
sx_xlock(&sc->sc_lock);
return (error);
}
static struct g_mirror_event *
g_mirror_event_first(struct g_mirror_softc *sc)
{
struct g_mirror_event *ep;
mtx_lock(&sc->sc_events_mtx);
ep = TAILQ_FIRST(&sc->sc_events);
mtx_unlock(&sc->sc_events_mtx);
return (ep);
}
static void
g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
{
mtx_lock(&sc->sc_events_mtx);
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_events_mtx);
}
static void
g_mirror_event_cancel(struct g_mirror_disk *disk)
{
struct g_mirror_softc *sc;
struct g_mirror_event *ep, *tmpep;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_events_mtx);
TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
continue;
if (ep->e_disk != disk)
continue;
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
g_mirror_event_free(ep);
else {
ep->e_error = ECANCELED;
wakeup(ep);
}
}
mtx_unlock(&sc->sc_events_mtx);
}
/*
* Return the number of disks in the given state.
* If state is equal to -1, count all connected disks.
*/
u_int
g_mirror_ndisks(struct g_mirror_softc *sc, int state)
{
struct g_mirror_disk *disk;
u_int n = 0;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (state == -1 || disk->d_state == state)
n++;
}
return (n);
}
/*
* Find a disk in the mirror by its disk ID.
*/
static struct g_mirror_disk *
g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
{
struct g_mirror_disk *disk;
sx_assert(&sc->sc_lock, SX_XLOCKED);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_id == id)
return (disk);
}
return (NULL);
}
static u_int
g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
{
struct bio *bp;
u_int nreqs = 0;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(bp, &sc->sc_queue, bio_queue) {
if (bp->bio_from == cp)
nreqs++;
}
mtx_unlock(&sc->sc_queue_mtx);
return (nreqs);
}
static int
g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
{
if (cp->index > 0) {
G_MIRROR_DEBUG(2,
"I/O requests for %s exist, can't destroy it now.",
cp->provider->name);
return (1);
}
if (g_mirror_nrequests(sc, cp) > 0) {
G_MIRROR_DEBUG(2,
"I/O requests for %s in queue, can't destroy it now.",
cp->provider->name);
return (1);
}
return (0);
}
static void
g_mirror_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *cp;
g_topology_assert();
cp = arg;
G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
g_detach(cp);
g_destroy_consumer(cp);
}
static void
g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
{
struct g_provider *pp;
int retaste_wait;
g_topology_assert();
cp->private = NULL;
if (g_mirror_is_busy(sc, cp))
return;
pp = cp->provider;
retaste_wait = 0;
if (cp->acw == 1) {
if ((pp->geom->flags & G_GEOM_WITHER) == 0)
retaste_wait = 1;
}
G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
-cp->acw, -cp->ace, 0);
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
if (retaste_wait) {
/*
* After the retaste event has been sent (inside g_access()), we can
* send an event to detach and destroy the consumer.
* A class that already has a consumer attached to the given provider
* will not receive a retaste event for that provider.
* This is how retaste events are ignored when consumers opened for
* write are closed: the consumer is detached and destroyed after the
* retaste event is sent.
*/
g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
return;
}
G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
g_detach(cp);
g_destroy_consumer(cp);
}
static int
g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
{
struct g_consumer *cp;
int error;
g_topology_assert_not();
KASSERT(disk->d_consumer == NULL,
("Disk already connected (device %s).", disk->d_softc->sc_name));
g_topology_lock();
cp = g_new_consumer(disk->d_softc->sc_geom);
cp->flags |= G_CF_DIRECT_RECEIVE;
error = g_attach(cp, pp);
if (error != 0) {
g_destroy_consumer(cp);
g_topology_unlock();
return (error);
}
error = g_access(cp, 1, 1, 1);
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
g_topology_unlock();
G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
pp->name, error);
return (error);
}
g_topology_unlock();
disk->d_consumer = cp;
disk->d_consumer->private = disk;
disk->d_consumer->index = 0;
G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
return (0);
}
static void
g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
{
g_topology_assert();
if (cp == NULL)
return;
if (cp->provider != NULL)
g_mirror_kill_consumer(sc, cp);
else
g_destroy_consumer(cp);
}
/*
* Initialize a disk: allocate memory, create a consumer, attach it to the
* provider, and open access (r1w1e1) to it.
*/
static struct g_mirror_disk *
g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
struct g_mirror_metadata *md, int *errorp)
{
struct g_mirror_disk *disk;
int i, error;
disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
if (disk == NULL) {
error = ENOMEM;
goto fail;
}
disk->d_softc = sc;
error = g_mirror_connect_disk(disk, pp);
if (error != 0)
goto fail;
disk->d_id = md->md_did;
disk->d_state = G_MIRROR_DISK_STATE_NONE;
disk->d_priority = md->md_priority;
disk->d_flags = md->md_dflags;
error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
if (error == 0 && i != 0)
disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
if (md->md_provider[0] != '\0')
disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
disk->d_sync.ds_consumer = NULL;
disk->d_sync.ds_offset = md->md_sync_offset;
disk->d_sync.ds_offset_done = md->md_sync_offset;
disk->d_sync.ds_update_ts = time_uptime;
disk->d_genid = md->md_genid;
disk->d_sync.ds_syncid = md->md_syncid;
disk->d_init_ndisks = md->md_all;
disk->d_init_slice = md->md_slice;
disk->d_init_balance = md->md_balance;
disk->d_init_mediasize = md->md_mediasize;
if (errorp != NULL)
*errorp = 0;
return (disk);
fail:
if (errorp != NULL)
*errorp = error;
if (disk != NULL)
free(disk, M_MIRROR);
return (NULL);
}
static void
g_mirror_destroy_disk(struct g_mirror_disk *disk)
{
struct g_mirror_softc *sc;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
g_topology_lock();
LIST_REMOVE(disk, d_next);
g_topology_unlock();
g_mirror_event_cancel(disk);
if (sc->sc_hint == disk)
sc->sc_hint = NULL;
switch (disk->d_state) {
case G_MIRROR_DISK_STATE_SYNCHRONIZING:
g_mirror_sync_stop(disk, 1);
/* FALLTHROUGH */
case G_MIRROR_DISK_STATE_NEW:
case G_MIRROR_DISK_STATE_STALE:
case G_MIRROR_DISK_STATE_ACTIVE:
g_topology_lock();
g_mirror_disconnect_consumer(sc, disk->d_consumer);
g_topology_unlock();
free(disk, M_MIRROR);
break;
default:
KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
}
}
static void
g_mirror_free_device(struct g_mirror_softc *sc)
{
g_topology_assert();
mtx_destroy(&sc->sc_queue_mtx);
mtx_destroy(&sc->sc_events_mtx);
mtx_destroy(&sc->sc_done_mtx);
sx_destroy(&sc->sc_lock);
free(sc, M_MIRROR);
}
static void
g_mirror_providergone(struct g_provider *pp)
{
struct g_mirror_softc *sc = pp->private;
if ((--sc->sc_refcnt) == 0)
g_mirror_free_device(sc);
}
static void
g_mirror_destroy_device(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
struct g_mirror_event *ep;
struct g_geom *gp;
struct g_consumer *cp, *tmpcp;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
gp = sc->sc_geom;
if (sc->sc_provider != NULL)
g_mirror_destroy_provider(sc);
for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
disk = LIST_FIRST(&sc->sc_disks)) {
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
g_mirror_update_metadata(disk);
g_mirror_destroy_disk(disk);
}
while ((ep = g_mirror_event_first(sc)) != NULL) {
g_mirror_event_remove(sc, ep);
if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
g_mirror_event_free(ep);
else {
ep->e_error = ECANCELED;
ep->e_flags |= G_MIRROR_EVENT_DONE;
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
mtx_lock(&sc->sc_events_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_events_mtx);
}
}
callout_drain(&sc->sc_callout);
g_topology_lock();
LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
g_mirror_disconnect_consumer(sc, cp);
}
g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
g_wither_geom(gp, ENXIO);
sx_xunlock(&sc->sc_lock);
if ((--sc->sc_refcnt) == 0)
g_mirror_free_device(sc);
g_topology_unlock();
}
static void
g_mirror_orphan(struct g_consumer *cp)
{
struct g_mirror_disk *disk;
g_topology_assert();
disk = cp->private;
if (disk == NULL)
return;
disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
G_MIRROR_EVENT_DONTWAIT);
}
/*
* Return the next active disk on the list.
* It may be the same disk that was passed in.
* If there are no active disks on the list, NULL is returned.
*/
static __inline struct g_mirror_disk *
g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
{
struct g_mirror_disk *dp;
for (dp = LIST_NEXT(disk, d_next); dp != disk;
dp = LIST_NEXT(dp, d_next)) {
if (dp == NULL)
dp = LIST_FIRST(&sc->sc_disks);
if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
break;
}
if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
return (NULL);
return (dp);
}
static struct g_mirror_disk *
g_mirror_get_disk(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
if (sc->sc_hint == NULL) {
sc->sc_hint = LIST_FIRST(&sc->sc_disks);
if (sc->sc_hint == NULL)
return (NULL);
}
disk = sc->sc_hint;
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
disk = g_mirror_find_next(sc, disk);
if (disk == NULL)
return (NULL);
}
sc->sc_hint = g_mirror_find_next(sc, disk);
return (disk);
}
static int
g_mirror_write_metadata(struct g_mirror_disk *disk,
struct g_mirror_metadata *md)
{
struct g_mirror_softc *sc;
struct g_consumer *cp;
off_t offset, length;
u_char *sector;
int error = 0;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
cp = disk->d_consumer;
KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
cp->acw, cp->ace));
length = cp->provider->sectorsize;
offset = cp->provider->mediasize - length;
sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
if (md != NULL &&
(sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
/*
* Handle the case when the size of the parent provider was reduced.
*/
if (offset < md->md_mediasize)
error = ENOSPC;
else
mirror_metadata_encode(md, sector);
}
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error);
if (error == 0)
error = g_write_data(cp, offset, sector, length);
free(sector, M_MIRROR);
if (error != 0) {
if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
"(device=%s, error=%d).",
g_mirror_get_diskname(disk), sc->sc_name, error);
} else {
G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
"(device=%s, error=%d).",
g_mirror_get_diskname(disk), sc->sc_name, error);
}
if (g_mirror_disconnect_on_failure &&
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
g_mirror_event_send(disk,
G_MIRROR_DISK_STATE_DISCONNECTED,
G_MIRROR_EVENT_DONTWAIT);
}
}
return (error);
}
static int
g_mirror_clear_metadata(struct g_mirror_disk *disk)
{
int error;
g_topology_assert_not();
sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
return (0);
error = g_mirror_write_metadata(disk, NULL);
if (error == 0) {
G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
g_mirror_get_diskname(disk));
} else {
G_MIRROR_DEBUG(0,
"Cannot clear metadata on disk %s (error=%d).",
g_mirror_get_diskname(disk), error);
}
return (error);
}
void
g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
struct g_mirror_metadata *md)
{
strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
md->md_version = G_MIRROR_VERSION;
strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
md->md_mid = sc->sc_id;
md->md_all = sc->sc_ndisks;
md->md_slice = sc->sc_slice;
md->md_balance = sc->sc_balance;
md->md_genid = sc->sc_genid;
md->md_mediasize = sc->sc_mediasize;
md->md_sectorsize = sc->sc_sectorsize;
md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
bzero(md->md_provider, sizeof(md->md_provider));
if (disk == NULL) {
md->md_did = arc4random();
md->md_priority = 0;
md->md_syncid = 0;
md->md_dflags = 0;
md->md_sync_offset = 0;
md->md_provsize = 0;
} else {
md->md_did = disk->d_id;
md->md_priority = disk->d_priority;
md->md_syncid = disk->d_sync.ds_syncid;
md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
md->md_sync_offset = disk->d_sync.ds_offset_done;
else
md->md_sync_offset = 0;
if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) {
strlcpy(md->md_provider,
disk->d_consumer->provider->name,
sizeof(md->md_provider));
}
md->md_provsize = disk->d_consumer->provider->mediasize;
}
}
void
g_mirror_update_metadata(struct g_mirror_disk *disk)
{
struct g_mirror_softc *sc;
struct g_mirror_metadata md;
int error;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
return;
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
g_mirror_fill_metadata(sc, disk, &md);
error = g_mirror_write_metadata(disk, &md);
if (error == 0) {
G_MIRROR_DEBUG(2, "Metadata on %s updated.",
g_mirror_get_diskname(disk));
} else {
G_MIRROR_DEBUG(0,
"Cannot update metadata on disk %s (error=%d).",
g_mirror_get_diskname(disk), error);
}
}
static void
g_mirror_bump_syncid(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
("%s called with no active disks (device=%s).", __func__,
sc->sc_name));
sc->sc_syncid++;
G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
sc->sc_syncid);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
disk->d_sync.ds_syncid = sc->sc_syncid;
g_mirror_update_metadata(disk);
}
}
}
static void
g_mirror_bump_genid(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
("%s called with no active disks (device=%s).", __func__,
sc->sc_name));
sc->sc_genid++;
G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
sc->sc_genid);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
disk->d_genid = sc->sc_genid;
g_mirror_update_metadata(disk);
}
}
}
static int
g_mirror_idle(struct g_mirror_softc *sc, int acw)
{
struct g_mirror_disk *disk;
int timeout;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if (sc->sc_provider == NULL)
return (0);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
return (0);
if (sc->sc_idle)
return (0);
if (sc->sc_writes > 0)
return (0);
if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write);
if (!g_mirror_shutdown && timeout > 0)
return (timeout);
}
sc->sc_idle = 1;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
continue;
G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
g_mirror_get_diskname(disk), sc->sc_name);
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
g_mirror_update_metadata(disk);
}
return (0);
}
static void
g_mirror_unidle(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
return;
sc->sc_idle = 0;
sc->sc_last_write = time_uptime;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
continue;
G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
g_mirror_get_diskname(disk), sc->sc_name);
disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
g_mirror_update_metadata(disk);
}
}
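/*
 * Completion callback for regular requests: tag the bio and hand it back to
 * the worker thread via the main queue.
 */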
static void
g_mirror_done(struct bio *bp)
{
struct g_mirror_softc *sc;
sc = bp->bio_from->geom->softc;
bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
}
static void
g_mirror_regular_request_error(struct g_mirror_softc *sc,
struct g_mirror_disk *disk, struct bio *bp)
{
if ((bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) &&
bp->bio_error == EOPNOTSUPP)
return;
if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).",
bp->bio_error);
} else {
G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).",
bp->bio_error);
}
if (g_mirror_disconnect_on_failure &&
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
if (bp->bio_error == ENXIO &&
bp->bio_cmd == BIO_READ)
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
else if (bp->bio_error == ENXIO)
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW;
else
sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
G_MIRROR_EVENT_DONTWAIT);
}
}
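/*
 * Handle completion of a single cloned regular request: account for errors,
 * and once all clones have completed, deliver the parent bio. A failed read
 * is requeued for another attempt as long as more than one active disk
 * remains.
 */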
static void
g_mirror_regular_request(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk;
struct bio *pbp;
g_topology_assert_not();
KASSERT(sc->sc_provider == bp->bio_parent->bio_to,
("regular request %p with unexpected origin", bp));
pbp = bp->bio_parent;
bp->bio_from->index--;
if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE)
sc->sc_writes--;
disk = bp->bio_from->private;
if (disk == NULL) {
g_topology_lock();
g_mirror_kill_consumer(sc, bp->bio_from);
g_topology_unlock();
}
switch (bp->bio_cmd) {
case BIO_READ:
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read,
bp->bio_error);
break;
case BIO_WRITE:
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write,
bp->bio_error);
break;
case BIO_DELETE:
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_delete,
bp->bio_error);
break;
case BIO_FLUSH:
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_flush,
bp->bio_error);
break;
case BIO_SPEEDUP:
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_speedup,
bp->bio_error);
break;
}
pbp->bio_inbed++;
KASSERT(pbp->bio_inbed <= pbp->bio_children,
("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
pbp->bio_children));
if (bp->bio_error == 0 && pbp->bio_error == 0) {
G_MIRROR_LOGREQ(3, bp, "Request delivered.");
g_destroy_bio(bp);
if (pbp->bio_children == pbp->bio_inbed) {
G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
pbp->bio_completed = pbp->bio_length;
if (pbp->bio_cmd == BIO_WRITE ||
pbp->bio_cmd == BIO_DELETE) {
TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue);
/* Release delayed sync requests if possible. */
g_mirror_sync_release(sc);
}
g_io_deliver(pbp, pbp->bio_error);
}
return;
} else if (bp->bio_error != 0) {
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
if (disk != NULL)
g_mirror_regular_request_error(sc, disk, bp);
switch (pbp->bio_cmd) {
case BIO_DELETE:
case BIO_WRITE:
case BIO_FLUSH:
case BIO_SPEEDUP:
pbp->bio_inbed--;
pbp->bio_children--;
break;
}
}
g_destroy_bio(bp);
switch (pbp->bio_cmd) {
case BIO_READ:
if (pbp->bio_inbed < pbp->bio_children)
break;
if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
g_io_deliver(pbp, pbp->bio_error);
else {
pbp->bio_error = 0;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_INSERT_TAIL(&sc->sc_queue, pbp, bio_queue);
mtx_unlock(&sc->sc_queue_mtx);
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
wakeup(sc);
}
break;
case BIO_DELETE:
case BIO_WRITE:
case BIO_FLUSH:
case BIO_SPEEDUP:
if (pbp->bio_children == 0) {
/*
* All requests failed.
*/
} else if (pbp->bio_inbed < pbp->bio_children) {
/* Do nothing. */
break;
} else if (pbp->bio_children == pbp->bio_inbed) {
/* Some requests succeeded. */
pbp->bio_error = 0;
pbp->bio_completed = pbp->bio_length;
}
if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) {
TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue);
/* Release delayed sync requests if possible. */
g_mirror_sync_release(sc);
}
g_io_deliver(pbp, pbp->bio_error);
break;
default:
KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
break;
}
}
static void
g_mirror_sync_done(struct bio *bp)
{
struct g_mirror_softc *sc;
G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
sc = bp->bio_from->geom->softc;
bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
}
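/*
 * Report GEOM::candelete as true if at least one component supports
 * BIO_DELETE.
 */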
static void
g_mirror_candelete(struct bio *bp)
{
struct g_mirror_softc *sc;
struct g_mirror_disk *disk;
int val;
sc = bp->bio_to->private;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE)
break;
}
val = disk != NULL;
g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
}
static void
g_mirror_kernel_dump(struct bio *bp)
{
struct g_mirror_softc *sc;
struct g_mirror_disk *disk;
struct bio *cbp;
struct g_kerneldump *gkd;
/*
* We configure dumping to the first component, because this component
* will be used for reading with the 'prefer' balance algorithm.
* If the component with the highest priority is currently disconnected,
* we will not be able to read the dump after the reboot if it is
* connected and synchronized later. Can we do something better?
*/
sc = bp->bio_to->private;
disk = LIST_FIRST(&sc->sc_disks);
gkd = (struct g_kerneldump *)bp->bio_data;
if (gkd->length > bp->bio_to->mediasize)
gkd->length = bp->bio_to->mediasize;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
cbp->bio_done = g_std_done;
g_io_request(cbp, disk->d_consumer);
G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
g_mirror_get_diskname(disk));
}
static void
g_mirror_start(struct bio *bp)
{
struct g_mirror_softc *sc;
sc = bp->bio_to->private;
/*
* If sc == NULL or there are no valid disks, the provider's error
* should be set and g_mirror_start() should not be called at all.
*/
KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Provider's error should be set (error=%d)(mirror=%s).",
bp->bio_to->error, bp->bio_to->name));
G_MIRROR_LOGREQ(3, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
case BIO_SPEEDUP:
case BIO_FLUSH:
break;
case BIO_GETATTR:
if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
g_mirror_candelete(bp);
return;
} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
g_mirror_kernel_dump(bp);
return;
}
/* FALLTHROUGH */
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
mtx_lock(&sc->sc_queue_mtx);
if (bp->bio_to->error != 0) {
mtx_unlock(&sc->sc_queue_mtx);
g_io_deliver(bp, bp->bio_to->error);
return;
}
TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
mtx_unlock(&sc->sc_queue_mtx);
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
wakeup(sc);
}
/*
* Return TRUE if the given request is colliding with an in-progress
* synchronization request.
*/
static bool
g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk;
struct bio *sbp;
off_t rstart, rend, sstart, send;
u_int i;
if (sc->sc_sync.ds_ndisks == 0)
return (false);
rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
continue;
for (i = 0; i < g_mirror_syncreqs; i++) {
sbp = disk->d_sync.ds_bios[i];
if (sbp == NULL)
continue;
sstart = sbp->bio_offset;
send = sbp->bio_offset + sbp->bio_length;
if (rend > sstart && rstart < send)
return (true);
}
}
return (false);
}
/*
* Return TRUE if the given sync request is colliding with an in-progress regular
* request.
*/
static bool
g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
{
off_t rstart, rend, sstart, send;
struct bio *bp;
if (sc->sc_sync.ds_ndisks == 0)
return (false);
sstart = sbp->bio_offset;
send = sbp->bio_offset + sbp->bio_length;
TAILQ_FOREACH(bp, &sc->sc_inflight, bio_queue) {
rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length;
if (rend > sstart && rstart < send)
return (true);
}
return (false);
}
/*
* Put a regular request onto the delayed queue.
*/
static void
g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
{
G_MIRROR_LOGREQ(2, bp, "Delaying request.");
TAILQ_INSERT_TAIL(&sc->sc_regular_delayed, bp, bio_queue);
}
/*
* Put a synchronization request onto the delayed queue.
*/
static void
g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
{
G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
TAILQ_INSERT_TAIL(&sc->sc_sync_delayed, bp, bio_queue);
}
/*
* Requeue delayed regular requests.
*/
static void
g_mirror_regular_release(struct g_mirror_softc *sc)
{
struct bio *bp;
if ((bp = TAILQ_FIRST(&sc->sc_regular_delayed)) == NULL)
return;
if (g_mirror_sync_collision(sc, bp))
return;
G_MIRROR_DEBUG(2, "Requeuing regular requests after collision.");
mtx_lock(&sc->sc_queue_mtx);
TAILQ_CONCAT(&sc->sc_regular_delayed, &sc->sc_queue, bio_queue);
TAILQ_SWAP(&sc->sc_regular_delayed, &sc->sc_queue, bio, bio_queue);
mtx_unlock(&sc->sc_queue_mtx);
}
/*
* Release delayed sync requests that no longer collide with regular
* requests.
*/
static void
g_mirror_sync_release(struct g_mirror_softc *sc)
{
struct bio *bp, *bp2;
TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed, bio_queue, bp2) {
if (g_mirror_regular_collision(sc, bp))
continue;
TAILQ_REMOVE(&sc->sc_sync_delayed, bp, bio_queue);
G_MIRROR_LOGREQ(2, bp,
"Releasing delayed synchronization request.");
g_io_request(bp, bp->bio_from);
}
}
/*
* Free a synchronization request and clear its slot in the array.
*/
static void
g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp)
{
int idx;
if (disk != NULL && disk->d_sync.ds_bios != NULL) {
idx = (int)(uintptr_t)bp->bio_caller1;
KASSERT(disk->d_sync.ds_bios[idx] == bp,
("unexpected sync BIO at %p:%d", disk, idx));
disk->d_sync.ds_bios[idx] = NULL;
}
free(bp->bio_data, M_MIRROR);
g_destroy_bio(bp);
}
/*
* Handle synchronization requests.
* Every synchronization request is a two-step process: first, a read request is
* sent to the mirror provider via the sync consumer. If that request completes
* successfully, it is converted to a write and sent to the disk being
* synchronized. If the write also completes successfully, the synchronization
* offset is advanced and a new read request is submitted.
*/
static void
g_mirror_sync_request(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk;
struct g_mirror_disk_sync *sync;
KASSERT((bp->bio_cmd == BIO_READ &&
bp->bio_from->geom == sc->sc_sync.ds_geom) ||
(bp->bio_cmd == BIO_WRITE && bp->bio_from->geom == sc->sc_geom),
("Sync BIO %p with unexpected origin", bp));
bp->bio_from->index--;
disk = bp->bio_from->private;
if (disk == NULL) {
sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
g_topology_lock();
g_mirror_kill_consumer(sc, bp->bio_from);
g_topology_unlock();
g_mirror_sync_request_free(NULL, bp);
sx_xlock(&sc->sc_lock);
return;
}
sync = &disk->d_sync;
/*
* Synchronization request.
*/
switch (bp->bio_cmd) {
case BIO_READ: {
struct g_consumer *cp;
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read,
bp->bio_error);
if (bp->bio_error != 0) {
G_MIRROR_LOGREQ(0, bp,
"Synchronization request failed (error=%d).",
bp->bio_error);
/*
* The read error will trigger a syncid bump, so there's
* no need to do that here.
*
* The read error handling for regular requests will
* retry the read from all active mirrors before passing
* the error back up, so there's no need to retry here.
*/
g_mirror_sync_request_free(disk, bp);
g_mirror_event_send(disk,
G_MIRROR_DISK_STATE_DISCONNECTED,
G_MIRROR_EVENT_DONTWAIT);
return;
}
G_MIRROR_LOGREQ(3, bp,
"Synchronization request half-finished.");
bp->bio_cmd = BIO_WRITE;
bp->bio_cflags = 0;
cp = disk->d_consumer;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
cp->index++;
g_io_request(bp, cp);
return;
}
case BIO_WRITE: {
off_t offset;
int i;
KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write,
bp->bio_error);
if (bp->bio_error != 0) {
G_MIRROR_LOGREQ(0, bp,
"Synchronization request failed (error=%d).",
bp->bio_error);
g_mirror_sync_request_free(disk, bp);
sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
g_mirror_event_send(disk,
G_MIRROR_DISK_STATE_DISCONNECTED,
G_MIRROR_EVENT_DONTWAIT);
return;
}
G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
if (sync->ds_offset >= sc->sc_mediasize ||
sync->ds_consumer == NULL ||
(sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
/* Don't send more synchronization requests. */
sync->ds_inflight--;
g_mirror_sync_request_free(disk, bp);
if (sync->ds_inflight > 0)
return;
if (sync->ds_consumer == NULL ||
(sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
return;
}
/* Disk up-to-date, activate it. */
g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
G_MIRROR_EVENT_DONTWAIT);
return;
}
/* Send next synchronization request. */
g_mirror_sync_reinit(disk, bp, sync->ds_offset);
sync->ds_offset += bp->bio_length;
G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
sync->ds_consumer->index++;
/*
* Delay the request if it is colliding with a regular request.
*/
if (g_mirror_regular_collision(sc, bp))
g_mirror_sync_delay(sc, bp);
else
g_io_request(bp, sync->ds_consumer);
/* Requeue delayed requests if possible. */
g_mirror_regular_release(sc);
/* Find the smallest offset */
offset = sc->sc_mediasize;
for (i = 0; i < g_mirror_syncreqs; i++) {
bp = sync->ds_bios[i];
if (bp != NULL && bp->bio_offset < offset)
offset = bp->bio_offset;
}
if (g_mirror_sync_period > 0 &&
time_uptime - sync->ds_update_ts > g_mirror_sync_period) {
sync->ds_offset_done = offset;
g_mirror_update_metadata(disk);
sync->ds_update_ts = time_uptime;
}
return;
}
default:
panic("Invalid I/O request %p", bp);
}
}
static void
g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk;
struct g_consumer *cp;
struct bio *cbp;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
break;
}
if (disk == NULL) {
if (bp->bio_error == 0)
bp->bio_error = ENXIO;
g_io_deliver(bp, bp->bio_error);
return;
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
/*
* Fill in the component buf structure.
*/
cp = disk->d_consumer;
cbp->bio_done = g_mirror_done;
cbp->bio_to = cp->provider;
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
cp->acw, cp->ace));
cp->index++;
g_io_request(cbp, cp);
}
static void
g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk;
struct g_consumer *cp;
struct bio *cbp;
disk = g_mirror_get_disk(sc);
if (disk == NULL) {
if (bp->bio_error == 0)
bp->bio_error = ENXIO;
g_io_deliver(bp, bp->bio_error);
return;
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
/*
* Fill in the component buf structure.
*/
cp = disk->d_consumer;
cbp->bio_done = g_mirror_done;
cbp->bio_to = cp->provider;
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
cp->acw, cp->ace));
cp->index++;
g_io_request(cbp, cp);
}
#define TRACK_SIZE (1 * 1024 * 1024)
#define LOAD_SCALE 256
#define ABS(x) (((x) >= 0) ? (x) : (-(x)))
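/*
 * LOAD balance: direct the read to the active disk with the lowest load
 * score. A disk whose last known head position exactly matches the request
 * offset is strongly preferred; one within TRACK_SIZE of it is mildly
 * preferred. Each disk's load is an exponential moving average of its
 * outstanding request count scaled by LOAD_SCALE:
 * load = (index * LOAD_SCALE + load * 7) / 8.
 */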
static void
g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
{
struct g_mirror_disk *disk, *dp;
struct g_consumer *cp;
struct bio *cbp;
int prio, best;
/* Find a disk with the smallest load. */
disk = NULL;
best = INT_MAX;
LIST_FOREACH(dp, &sc->sc_disks, d_next) {
if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
continue;
prio = dp->load;
/* If disk head is precisely in position - highly prefer it. */
if (dp->d_last_offset == bp->bio_offset)
prio -= 2 * LOAD_SCALE;
else
/* If disk head is close to position - prefer it. */
if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
prio -= 1 * LOAD_SCALE;
if (prio <= best) {
disk = dp;
best = prio;
}
}
KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
cbp = g_clone_bio(bp);
if (cbp == NULL) {
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
/*
* Fill in the component buf structure.
*/
cp = disk->d_consumer;
cbp->bio_done = g_mirror_done;
cbp->bio_to = cp->provider;
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
cp->acw, cp->ace));
cp->index++;
/* Remember last head position */
disk->d_last_offset = bp->bio_offset + bp->bio_length;
/* Update loads. */
LIST_FOREACH(dp, &sc->sc_disks, d_next) {
dp->load = (dp->d_consumer->index * LOAD_SCALE +
dp->load * 7) / 8;
}
g_io_request(cbp, cp);
}
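/*
 * SPLIT balance: requests no larger than the configured slice size are served
 * round-robin; larger requests are split into roughly equal, sector-aligned
 * pieces spread across all active disks.
 */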
static void
g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
{
struct bio_queue queue;
struct g_mirror_disk *disk;
struct g_consumer *cp;
struct bio *cbp;
off_t left, mod, offset, slice;
u_char *data;
u_int ndisks;
if (bp->bio_length <= sc->sc_slice) {
g_mirror_request_round_robin(sc, bp);
return;
}
ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
slice = bp->bio_length / ndisks;
mod = slice % sc->sc_provider->sectorsize;
if (mod != 0)
slice += sc->sc_provider->sectorsize - mod;
/*
* Allocate all bios before sending any request, so we can
* return ENOMEM in a nice and clean way.
*/
left = bp->bio_length;
offset = bp->bio_offset;
data = bp->bio_data;
TAILQ_INIT(&queue);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
cbp->bio_done = g_mirror_done;
cbp->bio_caller1 = disk;
cbp->bio_to = disk->d_consumer->provider;
cbp->bio_offset = offset;
cbp->bio_data = data;
cbp->bio_length = MIN(left, slice);
left -= cbp->bio_length;
if (left == 0)
break;
offset += cbp->bio_length;
data += cbp->bio_length;
}
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
disk = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
cp = disk->d_consumer;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
disk->d_consumer->index++;
g_io_request(cbp, disk->d_consumer);
}
}
static void
g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp)
{
struct bio_queue queue;
struct bio *cbp;
struct g_consumer *cp;
struct g_mirror_disk *disk;
sx_assert(&sc->sc_lock, SA_XLOCKED);
/*
* To avoid ordering issues, if a write is deferred because of a
* collision with a sync request, all I/O is deferred until that
* write is initiated.
*/
if (bp->bio_from->geom != sc->sc_sync.ds_geom &&
!TAILQ_EMPTY(&sc->sc_regular_delayed)) {
g_mirror_regular_delay(sc, bp);
return;
}
switch (bp->bio_cmd) {
case BIO_READ:
switch (sc->sc_balance) {
case G_MIRROR_BALANCE_LOAD:
g_mirror_request_load(sc, bp);
break;
case G_MIRROR_BALANCE_PREFER:
g_mirror_request_prefer(sc, bp);
break;
case G_MIRROR_BALANCE_ROUND_ROBIN:
g_mirror_request_round_robin(sc, bp);
break;
case G_MIRROR_BALANCE_SPLIT:
g_mirror_request_split(sc, bp);
break;
}
return;
case BIO_WRITE:
case BIO_DELETE:
/*
* Delay the request if it is colliding with a synchronization
* request.
*/
if (g_mirror_sync_collision(sc, bp)) {
g_mirror_regular_delay(sc, bp);
return;
}
if (sc->sc_idle)
g_mirror_unidle(sc);
else
sc->sc_last_write = time_uptime;
/*
* Bump syncid on first write.
*/
if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
g_mirror_bump_syncid(sc);
}
/*
* Allocate all bios before sending any request, so we can
* return ENOMEM in a nice and clean way.
*/
TAILQ_INIT(&queue);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
switch (disk->d_state) {
case G_MIRROR_DISK_STATE_ACTIVE:
break;
case G_MIRROR_DISK_STATE_SYNCHRONIZING:
if (bp->bio_offset >= disk->d_sync.ds_offset)
continue;
break;
default:
continue;
}
if (bp->bio_cmd == BIO_DELETE &&
(disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
cbp->bio_done = g_mirror_done;
cp = disk->d_consumer;
cbp->bio_caller1 = cp;
cbp->bio_to = cp->provider;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).",
cp->provider->name, cp->acr, cp->acw, cp->ace));
}
if (TAILQ_EMPTY(&queue)) {
KASSERT(bp->bio_cmd == BIO_DELETE,
("No consumers for regular request %p", bp));
g_io_deliver(bp, EOPNOTSUPP);
return;
}
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
TAILQ_REMOVE(&queue, cbp, bio_queue);
cp = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
cp->index++;
sc->sc_writes++;
g_io_request(cbp, cp);
}
/*
* Put the request onto the inflight queue, so we can check whether new
* synchronization requests collide with it.
*/
TAILQ_INSERT_TAIL(&sc->sc_inflight, bp, bio_queue);
return;
case BIO_SPEEDUP:
case BIO_FLUSH:
TAILQ_INIT(&queue);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
cbp->bio_done = g_mirror_done;
cbp->bio_caller1 = disk;
cbp->bio_to = disk->d_consumer->provider;
}
KASSERT(!TAILQ_EMPTY(&queue),
("No consumers for regular request %p", bp));
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
G_MIRROR_LOGREQ(3, cbp, "Sending request.");
TAILQ_REMOVE(&queue, cbp, bio_queue);
disk = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
cp = disk->d_consumer;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
cp->index++;
g_io_request(cbp, cp);
}
break;
default:
KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
bp->bio_cmd, sc->sc_name));
break;
}
}
static int
g_mirror_can_destroy(struct g_mirror_softc *sc)
{
struct g_geom *gp;
struct g_consumer *cp;
g_topology_assert();
gp = sc->sc_geom;
if (gp->softc == NULL)
return (1);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
return (0);
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (g_mirror_is_busy(sc, cp))
return (0);
}
gp = sc->sc_sync.ds_geom;
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (g_mirror_is_busy(sc, cp))
return (0);
}
G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
sc->sc_name);
return (1);
}
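/*
 * Release the root mount hold and destroy the device if it is no longer
 * busy.  Returns 1 if destruction proceeded (directly, or by waking the
 * worker when draining), 0 if the device is still in use.
 */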
static int
g_mirror_try_destroy(struct g_mirror_softc *sc)
{
if (sc->sc_rootmount != NULL) {
G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
g_topology_lock();
if (!g_mirror_can_destroy(sc)) {
g_topology_unlock();
return (0);
}
sc->sc_geom->softc = NULL;
sc->sc_sync.ds_geom->softc = NULL;
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) {
g_topology_unlock();
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
&sc->sc_worker);
/* Unlock sc_lock here, as it can be destroyed after wakeup. */
sx_xunlock(&sc->sc_lock);
wakeup(&sc->sc_worker);
sc->sc_worker = NULL;
} else {
g_topology_unlock();
g_mirror_destroy_device(sc);
}
return (1);
}
/*
* Worker thread.
*/
static void
g_mirror_worker(void *arg)
{
struct g_mirror_softc *sc;
struct g_mirror_event *ep;
struct bio *bp;
int timeout;
sc = arg;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
sx_xlock(&sc->sc_lock);
for (;;) {
G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
/*
* First take a look at events.
* This is important to handle events before any I/O requests.
*/
ep = g_mirror_event_first(sc);
if (ep != NULL) {
g_mirror_event_remove(sc, ep);
if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
/* Update only device status. */
G_MIRROR_DEBUG(3,
"Running event for device %s.",
sc->sc_name);
ep->e_error = 0;
g_mirror_update_device(sc, true);
} else {
/* Update disk status. */
G_MIRROR_DEBUG(3, "Running event for disk %s.",
g_mirror_get_diskname(ep->e_disk));
ep->e_error = g_mirror_update_disk(ep->e_disk,
ep->e_state);
if (ep->e_error == 0)
g_mirror_update_device(sc, false);
}
if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
KASSERT(ep->e_error == 0,
("Error cannot be handled."));
g_mirror_event_free(ep);
} else {
ep->e_flags |= G_MIRROR_EVENT_DONE;
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
ep);
mtx_lock(&sc->sc_events_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_events_mtx);
}
if ((sc->sc_flags &
G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
if (g_mirror_try_destroy(sc)) {
curthread->td_pflags &= ~TDP_GEOM;
G_MIRROR_DEBUG(1, "Thread exiting.");
kproc_exit(0);
}
}
G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
continue;
}
/*
* Check if we can mark the array as CLEAN and, if we cannot,
* how many seconds we should wait before checking again.
*/
timeout = g_mirror_idle(sc, -1);
/*
* Handle I/O requests.
*/
mtx_lock(&sc->sc_queue_mtx);
bp = TAILQ_FIRST(&sc->sc_queue);
if (bp != NULL)
TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
else {
if ((sc->sc_flags &
G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
mtx_unlock(&sc->sc_queue_mtx);
if (g_mirror_try_destroy(sc)) {
curthread->td_pflags &= ~TDP_GEOM;
G_MIRROR_DEBUG(1, "Thread exiting.");
kproc_exit(0);
}
mtx_lock(&sc->sc_queue_mtx);
if (!TAILQ_EMPTY(&sc->sc_queue)) {
mtx_unlock(&sc->sc_queue_mtx);
continue;
}
}
if (g_mirror_event_first(sc) != NULL) {
mtx_unlock(&sc->sc_queue_mtx);
continue;
}
sx_xunlock(&sc->sc_lock);
MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
timeout * hz);
sx_xlock(&sc->sc_lock);
G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
continue;
}
mtx_unlock(&sc->sc_queue_mtx);
if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
(bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
/*
* Handle completion of the first half (the read) of a
* block synchronization operation.
*/
g_mirror_sync_request(sc, bp);
} else if (bp->bio_to != sc->sc_provider) {
if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
/*
* Handle completion of a regular I/O request.
*/
g_mirror_regular_request(sc, bp);
else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
/*
* Handle completion of the second half (the
* write) of a block synchronization operation.
*/
g_mirror_sync_request(sc, bp);
else {
KASSERT(0,
("Invalid request cflags=0x%hx to=%s.",
bp->bio_cflags, bp->bio_to->name));
}
} else {
/*
* Initiate an I/O request.
*/
g_mirror_register_request(sc, bp);
}
G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
}
}
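/*
 * Keep the disk's DIRTY flag in sync with the device idle state: mark the
 * disk dirty while the device is active and clean once it goes idle.
 * Does nothing if the NOFAILSYNC flag is set.
 */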
static void
g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
{
sx_assert(&sc->sc_lock, SX_LOCKED);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
return;
if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) {
G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
g_mirror_get_diskname(disk), sc->sc_name);
disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
} else if (sc->sc_idle &&
(disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
g_mirror_get_diskname(disk), sc->sc_name);
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
}
}
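/*
 * Reinitialize a synchronization bio as a read starting at the given
 * offset, preserving its data buffer and slot index (bio_caller1).
 */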
static void
g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp,
off_t offset)
{
void *data;
int idx;
data = bp->bio_data;
idx = (int)(uintptr_t)bp->bio_caller1;
g_reset_bio(bp);
bp->bio_cmd = BIO_READ;
bp->bio_data = data;
bp->bio_done = g_mirror_sync_done;
bp->bio_from = disk->d_sync.ds_consumer;
bp->bio_to = disk->d_softc->sc_provider;
bp->bio_caller1 = (void *)(uintptr_t)idx;
bp->bio_offset = offset;
- bp->bio_length = MIN(MAXPHYS,
+ bp->bio_length = MIN(maxphys,
disk->d_softc->sc_mediasize - bp->bio_offset);
}
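/*
 * Begin synchronization of a disk: attach a dedicated sync consumer to the
 * mirror provider, allocate the synchronization bios and fire off the
 * first batch of read requests.
 */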
static void
g_mirror_sync_start(struct g_mirror_disk *disk)
{
struct g_mirror_softc *sc;
struct g_mirror_disk_sync *sync;
struct g_consumer *cp;
struct bio *bp;
int error, i;
g_topology_assert_not();
sc = disk->d_softc;
sync = &disk->d_sync;
sx_assert(&sc->sc_lock, SX_LOCKED);
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
("Disk %s is not marked for synchronization.",
g_mirror_get_diskname(disk)));
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Device not in RUNNING state (%s, %u).", sc->sc_name,
sc->sc_state));
sx_xunlock(&sc->sc_lock);
g_topology_lock();
cp = g_new_consumer(sc->sc_sync.ds_geom);
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
error = g_attach(cp, sc->sc_provider);
KASSERT(error == 0,
("Cannot attach to %s (error=%d).", sc->sc_name, error));
error = g_access(cp, 1, 0, 0);
KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
g_topology_unlock();
sx_xlock(&sc->sc_lock);
G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
g_mirror_get_diskname(disk));
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
KASSERT(sync->ds_consumer == NULL,
("Sync consumer already exists (device=%s, disk=%s).",
sc->sc_name, g_mirror_get_diskname(disk)));
sync->ds_consumer = cp;
sync->ds_consumer->private = disk;
sync->ds_consumer->index = 0;
/*
* Allocate memory for synchronization bios and initialize them.
*/
sync->ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
M_MIRROR, M_WAITOK);
for (i = 0; i < g_mirror_syncreqs; i++) {
bp = g_alloc_bio();
sync->ds_bios[i] = bp;
- bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
+ bp->bio_data = malloc(maxphys, M_MIRROR, M_WAITOK);
bp->bio_caller1 = (void *)(uintptr_t)i;
g_mirror_sync_reinit(disk, bp, sync->ds_offset);
sync->ds_offset += bp->bio_length;
}
/* Increase the number of disks in SYNCHRONIZING state. */
sc->sc_sync.ds_ndisks++;
/* Set the number of in-flight synchronization requests. */
sync->ds_inflight = g_mirror_syncreqs;
/*
* Fire off first synchronization requests.
*/
for (i = 0; i < g_mirror_syncreqs; i++) {
bp = sync->ds_bios[i];
G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
sync->ds_consumer->index++;
/*
* Delay the request if it is colliding with a regular request.
*/
if (g_mirror_regular_collision(sc, bp))
g_mirror_sync_delay(sc, bp);
else
g_io_request(bp, sync->ds_consumer);
}
}
/*
* Stop synchronization process.
* type: 0 - synchronization finished
* 1 - synchronization stopped
*/
static void
g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
{
struct g_mirror_softc *sc;
struct g_consumer *cp;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
if (disk->d_sync.ds_consumer == NULL)
return;
if (type == 0) {
G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
sc->sc_name, g_mirror_get_diskname(disk));
} else /* if (type == 1) */ {
G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
sc->sc_name, g_mirror_get_diskname(disk));
}
g_mirror_regular_release(sc);
free(disk->d_sync.ds_bios, M_MIRROR);
disk->d_sync.ds_bios = NULL;
cp = disk->d_sync.ds_consumer;
disk->d_sync.ds_consumer = NULL;
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
sc->sc_sync.ds_ndisks--;
sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
g_topology_lock();
g_mirror_kill_consumer(sc, cp);
g_topology_unlock();
sx_xlock(&sc->sc_lock);
}
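/*
 * Create and announce the mirror/<name> provider, inheriting stripe
 * parameters and unmapped I/O support from the components, and start
 * synchronization for any disks that need it.
 */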
static void
g_mirror_launch_provider(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
struct g_provider *pp, *dp;
sx_assert(&sc->sc_lock, SX_LOCKED);
g_topology_lock();
pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
pp->flags |= G_PF_DIRECT_RECEIVE;
pp->mediasize = sc->sc_mediasize;
pp->sectorsize = sc->sc_sectorsize;
pp->stripesize = 0;
pp->stripeoffset = 0;
/* Splitting of unmapped BIOs could work but isn't implemented now. */
if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
pp->flags |= G_PF_ACCEPT_UNMAPPED;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer && disk->d_consumer->provider) {
dp = disk->d_consumer->provider;
if (dp->stripesize > pp->stripesize) {
pp->stripesize = dp->stripesize;
pp->stripeoffset = dp->stripeoffset;
}
/* A provider underneath us doesn't support unmapped */
if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
G_MIRROR_DEBUG(0, "Cancelling unmapped "
"because of %s.", dp->name);
pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
}
}
}
pp->private = sc;
sc->sc_refcnt++;
sc->sc_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
g_mirror_sync_start(disk);
}
}
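/*
 * Tear down the mirror/<name> provider: stop any in-progress
 * synchronization, fail pending requests queued for the provider and
 * wither it.
 */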
static void
g_mirror_destroy_provider(struct g_mirror_softc *sc)
{
struct g_mirror_disk *disk;
struct bio *bp;
g_topology_assert_not();
KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
sc->sc_name));
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
g_mirror_sync_stop(disk, 1);
}
g_topology_lock();
g_error_provider(sc->sc_provider, ENXIO);
mtx_lock(&sc->sc_queue_mtx);
while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) {
TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
/*
* Abort any pending I/O that wasn't generated by us.
* Synchronization requests and requests destined for individual
* mirror components can be destroyed immediately.
*/
if (bp->bio_to == sc->sc_provider &&
bp->bio_from->geom != sc->sc_sync.ds_geom) {
g_io_deliver(bp, ENXIO);
} else {
if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
free(bp->bio_data, M_MIRROR);
g_destroy_bio(bp);
}
}
mtx_unlock(&sc->sc_queue_mtx);
g_wither_provider(sc->sc_provider, ENXIO);
sc->sc_provider = NULL;
G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name);
g_topology_unlock();
}
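/*
 * Callout handler: force the device to start when the startup timeout
 * expires, even if not all components have been tasted.
 */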
static void
g_mirror_go(void *arg)
{
struct g_mirror_softc *sc;
sc = arg;
G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
g_mirror_event_send(sc, 0,
G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE);
}
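/*
 * Choose the initial state (ACTIVE, SYNCHRONIZING or STALE) for a newly
 * connected disk by comparing its syncid with the device's.  A disk with
 * a syncid newer than the device's cannot be used and is destroyed
 * (NONE is returned).
 */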
static u_int
g_mirror_determine_state(struct g_mirror_disk *disk)
{
struct g_mirror_softc *sc;
u_int state;
sc = disk->d_softc;
if (sc->sc_syncid == disk->d_sync.ds_syncid) {
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 &&
(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 ||
(disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) {
/* Disk does not need synchronization. */
state = G_MIRROR_DISK_STATE_ACTIVE;
} else {
if ((sc->sc_flags &
G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
(disk->d_flags &
G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
/*
* We can start synchronization from
* the stored offset.
*/
state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
} else {
state = G_MIRROR_DISK_STATE_STALE;
}
}
} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
/*
* Reset all synchronization data for this disk: even if it
* was synchronized before, it was synchronized against disks
* with a different syncid.
*/
disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
disk->d_sync.ds_offset = 0;
disk->d_sync.ds_offset_done = 0;
disk->d_sync.ds_syncid = sc->sc_syncid;
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
(disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
} else {
state = G_MIRROR_DISK_STATE_STALE;
}
} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
/*
* Not good, NOT GOOD!
* It means that the mirror was started on stale disks and a
* fresher disk has just arrived.  If there were writes in the
* meantime, the mirror is broken.  The safest choice here is to
* leave this disk untouched and inform the user loudly.
*/
G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
"disk (%s) arrives!! It will not be connected to the "
"running device.", sc->sc_name,
g_mirror_get_diskname(disk));
g_mirror_destroy_disk(disk);
state = G_MIRROR_DISK_STATE_NONE;
/* Return immediately, because disk was destroyed. */
return (state);
}
G_MIRROR_DEBUG(3, "State for %s disk: %s.",
g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
return (state);
}
/*
* Update device state.
*/
static void
g_mirror_update_device(struct g_mirror_softc *sc, bool force)
{
struct g_mirror_disk *disk;
u_int state;
sx_assert(&sc->sc_lock, SX_XLOCKED);
switch (sc->sc_state) {
case G_MIRROR_DEVICE_STATE_STARTING:
{
struct g_mirror_disk *pdisk, *tdisk;
const char *mismatch;
uintmax_t found, newest;
u_int dirty, ndisks;
/* Pre-flight checks */
LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
/*
* Confirm we already detected the newest genid.
*/
KASSERT(sc->sc_genid >= disk->d_genid,
("%s: found newer genid %u (sc:%p had %u).", __func__,
disk->d_genid, sc, sc->sc_genid));
/* Kick out any previously tasted stale components. */
if (disk->d_genid < sc->sc_genid) {
G_MIRROR_DEBUG(0, "Stale 'genid' field on %s "
"(device %s) (component=%u latest=%u), skipping.",
g_mirror_get_diskname(disk), sc->sc_name,
disk->d_genid, sc->sc_genid);
g_mirror_destroy_disk(disk);
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
continue;
}
/*
* Confirm we already detected the newest syncid.
*/
KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid,
("%s: found newer syncid %u (sc:%p had %u).",
__func__, disk->d_sync.ds_syncid, sc,
sc->sc_syncid));
#define DETECT_MISMATCH(field, name) \
if (mismatch == NULL && \
disk->d_init_ ## field != sc->sc_ ## field) { \
mismatch = name; \
found = (intmax_t)disk->d_init_ ## field; \
newest = (intmax_t)sc->sc_ ## field; \
}
mismatch = NULL;
DETECT_MISMATCH(ndisks, "md_all");
DETECT_MISMATCH(balance, "md_balance");
DETECT_MISMATCH(slice, "md_slice");
DETECT_MISMATCH(mediasize, "md_mediasize");
#undef DETECT_MISMATCH
if (mismatch != NULL) {
G_MIRROR_DEBUG(0, "Found a mismatching '%s' "
"field on %s (device %s) (found=%ju "
"newest=%ju).", mismatch,
g_mirror_get_diskname(disk), sc->sc_name,
found, newest);
g_mirror_destroy_disk(disk);
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
continue;
}
}
KASSERT(sc->sc_provider == NULL,
("Non-NULL provider in STARTING state (%s).", sc->sc_name));
/*
* Are we ready? If the timeout (force is true) has expired, and
* any disks are present, then yes. If we're permitted to launch
* before the timeout has expired and the expected number of
* current-generation mirror disks have been tasted, then yes.
*/
ndisks = g_mirror_ndisks(sc, -1);
if ((force && ndisks > 0) ||
(g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) {
;
} else if (ndisks == 0) {
/*
* Disks went down in starting phase, so destroy
* device.
*/
callout_drain(&sc->sc_callout);
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
return;
} else {
return;
}
/*
* Activate all disks with the biggest syncid.
*/
if (force) {
/*
* If 'force' is true, we have been called due to
* timeout, so don't bother canceling timeout.
*/
ndisks = 0;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
ndisks++;
}
}
if (ndisks == 0) {
/* No valid disks found, destroy device. */
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
__LINE__, sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
return;
}
} else {
/* Cancel timeout. */
callout_drain(&sc->sc_callout);
}
/*
* Here we need to look for dirty disks: if all disks with the
* biggest syncid are dirty, we have to choose the one with the
* biggest priority and rebuild the rest.
*/
/*
* Find the number of dirty disks with the biggest syncid.
* Find the number of disks with the biggest syncid.
* While here, find a disk with the biggest priority.
*/
dirty = ndisks = 0;
pdisk = NULL;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_sync.ds_syncid != sc->sc_syncid)
continue;
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
continue;
}
ndisks++;
if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
dirty++;
if (pdisk == NULL ||
pdisk->d_priority < disk->d_priority) {
pdisk = disk;
}
}
}
if (dirty == 0) {
/* No dirty disks at all, great. */
} else if (dirty == ndisks) {
/*
* Force synchronization for all dirty disks except one
* with the biggest priority.
*/
KASSERT(pdisk != NULL, ("pdisk == NULL"));
G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
"master disk for synchronization.",
g_mirror_get_diskname(pdisk), sc->sc_name);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_sync.ds_syncid != sc->sc_syncid)
continue;
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
continue;
}
KASSERT((disk->d_flags &
G_MIRROR_DISK_FLAG_DIRTY) != 0,
("Disk %s isn't marked as dirty.",
g_mirror_get_diskname(disk)));
/* Skip the disk with the biggest priority. */
if (disk == pdisk)
continue;
disk->d_sync.ds_syncid = 0;
}
} else if (dirty < ndisks) {
/*
* Force synchronization for all dirty disks.
* We have some non-dirty disks.
*/
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_sync.ds_syncid != sc->sc_syncid)
continue;
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
continue;
}
if ((disk->d_flags &
G_MIRROR_DISK_FLAG_DIRTY) == 0) {
continue;
}
disk->d_sync.ds_syncid = 0;
}
}
/* Reset hint. */
sc->sc_hint = NULL;
if (force) {
/* Remember to bump syncid on first write. */
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
}
state = G_MIRROR_DEVICE_STATE_RUNNING;
G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
sc->sc_name, g_mirror_device_state2str(sc->sc_state),
g_mirror_device_state2str(state));
sc->sc_state = state;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
state = g_mirror_determine_state(disk);
g_mirror_event_send(disk, state,
G_MIRROR_EVENT_DONTWAIT);
if (state == G_MIRROR_DISK_STATE_STALE)
sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
}
break;
}
case G_MIRROR_DEVICE_STATE_RUNNING:
if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
/*
* No usable disks, so destroy the device.
*/
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
break;
} else if (g_mirror_ndisks(sc,
G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
/*
* We have active disks, launch provider if it doesn't
* exist.
*/
if (sc->sc_provider == NULL)
g_mirror_launch_provider(sc);
if (sc->sc_rootmount != NULL) {
G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
__LINE__, sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
}
/*
* Genid should be bumped immediately, so do it here.
*/
if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
g_mirror_bump_genid(sc);
}
if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) {
sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW;
g_mirror_bump_syncid(sc);
}
break;
default:
KASSERT(1 == 0, ("Wrong device state (%s, %s).",
sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
break;
}
}
/*
* Update disk state and device state if needed.
*/
#define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \
"Disk %s state changed from %s to %s (device %s).", \
g_mirror_get_diskname(disk), \
g_mirror_disk_state2str(disk->d_state), \
g_mirror_disk_state2str(state), sc->sc_name)
static int
g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
{
struct g_mirror_softc *sc;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
again:
G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
g_mirror_disk_state2str(state));
switch (state) {
case G_MIRROR_DISK_STATE_NEW:
/*
* Possible scenarios:
* 1. A new disk arrives.
*/
/* Previous state should be NONE. */
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
disk->d_state = state;
g_topology_lock();
if (LIST_EMPTY(&sc->sc_disks))
LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
else {
struct g_mirror_disk *dp;
LIST_FOREACH(dp, &sc->sc_disks, d_next) {
if (disk->d_priority >= dp->d_priority) {
LIST_INSERT_BEFORE(dp, disk, d_next);
dp = NULL;
break;
}
if (LIST_NEXT(dp, d_next) == NULL)
break;
}
if (dp != NULL)
LIST_INSERT_AFTER(dp, disk, d_next);
}
g_topology_unlock();
G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
sc->sc_name, g_mirror_get_diskname(disk));
if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
break;
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
state = g_mirror_determine_state(disk);
if (state != G_MIRROR_DISK_STATE_NONE)
goto again;
break;
case G_MIRROR_DISK_STATE_ACTIVE:
/*
* Possible scenarios:
* 1. New disk does not need synchronization.
* 2. Synchronization process finished successfully.
*/
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
/* Previous state should be NEW or SYNCHRONIZING. */
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
g_mirror_sync_stop(disk, 0);
}
disk->d_state = state;
disk->d_sync.ds_offset = 0;
disk->d_sync.ds_offset_done = 0;
g_mirror_update_idle(sc, disk);
g_mirror_update_metadata(disk);
G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
sc->sc_name, g_mirror_get_diskname(disk));
break;
case G_MIRROR_DISK_STATE_STALE:
/*
* Possible scenarios:
* 1. Stale disk was connected.
*/
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
/*
* STALE state is only possible if device is marked
* NOAUTOSYNC.
*/
KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
disk->d_state = state;
g_mirror_update_metadata(disk);
G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
sc->sc_name, g_mirror_get_diskname(disk));
break;
case G_MIRROR_DISK_STATE_SYNCHRONIZING:
/*
* Possible scenarios:
* 1. Disk which needs synchronization was connected.
*/
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
disk->d_state = state;
if (sc->sc_provider != NULL) {
g_mirror_sync_start(disk);
g_mirror_update_metadata(disk);
}
break;
case G_MIRROR_DISK_STATE_DISCONNECTED:
/*
* Possible scenarios:
* 1. Device wasn't running yet, but a disk disappeared.
* 2. Disk was active and disappeared.
* 3. Disk disappeared during the synchronization process.
*/
if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
/*
* Previous state should be ACTIVE, STALE or
* SYNCHRONIZING.
*/
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
disk->d_state == G_MIRROR_DISK_STATE_STALE ||
disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).",
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
("Wrong disk state (%s, %s).",
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
/*
* Reset bumping syncid if disk disappeared in STARTING
* state.
*/
if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
#ifdef INVARIANTS
} else {
KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
sc->sc_name,
g_mirror_device_state2str(sc->sc_state),
g_mirror_get_diskname(disk),
g_mirror_disk_state2str(disk->d_state)));
#endif
}
DISK_STATE_CHANGED();
G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
sc->sc_name, g_mirror_get_diskname(disk));
g_mirror_destroy_disk(disk);
break;
case G_MIRROR_DISK_STATE_DESTROY:
{
int error;
error = g_mirror_clear_metadata(disk);
if (error != 0) {
G_MIRROR_DEBUG(0,
"Device %s: failed to clear metadata on %s: %d.",
sc->sc_name, g_mirror_get_diskname(disk), error);
break;
}
DISK_STATE_CHANGED();
G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
sc->sc_name, g_mirror_get_diskname(disk));
g_mirror_destroy_disk(disk);
sc->sc_ndisks--;
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
g_mirror_update_metadata(disk);
}
break;
}
default:
KASSERT(1 == 0, ("Unknown state (%u).", state));
break;
}
return (0);
}
#undef DISK_STATE_CHANGED
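/*
 * Read the gmirror metadata from the last sector of the consumer's
 * provider and decode it into 'md'.
 */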
int
g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
/* Metadata are stored in the last sector. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL) {
G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
cp->provider->name, error);
return (error);
}
/* Decode metadata. */
error = mirror_metadata_decode(buf, md);
g_free(buf);
if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
return (EINVAL);
if (md->md_version > G_MIRROR_VERSION) {
G_MIRROR_DEBUG(0,
"Kernel module is too old to handle metadata from %s.",
cp->provider->name);
return (EINVAL);
}
if (error != 0) {
G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
cp->provider->name);
return (error);
}
return (0);
}
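/*
 * Sanity-check the metadata of a tasted component against the existing
 * device: reject duplicate disk IDs and mismatched sizes, sector sizes
 * or flags.
 */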
static int
g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
struct g_mirror_metadata *md)
{
G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x "
"sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x "
"sc_balance 0x%x sc_mediasize 0x%jx pp_mediasize 0x%jx "
"md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%jx "
"md_dflags 0x%jx md_syncid 0x%x md_genid 0x%x md_priority 0x%x "
"sc_state 0x%x.",
__func__, md->md_did, pp->name, sc->sc_name, md->md_all,
sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance,
sc->sc_balance, (uintmax_t)sc->sc_mediasize,
(uintmax_t)pp->mediasize, md->md_sectorsize, sc->sc_sectorsize,
(uintmax_t)md->md_mflags, (uintmax_t)md->md_dflags, md->md_syncid,
md->md_genid, md->md_priority, sc->sc_state);
if (g_mirror_id2disk(sc, md->md_did) != NULL) {
G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
pp->name, md->md_did);
return (EEXIST);
}
if (sc->sc_mediasize > pp->mediasize) {
G_MIRROR_DEBUG(1,
"Invalid size of disk %s (device %s), skipping.", pp->name,
sc->sc_name);
return (EINVAL);
}
if (md->md_sectorsize != sc->sc_sectorsize) {
G_MIRROR_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_sectorsize", pp->name, sc->sc_name);
return (EINVAL);
}
if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
G_MIRROR_DEBUG(1,
"Invalid sector size of disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
G_MIRROR_DEBUG(1,
"Invalid device flags on disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
G_MIRROR_DEBUG(1,
"Invalid disk flags on disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
return (0);
}
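/*
 * Validate the component's metadata, refresh the device from it if it is
 * newer, create the disk and queue a NEW-state event for it.
 */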
int
g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
struct g_mirror_metadata *md)
{
struct g_mirror_disk *disk;
int error;
g_topology_assert_not();
G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
error = g_mirror_check_metadata(sc, pp, md);
if (error != 0)
return (error);
if (md->md_genid < sc->sc_genid) {
G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
/*
* If the component disk we're tasting has newer metadata than the
* STARTING gmirror device, refresh the device from the component.
*/
error = g_mirror_refresh_device(sc, pp, md);
if (error != 0)
return (error);
disk = g_mirror_init_disk(sc, pp, md, &error);
if (disk == NULL)
return (error);
error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW,
G_MIRROR_EVENT_WAIT);
if (error != 0)
return (error);
if (md->md_version < G_MIRROR_VERSION) {
G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
pp->name, md->md_version, G_MIRROR_VERSION);
g_mirror_update_metadata(disk);
}
return (0);
}
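/*
 * Event handler: perform the destruction that was deferred (CLOSEWAIT)
 * until the provider's last close.
 */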
static void
g_mirror_destroy_delayed(void *arg, int flag)
{
struct g_mirror_softc *sc;
int error;
if (flag == EV_CANCEL) {
G_MIRROR_DEBUG(1, "Destroying canceled.");
return;
}
sc = arg;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
("DESTROY flag set on %s.", sc->sc_name));
KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0,
("CLOSEWAIT flag not set on %s.", sc->sc_name));
G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
if (error != 0) {
G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
sc->sc_name, error);
sx_xunlock(&sc->sc_lock);
}
g_topology_lock();
}
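/*
 * Track opens of the mirror provider.  Access is refused while the device
 * is being destroyed or has no disks; the last close triggers delayed
 * destruction when CLOSEWAIT is set.
 */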
static int
g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
{
struct g_mirror_softc *sc;
int error = 0;
g_topology_assert();
G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
acw, ace);
sc = pp->private;
KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
g_topology_unlock();
sx_xlock(&sc->sc_lock);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
(sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 ||
LIST_EMPTY(&sc->sc_disks)) {
if (acr > 0 || acw > 0 || ace > 0)
error = ENXIO;
goto end;
}
sc->sc_provider_open += acr + acw + ace;
if (pp->acw + acw == 0)
g_mirror_idle(sc, 0);
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 &&
sc->sc_provider_open == 0)
g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL);
end:
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
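/*
 * Copy the device-wide parameters (genid, syncid, slice, balance, media
 * size, number of disks and device flags) from the metadata into the
 * softc.
 */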
static void
g_mirror_reinit_from_metadata(struct g_mirror_softc *sc,
const struct g_mirror_metadata *md)
{
sc->sc_genid = md->md_genid;
sc->sc_syncid = md->md_syncid;
sc->sc_slice = md->md_slice;
sc->sc_balance = md->md_balance;
sc->sc_mediasize = md->md_mediasize;
sc->sc_ndisks = md->md_all;
sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_MASK;
sc->sc_flags |= (md->md_mflags & G_MIRROR_DEVICE_FLAG_MASK);
}
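/*
 * Create the action and synchronization geoms for a new mirror device,
 * start its worker thread and arm the startup timeout.
 */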
struct g_geom *
g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md,
u_int type)
{
struct g_mirror_softc *sc;
struct g_geom *gp;
int error, timeout;
g_topology_assert();
G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
md->md_mid);
/* One disk is minimum. */
if (md->md_all < 1)
return (NULL);
/*
* Action geom.
*/
gp = g_new_geomf(mp, "%s", md->md_name);
sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
gp->start = g_mirror_start;
gp->orphan = g_mirror_orphan;
gp->access = g_mirror_access;
gp->dumpconf = g_mirror_dumpconf;
sc->sc_type = type;
sc->sc_id = md->md_mid;
g_mirror_reinit_from_metadata(sc, md);
sc->sc_sectorsize = md->md_sectorsize;
sc->sc_bump_id = 0;
sc->sc_idle = 1;
sc->sc_last_write = time_uptime;
sc->sc_writes = 0;
sc->sc_refcnt = 1;
sx_init(&sc->sc_lock, "gmirror:lock");
TAILQ_INIT(&sc->sc_queue);
mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
TAILQ_INIT(&sc->sc_regular_delayed);
TAILQ_INIT(&sc->sc_inflight);
TAILQ_INIT(&sc->sc_sync_delayed);
LIST_INIT(&sc->sc_disks);
TAILQ_INIT(&sc->sc_events);
mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
callout_init(&sc->sc_callout, 1);
mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
gp->softc = sc;
sc->sc_geom = gp;
sc->sc_provider = NULL;
sc->sc_provider_open = 0;
/*
* Synchronization geom.
*/
gp = g_new_geomf(mp, "%s.sync", md->md_name);
gp->softc = sc;
gp->orphan = g_mirror_orphan;
sc->sc_sync.ds_geom = gp;
sc->sc_sync.ds_ndisks = 0;
error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
"g_mirror %s", md->md_name);
if (error != 0) {
G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
sc->sc_name);
g_destroy_geom(sc->sc_sync.ds_geom);
g_destroy_geom(sc->sc_geom);
g_mirror_free_device(sc);
return (NULL);
}
G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
sc->sc_name, sc->sc_ndisks, sc->sc_id);
sc->sc_rootmount = root_mount_hold("GMIRROR");
G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
/*
* Run timeout.
*/
timeout = g_mirror_timeout * hz;
callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
return (sc->sc_geom);
}
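/*
 * Destroy the device according to 'how': SOFT fails with EBUSY if the
 * provider is open, DELAYED defers destruction until the last close,
 * HARD proceeds even if the provider is still open.
 */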
int
g_mirror_destroy(struct g_mirror_softc *sc, int how)
{
struct g_mirror_disk *disk;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if (sc->sc_provider_open != 0) {
switch (how) {
case G_MIRROR_DESTROY_SOFT:
G_MIRROR_DEBUG(1,
"Device %s is still open (%d).", sc->sc_name,
sc->sc_provider_open);
return (EBUSY);
case G_MIRROR_DESTROY_DELAYED:
G_MIRROR_DEBUG(1,
"Device %s will be destroyed on last close.",
sc->sc_name);
LIST_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state ==
G_MIRROR_DISK_STATE_SYNCHRONIZING) {
g_mirror_sync_stop(disk, 1);
}
}
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT;
return (EBUSY);
case G_MIRROR_DESTROY_HARD:
G_MIRROR_DEBUG(1, "Device %s is still open, so it "
"can't be definitely removed.", sc->sc_name);
}
}
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
sx_xunlock(&sc->sc_lock);
return (0);
}
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN;
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
sx_xunlock(&sc->sc_lock);
mtx_lock(&sc->sc_queue_mtx);
wakeup(sc);
mtx_unlock(&sc->sc_queue_mtx);
G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
while (sc->sc_worker != NULL)
tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
sx_xlock(&sc->sc_lock);
g_mirror_destroy_device(sc);
return (0);
}
static void
g_mirror_taste_orphan(struct g_consumer *cp)
{
KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
cp->provider->name));
}
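/*
 * Taste a provider: read and validate its gmirror metadata, find or
 * create the matching device and add the provider to it as a disk.
 */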
static struct g_geom *
g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_mirror_metadata md;
struct g_mirror_softc *sc;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
gp = g_new_geomf(mp, "mirror:taste");
/*
* This orphan function should never be called.
*/
gp->orphan = g_mirror_taste_orphan;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_mirror_read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
gp = NULL;
if (md.md_provider[0] != '\0' &&
!g_compare_names(md.md_provider, pp->name))
return (NULL);
if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
return (NULL);
if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
G_MIRROR_DEBUG(0,
"Device %s: provider %s marked as inactive, skipping.",
md.md_name, pp->name);
return (NULL);
}
if (g_mirror_debug >= 2)
mirror_metadata_dump(&md);
/*
* Let's check if device already exists.
*/
sc = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
continue;
if (sc->sc_sync.ds_geom == gp)
continue;
if (strcmp(md.md_name, sc->sc_name) != 0)
continue;
if (md.md_mid != sc->sc_id) {
G_MIRROR_DEBUG(0, "Device %s already configured.",
sc->sc_name);
return (NULL);
}
break;
}
if (gp == NULL) {
gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC);
if (gp == NULL) {
G_MIRROR_DEBUG(0, "Cannot create device %s.",
md.md_name);
return (NULL);
}
sc = gp->softc;
}
G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
error = g_mirror_add_disk(sc, pp, &md);
sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
if (error != 0) {
G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
pp->name, gp->name, error);
if (LIST_EMPTY(&sc->sc_disks)) {
g_cancel_event(sc);
g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
g_topology_lock();
return (NULL);
}
gp = NULL;
}
if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
g_topology_lock();
return (NULL);
}
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (gp);
}
static void
g_mirror_resize(struct g_consumer *cp)
{
struct g_mirror_disk *disk;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
disk = cp->private;
if (disk == NULL)
return;
g_topology_unlock();
g_mirror_update_metadata(disk);
g_topology_lock();
}
static int
g_mirror_destroy_geom(struct gctl_req *req __unused,
struct g_class *mp __unused, struct g_geom *gp)
{
struct g_mirror_softc *sc;
int error;
g_topology_unlock();
sc = gp->softc;
sx_xlock(&sc->sc_lock);
g_cancel_event(sc);
error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
if (error != 0)
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
static void
g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_mirror_softc *sc;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return;
/* Skip synchronization geom. */
if (gp == sc->sc_sync.ds_geom)
return;
if (pp != NULL) {
/* Nothing here. */
} else if (cp != NULL) {
struct g_mirror_disk *disk;
disk = cp->private;
if (disk == NULL)
return;
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
sbuf_printf(sb, "%s<Synchronized>", indent);
if (disk->d_sync.ds_offset == 0)
sbuf_cat(sb, "0%");
else
sbuf_printf(sb, "%u%%",
(u_int)((disk->d_sync.ds_offset * 100) /
sc->sc_mediasize));
sbuf_cat(sb, "</Synchronized>\n");
if (disk->d_sync.ds_offset > 0)
sbuf_printf(sb, "%s<BytesSynced>%jd"
"</BytesSynced>\n", indent,
(intmax_t)disk->d_sync.ds_offset);
}
sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
disk->d_sync.ds_syncid);
sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
disk->d_genid);
sbuf_printf(sb, "%s<Flags>", indent);
if (disk->d_flags == 0)
sbuf_cat(sb, "NONE");
else {
int first = 1;
#define ADD_FLAG(flag, name) do { \
if ((disk->d_flags & (flag)) != 0) { \
if (!first) \
sbuf_cat(sb, ", "); \
else \
first = 0; \
sbuf_cat(sb, name); \
} \
} while (0)
ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
"SYNCHRONIZING");
ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
#undef ADD_FLAG
}
sbuf_cat(sb, "</Flags>\n");
sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
disk->d_priority);
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_mirror_disk_state2str(disk->d_state));
} else {
sbuf_printf(sb, "%s<Type>", indent);
switch (sc->sc_type) {
case G_MIRROR_TYPE_AUTOMATIC:
sbuf_cat(sb, "AUTOMATIC");
break;
case G_MIRROR_TYPE_MANUAL:
sbuf_cat(sb, "MANUAL");
break;
default:
sbuf_cat(sb, "UNKNOWN");
break;
}
sbuf_cat(sb, "</Type>\n");
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
sbuf_printf(sb, "%s<Flags>", indent);
if (sc->sc_flags == 0)
sbuf_cat(sb, "NONE");
else {
int first = 1;
#define ADD_FLAG(flag, name) do { \
if ((sc->sc_flags & (flag)) != 0) { \
if (!first) \
sbuf_cat(sb, ", "); \
else \
first = 0; \
sbuf_cat(sb, name); \
} \
} while (0)
ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
#undef ADD_FLAG
}
sbuf_cat(sb, "</Flags>\n");
sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
(u_int)sc->sc_slice);
sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
balance_name(sc->sc_balance));
sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
sc->sc_ndisks);
sbuf_printf(sb, "%s<State>", indent);
if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
sbuf_printf(sb, "%s", "STARTING");
else if (sc->sc_ndisks ==
g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
sbuf_printf(sb, "%s", "COMPLETE");
else
sbuf_printf(sb, "%s", "DEGRADED");
sbuf_cat(sb, "</State>\n");
}
}
static void
g_mirror_shutdown_post_sync(void *arg, int howto)
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_mirror_softc *sc;
int error;
if (KERNEL_PANICKED())
return;
mp = arg;
g_topology_lock();
g_mirror_shutdown = 1;
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
if ((sc = gp->softc) == NULL)
continue;
/* Skip synchronization geom. */
if (gp == sc->sc_sync.ds_geom)
continue;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
g_mirror_idle(sc, -1);
g_cancel_event(sc);
error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
if (error != 0)
sx_xunlock(&sc->sc_lock);
g_topology_lock();
}
g_topology_unlock();
}
static void
g_mirror_init(struct g_class *mp)
{
g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
if (g_mirror_post_sync == NULL)
G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
}
static void
g_mirror_fini(struct g_class *mp)
{
if (g_mirror_post_sync != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
}
/*
* Refresh the mirror device's metadata when gmirror encounters a newer
* generation as the individual components are being added to the mirror set.
*/
static int
g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp,
const struct g_mirror_metadata *md)
{
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(sc->sc_genid <= md->md_genid,
("%s: attempted to refresh from stale component %s (device %s) "
"(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid,
sc->sc_genid));
if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid &&
sc->sc_syncid >= md->md_syncid))
return (0);
G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u "
"new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; "
"provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid,
sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name);
if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) {
/* Probable data corruption detected */
G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state "
"(device=%s genid=%u). A stale mirror device was launched.",
g_mirror_device_state2str(sc->sc_state), sc->sc_name,
sc->sc_genid);
return (EINVAL);
}
/* Update softc */
g_mirror_reinit_from_metadata(sc, md);
G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s "
"(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid,
g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid,
md->md_syncid, (unsigned)md->md_all);
return (0);
}
DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
MODULE_VERSION(geom_mirror, 0);
diff --git a/sys/geom/nop/g_nop.c b/sys/geom/nop/g_nop.c
index 2cfb5a6c4281..5e383cd35733 100644
--- a/sys/geom/nop/g_nop.c
+++ b/sys/geom/nop/g_nop.c
@@ -1,978 +1,978 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* Copyright (c) 2019 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/ctype.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/nop/g_nop.h>
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, nop, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_NOP stuff");
static u_int g_nop_debug = 0;
SYSCTL_UINT(_kern_geom_nop, OID_AUTO, debug, CTLFLAG_RW, &g_nop_debug, 0,
"Debug level");
static int g_nop_destroy(struct g_geom *gp, boolean_t force);
static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static void g_nop_config(struct gctl_req *req, struct g_class *mp,
const char *verb);
static g_access_t g_nop_access;
static g_dumpconf_t g_nop_dumpconf;
static g_orphan_t g_nop_orphan;
static g_provgone_t g_nop_providergone;
static g_resize_t g_nop_resize;
static g_start_t g_nop_start;
struct g_class g_nop_class = {
.name = G_NOP_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_nop_config,
.destroy_geom = g_nop_destroy_geom,
.access = g_nop_access,
.dumpconf = g_nop_dumpconf,
.orphan = g_nop_orphan,
.providergone = g_nop_providergone,
.resize = g_nop_resize,
.start = g_nop_start,
};
struct g_nop_delay {
struct callout dl_cal;
struct bio *dl_bio;
TAILQ_ENTRY(g_nop_delay) dl_next;
};
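/*
 * Only alphanumeric characters are accepted in a user-supplied nop
 * device name.
 */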
static bool
g_nop_verify_nprefix(const char *name)
{
int i;
for (i = 0; i < strlen(name); i++) {
if (isalpha(name[i]) == 0 && isdigit(name[i]) == 0) {
return (false);
}
}
return (true);
}
static void
g_nop_orphan(struct g_consumer *cp)
{
g_topology_assert();
g_nop_destroy(cp->geom, 1);
}
static void
g_nop_resize(struct g_consumer *cp)
{
struct g_nop_softc *sc;
struct g_geom *gp;
struct g_provider *pp;
off_t size;
g_topology_assert();
gp = cp->geom;
sc = gp->softc;
if (sc->sc_explicitsize != 0)
return;
if (cp->provider->mediasize < sc->sc_offset) {
g_nop_destroy(gp, 1);
return;
}
size = cp->provider->mediasize - sc->sc_offset;
LIST_FOREACH(pp, &gp->provider, provider)
g_resize_provider(pp, size);
}
static int
g_nop_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
size_t length)
{
return (0);
}
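/*
 * Handle the GEOM::kerneldump attribute: install the no-op dump routine
 * above and clamp the dump extent to the provider's explicit size.
 */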
static void
g_nop_kerneldump(struct bio *bp, struct g_nop_softc *sc)
{
struct g_kerneldump *gkd;
struct g_geom *gp;
struct g_provider *pp;
gkd = (struct g_kerneldump *)bp->bio_data;
gp = bp->bio_to->geom;
g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name,
(intmax_t)gkd->offset, (intmax_t)gkd->length);
pp = LIST_FIRST(&gp->provider);
gkd->di.dumper = g_nop_dumper;
gkd->di.priv = sc;
gkd->di.blocksize = pp->sectorsize;
gkd->di.maxiosize = DFLTPHYS;
gkd->di.mediaoffset = sc->sc_offset + gkd->offset;
if (gkd->offset > sc->sc_explicitsize) {
g_io_deliver(bp, ENODEV);
return;
}
if (gkd->offset + gkd->length > sc->sc_explicitsize)
gkd->length = sc->sc_explicitsize - gkd->offset;
gkd->di.mediasize = gkd->length;
g_io_deliver(bp, 0);
}
static void
g_nop_pass(struct bio *cbp, struct g_geom *gp)
{
G_NOP_LOGREQ(cbp, "Sending request.");
g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
static void
g_nop_pass_timeout(void *data)
{
struct g_nop_softc *sc;
struct g_geom *gp;
struct g_nop_delay *gndelay;
gndelay = (struct g_nop_delay *)data;
gp = gndelay->dl_bio->bio_to->geom;
sc = gp->softc;
mtx_lock(&sc->sc_lock);
TAILQ_REMOVE(&sc->sc_head_delay, gndelay, dl_next);
mtx_unlock(&sc->sc_lock);
g_nop_pass(gndelay->dl_bio, gp);
g_free(data);
}
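/*
 * Handle an I/O request: update per-command statistics, optionally fail
 * or delay reads and writes according to the configured probabilities,
 * then clone the bio and pass it down to the underlying provider.
 */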
static void
g_nop_start(struct bio *bp)
{
struct g_nop_softc *sc;
struct g_geom *gp;
struct g_provider *pp;
struct bio *cbp;
u_int failprob, delayprob, delaytime;
failprob = delayprob = delaytime = 0;
gp = bp->bio_to->geom;
sc = gp->softc;
G_NOP_LOGREQ(bp, "Request received.");
mtx_lock(&sc->sc_lock);
switch (bp->bio_cmd) {
case BIO_READ:
sc->sc_reads++;
sc->sc_readbytes += bp->bio_length;
if (sc->sc_count_until_fail != 0) {
sc->sc_count_until_fail -= 1;
} else {
failprob = sc->sc_rfailprob;
delayprob = sc->sc_rdelayprob;
delaytime = sc->sc_delaymsec;
}
break;
case BIO_WRITE:
sc->sc_writes++;
sc->sc_wrotebytes += bp->bio_length;
if (sc->sc_count_until_fail != 0) {
sc->sc_count_until_fail -= 1;
} else {
failprob = sc->sc_wfailprob;
delayprob = sc->sc_wdelayprob;
delaytime = sc->sc_delaymsec;
}
break;
case BIO_DELETE:
sc->sc_deletes++;
break;
case BIO_GETATTR:
sc->sc_getattrs++;
if (sc->sc_physpath &&
g_handleattr_str(bp, "GEOM::physpath", sc->sc_physpath))
;
else if (strcmp(bp->bio_attribute, "GEOM::kerneldump") == 0)
g_nop_kerneldump(bp, sc);
else
/*
* Fallthrough to forwarding the GETATTR down to the
* lower level device.
*/
break;
mtx_unlock(&sc->sc_lock);
return;
case BIO_FLUSH:
sc->sc_flushes++;
break;
case BIO_SPEEDUP:
sc->sc_speedups++;
break;
case BIO_CMD0:
sc->sc_cmd0s++;
break;
case BIO_CMD1:
sc->sc_cmd1s++;
break;
case BIO_CMD2:
sc->sc_cmd2s++;
break;
}
mtx_unlock(&sc->sc_lock);
if (failprob > 0) {
u_int rval;
rval = arc4random() % 100;
if (rval < failprob) {
G_NOP_LOGREQLVL(1, bp, "Returning error=%d.", sc->sc_error);
g_io_deliver(bp, sc->sc_error);
return;
}
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
cbp->bio_done = g_std_done;
cbp->bio_offset = bp->bio_offset + sc->sc_offset;
pp = LIST_FIRST(&gp->provider);
KASSERT(pp != NULL, ("NULL pp"));
cbp->bio_to = pp;
if (delayprob > 0) {
struct g_nop_delay *gndelay;
u_int rval;
rval = arc4random() % 100;
if (rval < delayprob) {
gndelay = g_malloc(sizeof(*gndelay), M_NOWAIT | M_ZERO);
if (gndelay != NULL) {
callout_init(&gndelay->dl_cal, 1);
gndelay->dl_bio = cbp;
mtx_lock(&sc->sc_lock);
TAILQ_INSERT_TAIL(&sc->sc_head_delay, gndelay,
dl_next);
mtx_unlock(&sc->sc_lock);
callout_reset(&gndelay->dl_cal,
MSEC_2_TICKS(delaytime), g_nop_pass_timeout,
gndelay);
return;
}
}
}
g_nop_pass(cbp, gp);
}
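/*
 * Pass access requests straight through to the underlying provider.
 */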
static int
g_nop_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_geom *gp;
struct g_consumer *cp;
int error;
gp = pp->geom;
cp = LIST_FIRST(&gp->consumer);
error = g_access(cp, dr, dw, de);
return (error);
}
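/*
 * Create a transparent nop provider on top of an existing provider,
 * after validating the requested offset, size, sector size and stripe
 * parameters.
 */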
static int
g_nop_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp,
const char *gnopname, int ioerror, u_int count_until_fail,
u_int rfailprob, u_int wfailprob, u_int delaymsec, u_int rdelayprob,
u_int wdelayprob, off_t offset, off_t size, u_int secsize, off_t stripesize,
off_t stripeoffset, const char *physpath)
{
struct g_nop_softc *sc;
struct g_geom *gp;
struct g_provider *newpp;
struct g_consumer *cp;
struct g_geom_alias *gap;
char name[64];
int error, n;
off_t explicitsize;
g_topology_assert();
gp = NULL;
newpp = NULL;
cp = NULL;
if ((offset % pp->sectorsize) != 0) {
gctl_error(req, "Invalid offset for provider %s.", pp->name);
return (EINVAL);
}
if ((size % pp->sectorsize) != 0) {
gctl_error(req, "Invalid size for provider %s.", pp->name);
return (EINVAL);
}
if (offset >= pp->mediasize) {
gctl_error(req, "Invalid offset for provider %s.", pp->name);
return (EINVAL);
}
explicitsize = size;
if (size == 0)
size = pp->mediasize - offset;
if (offset + size > pp->mediasize) {
gctl_error(req, "Invalid size for provider %s.", pp->name);
return (EINVAL);
}
if (secsize == 0)
secsize = pp->sectorsize;
else if ((secsize % pp->sectorsize) != 0) {
gctl_error(req, "Invalid secsize for provider %s.", pp->name);
return (EINVAL);
}
- if (secsize > MAXPHYS) {
+ if (secsize > maxphys) {
gctl_error(req, "secsize is too big.");
return (EINVAL);
}
size -= size % secsize;
if ((stripesize % pp->sectorsize) != 0) {
gctl_error(req, "Invalid stripesize for provider %s.", pp->name);
return (EINVAL);
}
if ((stripeoffset % pp->sectorsize) != 0) {
gctl_error(req, "Invalid stripeoffset for provider %s.", pp->name);
return (EINVAL);
}
if (stripesize != 0 && stripeoffset >= stripesize) {
gctl_error(req, "stripeoffset is too big.");
return (EINVAL);
}
if (gnopname != NULL && !g_nop_verify_nprefix(gnopname)) {
gctl_error(req, "Name %s is invalid.", gnopname);
return (EINVAL);
}
if (gnopname != NULL) {
n = snprintf(name, sizeof(name), "%s%s", gnopname,
G_NOP_SUFFIX);
} else {
n = snprintf(name, sizeof(name), "%s%s", pp->name,
G_NOP_SUFFIX);
}
if (n <= 0 || n >= sizeof(name)) {
gctl_error(req, "Invalid provider name.");
return (EINVAL);
}
LIST_FOREACH(gp, &mp->geom, geom) {
if (strcmp(gp->name, name) == 0) {
gctl_error(req, "Provider %s already exists.", name);
return (EEXIST);
}
}
gp = g_new_geomf(mp, "%s", name);
sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
sc->sc_offset = offset;
sc->sc_explicitsize = explicitsize;
sc->sc_stripesize = stripesize;
sc->sc_stripeoffset = stripeoffset;
if (physpath && strcmp(physpath, G_NOP_PHYSPATH_PASSTHROUGH)) {
sc->sc_physpath = strndup(physpath, MAXPATHLEN, M_GEOM);
} else
sc->sc_physpath = NULL;
sc->sc_error = ioerror;
sc->sc_count_until_fail = count_until_fail;
sc->sc_rfailprob = rfailprob;
sc->sc_wfailprob = wfailprob;
sc->sc_delaymsec = delaymsec;
sc->sc_rdelayprob = rdelayprob;
sc->sc_wdelayprob = wdelayprob;
sc->sc_reads = 0;
sc->sc_writes = 0;
sc->sc_deletes = 0;
sc->sc_getattrs = 0;
sc->sc_flushes = 0;
sc->sc_speedups = 0;
sc->sc_cmd0s = 0;
sc->sc_cmd1s = 0;
sc->sc_cmd2s = 0;
sc->sc_readbytes = 0;
sc->sc_wrotebytes = 0;
TAILQ_INIT(&sc->sc_head_delay);
mtx_init(&sc->sc_lock, "gnop lock", NULL, MTX_DEF);
gp->softc = sc;
newpp = g_new_providerf(gp, "%s", gp->name);
newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
newpp->mediasize = size;
newpp->sectorsize = secsize;
newpp->stripesize = stripesize;
newpp->stripeoffset = stripeoffset;
LIST_FOREACH(gap, &pp->aliases, ga_next)
g_provider_add_alias(newpp, "%s%s", gap->ga_alias, G_NOP_SUFFIX);
cp = g_new_consumer(gp);
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
error = g_attach(cp, pp);
if (error != 0) {
gctl_error(req, "Cannot attach to provider %s.", pp->name);
goto fail;
}
newpp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED;
g_error_provider(newpp, 0);
G_NOP_DEBUG(0, "Device %s created.", gp->name);
return (0);
fail:
if (cp->provider != NULL)
g_detach(cp);
g_destroy_consumer(cp);
g_destroy_provider(newpp);
mtx_destroy(&sc->sc_lock);
free(sc->sc_physpath, M_GEOM);
g_free(gp->softc);
g_destroy_geom(gp);
return (error);
}
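/* Called once the provider is gone; tear down and free the softc. */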
static void
g_nop_providergone(struct g_provider *pp)
{
struct g_geom *gp = pp->geom;
struct g_nop_softc *sc = gp->softc;
KASSERT(TAILQ_EMPTY(&sc->sc_head_delay),
("delayed request list is not empty"));
gp->softc = NULL;
free(sc->sc_physpath, M_GEOM);
mtx_destroy(&sc->sc_lock);
g_free(sc);
}
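/* Destroy the nop geom; unless forced, refuse while the provider is still open. */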
static int
g_nop_destroy(struct g_geom *gp, boolean_t force)
{
struct g_nop_softc *sc;
struct g_provider *pp;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return (ENXIO);
pp = LIST_FIRST(&gp->provider);
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
if (force) {
G_NOP_DEBUG(0, "Device %s is still open, so it "
"can't be definitely removed.", pp->name);
} else {
G_NOP_DEBUG(1, "Device %s is still open (r%dw%de%d).",
pp->name, pp->acr, pp->acw, pp->ace);
return (EBUSY);
}
} else {
G_NOP_DEBUG(0, "Device %s removed.", gp->name);
}
g_wither_geom(gp, ENXIO);
return (0);
}
static int
g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
{
return (g_nop_destroy(gp, 0));
}
static void
g_nop_ctl_create(struct gctl_req *req, struct g_class *mp)
{
struct g_provider *pp;
intmax_t *val, error, rfailprob, wfailprob, count_until_fail, offset,
secsize, size, stripesize, stripeoffset, delaymsec,
rdelayprob, wdelayprob;
const char *physpath, *gnopname;
char param[16];
int i, *nargs;
g_topology_assert();
error = -1;
rfailprob = -1;
wfailprob = -1;
count_until_fail = -1;
offset = 0;
secsize = 0;
size = 0;
stripesize = 0;
stripeoffset = 0;
delaymsec = -1;
rdelayprob = -1;
wdelayprob = -1;
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
val = gctl_get_paraml_opt(req, "error", sizeof(*val));
if (val != NULL) {
error = *val;
}
val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val));
if (val != NULL) {
rfailprob = *val;
if (rfailprob < -1 || rfailprob > 100) {
gctl_error(req, "Invalid '%s' argument", "rfailprob");
return;
}
}
val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val));
if (val != NULL) {
wfailprob = *val;
if (wfailprob < -1 || wfailprob > 100) {
gctl_error(req, "Invalid '%s' argument", "wfailprob");
return;
}
}
val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val));
if (val != NULL) {
delaymsec = *val;
if (delaymsec < 1 && delaymsec != -1) {
gctl_error(req, "Invalid '%s' argument", "delaymsec");
return;
}
}
val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val));
if (val != NULL) {
rdelayprob = *val;
if (rdelayprob < -1 || rdelayprob > 100) {
gctl_error(req, "Invalid '%s' argument", "rdelayprob");
return;
}
}
val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val));
if (val != NULL) {
wdelayprob = *val;
if (wdelayprob < -1 || wdelayprob > 100) {
gctl_error(req, "Invalid '%s' argument", "wdelayprob");
return;
}
}
val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val));
if (val != NULL) {
count_until_fail = *val;
if (count_until_fail < -1) {
gctl_error(req, "Invalid '%s' argument",
"count_until_fail");
return;
}
}
val = gctl_get_paraml_opt(req, "offset", sizeof(*val));
if (val != NULL) {
offset = *val;
if (offset < 0) {
gctl_error(req, "Invalid '%s' argument", "offset");
return;
}
}
val = gctl_get_paraml_opt(req, "size", sizeof(*val));
if (val != NULL) {
size = *val;
if (size < 0) {
gctl_error(req, "Invalid '%s' argument", "size");
return;
}
}
val = gctl_get_paraml_opt(req, "secsize", sizeof(*val));
if (val != NULL) {
secsize = *val;
if (secsize < 0) {
gctl_error(req, "Invalid '%s' argument", "secsize");
return;
}
}
val = gctl_get_paraml_opt(req, "stripesize", sizeof(*val));
if (val != NULL) {
stripesize = *val;
if (stripesize < 0) {
gctl_error(req, "Invalid '%s' argument", "stripesize");
return;
}
}
val = gctl_get_paraml_opt(req, "stripeoffset", sizeof(*val));
if (val != NULL) {
stripeoffset = *val;
if (stripeoffset < 0) {
gctl_error(req, "Invalid '%s' argument",
"stripeoffset");
return;
}
}
physpath = gctl_get_asciiparam(req, "physpath");
gnopname = gctl_get_asciiparam(req, "gnopname");
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
pp = gctl_get_provider(req, param);
if (pp == NULL)
return;
if (g_nop_create(req, mp, pp,
gnopname,
error == -1 ? EIO : (int)error,
count_until_fail == -1 ? 0 : (u_int)count_until_fail,
rfailprob == -1 ? 0 : (u_int)rfailprob,
wfailprob == -1 ? 0 : (u_int)wfailprob,
delaymsec == -1 ? 1 : (u_int)delaymsec,
rdelayprob == -1 ? 0 : (u_int)rdelayprob,
wdelayprob == -1 ? 0 : (u_int)wdelayprob,
(off_t)offset, (off_t)size, (u_int)secsize,
(off_t)stripesize, (off_t)stripeoffset,
physpath) != 0) {
return;
}
}
}
static void
g_nop_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
struct g_nop_softc *sc;
struct g_provider *pp;
intmax_t *val, delaymsec, error, rdelayprob, rfailprob, wdelayprob,
wfailprob, count_until_fail;
char param[16];
int i, *nargs;
g_topology_assert();
count_until_fail = -1;
delaymsec = -1;
error = -1;
rdelayprob = -1;
rfailprob = -1;
wdelayprob = -1;
wfailprob = -1;
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
val = gctl_get_paraml_opt(req, "error", sizeof(*val));
if (val != NULL) {
error = *val;
}
val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val));
if (val != NULL) {
count_until_fail = *val;
}
val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val));
if (val != NULL) {
rfailprob = *val;
if (rfailprob < -1 || rfailprob > 100) {
gctl_error(req, "Invalid '%s' argument", "rfailprob");
return;
}
}
val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val));
if (val != NULL) {
wfailprob = *val;
if (wfailprob < -1 || wfailprob > 100) {
gctl_error(req, "Invalid '%s' argument", "wfailprob");
return;
}
}
val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val));
if (val != NULL) {
delaymsec = *val;
if (delaymsec < 1 && delaymsec != -1) {
gctl_error(req, "Invalid '%s' argument", "delaymsec");
return;
}
}
val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val));
if (val != NULL) {
rdelayprob = *val;
if (rdelayprob < -1 || rdelayprob > 100) {
gctl_error(req, "Invalid '%s' argument", "rdelayprob");
return;
}
}
val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val));
if (val != NULL) {
wdelayprob = *val;
if (wdelayprob < -1 || wdelayprob > 100) {
gctl_error(req, "Invalid '%s' argument", "wdelayprob");
return;
}
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
pp = gctl_get_provider(req, param);
if (pp == NULL)
return;
if (pp->geom->class != mp) {
G_NOP_DEBUG(1, "Provider %s is invalid.", pp->name);
gctl_error(req, "Provider %s is invalid.", pp->name);
return;
}
sc = pp->geom->softc;
if (error != -1)
sc->sc_error = (int)error;
if (rfailprob != -1)
sc->sc_rfailprob = (u_int)rfailprob;
if (wfailprob != -1)
sc->sc_wfailprob = (u_int)wfailprob;
if (rdelayprob != -1)
sc->sc_rdelayprob = (u_int)rdelayprob;
if (wdelayprob != -1)
sc->sc_wdelayprob = (u_int)wdelayprob;
if (delaymsec != -1)
sc->sc_delaymsec = (u_int)delaymsec;
if (count_until_fail != -1)
sc->sc_count_until_fail = (u_int)count_until_fail;
}
}
static struct g_geom *
g_nop_find_geom(struct g_class *mp, const char *name)
{
struct g_geom *gp;
LIST_FOREACH(gp, &mp->geom, geom) {
if (strcmp(gp->name, name) == 0)
return (gp);
}
return (NULL);
}
static void
g_nop_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
int *nargs, *force, error, i;
struct g_geom *gp;
const char *name;
char param[16];
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force == NULL) {
gctl_error(req, "No 'force' argument");
return;
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%d' argument", i);
return;
}
if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
name += strlen(_PATH_DEV);
gp = g_nop_find_geom(mp, name);
if (gp == NULL) {
G_NOP_DEBUG(1, "Device %s is invalid.", name);
gctl_error(req, "Device %s is invalid.", name);
return;
}
error = g_nop_destroy(gp, *force);
if (error != 0) {
gctl_error(req, "Cannot destroy device %s (error=%d).",
gp->name, error);
return;
}
}
}
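/* Zero the per-provider I/O statistics. */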
static void
g_nop_ctl_reset(struct gctl_req *req, struct g_class *mp)
{
struct g_nop_softc *sc;
struct g_provider *pp;
char param[16];
int i, *nargs;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
for (i = 0; i < *nargs; i++) {
snprintf(param, sizeof(param), "arg%d", i);
pp = gctl_get_provider(req, param);
if (pp == NULL)
return;
if (pp->geom->class != mp) {
G_NOP_DEBUG(1, "Provider %s is invalid.", pp->name);
gctl_error(req, "Provider %s is invalid.", pp->name);
return;
}
sc = pp->geom->softc;
sc->sc_reads = 0;
sc->sc_writes = 0;
sc->sc_deletes = 0;
sc->sc_getattrs = 0;
sc->sc_flushes = 0;
sc->sc_speedups = 0;
sc->sc_cmd0s = 0;
sc->sc_cmd1s = 0;
sc->sc_cmd2s = 0;
sc->sc_readbytes = 0;
sc->sc_wrotebytes = 0;
}
}
static void
g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "No '%s' argument.", "version");
return;
}
if (*version != G_NOP_VERSION) {
gctl_error(req, "Userland and kernel parts are out of sync.");
return;
}
if (strcmp(verb, "create") == 0) {
g_nop_ctl_create(req, mp);
return;
} else if (strcmp(verb, "configure") == 0) {
g_nop_ctl_configure(req, mp);
return;
} else if (strcmp(verb, "destroy") == 0) {
g_nop_ctl_destroy(req, mp);
return;
} else if (strcmp(verb, "reset") == 0) {
g_nop_ctl_reset(req, mp);
return;
}
gctl_error(req, "Unknown verb.");
}
static void
g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_nop_softc *sc;
if (pp != NULL || cp != NULL)
return;
sc = gp->softc;
sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent,
(intmax_t)sc->sc_offset);
sbuf_printf(sb, "%s<ReadFailProb>%u</ReadFailProb>\n", indent,
sc->sc_rfailprob);
sbuf_printf(sb, "%s<WriteFailProb>%u</WriteFailProb>\n", indent,
sc->sc_wfailprob);
sbuf_printf(sb, "%s<ReadDelayedProb>%u</ReadDelayedProb>\n", indent,
sc->sc_rdelayprob);
sbuf_printf(sb, "%s<WriteDelayedProb>%u</WriteDelayedProb>\n", indent,
sc->sc_wdelayprob);
sbuf_printf(sb, "%s<Delay>%d</Delay>\n", indent, sc->sc_delaymsec);
sbuf_printf(sb, "%s<CountUntilFail>%u</CountUntilFail>\n", indent,
sc->sc_count_until_fail);
sbuf_printf(sb, "%s<Error>%d</Error>\n", indent, sc->sc_error);
sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent, sc->sc_reads);
sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent, sc->sc_writes);
sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent, sc->sc_deletes);
sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent, sc->sc_getattrs);
sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent, sc->sc_flushes);
sbuf_printf(sb, "%s<Speedups>%ju</Speedups>\n", indent, sc->sc_speedups);
sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent, sc->sc_cmd0s);
sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent, sc->sc_cmd1s);
sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent, sc->sc_cmd2s);
sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
sc->sc_readbytes);
sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
sc->sc_wrotebytes);
}
DECLARE_GEOM_CLASS(g_nop_class, g_nop);
MODULE_VERSION(geom_nop, 0);
diff --git a/sys/geom/part/g_part_apm.c b/sys/geom/part/g_part_apm.c
index 92019c7e4f16..aa008871d58f 100644
--- a/sys/geom/part/g_part_apm.c
+++ b/sys/geom/part/g_part_apm.c
@@ -1,597 +1,597 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006-2008 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/apm.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <geom/part/g_part.h>
#include "g_part_if.h"
FEATURE(geom_part_apm, "GEOM partitioning class for Apple-style partitions");
struct g_part_apm_table {
struct g_part_table base;
struct apm_ddr ddr;
struct apm_ent self;
int tivo_series1;
};
struct g_part_apm_entry {
struct g_part_entry base;
struct apm_ent ent;
};
static int g_part_apm_add(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static int g_part_apm_create(struct g_part_table *, struct g_part_parms *);
static int g_part_apm_destroy(struct g_part_table *, struct g_part_parms *);
static void g_part_apm_dumpconf(struct g_part_table *, struct g_part_entry *,
struct sbuf *, const char *);
static int g_part_apm_dumpto(struct g_part_table *, struct g_part_entry *);
static int g_part_apm_modify(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static const char *g_part_apm_name(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_apm_probe(struct g_part_table *, struct g_consumer *);
static int g_part_apm_read(struct g_part_table *, struct g_consumer *);
static const char *g_part_apm_type(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_apm_write(struct g_part_table *, struct g_consumer *);
static int g_part_apm_resize(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static kobj_method_t g_part_apm_methods[] = {
KOBJMETHOD(g_part_add, g_part_apm_add),
KOBJMETHOD(g_part_create, g_part_apm_create),
KOBJMETHOD(g_part_destroy, g_part_apm_destroy),
KOBJMETHOD(g_part_dumpconf, g_part_apm_dumpconf),
KOBJMETHOD(g_part_dumpto, g_part_apm_dumpto),
KOBJMETHOD(g_part_modify, g_part_apm_modify),
KOBJMETHOD(g_part_resize, g_part_apm_resize),
KOBJMETHOD(g_part_name, g_part_apm_name),
KOBJMETHOD(g_part_probe, g_part_apm_probe),
KOBJMETHOD(g_part_read, g_part_apm_read),
KOBJMETHOD(g_part_type, g_part_apm_type),
KOBJMETHOD(g_part_write, g_part_apm_write),
{ 0, 0 }
};
static struct g_part_scheme g_part_apm_scheme = {
"APM",
g_part_apm_methods,
sizeof(struct g_part_apm_table),
.gps_entrysz = sizeof(struct g_part_apm_entry),
.gps_minent = 16,
.gps_maxent = 4096,
};
G_PART_SCHEME_DECLARE(g_part_apm);
MODULE_VERSION(geom_part_apm, 0);
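/* Swap adjacent bytes in the buffer; TiVo Series 1 disks store the partition map byte-swapped. */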
static void
swab(char *buf, size_t bufsz)
{
int i;
char ch;
for (i = 0; i < bufsz; i += 2) {
ch = buf[i];
buf[i] = buf[i + 1];
buf[i + 1] = ch;
}
}
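/* Convert a partition type alias, or a '!'-prefixed raw string, into the on-disk APM type. */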
static int
apm_parse_type(const char *type, char *buf, size_t bufsz)
{
const char *alias;
if (type[0] == '!') {
type++;
if (strlen(type) > bufsz)
return (EINVAL);
if (!strcmp(type, APM_ENT_TYPE_SELF) ||
!strcmp(type, APM_ENT_TYPE_UNUSED))
return (EINVAL);
strncpy(buf, type, bufsz);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_APPLE_BOOT);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_APPLE_BOOT);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_APPLE_HFS);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_APPLE_HFS);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_APPLE_UFS);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_APPLE_UFS);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD_NANDFS);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD_SWAP);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD_UFS);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD_VINUM);
return (0);
}
alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS);
if (!strcasecmp(type, alias)) {
strcpy(buf, APM_ENT_TYPE_FREEBSD_ZFS);
return (0);
}
return (EINVAL);
}
static int
apm_read_ent(struct g_consumer *cp, uint32_t blk, struct apm_ent *ent,
int tivo_series1)
{
struct g_provider *pp;
char *buf;
int error;
pp = cp->provider;
buf = g_read_data(cp, pp->sectorsize * blk, pp->sectorsize, &error);
if (buf == NULL)
return (error);
if (tivo_series1)
swab(buf, pp->sectorsize);
ent->ent_sig = be16dec(buf);
ent->ent_pmblkcnt = be32dec(buf + 4);
ent->ent_start = be32dec(buf + 8);
ent->ent_size = be32dec(buf + 12);
bcopy(buf + 16, ent->ent_name, sizeof(ent->ent_name));
bcopy(buf + 48, ent->ent_type, sizeof(ent->ent_type));
g_free(buf);
return (0);
}
static int
g_part_apm_add(struct g_part_table *basetable, struct g_part_entry *baseentry,
struct g_part_parms *gpp)
{
struct g_part_apm_entry *entry;
struct g_part_apm_table *table;
int error;
entry = (struct g_part_apm_entry *)baseentry;
table = (struct g_part_apm_table *)basetable;
entry->ent.ent_sig = APM_ENT_SIG;
entry->ent.ent_pmblkcnt = table->self.ent_pmblkcnt;
entry->ent.ent_start = gpp->gpp_start;
entry->ent.ent_size = gpp->gpp_size;
if (baseentry->gpe_deleted) {
bzero(entry->ent.ent_type, sizeof(entry->ent.ent_type));
bzero(entry->ent.ent_name, sizeof(entry->ent.ent_name));
}
error = apm_parse_type(gpp->gpp_type, entry->ent.ent_type,
sizeof(entry->ent.ent_type));
if (error)
return (error);
if (gpp->gpp_parms & G_PART_PARM_LABEL) {
if (strlen(gpp->gpp_label) > sizeof(entry->ent.ent_name))
return (EINVAL);
strncpy(entry->ent.ent_name, gpp->gpp_label,
sizeof(entry->ent.ent_name));
}
if (baseentry->gpe_index >= table->self.ent_pmblkcnt)
table->self.ent_pmblkcnt = baseentry->gpe_index + 1;
KASSERT(table->self.ent_size >= table->self.ent_pmblkcnt,
("%s", __func__));
KASSERT(table->self.ent_size > baseentry->gpe_index,
("%s", __func__));
return (0);
}
static int
g_part_apm_create(struct g_part_table *basetable, struct g_part_parms *gpp)
{
struct g_provider *pp;
struct g_part_apm_table *table;
uint32_t last;
/* We don't nest, which means that our depth should be 0. */
if (basetable->gpt_depth != 0)
return (ENXIO);
table = (struct g_part_apm_table *)basetable;
pp = gpp->gpp_provider;
if (pp->sectorsize != 512 ||
pp->mediasize < (2 + 2 * basetable->gpt_entries) * pp->sectorsize)
return (ENOSPC);
/* APM uses 32-bit LBAs. */
last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1;
basetable->gpt_first = 2 + basetable->gpt_entries;
basetable->gpt_last = last;
table->ddr.ddr_sig = APM_DDR_SIG;
table->ddr.ddr_blksize = pp->sectorsize;
table->ddr.ddr_blkcount = last + 1;
table->self.ent_sig = APM_ENT_SIG;
table->self.ent_pmblkcnt = basetable->gpt_entries + 1;
table->self.ent_start = 1;
table->self.ent_size = table->self.ent_pmblkcnt;
strcpy(table->self.ent_name, "Apple");
strcpy(table->self.ent_type, APM_ENT_TYPE_SELF);
return (0);
}
static int
g_part_apm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
{
/* Wipe the first 2 sectors to clear the partitioning. */
basetable->gpt_smhead |= 3;
return (0);
}
static void
g_part_apm_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry,
struct sbuf *sb, const char *indent)
{
union {
char name[APM_ENT_NAMELEN + 1];
char type[APM_ENT_TYPELEN + 1];
} u;
struct g_part_apm_entry *entry;
entry = (struct g_part_apm_entry *)baseentry;
if (indent == NULL) {
/* conftxt: libdisk compatibility */
sbuf_printf(sb, " xs APPLE xt %s", entry->ent.ent_type);
} else if (entry != NULL) {
/* confxml: partition entry information */
strncpy(u.name, entry->ent.ent_name, APM_ENT_NAMELEN);
u.name[APM_ENT_NAMELEN] = '\0';
sbuf_printf(sb, "%s<label>", indent);
g_conf_cat_escaped(sb, u.name);
sbuf_cat(sb, "</label>\n");
strncpy(u.type, entry->ent.ent_type, APM_ENT_TYPELEN);
u.type[APM_ENT_TYPELEN] = '\0';
sbuf_printf(sb, "%s<rawtype>", indent);
g_conf_cat_escaped(sb, u.type);
sbuf_cat(sb, "</rawtype>\n");
} else {
/* confxml: scheme information */
}
}
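/* Only FreeBSD swap partitions are acceptable dump targets. */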
static int
g_part_apm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
{
struct g_part_apm_entry *entry;
entry = (struct g_part_apm_entry *)baseentry;
return ((!strcmp(entry->ent.ent_type, APM_ENT_TYPE_FREEBSD_SWAP))
? 1 : 0);
}
static int
g_part_apm_modify(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
struct g_part_apm_entry *entry;
int error;
entry = (struct g_part_apm_entry *)baseentry;
if (gpp->gpp_parms & G_PART_PARM_LABEL) {
if (strlen(gpp->gpp_label) > sizeof(entry->ent.ent_name))
return (EINVAL);
}
if (gpp->gpp_parms & G_PART_PARM_TYPE) {
error = apm_parse_type(gpp->gpp_type, entry->ent.ent_type,
sizeof(entry->ent.ent_type));
if (error)
return (error);
}
if (gpp->gpp_parms & G_PART_PARM_LABEL) {
strncpy(entry->ent.ent_name, gpp->gpp_label,
sizeof(entry->ent.ent_name));
}
return (0);
}
static int
g_part_apm_resize(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
struct g_part_apm_entry *entry;
struct g_provider *pp;
if (baseentry == NULL) {
pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize,
UINT32_MAX) - 1;
return (0);
}
entry = (struct g_part_apm_entry *)baseentry;
baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1;
entry->ent.ent_size = gpp->gpp_size;
return (0);
}
static const char *
g_part_apm_name(struct g_part_table *table, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
snprintf(buf, bufsz, "s%d", baseentry->gpe_index + 1);
return (buf);
}
static int
g_part_apm_probe(struct g_part_table *basetable, struct g_consumer *cp)
{
struct g_provider *pp;
struct g_part_apm_table *table;
char *buf;
int error;
/* We don't nest, which means that our depth should be 0. */
if (basetable->gpt_depth != 0)
return (ENXIO);
table = (struct g_part_apm_table *)basetable;
table->tivo_series1 = 0;
pp = cp->provider;
/* Sanity-check the provider. */
if (pp->mediasize < 4 * pp->sectorsize)
return (ENOSPC);
/* Check that there's a Driver Descriptor Record (DDR). */
buf = g_read_data(cp, 0L, pp->sectorsize, &error);
if (buf == NULL)
return (error);
if (be16dec(buf) == APM_DDR_SIG) {
/* Normal Apple DDR */
table->ddr.ddr_sig = be16dec(buf);
table->ddr.ddr_blksize = be16dec(buf + 2);
table->ddr.ddr_blkcount = be32dec(buf + 4);
g_free(buf);
if (table->ddr.ddr_blksize != pp->sectorsize)
return (ENXIO);
if (table->ddr.ddr_blkcount > pp->mediasize / pp->sectorsize)
return (ENXIO);
} else {
/*
* Check for TiVo drives, which have no DDR and a different
* signature. Those whose first two bytes are 14 92 are
* Series 2 drives and aren't supported. Those that start
* with 92 14 are Series 1 drives and are supported.
*/
if (be16dec(buf) != 0x9214) {
/* If this is 0x1492 it could be a Series 2 drive. */
g_free(buf);
return (ENXIO);
}
table->ddr.ddr_sig = APM_DDR_SIG; /* XXX */
table->ddr.ddr_blksize = pp->sectorsize; /* XXX */
table->ddr.ddr_blkcount =
MIN(pp->mediasize / pp->sectorsize, UINT32_MAX);
table->tivo_series1 = 1;
g_free(buf);
}
/* Check that there's a Partition Map. */
error = apm_read_ent(cp, 1, &table->self, table->tivo_series1);
if (error)
return (error);
if (table->self.ent_sig != APM_ENT_SIG)
return (ENXIO);
if (strcmp(table->self.ent_type, APM_ENT_TYPE_SELF))
return (ENXIO);
if (table->self.ent_pmblkcnt >= table->ddr.ddr_blkcount)
return (ENXIO);
return (G_PART_PROBE_PRI_NORM);
}
static int
g_part_apm_read(struct g_part_table *basetable, struct g_consumer *cp)
{
struct apm_ent ent;
struct g_part_apm_entry *entry;
struct g_part_apm_table *table;
int error, index;
table = (struct g_part_apm_table *)basetable;
basetable->gpt_first = table->self.ent_size + 1;
basetable->gpt_last = table->ddr.ddr_blkcount - 1;
basetable->gpt_entries = table->self.ent_size - 1;
for (index = table->self.ent_pmblkcnt - 1; index > 0; index--) {
error = apm_read_ent(cp, index + 1, &ent, table->tivo_series1);
if (error)
continue;
if (!strcmp(ent.ent_type, APM_ENT_TYPE_UNUSED))
continue;
entry = (struct g_part_apm_entry *)g_part_new_entry(basetable,
index, ent.ent_start, ent.ent_start + ent.ent_size - 1);
entry->ent = ent;
}
return (0);
}
static const char *
g_part_apm_type(struct g_part_table *basetable, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
struct g_part_apm_entry *entry;
const char *type;
size_t len;
entry = (struct g_part_apm_entry *)baseentry;
type = entry->ent.ent_type;
if (!strcmp(type, APM_ENT_TYPE_APPLE_BOOT))
return (g_part_alias_name(G_PART_ALIAS_APPLE_BOOT));
if (!strcmp(type, APM_ENT_TYPE_APPLE_HFS))
return (g_part_alias_name(G_PART_ALIAS_APPLE_HFS));
if (!strcmp(type, APM_ENT_TYPE_APPLE_UFS))
return (g_part_alias_name(G_PART_ALIAS_APPLE_UFS));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD_NANDFS))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD_SWAP))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD_UFS))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD_VINUM))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM));
if (!strcmp(type, APM_ENT_TYPE_FREEBSD_ZFS))
return (g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS));
buf[0] = '!';
len = MIN(sizeof(entry->ent.ent_type), bufsz - 2);
bcopy(type, buf + 1, len);
buf[len + 1] = '\0';
return (buf);
}
static int
g_part_apm_write(struct g_part_table *basetable, struct g_consumer *cp)
{
struct g_provider *pp;
struct g_part_entry *baseentry;
struct g_part_apm_entry *entry;
struct g_part_apm_table *table;
char *buf, *ptr;
uint32_t index;
int error;
size_t tblsz;
pp = cp->provider;
table = (struct g_part_apm_table *)basetable;
/*
* TiVo Series 1 disk partitions are currently read-only.
*/
if (table->tivo_series1)
return (EOPNOTSUPP);
/* Write the DDR only when we're newly created. */
if (basetable->gpt_created) {
buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
be16enc(buf, table->ddr.ddr_sig);
be16enc(buf + 2, table->ddr.ddr_blksize);
be32enc(buf + 4, table->ddr.ddr_blkcount);
error = g_write_data(cp, 0, buf, pp->sectorsize);
g_free(buf);
if (error)
return (error);
}
/* Allocate the buffer for all entries */
tblsz = table->self.ent_pmblkcnt;
buf = g_malloc(tblsz * pp->sectorsize, M_WAITOK | M_ZERO);
/* Fill the self entry */
be16enc(buf, APM_ENT_SIG);
be32enc(buf + 4, table->self.ent_pmblkcnt);
be32enc(buf + 8, table->self.ent_start);
be32enc(buf + 12, table->self.ent_size);
bcopy(table->self.ent_name, buf + 16, sizeof(table->self.ent_name));
bcopy(table->self.ent_type, buf + 48, sizeof(table->self.ent_type));
baseentry = LIST_FIRST(&basetable->gpt_entry);
for (index = 1; index < tblsz; index++) {
entry = (baseentry != NULL && index == baseentry->gpe_index)
? (struct g_part_apm_entry *)baseentry : NULL;
ptr = buf + index * pp->sectorsize;
be16enc(ptr, APM_ENT_SIG);
be32enc(ptr + 4, table->self.ent_pmblkcnt);
if (entry != NULL && !baseentry->gpe_deleted) {
be32enc(ptr + 8, entry->ent.ent_start);
be32enc(ptr + 12, entry->ent.ent_size);
bcopy(entry->ent.ent_name, ptr + 16,
sizeof(entry->ent.ent_name));
bcopy(entry->ent.ent_type, ptr + 48,
sizeof(entry->ent.ent_type));
} else {
strcpy(ptr + 48, APM_ENT_TYPE_UNUSED);
}
if (entry != NULL)
baseentry = LIST_NEXT(baseentry, gpe_entry);
}
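/* Write the partition map one chunk at a time, never more than maxphys bytes per write. */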
- for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) {
+ for (index = 0; index < tblsz; index += maxphys / pp->sectorsize) {
error = g_write_data(cp, (1 + index) * pp->sectorsize,
buf + index * pp->sectorsize,
- (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS:
+ (tblsz - index > maxphys / pp->sectorsize) ? maxphys:
(tblsz - index) * pp->sectorsize);
if (error) {
g_free(buf);
return (error);
}
}
g_free(buf);
return (0);
}
diff --git a/sys/geom/part/g_part_gpt.c b/sys/geom/part/g_part_gpt.c
index c9eef1c8f715..89a92977dff6 100644
--- a/sys/geom/part/g_part_gpt.c
+++ b/sys/geom/part/g_part_gpt.c
@@ -1,1441 +1,1441 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2002, 2005-2007, 2011 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/diskmbr.h>
#include <sys/gsb_crc32.h>
#include <sys/endian.h>
#include <sys/gpt.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/uuid.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <geom/part/g_part.h>
#include "g_part_if.h"
FEATURE(geom_part_gpt, "GEOM partitioning class for GPT partitions support");
SYSCTL_DECL(_kern_geom_part);
static SYSCTL_NODE(_kern_geom_part, OID_AUTO, gpt,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_PART_GPT GUID Partition Table");
static u_int allow_nesting = 0;
SYSCTL_UINT(_kern_geom_part_gpt, OID_AUTO, allow_nesting,
CTLFLAG_RWTUN, &allow_nesting, 0, "Allow GPT to be nested inside other schemes");
CTASSERT(offsetof(struct gpt_hdr, padding) == 92);
CTASSERT(sizeof(struct gpt_ent) == 128);
extern u_int geom_part_check_integrity;
#define EQUUID(a,b) (memcmp(a, b, sizeof(struct uuid)) == 0)
#define MBRSIZE 512
enum gpt_elt {
GPT_ELT_PRIHDR,
GPT_ELT_PRITBL,
GPT_ELT_SECHDR,
GPT_ELT_SECTBL,
GPT_ELT_COUNT
};
enum gpt_state {
GPT_STATE_UNKNOWN, /* Not determined. */
GPT_STATE_MISSING, /* No signature found. */
GPT_STATE_CORRUPT, /* Checksum mismatch. */
GPT_STATE_INVALID, /* Nonconformant/invalid. */
GPT_STATE_OK /* Perfectly fine. */
};
struct g_part_gpt_table {
struct g_part_table base;
u_char mbr[MBRSIZE];
struct gpt_hdr *hdr;
quad_t lba[GPT_ELT_COUNT];
enum gpt_state state[GPT_ELT_COUNT];
int bootcamp;
};
struct g_part_gpt_entry {
struct g_part_entry base;
struct gpt_ent ent;
};
static void g_gpt_printf_utf16(struct sbuf *, uint16_t *, size_t);
static void g_gpt_utf8_to_utf16(const uint8_t *, uint16_t *, size_t);
static void g_gpt_set_defaults(struct g_part_table *, struct g_provider *);
static int g_part_gpt_add(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static int g_part_gpt_bootcode(struct g_part_table *, struct g_part_parms *);
static int g_part_gpt_create(struct g_part_table *, struct g_part_parms *);
static int g_part_gpt_destroy(struct g_part_table *, struct g_part_parms *);
static void g_part_gpt_dumpconf(struct g_part_table *, struct g_part_entry *,
struct sbuf *, const char *);
static int g_part_gpt_dumpto(struct g_part_table *, struct g_part_entry *);
static int g_part_gpt_modify(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static const char *g_part_gpt_name(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_gpt_probe(struct g_part_table *, struct g_consumer *);
static int g_part_gpt_read(struct g_part_table *, struct g_consumer *);
static int g_part_gpt_setunset(struct g_part_table *table,
struct g_part_entry *baseentry, const char *attrib, unsigned int set);
static const char *g_part_gpt_type(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_gpt_write(struct g_part_table *, struct g_consumer *);
static int g_part_gpt_resize(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static int g_part_gpt_recover(struct g_part_table *);
static kobj_method_t g_part_gpt_methods[] = {
KOBJMETHOD(g_part_add, g_part_gpt_add),
KOBJMETHOD(g_part_bootcode, g_part_gpt_bootcode),
KOBJMETHOD(g_part_create, g_part_gpt_create),
KOBJMETHOD(g_part_destroy, g_part_gpt_destroy),
KOBJMETHOD(g_part_dumpconf, g_part_gpt_dumpconf),
KOBJMETHOD(g_part_dumpto, g_part_gpt_dumpto),
KOBJMETHOD(g_part_modify, g_part_gpt_modify),
KOBJMETHOD(g_part_resize, g_part_gpt_resize),
KOBJMETHOD(g_part_name, g_part_gpt_name),
KOBJMETHOD(g_part_probe, g_part_gpt_probe),
KOBJMETHOD(g_part_read, g_part_gpt_read),
KOBJMETHOD(g_part_recover, g_part_gpt_recover),
KOBJMETHOD(g_part_setunset, g_part_gpt_setunset),
KOBJMETHOD(g_part_type, g_part_gpt_type),
KOBJMETHOD(g_part_write, g_part_gpt_write),
{ 0, 0 }
};
static struct g_part_scheme g_part_gpt_scheme = {
"GPT",
g_part_gpt_methods,
sizeof(struct g_part_gpt_table),
.gps_entrysz = sizeof(struct g_part_gpt_entry),
.gps_minent = 128,
.gps_maxent = 4096,
.gps_bootcodesz = MBRSIZE,
};
G_PART_SCHEME_DECLARE(g_part_gpt);
MODULE_VERSION(geom_part_gpt, 0);
static struct uuid gpt_uuid_apple_apfs = GPT_ENT_TYPE_APPLE_APFS;
static struct uuid gpt_uuid_apple_boot = GPT_ENT_TYPE_APPLE_BOOT;
static struct uuid gpt_uuid_apple_core_storage =
GPT_ENT_TYPE_APPLE_CORE_STORAGE;
static struct uuid gpt_uuid_apple_hfs = GPT_ENT_TYPE_APPLE_HFS;
static struct uuid gpt_uuid_apple_label = GPT_ENT_TYPE_APPLE_LABEL;
static struct uuid gpt_uuid_apple_raid = GPT_ENT_TYPE_APPLE_RAID;
static struct uuid gpt_uuid_apple_raid_offline = GPT_ENT_TYPE_APPLE_RAID_OFFLINE;
static struct uuid gpt_uuid_apple_tv_recovery = GPT_ENT_TYPE_APPLE_TV_RECOVERY;
static struct uuid gpt_uuid_apple_ufs = GPT_ENT_TYPE_APPLE_UFS;
static struct uuid gpt_uuid_apple_zfs = GPT_ENT_TYPE_APPLE_ZFS;
static struct uuid gpt_uuid_bios_boot = GPT_ENT_TYPE_BIOS_BOOT;
static struct uuid gpt_uuid_chromeos_firmware = GPT_ENT_TYPE_CHROMEOS_FIRMWARE;
static struct uuid gpt_uuid_chromeos_kernel = GPT_ENT_TYPE_CHROMEOS_KERNEL;
static struct uuid gpt_uuid_chromeos_reserved = GPT_ENT_TYPE_CHROMEOS_RESERVED;
static struct uuid gpt_uuid_chromeos_root = GPT_ENT_TYPE_CHROMEOS_ROOT;
static struct uuid gpt_uuid_dfbsd_ccd = GPT_ENT_TYPE_DRAGONFLY_CCD;
static struct uuid gpt_uuid_dfbsd_hammer = GPT_ENT_TYPE_DRAGONFLY_HAMMER;
static struct uuid gpt_uuid_dfbsd_hammer2 = GPT_ENT_TYPE_DRAGONFLY_HAMMER2;
static struct uuid gpt_uuid_dfbsd_label32 = GPT_ENT_TYPE_DRAGONFLY_LABEL32;
static struct uuid gpt_uuid_dfbsd_label64 = GPT_ENT_TYPE_DRAGONFLY_LABEL64;
static struct uuid gpt_uuid_dfbsd_legacy = GPT_ENT_TYPE_DRAGONFLY_LEGACY;
static struct uuid gpt_uuid_dfbsd_swap = GPT_ENT_TYPE_DRAGONFLY_SWAP;
static struct uuid gpt_uuid_dfbsd_ufs1 = GPT_ENT_TYPE_DRAGONFLY_UFS1;
static struct uuid gpt_uuid_dfbsd_vinum = GPT_ENT_TYPE_DRAGONFLY_VINUM;
static struct uuid gpt_uuid_efi = GPT_ENT_TYPE_EFI;
static struct uuid gpt_uuid_freebsd = GPT_ENT_TYPE_FREEBSD;
static struct uuid gpt_uuid_freebsd_boot = GPT_ENT_TYPE_FREEBSD_BOOT;
static struct uuid gpt_uuid_freebsd_nandfs = GPT_ENT_TYPE_FREEBSD_NANDFS;
static struct uuid gpt_uuid_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP;
static struct uuid gpt_uuid_freebsd_ufs = GPT_ENT_TYPE_FREEBSD_UFS;
static struct uuid gpt_uuid_freebsd_vinum = GPT_ENT_TYPE_FREEBSD_VINUM;
static struct uuid gpt_uuid_freebsd_zfs = GPT_ENT_TYPE_FREEBSD_ZFS;
static struct uuid gpt_uuid_linux_data = GPT_ENT_TYPE_LINUX_DATA;
static struct uuid gpt_uuid_linux_lvm = GPT_ENT_TYPE_LINUX_LVM;
static struct uuid gpt_uuid_linux_raid = GPT_ENT_TYPE_LINUX_RAID;
static struct uuid gpt_uuid_linux_swap = GPT_ENT_TYPE_LINUX_SWAP;
static struct uuid gpt_uuid_mbr = GPT_ENT_TYPE_MBR;
static struct uuid gpt_uuid_ms_basic_data = GPT_ENT_TYPE_MS_BASIC_DATA;
static struct uuid gpt_uuid_ms_ldm_data = GPT_ENT_TYPE_MS_LDM_DATA;
static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA;
static struct uuid gpt_uuid_ms_recovery = GPT_ENT_TYPE_MS_RECOVERY;
static struct uuid gpt_uuid_ms_reserved = GPT_ENT_TYPE_MS_RESERVED;
static struct uuid gpt_uuid_ms_spaces = GPT_ENT_TYPE_MS_SPACES;
static struct uuid gpt_uuid_netbsd_ccd = GPT_ENT_TYPE_NETBSD_CCD;
static struct uuid gpt_uuid_netbsd_cgd = GPT_ENT_TYPE_NETBSD_CGD;
static struct uuid gpt_uuid_netbsd_ffs = GPT_ENT_TYPE_NETBSD_FFS;
static struct uuid gpt_uuid_netbsd_lfs = GPT_ENT_TYPE_NETBSD_LFS;
static struct uuid gpt_uuid_netbsd_raid = GPT_ENT_TYPE_NETBSD_RAID;
static struct uuid gpt_uuid_netbsd_swap = GPT_ENT_TYPE_NETBSD_SWAP;
static struct uuid gpt_uuid_openbsd_data = GPT_ENT_TYPE_OPENBSD_DATA;
static struct uuid gpt_uuid_prep_boot = GPT_ENT_TYPE_PREP_BOOT;
static struct uuid gpt_uuid_solaris_boot = GPT_ENT_TYPE_SOLARIS_BOOT;
static struct uuid gpt_uuid_solaris_root = GPT_ENT_TYPE_SOLARIS_ROOT;
static struct uuid gpt_uuid_solaris_swap = GPT_ENT_TYPE_SOLARIS_SWAP;
static struct uuid gpt_uuid_solaris_backup = GPT_ENT_TYPE_SOLARIS_BACKUP;
static struct uuid gpt_uuid_solaris_var = GPT_ENT_TYPE_SOLARIS_VAR;
static struct uuid gpt_uuid_solaris_home = GPT_ENT_TYPE_SOLARIS_HOME;
static struct uuid gpt_uuid_solaris_altsec = GPT_ENT_TYPE_SOLARIS_ALTSEC;
static struct uuid gpt_uuid_solaris_reserved = GPT_ENT_TYPE_SOLARIS_RESERVED;
static struct uuid gpt_uuid_unused = GPT_ENT_TYPE_UNUSED;
static struct uuid gpt_uuid_vmfs = GPT_ENT_TYPE_VMFS;
static struct uuid gpt_uuid_vmkdiag = GPT_ENT_TYPE_VMKDIAG;
static struct uuid gpt_uuid_vmreserved = GPT_ENT_TYPE_VMRESERVED;
static struct uuid gpt_uuid_vmvsanhdr = GPT_ENT_TYPE_VMVSANHDR;
static struct g_part_uuid_alias {
struct uuid *uuid;
int alias;
int mbrtype;
} gpt_uuid_alias_match[] = {
{ &gpt_uuid_apple_apfs, G_PART_ALIAS_APPLE_APFS, 0 },
{ &gpt_uuid_apple_boot, G_PART_ALIAS_APPLE_BOOT, 0xab },
{ &gpt_uuid_apple_core_storage, G_PART_ALIAS_APPLE_CORE_STORAGE, 0 },
{ &gpt_uuid_apple_hfs, G_PART_ALIAS_APPLE_HFS, 0xaf },
{ &gpt_uuid_apple_label, G_PART_ALIAS_APPLE_LABEL, 0 },
{ &gpt_uuid_apple_raid, G_PART_ALIAS_APPLE_RAID, 0 },
{ &gpt_uuid_apple_raid_offline, G_PART_ALIAS_APPLE_RAID_OFFLINE, 0 },
{ &gpt_uuid_apple_tv_recovery, G_PART_ALIAS_APPLE_TV_RECOVERY, 0 },
{ &gpt_uuid_apple_ufs, G_PART_ALIAS_APPLE_UFS, 0 },
{ &gpt_uuid_apple_zfs, G_PART_ALIAS_APPLE_ZFS, 0 },
{ &gpt_uuid_bios_boot, G_PART_ALIAS_BIOS_BOOT, 0 },
{ &gpt_uuid_chromeos_firmware, G_PART_ALIAS_CHROMEOS_FIRMWARE, 0 },
{ &gpt_uuid_chromeos_kernel, G_PART_ALIAS_CHROMEOS_KERNEL, 0 },
{ &gpt_uuid_chromeos_reserved, G_PART_ALIAS_CHROMEOS_RESERVED, 0 },
{ &gpt_uuid_chromeos_root, G_PART_ALIAS_CHROMEOS_ROOT, 0 },
{ &gpt_uuid_dfbsd_ccd, G_PART_ALIAS_DFBSD_CCD, 0 },
{ &gpt_uuid_dfbsd_hammer, G_PART_ALIAS_DFBSD_HAMMER, 0 },
{ &gpt_uuid_dfbsd_hammer2, G_PART_ALIAS_DFBSD_HAMMER2, 0 },
{ &gpt_uuid_dfbsd_label32, G_PART_ALIAS_DFBSD, 0xa5 },
{ &gpt_uuid_dfbsd_label64, G_PART_ALIAS_DFBSD64, 0xa5 },
{ &gpt_uuid_dfbsd_legacy, G_PART_ALIAS_DFBSD_LEGACY, 0 },
{ &gpt_uuid_dfbsd_swap, G_PART_ALIAS_DFBSD_SWAP, 0 },
{ &gpt_uuid_dfbsd_ufs1, G_PART_ALIAS_DFBSD_UFS, 0 },
{ &gpt_uuid_dfbsd_vinum, G_PART_ALIAS_DFBSD_VINUM, 0 },
{ &gpt_uuid_efi, G_PART_ALIAS_EFI, 0xee },
{ &gpt_uuid_freebsd, G_PART_ALIAS_FREEBSD, 0xa5 },
{ &gpt_uuid_freebsd_boot, G_PART_ALIAS_FREEBSD_BOOT, 0 },
{ &gpt_uuid_freebsd_nandfs, G_PART_ALIAS_FREEBSD_NANDFS, 0 },
{ &gpt_uuid_freebsd_swap, G_PART_ALIAS_FREEBSD_SWAP, 0 },
{ &gpt_uuid_freebsd_ufs, G_PART_ALIAS_FREEBSD_UFS, 0 },
{ &gpt_uuid_freebsd_vinum, G_PART_ALIAS_FREEBSD_VINUM, 0 },
{ &gpt_uuid_freebsd_zfs, G_PART_ALIAS_FREEBSD_ZFS, 0 },
{ &gpt_uuid_linux_data, G_PART_ALIAS_LINUX_DATA, 0x0b },
{ &gpt_uuid_linux_lvm, G_PART_ALIAS_LINUX_LVM, 0 },
{ &gpt_uuid_linux_raid, G_PART_ALIAS_LINUX_RAID, 0 },
{ &gpt_uuid_linux_swap, G_PART_ALIAS_LINUX_SWAP, 0 },
{ &gpt_uuid_mbr, G_PART_ALIAS_MBR, 0 },
{ &gpt_uuid_ms_basic_data, G_PART_ALIAS_MS_BASIC_DATA, 0x0b },
{ &gpt_uuid_ms_ldm_data, G_PART_ALIAS_MS_LDM_DATA, 0 },
{ &gpt_uuid_ms_ldm_metadata, G_PART_ALIAS_MS_LDM_METADATA, 0 },
{ &gpt_uuid_ms_recovery, G_PART_ALIAS_MS_RECOVERY, 0 },
{ &gpt_uuid_ms_reserved, G_PART_ALIAS_MS_RESERVED, 0 },
{ &gpt_uuid_ms_spaces, G_PART_ALIAS_MS_SPACES, 0 },
{ &gpt_uuid_netbsd_ccd, G_PART_ALIAS_NETBSD_CCD, 0 },
{ &gpt_uuid_netbsd_cgd, G_PART_ALIAS_NETBSD_CGD, 0 },
{ &gpt_uuid_netbsd_ffs, G_PART_ALIAS_NETBSD_FFS, 0 },
{ &gpt_uuid_netbsd_lfs, G_PART_ALIAS_NETBSD_LFS, 0 },
{ &gpt_uuid_netbsd_raid, G_PART_ALIAS_NETBSD_RAID, 0 },
{ &gpt_uuid_netbsd_swap, G_PART_ALIAS_NETBSD_SWAP, 0 },
{ &gpt_uuid_openbsd_data, G_PART_ALIAS_OPENBSD_DATA, 0 },
{ &gpt_uuid_prep_boot, G_PART_ALIAS_PREP_BOOT, 0x41 },
{ &gpt_uuid_solaris_boot, G_PART_ALIAS_SOLARIS_BOOT, 0 },
{ &gpt_uuid_solaris_root, G_PART_ALIAS_SOLARIS_ROOT, 0 },
{ &gpt_uuid_solaris_swap, G_PART_ALIAS_SOLARIS_SWAP, 0 },
{ &gpt_uuid_solaris_backup, G_PART_ALIAS_SOLARIS_BACKUP, 0 },
{ &gpt_uuid_solaris_var, G_PART_ALIAS_SOLARIS_VAR, 0 },
{ &gpt_uuid_solaris_home, G_PART_ALIAS_SOLARIS_HOME, 0 },
{ &gpt_uuid_solaris_altsec, G_PART_ALIAS_SOLARIS_ALTSEC, 0 },
{ &gpt_uuid_solaris_reserved, G_PART_ALIAS_SOLARIS_RESERVED, 0 },
{ &gpt_uuid_vmfs, G_PART_ALIAS_VMFS, 0 },
{ &gpt_uuid_vmkdiag, G_PART_ALIAS_VMKDIAG, 0 },
{ &gpt_uuid_vmreserved, G_PART_ALIAS_VMRESERVED, 0 },
{ &gpt_uuid_vmvsanhdr, G_PART_ALIAS_VMVSANHDR, 0 },
{ NULL, 0, 0 }
};
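/* Encode a single MBR slice entry of the given type spanning LBAs start through end. */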
static int
gpt_write_mbr_entry(u_char *mbr, int idx, int typ, quad_t start,
quad_t end)
{
if (typ == 0 || start > UINT32_MAX || end > UINT32_MAX)
return (EINVAL);
mbr += DOSPARTOFF + idx * DOSPARTSIZE;
mbr[0] = 0;
if (start == 1) {
/*
* Treat the PMBR partition specially to maximize
* interoperability with BIOSes.
*/
mbr[1] = mbr[3] = 0;
mbr[2] = 2;
} else
mbr[1] = mbr[2] = mbr[3] = 0xff;
mbr[4] = typ;
mbr[5] = mbr[6] = mbr[7] = 0xff;
le32enc(mbr + 8, (uint32_t)start);
le32enc(mbr + 12, (uint32_t)(end - start + 1));
return (0);
}
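/* Map a GPT type UUID to the corresponding legacy MBR partition type, or 0 if none. */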
static int
gpt_map_type(struct uuid *t)
{
struct g_part_uuid_alias *uap;
for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++) {
if (EQUUID(t, uap->uuid))
return (uap->mbrtype);
}
return (0);
}
static void
gpt_create_pmbr(struct g_part_gpt_table *table, struct g_provider *pp)
{
bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART);
gpt_write_mbr_entry(table->mbr, 0, 0xee, 1,
MIN(pp->mediasize / pp->sectorsize - 1, UINT32_MAX));
le16enc(table->mbr + DOSMAGICOFFSET, DOSMAGIC);
}
/*
* Under Boot Camp the PMBR partition (type 0xEE) doesn't cover the
* whole disk anymore. Rather, it covers the GPT table and the EFI
* system partition only. This way the HFS+ partition and any FAT
* partitions can be added to the MBR without creating an overlap.
*/
static int
gpt_is_bootcamp(struct g_part_gpt_table *table, const char *provname)
{
uint8_t *p;
p = table->mbr + DOSPARTOFF;
if (p[4] != 0xee || le32dec(p + 8) != 1)
return (0);
p += DOSPARTSIZE;
if (p[4] != 0xaf)
return (0);
printf("GEOM: %s: enabling Boot Camp\n", provname);
return (1);
}
static void
gpt_update_bootcamp(struct g_part_table *basetable, struct g_provider *pp)
{
struct g_part_entry *baseentry;
struct g_part_gpt_entry *entry;
struct g_part_gpt_table *table;
int bootable, error, index, slices, typ;
table = (struct g_part_gpt_table *)basetable;
bootable = -1;
for (index = 0; index < NDOSPART; index++) {
if (table->mbr[DOSPARTOFF + DOSPARTSIZE * index])
bootable = index;
}
bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART);
slices = 0;
LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) {
if (baseentry->gpe_deleted)
continue;
index = baseentry->gpe_index - 1;
if (index >= NDOSPART)
continue;
entry = (struct g_part_gpt_entry *)baseentry;
switch (index) {
case 0: /* This must be the EFI system partition. */
if (!EQUUID(&entry->ent.ent_type, &gpt_uuid_efi))
goto disable;
error = gpt_write_mbr_entry(table->mbr, index, 0xee,
1ull, entry->ent.ent_lba_end);
break;
case 1: /* This must be the HFS+ partition. */
if (!EQUUID(&entry->ent.ent_type, &gpt_uuid_apple_hfs))
goto disable;
error = gpt_write_mbr_entry(table->mbr, index, 0xaf,
entry->ent.ent_lba_start, entry->ent.ent_lba_end);
break;
default:
typ = gpt_map_type(&entry->ent.ent_type);
error = gpt_write_mbr_entry(table->mbr, index, typ,
entry->ent.ent_lba_start, entry->ent.ent_lba_end);
break;
}
if (error)
continue;
if (index == bootable)
table->mbr[DOSPARTOFF + DOSPARTSIZE * index] = 0x80;
slices |= 1 << index;
}
if ((slices & 3) == 3)
return;
disable:
table->bootcamp = 0;
gpt_create_pmbr(table, pp);
}
static struct gpt_hdr *
gpt_read_hdr(struct g_part_gpt_table *table, struct g_consumer *cp,
enum gpt_elt elt)
{
struct gpt_hdr *buf, *hdr;
struct g_provider *pp;
quad_t lba, last;
int error;
uint32_t crc, sz;
pp = cp->provider;
last = (pp->mediasize / pp->sectorsize) - 1;
table->state[elt] = GPT_STATE_MISSING;
/*
* If the primary header is valid, look for the secondary header
* at its AlternateLBA; otherwise look at the medium's last LBA.
*/
if (elt == GPT_ELT_SECHDR) {
if (table->state[GPT_ELT_PRIHDR] != GPT_STATE_OK)
table->lba[elt] = last;
} else
table->lba[elt] = 1;
buf = g_read_data(cp, table->lba[elt] * pp->sectorsize, pp->sectorsize,
&error);
if (buf == NULL)
return (NULL);
hdr = NULL;
if (memcmp(buf->hdr_sig, GPT_HDR_SIG, sizeof(buf->hdr_sig)) != 0)
goto fail;
table->state[elt] = GPT_STATE_CORRUPT;
sz = le32toh(buf->hdr_size);
if (sz < 92 || sz > pp->sectorsize)
goto fail;
hdr = g_malloc(sz, M_WAITOK | M_ZERO);
bcopy(buf, hdr, sz);
hdr->hdr_size = sz;
crc = le32toh(buf->hdr_crc_self);
buf->hdr_crc_self = 0;
if (crc32(buf, sz) != crc)
goto fail;
hdr->hdr_crc_self = crc;
table->state[elt] = GPT_STATE_INVALID;
hdr->hdr_revision = le32toh(buf->hdr_revision);
if (hdr->hdr_revision < GPT_HDR_REVISION)
goto fail;
hdr->hdr_lba_self = le64toh(buf->hdr_lba_self);
if (hdr->hdr_lba_self != table->lba[elt])
goto fail;
hdr->hdr_lba_alt = le64toh(buf->hdr_lba_alt);
if (hdr->hdr_lba_alt == hdr->hdr_lba_self)
goto fail;
if (hdr->hdr_lba_alt > last && geom_part_check_integrity)
goto fail;
/* Check the managed area. */
hdr->hdr_lba_start = le64toh(buf->hdr_lba_start);
if (hdr->hdr_lba_start < 2 || hdr->hdr_lba_start >= last)
goto fail;
hdr->hdr_lba_end = le64toh(buf->hdr_lba_end);
if (hdr->hdr_lba_end < hdr->hdr_lba_start || hdr->hdr_lba_end >= last)
goto fail;
/* Check the table location and size of the table. */
hdr->hdr_entries = le32toh(buf->hdr_entries);
hdr->hdr_entsz = le32toh(buf->hdr_entsz);
if (hdr->hdr_entries == 0 || hdr->hdr_entsz < 128 ||
(hdr->hdr_entsz & 7) != 0)
goto fail;
hdr->hdr_lba_table = le64toh(buf->hdr_lba_table);
if (hdr->hdr_lba_table < 2 || hdr->hdr_lba_table >= last)
goto fail;
if (hdr->hdr_lba_table >= hdr->hdr_lba_start &&
hdr->hdr_lba_table <= hdr->hdr_lba_end)
goto fail;
lba = hdr->hdr_lba_table +
howmany(hdr->hdr_entries * hdr->hdr_entsz, pp->sectorsize) - 1;
if (lba >= last)
goto fail;
if (lba >= hdr->hdr_lba_start && lba <= hdr->hdr_lba_end)
goto fail;
table->state[elt] = GPT_STATE_OK;
le_uuid_dec(&buf->hdr_uuid, &hdr->hdr_uuid);
hdr->hdr_crc_table = le32toh(buf->hdr_crc_table);
/* save LBA for secondary header */
if (elt == GPT_ELT_PRIHDR)
table->lba[GPT_ELT_SECHDR] = hdr->hdr_lba_alt;
g_free(buf);
return (hdr);
fail:
if (hdr != NULL)
g_free(hdr);
g_free(buf);
return (NULL);
}
static struct gpt_ent *
gpt_read_tbl(struct g_part_gpt_table *table, struct g_consumer *cp,
enum gpt_elt elt, struct gpt_hdr *hdr)
{
struct g_provider *pp;
struct gpt_ent *ent, *tbl;
char *buf, *p;
unsigned int idx, sectors, tblsz, size;
int error;
if (hdr == NULL)
return (NULL);
pp = cp->provider;
table->lba[elt] = hdr->hdr_lba_table;
table->state[elt] = GPT_STATE_MISSING;
tblsz = hdr->hdr_entries * hdr->hdr_entsz;
sectors = howmany(tblsz, pp->sectorsize);
buf = g_malloc(sectors * pp->sectorsize, M_WAITOK | M_ZERO);
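/* Read the table in chunks of at most maxphys bytes each. */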
- for (idx = 0; idx < sectors; idx += MAXPHYS / pp->sectorsize) {
- size = (sectors - idx > MAXPHYS / pp->sectorsize) ? MAXPHYS:
+ for (idx = 0; idx < sectors; idx += maxphys / pp->sectorsize) {
+ size = (sectors - idx > maxphys / pp->sectorsize) ? maxphys:
(sectors - idx) * pp->sectorsize;
p = g_read_data(cp, (table->lba[elt] + idx) * pp->sectorsize,
size, &error);
if (p == NULL) {
g_free(buf);
return (NULL);
}
bcopy(p, buf + idx * pp->sectorsize, size);
g_free(p);
}
table->state[elt] = GPT_STATE_CORRUPT;
if (crc32(buf, tblsz) != hdr->hdr_crc_table) {
g_free(buf);
return (NULL);
}
table->state[elt] = GPT_STATE_OK;
tbl = g_malloc(hdr->hdr_entries * sizeof(struct gpt_ent),
M_WAITOK | M_ZERO);
for (idx = 0, ent = tbl, p = buf;
idx < hdr->hdr_entries;
idx++, ent++, p += hdr->hdr_entsz) {
le_uuid_dec(p, &ent->ent_type);
le_uuid_dec(p + 16, &ent->ent_uuid);
ent->ent_lba_start = le64dec(p + 32);
ent->ent_lba_end = le64dec(p + 40);
ent->ent_attr = le64dec(p + 48);
/* Keep UTF-16 in little-endian. */
bcopy(p + 56, ent->ent_name, sizeof(ent->ent_name));
}
g_free(buf);
return (tbl);
}
static int
gpt_matched_hdrs(struct gpt_hdr *pri, struct gpt_hdr *sec)
{
if (pri == NULL || sec == NULL)
return (0);
if (!EQUUID(&pri->hdr_uuid, &sec->hdr_uuid))
return (0);
return ((pri->hdr_revision == sec->hdr_revision &&
pri->hdr_size == sec->hdr_size &&
pri->hdr_lba_start == sec->hdr_lba_start &&
pri->hdr_lba_end == sec->hdr_lba_end &&
pri->hdr_entries == sec->hdr_entries &&
pri->hdr_entsz == sec->hdr_entsz &&
pri->hdr_crc_table == sec->hdr_crc_table) ? 1 : 0);
}
static int
gpt_parse_type(const char *type, struct uuid *uuid)
{
struct uuid tmp;
const char *alias;
int error;
struct g_part_uuid_alias *uap;
if (type[0] == '!') {
error = parse_uuid(type + 1, &tmp);
if (error)
return (error);
if (EQUUID(&tmp, &gpt_uuid_unused))
return (EINVAL);
*uuid = tmp;
return (0);
}
for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++) {
alias = g_part_alias_name(uap->alias);
if (!strcasecmp(type, alias)) {
*uuid = *uap->uuid;
return (0);
}
}
return (EINVAL);
}
static int
g_part_gpt_add(struct g_part_table *basetable, struct g_part_entry *baseentry,
struct g_part_parms *gpp)
{
struct g_part_gpt_entry *entry;
int error;
entry = (struct g_part_gpt_entry *)baseentry;
error = gpt_parse_type(gpp->gpp_type, &entry->ent.ent_type);
if (error)
return (error);
kern_uuidgen(&entry->ent.ent_uuid, 1);
entry->ent.ent_lba_start = baseentry->gpe_start;
entry->ent.ent_lba_end = baseentry->gpe_end;
if (baseentry->gpe_deleted) {
entry->ent.ent_attr = 0;
bzero(entry->ent.ent_name, sizeof(entry->ent.ent_name));
}
if (gpp->gpp_parms & G_PART_PARM_LABEL)
g_gpt_utf8_to_utf16(gpp->gpp_label, entry->ent.ent_name,
sizeof(entry->ent.ent_name) /
sizeof(entry->ent.ent_name[0]));
return (0);
}
static int
g_part_gpt_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp)
{
struct g_part_gpt_table *table;
size_t codesz;
codesz = DOSPARTOFF;
table = (struct g_part_gpt_table *)basetable;
bzero(table->mbr, codesz);
codesz = MIN(codesz, gpp->gpp_codesize);
if (codesz > 0)
bcopy(gpp->gpp_codeptr, table->mbr, codesz);
return (0);
}
static int
g_part_gpt_create(struct g_part_table *basetable, struct g_part_parms *gpp)
{
struct g_provider *pp;
struct g_part_gpt_table *table;
size_t tblsz;
/* Our depth should be 0 unless nesting was explicitly enabled. */
if (!allow_nesting && basetable->gpt_depth != 0)
return (ENXIO);
table = (struct g_part_gpt_table *)basetable;
pp = gpp->gpp_provider;
tblsz = howmany(basetable->gpt_entries * sizeof(struct gpt_ent),
pp->sectorsize);
if (pp->sectorsize < MBRSIZE ||
pp->mediasize < (3 + 2 * tblsz + basetable->gpt_entries) *
pp->sectorsize)
return (ENOSPC);
gpt_create_pmbr(table, pp);
/* Allocate space for the header */
table->hdr = g_malloc(sizeof(struct gpt_hdr), M_WAITOK | M_ZERO);
bcopy(GPT_HDR_SIG, table->hdr->hdr_sig, sizeof(table->hdr->hdr_sig));
table->hdr->hdr_revision = GPT_HDR_REVISION;
table->hdr->hdr_size = offsetof(struct gpt_hdr, padding);
kern_uuidgen(&table->hdr->hdr_uuid, 1);
table->hdr->hdr_entries = basetable->gpt_entries;
table->hdr->hdr_entsz = sizeof(struct gpt_ent);
g_gpt_set_defaults(basetable, pp);
return (0);
}
static int
g_part_gpt_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
{
struct g_part_gpt_table *table;
struct g_provider *pp;
table = (struct g_part_gpt_table *)basetable;
pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
g_free(table->hdr);
table->hdr = NULL;
/*
* Wipe the first 2 sectors and last one to clear the partitioning.
* Wipe sectors only if they have valid metadata.
*/
if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK)
basetable->gpt_smhead |= 3;
if (table->state[GPT_ELT_SECHDR] == GPT_STATE_OK &&
table->lba[GPT_ELT_SECHDR] == pp->mediasize / pp->sectorsize - 1)
basetable->gpt_smtail |= 1;
return (0);
}
static void
g_part_gpt_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry,
struct sbuf *sb, const char *indent)
{
struct g_part_gpt_entry *entry;
entry = (struct g_part_gpt_entry *)baseentry;
if (indent == NULL) {
/* conftxt: libdisk compatibility */
sbuf_cat(sb, " xs GPT xt ");
sbuf_printf_uuid(sb, &entry->ent.ent_type);
} else if (entry != NULL) {
/* confxml: partition entry information */
sbuf_printf(sb, "%s<label>", indent);
g_gpt_printf_utf16(sb, entry->ent.ent_name,
sizeof(entry->ent.ent_name) >> 1);
sbuf_cat(sb, "</label>\n");
if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTME)
sbuf_printf(sb, "%s<attrib>bootme</attrib>\n", indent);
if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTONCE) {
sbuf_printf(sb, "%s<attrib>bootonce</attrib>\n",
indent);
}
if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTFAILED) {
sbuf_printf(sb, "%s<attrib>bootfailed</attrib>\n",
indent);
}
sbuf_printf(sb, "%s<rawtype>", indent);
sbuf_printf_uuid(sb, &entry->ent.ent_type);
sbuf_cat(sb, "</rawtype>\n");
sbuf_printf(sb, "%s<rawuuid>", indent);
sbuf_printf_uuid(sb, &entry->ent.ent_uuid);
sbuf_cat(sb, "</rawuuid>\n");
sbuf_printf(sb, "%s<efimedia>", indent);
sbuf_printf(sb, "HD(%d,GPT,", entry->base.gpe_index);
sbuf_printf_uuid(sb, &entry->ent.ent_uuid);
sbuf_printf(sb, ",%#jx,%#jx)", (intmax_t)entry->base.gpe_start,
(intmax_t)(entry->base.gpe_end - entry->base.gpe_start + 1));
sbuf_cat(sb, "</efimedia>\n");
} else {
/* confxml: scheme information */
}
}
static int
g_part_gpt_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
{
struct g_part_gpt_entry *entry;
entry = (struct g_part_gpt_entry *)baseentry;
return ((EQUUID(&entry->ent.ent_type, &gpt_uuid_freebsd_swap) ||
EQUUID(&entry->ent.ent_type, &gpt_uuid_linux_swap) ||
EQUUID(&entry->ent.ent_type, &gpt_uuid_dfbsd_swap)) ? 1 : 0);
}
static int
g_part_gpt_modify(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
struct g_part_gpt_entry *entry;
int error;
entry = (struct g_part_gpt_entry *)baseentry;
if (gpp->gpp_parms & G_PART_PARM_TYPE) {
error = gpt_parse_type(gpp->gpp_type, &entry->ent.ent_type);
if (error)
return (error);
}
if (gpp->gpp_parms & G_PART_PARM_LABEL)
g_gpt_utf8_to_utf16(gpp->gpp_label, entry->ent.ent_name,
sizeof(entry->ent.ent_name) /
sizeof(entry->ent.ent_name[0]));
return (0);
}
static int
g_part_gpt_resize(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
struct g_part_gpt_entry *entry;
if (baseentry == NULL)
return (g_part_gpt_recover(basetable));
entry = (struct g_part_gpt_entry *)baseentry;
baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1;
entry->ent.ent_lba_end = baseentry->gpe_end;
return (0);
}
static const char *
g_part_gpt_name(struct g_part_table *table, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
struct g_part_gpt_entry *entry;
char c;
entry = (struct g_part_gpt_entry *)baseentry;
c = (EQUUID(&entry->ent.ent_type, &gpt_uuid_freebsd)) ? 's' : 'p';
snprintf(buf, bufsz, "%c%d", c, baseentry->gpe_index);
return (buf);
}
static int
g_part_gpt_probe(struct g_part_table *table, struct g_consumer *cp)
{
struct g_provider *pp;
u_char *buf;
int error, index, pri, res;
/* Our depth should be 0 unless nesting was explicitly enabled. */
if (!allow_nesting && table->gpt_depth != 0)
return (ENXIO);
pp = cp->provider;
/*
* Sanity-check the provider. Since the first sector on the provider
* must be a PMBR and a PMBR is 512 bytes large, the sector size
* must be at least 512 bytes. Also, since the theoretical minimum
* number of sectors needed by GPT is 6, any medium that has fewer
* than 6 sectors is never going to be able to hold a GPT. The
* number 6 comes from:
* 1 sector for the PMBR
* 2 sectors for the GPT headers (each 1 sector)
* 2 sectors for the GPT tables (each 1 sector)
* 1 sector for an actual partition
* It's better to catch this pathological case early than to behave
* pathologically later on...
*/
if (pp->sectorsize < MBRSIZE || pp->mediasize < 6 * pp->sectorsize)
return (ENOSPC);
/*
* Check that there's an MBR or a PMBR. If it's a PMBR, we return
* the highest priority on a match; otherwise we assume some
* GPT-unaware tool has destroyed the GPT by recreating an MBR and
* we really want the MBR scheme to take precedence.
*/
buf = g_read_data(cp, 0L, pp->sectorsize, &error);
if (buf == NULL)
return (error);
res = le16dec(buf + DOSMAGICOFFSET);
pri = G_PART_PROBE_PRI_LOW;
if (res == DOSMAGIC) {
for (index = 0; index < NDOSPART; index++) {
if (buf[DOSPARTOFF + DOSPARTSIZE * index + 4] == 0xee)
pri = G_PART_PROBE_PRI_HIGH;
}
g_free(buf);
/* Check that there's a primary header. */
buf = g_read_data(cp, pp->sectorsize, pp->sectorsize, &error);
if (buf == NULL)
return (error);
res = memcmp(buf, GPT_HDR_SIG, 8);
g_free(buf);
if (res == 0)
return (pri);
} else
g_free(buf);
/* No primary? Check that there's a secondary. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
if (buf == NULL)
return (error);
res = memcmp(buf, GPT_HDR_SIG, 8);
g_free(buf);
return ((res == 0) ? pri : ENXIO);
}
static int
g_part_gpt_read(struct g_part_table *basetable, struct g_consumer *cp)
{
struct gpt_hdr *prihdr, *sechdr;
struct gpt_ent *tbl, *pritbl, *sectbl;
struct g_provider *pp;
struct g_part_gpt_table *table;
struct g_part_gpt_entry *entry;
u_char *buf;
uint64_t last;
int error, index;
table = (struct g_part_gpt_table *)basetable;
pp = cp->provider;
last = (pp->mediasize / pp->sectorsize) - 1;
/* Read the PMBR */
buf = g_read_data(cp, 0, pp->sectorsize, &error);
if (buf == NULL)
return (error);
bcopy(buf, table->mbr, MBRSIZE);
g_free(buf);
/* Read the primary header and table. */
prihdr = gpt_read_hdr(table, cp, GPT_ELT_PRIHDR);
if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK) {
pritbl = gpt_read_tbl(table, cp, GPT_ELT_PRITBL, prihdr);
} else {
table->state[GPT_ELT_PRITBL] = GPT_STATE_MISSING;
pritbl = NULL;
}
/* Read the secondary header and table. */
sechdr = gpt_read_hdr(table, cp, GPT_ELT_SECHDR);
if (table->state[GPT_ELT_SECHDR] == GPT_STATE_OK) {
sectbl = gpt_read_tbl(table, cp, GPT_ELT_SECTBL, sechdr);
} else {
table->state[GPT_ELT_SECTBL] = GPT_STATE_MISSING;
sectbl = NULL;
}
/* Fail if we haven't got any good tables at all. */
if (table->state[GPT_ELT_PRITBL] != GPT_STATE_OK &&
table->state[GPT_ELT_SECTBL] != GPT_STATE_OK) {
printf("GEOM: %s: corrupt or invalid GPT detected.\n",
pp->name);
printf("GEOM: %s: GPT rejected -- may not be recoverable.\n",
pp->name);
if (prihdr != NULL)
g_free(prihdr);
if (pritbl != NULL)
g_free(pritbl);
if (sechdr != NULL)
g_free(sechdr);
if (sectbl != NULL)
g_free(sectbl);
return (EINVAL);
}
/*
* If both headers are good but they disagree with each other,
* then invalidate one. We prefer to keep the primary header,
* unless the primary table is corrupt.
*/
if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK &&
table->state[GPT_ELT_SECHDR] == GPT_STATE_OK &&
!gpt_matched_hdrs(prihdr, sechdr)) {
if (table->state[GPT_ELT_PRITBL] == GPT_STATE_OK) {
table->state[GPT_ELT_SECHDR] = GPT_STATE_INVALID;
table->state[GPT_ELT_SECTBL] = GPT_STATE_MISSING;
g_free(sechdr);
sechdr = NULL;
} else {
table->state[GPT_ELT_PRIHDR] = GPT_STATE_INVALID;
table->state[GPT_ELT_PRITBL] = GPT_STATE_MISSING;
g_free(prihdr);
prihdr = NULL;
}
}
if (table->state[GPT_ELT_PRITBL] != GPT_STATE_OK) {
printf("GEOM: %s: the primary GPT table is corrupt or "
"invalid.\n", pp->name);
printf("GEOM: %s: using the secondary instead -- recovery "
"strongly advised.\n", pp->name);
table->hdr = sechdr;
basetable->gpt_corrupt = 1;
if (prihdr != NULL)
g_free(prihdr);
tbl = sectbl;
if (pritbl != NULL)
g_free(pritbl);
} else {
if (table->state[GPT_ELT_SECTBL] != GPT_STATE_OK) {
printf("GEOM: %s: the secondary GPT table is corrupt "
"or invalid.\n", pp->name);
printf("GEOM: %s: using the primary only -- recovery "
"suggested.\n", pp->name);
basetable->gpt_corrupt = 1;
} else if (table->lba[GPT_ELT_SECHDR] != last) {
printf( "GEOM: %s: the secondary GPT header is not in "
"the last LBA.\n", pp->name);
basetable->gpt_corrupt = 1;
}
table->hdr = prihdr;
if (sechdr != NULL)
g_free(sechdr);
tbl = pritbl;
if (sectbl != NULL)
g_free(sectbl);
}
basetable->gpt_first = table->hdr->hdr_lba_start;
basetable->gpt_last = table->hdr->hdr_lba_end;
basetable->gpt_entries = table->hdr->hdr_entries;
for (index = basetable->gpt_entries - 1; index >= 0; index--) {
if (EQUUID(&tbl[index].ent_type, &gpt_uuid_unused))
continue;
entry = (struct g_part_gpt_entry *)g_part_new_entry(
basetable, index + 1, tbl[index].ent_lba_start,
tbl[index].ent_lba_end);
entry->ent = tbl[index];
}
g_free(tbl);
/*
* Under Mac OS X, the MBR mirrors the first 4 GPT partitions
* if (and only if) any FAT32 or FAT16 partitions have been
* created. This happens irrespective of whether Boot Camp is
* used/enabled, though it's generally understood to be done
* to support legacy Windows under Boot Camp. We refer to this
* mirroring simply as Boot Camp. We try to detect Boot Camp
* so that we can update the MBR if and when GPT changes have
* been made. Note that we do not enable Boot Camp if not
* previously enabled because we can't assume that we're on a
* Mac alongside Mac OS X.
*/
table->bootcamp = gpt_is_bootcamp(table, pp->name);
return (0);
}
static int
g_part_gpt_recover(struct g_part_table *basetable)
{
struct g_part_gpt_table *table;
struct g_provider *pp;
table = (struct g_part_gpt_table *)basetable;
pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
gpt_create_pmbr(table, pp);
g_gpt_set_defaults(basetable, pp);
basetable->gpt_corrupt = 0;
return (0);
}
static int
g_part_gpt_setunset(struct g_part_table *basetable,
struct g_part_entry *baseentry, const char *attrib, unsigned int set)
{
struct g_part_gpt_entry *entry;
struct g_part_gpt_table *table;
struct g_provider *pp;
uint8_t *p;
uint64_t attr;
int i;
table = (struct g_part_gpt_table *)basetable;
entry = (struct g_part_gpt_entry *)baseentry;
if (strcasecmp(attrib, "active") == 0) {
if (table->bootcamp) {
/* The active flag must be set on a valid entry. */
if (entry == NULL)
return (ENXIO);
if (baseentry->gpe_index > NDOSPART)
return (EINVAL);
for (i = 0; i < NDOSPART; i++) {
p = &table->mbr[DOSPARTOFF + i * DOSPARTSIZE];
p[0] = (i == baseentry->gpe_index - 1)
? ((set) ? 0x80 : 0) : 0;
}
} else {
/* The PMBR is marked as active without an entry. */
if (entry != NULL)
return (ENXIO);
for (i = 0; i < NDOSPART; i++) {
p = &table->mbr[DOSPARTOFF + i * DOSPARTSIZE];
p[0] = (p[4] == 0xee) ? ((set) ? 0x80 : 0) : 0;
}
}
return (0);
} else if (strcasecmp(attrib, "lenovofix") == 0) {
/*
* Write the 0xee GPT entry to slot #1 (2nd slot) in the pMBR.
* This workaround allows the Lenovo X220, T420, T520, etc. to boot
* from GPT partitions in BIOS mode.
*/
if (entry != NULL)
return (ENXIO);
pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART);
gpt_write_mbr_entry(table->mbr, ((set) ? 1 : 0), 0xee, 1,
MIN(pp->mediasize / pp->sectorsize - 1, UINT32_MAX));
return (0);
}
if (entry == NULL)
return (ENODEV);
attr = 0;
if (strcasecmp(attrib, "bootme") == 0) {
attr |= GPT_ENT_ATTR_BOOTME;
} else if (strcasecmp(attrib, "bootonce") == 0) {
attr |= GPT_ENT_ATTR_BOOTONCE;
if (set)
attr |= GPT_ENT_ATTR_BOOTME;
} else if (strcasecmp(attrib, "bootfailed") == 0) {
/*
* It should only be possible to unset BOOTFAILED, but it might
* be useful for test purposes to also be able to set it.
*/
attr |= GPT_ENT_ATTR_BOOTFAILED;
}
if (attr == 0)
return (EINVAL);
if (set)
attr = entry->ent.ent_attr | attr;
else
attr = entry->ent.ent_attr & ~attr;
if (attr != entry->ent.ent_attr) {
entry->ent.ent_attr = attr;
if (!baseentry->gpe_created)
baseentry->gpe_modified = 1;
}
return (0);
}
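/*
* Editor's note (illustrative, hypothetical device name): from userland
* these attributes are normally toggled with gpart(8), e.g.
*   gpart set -a bootonce -i 1 ada0    (sets BOOTONCE and BOOTME above)
*   gpart unset -a bootfailed -i 1 ada0
*   gpart set -a lenovofix ada0        (rewrites the PMBR as above)
*/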
static const char *
g_part_gpt_type(struct g_part_table *basetable, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
struct g_part_gpt_entry *entry;
struct uuid *type;
struct g_part_uuid_alias *uap;
entry = (struct g_part_gpt_entry *)baseentry;
type = &entry->ent.ent_type;
for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++)
if (EQUUID(type, uap->uuid))
return (g_part_alias_name(uap->alias));
buf[0] = '!';
snprintf_uuid(buf + 1, bufsz - 1, type);
return (buf);
}
static int
g_part_gpt_write(struct g_part_table *basetable, struct g_consumer *cp)
{
unsigned char *buf, *bp;
struct g_provider *pp;
struct g_part_entry *baseentry;
struct g_part_gpt_entry *entry;
struct g_part_gpt_table *table;
size_t tblsz;
uint32_t crc;
int error, index;
pp = cp->provider;
table = (struct g_part_gpt_table *)basetable;
tblsz = howmany(table->hdr->hdr_entries * table->hdr->hdr_entsz,
pp->sectorsize);
/* Reconstruct the MBR from the GPT if under Boot Camp. */
if (table->bootcamp)
gpt_update_bootcamp(basetable, pp);
/* Write the PMBR */
buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
bcopy(table->mbr, buf, MBRSIZE);
error = g_write_data(cp, 0, buf, pp->sectorsize);
g_free(buf);
if (error)
return (error);
/* Allocate space for the header and entries. */
buf = g_malloc((tblsz + 1) * pp->sectorsize, M_WAITOK | M_ZERO);
memcpy(buf, table->hdr->hdr_sig, sizeof(table->hdr->hdr_sig));
le32enc(buf + 8, table->hdr->hdr_revision);
le32enc(buf + 12, table->hdr->hdr_size);
le64enc(buf + 40, table->hdr->hdr_lba_start);
le64enc(buf + 48, table->hdr->hdr_lba_end);
le_uuid_enc(buf + 56, &table->hdr->hdr_uuid);
le32enc(buf + 80, table->hdr->hdr_entries);
le32enc(buf + 84, table->hdr->hdr_entsz);
LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) {
if (baseentry->gpe_deleted)
continue;
entry = (struct g_part_gpt_entry *)baseentry;
index = baseentry->gpe_index - 1;
bp = buf + pp->sectorsize + table->hdr->hdr_entsz * index;
le_uuid_enc(bp, &entry->ent.ent_type);
le_uuid_enc(bp + 16, &entry->ent.ent_uuid);
le64enc(bp + 32, entry->ent.ent_lba_start);
le64enc(bp + 40, entry->ent.ent_lba_end);
le64enc(bp + 48, entry->ent.ent_attr);
memcpy(bp + 56, entry->ent.ent_name,
sizeof(entry->ent.ent_name));
}
crc = crc32(buf + pp->sectorsize,
table->hdr->hdr_entries * table->hdr->hdr_entsz);
le32enc(buf + 88, crc);
/* Write primary meta-data. */
le32enc(buf + 16, 0); /* hdr_crc_self. */
le64enc(buf + 24, table->lba[GPT_ELT_PRIHDR]); /* hdr_lba_self. */
le64enc(buf + 32, table->lba[GPT_ELT_SECHDR]); /* hdr_lba_alt. */
le64enc(buf + 72, table->lba[GPT_ELT_PRITBL]); /* hdr_lba_table. */
crc = crc32(buf, table->hdr->hdr_size);
le32enc(buf + 16, crc);
- for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) {
+ for (index = 0; index < tblsz; index += maxphys / pp->sectorsize) {
error = g_write_data(cp,
(table->lba[GPT_ELT_PRITBL] + index) * pp->sectorsize,
buf + (index + 1) * pp->sectorsize,
- (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS:
+ (tblsz - index > maxphys / pp->sectorsize) ? maxphys :
(tblsz - index) * pp->sectorsize);
if (error)
goto out;
}
error = g_write_data(cp, table->lba[GPT_ELT_PRIHDR] * pp->sectorsize,
buf, pp->sectorsize);
if (error)
goto out;
/* Write secondary meta-data. */
le32enc(buf + 16, 0); /* hdr_crc_self. */
le64enc(buf + 24, table->lba[GPT_ELT_SECHDR]); /* hdr_lba_self. */
le64enc(buf + 32, table->lba[GPT_ELT_PRIHDR]); /* hdr_lba_alt. */
le64enc(buf + 72, table->lba[GPT_ELT_SECTBL]); /* hdr_lba_table. */
crc = crc32(buf, table->hdr->hdr_size);
le32enc(buf + 16, crc);
- for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) {
+ for (index = 0; index < tblsz; index += maxphys / pp->sectorsize) {
error = g_write_data(cp,
(table->lba[GPT_ELT_SECTBL] + index) * pp->sectorsize,
buf + (index + 1) * pp->sectorsize,
- (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS:
+ (tblsz - index > maxphys / pp->sectorsize) ? maxphys :
(tblsz - index) * pp->sectorsize);
if (error)
goto out;
}
error = g_write_data(cp, table->lba[GPT_ELT_SECHDR] * pp->sectorsize,
buf, pp->sectorsize);
out:
g_free(buf);
return (error);
}
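/*
* Editor's note: the entry tables above are written in chunks of at
* most maxphys bytes per g_write_data() call. Illustratively, with
* 512-byte sectors and a maxphys of 1 MByte that is 2048 sectors per
* chunk, so the standard 32-sector table goes out in a single write;
* maxphys is a tunable, so the figures may differ.
*/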
static void
g_gpt_set_defaults(struct g_part_table *basetable, struct g_provider *pp)
{
struct g_part_entry *baseentry;
struct g_part_gpt_entry *entry;
struct g_part_gpt_table *table;
quad_t start, end, min, max;
quad_t lba, last;
size_t spb, tblsz;
table = (struct g_part_gpt_table *)basetable;
last = pp->mediasize / pp->sectorsize - 1;
tblsz = howmany(basetable->gpt_entries * sizeof(struct gpt_ent),
pp->sectorsize);
table->lba[GPT_ELT_PRIHDR] = 1;
table->lba[GPT_ELT_PRITBL] = 2;
table->lba[GPT_ELT_SECHDR] = last;
table->lba[GPT_ELT_SECTBL] = last - tblsz;
table->state[GPT_ELT_PRIHDR] = GPT_STATE_OK;
table->state[GPT_ELT_PRITBL] = GPT_STATE_OK;
table->state[GPT_ELT_SECHDR] = GPT_STATE_OK;
table->state[GPT_ELT_SECTBL] = GPT_STATE_OK;
max = start = 2 + tblsz;
min = end = last - tblsz - 1;
LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) {
if (baseentry->gpe_deleted)
continue;
entry = (struct g_part_gpt_entry *)baseentry;
if (entry->ent.ent_lba_start < min)
min = entry->ent.ent_lba_start;
if (entry->ent.ent_lba_end > max)
max = entry->ent.ent_lba_end;
}
spb = 4096 / pp->sectorsize;
if (spb > 1) {
lba = start + ((start % spb) ? spb - start % spb : 0);
if (lba <= min)
start = lba;
lba = end - (end + 1) % spb;
if (max <= lba)
end = lba;
}
table->hdr->hdr_lba_start = start;
table->hdr->hdr_lba_end = end;
basetable->gpt_first = start;
basetable->gpt_last = end;
}
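/*
* Editor's note (worked example): with 512-byte sectors (spb = 8),
* 128 entries (tblsz = 32) and an empty table on a 2097152-sector
* provider (last = 2097151), start = 2 + 32 = 34 is rounded up to 40
* and end = last - 33 = 2097118 is rounded down to 2097111, so that
* both the first usable LBA and end + 1 fall on 4KB boundaries.
*/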
static void
g_gpt_printf_utf16(struct sbuf *sb, uint16_t *str, size_t len)
{
u_int bo;
uint32_t ch;
uint16_t c;
bo = LITTLE_ENDIAN; /* GPT is little-endian */
while (len > 0 && *str != 0) {
ch = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str);
str++, len--;
if ((ch & 0xf800) == 0xd800) {
if (len > 0) {
c = (bo == BIG_ENDIAN) ? be16toh(*str)
: le16toh(*str);
str++, len--;
} else
c = 0xfffd;
if ((ch & 0x400) == 0 && (c & 0xfc00) == 0xdc00) {
ch = ((ch & 0x3ff) << 10) + (c & 0x3ff);
ch += 0x10000;
} else
ch = 0xfffd;
} else if (ch == 0xfffe) { /* BOM (U+FEFF) swapped. */
bo = (bo == BIG_ENDIAN) ? LITTLE_ENDIAN : BIG_ENDIAN;
continue;
} else if (ch == 0xfeff) /* BOM (U+FEFF) unswapped. */
continue;
/* Write the Unicode character in UTF-8 */
if (ch < 0x80)
g_conf_printf_escaped(sb, "%c", ch);
else if (ch < 0x800)
g_conf_printf_escaped(sb, "%c%c", 0xc0 | (ch >> 6),
0x80 | (ch & 0x3f));
else if (ch < 0x10000)
g_conf_printf_escaped(sb, "%c%c%c", 0xe0 | (ch >> 12),
0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f));
else if (ch < 0x200000)
g_conf_printf_escaped(sb, "%c%c%c%c", 0xf0 |
(ch >> 18), 0x80 | ((ch >> 12) & 0x3f),
0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f));
}
}
static void
g_gpt_utf8_to_utf16(const uint8_t *s8, uint16_t *s16, size_t s16len)
{
size_t s16idx, s8idx;
uint32_t utfchar;
unsigned int c, utfbytes;
s8idx = s16idx = 0;
utfchar = 0;
utfbytes = 0;
bzero(s16, s16len << 1);
while (s8[s8idx] != 0 && s16idx < s16len) {
c = s8[s8idx++];
if ((c & 0xc0) != 0x80) {
/* Initial characters. */
if (utfbytes != 0) {
/* Incomplete encoding of previous char. */
s16[s16idx++] = htole16(0xfffd);
}
if ((c & 0xf8) == 0xf0) {
utfchar = c & 0x07;
utfbytes = 3;
} else if ((c & 0xf0) == 0xe0) {
utfchar = c & 0x0f;
utfbytes = 2;
} else if ((c & 0xe0) == 0xc0) {
utfchar = c & 0x1f;
utfbytes = 1;
} else {
utfchar = c & 0x7f;
utfbytes = 0;
}
} else {
/* Followup characters. */
if (utfbytes > 0) {
utfchar = (utfchar << 6) + (c & 0x3f);
utfbytes--;
} else if (utfbytes == 0)
utfbytes = ~0;
}
/*
* Write the complete Unicode character as UTF-16 when we
* have all the UTF-8 characters collected.
*/
if (utfbytes == 0) {
/*
* If we need to write 2 UTF-16 characters, but
* we only have room for 1, then we truncate the
* string by writing a 0 instead.
*/
if (utfchar >= 0x10000 && s16idx < s16len - 1) {
s16[s16idx++] =
htole16(0xd800 | ((utfchar >> 10) - 0x40));
s16[s16idx++] =
htole16(0xdc00 | (utfchar & 0x3ff));
} else
s16[s16idx++] = (utfchar >= 0x10000) ? 0 :
htole16(utfchar);
}
}
/*
* If our input string was truncated, append an invalid encoding
* character to the output string.
*/
if (utfbytes != 0 && s16idx < s16len)
s16[s16idx++] = htole16(0xfffd);
}
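/*
* Editor's note (worked example): U+1F600 arrives as the UTF-8 bytes
* f0 9f 98 80 and is collected into utfchar = 0x1f600; being >= 0x10000
* it is emitted as the surrogate pair
* 0xd800 | ((0x1f600 >> 10) - 0x40) = 0xd83d and
* 0xdc00 | (0x1f600 & 0x3ff) = 0xde00.
*/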
diff --git a/sys/geom/part/g_part_ldm.c b/sys/geom/part/g_part_ldm.c
index 6debf39d04f7..e53578f57fee 100644
--- a/sys/geom/part/g_part_ldm.c
+++ b/sys/geom/part/g_part_ldm.c
@@ -1,1486 +1,1486 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Andrey V. Elsukov <ae@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/diskmbr.h>
#include <sys/endian.h>
#include <sys/gpt.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/uuid.h>
#include <geom/geom.h>
#include <geom/part/g_part.h>
#include "g_part_if.h"
FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support");
SYSCTL_DECL(_kern_geom_part);
static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_PART_LDM Logical Disk Manager");
static u_int ldm_debug = 0;
SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug,
CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level");
/*
* This allows access to mirrored LDM volumes. Since we do not
* do mirroring here, it is not enabled by default.
*/
static u_int show_mirrors = 0;
SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors,
CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes");
#define LDM_DEBUG(lvl, fmt, ...) do { \
if (ldm_debug >= (lvl)) { \
printf("GEOM_PART: " fmt "\n", __VA_ARGS__); \
} \
} while (0)
#define LDM_DUMP(buf, size) do { \
if (ldm_debug > 1) { \
hexdump(buf, size, NULL, 0); \
} \
} while (0)
/*
* These are the internal representations of the LDM structures.
*
* We do not keep all fields of the on-disk structures, only the most
* useful ones. All numbers in the on-disk structures are in big-endian
* format.
*/
/*
* The private header is 512 bytes long. There are three copies on each
* disk. Offsets and sizes are in sectors. Location of each copy:
* - the first offset is relative to the disk start;
* - the second and third offsets are relative to the LDM database start.
*
* On a disk partitioned with GPT, the LDM has no first private header.
*/
#define LDM_PH_MBRINDEX 0
#define LDM_PH_GPTINDEX 2
static const uint64_t ldm_ph_off[] = {6, 1856, 2047};
#define LDM_VERSION_2K 0x2000b
#define LDM_VERSION_VISTA 0x2000c
#define LDM_PH_VERSION_OFF 0x00c
#define LDM_PH_DISKGUID_OFF 0x030
#define LDM_PH_DGGUID_OFF 0x0b0
#define LDM_PH_DGNAME_OFF 0x0f0
#define LDM_PH_START_OFF 0x11b
#define LDM_PH_SIZE_OFF 0x123
#define LDM_PH_DB_OFF 0x12b
#define LDM_PH_DBSIZE_OFF 0x133
#define LDM_PH_TH1_OFF 0x13b
#define LDM_PH_TH2_OFF 0x143
#define LDM_PH_CONFSIZE_OFF 0x153
#define LDM_PH_LOGSIZE_OFF 0x15b
#define LDM_PH_SIGN "PRIVHEAD"
struct ldm_privhdr {
struct uuid disk_guid;
struct uuid dg_guid;
u_char dg_name[32];
uint64_t start; /* logical disk start */
uint64_t size; /* logical disk size */
uint64_t db_offset; /* LDM database start */
#define LDM_DB_SIZE 2048
uint64_t db_size; /* LDM database size */
#define LDM_TH_COUNT 2
uint64_t th_offset[LDM_TH_COUNT]; /* TOC header offsets */
uint64_t conf_size; /* configuration size */
uint64_t log_size; /* size of log */
};
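/*
* Editor's note (illustrative, MBR layout, 512-byte sectors): the first
* private header copy sits at absolute LBA 6, while the second and
* third copies sit at db_offset + 1856 and db_offset + 2047, inside the
* 2048-sector (1MByte) LDM database area at the end of the disk.
*/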
/*
* The table of contents header is 512 bytes long.
* There are two identical copies at the offsets given in the private
* header. These offsets are relative to the LDM database start.
*/
#define LDM_TH_SIGN "TOCBLOCK"
#define LDM_TH_NAME1 "config"
#define LDM_TH_NAME2 "log"
#define LDM_TH_NAME1_OFF 0x024
#define LDM_TH_CONF_OFF 0x02e
#define LDM_TH_CONFSIZE_OFF 0x036
#define LDM_TH_NAME2_OFF 0x046
#define LDM_TH_LOG_OFF 0x050
#define LDM_TH_LOGSIZE_OFF 0x058
struct ldm_tochdr {
uint64_t conf_offset; /* configuration offset */
uint64_t log_offset; /* log offset */
};
/*
* LDM database header is 512 bytes long.
*/
#define LDM_VMDB_SIGN "VMDB"
#define LDM_DB_LASTSEQ_OFF 0x004
#define LDM_DB_SIZE_OFF 0x008
#define LDM_DB_STATUS_OFF 0x010
#define LDM_DB_VERSION_OFF 0x012
#define LDM_DB_DGNAME_OFF 0x016
#define LDM_DB_DGGUID_OFF 0x035
struct ldm_vmdbhdr {
uint32_t last_seq; /* sequence number of last VBLK */
uint32_t size; /* size of VBLK */
};
/*
* The LDM database configuration section contains the VMDB header and
* many VBLKs. Each VBLK represents a disk group, disk, partition,
* component or volume.
*
* The most interesting for us are volumes; they represent partitions
* in the GEOM_PART sense. But a volume VBLK does not contain all the
* information needed to create a GEOM provider, so we have to gather
* it from the related VBLKs. This is how the VBLKs are related:
* Volumes <- Components <- Partitions -> Disks
*
* One volume can contain several components. In this case LDM
* mirrors the volume data to each component.
*
* Also, each component can contain several partitions (spanned or
* striped volumes).
*/
struct ldm_component {
uint64_t id; /* object id */
uint64_t vol_id; /* parent volume object id */
int count;
LIST_HEAD(, ldm_partition) partitions;
LIST_ENTRY(ldm_component) entry;
};
struct ldm_volume {
uint64_t id; /* object id */
uint64_t size; /* volume size */
uint8_t number; /* used for ordering */
uint8_t part_type; /* partition type */
int count;
LIST_HEAD(, ldm_component) components;
LIST_ENTRY(ldm_volume) entry;
};
struct ldm_disk {
uint64_t id; /* object id */
struct uuid guid; /* disk guid */
LIST_ENTRY(ldm_disk) entry;
};
#if 0
struct ldm_disk_group {
uint64_t id; /* object id */
struct uuid guid; /* disk group guid */
u_char name[32]; /* disk group name */
LIST_ENTRY(ldm_disk_group) entry;
};
#endif
struct ldm_partition {
uint64_t id; /* object id */
uint64_t disk_id; /* disk object id */
uint64_t comp_id; /* parent component object id */
uint64_t start; /* offset relative to disk start */
uint64_t offset; /* offset for spanned volumes */
uint64_t size; /* partition size */
LIST_ENTRY(ldm_partition) entry;
};
/*
* Each VBLK is 128 bytes long and has a standard 16-byte header.
* Some VBLK fields have a fixed size, but others have a variable size.
* Variable-size fields are prefixed with a one-byte length marker.
* Some fields are strings and can likewise be of fixed or variable size.
* Fixed-size strings are NULL-terminated, the others are not.
* All VBLKs share the same first several fields:
* Offset Size Description
* ---------------+---------------+--------------------------
* 0x00 16 standard VBLK header
* 0x10 2 update status
* 0x13 1 VBLK type
* 0x18 PS object id
* 0x18+ PN object name
*
* o Offset 0x18+ means '0x18 + length of all variable-width fields'
* o 'P' in size column means 'prefixed' (variable-width),
* 'S' - string, 'N' - number.
*/
#define LDM_VBLK_SIGN "VBLK"
#define LDM_VBLK_SEQ_OFF 0x04
#define LDM_VBLK_GROUP_OFF 0x08
#define LDM_VBLK_INDEX_OFF 0x0c
#define LDM_VBLK_COUNT_OFF 0x0e
#define LDM_VBLK_TYPE_OFF 0x13
#define LDM_VBLK_OID_OFF 0x18
struct ldm_vblkhdr {
uint32_t seq; /* sequence number */
uint32_t group; /* group number */
uint16_t index; /* index in the group */
uint16_t count; /* number of entries in the group */
};
#define LDM_VBLK_T_COMPONENT 0x32
#define LDM_VBLK_T_PARTITION 0x33
#define LDM_VBLK_T_DISK 0x34
#define LDM_VBLK_T_DISKGROUP 0x35
#define LDM_VBLK_T_DISK4 0x44
#define LDM_VBLK_T_DISKGROUP4 0x45
#define LDM_VBLK_T_VOLUME 0x51
struct ldm_vblk {
uint8_t type; /* VBLK type */
union {
uint64_t id;
struct ldm_volume vol;
struct ldm_component comp;
struct ldm_disk disk;
struct ldm_partition part;
#if 0
struct ldm_disk_group disk_group;
#endif
} u;
LIST_ENTRY(ldm_vblk) entry;
};
/*
* Some VBLKs contain a bit more data than can fit into 128 bytes. These
* VBLKs are called eXtended VBLKs. Before parsing, the data from these
* VBLKs must be placed into a contiguous memory buffer. An xVBLK can be
* recognized by the count field in the standard VBLK header (count > 1).
*/
struct ldm_xvblk {
uint32_t group; /* xVBLK group number */
uint32_t size; /* the total size of xVBLK */
uint8_t map; /* bitmask of currently saved VBLKs */
u_char *data; /* xVBLK data */
LIST_ENTRY(ldm_xvblk) entry;
};
/* The internal representation of LDM database. */
struct ldm_db {
struct ldm_privhdr ph; /* private header */
struct ldm_tochdr th; /* TOC header */
struct ldm_vmdbhdr dh; /* VMDB header */
LIST_HEAD(, ldm_volume) volumes;
LIST_HEAD(, ldm_disk) disks;
LIST_HEAD(, ldm_vblk) vblks;
LIST_HEAD(, ldm_xvblk) xvblks;
};
static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA;
struct g_part_ldm_table {
struct g_part_table base;
uint64_t db_offset;
int is_gpt;
};
struct g_part_ldm_entry {
struct g_part_entry base;
uint8_t type;
};
static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *);
static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *);
static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *);
static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *,
struct sbuf *, const char *);
static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *);
static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *,
struct g_part_parms *);
static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *);
static int g_part_ldm_read(struct g_part_table *, struct g_consumer *);
static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *,
char *, size_t);
static int g_part_ldm_write(struct g_part_table *, struct g_consumer *);
static kobj_method_t g_part_ldm_methods[] = {
KOBJMETHOD(g_part_add, g_part_ldm_add),
KOBJMETHOD(g_part_bootcode, g_part_ldm_bootcode),
KOBJMETHOD(g_part_create, g_part_ldm_create),
KOBJMETHOD(g_part_destroy, g_part_ldm_destroy),
KOBJMETHOD(g_part_dumpconf, g_part_ldm_dumpconf),
KOBJMETHOD(g_part_dumpto, g_part_ldm_dumpto),
KOBJMETHOD(g_part_modify, g_part_ldm_modify),
KOBJMETHOD(g_part_name, g_part_ldm_name),
KOBJMETHOD(g_part_probe, g_part_ldm_probe),
KOBJMETHOD(g_part_read, g_part_ldm_read),
KOBJMETHOD(g_part_type, g_part_ldm_type),
KOBJMETHOD(g_part_write, g_part_ldm_write),
{ 0, 0 }
};
static struct g_part_scheme g_part_ldm_scheme = {
"LDM",
g_part_ldm_methods,
sizeof(struct g_part_ldm_table),
.gps_entrysz = sizeof(struct g_part_ldm_entry)
};
G_PART_SCHEME_DECLARE(g_part_ldm);
MODULE_VERSION(geom_part_ldm, 0);
static struct g_part_ldm_alias {
u_char typ;
int alias;
} ldm_alias_match[] = {
{ DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD },
{ DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 },
{ DOSPTYP_FAT32LBA, G_PART_ALIAS_MS_FAT32LBA },
{ DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA },
{ DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM },
{ DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID },
{ DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP },
{ DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA },
{ DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS },
};
static u_char*
ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error)
{
struct g_provider *pp;
u_char *buf;
pp = cp->provider;
buf = g_read_data(cp, off, pp->sectorsize, error);
if (buf == NULL)
return (NULL);
if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) {
LDM_DEBUG(1, "%s: invalid LDM private header signature",
pp->name);
g_free(buf);
buf = NULL;
*error = EINVAL;
}
return (buf);
}
static int
ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr,
const u_char *buf)
{
uint32_t version;
int error;
memset(hdr, 0, sizeof(*hdr));
version = be32dec(buf + LDM_PH_VERSION_OFF);
if (version != LDM_VERSION_2K &&
version != LDM_VERSION_VISTA) {
LDM_DEBUG(0, "%s: unsupported LDM version %u.%u",
cp->provider->name, version >> 16,
version & 0xFFFF);
return (ENXIO);
}
error = parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid);
if (error != 0)
return (error);
error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid);
if (error != 0)
return (error);
strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name));
hdr->start = be64dec(buf + LDM_PH_START_OFF);
hdr->size = be64dec(buf + LDM_PH_SIZE_OFF);
hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF);
hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF);
hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF);
hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF);
hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF);
hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF);
return (0);
}
static int
ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt)
{
struct g_consumer *cp2;
struct g_provider *pp;
struct ldm_privhdr hdr;
uint64_t offset, last;
int error, found, i;
u_char *buf;
pp = cp->provider;
if (is_gpt) {
/*
* The last LBA is used in several checks below; for the
* GPT case it should be calculated relative to the whole
* disk.
*/
cp2 = LIST_FIRST(&pp->geom->consumer);
last =
cp2->provider->mediasize / cp2->provider->sectorsize - 1;
} else
last = pp->mediasize / pp->sectorsize - 1;
for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) {
offset = ldm_ph_off[i];
/*
* In the GPT case the consumer is attached to the LDM metadata
* partition and we don't need to add db_offset.
*/
if (!is_gpt)
offset += db->ph.db_offset;
if (i == LDM_PH_MBRINDEX) {
/*
* Prepare for errors and set up a new base offset
* to read the backup private headers. Assume that the LDM
* database is in the last 1MByte area.
*/
db->ph.db_offset = last - LDM_DB_SIZE;
}
buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error);
if (buf == NULL) {
LDM_DEBUG(1, "%s: failed to read private header "
"%d at LBA %ju", pp->name, i, (uintmax_t)offset);
continue;
}
error = ldm_privhdr_parse(cp, &hdr, buf);
if (error != 0) {
LDM_DEBUG(1, "%s: failed to parse private "
"header %d", pp->name, i);
LDM_DUMP(buf, pp->sectorsize);
g_free(buf);
continue;
}
g_free(buf);
if (hdr.start > last ||
hdr.start + hdr.size - 1 > last ||
(hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) ||
hdr.db_size != LDM_DB_SIZE ||
hdr.db_offset + LDM_DB_SIZE - 1 > last ||
hdr.th_offset[0] >= LDM_DB_SIZE ||
hdr.th_offset[1] >= LDM_DB_SIZE ||
hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) {
LDM_DEBUG(1, "%s: invalid values in the "
"private header %d", pp->name, i);
LDM_DEBUG(2, "%s: start: %jd, size: %jd, "
"db_offset: %jd, db_size: %jd, th_offset0: %jd, "
"th_offset1: %jd, conf_size: %jd, log_size: %jd, "
"last: %jd", pp->name, hdr.start, hdr.size,
hdr.db_offset, hdr.db_size, hdr.th_offset[0],
hdr.th_offset[1], hdr.conf_size, hdr.log_size,
last);
continue;
}
if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) {
LDM_DEBUG(0, "%s: private headers are not equal",
pp->name);
if (i > 1) {
/*
* We have different headers in the LDM.
* We can not trust this metadata.
*/
LDM_DEBUG(0, "%s: refuse LDM metadata",
pp->name);
return (EINVAL);
}
/*
* We have already read the primary private header
* and it differs from this backup one.
* Prefer the backup header and save it.
*/
found = 0;
}
if (found == 0)
memcpy(&db->ph, &hdr, sizeof(hdr));
found = 1;
}
if (found == 0) {
LDM_DEBUG(1, "%s: valid LDM private header not found",
pp->name);
return (ENXIO);
}
return (0);
}
static int
ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp)
{
struct g_part_table *gpt;
struct g_part_entry *e;
struct g_consumer *cp2;
int error;
cp2 = LIST_NEXT(cp, consumer);
g_topology_lock();
gpt = cp->provider->geom->softc;
error = 0;
LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) {
if (cp->provider == e->gpe_pp) {
/* ms-ldm-metadata partition */
if (e->gpe_start != db->ph.db_offset ||
e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1)
error++;
} else if (cp2->provider == e->gpe_pp) {
/* ms-ldm-data partition */
if (e->gpe_start != db->ph.start ||
e->gpe_end != db->ph.start + db->ph.size - 1)
error++;
}
if (error != 0) {
LDM_DEBUG(0, "%s: GPT partition %d boundaries "
"do not match with the LDM metadata",
e->gpe_pp->name, e->gpe_index);
error = ENXIO;
break;
}
}
g_topology_unlock();
return (error);
}
static int
ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp)
{
struct g_provider *pp;
struct ldm_tochdr hdr;
uint64_t offset, conf_size, log_size;
int error, found, i;
u_char *buf;
pp = cp->provider;
for (i = 0, found = 0; i < LDM_TH_COUNT; i++) {
offset = db->ph.db_offset + db->ph.th_offset[i];
buf = g_read_data(cp,
offset * pp->sectorsize, pp->sectorsize, &error);
if (buf == NULL) {
LDM_DEBUG(1, "%s: failed to read TOC header "
"at LBA %ju", pp->name, (uintmax_t)offset);
continue;
}
if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 ||
memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1,
strlen(LDM_TH_NAME1)) != 0 ||
memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2,
strlen(LDM_TH_NAME2)) != 0) {
LDM_DEBUG(1, "%s: failed to parse TOC header "
"at LBA %ju", pp->name, (uintmax_t)offset);
LDM_DUMP(buf, pp->sectorsize);
g_free(buf);
continue;
}
hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF);
hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF);
conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF);
log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF);
if (conf_size != db->ph.conf_size ||
hdr.conf_offset + conf_size >= LDM_DB_SIZE ||
log_size != db->ph.log_size ||
hdr.log_offset + log_size >= LDM_DB_SIZE) {
LDM_DEBUG(1, "%s: invalid values in the "
"TOC header at LBA %ju", pp->name,
(uintmax_t)offset);
LDM_DUMP(buf, pp->sectorsize);
g_free(buf);
continue;
}
g_free(buf);
if (found == 0)
memcpy(&db->th, &hdr, sizeof(hdr));
found = 1;
}
if (found == 0) {
LDM_DEBUG(0, "%s: valid LDM TOC header not found.",
pp->name);
return (ENXIO);
}
return (0);
}
static int
ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp)
{
struct g_provider *pp;
struct uuid dg_guid;
uint64_t offset;
uint32_t version;
int error;
u_char *buf;
pp = cp->provider;
offset = db->ph.db_offset + db->th.conf_offset;
buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize,
&error);
if (buf == NULL) {
LDM_DEBUG(0, "%s: failed to read VMDB header at "
"LBA %ju", pp->name, (uintmax_t)offset);
return (error);
}
if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) {
g_free(buf);
LDM_DEBUG(0, "%s: failed to parse VMDB header at "
"LBA %ju", pp->name, (uintmax_t)offset);
return (ENXIO);
}
/* Check version. */
version = be32dec(buf + LDM_DB_VERSION_OFF);
if (version != 0x4000A) {
g_free(buf);
LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u",
pp->name, version >> 16, version & 0xFFFF);
return (ENXIO);
}
/*
* Check VMDB update status:
* 1 - in a consistent state;
* 2 - in a creation phase;
* 3 - in a deletion phase;
*/
if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) {
g_free(buf);
LDM_DEBUG(0, "%s: VMDB is not in a consistent state",
pp->name);
return (ENXIO);
}
db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF);
db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF);
error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid);
/* Compare disk group name and guid from VMDB and private headers */
if (error != 0 || db->dh.size == 0 ||
pp->sectorsize % db->dh.size != 0 ||
strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 ||
memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 ||
db->dh.size * db->dh.last_seq >
db->ph.conf_size * pp->sectorsize) {
LDM_DEBUG(0, "%s: invalid values in the VMDB header",
pp->name);
LDM_DUMP(buf, pp->sectorsize);
g_free(buf);
return (EINVAL);
}
g_free(buf);
return (0);
}
static int
ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p)
{
struct ldm_xvblk *blk;
size_t size;
size = db->dh.size - 16;
LIST_FOREACH(blk, &db->xvblks, entry)
if (blk->group == vh->group)
break;
if (blk == NULL) {
blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO);
blk->group = vh->group;
blk->size = size * vh->count + 16;
blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO);
blk->map = 0xFF << vh->count;
LIST_INSERT_HEAD(&db->xvblks, blk, entry);
}
if ((blk->map & (1 << vh->index)) != 0) {
/* A block with this index has already been saved. */
return (EINVAL);
}
/* Copy the data block to the location corresponding to its index. */
memcpy(blk->data + size * vh->index + 16, p + 16, size);
blk->map |= 1 << vh->index;
return (0);
}
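/*
* Editor's note (worked example): for an xVBLK split across count = 2
* VBLKs, the map above starts as 0xff << 2 = 0xfc; receiving index 0
* and index 1 sets the two low bits, and the fully assembled block is
* later recognized by map == 0xff.
*/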
/* Read the variable-width numeric field and return new offset */
static int
ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range)
{
uint64_t num;
uint8_t len;
len = buf[offset++];
if (len > sizeof(uint64_t) || len + offset >= range)
return (-1);
for (num = 0; len > 0; len--)
num = (num << 8) | buf[offset++];
*result = num;
return (offset);
}
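/*
* Editor's note (worked example): the byte sequence 02 01 f4 is such a
* variable-width number: a one-byte length marker (2) followed by two
* big-endian value bytes, decoding to (0x01 << 8) | 0xf4 = 500, with
* the returned offset advanced by 3.
*/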
/* Read the variable-width string and return new offset */
static int
ldm_vstr_get(const u_char *buf, int offset, u_char *result,
size_t maxlen, size_t range)
{
uint8_t len;
len = buf[offset++];
if (len >= maxlen || len + offset >= range)
return (-1);
memcpy(result, buf + offset, len);
result[len] = '\0';
return (offset + len);
}
/* Just skip a variable-width field and return the new offset */
static int
ldm_vparm_skip(const u_char *buf, int offset, size_t range)
{
uint8_t len;
len = buf[offset++];
if (offset + len >= range)
return (-1);
return (offset + len);
}
static int
ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size)
{
struct ldm_vblk *blk;
struct ldm_volume *volume, *last;
const char *errstr;
u_char vstr[64];
int error, offset;
blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO);
blk->type = p[LDM_VBLK_TYPE_OFF];
offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size);
if (offset < 0) {
errstr = "object id";
goto fail;
}
offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
if (offset < 0) {
errstr = "object name";
goto fail;
}
switch (blk->type) {
/*
* Component VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ PS volume state
* 0x18+5 PN component children count
* 0x1D+16 PN parent's volume object id
* 0x2D+1 PN stripe size
*/
case LDM_VBLK_T_COMPONENT:
offset = ldm_vparm_skip(p, offset, size);
if (offset < 0) {
errstr = "volume state";
goto fail;
}
offset = ldm_vparm_skip(p, offset + 5, size);
if (offset < 0) {
errstr = "children count";
goto fail;
}
offset = ldm_vnum_get(p, offset + 16,
&blk->u.comp.vol_id, size);
if (offset < 0) {
errstr = "volume id";
goto fail;
}
break;
/*
* Partition VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+12 8 partition start offset
* 0x18+20 8 volume offset
* 0x18+28 PN partition size
* 0x34+ PN parent's component object id
* 0x34+ PN disk's object id
*/
case LDM_VBLK_T_PARTITION:
if (offset + 28 >= size) {
errstr = "too small buffer";
goto fail;
}
blk->u.part.start = be64dec(p + offset + 12);
blk->u.part.offset = be64dec(p + offset + 20);
offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size);
if (offset < 0) {
errstr = "partition size";
goto fail;
}
offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size);
if (offset < 0) {
errstr = "component id";
goto fail;
}
offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size);
if (offset < 0) {
errstr = "disk id";
goto fail;
}
break;
/*
* Disk VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ PS disk GUID
*/
case LDM_VBLK_T_DISK:
errstr = "disk guid";
offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
if (offset < 0)
goto fail;
error = parse_uuid(vstr, &blk->u.disk.guid);
if (error != 0)
goto fail;
LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry);
break;
/*
* Disk group VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ PS disk group GUID
*/
case LDM_VBLK_T_DISKGROUP:
#if 0
strncpy(blk->u.disk_group.name, vstr,
sizeof(blk->u.disk_group.name));
offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
if (offset < 0) {
errstr = "disk group guid";
goto fail;
}
error = parse_uuid(name, &blk->u.disk_group.guid);
if (error != 0) {
errstr = "disk group guid";
goto fail;
}
LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry);
#endif
break;
/*
* Disk VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ 16 disk GUID
*/
case LDM_VBLK_T_DISK4:
be_uuid_dec(p + offset, &blk->u.disk.guid);
LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry);
break;
/*
* Disk group VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ 16 disk GUID
*/
case LDM_VBLK_T_DISKGROUP4:
#if 0
strncpy(blk->u.disk_group.name, vstr,
sizeof(blk->u.disk_group.name));
be_uuid_dec(p + offset, &blk->u.disk.guid);
LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry);
#endif
break;
/*
* Volume VBLK fields:
* Offset Size Description
* ------------+-------+------------------------
* 0x18+ PS volume type
* 0x18+ PS unknown
* 0x18+ 14(S) volume state
* 0x18+16 1 volume number
* 0x18+21 PN volume children count
* 0x2D+16 PN volume size
* 0x3D+4 1 partition type
*/
case LDM_VBLK_T_VOLUME:
offset = ldm_vparm_skip(p, offset, size);
if (offset < 0) {
errstr = "volume type";
goto fail;
}
offset = ldm_vparm_skip(p, offset, size);
if (offset < 0) {
errstr = "unknown param";
goto fail;
}
if (offset + 21 >= size) {
errstr = "too small buffer";
goto fail;
}
blk->u.vol.number = p[offset + 16];
offset = ldm_vparm_skip(p, offset + 21, size);
if (offset < 0) {
errstr = "children count";
goto fail;
}
offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size);
if (offset < 0) {
errstr = "volume size";
goto fail;
}
if (offset + 4 >= size) {
errstr = "too small buffer";
goto fail;
}
blk->u.vol.part_type = p[offset + 4];
/* keep volumes ordered by volume number */
last = NULL;
LIST_FOREACH(volume, &db->volumes, entry) {
if (volume->number > blk->u.vol.number)
break;
last = volume;
}
if (last != NULL)
LIST_INSERT_AFTER(last, &blk->u.vol, entry);
else
LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry);
break;
default:
LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type);
LDM_DUMP(p, size);
}
LIST_INSERT_HEAD(&db->vblks, blk, entry);
return (0);
fail:
LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n",
errstr, blk->type);
LDM_DUMP(p, size);
g_free(blk);
return (EINVAL);
}
static void
ldm_vmdb_free(struct ldm_db *db)
{
struct ldm_vblk *vblk;
struct ldm_xvblk *xvblk;
while (!LIST_EMPTY(&db->xvblks)) {
xvblk = LIST_FIRST(&db->xvblks);
LIST_REMOVE(xvblk, entry);
g_free(xvblk->data);
g_free(xvblk);
}
while (!LIST_EMPTY(&db->vblks)) {
vblk = LIST_FIRST(&db->vblks);
LIST_REMOVE(vblk, entry);
g_free(vblk);
}
}
static int
ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp)
{
struct g_provider *pp;
struct ldm_vblk *vblk;
struct ldm_xvblk *xvblk;
struct ldm_volume *volume;
struct ldm_component *comp;
struct ldm_vblkhdr vh;
u_char *buf, *p;
size_t size, n, sectors;
uint64_t offset;
int error;
pp = cp->provider;
size = howmany(db->dh.last_seq * db->dh.size, pp->sectorsize);
size -= 1; /* one sector takes vmdb header */
- for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) {
+ for (n = 0; n < size; n += maxphys / pp->sectorsize) {
offset = db->ph.db_offset + db->th.conf_offset + n + 1;
- sectors = (size - n) > (MAXPHYS / pp->sectorsize) ?
- MAXPHYS / pp->sectorsize: size - n;
+ sectors = (size - n) > (maxphys / pp->sectorsize) ?
+ maxphys / pp->sectorsize : size - n;
/* read VBLKs */
buf = g_read_data(cp, offset * pp->sectorsize,
sectors * pp->sectorsize, &error);
if (buf == NULL) {
LDM_DEBUG(0, "%s: failed to read VBLK\n",
pp->name);
goto fail;
}
for (p = buf; p < buf + sectors * pp->sectorsize;
p += db->dh.size) {
if (memcmp(p, LDM_VBLK_SIGN,
strlen(LDM_VBLK_SIGN)) != 0) {
LDM_DEBUG(0, "%s: no VBLK signature\n",
pp->name);
LDM_DUMP(p, db->dh.size);
goto fail;
}
vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF);
vh.group = be32dec(p + LDM_VBLK_GROUP_OFF);
/* skip empty blocks */
if (vh.seq == 0 || vh.group == 0)
continue;
vh.index = be16dec(p + LDM_VBLK_INDEX_OFF);
vh.count = be16dec(p + LDM_VBLK_COUNT_OFF);
if (vh.count == 0 || vh.count > 4 ||
vh.seq > db->dh.last_seq) {
LDM_DEBUG(0, "%s: invalid values "
"in the VBLK header\n", pp->name);
LDM_DUMP(p, db->dh.size);
goto fail;
}
if (vh.count > 1) {
error = ldm_xvblk_handle(db, &vh, p);
if (error != 0) {
LDM_DEBUG(0, "%s: xVBLK "
"is corrupted\n", pp->name);
LDM_DUMP(p, db->dh.size);
goto fail;
}
continue;
}
if (be16dec(p + 16) != 0)
LDM_DEBUG(1, "%s: VBLK update"
" status is %u\n", pp->name,
be16dec(p + 16));
error = ldm_vblk_handle(db, p, db->dh.size);
if (error != 0)
goto fail;
}
g_free(buf);
buf = NULL;
}
/* Parse xVBLKs */
while (!LIST_EMPTY(&db->xvblks)) {
xvblk = LIST_FIRST(&db->xvblks);
if (xvblk->map == 0xFF) {
error = ldm_vblk_handle(db, xvblk->data, xvblk->size);
if (error != 0)
goto fail;
} else {
LDM_DEBUG(0, "%s: incomplete or corrupt "
"xVBLK found\n", pp->name);
goto fail;
}
LIST_REMOVE(xvblk, entry);
g_free(xvblk->data);
g_free(xvblk);
}
/* construct all VBLKs relations */
LIST_FOREACH(volume, &db->volumes, entry) {
LIST_FOREACH(vblk, &db->vblks, entry)
if (vblk->type == LDM_VBLK_T_COMPONENT &&
vblk->u.comp.vol_id == volume->id) {
LIST_INSERT_HEAD(&volume->components,
&vblk->u.comp, entry);
volume->count++;
}
LIST_FOREACH(comp, &volume->components, entry)
LIST_FOREACH(vblk, &db->vblks, entry)
if (vblk->type == LDM_VBLK_T_PARTITION &&
vblk->u.part.comp_id == comp->id) {
LIST_INSERT_HEAD(&comp->partitions,
&vblk->u.part, entry);
comp->count++;
}
}
return (0);
fail:
ldm_vmdb_free(db);
g_free(buf);
return (ENXIO);
}
static int
g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry,
struct g_part_parms *gpp)
{
return (ENOSYS);
}
static int
g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp)
{
return (ENOSYS);
}
static int
g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp)
{
return (ENOSYS);
}
static int
g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
{
struct g_part_ldm_table *table;
struct g_provider *pp;
table = (struct g_part_ldm_table *)basetable;
/*
* To destroy LDM on a disk partitioned with GPT we would have to delete
* the ms-ldm-metadata partition, but we can't do this via the standard
* GEOM_PART method.
*/
if (table->is_gpt)
return (ENOSYS);
pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
/*
* To destroy LDM we should wipe the MBR, the first private header and
* the backup private headers.
*/
basetable->gpt_smhead = (1 << ldm_ph_off[0]) | 1;
/*
* Don't touch the last backup private header when the LDM database is
* not located in the last 1MByte area.
* XXX: can't remove all blocks.
*/
if (table->db_offset + LDM_DB_SIZE ==
pp->mediasize / pp->sectorsize)
basetable->gpt_smtail = 1;
return (0);
}
static void
g_part_ldm_dumpconf(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct sbuf *sb, const char *indent)
{
struct g_part_ldm_entry *entry;
entry = (struct g_part_ldm_entry *)baseentry;
if (indent == NULL) {
/* conftxt: libdisk compatibility */
sbuf_printf(sb, " xs LDM xt %u", entry->type);
} else if (entry != NULL) {
/* confxml: partition entry information */
sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent,
entry->type);
} else {
/* confxml: scheme information */
}
}
static int
g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
{
return (0);
}
static int
g_part_ldm_modify(struct g_part_table *basetable,
struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
return (ENOSYS);
}
static const char *
g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
snprintf(buf, bufsz, "s%d", baseentry->gpe_index);
return (buf);
}
static int
ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp)
{
struct g_part_ldm_table *table;
struct g_part_table *gpt;
struct g_part_entry *entry;
struct g_consumer *cp2;
struct gpt_ent *part;
u_char *buf;
int error;
/*
* XXX: We use some knowledge about GEOM_PART_GPT internal
* structures, but that is easier than parsing the GPT ourselves.
*/
g_topology_lock();
gpt = cp->provider->geom->softc;
LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) {
part = (struct gpt_ent *)(entry + 1);
/* Search ms-ldm-metadata partition */
if (memcmp(&part->ent_type,
&gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 ||
entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1)
continue;
/* Create new consumer and attach it to metadata partition */
cp2 = g_new_consumer(cp->geom);
error = g_attach(cp2, entry->gpe_pp);
if (error != 0) {
g_destroy_consumer(cp2);
g_topology_unlock();
return (ENXIO);
}
error = g_access(cp2, 1, 0, 0);
if (error != 0) {
g_detach(cp2);
g_destroy_consumer(cp2);
g_topology_unlock();
return (ENXIO);
}
g_topology_unlock();
LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT",
cp->provider->name, cp2->provider->name);
/* Read the LDM private header */
buf = ldm_privhdr_read(cp2,
ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize,
&error);
if (buf != NULL) {
table = (struct g_part_ldm_table *)basetable;
table->is_gpt = 1;
g_free(buf);
return (G_PART_PROBE_PRI_HIGH);
}
/* second consumer is no longer needed. */
g_topology_lock();
g_access(cp2, -1, 0, 0);
g_detach(cp2);
g_destroy_consumer(cp2);
break;
}
g_topology_unlock();
return (ENXIO);
}
static int
g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp)
{
struct g_provider *pp;
u_char *buf, type[64];
int error, idx;
pp = cp->provider;
if (pp->sectorsize != 512)
return (ENXIO);
error = g_getattr("PART::scheme", cp, &type);
if (error == 0 && strcmp(type, "GPT") == 0) {
if (g_getattr("PART::type", cp, &type) != 0 ||
strcmp(type, "ms-ldm-data") != 0)
return (ENXIO);
error = ldm_gpt_probe(basetable, cp);
return (error);
}
if (basetable->gpt_depth != 0)
return (ENXIO);
/* LDM has 1M metadata area */
if (pp->mediasize <= 1024 * 1024)
return (ENOSPC);
/* Check that there's an MBR */
buf = g_read_data(cp, 0, pp->sectorsize, &error);
if (buf == NULL)
return (error);
if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) {
g_free(buf);
return (ENXIO);
}
error = ENXIO;
/* Check that we have LDM partitions in the MBR */
for (idx = 0; idx < NDOSPART && error != 0; idx++) {
if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM)
error = 0;
}
g_free(buf);
if (error == 0) {
LDM_DEBUG(2, "%s: LDM data partitions found in MBR",
pp->name);
/* Read the LDM private header */
buf = ldm_privhdr_read(cp,
ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error);
if (buf == NULL)
return (error);
g_free(buf);
return (G_PART_PROBE_PRI_HIGH);
}
return (error);
}
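/*
* Editor's note: DOSPTYP_LDM checked above is the 0x42 MBR partition
* type that Windows uses for dynamic (LDM) disks; finding it is what
* triggers the private header read.
*/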
static int
g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp)
{
struct g_part_ldm_table *table;
struct g_part_ldm_entry *entry;
struct g_consumer *cp2;
struct ldm_component *comp;
struct ldm_partition *part;
struct ldm_volume *vol;
struct ldm_disk *disk;
struct ldm_db db;
int error, index, skipped;
table = (struct g_part_ldm_table *)basetable;
memset(&db, 0, sizeof(db));
cp2 = cp; /* ms-ldm-data */
if (table->is_gpt)
cp = LIST_FIRST(&cp->geom->consumer); /* ms-ldm-metadata */
/* Read and parse LDM private headers. */
error = ldm_privhdr_check(&db, cp, table->is_gpt);
if (error != 0)
goto gpt_cleanup;
basetable->gpt_first = table->is_gpt ? 0: db.ph.start;
basetable->gpt_last = basetable->gpt_first + db.ph.size - 1;
table->db_offset = db.ph.db_offset;
/* Make additional checks for GPT */
if (table->is_gpt) {
error = ldm_gpt_check(&db, cp);
if (error != 0)
goto gpt_cleanup;
/*
* Now we should reset the database offset to zero, because our
* consumer cp is attached to the ms-ldm-metadata partition
* and we don't need to add db_offset to read from it.
*/
db.ph.db_offset = 0;
}
/* Read and parse LDM TOC headers. */
error = ldm_tochdr_check(&db, cp);
if (error != 0)
goto gpt_cleanup;
/* Read and parse LDM VMDB header. */
error = ldm_vmdbhdr_check(&db, cp);
if (error != 0)
goto gpt_cleanup;
error = ldm_vmdb_parse(&db, cp);
/*
* For the GPT case we must detach and destroy the
* second consumer before returning.
*/
gpt_cleanup:
if (table->is_gpt) {
g_topology_lock();
g_access(cp, -1, 0, 0);
g_detach(cp);
g_destroy_consumer(cp);
g_topology_unlock();
cp = cp2;
}
if (error != 0)
return (error);
/* Search current disk in the disk list. */
LIST_FOREACH(disk, &db.disks, entry)
if (memcmp(&disk->guid, &db.ph.disk_guid,
sizeof(struct uuid)) == 0)
break;
if (disk == NULL) {
LDM_DEBUG(1, "%s: no LDM volumes on this disk",
cp->provider->name);
ldm_vmdb_free(&db);
return (ENXIO);
}
index = 1;
LIST_FOREACH(vol, &db.volumes, entry) {
LIST_FOREACH(comp, &vol->components, entry) {
/* Skip volumes from different disks. */
part = LIST_FIRST(&comp->partitions);
if (part->disk_id != disk->id)
continue;
skipped = 0;
/* We don't support spanned and striped volumes. */
if (comp->count > 1 || part->offset != 0) {
LDM_DEBUG(1, "%s: LDM volume component "
"%ju has %u partitions. Skipped",
cp->provider->name, (uintmax_t)comp->id,
comp->count);
skipped = 1;
}
/*
* Allow mirrored volumes only when they are explicitly
* allowed with kern.geom.part.ldm.show_mirrors=1.
*/
if (vol->count > 1 && show_mirrors == 0) {
LDM_DEBUG(1, "%s: LDM volume %ju has %u "
"components. Skipped",
cp->provider->name, (uintmax_t)vol->id,
vol->count);
skipped = 1;
}
entry = (struct g_part_ldm_entry *)g_part_new_entry(
basetable, index++,
basetable->gpt_first + part->start,
basetable->gpt_first + part->start +
part->size - 1);
/*
* Mark a skipped partition as an ms-ldm-data partition.
* We do not support these, but it is better to show
* that something is there than to just show
* free space.
*/
if (skipped == 0)
entry->type = vol->part_type;
else
entry->type = DOSPTYP_LDM;
LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju,"
" end: %ju, type: 0x%02x\n", cp->provider->name,
(uintmax_t)part->id,(uintmax_t)part->start +
basetable->gpt_first, (uintmax_t)part->start +
part->size + basetable->gpt_first - 1,
vol->part_type);
}
}
ldm_vmdb_free(&db);
return (error);
}
static const char *
g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry,
char *buf, size_t bufsz)
{
struct g_part_ldm_entry *entry;
int i;
entry = (struct g_part_ldm_entry *)baseentry;
for (i = 0; i < nitems(ldm_alias_match); i++) {
if (ldm_alias_match[i].typ == entry->type)
return (g_part_alias_name(ldm_alias_match[i].alias));
}
snprintf(buf, bufsz, "!%d", entry->type);
return (buf);
}
static int
g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp)
{
return (ENOSYS);
}
diff --git a/sys/geom/raid/md_ddf.c b/sys/geom/raid/md_ddf.c
index 68c058da61f4..0a3ec6637337 100644
--- a/sys/geom/raid/md_ddf.c
+++ b/sys/geom/raid/md_ddf.c
@@ -1,3087 +1,3087 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/gsb_crc32.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <sys/disk.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "geom/raid/md_ddf.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata");
#define DDF_MAX_DISKS_HARD 128
#define DDF_MAX_DISKS 16
#define DDF_MAX_VDISKS 7
#define DDF_MAX_PARTITIONS 1
#define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */
struct ddf_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_pd_record *pdr;
struct ddf_vd_record *vdr;
void *cr;
struct ddf_pdd_record *pdd;
struct ddf_bbm_log *bbm;
};
struct ddf_vol_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD];
};
struct g_raid_md_ddf_perdisk {
struct ddf_meta pd_meta;
};
struct g_raid_md_ddf_pervolume {
struct ddf_vol_meta pv_meta;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
struct g_raid_md_ddf_object {
struct g_raid_md_object mdio_base;
u_int mdio_bigendian;
struct ddf_meta mdio_meta;
int mdio_starting;
struct callout mdio_start_co; /* STARTING state timer. */
int mdio_started;
struct root_hold_token *mdio_rootmount; /* Root mount delay token. */
};
static g_raid_md_create_req_t g_raid_md_create_req_ddf;
static g_raid_md_taste_t g_raid_md_taste_ddf;
static g_raid_md_event_t g_raid_md_event_ddf;
static g_raid_md_volume_event_t g_raid_md_volume_event_ddf;
static g_raid_md_ctl_t g_raid_md_ctl_ddf;
static g_raid_md_write_t g_raid_md_write_ddf;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf;
static g_raid_md_free_disk_t g_raid_md_free_disk_ddf;
static g_raid_md_free_volume_t g_raid_md_free_volume_ddf;
static g_raid_md_free_t g_raid_md_free_ddf;
static kobj_method_t g_raid_md_ddf_methods[] = {
KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_ddf_class = {
"DDF",
g_raid_md_ddf_methods,
sizeof(struct g_raid_md_ddf_object),
.mdc_enable = 1,
.mdc_priority = 100
};
#define GET8(m, f) ((m)->f)
#define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f))
#define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f))
#define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f))
#define GET8D(m, f) (f)
#define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f))
#define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f))
#define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f))
#define GET8P(m, f) (*(f))
#define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f))
#define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f))
#define GET64P(m, f) ((m)->bigendian ? be64dec(f) : le64dec(f))
#define SET8P(m, f, v) \
(*(f) = (v))
#define SET16P(m, f, v) \
do { \
if ((m)->bigendian) \
be16enc((f), (v)); \
else \
le16enc((f), (v)); \
} while (0)
#define SET32P(m, f, v) \
do { \
if ((m)->bigendian) \
be32enc((f), (v)); \
else \
le32enc((f), (v)); \
} while (0)
#define SET64P(m, f, v) \
do { \
if ((m)->bigendian) \
be64enc((f), (v)); \
else \
le64enc((f), (v)); \
} while (0)
#define SET8(m, f, v) SET8P((m), &((m)->f), (v))
#define SET16(m, f, v) SET16P((m), &((m)->f), (v))
#define SET32(m, f, v) SET32P((m), &((m)->f), (v))
#define SET64(m, f, v) SET64P((m), &((m)->f), (v))
#define SET8D(m, f, v) SET8P((m), &(f), (v))
#define SET16D(m, f, v) SET16P((m), &(f), (v))
#define SET32D(m, f, v) SET32P((m), &(f), (v))
#define SET64D(m, f, v) SET64P((m), &(f), (v))
#define GETCRNUM(m) (GET32((m), hdr->cr_length) / \
GET16((m), hdr->Configuration_Record_Length))
#define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
#define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
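/*
 * Usage sketch (descriptive only): the GET*()/SET*() accessors hide the
 * on-disk byte order selected by meta->bigendian, so DDF fields should
 * always be read and written through them, e.g. with a struct ddf_meta
 * pointer 'meta':
 *
 *	uint16_t maxpd = GET16(meta, hdr->Max_PD_Entries);
 *	SET16(meta, hdr->Max_PD_Entries, maxpd);
 *
 * GETCRNUM() derives the number of configuration records from cr_length
 * and Configuration_Record_Length, while GETVDCPTR()/GETSAPTR() return a
 * pointer to the n-th configuration record within meta->cr.
 */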
static int
isff(uint8_t *buf, int size)
{
int i;
for (i = 0; i < size; i++)
if (buf[i] != 0xff)
return (0);
return (1);
}
static void
print_guid(uint8_t *buf)
{
int i, ascii;
ascii = 1;
for (i = 0; i < 24; i++) {
if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) {
ascii = 0;
break;
}
}
if (ascii) {
printf("'%.24s'", buf);
} else {
for (i = 0; i < 24; i++)
printf("%02x", buf[i]);
}
}
static void
g_raid_md_ddf_print(struct ddf_meta *meta)
{
struct ddf_vdc_record *vdc;
struct ddf_vuc_record *vuc;
struct ddf_sa_record *sa;
uint64_t *val2;
uint32_t val;
int i, j, k, num, num2;
if (g_raid_debug < 1)
return;
printf("********* DDF Metadata *********\n");
printf("**** Header ****\n");
printf("DDF_Header_GUID ");
print_guid(meta->hdr->DDF_Header_GUID);
printf("\n");
printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]);
printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number));
printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp));
printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag));
printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag));
printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping));
printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA));
printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA));
printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length));
printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA));
printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries));
printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries));
printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions));
printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length));
printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries));
printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length));
printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length));
printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length));
printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length));
printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length));
printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length));
printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length));
printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length));
printf("**** Controller Data ****\n");
printf("Controller_GUID ");
print_guid(meta->cdr->Controller_GUID);
printf("\n");
printf("Controller_Type 0x%04x%04x 0x%04x%04x\n",
GET16(meta, cdr->Controller_Type.Vendor_ID),
GET16(meta, cdr->Controller_Type.Device_ID),
GET16(meta, cdr->Controller_Type.SubVendor_ID),
GET16(meta, cdr->Controller_Type.SubDevice_ID));
printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]);
printf("**** Physical Disk Records ****\n");
printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs));
printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported));
for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) {
if (isff(meta->pdr->entry[j].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff)
continue;
printf("PD_GUID ");
print_guid(meta->pdr->entry[j].PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdr->entry[j].PD_Reference));
printf("PD_Type 0x%04x\n",
GET16(meta, pdr->entry[j].PD_Type));
printf("PD_State 0x%04x\n",
GET16(meta, pdr->entry[j].PD_State));
printf("Configured_Size %ju\n",
GET64(meta, pdr->entry[j].Configured_Size));
printf("Block_Size %u\n",
GET16(meta, pdr->entry[j].Block_Size));
}
printf("**** Virtual Disk Records ****\n");
printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs));
printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported));
for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) {
if (isff(meta->vdr->entry[j].VD_GUID, 24))
continue;
printf("VD_GUID ");
print_guid(meta->vdr->entry[j].VD_GUID);
printf("\n");
printf("VD_Number 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Number));
printf("VD_Type 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Type));
printf("VD_State 0x%02x\n",
GET8(meta, vdr->entry[j].VD_State));
printf("Init_State 0x%02x\n",
GET8(meta, vdr->entry[j].Init_State));
printf("Drive_Failures_Remaining %u\n",
GET8(meta, vdr->entry[j].Drive_Failures_Remaining));
printf("VD_Name '%.16s'\n",
(char *)&meta->vdr->entry[j].VD_Name);
}
printf("**** Configuration Records ****\n");
num = GETCRNUM(meta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(meta, j);
val = GET32D(meta, vdc->Signature);
switch (val) {
case DDF_VDCR_SIGNATURE:
printf("** Virtual Disk Configuration **\n");
printf("VD_GUID ");
print_guid(vdc->VD_GUID);
printf("\n");
printf("Timestamp 0x%08x\n",
GET32D(meta, vdc->Timestamp));
printf("Sequence_Number 0x%08x\n",
GET32D(meta, vdc->Sequence_Number));
printf("Primary_Element_Count %u\n",
GET16D(meta, vdc->Primary_Element_Count));
printf("Stripe_Size %u\n",
GET8D(meta, vdc->Stripe_Size));
printf("Primary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Primary_RAID_Level));
printf("RLQ 0x%02x\n",
GET8D(meta, vdc->RLQ));
printf("Secondary_Element_Count %u\n",
GET8D(meta, vdc->Secondary_Element_Count));
printf("Secondary_Element_Seq %u\n",
GET8D(meta, vdc->Secondary_Element_Seq));
printf("Secondary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Secondary_RAID_Level));
printf("Block_Count %ju\n",
GET64D(meta, vdc->Block_Count));
printf("VD_Size %ju\n",
GET64D(meta, vdc->VD_Size));
printf("Block_Size %u\n",
GET16D(meta, vdc->Block_Size));
printf("Rotate_Parity_count %u\n",
GET8D(meta, vdc->Rotate_Parity_count));
printf("Associated_Spare_Disks");
for (i = 0; i < 8; i++) {
if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff)
printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i]));
}
printf("\n");
printf("Cache_Flags %016jx\n",
GET64D(meta, vdc->Cache_Flags));
printf("BG_Rate %u\n",
GET8D(meta, vdc->BG_Rate));
printf("MDF_Parity_Disks %u\n",
GET8D(meta, vdc->MDF_Parity_Disks));
printf("MDF_Parity_Generator_Polynomial 0x%04x\n",
GET16D(meta, vdc->MDF_Parity_Generator_Polynomial));
printf("MDF_Constant_Generation_Method 0x%02x\n",
GET8D(meta, vdc->MDF_Constant_Generation_Method));
printf("Physical_Disks ");
num2 = GET16D(meta, vdc->Primary_Element_Count);
val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]);
for (i = 0; i < num2; i++)
printf(" 0x%08x @ %ju",
GET32D(meta, vdc->Physical_Disk_Sequence[i]),
GET64P(meta, val2 + i));
printf("\n");
break;
case DDF_VUCR_SIGNATURE:
printf("** Vendor Unique Configuration **\n");
vuc = (struct ddf_vuc_record *)vdc;
printf("VD_GUID ");
print_guid(vuc->VD_GUID);
printf("\n");
break;
case DDF_SA_SIGNATURE:
printf("** Spare Assignment Configuration **\n");
sa = (struct ddf_sa_record *)vdc;
printf("Timestamp 0x%08x\n",
GET32D(meta, sa->Timestamp));
printf("Spare_Type 0x%02x\n",
GET8D(meta, sa->Spare_Type));
printf("Populated_SAEs %u\n",
GET16D(meta, sa->Populated_SAEs));
printf("MAX_SAE_Supported %u\n",
GET16D(meta, sa->MAX_SAE_Supported));
for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) {
if (isff(sa->entry[i].VD_GUID, 24))
continue;
printf("VD_GUID ");
for (k = 0; k < 24; k++)
printf("%02x", sa->entry[i].VD_GUID[k]);
printf("\n");
printf("Secondary_Element %u\n",
GET16D(meta, sa->entry[i].Secondary_Element));
}
break;
case 0x00000000:
case 0xFFFFFFFF:
break;
default:
printf("Unknown configuration signature %08x\n", val);
break;
}
}
printf("**** Physical Disk Data ****\n");
printf("PD_GUID ");
print_guid(meta->pdd->PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdd->PD_Reference));
printf("Forced_Ref_Flag 0x%02x\n",
GET8(meta, pdd->Forced_Ref_Flag));
printf("Forced_PD_GUID_Flag 0x%02x\n",
GET8(meta, pdd->Forced_PD_GUID_Flag));
}
static int
ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference)
{
int i;
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0)
return (i);
} else if (PD_Reference != 0xffffffff) {
if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference)
return (i);
} else
if (isff(meta->pdr->entry[i].PD_GUID, 24))
return (i);
}
if (GUID == NULL && PD_Reference == 0xffffffff) {
if (i >= GET16(meta, pdr->Max_PDE_Supported))
return (-1);
SET16(meta, pdr->Populated_PDEs, i + 1);
return (i);
}
return (-1);
}
static int
ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID)
{
int i;
for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0)
return (i);
} else
if (isff(meta->vdr->entry[i].VD_GUID, 24))
return (i);
}
if (GUID == NULL) {
if (i >= GET16(meta, vdr->Max_VDE_Supported))
return (-1);
SET16(meta, vdr->Populated_VDEs, i + 1);
return (i);
}
return (-1);
}
static struct ddf_vdc_record *
ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GUID != NULL) {
if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE &&
memcmp(vdc->VD_GUID, GUID, 24) == 0)
return (vdc);
} else
if (GET32D(meta, vdc->Signature) == 0xffffffff ||
GET32D(meta, vdc->Signature) == 0)
return (vdc);
}
return (NULL);
}
static int
ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num, cnt;
cnt = 0;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0)
cnt++;
}
return (cnt);
}
static int
ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference,
int *bvdp, int *posp)
{
int i, bvd, pos;
i = 0;
for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) {
if (vmeta->bvdc[bvd] == NULL) {
i += GET16(vmeta, vdc->Primary_Element_Count); // XXX
continue;
}
for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count);
pos++, i++) {
if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) ==
PD_Reference) {
if (bvdp != NULL)
*bvdp = bvd;
if (posp != NULL)
*posp = pos;
return (i);
}
}
}
return (-1);
}
static struct ddf_sa_record *
ddf_meta_find_sa(struct ddf_meta *meta, int create)
{
struct ddf_sa_record *sa;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE)
return (sa);
}
if (create) {
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == 0xffffffff ||
GET32D(meta, sa->Signature) == 0)
return (sa);
}
}
return (NULL);
}
static void
ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *meta;
struct ddf_pd_entry *pde;
off_t anchorlba;
u_int ss, pos, size;
int len, error;
char serial_buffer[DISK_IDENT_SIZE];
if (sample->hdr == NULL)
sample = NULL;
mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
ss = disk->d_consumer->provider->sectorsize;
anchorlba = disk->d_consumer->provider->mediasize / ss - 1;
meta->sectorsize = ss;
meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian;
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
/* Header */
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memset(meta->hdr, 0xff, ss);
if (sample) {
memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header));
if (ss != sample->sectorsize) {
SET32(meta, hdr->WorkSpace_Length,
howmany(GET32(sample, hdr->WorkSpace_Length) *
sample->sectorsize, ss));
SET16(meta, hdr->Configuration_Record_Length,
howmany(GET16(sample,
hdr->Configuration_Record_Length) *
sample->sectorsize, ss));
SET32(meta, hdr->cd_length,
howmany(GET32(sample, hdr->cd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdr_length,
howmany(GET32(sample, hdr->pdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->vdr_length,
howmany(GET32(sample, hdr->vdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->cr_length,
howmany(GET32(sample, hdr->cr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdd_length,
howmany(GET32(sample, hdr->pdd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->bbmlog_length,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Diagnostic_Space,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Vendor_Specific_Logs,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
}
} else {
SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE);
snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x",
(u_int)(ts.tv_sec - DECADE), arc4random());
memcpy(meta->hdr->DDF_rev, "02.00.00", 8);
SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE));
SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss);
SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1);
SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS);
SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS);
SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS);
SET16(meta, hdr->Configuration_Record_Length,
howmany(sizeof(struct ddf_vdc_record) + (4 + 8) *
GET16(meta, hdr->Max_Primary_Element_Entries), ss));
SET32(meta, hdr->cd_length,
howmany(sizeof(struct ddf_cd_record), ss));
SET32(meta, hdr->pdr_length,
howmany(sizeof(struct ddf_pd_record) +
sizeof(struct ddf_pd_entry) * GET16(meta,
hdr->Max_PD_Entries), ss));
SET32(meta, hdr->vdr_length,
howmany(sizeof(struct ddf_vd_record) +
sizeof(struct ddf_vd_entry) *
GET16(meta, hdr->Max_VD_Entries), ss));
SET32(meta, hdr->cr_length,
GET16(meta, hdr->Configuration_Record_Length) *
(GET16(meta, hdr->Max_Partitions) + 1));
SET32(meta, hdr->pdd_length,
howmany(sizeof(struct ddf_pdd_record), ss));
SET32(meta, hdr->bbmlog_length, 0);
SET32(meta, hdr->Diagnostic_Space_Length, 0);
SET32(meta, hdr->Vendor_Specific_Logs_Length, 0);
}
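/*
 * Layout note (descriptive only): the section offsets accumulated in
 * 'pos' below are in sectors relative to the header block.  The anchor
 * header lives in the very last sector of the disk, and the primary
 * header is placed 'pos' sectors before it, so the header plus all of
 * its sections end immediately in front of the anchor.
 */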
pos = 1;
SET32(meta, hdr->cd_section, pos);
pos += GET32(meta, hdr->cd_length);
SET32(meta, hdr->pdr_section, pos);
pos += GET32(meta, hdr->pdr_length);
SET32(meta, hdr->vdr_section, pos);
pos += GET32(meta, hdr->vdr_length);
SET32(meta, hdr->cr_section, pos);
pos += GET32(meta, hdr->cr_length);
SET32(meta, hdr->pdd_section, pos);
pos += GET32(meta, hdr->pdd_length);
SET32(meta, hdr->bbmlog_section,
GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->bbmlog_length);
SET32(meta, hdr->Diagnostic_Space,
GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->Diagnostic_Space_Length);
SET32(meta, hdr->Vendor_Specific_Logs,
GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff);
pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1);
SET64(meta, hdr->Primary_Header_LBA,
anchorlba - pos);
SET64(meta, hdr->Secondary_Header_LBA,
0xffffffffffffffffULL);
SET64(meta, hdr->WorkSpace_LBA,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
/* Controller Data */
size = GET32(meta, hdr->cd_length) * ss;
meta->cdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cdr, 0xff, size);
SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE);
memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24);
memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16);
/* Physical Drive Records. */
size = GET32(meta, hdr->pdr_length) * ss;
meta->pdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdr, 0xff, size);
SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE);
SET16(meta, pdr->Populated_PDEs, 1);
SET16(meta, pdr->Max_PDE_Supported,
GET16(meta, hdr->Max_PD_Entries));
pde = &meta->pdr->entry[0];
len = sizeof(serial_buffer);
error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer);
if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20)
snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer);
else
snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xffff);
SET32D(meta, pde->PD_Reference, arc4random());
SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE);
SET16D(meta, pde->PD_State, 0);
SET64D(meta, pde->Configured_Size,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
SET16D(meta, pde->Block_Size, ss);
/* Virtual Drive Records. */
size = GET32(meta, hdr->vdr_length) * ss;
meta->vdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdr, 0xff, size);
SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE);
SET32(meta, vdr->Populated_VDEs, 0);
SET16(meta, vdr->Max_VDE_Supported,
GET16(meta, hdr->Max_VD_Entries));
/* Configuration Records. */
size = GET32(meta, hdr->cr_length) * ss;
meta->cr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cr, 0xff, size);
/* Physical Disk Data. */
size = GET32(meta, hdr->pdd_length) * ss;
meta->pdd = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdd, 0xff, size);
SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE);
memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24);
SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference));
SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF);
SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID);
/* Bad Block Management Log. */
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
meta->bbm = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->bbm, 0xff, size);
SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE);
SET32(meta, bbm->Entry_Count, 0);
SET32(meta, bbm->Spare_Block_Count, 0);
}
}
static void
ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src)
{
u_int ss;
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss);
dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss);
dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss);
dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss);
if (src->bbm != NULL) {
dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss);
}
}
static void
ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src)
{
struct ddf_pd_entry *pde, *spde;
int i, j;
for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) {
spde = &src->pdr->entry[i];
if (isff(spde->PD_GUID, 24))
continue;
j = ddf_meta_find_pd(meta, NULL,
GET32(src, pdr->entry[i].PD_Reference));
if (j < 0) {
j = ddf_meta_find_pd(meta, NULL, 0xffffffff);
pde = &meta->pdr->entry[j];
memcpy(pde, spde, sizeof(*pde));
} else {
pde = &meta->pdr->entry[j];
SET16D(meta, pde->PD_State,
GET16D(meta, pde->PD_State) |
GET16D(src, pde->PD_State));
}
}
}
static void
ddf_meta_free(struct ddf_meta *meta)
{
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->pdr != NULL) {
free(meta->pdr, M_MD_DDF);
meta->pdr = NULL;
}
if (meta->vdr != NULL) {
free(meta->vdr, M_MD_DDF);
meta->vdr = NULL;
}
if (meta->cr != NULL) {
free(meta->cr, M_MD_DDF);
meta->cr = NULL;
}
if (meta->pdd != NULL) {
free(meta->pdd, M_MD_DDF);
meta->pdd = NULL;
}
if (meta->bbm != NULL) {
free(meta->bbm, M_MD_DDF);
meta->bbm = NULL;
}
}
static void
ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
u_int ss, size;
meta->bigendian = sample->bigendian;
ss = meta->sectorsize = sample->sectorsize;
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, sample->hdr, ss);
meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss);
meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry));
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xf);
size = GET16(sample, hdr->Configuration_Record_Length) * ss;
meta->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdc, 0xff, size);
SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE);
memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24);
SET32(meta, vdc->Sequence_Number, 0);
}
static void
ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src,
uint8_t *GUID, int started)
{
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
int vnew, bvnew, bvd, size;
u_int ss;
vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)];
vdc = ddf_meta_find_vdc(src, GUID);
if (GET8D(src, vdc->Secondary_Element_Count) == 1)
bvd = 0;
else
bvd = GET8D(src, vdc->Secondary_Element_Seq);
size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize;
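/*
 * The freshness checks below use serial-number arithmetic on
 * Sequence_Number: casting the difference to int32_t keeps the
 * comparison correct across 32-bit wraparound, so e.g. a source
 * sequence of 0x00000001 is treated as newer than 0xffffffff.
 */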
if (dst->vdc == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, vdc->Sequence_Number))) > 0))
vnew = 1;
else
vnew = 0;
if (dst->bvdc[bvd] == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, bvdc[bvd]->Sequence_Number))) > 0))
bvnew = 1;
else
bvnew = 0;
if (vnew) {
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
if (dst->hdr != NULL)
free(dst->hdr, M_MD_DDF);
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
if (dst->cdr != NULL)
free(dst->cdr, M_MD_DDF);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
if (dst->vde != NULL)
free(dst->vde, M_MD_DDF);
dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry));
if (dst->vdc != NULL)
free(dst->vdc, M_MD_DDF);
dst->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->vdc, vdc, size);
}
if (bvnew) {
if (dst->bvdc[bvd] != NULL)
free(dst->bvdc[bvd], M_MD_DDF);
dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->bvdc[bvd], vdc, size);
}
}
static void
ddf_vol_meta_free(struct ddf_vol_meta *meta)
{
int i;
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->vde != NULL) {
free(meta->vde, M_MD_DDF);
meta->vde = NULL;
}
if (meta->vdc != NULL) {
free(meta->vdc, M_MD_DDF);
meta->vdc = NULL;
}
for (i = 0; i < DDF_MAX_DISKS_HARD; i++) {
if (meta->bvdc[i] != NULL) {
free(meta->bvdc[i], M_MD_DDF);
meta->bvdc[i] = NULL;
}
}
}
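/*
 * ddf_meta_unused_range() below is an interval-subtraction pass: it
 * starts from [0, Configured_Size) for this disk, carves out the extent
 * referenced by every VDC that uses the disk, and reports the largest
 * remaining gap.  For example (hypothetical numbers), with
 * Configured_Size 1000 and VDCs occupying [0, 300) and [600, 700), the
 * result is *off = 300, *size = 300.
 */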
static int
ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size)
{
struct ddf_vdc_record *vdc;
off_t beg[32], end[32], beg1, end1;
uint64_t *offp;
int i, j, n, num, pos;
uint32_t ref;
*off = 0;
*size = 0;
ref = GET32(meta, pdd->PD_Reference);
pos = ddf_meta_find_pd(meta, NULL, ref);
beg[0] = 0;
end[0] = GET64(meta, pdr->entry[pos].Configured_Size);
n = 1;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++)
if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref)
break;
if (pos == GET16D(meta, vdc->Primary_Element_Count))
continue;
offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[
GET16(meta, hdr->Max_Primary_Element_Entries)]);
beg1 = GET64P(meta, offp + pos);
end1 = beg1 + GET64D(meta, vdc->Block_Count);
for (j = 0; j < n; j++) {
if (beg[j] >= end1 || end[j] <= beg1 )
continue;
if (beg[j] < beg1 && end[j] > end1) {
beg[n] = end1;
end[n] = end[j];
end[j] = beg1;
n++;
} else if (beg[j] < beg1)
end[j] = beg1;
else
beg[j] = end1;
}
}
for (j = 0; j < n; j++) {
if (end[j] - beg[j] > *size) {
*off = beg[j];
*size = end[j] - beg[j];
}
}
return ((*size > 0) ? 1 : 0);
}
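/*
 * VD_Name is a fixed 16-byte field padded with spaces (0x20) rather
 * than NUL-terminated: ddf_meta_get_name() below strips the trailing
 * padding and ddf_meta_put_name() re-pads, so e.g. "data" is stored as
 * "data" followed by twelve spaces.
 */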
static void
ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf)
{
const char *b;
int i;
b = meta->vdr->entry[num].VD_Name;
for (i = 15; i >= 0; i--)
if (b[i] != 0x20)
break;
memcpy(buf, b, i + 1);
buf[i + 1] = 0;
}
static void
ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf)
{
int len;
len = min(strlen(buf), 16);
memset(meta->vde->VD_Name, 0x20, 16);
memcpy(meta->vde->VD_Name, buf, len);
}
static int
ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_header *ahdr, *hdr;
char *abuf, *buf;
off_t plba, slba, lba;
int error, len, i;
u_int ss;
uint32_t val;
ddf_meta_free(meta);
pp = cp->provider;
ss = meta->sectorsize = pp->sectorsize;
/* Read anchor block. */
abuf = g_read_data(cp, pp->mediasize - ss, ss, &error);
if (abuf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (error);
}
ahdr = (struct ddf_header *)abuf;
/* Check if this is a DDF RAID structure. */
if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 1;
else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 0;
else {
G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if (ahdr->Header_Type != DDF_HEADER_ANCHOR) {
G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name);
error = EINVAL;
goto done;
}
meta->hdr = ahdr;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
meta->hdr = NULL;
if (crc32(ahdr, ss) != val) {
G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name);
error = EINVAL;
goto done;
}
if ((plba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
if (slba != -1 && (slba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
lba = plba;
doread:
error = 0;
ddf_meta_free(meta);
/* Read header block. */
buf = g_read_data(cp, lba * ss, ss, &error);
if (buf == NULL) {
readerror:
G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).",
(lba == plba) ? "primary" : "secondary", pp->name, error);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name);
goto done;
}
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, buf, ss);
g_free(buf);
hdr = meta->hdr;
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
if (hdr->Signature != ahdr->Signature ||
crc32(meta->hdr, ss) != val ||
memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) ||
GET64(meta, hdr->Primary_Header_LBA) != plba ||
GET64(meta, hdr->Secondary_Header_LBA) != slba) {
hdrerror:
G_RAID_DEBUG(1, "DDF %s metadata check failed on %s",
(lba == plba) ? "primary" : "secondary", pp->name);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) ||
(lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY))
goto hdrerror;
len = 1;
len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length));
len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length));
len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length));
len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length));
len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length));
if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->bbmlog_length));
if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length));
if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length));
if ((plba + len) * ss >= pp->mediasize)
goto hdrerror;
if (slba != -1 && (slba + len) * ss >= pp->mediasize)
goto hdrerror;
/* Workaround for Adaptec implementation. */
if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) {
SET16(meta, hdr->Max_Primary_Element_Entries,
min(GET16(meta, hdr->Max_PD_Entries),
(GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12));
}
- if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS ||
- GET32(meta, hdr->pdr_length) * ss >= MAXPHYS ||
- GET32(meta, hdr->vdr_length) * ss >= MAXPHYS ||
- GET32(meta, hdr->cr_length) * ss >= MAXPHYS ||
- GET32(meta, hdr->pdd_length) * ss >= MAXPHYS ||
- GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) {
+ if (GET32(meta, hdr->cd_length) * ss >= maxphys ||
+ GET32(meta, hdr->pdr_length) * ss >= maxphys ||
+ GET32(meta, hdr->vdr_length) * ss >= maxphys ||
+ GET32(meta, hdr->cr_length) * ss >= maxphys ||
+ GET32(meta, hdr->pdd_length) * ss >= maxphys ||
+ GET32(meta, hdr->bbmlog_length) * ss >= maxphys) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
goto hdrerror;
}
/* Read controller data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
GET32(meta, hdr->cd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss);
g_free(buf);
if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE)
goto hdrerror;
/* Read physical disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
GET32(meta, hdr->pdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss);
g_free(buf);
if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE)
goto hdrerror;
/*
* Workaround for reading metadata corrupted due to a graid bug.
* XXX: Remove this before we have disks above 128PB. :)
*/
if (meta->bigendian) {
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (isff(meta->pdr->entry[i].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[i].PD_Reference) ==
0xffffffff)
continue;
if (GET64(meta, pdr->entry[i].Configured_Size) >=
(1ULL << 48)) {
SET16(meta, pdr->entry[i].PD_State,
GET16(meta, pdr->entry[i].PD_State) &
~DDF_PDE_FAILED);
SET64(meta, pdr->entry[i].Configured_Size,
GET64(meta, pdr->entry[i].Configured_Size) &
((1ULL << 48) - 1));
}
}
}
/* Read virtual disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
GET32(meta, hdr->vdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss);
g_free(buf);
if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE)
goto hdrerror;
/* Read configuration records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
GET32(meta, hdr->cr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss);
g_free(buf);
/* Read physical disk data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
GET32(meta, hdr->pdd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss);
g_free(buf);
if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE)
goto hdrerror;
i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference));
if (i < 0)
goto hdrerror;
/* Read BBM Log. */
if (GET32(meta, hdr->bbmlog_section) != 0xffffffff &&
GET32(meta, hdr->bbmlog_length) != 0) {
buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss,
GET32(meta, hdr->bbmlog_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss);
g_free(buf);
if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE)
goto hdrerror;
}
done:
g_free(abuf);
if (error != 0)
ddf_meta_free(meta);
return (error);
}
static int
ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_vdc_record *vdc;
off_t alba, plba, slba, lba;
u_int ss, size;
int error, i, num;
pp = cp->provider;
ss = pp->sectorsize;
lba = alba = pp->mediasize / ss - 1;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
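/*
 * DDF checksums are computed with the CRC field itself preset to
 * 0xffffffff: every structure written below follows the same
 * "seed the CRC field, crc32() the whole block, store the result"
 * pattern.
 */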
next:
SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR :
(lba == plba) ? DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY);
SET32(meta, hdr->CRC, 0xffffffff);
SET32(meta, hdr->CRC, crc32(meta->hdr, ss));
error = g_write_data(cp, lba * ss, meta->hdr, ss);
if (error != 0) {
err:
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
if (lba != alba)
goto done;
}
if (lba == alba) {
lba = plba;
goto next;
}
size = GET32(meta, hdr->cd_length) * ss;
SET32(meta, cdr->CRC, 0xffffffff);
SET32(meta, cdr->CRC, crc32(meta->cdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
meta->cdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdr_length) * ss;
SET32(meta, pdr->CRC, 0xffffffff);
SET32(meta, pdr->CRC, crc32(meta->pdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
meta->pdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->vdr_length) * ss;
SET32(meta, vdr->CRC, 0xffffffff);
SET32(meta, vdr->CRC, crc32(meta->vdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
meta->vdr, size);
if (error != 0)
goto err;
size = GET16(meta, hdr->Configuration_Record_Length) * ss;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
SET32D(meta, vdc->CRC, 0xffffffff);
SET32D(meta, vdc->CRC, crc32(vdc, size));
}
error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
meta->cr, size * num);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdd_length) * ss;
SET32(meta, pdd->CRC, 0xffffffff);
SET32(meta, pdd->CRC, crc32(meta->pdd, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
meta->pdd, size);
if (error != 0)
goto err;
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
SET32(meta, bbm->CRC, 0xffffffff);
SET32(meta, bbm->CRC, crc32(meta->bbm, size));
error = g_write_data(cp,
(lba + GET32(meta, hdr->bbmlog_section)) * ss,
meta->bbm, size);
if (error != 0)
goto err;
}
done:
if (lba == plba && slba != -1) {
lba = slba;
goto next;
}
return (error);
}
static int
ddf_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error;
pp = cp->provider;
buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO);
error = g_write_data(cp, pp->mediasize - pp->sectorsize,
buf, pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_DDF);
return (error);
}
static struct g_raid_volume *
g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID)
{
struct g_raid_volume *vol;
struct g_raid_md_ddf_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0)
break;
}
return (vol);
}
static struct g_raid_disk *
g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id)
{
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct ddf_meta *meta;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
if (GUID != NULL) {
if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0)
break;
} else {
if (GET32(meta, pdd->PD_Reference) == id)
break;
}
}
return (disk);
}
static int
g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
if (vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_ddf_purge_disks(struct g_raid_softc *sc)
{
#if 0
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_ddf_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_ddf_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_DDF);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left - erase and delete disk. */
if (pd->pd_subdisks == 0) {
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
#endif
return (0);
}
static int
g_raid_md_ddf_supported(int level, int qual, int disks, int force)
{
if (disks > DDF_MAX_DISKS_HARD)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (qual == G_RAID_VOLUME_RLQ_R1SM) {
if (!force && disks != 2)
return (0);
} else if (qual == G_RAID_VOLUME_RLQ_R1MM) {
if (!force && disks != 3)
return (0);
} else
return (0);
break;
case G_RAID_VOLUME_RL_RAID3:
if (qual != G_RAID_VOLUME_RLQ_R3P0 &&
qual != G_RAID_VOLUME_RLQ_R3PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID4:
if (qual != G_RAID_VOLUME_RLQ_R4P0 &&
qual != G_RAID_VOLUME_RLQ_R4PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (qual != G_RAID_VOLUME_RLQ_R5RA &&
qual != G_RAID_VOLUME_RLQ_R5RS &&
qual != G_RAID_VOLUME_RLQ_R5LA &&
qual != G_RAID_VOLUME_RLQ_R5LS)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID6:
if (qual != G_RAID_VOLUME_RLQ_R6RA &&
qual != G_RAID_VOLUME_RLQ_R6RS &&
qual != G_RAID_VOLUME_RLQ_R6LA &&
qual != G_RAID_VOLUME_RLQ_R6LS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual != G_RAID_VOLUME_RLQ_RMDFRA &&
qual != G_RAID_VOLUME_RLQ_RMDFRS &&
qual != G_RAID_VOLUME_RLQ_RMDFLA &&
qual != G_RAID_VOLUME_RLQ_RMDFLS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (qual != G_RAID_VOLUME_RLQ_R1EA &&
qual != G_RAID_VOLUME_RLQ_R1EO)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5E:
if (qual != G_RAID_VOLUME_RLQ_R5ERA &&
qual != G_RAID_VOLUME_RLQ_R5ERS &&
qual != G_RAID_VOLUME_RLQ_R5ELA &&
qual != G_RAID_VOLUME_RLQ_R5ELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5EE:
if (qual != G_RAID_VOLUME_RLQ_R5EERA &&
qual != G_RAID_VOLUME_RLQ_R5EERS &&
qual != G_RAID_VOLUME_RLQ_R5EELA &&
qual != G_RAID_VOLUME_RLQ_R5EELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5R:
if (qual != G_RAID_VOLUME_RLQ_R5RRA &&
qual != G_RAID_VOLUME_RLQ_R5RRS &&
qual != G_RAID_VOLUME_RLQ_R5RLA &&
qual != G_RAID_VOLUME_RLQ_R5RLS)
return (0);
if (disks < 3)
return (0);
break;
default:
return (0);
}
return (1);
}
static int
g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
struct ddf_meta *pdmeta, *gmeta;
struct ddf_vdc_record *vdc1;
struct ddf_sa_record *sa;
off_t size, eoff = 0, esize = 0;
uint64_t *val2;
int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos;
int i, resurrection = 0;
uint32_t reference;
sc = disk->d_softc;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
reference = GET32(&pd->pd_meta, pdd->PD_Reference);
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
gmeta = &mdi->mdio_meta;
/* Find disk position in metadata by its reference. */
disk_pos = ddf_meta_find_disk(vmeta, reference,
&md_disk_bvd, &md_disk_pos);
md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference);
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc,
"Disk %s is not a present part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* Failed stale disk is useless for us. */
if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If the disk has some metadata for this volume, erase it. */
if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL)
SET32D(pdmeta, vdc1->Signature, 0xffffffff);
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started, try to make use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
G_RAID_DEBUG1(1, sc, "No free partitions on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
eoff *= pd->pd_meta.sectorsize;
esize *= pd->pd_meta.sectorsize;
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
esize < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk), esize, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size;
md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX
md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX
} else {
nofit:
if (disk->d_state == G_RAID_DISK_S_NONE)
g_raid_change_disk_state(disk,
G_RAID_DISK_S_STALE);
return (0);
}
/*
* If the spare is committable, delete the spare record.
* Otherwise, mark it active and leave it there.
*/
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa != NULL) {
if ((GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_REVERTIBLE) == 0) {
SET32D(&pd->pd_meta, sa->Signature, 0xffffffff);
} else {
SET8D(&pd->pd_meta, sa->Spare_Type,
GET8D(&pd->pd_meta, sa->Spare_Type) |
DDF_SAR_TYPE_ACTIVE);
}
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = eoff;
sd->sd_size = esize;
} else if (pdmeta->cr != NULL &&
(vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) {
val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512;
sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512;
}
if (resurrection) {
/* Stale disk, almost same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) &
(DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
sd->sd_rebuild_pos = 0;
} else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 ||
(GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) !=
DDF_VDE_INIT_FULL) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
static void
g_raid_md_ddf_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) <
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
update = g_raid_md_ddf_start_disk(disk, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_ddf(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
static void
g_raid_md_ddf_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
uint64_t *val2;
int i, j, bvd;
sc = vol->v_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level);
vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ);
if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 &&
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
GET8(vmeta, vdc->Secondary_RAID_Level) == 0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
vol->v_sectorsize = GET16(vmeta, vdc->Block_Size);
if (vol->v_sectorsize == 0xffff)
vol->v_sectorsize = vmeta->sectorsize;
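/*
 * vdc->Stripe_Size is stored as a power-of-two exponent in units of the
 * volume sector size, so e.g. Stripe_Size == 7 with 512-byte sectors
 * yields a 64 KB strip (512 << 7).
 */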
vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size);
vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) *
GET8(vmeta, vdc->Secondary_Element_Count);
vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks);
vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial);
vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method);
if (GET8(vmeta, vdc->Rotate_Parity_count) > 31)
vol->v_rotate_parity = 1;
else
vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count);
vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize;
for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) {
if (j == GET16(vmeta, vdc->Primary_Element_Count)) {
j = 0;
bvd++;
}
sd = &vol->v_subdisks[i];
if (vmeta->bvdc[bvd] == NULL) {
sd->sd_offset = 0;
sd->sd_size = GET64(vmeta, vdc->Block_Count) *
vol->v_sectorsize;
continue;
}
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize;
sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) *
vol->v_sectorsize;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL)
g_raid_md_ddf_start_disk(disk, vol);
}
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_ddf_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_ddf_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
static void
g_raid_md_ddf_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct g_raid_volume *vol;
struct ddf_meta *pdmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_vd_entry *vde;
int i, j, k, num, have, need, cnt, spare;
uint32_t val;
char buf[17];
sc = disk->d_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
spare = -1;
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, pdmeta);
else
ddf_meta_update(&mdi->mdio_meta, pdmeta);
num = GETCRNUM(pdmeta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(pdmeta, j);
val = GET32D(pdmeta, vdc->Signature);
if (val == DDF_SA_SIGNATURE && spare == -1)
spare = 1;
if (val != DDF_VDCR_SIGNATURE)
continue;
spare = 0;
k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID);
if (k < 0)
continue;
vde = &pdmeta->vdr->entry[k];
/* Look for volume with matching ID. */
vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID);
if (vol == NULL) {
ddf_meta_get_name(pdmeta, k, buf);
vol = g_raid_create_volume(sc, buf,
GET16D(pdmeta, vde->VD_Number));
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_ddf_go, vol);
mdi->mdio_starting++;
} else
pv = vol->v_md_data;
/* If we haven't started yet - check metadata freshness. */
vmeta = &pv->pv_meta;
ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started);
}
if (spare == 1) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_ddf_refill(sc);
}
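/*
 * Attach the disk to every volume whose VDC it carries.  For volumes that
 * are not started yet, count the disks present and start the volume once
 * all of them have been found.
 */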
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL)
continue;
if (pv->pv_started) {
if (g_raid_md_ddf_start_disk(disk, vol))
g_raid_md_write_ddf(md, vol, NULL, NULL);
continue;
}
/* If we collected all needed disks - start array. */
need = 0;
have = 0;
for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) {
if (vmeta->bvdc[k] == NULL) {
need += GET16(vmeta, vdc->Primary_Element_Count);
continue;
}
cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count);
need += cnt;
for (i = 0; i < cnt; i++) {
val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]);
if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL)
have++;
}
}
G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks",
vol->v_name, have, need);
if (have == need)
g_raid_md_ddf_start(vol);
}
}
static int
g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct gctl_req *req, struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
struct g_raid_md_ddf_object *mdi, *mdi1;
char name[16];
const char *fmtopt;
int be = 1;
mdi = (struct g_raid_md_ddf_object *)md;
fmtopt = gctl_get_asciiparam(req, "fmtopt");
if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0)
be = 1;
else if (strcasecmp(fmtopt, "LE") == 0)
be = 0;
else {
gctl_error(req, "Incorrect fmtopt argument.");
return (G_RAID_MD_TASTE_FAIL);
}
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi1->mdio_bigendian != be)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
static int
g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct ddf_meta meta;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct g_geom *geom;
int error, result, be;
char name[16];
G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name);
mdi = (struct g_raid_md_ddf_object *)md;
pp = cp->provider;
/* Read metadata from device. */
g_topology_unlock();
bzero(&meta, sizeof(meta));
error = ddf_meta_read(cp, &meta);
g_topology_lock();
if (error != 0)
return (G_RAID_MD_TASTE_FAIL);
be = meta.bigendian;
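/* Remember the metadata byte order to match against (or create) a node of the same format. */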
/* Metadata valid. Print it. */
g_raid_md_ddf_print(&meta);
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi->mdio_bigendian != be)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
pd->pd_meta = meta;
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_ddf_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_ddf_pervolume *pv;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_ddf_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_ddf(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD];
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_sa_record *sa;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize;
intmax_t *sizearg, *striparg;
int i, numdisks, len, level, qual;
int error;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_ddf_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
gctl_error(req, "No free partitions "
"on disk '%s'.",
diskname);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
ddf_meta_unused_range(&pd->pd_meta,
&offs[i], &esize);
offs[i] *= pp->sectorsize;
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
ddf_meta_create(disk, &mdi->mdio_meta);
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, GET64(&pd->pd_meta,
pdr->entry[0].Configured_Size) * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta);
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID4 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else if (level == G_RAID_VOLUME_RL_RAID5R) {
vol->v_mediasize = size * (numdisks - 1);
vol->v_rotate_parity = 1024;
} else if (level == G_RAID_VOLUME_RL_RAID6 ||
level == G_RAID_VOLUME_RL_RAID5E ||
level == G_RAID_VOLUME_RL_RAID5EE)
vol->v_mediasize = size * (numdisks - 2);
else if (level == G_RAID_VOLUME_RL_RAIDMDF) {
if (numdisks < 5)
vol->v_mdf_pdisks = 2;
else
vol->v_mdf_pdisks = 3;
vol->v_mdf_polynomial = 0x11d;
vol->v_mdf_method = 0x00;
vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks);
} else { /* RAID1E */
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = offs[i];
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (verb != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_ddf_purge_disks(sc);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, _PATH_DEV, 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_ddf(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being deleted and destroy the disk. */
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
ddf_meta_create(disk, &mdi->mdio_meta);
sa = ddf_meta_find_sa(&pd->pd_meta, 1);
if (sa != NULL) {
SET32D(&pd->pd_meta, sa->Signature,
DDF_SA_SIGNATURE);
SET8D(&pd->pd_meta, sa->Spare_Type, 0);
SET16D(&pd->pd_meta, sa->Populated_SAEs, 0);
SET16D(&pd->pd_meta, sa->MAX_SAE_Supported,
(GET16(&pd->pd_meta, hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize -
sizeof(struct ddf_sa_record)) /
sizeof(struct ddf_sa_entry));
}
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
g_raid_md_ddf_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *gmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_sa_record *sa;
uint64_t *val2;
int i, j, pos, bvd, size;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
gmeta = &mdi->mdio_meta;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/*
* Clear disk flags so that only the ones still needed get set again below.
* Do it only if there are no volumes in the starting state now, as such
* volumes may still be updating disk statuses and we could clear flags
* of innocent disks.
*/
if (mdi->mdio_starting == 0) {
for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) &
~(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE));
if ((GET16(gmeta, pdr->entry[i].PD_State) &
DDF_PDE_PFA) == 0)
SET16(gmeta, pdr->entry[i].PD_State, 0);
}
}
/* Generate/update new per-volume metadata. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
if (vol->v_stopping || !pv->pv_started)
continue;
vmeta = &pv->pv_meta;
SET32(vmeta, vdc->Sequence_Number,
GET32(vmeta, vdc->Sequence_Number) + 1);
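/*
 * An even-disk RAID1E volume is written out as two-disk RAID1 primary
 * elements grouped by the secondary level; the read path recognizes this
 * combination as RAID1E again.
 */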
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0)
SET16(vmeta, vdc->Primary_Element_Count, 2);
else
SET16(vmeta, vdc->Primary_Element_Count,
vol->v_disks_count);
SET8(vmeta, vdc->Stripe_Size,
ffs(vol->v_strip_size / vol->v_sectorsize) - 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0) {
SET8(vmeta, vdc->Primary_RAID_Level,
DDF_VDCR_RAID1);
SET8(vmeta, vdc->RLQ, 0);
SET8(vmeta, vdc->Secondary_Element_Count,
vol->v_disks_count / 2);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
} else {
SET8(vmeta, vdc->Primary_RAID_Level,
vol->v_raid_level);
SET8(vmeta, vdc->RLQ,
vol->v_raid_level_qualifier);
SET8(vmeta, vdc->Secondary_Element_Count, 1);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
}
SET8(vmeta, vdc->Secondary_Element_Seq, 0);
SET64(vmeta, vdc->Block_Count, 0);
SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize);
SET16(vmeta, vdc->Block_Size, vol->v_sectorsize);
SET8(vmeta, vdc->Rotate_Parity_count,
fls(vol->v_rotate_parity) - 1);
SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks);
SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial,
vol->v_mdf_polynomial);
SET8(vmeta, vdc->MDF_Constant_Generation_Method,
vol->v_mdf_method);
SET16(vmeta, vde->VD_Number, vol->v_global_id);
if (vol->v_state <= G_RAID_VOLUME_S_BROKEN)
SET8(vmeta, vde->VD_State, DDF_VDE_FAILED);
else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED);
else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL)
SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL);
else
SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL);
if (vol->v_dirty ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0)
SET8(vmeta, vde->VD_State,
GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY);
SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX
ddf_meta_put_name(vmeta, vol->v_name);
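/*
 * Fill the per-BVD configuration records for each subdisk and reflect
 * the subdisk states in the global physical disk records.
 */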
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
bvd = i / GET16(vmeta, vdc->Primary_Element_Count);
pos = i % GET16(vmeta, vdc->Primary_Element_Count);
disk = sd->sd_disk;
if (disk != NULL) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (vmeta->bvdc[bvd] == NULL) {
size = GET16(vmeta,
hdr->Configuration_Record_Length) *
vmeta->sectorsize;
vmeta->bvdc[bvd] = malloc(size,
M_MD_DDF, M_WAITOK);
memset(vmeta->bvdc[bvd], 0xff, size);
}
memcpy(vmeta->bvdc[bvd], vmeta->vdc,
sizeof(struct ddf_vdc_record));
SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd);
SET64(vmeta, bvdc[bvd]->Block_Count,
sd->sd_size / vol->v_sectorsize);
SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos],
GET32(&pd->pd_meta, pdd->PD_Reference));
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
SET64P(vmeta, val2 + pos,
sd->sd_offset / vol->v_sectorsize);
}
if (vmeta->bvdc[bvd] == NULL)
continue;
j = ddf_meta_find_pd(gmeta, NULL,
GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]));
if (j < 0)
continue;
SET16(gmeta, pdr->entry[j].PD_Type,
GET16(gmeta, pdr->entry[j].PD_Type) |
DDF_PDE_PARTICIPATING);
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_MISSING));
else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_REBUILD);
else
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_ONLINE);
}
}
/* Mark spare and failed disks as such. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
i = ddf_meta_find_pd(gmeta, NULL,
GET32(&pd->pd_meta, pdd->PD_Reference));
if (i < 0)
continue;
if (disk->d_state == G_RAID_DISK_S_FAILED) {
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
}
if (disk->d_state != G_RAID_DISK_S_SPARE)
continue;
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa == NULL ||
(GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_DEDICATED) == 0) {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_GLOBAL_SPARE);
} else {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_CONFIG_SPARE);
}
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
DDF_PDE_ONLINE);
}
/* Remove disks without "participating" flag (unused). */
for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
if ((GET16(gmeta, pdr->entry[i].PD_Type) &
(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 ||
g_raid_md_ddf_get_disk(sc,
NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL)
j = i;
else
memset(&gmeta->pdr->entry[i], 0xff,
sizeof(struct ddf_pd_entry));
}
SET16(gmeta, pdr->Populated_PDEs, j + 1);
/* Update per-disk metadata and write them. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_SPARE)
continue;
/* Update PDR. */
memcpy(pd->pd_meta.pdr, gmeta->pdr,
GET32(&pd->pd_meta, hdr->pdr_length) *
pd->pd_meta.sectorsize);
/* Update VDR. */
SET16(&pd->pd_meta, vdr->Populated_VDEs, 0);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
i = ddf_meta_find_vd(&pd->pd_meta,
pv->pv_meta.vde->VD_GUID);
if (i < 0)
i = ddf_meta_find_vd(&pd->pd_meta, NULL);
if (i >= 0)
memcpy(&pd->pd_meta.vdr->entry[i],
pv->pv_meta.vde,
sizeof(struct ddf_vd_entry));
}
/* Update VDC. */
if (mdi->mdio_starting == 0) {
/* Remove all VDCs; the ones still needed are restored below. */
j = GETCRNUM(&pd->pd_meta);
for (i = 0; i < j; i++) {
vdc = GETVDCPTR(&pd->pd_meta, i);
if (GET32D(&pd->pd_meta, vdc->Signature) !=
DDF_VDCR_SIGNATURE)
continue;
SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff);
}
}
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
vol = sd->sd_volume;
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
vmeta = &pv->pv_meta;
vdc = ddf_meta_find_vdc(&pd->pd_meta,
vmeta->vde->VD_GUID);
if (vdc == NULL)
vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL);
if (vdc != NULL) {
bvd = sd->sd_pos / GET16(vmeta,
vdc->Primary_Element_Count);
memcpy(vdc, vmeta->bvdc[bvd],
GET16(&pd->pd_meta,
hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize);
}
}
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(disk));
g_raid_md_ddf_print(&pd->pd_meta);
ddf_meta_write(disk->d_consumer, &pd->pd_meta);
}
return (0);
}
static int
g_raid_md_fail_disk_ddf(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_subdisk *sd;
int i;
sc = md->mdo_softc;
pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark the disk as failed in the metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(tdisk));
i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference));
SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA);
if (tdisk->d_consumer != NULL)
ddf_meta_write(tdisk->d_consumer, &pd->pd_meta);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, tdisk);
g_raid_md_ddf_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_ddf_perdisk *pd;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
ddf_meta_free(&pd->pd_meta);
free(pd, M_MD_DDF);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_ddf_object *mdi;
struct g_raid_md_ddf_pervolume *pv;
mdi = (struct g_raid_md_ddf_object *)md;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
ddf_vol_meta_free(&pv->pv_meta);
if (!pv->pv_started) {
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_DDF);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_ddf(struct g_raid_md_object *md)
{
struct g_raid_md_ddf_object *mdi;
mdi = (struct g_raid_md_ddf_object *)md;
if (!mdi->mdio_started) {
mdi->mdio_started = 0;
callout_stop(&mdi->mdio_start_co);
G_RAID_DEBUG1(1, md->mdo_softc,
"root_mount_rel %p", mdi->mdio_rootmount);
root_mount_rel(mdi->mdio_rootmount);
mdi->mdio_rootmount = NULL;
}
ddf_meta_free(&mdi->mdio_meta);
return (0);
}
G_RAID_MD_DECLARE(ddf, "DDF");
diff --git a/sys/geom/raid/md_promise.c b/sys/geom/raid/md_promise.c
index bec52d26a7c6..aacf0106ea15 100644
--- a/sys/geom/raid/md_promise.c
+++ b/sys/geom/raid/md_promise.c
@@ -1,2004 +1,2004 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
#define PROMISE_MAX_DISKS 8
#define PROMISE_MAX_SUBDISKS 2
#define PROMISE_META_OFFSET 14
struct promise_raid_disk {
uint8_t flags; /* Subdisk status. */
#define PROMISE_F_VALID 0x01
#define PROMISE_F_ONLINE 0x02
#define PROMISE_F_ASSIGNED 0x04
#define PROMISE_F_SPARE 0x08
#define PROMISE_F_DUPLICATE 0x10
#define PROMISE_F_REDIR 0x20
#define PROMISE_F_DOWN 0x40
#define PROMISE_F_READY 0x80
uint8_t number; /* Position in a volume. */
uint8_t channel; /* ATA channel number. */
uint8_t device; /* ATA device number. */
uint64_t id __packed; /* Subdisk ID. */
} __packed;
struct promise_raid_conf {
char promise_id[24];
#define PROMISE_MAGIC "Promise Technology, Inc."
#define FREEBSD_MAGIC "FreeBSD ATA driver RAID "
uint32_t dummy_0;
uint64_t magic_0;
#define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \
((uint64_t)(x.device != 0) << 56))
uint16_t magic_1;
uint32_t magic_2;
uint8_t filler1[470];
uint32_t integrity;
#define PROMISE_I_VALID 0x00000080
struct promise_raid_disk disk; /* This subdisk info. */
uint32_t disk_offset; /* Subdisk offset. */
uint32_t disk_sectors; /* Subdisk size */
uint32_t disk_rebuild; /* Rebuild position. */
uint16_t generation; /* Generation number. */
uint8_t status; /* Volume status. */
#define PROMISE_S_VALID 0x01
#define PROMISE_S_ONLINE 0x02
#define PROMISE_S_INITED 0x04
#define PROMISE_S_READY 0x08
#define PROMISE_S_DEGRADED 0x10
#define PROMISE_S_MARKED 0x20
#define PROMISE_S_MIGRATING 0x40
#define PROMISE_S_FUNCTIONAL 0x80
uint8_t type; /* Volume type. */
#define PROMISE_T_RAID0 0x00
#define PROMISE_T_RAID1 0x01
#define PROMISE_T_RAID3 0x02
#define PROMISE_T_RAID5 0x04
#define PROMISE_T_SPAN 0x08
#define PROMISE_T_JBOD 0x10
uint8_t total_disks; /* Disks in this volume. */
uint8_t stripe_shift; /* Strip size. */
uint8_t array_width; /* Number of RAID0 stripes. */
uint8_t array_number; /* Global volume number. */
uint32_t total_sectors; /* Volume size. */
uint16_t cylinders; /* Volume geometry: C. */
uint8_t heads; /* Volume geometry: H. */
uint8_t sectors; /* Volume geometry: S. */
uint64_t volume_id __packed; /* Volume ID. */
struct promise_raid_disk disks[PROMISE_MAX_DISKS];
/* Subdisks in this volume. */
char name[32]; /* Volume label. */
uint32_t filler2[8];
uint32_t magic_3; /* Something related to rebuild. */
uint64_t rebuild_lba64; /* Per-volume rebuild position. */
uint32_t magic_4;
uint32_t magic_5;
uint32_t total_sectors_high;
uint8_t magic_6;
uint8_t sector_size;
uint16_t magic_7;
uint32_t magic_8[31];
uint32_t backup_time;
uint16_t magic_9;
uint32_t disk_offset_high;
uint32_t disk_sectors_high;
uint32_t disk_rebuild_high;
uint16_t magic_10;
uint32_t magic_11[3];
uint32_t filler3[284];
uint32_t checksum;
} __packed;
struct g_raid_md_promise_perdisk {
int pd_updated;
int pd_subdisks;
struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS];
};
struct g_raid_md_promise_pervolume {
struct promise_raid_conf *pv_meta;
uint64_t pv_id;
uint16_t pv_generation;
int pv_disks_present;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;
static kobj_method_t g_raid_md_promise_methods[] = {
KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_promise_class = {
"Promise",
g_raid_md_promise_methods,
sizeof(struct g_raid_md_object),
.mdc_enable = 1,
.mdc_priority = 100
};
static void
g_raid_md_promise_print(struct promise_raid_conf *meta)
{
int i;
if (g_raid_debug < 1)
return;
printf("********* ATA Promise Metadata *********\n");
printf("promise_id <%.24s>\n", meta->promise_id);
printf("disk %02x %02x %02x %02x %016jx\n",
meta->disk.flags, meta->disk.number, meta->disk.channel,
meta->disk.device, meta->disk.id);
printf("disk_offset %u\n", meta->disk_offset);
printf("disk_sectors %u\n", meta->disk_sectors);
printf("disk_rebuild %u\n", meta->disk_rebuild);
printf("generation %u\n", meta->generation);
printf("status 0x%02x\n", meta->status);
printf("type %u\n", meta->type);
printf("total_disks %u\n", meta->total_disks);
printf("stripe_shift %u\n", meta->stripe_shift);
printf("array_width %u\n", meta->array_width);
printf("array_number %u\n", meta->array_number);
printf("total_sectors %u\n", meta->total_sectors);
printf("cylinders %u\n", meta->cylinders);
printf("heads %u\n", meta->heads);
printf("sectors %u\n", meta->sectors);
printf("volume_id 0x%016jx\n", meta->volume_id);
printf("disks:\n");
for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
printf(" %02x %02x %02x %02x %016jx\n",
meta->disks[i].flags, meta->disks[i].number,
meta->disks[i].channel, meta->disks[i].device,
meta->disks[i].id);
}
printf("name <%.32s>\n", meta->name);
printf("magic_3 0x%08x\n", meta->magic_3);
printf("rebuild_lba64 %ju\n", meta->rebuild_lba64);
printf("magic_4 0x%08x\n", meta->magic_4);
printf("magic_5 0x%08x\n", meta->magic_5);
printf("total_sectors_high 0x%08x\n", meta->total_sectors_high);
printf("sector_size %u\n", meta->sector_size);
printf("backup_time %d\n", meta->backup_time);
printf("disk_offset_high 0x%08x\n", meta->disk_offset_high);
printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high);
printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high);
printf("=================================================\n");
}
static struct promise_raid_conf *
promise_meta_copy(struct promise_raid_conf *meta)
{
struct promise_raid_conf *nmeta;
nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
memcpy(nmeta, meta, sizeof(*nmeta));
return (nmeta);
}
static int
promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
{
int pos;
for (pos = 0; pos < meta->total_disks; pos++) {
if (meta->disks[pos].id == id)
return (pos);
}
return (-1);
}
static int
promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
off_t sectors, off_t *off, off_t *size)
{
off_t coff, csize, tmp;
int i, j;
sectors -= 131072;
*off = 0;
*size = 0;
coff = 0;
csize = sectors;
i = 0;
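/*
 * Try each candidate start point (the beginning of the disk and the end
 * of every recorded extent) and measure the free space up to the next
 * extent; remember the largest gap found.
 */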
while (1) {
for (j = 0; j < nsd; j++) {
tmp = ((off_t)metaarr[j]->disk_offset_high << 32) +
metaarr[j]->disk_offset;
if (tmp >= coff)
csize = MIN(csize, tmp - coff);
}
if (csize > *size) {
*off = coff;
*size = csize;
}
if (i >= nsd)
break;
coff = ((off_t)metaarr[i]->disk_offset_high << 32) +
metaarr[i]->disk_offset +
((off_t)metaarr[i]->disk_sectors_high << 32) +
metaarr[i]->disk_sectors;
csize = sectors - coff;
i++;
}
return ((*size > 0) ? 1 : 0);
}
static int
promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
{
int disk_pos, width;
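/*
 * Promise metadata and GEOM RAID enumerate RAID1E (RAID0+1) members in
 * different orders; transpose the two-dimensional (width x 2) index to
 * convert between them.
 */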
if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
width = vol->v_disks_count / 2;
disk_pos = (md_disk_pos / width) +
(md_disk_pos % width) * width;
} else
disk_pos = md_disk_pos;
return (disk_pos);
}
static void
promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
{
int i;
strncpy(buf, meta->name, 32);
buf[32] = 0;
for (i = 31; i >= 0; i--) {
if (buf[i] > 0x20)
break;
buf[i] = 0;
}
}
static void
promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
{
memset(meta->name, 0x20, 32);
memcpy(meta->name, buf, MIN(strlen(buf), 32));
}
static int
promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
int error, i, subdisks;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisks = 0;
- if (pp->sectorsize * 4 > MAXPHYS) {
+ if (pp->sectorsize * 4 > maxphys) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
return (subdisks);
}
next:
/* Read metadata block. */
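/*
 * Each subdisk's metadata copy sits at a fixed distance from the end of
 * the disk: 63 sectors for the first record, PROMISE_META_OFFSET sectors
 * closer to the end for every subsequent one.
 */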
buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisks * PROMISE_META_OFFSET),
pp->sectorsize * 4, &error);
if (buf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (subdisks);
}
meta = (struct promise_raid_conf *)buf;
/* Check if this is a Promise RAID struct. */
if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
if (subdisks == 0)
G_RAID_DEBUG(1,
"Promise signature check failed on %s", pp->name);
g_free(buf);
return (subdisks);
}
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
g_free(buf);
/* Check metadata checksum. */
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
if (checksum != meta->checksum) {
G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if ((meta->integrity & PROMISE_I_VALID) == 0) {
G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if (meta->total_disks > PROMISE_MAX_DISKS) {
G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
pp->name, meta->total_disks);
free(meta, M_MD_PROMISE);
return (subdisks);
}
/* Remove filler garbage from fields used in newer metadata. */
if (meta->disk_offset_high == 0x8b8c8d8e &&
meta->disk_sectors_high == 0x8788898a &&
meta->disk_rebuild_high == 0x83848586) {
meta->disk_offset_high = 0;
meta->disk_sectors_high = 0;
if (meta->disk_rebuild == UINT32_MAX)
meta->disk_rebuild_high = UINT32_MAX;
else
meta->disk_rebuild_high = 0;
if (meta->total_sectors_high == 0x15161718) {
meta->total_sectors_high = 0;
meta->backup_time = 0;
if (meta->rebuild_lba64 == 0x2122232425262728)
meta->rebuild_lba64 = UINT64_MAX;
}
}
if (meta->sector_size < 1 || meta->sector_size > 8)
meta->sector_size = 1;
/* Save this part and look for next. */
*metaarr = meta;
metaarr++;
subdisks++;
if (subdisks < PROMISE_MAX_SUBDISKS)
goto next;
return (subdisks);
}
static int
promise_meta_write(struct g_consumer *cp,
struct promise_raid_conf **metaarr, int nsd)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
off_t off, size;
int error, i, subdisk, fake;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisk = 0;
fake = 0;
next:
buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
meta = NULL;
if (subdisk < nsd) {
meta = metaarr[subdisk];
} else if (!fake && promise_meta_unused_range(metaarr, nsd,
cp->provider->mediasize / cp->provider->sectorsize,
&off, &size)) {
/* Optionally add record for unused space. */
meta = (struct promise_raid_conf *)buf;
memcpy(&meta->promise_id[0], PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
meta->disk_offset_high = off >> 32;
meta->disk_offset = (uint32_t)off;
meta->disk_sectors_high = size >> 32;
meta->disk_sectors = (uint32_t)size;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
fake = 1;
}
if (meta != NULL) {
/* Recalculate the checksum in case the metadata was changed. */
meta->checksum = 0;
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
meta->checksum = checksum;
memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
}
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, pp->sectorsize * 4);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_PROMISE);
subdisk++;
if (subdisk < PROMISE_MAX_SUBDISKS)
goto next;
return (error);
}
static int
promise_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error, subdisk;
pp = cp->provider;
buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, 4 * pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
}
free(buf, M_MD_PROMISE);
return (error);
}
static int
promise_meta_write_spare(struct g_consumer *cp)
{
struct promise_raid_conf *meta;
off_t tmp;
int error;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
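/*
 * Advertise the whole disk, minus the 131072-sector metadata reserve,
 * as a single spare extent.
 */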
tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072;
meta->disk_sectors_high = tmp >> 32;
meta->disk_sectors = (uint32_t)tmp;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
error = promise_meta_write(cp, &meta, 1);
free(meta, M_MD_PROMISE);
return (error);
}
static struct g_raid_volume *
g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
{
struct g_raid_volume *vol;
struct g_raid_md_promise_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (pv->pv_id == id)
break;
}
return (vol);
}
static int
g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
struct g_raid_md_promise_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
{
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_promise_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_promise_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_PROMISE);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left - erase and delete disk. */
if (pd->pd_subdisks == 0) {
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_supported(int level, int qual, int disks, int force)
{
if (disks > PROMISE_MAX_DISKS)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (!force && (disks != 2))
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (disks < 2)
return (0);
if (disks % 2 != 0)
return (0);
if (!force && (disks != 4))
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (disks < 3)
return (0);
if (qual != G_RAID_VOLUME_RLQ_R5LA)
return (0);
break;
default:
return (0);
}
if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
return (1);
}
static int
g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t eoff, esize, size;
int disk_pos, md_disk_pos, i, resurrection = 0;
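/*
 * A negative sdn means we have no metadata record tying this disk to the
 * volume; it may still be placed into a free or failed slot below
 * ("resurrection").
 */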
sc = disk->d_softc;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
pv = vol->v_md_data;
meta = pv->pv_meta;
if (sdn >= 0) {
/* Find disk position in metadata by its serial. */
md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
/* For RAID0+1 we need to translate order. */
disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
} else {
md_disk_pos = -1;
disk_pos = -1;
}
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* Failed stale disk is useless for us. */
if (sdn >= 0 &&
pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If we were given specific metadata subdisk - erase it. */
if (sdn >= 0) {
free(pd->pd_meta[sdn], M_MD_PROMISE);
for (i = sdn; i < pd->pd_subdisks - 1; i++)
pd->pd_meta[i] = pd->pd_meta[i + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
}
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started - try to make use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
disk->d_consumer->provider->mediasize /
disk->d_consumer->provider->sectorsize,
&eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
(off_t)esize * 512 < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk),
(off_t)esize * 512, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size / 512;
/* For RAID0+1 we need to translate order. */
md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
} else {
nofit:
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk,
G_RAID_DISK_S_SPARE);
}
return (0);
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = (off_t)eoff * 512;
sd->sd_size = (off_t)esize * 512;
} else {
sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high
<< 32) + pd->pd_meta[sdn]->disk_offset) * 512;
sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high
<< 32) + pd->pd_meta[sdn]->disk_sectors) * 512;
}
if (resurrection) {
/* Stale disk, almost same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
if (pd->pd_meta[sdn]->generation != meta->generation)
sd->sd_rebuild_pos = 0;
else {
sd->sd_rebuild_pos =
(((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) +
pd->pd_meta[sdn]->disk_rebuild) * 512;
}
} else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (pd->pd_meta[sdn]->generation != meta->generation ||
(meta->status & PROMISE_S_MARKED)) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
static void
g_raid_md_promise_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
update =
g_raid_md_promise_start_disk(disk, -1, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_promise(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
static void
g_raid_md_promise_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
u_int i;
sc = vol->v_softc;
md = sc->sc_md;
pv = vol->v_md_data;
meta = pv->pv_meta;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
if (meta->type == PROMISE_T_RAID0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
else if (meta->type == PROMISE_T_RAID1) {
if (meta->array_width == 1)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
else
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
} else if (meta->type == PROMISE_T_RAID3)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
else if (meta->type == PROMISE_T_RAID5) {
vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
} else if (meta->type == PROMISE_T_SPAN)
vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
else if (meta->type == PROMISE_T_JBOD)
vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
else
vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
vol->v_disks_count = meta->total_disks;
vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
if (meta->total_sectors_high < 256) /* If value looks sane. */
vol->v_mediasize +=
((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
vol->v_sectorsize = 512 * meta->sector_size;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
sd->sd_offset = (((off_t)meta->disk_offset_high << 32) +
meta->disk_offset) * 512;
sd->sd_size = (((off_t)meta->disk_sectors_high << 32) +
meta->disk_sectors) * 512;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i]->volume_id == meta->volume_id)
g_raid_md_promise_start_disk(disk, i, vol);
}
}
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_promise_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_promise_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
static void
g_raid_md_promise_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct promise_raid_conf *pdmeta;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_raid_volume *vol;
int i;
char buf[33];
sc = disk->d_softc;
md = sc->sc_md;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_promise_refill(sc);
return;
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL) {
promise_meta_get_name(pdmeta, buf);
vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
pv->pv_id = pdmeta->volume_id;
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_promise_go, vol);
} else
pv = vol->v_md_data;
/* If we haven't started yet - check metadata freshness. */
if (pv->pv_meta == NULL || !pv->pv_started) {
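/*
 * The generation difference is computed in 16-bit signed arithmetic so
 * that counter wrap-around is handled correctly.
 */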
if (pv->pv_meta == NULL ||
((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
G_RAID_DEBUG1(1, sc, "Newer disk");
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = promise_meta_copy(pdmeta);
pv->pv_generation = pv->pv_meta->generation;
pv->pv_disks_present = 1;
} else if (pdmeta->generation == pv->pv_generation) {
pv->pv_disks_present++;
G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
pv->pv_disks_present,
pv->pv_meta->total_disks);
} else {
G_RAID_DEBUG1(1, sc, "Older disk");
}
}
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL)
continue;
pv = vol->v_md_data;
if (pv->pv_started) {
if (g_raid_md_promise_start_disk(disk, i, vol))
g_raid_md_write_promise(md, vol, NULL, NULL);
} else {
/* If we collected all needed disks - start array. */
if (pv->pv_disks_present == pv->pv_meta->total_disks)
g_raid_md_promise_start(vol);
}
}
}
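/*
* Create the Promise metadata node, reusing an already existing node of the
* same class if one is present.
*/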
static int
g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
sc = g_raid_create_node(mp, "Promise", md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
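/*
* Taste a provider: read Promise metadata, drop unassigned (empty/spare)
* records and attach the disk to an existing Promise node, creating a new
* node if necessary.  A disk without metadata may still be accepted as a
* spare when aggressive spare mode is enabled and the HBA vendor matches
* Promise or ATI.
*/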
static int
g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct promise_raid_conf *metaarr[4];
struct g_raid_md_promise_perdisk *pd;
struct g_geom *geom;
int i, j, result, len, subdisks;
char name[16];
uint16_t vendor;
G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
pp = cp->provider;
/* Read metadata from device. */
g_topology_unlock();
vendor = 0xffff;
len = sizeof(vendor);
if (pp->geom->rank == 1)
g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
subdisks = promise_meta_read(cp, metaarr);
g_topology_lock();
if (subdisks == 0) {
if (g_raid_aggressive_spare) {
if (vendor == 0x105a || vendor == 0x1002) {
G_RAID_DEBUG(1,
"No Promise metadata, forcing spare.");
goto search;
} else {
G_RAID_DEBUG(1,
"Promise/ATI vendor mismatch "
"0x%04x != 0x105a/0x1002",
vendor);
}
}
return (G_RAID_MD_TASTE_FAIL);
}
/* Metadata valid. Print it. */
for (i = 0; i < subdisks; i++)
g_raid_md_promise_print(metaarr[i]);
/* Purge meaningless (empty/spare) records. */
for (i = 0; i < subdisks; ) {
if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
i++;
continue;
}
free(metaarr[i], M_MD_PROMISE);
for (j = i; j < subdisks - 1; j++)
metaarr[j] = metaarr[j + 1];
metaarr[subdisks - 1] = NULL;
subdisks--;
}
search:
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
snprintf(name, sizeof(name), "Promise");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
pd->pd_subdisks = subdisks;
for (i = 0; i < subdisks; i++)
pd->pd_meta[i] = metaarr[i];
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_promise_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
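/*
* Handle disk-level events.  On disconnect the disk is destroyed, orphaned
* volumes are purged, updated metadata is written to the remaining disks,
* and the whole node is torn down if no disks are left.
*/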
static int
g_raid_md_event_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (0);
}
return (-2);
}
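/*
* Handle volume-level events; STARTMD is the forced start requested by the
* timeout handler above.
*/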
static int
g_raid_md_volume_event_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_promise_start(vol);
return (0);
}
return (-2);
}
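/*
* Handle control requests coming from userland: "label" creates a new
* volume, "delete" destroys a volume or the whole node, "remove"/"fail"
* detach or fail member disks, and "insert" adds spare disks.
*/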
static int
g_raid_md_ctl_promise(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip;
intmax_t *sizearg, *striparg;
int numdisks, i, len, level, qual;
int error;
sc = md->mdo_softc;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (strcasecmp(levelname, "RAID5") == 0)
levelname = "RAID5-LA";
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_promise_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
gctl_error(req, "Disk '%s' already "
"used by %d volumes.",
diskname, pd->pd_subdisks);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
promise_meta_unused_range(pd->pd_meta,
pd->pd_subdisks,
pp->mediasize / pp->sectorsize,
&offs[i], &esize);
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
pv->pv_generation = 0;
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else { /* RAID1E */
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = (off_t)offs[i] * 512;
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (verb != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_promise_purge_disks(sc);
g_raid_md_write_promise(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, _PATH_DEV, 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_promise(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being removed and destroy it. */
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
promise_meta_write_spare(cp);
g_raid_md_promise_refill(sc);
}
return (error);
}
return (-100);
}
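/*
* Regenerate Promise metadata for the affected volumes (bumping the
* generation number) and write per-subdisk copies of it back to every
* active member disk that was touched.
*/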
static int
g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t rebuild_lba64;
int i, j, pos, rebuild;
sc = md->mdo_softc;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/* Generate new per-volume metadata for affected volumes. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
/* Skip volumes not related to specified targets. */
if (tvol != NULL && vol != tvol)
continue;
if (tsd != NULL && vol != tsd->sd_volume)
continue;
if (tdisk != NULL) {
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_disk == tdisk)
break;
}
if (i >= vol->v_disks_count)
continue;
}
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
pv->pv_generation++;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
if (pv->pv_meta != NULL)
memcpy(meta, pv->pv_meta, sizeof(*meta));
memcpy(meta->promise_id, PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->generation = pv->pv_generation;
meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
PROMISE_S_INITED | PROMISE_S_READY;
if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
meta->status |= PROMISE_S_DEGRADED;
if (vol->v_dirty)
meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
meta->type = PROMISE_T_RAID0;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->type = PROMISE_T_RAID1;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
meta->type = PROMISE_T_RAID3;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
meta->type = PROMISE_T_RAID5;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
meta->type = PROMISE_T_SPAN;
else
meta->type = PROMISE_T_JBOD;
meta->total_disks = vol->v_disks_count;
meta->stripe_shift = ffs(vol->v_strip_size / 1024);
meta->array_width = vol->v_disks_count;
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->array_width /= 2;
meta->array_number = vol->v_global_id;
meta->total_sectors = vol->v_mediasize / 512;
meta->total_sectors_high = (vol->v_mediasize / 512) >> 32;
meta->sector_size = vol->v_sectorsize / 512;
meta->cylinders = meta->total_sectors / (255 * 63) - 1;
meta->heads = 254;
meta->sectors = 63;
meta->volume_id = pv->pv_id;
rebuild_lba64 = UINT64_MAX;
rebuild = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
meta->disks[pos].flags = PROMISE_F_VALID |
PROMISE_F_ASSIGNED;
if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
meta->disks[pos].flags |= 0;
} else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
meta->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
} else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
meta->disks[pos].flags |=
PROMISE_F_ONLINE | PROMISE_F_REDIR;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
rebuild = 1;
} else {
meta->disks[pos].flags |= PROMISE_F_ONLINE;
if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
meta->status |= PROMISE_S_MARKED;
if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
}
}
if (pv->pv_meta != NULL) {
meta->disks[pos].id = pv->pv_meta->disks[pos].id;
} else {
meta->disks[pos].number = i * 2;
arc4rand(&meta->disks[pos].id,
sizeof(meta->disks[pos].id), 0);
}
}
promise_meta_put_name(meta, vol->v_name);
/* Try to mimic AMD BIOS rebuild/resync behavior. */
if (rebuild_lba64 != UINT64_MAX) {
if (rebuild)
meta->magic_3 = 0x03040010UL; /* Rebuild? */
else
meta->magic_3 = 0x03040008UL; /* Resync? */
/* Translate from per-disk to per-volume LBA. */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
rebuild_lba64 *= meta->array_width;
} else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
rebuild_lba64 *= meta->array_width - 1;
} else
rebuild_lba64 = 0;
} else
meta->magic_3 = 0x03000000UL;
meta->rebuild_lba64 = rebuild_lba64;
meta->magic_4 = 0x04010101UL;
/* Replace per-volume metadata with new. */
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = meta;
/* Copy new metadata to the disks, adding or replacing old. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
disk = sd->sd_disk;
if (disk == NULL)
continue;
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (j = 0; j < pd->pd_subdisks; j++) {
if (pd->pd_meta[j]->volume_id == meta->volume_id)
break;
}
if (j == pd->pd_subdisks)
pd->pd_subdisks++;
if (pd->pd_meta[j] != NULL)
free(pd->pd_meta[j], M_MD_PROMISE);
pd->pd_meta[j] = promise_meta_copy(meta);
pd->pd_meta[j]->disk = meta->disks[pos];
pd->pd_meta[j]->disk.number = pos;
pd->pd_meta[j]->disk_offset_high =
(sd->sd_offset / 512) >> 32;
pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
pd->pd_meta[j]->disk_sectors_high =
(sd->sd_size / 512) >> 32;
pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high =
(sd->sd_rebuild_pos / 512) >> 32;
pd->pd_meta[j]->disk_rebuild =
sd->sd_rebuild_pos / 512;
} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high = 0;
pd->pd_meta[j]->disk_rebuild = 0;
} else {
pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX;
pd->pd_meta[j]->disk_rebuild = UINT32_MAX;
}
pd->pd_updated = 1;
}
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE)
continue;
if (!pd->pd_updated)
continue;
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(disk));
for (i = 0; i < pd->pd_subdisks; i++)
g_raid_md_promise_print(pd->pd_meta[i]);
promise_meta_write(disk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
pd->pd_updated = 0;
}
return (0);
}
static int
g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_subdisk *sd;
int i, pos;
sc = md->mdo_softc;
pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(tdisk));
for (i = 0; i < pd->pd_subdisks; i++) {
pd->pd_meta[i]->disk.flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
pos = pd->pd_meta[i]->disk.number;
if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
pd->pd_meta[i]->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
}
g_raid_md_promise_print(pd->pd_meta[i]);
}
if (tdisk->d_consumer != NULL)
promise_meta_write(tdisk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, tdisk);
g_raid_md_promise_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_promise_perdisk *pd;
int i;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i] != NULL) {
free(pd->pd_meta[i], M_MD_PROMISE);
pd->pd_meta[i] = NULL;
}
}
free(pd, M_MD_PROMISE);
disk->d_md_data = NULL;
return (0);
}
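/*
* Release the per-volume metadata and cancel a pending start timeout when
* a volume is freed.
*/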
static int
g_raid_md_free_volume_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
if (pv && pv->pv_meta != NULL) {
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = NULL;
}
if (pv && !pv->pv_started) {
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_PROMISE);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_promise(struct g_raid_md_object *md)
{
return (0);
}
G_RAID_MD_DECLARE(promise, "Promise");
diff --git a/sys/geom/raid3/g_raid3.c b/sys/geom/raid3/g_raid3.c
index bf4f1cab8663..5fe67c00068d 100644
--- a/sys/geom/raid3/g_raid3.c
+++ b/sys/geom/raid3/g_raid3.c
@@ -1,3589 +1,3589 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid3/g_raid3.h>
FEATURE(geom_raid3, "GEOM RAID-3 functionality");
static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_RAID3 stuff");
u_int g_raid3_debug = 0;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0,
"Debug level");
static u_int g_raid3_timeout = 4;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout,
0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN,
&g_raid3_idletime, 0, "Mark components as clean when idling");
static u_int g_raid3_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
&g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid3_syncreqs = 2;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
&g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
static u_int g_raid3_use_malloc = 0;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
&g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");
static u_int g_raid3_n64k = 50;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0,
"Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0,
"Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0,
"Maximum number of 4kB allocations");
static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
&g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
#define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \
G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \
msleep((ident), (mtx), (priority), (wmesg), (timeout)); \
G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \
} while (0)
static eventhandler_tag g_raid3_post_sync = NULL;
static int g_raid3_shutdown = 0;
static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);
struct g_class g_raid3_class = {
.name = G_RAID3_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_raid3_config,
.taste = g_raid3_taste,
.destroy_geom = g_raid3_destroy_geom,
.init = g_raid3_init,
.fini = g_raid3_fini
};
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);
static const char *
g_raid3_disk_state2str(int state)
{
switch (state) {
case G_RAID3_DISK_STATE_NODISK:
return ("NODISK");
case G_RAID3_DISK_STATE_NONE:
return ("NONE");
case G_RAID3_DISK_STATE_NEW:
return ("NEW");
case G_RAID3_DISK_STATE_ACTIVE:
return ("ACTIVE");
case G_RAID3_DISK_STATE_STALE:
return ("STALE");
case G_RAID3_DISK_STATE_SYNCHRONIZING:
return ("SYNCHRONIZING");
case G_RAID3_DISK_STATE_DISCONNECTED:
return ("DISCONNECTED");
default:
return ("INVALID");
}
}
static const char *
g_raid3_device_state2str(int state)
{
switch (state) {
case G_RAID3_DEVICE_STATE_STARTING:
return ("STARTING");
case G_RAID3_DEVICE_STATE_DEGRADED:
return ("DEGRADED");
case G_RAID3_DEVICE_STATE_COMPLETE:
return ("COMPLETE");
default:
return ("INVALID");
}
}
const char *
g_raid3_get_diskname(struct g_raid3_disk *disk)
{
if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
return ("[unknown]");
return (disk->d_name);
}
static void *
g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
{
void *ptr;
enum g_raid3_zones zone;
if (g_raid3_use_malloc ||
(zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
ptr = malloc(size, M_RAID3, flags);
else {
ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
&sc->sc_zones[zone], flags);
sc->sc_zones[zone].sz_requested++;
if (ptr == NULL)
sc->sc_zones[zone].sz_failed++;
}
return (ptr);
}
static void
g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
{
enum g_raid3_zones zone;
if (g_raid3_use_malloc ||
(zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
free(ptr, M_RAID3);
else {
uma_zfree_arg(sc->sc_zones[zone].sz_zone,
ptr, &sc->sc_zones[zone]);
}
}
static int
g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
{
struct g_raid3_zone *sz = arg;
if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
return (ENOMEM);
sz->sz_inuse++;
return (0);
}
static void
g_raid3_uma_dtor(void *mem, int size, void *arg)
{
struct g_raid3_zone *sz = arg;
sz->sz_inuse--;
}
#define g_raid3_xor(src, dst, size) \
_g_raid3_xor((uint64_t *)(src), \
(uint64_t *)(dst), (size_t)size)
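/*
* XOR `size' bytes (a multiple of 128) of src into dst, unrolled sixteen
* 64-bit words per iteration.
*/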
static void
_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
{
KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
for (; size > 0; size -= 128) {
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
*dst++ ^= (*src++);
}
}
static int
g_raid3_is_zero(struct bio *bp)
{
static const uint64_t zeros[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
u_char *addr;
ssize_t size;
size = bp->bio_length;
addr = (u_char *)bp->bio_data;
for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
if (bcmp(addr, zeros, sizeof(zeros)) != 0)
return (0);
}
return (1);
}
/*
* --- Events handling functions ---
* Events in geom_raid3 are used to update disk and device status
* from a single thread, which simplifies locking.
*/
static void
g_raid3_event_free(struct g_raid3_event *ep)
{
free(ep, M_RAID3);
}
int
g_raid3_event_send(void *arg, int state, int flags)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
struct g_raid3_event *ep;
int error;
ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
disk = NULL;
sc = arg;
} else {
disk = arg;
sc = disk->d_softc;
}
ep->e_disk = disk;
ep->e_state = state;
ep->e_flags = flags;
ep->e_error = 0;
mtx_lock(&sc->sc_events_mtx);
TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_events_mtx);
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
mtx_lock(&sc->sc_queue_mtx);
wakeup(sc);
wakeup(&sc->sc_queue);
mtx_unlock(&sc->sc_queue_mtx);
if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
return (0);
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
sx_xunlock(&sc->sc_lock);
while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
mtx_lock(&sc->sc_events_mtx);
MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
hz * 5);
}
error = ep->e_error;
g_raid3_event_free(ep);
sx_xlock(&sc->sc_lock);
return (error);
}
static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
struct g_raid3_event *ep;
mtx_lock(&sc->sc_events_mtx);
ep = TAILQ_FIRST(&sc->sc_events);
mtx_unlock(&sc->sc_events_mtx);
return (ep);
}
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{
mtx_lock(&sc->sc_events_mtx);
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_events_mtx);
}
static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
struct g_raid3_softc *sc;
struct g_raid3_event *ep, *tmpep;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_events_mtx);
TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
continue;
if (ep->e_disk != disk)
continue;
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
g_raid3_event_free(ep);
else {
ep->e_error = ECANCELED;
wakeup(ep);
}
}
mtx_unlock(&sc->sc_events_mtx);
}
/*
* Return the number of disks in the given state.
* If state is equal to -1, count all connected disks.
*/
u_int
g_raid3_ndisks(struct g_raid3_softc *sc, int state)
{
struct g_raid3_disk *disk;
u_int n, ndisks;
sx_assert(&sc->sc_lock, SX_LOCKED);
for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
continue;
if (state == -1 || disk->d_state == state)
ndisks++;
}
return (ndisks);
}
static u_int
g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
{
struct bio *bp;
u_int nreqs = 0;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_from == cp)
nreqs++;
}
mtx_unlock(&sc->sc_queue_mtx);
return (nreqs);
}
static int
g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
{
if (cp->index > 0) {
G_RAID3_DEBUG(2,
"I/O requests for %s exist, can't destroy it now.",
cp->provider->name);
return (1);
}
if (g_raid3_nrequests(sc, cp) > 0) {
G_RAID3_DEBUG(2,
"I/O requests for %s in queue, can't destroy it now.",
cp->provider->name);
return (1);
}
return (0);
}
static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *cp;
g_topology_assert();
cp = arg;
G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
g_detach(cp);
g_destroy_consumer(cp);
}
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
struct g_provider *pp;
int retaste_wait;
g_topology_assert();
cp->private = NULL;
if (g_raid3_is_busy(sc, cp))
return;
G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
pp = cp->provider;
retaste_wait = 0;
if (cp->acw == 1) {
if ((pp->geom->flags & G_GEOM_WITHER) == 0)
retaste_wait = 1;
}
G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
-cp->acw, -cp->ace, 0);
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
if (retaste_wait) {
/*
* After the retaste event has been sent (inside g_access()), we can
* send an event to detach and destroy the consumer.
* A class that already has a consumer attached to the given provider
* will not receive a retaste event for that provider.
* This is how I ignore retaste events when I close consumers opened
* for write: I detach and destroy the consumer after the retaste
* event is sent.
*/
g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
return;
}
G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
g_detach(cp);
g_destroy_consumer(cp);
}
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
struct g_consumer *cp;
int error;
g_topology_assert_not();
KASSERT(disk->d_consumer == NULL,
("Disk already connected (device %s).", disk->d_softc->sc_name));
g_topology_lock();
cp = g_new_consumer(disk->d_softc->sc_geom);
error = g_attach(cp, pp);
if (error != 0) {
g_destroy_consumer(cp);
g_topology_unlock();
return (error);
}
error = g_access(cp, 1, 1, 1);
g_topology_unlock();
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
pp->name, error);
return (error);
}
disk->d_consumer = cp;
disk->d_consumer->private = disk;
disk->d_consumer->index = 0;
G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
return (0);
}
static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
g_topology_assert();
if (cp == NULL)
return;
if (cp->provider != NULL)
g_raid3_kill_consumer(sc, cp);
else
g_destroy_consumer(cp);
}
/*
* Initialize disk. This means allocate memory, create consumer, attach it
* to the provider and open access (r1w1e1) to it.
*/
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
struct g_raid3_metadata *md, int *errorp)
{
struct g_raid3_disk *disk;
int error;
disk = &sc->sc_disks[md->md_no];
error = g_raid3_connect_disk(disk, pp);
if (error != 0) {
if (errorp != NULL)
*errorp = error;
return (NULL);
}
disk->d_state = G_RAID3_DISK_STATE_NONE;
disk->d_flags = md->md_dflags;
if (md->md_provider[0] != '\0')
disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
disk->d_sync.ds_consumer = NULL;
disk->d_sync.ds_offset = md->md_sync_offset;
disk->d_sync.ds_offset_done = md->md_sync_offset;
disk->d_genid = md->md_genid;
disk->d_sync.ds_syncid = md->md_syncid;
if (errorp != NULL)
*errorp = 0;
return (disk);
}
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
struct g_raid3_softc *sc;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
return;
g_raid3_event_cancel(disk);
switch (disk->d_state) {
case G_RAID3_DISK_STATE_SYNCHRONIZING:
if (sc->sc_syncdisk != NULL)
g_raid3_sync_stop(sc, 1);
/* FALLTHROUGH */
case G_RAID3_DISK_STATE_NEW:
case G_RAID3_DISK_STATE_STALE:
case G_RAID3_DISK_STATE_ACTIVE:
g_topology_lock();
g_raid3_disconnect_consumer(sc, disk->d_consumer);
g_topology_unlock();
disk->d_consumer = NULL;
break;
default:
KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
}
disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
struct g_raid3_event *ep;
struct g_raid3_disk *disk;
struct g_geom *gp;
struct g_consumer *cp;
u_int n;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
gp = sc->sc_geom;
if (sc->sc_provider != NULL)
g_raid3_destroy_provider(sc);
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
g_raid3_update_metadata(disk);
g_raid3_destroy_disk(disk);
}
}
while ((ep = g_raid3_event_get(sc)) != NULL) {
g_raid3_event_remove(sc, ep);
if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
g_raid3_event_free(ep);
else {
ep->e_error = ECANCELED;
ep->e_flags |= G_RAID3_EVENT_DONE;
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
mtx_lock(&sc->sc_events_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_events_mtx);
}
}
callout_drain(&sc->sc_callout);
cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
g_topology_lock();
if (cp != NULL)
g_raid3_disconnect_consumer(sc, cp);
g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
g_wither_geom(gp, ENXIO);
g_topology_unlock();
if (!g_raid3_use_malloc) {
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
}
mtx_destroy(&sc->sc_queue_mtx);
mtx_destroy(&sc->sc_events_mtx);
sx_xunlock(&sc->sc_lock);
sx_destroy(&sc->sc_lock);
}
static void
g_raid3_orphan(struct g_consumer *cp)
{
struct g_raid3_disk *disk;
g_topology_assert();
disk = cp->private;
if (disk == NULL)
return;
disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
G_RAID3_EVENT_DONTWAIT);
}
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
struct g_raid3_softc *sc;
struct g_consumer *cp;
off_t offset, length;
u_char *sector;
int error = 0;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
cp = disk->d_consumer;
KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
cp->acw, cp->ace));
length = cp->provider->sectorsize;
offset = cp->provider->mediasize - length;
sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
if (md != NULL)
raid3_metadata_encode(md, sector);
error = g_write_data(cp, offset, sector, length);
free(sector, M_RAID3);
if (error != 0) {
if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
G_RAID3_DEBUG(0, "Cannot write metadata on %s "
"(device=%s, error=%d).",
g_raid3_get_diskname(disk), sc->sc_name, error);
disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
} else {
G_RAID3_DEBUG(1, "Cannot write metadata on %s "
"(device=%s, error=%d).",
g_raid3_get_diskname(disk), sc->sc_name, error);
}
if (g_raid3_disconnect_on_failure &&
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
sc->sc_bump_id |= G_RAID3_BUMP_GENID;
g_raid3_event_send(disk,
G_RAID3_DISK_STATE_DISCONNECTED,
G_RAID3_EVENT_DONTWAIT);
}
}
return (error);
}
int
g_raid3_clear_metadata(struct g_raid3_disk *disk)
{
int error;
g_topology_assert_not();
sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
error = g_raid3_write_metadata(disk, NULL);
if (error == 0) {
G_RAID3_DEBUG(2, "Metadata on %s cleared.",
g_raid3_get_diskname(disk));
} else {
G_RAID3_DEBUG(0,
"Cannot clear metadata on disk %s (error=%d).",
g_raid3_get_diskname(disk), error);
}
return (error);
}
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
struct g_raid3_softc *sc;
struct g_provider *pp;
sc = disk->d_softc;
strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
md->md_version = G_RAID3_VERSION;
strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
md->md_id = sc->sc_id;
md->md_all = sc->sc_ndisks;
md->md_genid = sc->sc_genid;
md->md_mediasize = sc->sc_mediasize;
md->md_sectorsize = sc->sc_sectorsize;
md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
md->md_no = disk->d_no;
md->md_syncid = disk->d_sync.ds_syncid;
md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
md->md_sync_offset = 0;
else {
md->md_sync_offset =
disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
}
if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
pp = disk->d_consumer->provider;
else
pp = NULL;
if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
else
bzero(md->md_provider, sizeof(md->md_provider));
if (pp != NULL)
md->md_provsize = pp->mediasize;
else
md->md_provsize = 0;
}
void
g_raid3_update_metadata(struct g_raid3_disk *disk)
{
struct g_raid3_softc *sc;
struct g_raid3_metadata md;
int error;
g_topology_assert_not();
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
g_raid3_fill_metadata(disk, &md);
error = g_raid3_write_metadata(disk, &md);
if (error == 0) {
G_RAID3_DEBUG(2, "Metadata on %s updated.",
g_raid3_get_diskname(disk));
} else {
G_RAID3_DEBUG(0,
"Cannot update metadata on disk %s (error=%d).",
g_raid3_get_diskname(disk), error);
}
}
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
struct g_raid3_disk *disk;
u_int n;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
("%s called with no active disks (device=%s).", __func__,
sc->sc_name));
sc->sc_syncid++;
G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
sc->sc_syncid);
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
disk->d_sync.ds_syncid = sc->sc_syncid;
g_raid3_update_metadata(disk);
}
}
}
static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
struct g_raid3_disk *disk;
u_int n;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
("%s called with no active disks (device=%s).", __func__,
sc->sc_name));
sc->sc_genid++;
G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
sc->sc_genid);
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
disk->d_genid = sc->sc_genid;
g_raid3_update_metadata(disk);
}
}
}
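/*
* Mark the array as idle: once no writes have arrived for
* kern.geom.raid3.idletime seconds, clear the DIRTY flag on every ACTIVE
* component and update its metadata.  Returns the remaining timeout if the
* device is not idle yet.
*/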
static int
g_raid3_idle(struct g_raid3_softc *sc, int acw)
{
struct g_raid3_disk *disk;
u_int i;
int timeout;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if (sc->sc_provider == NULL)
return (0);
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
return (0);
if (sc->sc_idle)
return (0);
if (sc->sc_writes > 0)
return (0);
if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
if (!g_raid3_shutdown && timeout > 0)
return (timeout);
}
sc->sc_idle = 1;
for (i = 0; i < sc->sc_ndisks; i++) {
disk = &sc->sc_disks[i];
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
continue;
G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
g_raid3_get_diskname(disk), sc->sc_name);
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
g_raid3_update_metadata(disk);
}
return (0);
}
static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
struct g_raid3_disk *disk;
u_int i;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
return;
sc->sc_idle = 0;
sc->sc_last_write = time_uptime;
for (i = 0; i < sc->sc_ndisks; i++) {
disk = &sc->sc_disks[i];
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
continue;
G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
g_raid3_get_diskname(disk), sc->sc_name);
disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
g_raid3_update_metadata(disk);
}
}
/*
* Treat the bio_driver1 field in the parent bio as the list head and the
* bio_caller1 field in each child bio as the pointer to the next element
* on the list.
*/
#define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1
#define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1
#define G_RAID3_FOREACH_BIO(pbp, bp) \
for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \
(bp) = G_RAID3_NEXT_BIO(bp))
#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \
for ((bp) = G_RAID3_HEAD_BIO(pbp); \
(bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \
(bp) = (tmpbp))
static void
g_raid3_init_bio(struct bio *pbp)
{
G_RAID3_HEAD_BIO(pbp) = NULL;
}
static void
g_raid3_remove_bio(struct bio *cbp)
{
struct bio *pbp, *bp;
pbp = cbp->bio_parent;
if (G_RAID3_HEAD_BIO(pbp) == cbp)
G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
else {
G_RAID3_FOREACH_BIO(pbp, bp) {
if (G_RAID3_NEXT_BIO(bp) == cbp) {
G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
break;
}
}
}
G_RAID3_NEXT_BIO(cbp) = NULL;
}
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
struct bio *pbp, *bp;
g_raid3_remove_bio(sbp);
pbp = dbp->bio_parent;
G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
if (G_RAID3_HEAD_BIO(pbp) == dbp)
G_RAID3_HEAD_BIO(pbp) = sbp;
else {
G_RAID3_FOREACH_BIO(pbp, bp) {
if (G_RAID3_NEXT_BIO(bp) == dbp) {
G_RAID3_NEXT_BIO(bp) = sbp;
break;
}
}
}
G_RAID3_NEXT_BIO(dbp) = NULL;
}
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
struct bio *bp, *pbp;
size_t size;
pbp = cbp->bio_parent;
pbp->bio_children--;
KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
size = pbp->bio_length / (sc->sc_ndisks - 1);
g_raid3_free(sc, cbp->bio_data, size);
if (G_RAID3_HEAD_BIO(pbp) == cbp) {
G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
G_RAID3_NEXT_BIO(cbp) = NULL;
g_destroy_bio(cbp);
} else {
G_RAID3_FOREACH_BIO(pbp, bp) {
if (G_RAID3_NEXT_BIO(bp) == cbp)
break;
}
if (bp != NULL) {
KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
("NULL bp->bio_driver1"));
G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
G_RAID3_NEXT_BIO(cbp) = NULL;
}
g_destroy_bio(cbp);
}
}
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
struct bio *bp, *cbp;
size_t size;
int memflag;
cbp = g_clone_bio(pbp);
if (cbp == NULL)
return (NULL);
size = pbp->bio_length / (sc->sc_ndisks - 1);
if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
memflag = M_WAITOK;
else
memflag = M_NOWAIT;
cbp->bio_data = g_raid3_alloc(sc, size, memflag);
if (cbp->bio_data == NULL) {
pbp->bio_children--;
g_destroy_bio(cbp);
return (NULL);
}
G_RAID3_NEXT_BIO(cbp) = NULL;
if (G_RAID3_HEAD_BIO(pbp) == NULL)
G_RAID3_HEAD_BIO(pbp) = cbp;
else {
G_RAID3_FOREACH_BIO(pbp, bp) {
if (G_RAID3_NEXT_BIO(bp) == NULL) {
G_RAID3_NEXT_BIO(bp) = cbp;
break;
}
}
}
return (cbp);
}
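/*
* Split a write request into per-disk chunks: interleave the payload across
* the data components, compute the parity chunk as the XOR of the data
* chunks (unless the request carries the NOPARITY flag) and hand the cloned
* bios to the component consumers.
*/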
static void
g_raid3_scatter(struct bio *pbp)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
struct bio *bp, *cbp, *tmpbp;
off_t atom, cadd, padd, left;
int first;
sc = pbp->bio_to->geom->softc;
bp = NULL;
if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
/*
* Find bio for which we should calculate data.
*/
G_RAID3_FOREACH_BIO(pbp, cbp) {
if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
bp = cbp;
break;
}
}
KASSERT(bp != NULL, ("NULL parity bio."));
}
atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
cadd = padd = 0;
for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
G_RAID3_FOREACH_BIO(pbp, cbp) {
if (cbp == bp)
continue;
bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
padd += atom;
}
cadd += atom;
}
if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
/*
* Calculate parity.
*/
first = 1;
G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
if (cbp == bp)
continue;
if (first) {
bcopy(cbp->bio_data, bp->bio_data,
bp->bio_length);
first = 0;
} else {
g_raid3_xor(cbp->bio_data, bp->bio_data,
bp->bio_length);
}
if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
g_raid3_destroy_bio(sc, cbp);
}
}
G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
struct g_consumer *cp;
disk = cbp->bio_caller2;
cp = disk->d_consumer;
cbp->bio_to = cp->provider;
G_RAID3_LOGREQ(3, cbp, "Sending request.");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
cp->index++;
sc->sc_writes++;
g_io_request(cbp, cp);
}
}
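/*
* Complete a read request: check every per-disk chunk for errors.  In
* COMPLETE mode a single failure is tolerated by re-issuing that chunk to
* the parity component and reconstructing the missing data with XOR; in
* VERIFY mode parity is read as well and checked against the data.
* Finally the chunks are interleaved back into the parent bio.
*/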
static void
g_raid3_gather(struct bio *pbp)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
struct bio *xbp, *fbp, *cbp;
off_t atom, cadd, padd, left;
sc = pbp->bio_to->geom->softc;
/*
* Find the bio for which we have to calculate data.
* While walking the list, check whether all requests succeeded;
* if not, fail the whole request.
* In COMPLETE mode we allow a single request to fail; if we find
* one, we re-send it to the parity consumer.
* If more than one request failed, fail the whole request.
*/
xbp = fbp = NULL;
G_RAID3_FOREACH_BIO(pbp, cbp) {
if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
KASSERT(xbp == NULL, ("More than one parity bio."));
xbp = cbp;
}
if (cbp->bio_error == 0)
continue;
/*
* Found failed request.
*/
if (fbp == NULL) {
if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
/*
* We are already in degraded mode, so we can't
* accept any failures.
*/
if (pbp->bio_error == 0)
pbp->bio_error = cbp->bio_error;
} else {
fbp = cbp;
}
} else {
/*
* Next failed request, that's too many.
*/
if (pbp->bio_error == 0)
pbp->bio_error = fbp->bio_error;
}
disk = cbp->bio_caller2;
if (disk == NULL)
continue;
if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
cbp->bio_error);
} else {
G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
cbp->bio_error);
}
if (g_raid3_disconnect_on_failure &&
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
sc->sc_bump_id |= G_RAID3_BUMP_GENID;
g_raid3_event_send(disk,
G_RAID3_DISK_STATE_DISCONNECTED,
G_RAID3_EVENT_DONTWAIT);
}
}
if (pbp->bio_error != 0)
goto finish;
if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
if (xbp != fbp)
g_raid3_replace_bio(xbp, fbp);
g_raid3_destroy_bio(sc, fbp);
} else if (fbp != NULL) {
struct g_consumer *cp;
/*
* One request failed, so send the same request to
* the parity consumer.
*/
disk = pbp->bio_driver2;
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
pbp->bio_error = fbp->bio_error;
goto finish;
}
pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
pbp->bio_inbed--;
fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
if (disk->d_no == sc->sc_ndisks - 1)
fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
fbp->bio_error = 0;
fbp->bio_completed = 0;
fbp->bio_children = 0;
fbp->bio_inbed = 0;
cp = disk->d_consumer;
fbp->bio_caller2 = disk;
fbp->bio_to = cp->provider;
G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
cp->index++;
g_io_request(fbp, cp);
return;
}
if (xbp != NULL) {
/*
* Calculate parity.
*/
G_RAID3_FOREACH_BIO(pbp, cbp) {
if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
continue;
g_raid3_xor(cbp->bio_data, xbp->bio_data,
xbp->bio_length);
}
xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
if (!g_raid3_is_zero(xbp)) {
g_raid3_parity_mismatch++;
pbp->bio_error = EIO;
goto finish;
}
g_raid3_destroy_bio(sc, xbp);
}
}
atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
cadd = padd = 0;
for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
G_RAID3_FOREACH_BIO(pbp, cbp) {
bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
pbp->bio_completed += atom;
padd += atom;
}
cadd += atom;
}
finish:
if (pbp->bio_error == 0)
G_RAID3_LOGREQ(3, pbp, "Request finished.");
else {
if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
G_RAID3_LOGREQ(1, pbp, "Verification error.");
else
G_RAID3_LOGREQ(0, pbp, "Request failed.");
}
pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
g_raid3_destroy_bio(sc, cbp);
g_io_deliver(pbp, pbp->bio_error);
}
static void
g_raid3_done(struct bio *bp)
{
struct g_raid3_softc *sc;
sc = bp->bio_from->geom->softc;
bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
wakeup(&sc->sc_queue);
}
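/*
* Handle completion of a cloned child request.  Once all children of the
* parent bio have returned, gather the data for reads or finish the
* write/delete, marking failing components as BROKEN and optionally
* disconnecting them.
*/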
static void
g_raid3_regular_request(struct bio *cbp)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
struct bio *pbp;
g_topology_assert_not();
pbp = cbp->bio_parent;
sc = pbp->bio_to->geom->softc;
cbp->bio_from->index--;
if (cbp->bio_cmd == BIO_WRITE)
sc->sc_writes--;
disk = cbp->bio_from->private;
if (disk == NULL) {
g_topology_lock();
g_raid3_kill_consumer(sc, cbp->bio_from);
g_topology_unlock();
}
G_RAID3_LOGREQ(3, cbp, "Request finished.");
pbp->bio_inbed++;
KASSERT(pbp->bio_inbed <= pbp->bio_children,
("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
pbp->bio_children));
if (pbp->bio_inbed != pbp->bio_children)
return;
switch (pbp->bio_cmd) {
case BIO_READ:
g_raid3_gather(pbp);
break;
case BIO_WRITE:
case BIO_DELETE:
{
int error = 0;
pbp->bio_completed = pbp->bio_length;
while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
if (cbp->bio_error == 0) {
g_raid3_destroy_bio(sc, cbp);
continue;
}
if (error == 0)
error = cbp->bio_error;
else if (pbp->bio_error == 0) {
/*
* Another request failed; that's too many.
*/
pbp->bio_error = error;
}
disk = cbp->bio_caller2;
if (disk == NULL) {
g_raid3_destroy_bio(sc, cbp);
continue;
}
if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
G_RAID3_LOGREQ(0, cbp,
"Request failed (error=%d).",
cbp->bio_error);
} else {
G_RAID3_LOGREQ(1, cbp,
"Request failed (error=%d).",
cbp->bio_error);
}
if (g_raid3_disconnect_on_failure &&
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
sc->sc_bump_id |= G_RAID3_BUMP_GENID;
g_raid3_event_send(disk,
G_RAID3_DISK_STATE_DISCONNECTED,
G_RAID3_EVENT_DONTWAIT);
}
g_raid3_destroy_bio(sc, cbp);
}
if (pbp->bio_error == 0)
G_RAID3_LOGREQ(3, pbp, "Request finished.");
else
G_RAID3_LOGREQ(0, pbp, "Request failed.");
pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
bioq_remove(&sc->sc_inflight, pbp);
/* Release delayed sync requests if possible. */
g_raid3_sync_release(sc);
g_io_deliver(pbp, pbp->bio_error);
break;
}
}
}
static void
g_raid3_sync_done(struct bio *bp)
{
struct g_raid3_softc *sc;
G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
sc = bp->bio_from->geom->softc;
bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
wakeup(&sc->sc_queue);
}
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
struct bio_queue_head queue;
struct g_raid3_disk *disk;
struct g_consumer *cp;
struct bio *cbp;
u_int i;
bioq_init(&queue);
for (i = 0; i < sc->sc_ndisks; i++) {
disk = &sc->sc_disks[i];
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
bioq_insert_tail(&queue, cbp);
cbp->bio_done = g_std_done;
cbp->bio_caller1 = disk;
cbp->bio_to = disk->d_consumer->provider;
}
for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
G_RAID3_LOGREQ(3, cbp, "Sending request.");
disk = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
cp = disk->d_consumer;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
g_io_request(cbp, disk->d_consumer);
}
}
static void
g_raid3_start(struct bio *bp)
{
struct g_raid3_softc *sc;
sc = bp->bio_to->geom->softc;
/*
* If sc == NULL or there are no valid disks, the provider's error
* should be set and g_raid3_start() should not be called at all.
*/
KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
("Provider's error should be set (error=%d)(device=%s).",
bp->bio_to->error, bp->bio_to->name));
G_RAID3_LOGREQ(3, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
break;
case BIO_SPEEDUP:
case BIO_FLUSH:
g_raid3_flush(sc, bp);
return;
case BIO_GETATTR:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
wakeup(sc);
}
/*
* Return TRUE if the given request is colliding with an in-progress
* synchronization request.
*/
static int
g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
{
struct g_raid3_disk *disk;
struct bio *sbp;
off_t rstart, rend, sstart, send;
int i;
disk = sc->sc_syncdisk;
if (disk == NULL)
return (0);
rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length;
for (i = 0; i < g_raid3_syncreqs; i++) {
sbp = disk->d_sync.ds_bios[i];
if (sbp == NULL)
continue;
sstart = sbp->bio_offset;
send = sbp->bio_length;
if (sbp->bio_cmd == BIO_WRITE) {
sstart *= sc->sc_ndisks - 1;
send *= sc->sc_ndisks - 1;
}
send += sstart;
if (rend > sstart && rstart < send)
return (1);
}
return (0);
}
/*
* Return TRUE if the given sync request is colliding with an in-progress regular
* request.
*/
static int
g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
{
off_t rstart, rend, sstart, send;
struct bio *bp;
if (sc->sc_syncdisk == NULL)
return (0);
sstart = sbp->bio_offset;
send = sstart + sbp->bio_length;
TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length;
if (rend > sstart && rstart < send)
return (1);
}
return (0);
}
/*
* Puts the request onto the delayed queue.
*/
static void
g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
{
G_RAID3_LOGREQ(2, bp, "Delaying request.");
bioq_insert_head(&sc->sc_regular_delayed, bp);
}
/*
* Puts the synchronization request onto the delayed queue.
*/
static void
g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
{
G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
bioq_insert_tail(&sc->sc_sync_delayed, bp);
}
/*
* Releases delayed regular requests which no longer collide with sync
* requests.
*/
static void
g_raid3_regular_release(struct g_raid3_softc *sc)
{
struct bio *bp, *bp2;
TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
if (g_raid3_sync_collision(sc, bp))
continue;
bioq_remove(&sc->sc_regular_delayed, bp);
G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp);
#if 0
/*
* wakeup() is not needed, because this function is called from
* the worker thread.
*/
wakeup(&sc->sc_queue);
#endif
mtx_unlock(&sc->sc_queue_mtx);
}
}
/*
* Releases delayed sync requests which no longer collide with regular
* requests.
*/
static void
g_raid3_sync_release(struct g_raid3_softc *sc)
{
struct bio *bp, *bp2;
TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
if (g_raid3_regular_collision(sc, bp))
continue;
bioq_remove(&sc->sc_sync_delayed, bp);
G_RAID3_LOGREQ(2, bp,
"Releasing delayed synchronization request.");
g_io_request(bp, bp->bio_from);
}
}
/*
* Handle synchronization requests.
* Every synchronization request is a two-step process: first, a READ request is
* sent to the active provider and then a WRITE request (with the read data) to
* the provider being synchronized. When the WRITE is finished, a new
* synchronization request is sent.
*/
static void
g_raid3_sync_request(struct bio *bp)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
bp->bio_from->index--;
sc = bp->bio_from->geom->softc;
disk = bp->bio_from->private;
if (disk == NULL) {
sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
g_topology_lock();
g_raid3_kill_consumer(sc, bp->bio_from);
g_topology_unlock();
free(bp->bio_data, M_RAID3);
g_destroy_bio(bp);
sx_xlock(&sc->sc_lock);
return;
}
/*
* Synchronization request.
*/
switch (bp->bio_cmd) {
case BIO_READ:
{
struct g_consumer *cp;
u_char *dst, *src;
off_t left;
u_int atom;
if (bp->bio_error != 0) {
G_RAID3_LOGREQ(0, bp,
"Synchronization request failed (error=%d).",
bp->bio_error);
g_destroy_bio(bp);
return;
}
G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
dst = src = bp->bio_data;
if (disk->d_no == sc->sc_ndisks - 1) {
u_int n;
/* Parity component. */
for (left = bp->bio_length; left > 0;
left -= sc->sc_sectorsize) {
bcopy(src, dst, atom);
src += atom;
for (n = 1; n < sc->sc_ndisks - 1; n++) {
g_raid3_xor(src, dst, atom);
src += atom;
}
dst += atom;
}
} else {
/* Regular component. */
src += atom * disk->d_no;
for (left = bp->bio_length; left > 0;
left -= sc->sc_sectorsize) {
bcopy(src, dst, atom);
src += sc->sc_sectorsize;
dst += atom;
}
}
bp->bio_driver1 = bp->bio_driver2 = NULL;
bp->bio_pflags = 0;
bp->bio_offset /= sc->sc_ndisks - 1;
bp->bio_length /= sc->sc_ndisks - 1;
bp->bio_cmd = BIO_WRITE;
bp->bio_cflags = 0;
bp->bio_children = bp->bio_inbed = 0;
cp = disk->d_consumer;
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
cp->acr, cp->acw, cp->ace));
cp->index++;
g_io_request(bp, cp);
return;
}
case BIO_WRITE:
{
struct g_raid3_disk_sync *sync;
off_t boffset, moffset;
void *data;
int i;
if (bp->bio_error != 0) {
G_RAID3_LOGREQ(0, bp,
"Synchronization request failed (error=%d).",
bp->bio_error);
g_destroy_bio(bp);
sc->sc_bump_id |= G_RAID3_BUMP_GENID;
g_raid3_event_send(disk,
G_RAID3_DISK_STATE_DISCONNECTED,
G_RAID3_EVENT_DONTWAIT);
return;
}
G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
sync = &disk->d_sync;
if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
sync->ds_consumer == NULL ||
(sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
/* Don't send more synchronization requests. */
sync->ds_inflight--;
if (sync->ds_bios != NULL) {
i = (int)(uintptr_t)bp->bio_caller1;
sync->ds_bios[i] = NULL;
}
free(bp->bio_data, M_RAID3);
g_destroy_bio(bp);
if (sync->ds_inflight > 0)
return;
if (sync->ds_consumer == NULL ||
(sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
return;
}
/*
* Disk up-to-date, activate it.
*/
g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
G_RAID3_EVENT_DONTWAIT);
return;
}
/* Send next synchronization request. */
data = bp->bio_data;
g_reset_bio(bp);
bp->bio_cmd = BIO_READ;
bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
- bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
+ bp->bio_length = MIN(maxphys, sc->sc_mediasize - bp->bio_offset);
sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
bp->bio_done = g_raid3_sync_done;
bp->bio_data = data;
bp->bio_from = sync->ds_consumer;
bp->bio_to = sc->sc_provider;
G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
sync->ds_consumer->index++;
/*
* Delay the request if it is colliding with a regular request.
*/
if (g_raid3_regular_collision(sc, bp))
g_raid3_sync_delay(sc, bp);
else
g_io_request(bp, sync->ds_consumer);
/* Release delayed requests if possible. */
g_raid3_regular_release(sc);
/* Find the smallest offset. */
moffset = sc->sc_mediasize;
for (i = 0; i < g_raid3_syncreqs; i++) {
bp = sync->ds_bios[i];
boffset = bp->bio_offset;
if (bp->bio_cmd == BIO_WRITE)
boffset *= sc->sc_ndisks - 1;
if (boffset < moffset)
moffset = boffset;
}
- if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
+ if (sync->ds_offset_done + maxphys * 100 < moffset) {
/* Update offset_done on every 100 blocks. */
sync->ds_offset_done = moffset;
g_raid3_update_metadata(disk);
}
return;
}
default:
KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
bp->bio_cmd, sc->sc_name));
break;
}
}
static int
g_raid3_register_request(struct bio *pbp)
{
struct g_raid3_softc *sc;
struct g_raid3_disk *disk;
struct g_consumer *cp;
struct bio *cbp, *tmpbp;
off_t offset, length;
u_int n, ndisks;
int round_robin, verify;
ndisks = 0;
sc = pbp->bio_to->geom->softc;
if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
sc->sc_syncdisk == NULL) {
g_io_deliver(pbp, EIO);
return (0);
}
g_raid3_init_bio(pbp);
length = pbp->bio_length / (sc->sc_ndisks - 1);
offset = pbp->bio_offset / (sc->sc_ndisks - 1);
round_robin = verify = 0;
switch (pbp->bio_cmd) {
case BIO_READ:
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
verify = 1;
ndisks = sc->sc_ndisks;
} else {
verify = 0;
ndisks = sc->sc_ndisks - 1;
}
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
round_robin = 1;
} else {
round_robin = 0;
}
KASSERT(!round_robin || !verify,
("ROUND-ROBIN and VERIFY are mutually exclusive."));
pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
break;
case BIO_WRITE:
case BIO_DELETE:
/*
* Delay the request if it is colliding with a synchronization
* request.
*/
if (g_raid3_sync_collision(sc, pbp)) {
g_raid3_regular_delay(sc, pbp);
return (0);
}
if (sc->sc_idle)
g_raid3_unidle(sc);
else
sc->sc_last_write = time_uptime;
ndisks = sc->sc_ndisks;
break;
}
for (n = 0; n < ndisks; n++) {
disk = &sc->sc_disks[n];
cbp = g_raid3_clone_bio(sc, pbp);
if (cbp == NULL) {
while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
g_raid3_destroy_bio(sc, cbp);
/*
* To prevent deadlock, we must report ENOMEM back up
* for failed requests from any of our consumers.
* Our own sync requests can stick around,
* as they are finite.
*/
if ((pbp->bio_cflags &
G_RAID3_BIO_CFLAG_REGULAR) != 0) {
g_io_deliver(pbp, ENOMEM);
return (0);
}
return (ENOMEM);
}
cbp->bio_offset = offset;
cbp->bio_length = length;
cbp->bio_done = g_raid3_done;
switch (pbp->bio_cmd) {
case BIO_READ:
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
/*
* Replace invalid component with the parity
* component.
*/
disk = &sc->sc_disks[sc->sc_ndisks - 1];
cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
} else if (round_robin &&
disk->d_no == sc->sc_round_robin) {
/*
* In round-robin mode skip one data component
* and use parity component when reading.
*/
pbp->bio_driver2 = disk;
disk = &sc->sc_disks[sc->sc_ndisks - 1];
cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
sc->sc_round_robin++;
round_robin = 0;
} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
}
break;
case BIO_WRITE:
case BIO_DELETE:
if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
if (n == ndisks - 1) {
/*
* Active parity component, mark it as such.
*/
cbp->bio_cflags |=
G_RAID3_BIO_CFLAG_PARITY;
}
} else {
pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
if (n == ndisks - 1) {
/*
* Parity component is not connected,
* so destroy its request.
*/
pbp->bio_pflags |=
G_RAID3_BIO_PFLAG_NOPARITY;
g_raid3_destroy_bio(sc, cbp);
cbp = NULL;
} else {
cbp->bio_cflags |=
G_RAID3_BIO_CFLAG_NODISK;
disk = NULL;
}
}
break;
}
if (cbp != NULL)
cbp->bio_caller2 = disk;
}
switch (pbp->bio_cmd) {
case BIO_READ:
if (round_robin) {
/*
* If we are in round-robin mode and 'round_robin' is
* still 1, it means that we skipped the parity component
* for this read and must reset the sc_round_robin field.
*/
sc->sc_round_robin = 0;
}
G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
disk = cbp->bio_caller2;
cp = disk->d_consumer;
cbp->bio_to = cp->provider;
G_RAID3_LOGREQ(3, cbp, "Sending request.");
KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).",
cp->provider->name, cp->acr, cp->acw, cp->ace));
cp->index++;
g_io_request(cbp, cp);
}
break;
case BIO_WRITE:
case BIO_DELETE:
/*
* Put the request onto the inflight queue, so we can check that new
* synchronization requests do not collide with it.
*/
bioq_insert_tail(&sc->sc_inflight, pbp);
/*
* Bump syncid on first write.
*/
if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
g_raid3_bump_syncid(sc);
}
g_raid3_scatter(pbp);
break;
}
return (0);
}
static int
g_raid3_can_destroy(struct g_raid3_softc *sc)
{
struct g_geom *gp;
struct g_consumer *cp;
g_topology_assert();
gp = sc->sc_geom;
if (gp->softc == NULL)
return (1);
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (g_raid3_is_busy(sc, cp))
return (0);
}
gp = sc->sc_sync.ds_geom;
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (g_raid3_is_busy(sc, cp))
return (0);
}
G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
sc->sc_name);
return (1);
}
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
if (sc->sc_rootmount != NULL) {
G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
g_topology_lock();
if (!g_raid3_can_destroy(sc)) {
g_topology_unlock();
return (0);
}
sc->sc_geom->softc = NULL;
sc->sc_sync.ds_geom->softc = NULL;
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
g_topology_unlock();
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
&sc->sc_worker);
/* Unlock sc_lock here, as it can be destroyed after wakeup. */
sx_xunlock(&sc->sc_lock);
wakeup(&sc->sc_worker);
sc->sc_worker = NULL;
} else {
g_topology_unlock();
g_raid3_destroy_device(sc);
free(sc->sc_disks, M_RAID3);
free(sc, M_RAID3);
}
return (1);
}
/*
* Worker thread.
*/
static void
g_raid3_worker(void *arg)
{
struct g_raid3_softc *sc;
struct g_raid3_event *ep;
struct bio *bp;
int timeout;
sc = arg;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
sx_xlock(&sc->sc_lock);
for (;;) {
G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
/*
* First take a look at events.
* It is important to handle events before any I/O requests.
*/
ep = g_raid3_event_get(sc);
if (ep != NULL) {
g_raid3_event_remove(sc, ep);
if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
/* Update only device status. */
G_RAID3_DEBUG(3,
"Running event for device %s.",
sc->sc_name);
ep->e_error = 0;
g_raid3_update_device(sc, 1);
} else {
/* Update disk status. */
G_RAID3_DEBUG(3, "Running event for disk %s.",
g_raid3_get_diskname(ep->e_disk));
ep->e_error = g_raid3_update_disk(ep->e_disk,
ep->e_state);
if (ep->e_error == 0)
g_raid3_update_device(sc, 0);
}
if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
KASSERT(ep->e_error == 0,
("Error cannot be handled."));
g_raid3_event_free(ep);
} else {
ep->e_flags |= G_RAID3_EVENT_DONE;
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
ep);
mtx_lock(&sc->sc_events_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_events_mtx);
}
if ((sc->sc_flags &
G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
if (g_raid3_try_destroy(sc)) {
curthread->td_pflags &= ~TDP_GEOM;
G_RAID3_DEBUG(1, "Thread exiting.");
kproc_exit(0);
}
}
G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
continue;
}
/*
* Check whether we can mark the array as CLEAN and, if we
* can't, how many seconds we should wait.
*/
timeout = g_raid3_idle(sc, -1);
/*
* Now I/O requests.
*/
/* Get first request from the queue. */
mtx_lock(&sc->sc_queue_mtx);
bp = bioq_first(&sc->sc_queue);
if (bp == NULL) {
if ((sc->sc_flags &
G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
mtx_unlock(&sc->sc_queue_mtx);
if (g_raid3_try_destroy(sc)) {
curthread->td_pflags &= ~TDP_GEOM;
G_RAID3_DEBUG(1, "Thread exiting.");
kproc_exit(0);
}
mtx_lock(&sc->sc_queue_mtx);
}
sx_xunlock(&sc->sc_lock);
/*
* XXX: We can miss an event here, because an event
* can be added without sx-device-lock and without
* mtx-queue-lock. Maybe I should just stop using
* a dedicated mutex for event synchronization and
* stick with the queue lock?
* The event will hang here until the next I/O request
* or the next event is received.
*/
MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
timeout * hz);
sx_xlock(&sc->sc_lock);
G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
continue;
}
process:
bioq_remove(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
(bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
g_raid3_sync_request(bp); /* READ */
} else if (bp->bio_to != sc->sc_provider) {
if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
g_raid3_regular_request(bp);
else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
g_raid3_sync_request(bp); /* WRITE */
else {
KASSERT(0,
("Invalid request cflags=0x%hx to=%s.",
bp->bio_cflags, bp->bio_to->name));
}
} else if (g_raid3_register_request(bp) != 0) {
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp);
/*
* We are short on memory; let's see if there are finished
* requests we can free.
*/
TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
goto process;
}
/*
* No finished regular requests, so at least keep
* synchronization running.
*/
TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
goto process;
}
sx_xunlock(&sc->sc_lock);
MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
"r3:lowmem", hz / 10);
sx_xlock(&sc->sc_lock);
}
G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
}
}
static void
g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
{
sx_assert(&sc->sc_lock, SX_LOCKED);
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
return;
if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
g_raid3_get_diskname(disk), sc->sc_name);
disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
} else if (sc->sc_idle &&
(disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
g_raid3_get_diskname(disk), sc->sc_name);
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}
}
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
struct g_raid3_disk *disk;
struct g_consumer *cp;
struct bio *bp;
int error;
u_int n;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
("Device not in DEGRADED state (%s, %u).", sc->sc_name,
sc->sc_state));
KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
sc->sc_name, sc->sc_state));
disk = NULL;
for (n = 0; n < sc->sc_ndisks; n++) {
if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
continue;
disk = &sc->sc_disks[n];
break;
}
if (disk == NULL)
return;
sx_xunlock(&sc->sc_lock);
g_topology_lock();
cp = g_new_consumer(sc->sc_sync.ds_geom);
error = g_attach(cp, sc->sc_provider);
KASSERT(error == 0,
("Cannot attach to %s (error=%d).", sc->sc_name, error));
error = g_access(cp, 1, 0, 0);
KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
g_topology_unlock();
sx_xlock(&sc->sc_lock);
G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
g_raid3_get_diskname(disk));
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
KASSERT(disk->d_sync.ds_consumer == NULL,
("Sync consumer already exists (device=%s, disk=%s).",
sc->sc_name, g_raid3_get_diskname(disk)));
disk->d_sync.ds_consumer = cp;
disk->d_sync.ds_consumer->private = disk;
disk->d_sync.ds_consumer->index = 0;
sc->sc_syncdisk = disk;
/*
* Allocate memory for synchronization bios and initialize them.
*/
disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
M_RAID3, M_WAITOK);
for (n = 0; n < g_raid3_syncreqs; n++) {
bp = g_alloc_bio();
disk->d_sync.ds_bios[n] = bp;
bp->bio_parent = NULL;
bp->bio_cmd = BIO_READ;
- bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
+ bp->bio_data = malloc(maxphys, M_RAID3, M_WAITOK);
bp->bio_cflags = 0;
bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
- bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
+ bp->bio_length = MIN(maxphys, sc->sc_mediasize - bp->bio_offset);
disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
bp->bio_done = g_raid3_sync_done;
bp->bio_from = disk->d_sync.ds_consumer;
bp->bio_to = sc->sc_provider;
bp->bio_caller1 = (void *)(uintptr_t)n;
}
/* Set the number of in-flight synchronization requests. */
disk->d_sync.ds_inflight = g_raid3_syncreqs;
/*
* Fire off first synchronization requests.
*/
for (n = 0; n < g_raid3_syncreqs; n++) {
bp = disk->d_sync.ds_bios[n];
G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
disk->d_sync.ds_consumer->index++;
/*
* Delay the request if it is colliding with a regular request.
*/
if (g_raid3_regular_collision(sc, bp))
g_raid3_sync_delay(sc, bp);
else
g_io_request(bp, disk->d_sync.ds_consumer);
}
}
/*
* Stop synchronization process.
* type: 0 - synchronization finished
* 1 - synchronization stopped
*/
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
struct g_raid3_disk *disk;
struct g_consumer *cp;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_LOCKED);
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
("Device not in DEGRADED state (%s, %u).", sc->sc_name,
sc->sc_state));
disk = sc->sc_syncdisk;
sc->sc_syncdisk = NULL;
KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
if (disk->d_sync.ds_consumer == NULL)
return;
if (type == 0) {
G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
sc->sc_name, g_raid3_get_diskname(disk));
} else /* if (type == 1) */ {
G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
sc->sc_name, g_raid3_get_diskname(disk));
}
free(disk->d_sync.ds_bios, M_RAID3);
disk->d_sync.ds_bios = NULL;
cp = disk->d_sync.ds_consumer;
disk->d_sync.ds_consumer = NULL;
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
g_topology_lock();
g_raid3_kill_consumer(sc, cp);
g_topology_unlock();
sx_xlock(&sc->sc_lock);
}
static void
g_raid3_launch_provider(struct g_raid3_softc *sc)
{
struct g_provider *pp;
struct g_raid3_disk *disk;
int n;
sx_assert(&sc->sc_lock, SX_LOCKED);
g_topology_lock();
pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
pp->mediasize = sc->sc_mediasize;
pp->sectorsize = sc->sc_sectorsize;
pp->stripesize = 0;
pp->stripeoffset = 0;
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_consumer && disk->d_consumer->provider &&
disk->d_consumer->provider->stripesize > pp->stripesize) {
pp->stripesize = disk->d_consumer->provider->stripesize;
pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
}
}
pp->stripesize *= sc->sc_ndisks - 1;
pp->stripeoffset *= sc->sc_ndisks - 1;
sc->sc_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
g_raid3_sync_start(sc);
}
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
struct bio *bp;
g_topology_assert_not();
KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
sc->sc_name));
g_topology_lock();
g_error_provider(sc->sc_provider, ENXIO);
mtx_lock(&sc->sc_queue_mtx);
while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
bioq_remove(&sc->sc_queue, bp);
g_io_deliver(bp, ENXIO);
}
mtx_unlock(&sc->sc_queue_mtx);
G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
sc->sc_provider->name);
g_wither_provider(sc->sc_provider, ENXIO);
g_topology_unlock();
sc->sc_provider = NULL;
if (sc->sc_syncdisk != NULL)
g_raid3_sync_stop(sc, 1);
}
static void
g_raid3_go(void *arg)
{
struct g_raid3_softc *sc;
sc = arg;
G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
g_raid3_event_send(sc, 0,
G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
}
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
struct g_raid3_softc *sc;
u_int state;
sc = disk->d_softc;
if (sc->sc_syncid == disk->d_sync.ds_syncid) {
if ((disk->d_flags &
G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
/* Disk does not need synchronization. */
state = G_RAID3_DISK_STATE_ACTIVE;
} else {
if ((sc->sc_flags &
G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
(disk->d_flags &
G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
/*
* We can start synchronization from
* the stored offset.
*/
state = G_RAID3_DISK_STATE_SYNCHRONIZING;
} else {
state = G_RAID3_DISK_STATE_STALE;
}
}
} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
/*
* Reset all synchronization data for this disk,
* because even if it was synchronized, it was
* synchronized to disks with a different syncid.
*/
disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
disk->d_sync.ds_offset = 0;
disk->d_sync.ds_offset_done = 0;
disk->d_sync.ds_syncid = sc->sc_syncid;
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
(disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
state = G_RAID3_DISK_STATE_SYNCHRONIZING;
} else {
state = G_RAID3_DISK_STATE_STALE;
}
} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
/*
* Not good, NOT GOOD!
* It means that the device was started on stale disks
* and a fresher disk just arrived.
* If there were writes, the device is broken, sorry.
* I think the best choice here is to leave
* this disk untouched and inform the user loudly.
*/
G_RAID3_DEBUG(0, "Device %s was started before the freshest "
"disk (%s) arrives!! It will not be connected to the "
"running device.", sc->sc_name,
g_raid3_get_diskname(disk));
g_raid3_destroy_disk(disk);
state = G_RAID3_DISK_STATE_NONE;
/* Return immediately, because disk was destroyed. */
return (state);
}
G_RAID3_DEBUG(3, "State for %s disk: %s.",
g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
return (state);
}
/*
* Update device state.
*/
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
struct g_raid3_disk *disk;
u_int state;
sx_assert(&sc->sc_lock, SX_XLOCKED);
switch (sc->sc_state) {
case G_RAID3_DEVICE_STATE_STARTING:
{
u_int n, ndirty, ndisks, genid, syncid;
KASSERT(sc->sc_provider == NULL,
("Non-NULL provider in STARTING state (%s).", sc->sc_name));
/*
* Are we ready? We are, if all disks are connected or
* one disk is missing and 'force' is true.
*/
if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
if (!force)
callout_drain(&sc->sc_callout);
} else {
if (force) {
/*
* Timeout expired, so destroy device.
*/
sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
__LINE__, sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
return;
}
/*
* Find the biggest genid.
*/
genid = 0;
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
continue;
if (disk->d_genid > genid)
genid = disk->d_genid;
}
sc->sc_genid = genid;
/*
* Remove all disks without the biggest genid.
*/
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
continue;
if (disk->d_genid < genid) {
G_RAID3_DEBUG(0,
"Component %s (device %s) broken, skipping.",
g_raid3_get_diskname(disk), sc->sc_name);
g_raid3_destroy_disk(disk);
}
}
/*
* There must be at least 'sc->sc_ndisks - 1' components
* with the same syncid and without SYNCHRONIZING flag.
*/
/*
* Find the biggest syncid, number of valid components and
* number of dirty components.
*/
ndirty = ndisks = syncid = 0;
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
continue;
if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
ndirty++;
if (disk->d_sync.ds_syncid > syncid) {
syncid = disk->d_sync.ds_syncid;
ndisks = 0;
} else if (disk->d_sync.ds_syncid < syncid) {
continue;
}
if ((disk->d_flags &
G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
continue;
}
ndisks++;
}
/*
* Do we have enough valid components?
*/
if (ndisks + 1 < sc->sc_ndisks) {
G_RAID3_DEBUG(0,
"Device %s is broken, too few valid components.",
sc->sc_name);
sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
return;
}
/*
* If there is one DIRTY component and all disks are present,
* mark it for synchronization. If there is more than one DIRTY
* component, mark parity component for synchronization.
*/
if (ndisks == sc->sc_ndisks && ndirty == 1) {
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if ((disk->d_flags &
G_RAID3_DISK_FLAG_DIRTY) == 0) {
continue;
}
disk->d_flags |=
G_RAID3_DISK_FLAG_SYNCHRONIZING;
}
} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
disk = &sc->sc_disks[sc->sc_ndisks - 1];
disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
}
sc->sc_syncid = syncid;
if (force) {
/* Remember to bump syncid on first write. */
sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
}
if (ndisks == sc->sc_ndisks)
state = G_RAID3_DEVICE_STATE_COMPLETE;
else /* if (ndisks == sc->sc_ndisks - 1) */
state = G_RAID3_DEVICE_STATE_DEGRADED;
G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
sc->sc_name, g_raid3_device_state2str(sc->sc_state),
g_raid3_device_state2str(state));
sc->sc_state = state;
for (n = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n];
if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
continue;
state = g_raid3_determine_state(disk);
g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
if (state == G_RAID3_DISK_STATE_STALE)
sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
}
break;
}
case G_RAID3_DEVICE_STATE_DEGRADED:
/*
* Genid needs to be bumped immediately, so do it here.
*/
if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
g_raid3_bump_genid(sc);
}
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
return;
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
sc->sc_ndisks - 1) {
if (sc->sc_provider != NULL)
g_raid3_destroy_provider(sc);
sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
return;
}
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
sc->sc_ndisks) {
state = G_RAID3_DEVICE_STATE_COMPLETE;
G_RAID3_DEBUG(1,
"Device %s state changed from %s to %s.",
sc->sc_name, g_raid3_device_state2str(sc->sc_state),
g_raid3_device_state2str(state));
sc->sc_state = state;
}
if (sc->sc_provider == NULL)
g_raid3_launch_provider(sc);
if (sc->sc_rootmount != NULL) {
G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
break;
case G_RAID3_DEVICE_STATE_COMPLETE:
/*
* Genid needs to be bumped immediately, so do it here.
*/
if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
g_raid3_bump_genid(sc);
}
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
return;
KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
sc->sc_ndisks - 1,
("Too few ACTIVE components in COMPLETE state (device %s).",
sc->sc_name));
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
sc->sc_ndisks - 1) {
state = G_RAID3_DEVICE_STATE_DEGRADED;
G_RAID3_DEBUG(1,
"Device %s state changed from %s to %s.",
sc->sc_name, g_raid3_device_state2str(sc->sc_state),
g_raid3_device_state2str(state));
sc->sc_state = state;
}
if (sc->sc_provider == NULL)
g_raid3_launch_provider(sc);
if (sc->sc_rootmount != NULL) {
G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
sc->sc_rootmount);
root_mount_rel(sc->sc_rootmount);
sc->sc_rootmount = NULL;
}
break;
default:
KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state)));
break;
}
}
/*
* Update disk state and device state if needed.
*/
#define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \
"Disk %s state changed from %s to %s (device %s).", \
g_raid3_get_diskname(disk), \
g_raid3_disk_state2str(disk->d_state), \
g_raid3_disk_state2str(state), sc->sc_name)
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
struct g_raid3_softc *sc;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
again:
G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
g_raid3_disk_state2str(state));
switch (state) {
case G_RAID3_DISK_STATE_NEW:
/*
* Possible scenarios:
* 1. A new disk arrives.
*/
/* Previous state should be NONE. */
KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
disk->d_state = state;
G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
sc->sc_name, g_raid3_get_diskname(disk));
if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
break;
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
state = g_raid3_determine_state(disk);
if (state != G_RAID3_DISK_STATE_NONE)
goto again;
break;
case G_RAID3_DISK_STATE_ACTIVE:
/*
* Possible scenarios:
* 1. New disk does not need synchronization.
* 2. Synchronization process finished successfully.
*/
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
/* Previous state should be NEW or SYNCHRONIZING. */
KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
g_raid3_sync_stop(sc, 0);
}
disk->d_state = state;
disk->d_sync.ds_offset = 0;
disk->d_sync.ds_offset_done = 0;
g_raid3_update_idle(sc, disk);
g_raid3_update_metadata(disk);
G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
sc->sc_name, g_raid3_get_diskname(disk));
break;
case G_RAID3_DISK_STATE_STALE:
/*
* Possible scenarios:
* 1. Stale disk was connected.
*/
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
/*
* STALE state is only possible if the device is marked
* NOAUTOSYNC.
*/
KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
disk->d_state = state;
g_raid3_update_metadata(disk);
G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
sc->sc_name, g_raid3_get_diskname(disk));
break;
case G_RAID3_DISK_STATE_SYNCHRONIZING:
/*
* Possible scenarios:
* 1. Disk which needs synchronization was connected.
*/
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
DISK_STATE_CHANGED();
if (disk->d_state == G_RAID3_DISK_STATE_NEW)
disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
disk->d_state = state;
if (sc->sc_provider != NULL) {
g_raid3_sync_start(sc);
g_raid3_update_metadata(disk);
}
break;
case G_RAID3_DISK_STATE_DISCONNECTED:
/*
* Possible scenarios:
* 1. The device wasn't running yet, but a disk disappeared.
* 2. A disk was active and disappeared.
* 3. A disk disappeared during the synchronization process.
*/
if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
/*
* Previous state should be ACTIVE, STALE or
* SYNCHRONIZING.
*/
KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
disk->d_state == G_RAID3_DISK_STATE_STALE ||
disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
("Wrong disk state (%s, %s).",
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
/* Previous state should be NEW. */
KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
("Wrong disk state (%s, %s).",
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
/*
* Reset bumping syncid if disk disappeared in STARTING
* state.
*/
if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef INVARIANTS
} else {
KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
sc->sc_name,
g_raid3_device_state2str(sc->sc_state),
g_raid3_get_diskname(disk),
g_raid3_disk_state2str(disk->d_state)));
#endif
}
DISK_STATE_CHANGED();
G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
sc->sc_name, g_raid3_get_diskname(disk));
g_raid3_destroy_disk(disk);
break;
default:
KASSERT(1 == 0, ("Unknown state (%u).", state));
break;
}
return (0);
}
#undef DISK_STATE_CHANGED
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
/* Metadata are stored in the last sector. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL) {
G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
cp->provider->name, error);
return (error);
}
/* Decode metadata. */
error = raid3_metadata_decode(buf, md);
g_free(buf);
if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
return (EINVAL);
if (md->md_version > G_RAID3_VERSION) {
G_RAID3_DEBUG(0,
"Kernel module is too old to handle metadata from %s.",
cp->provider->name);
return (EINVAL);
}
if (error != 0) {
G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
cp->provider->name);
return (error);
}
- if (md->md_sectorsize > MAXPHYS) {
+ if (md->md_sectorsize > maxphys) {
G_RAID3_DEBUG(0, "The blocksize is too big.");
return (EINVAL);
}
return (0);
}
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
struct g_raid3_metadata *md)
{
if (md->md_no >= sc->sc_ndisks) {
G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
pp->name, md->md_no);
return (EINVAL);
}
if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
pp->name, md->md_no);
return (EEXIST);
}
if (md->md_all != sc->sc_ndisks) {
G_RAID3_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_all", pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_mediasize % md->md_sectorsize) != 0) {
G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
"0) on disk %s (device %s), skipping.", pp->name,
sc->sc_name);
return (EINVAL);
}
if (md->md_mediasize != sc->sc_mediasize) {
G_RAID3_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_mediasize", pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
G_RAID3_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_mediasize", pp->name, sc->sc_name);
return (EINVAL);
}
if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
G_RAID3_DEBUG(1,
"Invalid size of disk %s (device %s), skipping.", pp->name,
sc->sc_name);
return (EINVAL);
}
if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
G_RAID3_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_sectorsize", pp->name, sc->sc_name);
return (EINVAL);
}
if (md->md_sectorsize != sc->sc_sectorsize) {
G_RAID3_DEBUG(1,
"Invalid '%s' field on disk %s (device %s), skipping.",
"md_sectorsize", pp->name, sc->sc_name);
return (EINVAL);
}
if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
G_RAID3_DEBUG(1,
"Invalid sector size of disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
G_RAID3_DEBUG(1,
"Invalid device flags on disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
(md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
/*
* The VERIFY and ROUND-ROBIN options are mutually exclusive.
*/
G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
"disk %s (device %s), skipping.", pp->name, sc->sc_name);
return (EINVAL);
}
if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
G_RAID3_DEBUG(1,
"Invalid disk flags on disk %s (device %s), skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
return (0);
}
int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
struct g_raid3_metadata *md)
{
struct g_raid3_disk *disk;
int error;
g_topology_assert_not();
G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
error = g_raid3_check_metadata(sc, pp, md);
if (error != 0)
return (error);
if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
md->md_genid < sc->sc_genid) {
G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
pp->name, sc->sc_name);
return (EINVAL);
}
disk = g_raid3_init_disk(sc, pp, md, &error);
if (disk == NULL)
return (error);
error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
G_RAID3_EVENT_WAIT);
if (error != 0)
return (error);
if (md->md_version < G_RAID3_VERSION) {
G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
pp->name, md->md_version, G_RAID3_VERSION);
g_raid3_update_metadata(disk);
}
return (0);
}
static void
g_raid3_destroy_delayed(void *arg, int flag)
{
struct g_raid3_softc *sc;
int error;
if (flag == EV_CANCEL) {
G_RAID3_DEBUG(1, "Destroying canceled.");
return;
}
sc = arg;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
("DESTROY flag set on %s.", sc->sc_name));
KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
("DESTROYING flag not set on %s.", sc->sc_name));
G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
if (error != 0) {
G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
sx_xunlock(&sc->sc_lock);
}
g_topology_lock();
}
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
struct g_raid3_softc *sc;
int dcr, dcw, dce, error = 0;
g_topology_assert();
G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
acw, ace);
sc = pp->geom->softc;
if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
return (0);
KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
dcr = pp->acr + acr;
dcw = pp->acw + acw;
dce = pp->ace + ace;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
if (acr > 0 || acw > 0 || ace > 0)
error = ENXIO;
goto end;
}
if (dcw == 0)
g_raid3_idle(sc, dcw);
if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
if (acr > 0 || acw > 0 || ace > 0) {
error = ENXIO;
goto end;
}
if (dcr == 0 && dcw == 0 && dce == 0) {
g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
sc, NULL);
}
}
end:
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
static struct g_geom *
g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
{
struct g_raid3_softc *sc;
struct g_geom *gp;
int error, timeout;
u_int n;
g_topology_assert();
G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
/* One disk is the minimum. */
if (md->md_all < 1)
return (NULL);
/*
* Action geom.
*/
gp = g_new_geomf(mp, "%s", md->md_name);
sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
M_WAITOK | M_ZERO);
gp->start = g_raid3_start;
gp->orphan = g_raid3_orphan;
gp->access = g_raid3_access;
gp->dumpconf = g_raid3_dumpconf;
sc->sc_id = md->md_id;
sc->sc_mediasize = md->md_mediasize;
sc->sc_sectorsize = md->md_sectorsize;
sc->sc_ndisks = md->md_all;
sc->sc_round_robin = 0;
sc->sc_flags = md->md_mflags;
sc->sc_bump_id = 0;
sc->sc_idle = 1;
sc->sc_last_write = time_uptime;
sc->sc_writes = 0;
for (n = 0; n < sc->sc_ndisks; n++) {
sc->sc_disks[n].d_softc = sc;
sc->sc_disks[n].d_no = n;
sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
}
sx_init(&sc->sc_lock, "graid3:lock");
bioq_init(&sc->sc_queue);
mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
bioq_init(&sc->sc_regular_delayed);
bioq_init(&sc->sc_inflight);
bioq_init(&sc->sc_sync_delayed);
TAILQ_INIT(&sc->sc_events);
mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
callout_init(&sc->sc_callout, 1);
sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
gp->softc = sc;
sc->sc_geom = gp;
sc->sc_provider = NULL;
/*
* Synchronization geom.
*/
gp = g_new_geomf(mp, "%s.sync", md->md_name);
gp->softc = sc;
gp->orphan = g_raid3_orphan;
sc->sc_sync.ds_geom = gp;
if (!g_raid3_use_malloc) {
sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
UMA_ALIGN_PTR, 0);
sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
UMA_ALIGN_PTR, 0);
sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
UMA_ALIGN_PTR, 0);
sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
}
error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
"g_raid3 %s", md->md_name);
if (error != 0) {
G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
sc->sc_name);
if (!g_raid3_use_malloc) {
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
}
g_destroy_geom(sc->sc_sync.ds_geom);
mtx_destroy(&sc->sc_events_mtx);
mtx_destroy(&sc->sc_queue_mtx);
sx_destroy(&sc->sc_lock);
g_destroy_geom(sc->sc_geom);
free(sc->sc_disks, M_RAID3);
free(sc, M_RAID3);
return (NULL);
}
G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
sc->sc_name, sc->sc_ndisks, sc->sc_id);
sc->sc_rootmount = root_mount_hold("GRAID3");
G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
/*
* Run timeout.
*/
timeout = atomic_load_acq_int(&g_raid3_timeout);
callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
return (sc->sc_geom);
}
int
g_raid3_destroy(struct g_raid3_softc *sc, int how)
{
struct g_provider *pp;
g_topology_assert_not();
if (sc == NULL)
return (ENXIO);
sx_assert(&sc->sc_lock, SX_XLOCKED);
pp = sc->sc_provider;
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
switch (how) {
case G_RAID3_DESTROY_SOFT:
G_RAID3_DEBUG(1,
"Device %s is still open (r%dw%de%d).", pp->name,
pp->acr, pp->acw, pp->ace);
return (EBUSY);
case G_RAID3_DESTROY_DELAYED:
G_RAID3_DEBUG(1,
"Device %s will be destroyed on last close.",
pp->name);
if (sc->sc_syncdisk != NULL)
g_raid3_sync_stop(sc, 1);
sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
return (EBUSY);
case G_RAID3_DESTROY_HARD:
G_RAID3_DEBUG(1, "Device %s is still open, so it "
"can't be definitely removed.", pp->name);
break;
}
}
g_topology_lock();
if (sc->sc_geom->softc == NULL) {
g_topology_unlock();
return (0);
}
sc->sc_geom->softc = NULL;
sc->sc_sync.ds_geom->softc = NULL;
g_topology_unlock();
sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
sx_xunlock(&sc->sc_lock);
mtx_lock(&sc->sc_queue_mtx);
wakeup(sc);
wakeup(&sc->sc_queue);
mtx_unlock(&sc->sc_queue_mtx);
G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
while (sc->sc_worker != NULL)
tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
sx_xlock(&sc->sc_lock);
g_raid3_destroy_device(sc);
free(sc->sc_disks, M_RAID3);
free(sc, M_RAID3);
return (0);
}
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{
KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
cp->provider->name));
}
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_raid3_metadata md;
struct g_raid3_softc *sc;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
gp = g_new_geomf(mp, "raid3:taste");
/* This orphan function should never be called. */
gp->orphan = g_raid3_taste_orphan;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_raid3_read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
gp = NULL;
if (md.md_provider[0] != '\0' &&
!g_compare_names(md.md_provider, pp->name))
return (NULL);
if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
return (NULL);
if (g_raid3_debug >= 2)
raid3_metadata_dump(&md);
/*
* Let's check if the device already exists.
*/
sc = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_sync.ds_geom == gp)
continue;
if (strcmp(md.md_name, sc->sc_name) != 0)
continue;
if (md.md_id != sc->sc_id) {
G_RAID3_DEBUG(0, "Device %s already configured.",
sc->sc_name);
return (NULL);
}
break;
}
if (gp == NULL) {
gp = g_raid3_create(mp, &md);
if (gp == NULL) {
G_RAID3_DEBUG(0, "Cannot create device %s.",
md.md_name);
return (NULL);
}
sc = gp->softc;
}
G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
g_topology_unlock();
sx_xlock(&sc->sc_lock);
error = g_raid3_add_disk(sc, pp, &md);
if (error != 0) {
G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
pp->name, gp->name, error);
if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
sc->sc_ndisks) {
g_cancel_event(sc);
g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
g_topology_lock();
return (NULL);
}
gp = NULL;
}
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (gp);
}
static int
g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
struct g_geom *gp)
{
struct g_raid3_softc *sc;
int error;
g_topology_unlock();
sc = gp->softc;
sx_xlock(&sc->sc_lock);
g_cancel_event(sc);
error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
if (error != 0)
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_raid3_softc *sc;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return;
/* Skip synchronization geom. */
if (gp == sc->sc_sync.ds_geom)
return;
if (pp != NULL) {
/* Nothing here. */
} else if (cp != NULL) {
struct g_raid3_disk *disk;
disk = cp->private;
if (disk == NULL)
return;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sbuf_printf(sb, "%s<Type>", indent);
if (disk->d_no == sc->sc_ndisks - 1)
sbuf_cat(sb, "PARITY");
else
sbuf_cat(sb, "DATA");
sbuf_cat(sb, "</Type>\n");
sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
(u_int)disk->d_no);
if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
sbuf_printf(sb, "%s<Synchronized>", indent);
if (disk->d_sync.ds_offset == 0)
sbuf_cat(sb, "0%");
else {
sbuf_printf(sb, "%u%%",
(u_int)((disk->d_sync.ds_offset * 100) /
(sc->sc_mediasize / (sc->sc_ndisks - 1))));
}
sbuf_cat(sb, "</Synchronized>\n");
if (disk->d_sync.ds_offset > 0) {
sbuf_printf(sb, "%s<BytesSynced>%jd"
"</BytesSynced>\n", indent,
(intmax_t)disk->d_sync.ds_offset);
}
}
sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
disk->d_sync.ds_syncid);
sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
sbuf_printf(sb, "%s<Flags>", indent);
if (disk->d_flags == 0)
sbuf_cat(sb, "NONE");
else {
int first = 1;
#define ADD_FLAG(flag, name) do { \
if ((disk->d_flags & (flag)) != 0) { \
if (!first) \
sbuf_cat(sb, ", "); \
else \
first = 0; \
sbuf_cat(sb, name); \
} \
} while (0)
ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
"SYNCHRONIZING");
ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
#undef ADD_FLAG
}
sbuf_cat(sb, "</Flags>\n");
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid3_disk_state2str(disk->d_state));
sx_xunlock(&sc->sc_lock);
g_topology_lock();
} else {
g_topology_unlock();
sx_xlock(&sc->sc_lock);
if (!g_raid3_use_malloc) {
sbuf_printf(sb,
"%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
sbuf_printf(sb,
"%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
sbuf_printf(sb,
"%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
sbuf_printf(sb,
"%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
sbuf_printf(sb,
"%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
sbuf_printf(sb,
"%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
}
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
sbuf_printf(sb, "%s<Flags>", indent);
if (sc->sc_flags == 0)
sbuf_cat(sb, "NONE");
else {
int first = 1;
#define ADD_FLAG(flag, name) do { \
if ((sc->sc_flags & (flag)) != 0) { \
if (!first) \
sbuf_cat(sb, ", "); \
else \
first = 0; \
sbuf_cat(sb, name); \
} \
} while (0)
ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
"ROUND-ROBIN");
ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef ADD_FLAG
}
sbuf_cat(sb, "</Flags>\n");
sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
sc->sc_ndisks);
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid3_device_state2str(sc->sc_state));
sx_xunlock(&sc->sc_lock);
g_topology_lock();
}
}
static void
g_raid3_shutdown_post_sync(void *arg, int howto)
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_raid3_softc *sc;
int error;
mp = arg;
g_topology_lock();
g_raid3_shutdown = 1;
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
if ((sc = gp->softc) == NULL)
continue;
/* Skip synchronization geom. */
if (gp == sc->sc_sync.ds_geom)
continue;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
g_raid3_idle(sc, -1);
g_cancel_event(sc);
error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
if (error != 0)
sx_xunlock(&sc->sc_lock);
g_topology_lock();
}
g_topology_unlock();
}
static void
g_raid3_init(struct g_class *mp)
{
g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
if (g_raid3_post_sync == NULL)
G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
}
static void
g_raid3_fini(struct g_class *mp)
{
if (g_raid3_post_sync != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
}
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
MODULE_VERSION(geom_raid3, 0);
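For reference, the <Synchronized> percentage emitted by g_raid3_dumpconf() above is the ratio of bytes already rebuilt on the component (d_sync.ds_offset) to the per-component data size, which is the provider's mediasize divided by the number of data disks (sc_ndisks - 1, one disk holding parity). A minimal stand-alone sketch of the same arithmetic, using made-up sample values (not taken from a real array):

/* Stand-alone illustration of the sync-progress formula used above. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical sample values. */
	uint64_t ds_offset = 40ULL << 30;	/* 40 GB synced so far */
	uint64_t mediasize = 200ULL << 30;	/* provider size: 200 GB */
	unsigned ndisks = 5;			/* 4 data disks + 1 parity */

	uint64_t percomp = mediasize / (ndisks - 1);	/* 50 GB per component */
	unsigned pct = (unsigned)((ds_offset * 100) / percomp);

	printf("%u%%\n", pct);			/* prints "80%" */
	return (0);
}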
diff --git a/sys/geom/shsec/g_shsec.c b/sys/geom/shsec/g_shsec.c
index 4e359bee4d25..a3b2f59d0555 100644
--- a/sys/geom/shsec/g_shsec.c
+++ b/sys/geom/shsec/g_shsec.c
@@ -1,837 +1,840 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/shsec/g_shsec.h>
FEATURE(geom_shsec, "GEOM shared secret device support");
static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data");
static uma_zone_t g_shsec_zone;
static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force);
static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static g_taste_t g_shsec_taste;
static g_ctl_req_t g_shsec_config;
static g_dumpconf_t g_shsec_dumpconf;
static g_init_t g_shsec_init;
static g_fini_t g_shsec_fini;
struct g_class g_shsec_class = {
.name = G_SHSEC_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_shsec_config,
.taste = g_shsec_taste,
.destroy_geom = g_shsec_destroy_geom,
.init = g_shsec_init,
.fini = g_shsec_fini
};
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_SHSEC stuff");
-static u_int g_shsec_debug = 0;
+static u_int g_shsec_debug;
SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0,
"Debug level");
-static u_int g_shsec_maxmem = MAXPHYS * 100;
-SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_shsec_maxmem,
+static u_long g_shsec_maxmem;
+SYSCTL_ULONG(_kern_geom_shsec, OID_AUTO, maxmem,
+ CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_shsec_maxmem,
0, "Maximum memory that can be allocated for I/O (in bytes)");
static u_int g_shsec_alloc_failed = 0;
SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD,
&g_shsec_alloc_failed, 0, "How many times I/O allocation failed");
/*
* Greatest Common Divisor.
*/
static u_int
gcd(u_int a, u_int b)
{
u_int c;
while (b != 0) {
c = a;
a = b;
b = (c % b);
}
return (a);
}
/*
* Least Common Multiple.
*/
static u_int
lcm(u_int a, u_int b)
{
return ((a * b) / gcd(a, b));
}
static void
g_shsec_init(struct g_class *mp __unused)
{
- g_shsec_zone = uma_zcreate("g_shsec_zone", MAXPHYS, NULL, NULL, NULL,
+ g_shsec_maxmem = maxphys * 100;
+ TUNABLE_ULONG_FETCH("kern.geom.shsec.maxmem,", &g_shsec_maxmem);
+ g_shsec_zone = uma_zcreate("g_shsec_zone", maxphys, NULL, NULL, NULL,
NULL, 0, 0);
- g_shsec_maxmem -= g_shsec_maxmem % MAXPHYS;
- uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / MAXPHYS);
+ g_shsec_maxmem -= g_shsec_maxmem % maxphys;
+ uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / maxphys);
}
static void
g_shsec_fini(struct g_class *mp __unused)
{
uma_zdestroy(g_shsec_zone);
}
/*
* Return the number of valid disks.
*/
static u_int
g_shsec_nvalid(struct g_shsec_softc *sc)
{
u_int i, no;
no = 0;
for (i = 0; i < sc->sc_ndisks; i++) {
if (sc->sc_disks[i] != NULL)
no++;
}
return (no);
}
static void
g_shsec_remove_disk(struct g_consumer *cp)
{
struct g_shsec_softc *sc;
u_int no;
KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
sc = (struct g_shsec_softc *)cp->private;
KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
no = cp->index;
G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name,
sc->sc_name);
sc->sc_disks[no] = NULL;
if (sc->sc_provider != NULL) {
g_wither_provider(sc->sc_provider, ENXIO);
sc->sc_provider = NULL;
G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name);
}
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
return;
g_detach(cp);
g_destroy_consumer(cp);
}
static void
g_shsec_orphan(struct g_consumer *cp)
{
struct g_shsec_softc *sc;
struct g_geom *gp;
g_topology_assert();
gp = cp->geom;
sc = gp->softc;
if (sc == NULL)
return;
g_shsec_remove_disk(cp);
/* If there are no valid disks anymore, remove device. */
if (LIST_EMPTY(&gp->consumer))
g_shsec_destroy(sc, 1);
}
static int
g_shsec_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_consumer *cp1, *cp2, *tmp;
struct g_shsec_softc *sc;
struct g_geom *gp;
int error;
gp = pp->geom;
sc = gp->softc;
/* On first open, grab an extra "exclusive" bit */
if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
de++;
/* ... and let go of it on last close */
if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
de--;
error = ENXIO;
LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
error = g_access(cp1, dr, dw, de);
if (error != 0)
goto fail;
if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
cp1->flags & G_CF_ORPHAN) {
g_detach(cp1);
g_destroy_consumer(cp1);
}
}
/* If there are no valid disks anymore, remove device. */
if (LIST_EMPTY(&gp->consumer))
g_shsec_destroy(sc, 1);
return (error);
fail:
/* If we fail here, backout all previous changes. */
LIST_FOREACH(cp2, &gp->consumer, consumer) {
if (cp1 == cp2)
break;
g_access(cp2, -dr, -dw, -de);
}
return (error);
}
static void
g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len)
{
for (; len > 0; len -= sizeof(uint32_t), dst++)
*dst = *dst ^ *src++;
KASSERT(len == 0, ("len != 0 (len=%zd)", len));
}
static void
g_shsec_done(struct bio *bp)
{
struct g_shsec_softc *sc;
struct bio *pbp;
pbp = bp->bio_parent;
sc = pbp->bio_to->geom->softc;
if (bp->bio_error == 0)
G_SHSEC_LOGREQ(2, bp, "Request done.");
else {
G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).",
bp->bio_error);
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
}
if (pbp->bio_cmd == BIO_READ) {
if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) {
bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length);
pbp->bio_pflags = 0;
} else {
g_shsec_xor1((uint32_t *)bp->bio_data,
(uint32_t *)pbp->bio_data,
(ssize_t)pbp->bio_length);
}
}
explicit_bzero(bp->bio_data, bp->bio_length);
uma_zfree(g_shsec_zone, bp->bio_data);
g_destroy_bio(bp);
pbp->bio_inbed++;
if (pbp->bio_children == pbp->bio_inbed) {
pbp->bio_completed = pbp->bio_length;
g_io_deliver(pbp, pbp->bio_error);
}
}
static void
g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len)
{
for (; len > 0; len -= sizeof(uint32_t), dst++) {
*rand = arc4random();
*dst = *dst ^ *rand++;
}
KASSERT(len == 0, ("len != 0 (len=%zd)", len));
}
static void
g_shsec_start(struct bio *bp)
{
TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
struct g_shsec_softc *sc;
struct bio *cbp;
uint32_t *dst;
ssize_t len;
u_int no;
int error;
sc = bp->bio_to->geom->softc;
/*
* If sc == NULL, provider's error should be set and g_shsec_start()
* should not be called at all.
*/
KASSERT(sc != NULL,
("Provider's error should be set (error=%d)(device=%s).",
bp->bio_to->error, bp->bio_to->name));
G_SHSEC_LOGREQ(2, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_FLUSH:
case BIO_SPEEDUP:
/*
* Only those requests are supported.
*/
break;
case BIO_DELETE:
case BIO_GETATTR:
/* To which provider should it be delivered? */
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
/*
* Allocate all bios first and calculate XOR.
*/
dst = NULL;
len = bp->bio_length;
if (bp->bio_cmd == BIO_READ)
bp->bio_pflags = G_SHSEC_BFLAG_FIRST;
for (no = 0; no < sc->sc_ndisks; no++) {
cbp = g_clone_bio(bp);
if (cbp == NULL) {
error = ENOMEM;
goto failure;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
/*
* Fill in the component buf structure.
*/
cbp->bio_done = g_shsec_done;
cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT);
if (cbp->bio_data == NULL) {
g_shsec_alloc_failed++;
error = ENOMEM;
goto failure;
}
cbp->bio_caller2 = sc->sc_disks[no];
if (bp->bio_cmd == BIO_WRITE) {
if (no == 0) {
dst = (uint32_t *)cbp->bio_data;
bcopy(bp->bio_data, dst, len);
} else {
g_shsec_xor2((uint32_t *)cbp->bio_data, dst,
len);
}
}
}
/*
* Fire off all allocated requests!
*/
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
struct g_consumer *cp;
TAILQ_REMOVE(&queue, cbp, bio_queue);
cp = cbp->bio_caller2;
cbp->bio_caller2 = NULL;
cbp->bio_to = cp->provider;
G_SHSEC_LOGREQ(2, cbp, "Sending request.");
g_io_request(cbp, cp);
}
return;
failure:
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
bp->bio_children--;
if (cbp->bio_data != NULL) {
explicit_bzero(cbp->bio_data, cbp->bio_length);
uma_zfree(g_shsec_zone, cbp->bio_data);
}
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = error;
g_io_deliver(bp, bp->bio_error);
}
static void
g_shsec_check_and_run(struct g_shsec_softc *sc)
{
off_t mediasize, ms;
u_int no, sectorsize = 0;
if (g_shsec_nvalid(sc) != sc->sc_ndisks)
return;
sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name);
/*
* Find the smallest disk.
*/
mediasize = sc->sc_disks[0]->provider->mediasize;
mediasize -= sc->sc_disks[0]->provider->sectorsize;
sectorsize = sc->sc_disks[0]->provider->sectorsize;
for (no = 1; no < sc->sc_ndisks; no++) {
ms = sc->sc_disks[no]->provider->mediasize;
ms -= sc->sc_disks[no]->provider->sectorsize;
if (ms < mediasize)
mediasize = ms;
sectorsize = lcm(sectorsize,
sc->sc_disks[no]->provider->sectorsize);
}
sc->sc_provider->sectorsize = sectorsize;
sc->sc_provider->mediasize = mediasize;
g_error_provider(sc->sc_provider, 0);
G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name);
}
static int
g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL)
return (error);
/* Decode metadata. */
shsec_metadata_decode(buf, md);
g_free(buf);
return (0);
}
/*
* Add disk to given device.
*/
static int
g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no)
{
struct g_consumer *cp, *fcp;
struct g_geom *gp;
struct g_shsec_metadata md;
int error;
/* Metadata corrupted? */
if (no >= sc->sc_ndisks)
return (EINVAL);
/* Check if disk is not already attached. */
if (sc->sc_disks[no] != NULL)
return (EEXIST);
gp = sc->sc_geom;
fcp = LIST_FIRST(&gp->consumer);
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
g_destroy_consumer(cp);
return (error);
}
if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
}
/* Reread metadata. */
error = g_shsec_read_metadata(cp, &md);
if (error != 0)
goto fail;
if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 ||
strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) {
G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name);
goto fail;
}
cp->private = sc;
cp->index = no;
sc->sc_disks[no] = cp;
G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
g_shsec_check_and_run(sc);
return (0);
fail:
if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
static struct g_geom *
g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md)
{
struct g_shsec_softc *sc;
struct g_geom *gp;
u_int no;
G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
/* Two disks is minimum. */
if (md->md_all < 2) {
G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name);
return (NULL);
}
/* Check for duplicate unit */
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
G_SHSEC_DEBUG(0, "Device %s already configured.",
sc->sc_name);
return (NULL);
}
}
gp = g_new_geomf(mp, "%s", md->md_name);
sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO);
gp->start = g_shsec_start;
gp->spoiled = g_shsec_orphan;
gp->orphan = g_shsec_orphan;
gp->access = g_shsec_access;
gp->dumpconf = g_shsec_dumpconf;
sc->sc_id = md->md_id;
sc->sc_ndisks = md->md_all;
sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
M_SHSEC, M_WAITOK | M_ZERO);
for (no = 0; no < sc->sc_ndisks; no++)
sc->sc_disks[no] = NULL;
gp->softc = sc;
sc->sc_geom = gp;
sc->sc_provider = NULL;
G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
return (gp);
}
static int
g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force)
{
struct g_provider *pp;
struct g_geom *gp;
u_int no;
g_topology_assert();
if (sc == NULL)
return (ENXIO);
pp = sc->sc_provider;
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
if (force) {
G_SHSEC_DEBUG(0, "Device %s is still open, so it "
"can't be definitely removed.", pp->name);
} else {
G_SHSEC_DEBUG(1,
"Device %s is still open (r%dw%de%d).", pp->name,
pp->acr, pp->acw, pp->ace);
return (EBUSY);
}
}
for (no = 0; no < sc->sc_ndisks; no++) {
if (sc->sc_disks[no] != NULL)
g_shsec_remove_disk(sc->sc_disks[no]);
}
gp = sc->sc_geom;
gp->softc = NULL;
KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
gp->name));
free(sc->sc_disks, M_SHSEC);
free(sc, M_SHSEC);
pp = LIST_FIRST(&gp->provider);
if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name);
g_wither_geom(gp, ENXIO);
return (0);
}
static int
g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
struct g_geom *gp)
{
struct g_shsec_softc *sc;
sc = gp->softc;
return (g_shsec_destroy(sc, 0));
}
static struct g_geom *
g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_shsec_metadata md;
struct g_shsec_softc *sc;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
g_topology_assert();
/* Skip providers that are already open for writing. */
if (pp->acw > 0)
return (NULL);
G_SHSEC_DEBUG(3, "Tasting %s.", pp->name);
gp = g_new_geomf(mp, "shsec:taste");
gp->start = g_shsec_start;
gp->access = g_shsec_access;
gp->orphan = g_shsec_orphan;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_shsec_read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
gp = NULL;
if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0)
return (NULL);
if (md.md_version > G_SHSEC_VERSION) {
G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n",
pp->name);
return (NULL);
}
/*
* Backward compatibility:
*/
/* There was no md_provsize field in earlier versions of metadata. */
if (md.md_version < 1)
md.md_provsize = pp->mediasize;
if (md.md_provider[0] != '\0' &&
!g_compare_names(md.md_provider, pp->name))
return (NULL);
if (md.md_provsize != pp->mediasize)
return (NULL);
/*
* Let's check if device already exists.
*/
sc = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (strcmp(md.md_name, sc->sc_name) != 0)
continue;
if (md.md_id != sc->sc_id)
continue;
break;
}
if (gp != NULL) {
G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
error = g_shsec_add_disk(sc, pp, md.md_no);
if (error != 0) {
G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
pp->name, gp->name, error);
return (NULL);
}
} else {
gp = g_shsec_create(mp, &md);
if (gp == NULL) {
G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name);
return (NULL);
}
sc = gp->softc;
G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
error = g_shsec_add_disk(sc, pp, md.md_no);
if (error != 0) {
G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
pp->name, gp->name, error);
g_shsec_destroy(sc, 1);
return (NULL);
}
}
return (gp);
}
static struct g_shsec_softc *
g_shsec_find_device(struct g_class *mp, const char *name)
{
struct g_shsec_softc *sc;
struct g_geom *gp;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (strcmp(sc->sc_name, name) == 0)
return (sc);
}
return (NULL);
}
static void
g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
struct g_shsec_softc *sc;
int *force, *nargs, error;
const char *name;
char param[16];
u_int i;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument.", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force == NULL) {
gctl_error(req, "No '%s' argument.", "force");
return;
}
for (i = 0; i < (u_int)*nargs; i++) {
snprintf(param, sizeof(param), "arg%u", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%u' argument.", i);
return;
}
sc = g_shsec_find_device(mp, name);
if (sc == NULL) {
gctl_error(req, "No such device: %s.", name);
return;
}
error = g_shsec_destroy(sc, *force);
if (error != 0) {
gctl_error(req, "Cannot destroy device %s (error=%d).",
sc->sc_name, error);
return;
}
}
}
static void
g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "No '%s' argument.", "version");
return;
}
if (*version != G_SHSEC_VERSION) {
gctl_error(req, "Userland and kernel parts are out of sync.");
return;
}
if (strcmp(verb, "stop") == 0) {
g_shsec_ctl_destroy(req, mp);
return;
}
gctl_error(req, "Unknown verb.");
}
static void
g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_shsec_softc *sc;
sc = gp->softc;
if (sc == NULL)
return;
if (pp != NULL) {
/* Nothing here. */
} else if (cp != NULL) {
sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
(u_int)cp->index);
} else {
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
indent, sc->sc_ndisks, g_shsec_nvalid(sc));
sbuf_printf(sb, "%s<State>", indent);
if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
sbuf_printf(sb, "UP");
else
sbuf_printf(sb, "DOWN");
sbuf_printf(sb, "</State>\n");
}
}
DECLARE_GEOM_CLASS(g_shsec_class, g_shsec);
MODULE_VERSION(geom_shsec, 0);
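A note on the write path in g_shsec_start()/g_shsec_done() above: component 0 receives the plaintext XOR-ed with freshly generated random words for every other component, and each remaining component stores only its random words, so a sector can be reconstructed solely by XOR-ing all components together, which is exactly what the read path does. A minimal user-space sketch of that split/reconstruct scheme, assuming arc4random() from FreeBSD libc and hypothetical NDISKS/SECLEN values:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define	NDISKS	3		/* hypothetical number of components */
#define	SECLEN	512		/* hypothetical sector size in bytes */

/* Split: shares[1..n-1] are random, shares[0] = data ^ shares[1] ^ ... */
static void
shsec_split(const uint32_t *data, uint32_t shares[NDISKS][SECLEN / 4])
{
	size_t i, no;

	memcpy(shares[0], data, SECLEN);
	for (no = 1; no < NDISKS; no++) {
		for (i = 0; i < SECLEN / 4; i++) {
			shares[no][i] = arc4random();
			shares[0][i] ^= shares[no][i];
		}
	}
}

/* Reconstruct: XOR all components together, as g_shsec_done() does. */
static void
shsec_join(uint32_t shares[NDISKS][SECLEN / 4], uint32_t *data)
{
	size_t i, no;

	memcpy(data, shares[0], SECLEN);
	for (no = 1; no < NDISKS; no++)
		for (i = 0; i < SECLEN / 4; i++)
			data[i] ^= shares[no][i];
}

int
main(void)
{
	uint32_t data[SECLEN / 4], out[SECLEN / 4];
	uint32_t shares[NDISKS][SECLEN / 4];
	size_t i;

	for (i = 0; i < SECLEN / 4; i++)
		data[i] = (uint32_t)i;
	shsec_split(data, shares);
	shsec_join(shares, out);
	assert(memcmp(data, out, SECLEN) == 0);
	return (0);
}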
diff --git a/sys/geom/stripe/g_stripe.c b/sys/geom/stripe/g_stripe.c
index 0a76a1d8a074..22cc16753d9a 100644
--- a/sys/geom/stripe/g_stripe.c
+++ b/sys/geom/stripe/g_stripe.c
@@ -1,1270 +1,1273 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/stripe/g_stripe.h>
FEATURE(geom_stripe, "GEOM striping support");
static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data");
static uma_zone_t g_stripe_zone;
static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force);
static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static g_taste_t g_stripe_taste;
static g_ctl_req_t g_stripe_config;
static g_dumpconf_t g_stripe_dumpconf;
static g_init_t g_stripe_init;
static g_fini_t g_stripe_fini;
struct g_class g_stripe_class = {
.name = G_STRIPE_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_stripe_config,
.taste = g_stripe_taste,
.destroy_geom = g_stripe_destroy_geom,
.init = g_stripe_init,
.fini = g_stripe_fini
};
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_STRIPE stuff");
static u_int g_stripe_debug = 0;
SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0,
"Debug level");
static int g_stripe_fast = 0;
static int
g_sysctl_stripe_fast(SYSCTL_HANDLER_ARGS)
{
int error, fast;
fast = g_stripe_fast;
error = sysctl_handle_int(oidp, &fast, 0, req);
if (error == 0 && req->newptr != NULL)
g_stripe_fast = fast;
return (error);
}
SYSCTL_PROC(_kern_geom_stripe, OID_AUTO, fast,
CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, NULL, 0,
g_sysctl_stripe_fast, "I",
"Fast, but memory-consuming, mode");
-static u_int g_stripe_maxmem = MAXPHYS * 100;
-SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_stripe_maxmem,
- 0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)");
+static u_long g_stripe_maxmem;
+SYSCTL_ULONG(_kern_geom_stripe, OID_AUTO, maxmem,
+ CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_stripe_maxmem, 0,
+ "Maximum memory that can be allocated in \"fast\" mode (in bytes)");
static u_int g_stripe_fast_failed = 0;
SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD,
&g_stripe_fast_failed, 0, "How many times \"fast\" mode failed");
/*
* Greatest Common Divisor.
*/
static u_int
gcd(u_int a, u_int b)
{
u_int c;
while (b != 0) {
c = a;
a = b;
b = (c % b);
}
return (a);
}
/*
* Least Common Multiple.
*/
static u_int
lcm(u_int a, u_int b)
{
return ((a * b) / gcd(a, b));
}
static void
g_stripe_init(struct g_class *mp __unused)
{
- g_stripe_zone = uma_zcreate("g_stripe_zone", MAXPHYS, NULL, NULL,
+ g_stripe_maxmem = maxphys * 100;
+ TUNABLE_ULONG_FETCH("kern.geom.stripe.maxmem,", &g_stripe_maxmem);
+ g_stripe_zone = uma_zcreate("g_stripe_zone", maxphys, NULL, NULL,
NULL, NULL, 0, 0);
- g_stripe_maxmem -= g_stripe_maxmem % MAXPHYS;
- uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / MAXPHYS);
+ g_stripe_maxmem -= g_stripe_maxmem % maxphys;
+ uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / maxphys);
}
static void
g_stripe_fini(struct g_class *mp __unused)
{
uma_zdestroy(g_stripe_zone);
}
/*
* Return the number of valid disks.
*/
static u_int
g_stripe_nvalid(struct g_stripe_softc *sc)
{
u_int i, no;
no = 0;
for (i = 0; i < sc->sc_ndisks; i++) {
if (sc->sc_disks[i] != NULL)
no++;
}
return (no);
}
static void
g_stripe_remove_disk(struct g_consumer *cp)
{
struct g_stripe_softc *sc;
g_topology_assert();
KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
sc = (struct g_stripe_softc *)cp->geom->softc;
KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
if (cp->private == NULL) {
G_STRIPE_DEBUG(0, "Disk %s removed from %s.",
cp->provider->name, sc->sc_name);
cp->private = (void *)(uintptr_t)-1;
}
if (sc->sc_provider != NULL) {
G_STRIPE_DEBUG(0, "Device %s deactivated.",
sc->sc_provider->name);
g_wither_provider(sc->sc_provider, ENXIO);
sc->sc_provider = NULL;
}
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
return;
sc->sc_disks[cp->index] = NULL;
cp->index = 0;
g_detach(cp);
g_destroy_consumer(cp);
/* If there are no valid disks anymore, remove device. */
if (LIST_EMPTY(&sc->sc_geom->consumer))
g_stripe_destroy(sc, 1);
}
static void
g_stripe_orphan(struct g_consumer *cp)
{
struct g_stripe_softc *sc;
struct g_geom *gp;
g_topology_assert();
gp = cp->geom;
sc = gp->softc;
if (sc == NULL)
return;
g_stripe_remove_disk(cp);
}
static int
g_stripe_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_consumer *cp1, *cp2, *tmp;
struct g_stripe_softc *sc;
struct g_geom *gp;
int error;
g_topology_assert();
gp = pp->geom;
sc = gp->softc;
KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
/* On first open, grab an extra "exclusive" bit */
if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
de++;
/* ... and let go of it on last close */
if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
de--;
LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
error = g_access(cp1, dr, dw, de);
if (error != 0)
goto fail;
if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
cp1->private != NULL) {
g_stripe_remove_disk(cp1); /* May destroy geom. */
}
}
return (0);
fail:
LIST_FOREACH(cp2, &gp->consumer, consumer) {
if (cp1 == cp2)
break;
g_access(cp2, -dr, -dw, -de);
}
return (error);
}
static void
g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset,
off_t length, int mode)
{
off_t stripesize;
size_t len;
stripesize = sc->sc_stripesize;
len = (size_t)(stripesize - (offset & (stripesize - 1)));
do {
bcopy(src, dst, len);
if (mode) {
dst += len + stripesize * (sc->sc_ndisks - 1);
src += len;
} else {
dst += len;
src += len + stripesize * (sc->sc_ndisks - 1);
}
length -= len;
KASSERT(length >= 0,
("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).",
(uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length));
if (length > stripesize)
len = stripesize;
else
len = length;
} while (length > 0);
}
static void
g_stripe_done(struct bio *bp)
{
struct g_stripe_softc *sc;
struct bio *pbp;
pbp = bp->bio_parent;
sc = pbp->bio_to->geom->softc;
if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) {
g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset,
bp->bio_length, 1);
bp->bio_data = bp->bio_caller1;
bp->bio_caller1 = NULL;
}
mtx_lock(&sc->sc_lock);
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
pbp->bio_completed += bp->bio_completed;
pbp->bio_inbed++;
if (pbp->bio_children == pbp->bio_inbed) {
mtx_unlock(&sc->sc_lock);
if (pbp->bio_driver1 != NULL)
uma_zfree(g_stripe_zone, pbp->bio_driver1);
if (bp->bio_cmd == BIO_SPEEDUP)
pbp->bio_completed = pbp->bio_length;
g_io_deliver(pbp, pbp->bio_error);
} else
mtx_unlock(&sc->sc_lock);
g_destroy_bio(bp);
}
static int
g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length)
{
TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
struct g_stripe_softc *sc;
char *addr, *data = NULL;
struct bio *cbp;
off_t stripesize;
u_int nparts = 0;
int error;
sc = bp->bio_to->geom->softc;
addr = bp->bio_data;
stripesize = sc->sc_stripesize;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
error = ENOMEM;
goto failure;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
nparts++;
/*
* Fill in the component buf structure.
*/
cbp->bio_done = g_stripe_done;
cbp->bio_offset = offset;
cbp->bio_data = addr;
cbp->bio_caller1 = NULL;
cbp->bio_length = length;
cbp->bio_caller2 = sc->sc_disks[no];
/* offset -= offset % stripesize; */
offset -= offset & (stripesize - 1);
addr += length;
length = bp->bio_length - length;
for (no++; length > 0; no++, length -= stripesize, addr += stripesize) {
if (no > sc->sc_ndisks - 1) {
no = 0;
offset += stripesize;
}
if (nparts >= sc->sc_ndisks) {
cbp = TAILQ_NEXT(cbp, bio_queue);
if (cbp == NULL)
cbp = TAILQ_FIRST(&queue);
nparts++;
/*
* Update bio structure.
*/
/*
* MIN() is in case when
* (bp->bio_length % sc->sc_stripesize) != 0.
*/
cbp->bio_length += MIN(stripesize, length);
if (cbp->bio_caller1 == NULL) {
cbp->bio_caller1 = cbp->bio_data;
cbp->bio_data = NULL;
if (data == NULL) {
data = uma_zalloc(g_stripe_zone,
M_NOWAIT);
if (data == NULL) {
error = ENOMEM;
goto failure;
}
}
}
} else {
cbp = g_clone_bio(bp);
if (cbp == NULL) {
error = ENOMEM;
goto failure;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
nparts++;
/*
* Fill in the component buf structure.
*/
cbp->bio_done = g_stripe_done;
cbp->bio_offset = offset;
cbp->bio_data = addr;
cbp->bio_caller1 = NULL;
/*
* MIN() is in case when
* (bp->bio_length % sc->sc_stripesize) != 0.
*/
cbp->bio_length = MIN(stripesize, length);
cbp->bio_caller2 = sc->sc_disks[no];
}
}
if (data != NULL)
bp->bio_driver1 = data;
/*
* Fire off all allocated requests!
*/
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
struct g_consumer *cp;
TAILQ_REMOVE(&queue, cbp, bio_queue);
cp = cbp->bio_caller2;
cbp->bio_caller2 = NULL;
cbp->bio_to = cp->provider;
if (cbp->bio_caller1 != NULL) {
cbp->bio_data = data;
if (bp->bio_cmd == BIO_WRITE) {
g_stripe_copy(sc, cbp->bio_caller1, data,
cbp->bio_offset, cbp->bio_length, 0);
}
data += cbp->bio_length;
}
G_STRIPE_LOGREQ(cbp, "Sending request.");
g_io_request(cbp, cp);
}
return (0);
failure:
if (data != NULL)
uma_zfree(g_stripe_zone, data);
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
if (cbp->bio_caller1 != NULL) {
cbp->bio_data = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
}
bp->bio_children--;
g_destroy_bio(cbp);
}
return (error);
}
static int
g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length)
{
TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
struct g_stripe_softc *sc;
off_t stripesize;
struct bio *cbp;
char *addr;
int error;
sc = bp->bio_to->geom->softc;
stripesize = sc->sc_stripesize;
cbp = g_clone_bio(bp);
if (cbp == NULL) {
error = ENOMEM;
goto failure;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
/*
* Fill in the component buf structure.
*/
if (bp->bio_length == length)
cbp->bio_done = g_std_done; /* Optimized lockless case. */
else
cbp->bio_done = g_stripe_done;
cbp->bio_offset = offset;
cbp->bio_length = length;
if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
bp->bio_ma_n = round_page(bp->bio_ma_offset +
bp->bio_length) / PAGE_SIZE;
addr = NULL;
} else
addr = bp->bio_data;
cbp->bio_caller2 = sc->sc_disks[no];
/* offset -= offset % stripesize; */
offset -= offset & (stripesize - 1);
if (bp->bio_cmd != BIO_DELETE)
addr += length;
length = bp->bio_length - length;
for (no++; length > 0; no++, length -= stripesize) {
if (no > sc->sc_ndisks - 1) {
no = 0;
offset += stripesize;
}
cbp = g_clone_bio(bp);
if (cbp == NULL) {
error = ENOMEM;
goto failure;
}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
/*
* Fill in the component buf structure.
*/
cbp->bio_done = g_stripe_done;
cbp->bio_offset = offset;
/*
* MIN() is in case when
* (bp->bio_length % sc->sc_stripesize) != 0.
*/
cbp->bio_length = MIN(stripesize, length);
if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
cbp->bio_ma_offset += (uintptr_t)addr;
cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
cbp->bio_ma_offset %= PAGE_SIZE;
cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
cbp->bio_length) / PAGE_SIZE;
} else
cbp->bio_data = addr;
cbp->bio_caller2 = sc->sc_disks[no];
if (bp->bio_cmd != BIO_DELETE)
addr += stripesize;
}
/*
* Fire off all allocated requests!
*/
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
struct g_consumer *cp;
TAILQ_REMOVE(&queue, cbp, bio_queue);
cp = cbp->bio_caller2;
cbp->bio_caller2 = NULL;
cbp->bio_to = cp->provider;
G_STRIPE_LOGREQ(cbp, "Sending request.");
g_io_request(cbp, cp);
}
return (0);
failure:
while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, cbp, bio_queue);
bp->bio_children--;
g_destroy_bio(cbp);
}
return (error);
}
static void
g_stripe_pushdown(struct g_stripe_softc *sc, struct bio *bp)
{
struct bio_queue_head queue;
struct g_consumer *cp;
struct bio *cbp;
u_int no;
bioq_init(&queue);
for (no = 0; no < sc->sc_ndisks; no++) {
cbp = g_clone_bio(bp);
if (cbp == NULL) {
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error);
return;
}
bioq_insert_tail(&queue, cbp);
cbp->bio_done = g_stripe_done;
cbp->bio_caller2 = sc->sc_disks[no];
cbp->bio_to = sc->sc_disks[no]->provider;
}
for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
G_STRIPE_LOGREQ(cbp, "Sending request.");
cp = cbp->bio_caller2;
cbp->bio_caller2 = NULL;
g_io_request(cbp, cp);
}
}
static void
g_stripe_start(struct bio *bp)
{
off_t offset, start, length, nstripe, stripesize;
struct g_stripe_softc *sc;
u_int no;
int error, fast = 0;
sc = bp->bio_to->geom->softc;
/*
* If sc == NULL, provider's error should be set and g_stripe_start()
* should not be called at all.
*/
KASSERT(sc != NULL,
("Provider's error should be set (error=%d)(device=%s).",
bp->bio_to->error, bp->bio_to->name));
G_STRIPE_LOGREQ(bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
break;
case BIO_SPEEDUP:
case BIO_FLUSH:
g_stripe_pushdown(sc, bp);
return;
case BIO_GETATTR:
/* To which provider should it be delivered? */
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
stripesize = sc->sc_stripesize;
/*
* Calculations are quite messy, but fast I hope.
*/
/* Stripe number. */
/* nstripe = bp->bio_offset / stripesize; */
nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits;
/* Disk number. */
no = nstripe % sc->sc_ndisks;
/* Start position in stripe. */
/* start = bp->bio_offset % stripesize; */
start = bp->bio_offset & (stripesize - 1);
/* Start position in disk. */
/* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */
offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start;
/* Length of data to operate. */
length = MIN(bp->bio_length, stripesize - start);
/*
* Do use "fast" mode when:
* 1. "Fast" mode is ON.
* and
- * 2. Request size is less than or equal to MAXPHYS,
+ * 2. Request size is less than or equal to maxphys,
* which should always be true.
* and
* 3. Request size is bigger than stripesize * ndisks. If it isn't,
* there will be no need to send more than one I/O request to
* a provider, so there is nothing to optimize.
* and
* 4. Request is not unmapped.
* and
* 5. It is not a BIO_DELETE.
*/
- if (g_stripe_fast && bp->bio_length <= MAXPHYS &&
+ if (g_stripe_fast && bp->bio_length <= maxphys &&
bp->bio_length >= stripesize * sc->sc_ndisks &&
(bp->bio_flags & BIO_UNMAPPED) == 0 &&
bp->bio_cmd != BIO_DELETE) {
fast = 1;
}
error = 0;
if (fast) {
error = g_stripe_start_fast(bp, no, offset, length);
if (error != 0)
g_stripe_fast_failed++;
}
/*
* Do use "economic" when:
* 1. "Economic" mode is ON.
* or
* 2. "Fast" mode failed. It can only fail if there is no memory.
*/
if (!fast || error != 0)
error = g_stripe_start_economic(bp, no, offset, length);
if (error != 0) {
if (bp->bio_error == 0)
bp->bio_error = error;
g_io_deliver(bp, bp->bio_error);
}
}
static void
g_stripe_check_and_run(struct g_stripe_softc *sc)
{
struct g_provider *dp;
off_t mediasize, ms;
u_int no, sectorsize = 0;
g_topology_assert();
if (g_stripe_nvalid(sc) != sc->sc_ndisks)
return;
sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s",
sc->sc_name);
sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
if (g_stripe_fast == 0)
sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED;
/*
* Find the smallest disk.
*/
mediasize = sc->sc_disks[0]->provider->mediasize;
if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
mediasize -= sc->sc_disks[0]->provider->sectorsize;
mediasize -= mediasize % sc->sc_stripesize;
sectorsize = sc->sc_disks[0]->provider->sectorsize;
for (no = 1; no < sc->sc_ndisks; no++) {
dp = sc->sc_disks[no]->provider;
ms = dp->mediasize;
if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
ms -= dp->sectorsize;
ms -= ms % sc->sc_stripesize;
if (ms < mediasize)
mediasize = ms;
sectorsize = lcm(sectorsize, dp->sectorsize);
/* A provider underneath us doesn't support unmapped */
if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
G_STRIPE_DEBUG(1, "Cancelling unmapped "
"because of %s.", dp->name);
sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED;
}
}
sc->sc_provider->sectorsize = sectorsize;
sc->sc_provider->mediasize = mediasize * sc->sc_ndisks;
sc->sc_provider->stripesize = sc->sc_stripesize;
sc->sc_provider->stripeoffset = 0;
g_error_provider(sc->sc_provider, 0);
G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
}
static int
g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md)
{
struct g_provider *pp;
u_char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
g_topology_unlock();
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL)
return (error);
/* Decode metadata. */
stripe_metadata_decode(buf, md);
g_free(buf);
return (0);
}
/*
* Add disk to given device.
*/
static int
g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no)
{
struct g_consumer *cp, *fcp;
struct g_geom *gp;
int error;
g_topology_assert();
/* Metadata corrupted? */
if (no >= sc->sc_ndisks)
return (EINVAL);
/* Check if disk is not already attached. */
if (sc->sc_disks[no] != NULL)
return (EEXIST);
gp = sc->sc_geom;
fcp = LIST_FIRST(&gp->consumer);
cp = g_new_consumer(gp);
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
cp->private = NULL;
cp->index = no;
error = g_attach(cp, pp);
if (error != 0) {
g_destroy_consumer(cp);
return (error);
}
if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
}
if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) {
struct g_stripe_metadata md;
/* Reread metadata. */
error = g_stripe_read_metadata(cp, &md);
if (error != 0)
goto fail;
if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 ||
strcmp(md.md_name, sc->sc_name) != 0 ||
md.md_id != sc->sc_id) {
G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name);
goto fail;
}
}
sc->sc_disks[no] = cp;
G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
g_stripe_check_and_run(sc);
return (0);
fail:
if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
static struct g_geom *
g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md,
u_int type)
{
struct g_stripe_softc *sc;
struct g_geom *gp;
u_int no;
g_topology_assert();
G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
md->md_id);
/* Two disks is minimum. */
if (md->md_all < 2) {
G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name);
return (NULL);
}
#if 0
/* Stripe size has to be greater than or equal to sector size. */
if (md->md_stripesize < sectorsize) {
G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
return (NULL);
}
#endif
/* Stripe size has to be a power of 2. */
if (!powerof2(md->md_stripesize)) {
G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
return (NULL);
}
/* Check for duplicate unit */
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
G_STRIPE_DEBUG(0, "Device %s already configured.",
sc->sc_name);
return (NULL);
}
}
gp = g_new_geomf(mp, "%s", md->md_name);
sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO);
gp->start = g_stripe_start;
gp->spoiled = g_stripe_orphan;
gp->orphan = g_stripe_orphan;
gp->access = g_stripe_access;
gp->dumpconf = g_stripe_dumpconf;
sc->sc_id = md->md_id;
sc->sc_stripesize = md->md_stripesize;
sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1);
sc->sc_ndisks = md->md_all;
sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
M_STRIPE, M_WAITOK | M_ZERO);
for (no = 0; no < sc->sc_ndisks; no++)
sc->sc_disks[no] = NULL;
sc->sc_type = type;
mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF);
gp->softc = sc;
sc->sc_geom = gp;
sc->sc_provider = NULL;
G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
return (gp);
}
static int
g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force)
{
struct g_provider *pp;
struct g_consumer *cp, *cp1;
struct g_geom *gp;
g_topology_assert();
if (sc == NULL)
return (ENXIO);
pp = sc->sc_provider;
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
if (force) {
G_STRIPE_DEBUG(0, "Device %s is still open, so it "
"can't be definitely removed.", pp->name);
} else {
G_STRIPE_DEBUG(1,
"Device %s is still open (r%dw%de%d).", pp->name,
pp->acr, pp->acw, pp->ace);
return (EBUSY);
}
}
gp = sc->sc_geom;
LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
g_stripe_remove_disk(cp);
if (cp1 == NULL)
return (0); /* Recursion happened. */
}
if (!LIST_EMPTY(&gp->consumer))
return (EINPROGRESS);
gp->softc = NULL;
KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
gp->name));
free(sc->sc_disks, M_STRIPE);
mtx_destroy(&sc->sc_lock);
free(sc, M_STRIPE);
G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name);
g_wither_geom(gp, ENXIO);
return (0);
}
static int
g_stripe_destroy_geom(struct gctl_req *req __unused,
struct g_class *mp __unused, struct g_geom *gp)
{
struct g_stripe_softc *sc;
sc = gp->softc;
return (g_stripe_destroy(sc, 0));
}
static struct g_geom *
g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_stripe_metadata md;
struct g_stripe_softc *sc;
struct g_consumer *cp;
struct g_geom *gp;
int error;
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
g_topology_assert();
/* Skip providers that are already open for writing. */
if (pp->acw > 0)
return (NULL);
G_STRIPE_DEBUG(3, "Tasting %s.", pp->name);
gp = g_new_geomf(mp, "stripe:taste");
gp->start = g_stripe_start;
gp->access = g_stripe_access;
gp->orphan = g_stripe_orphan;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = g_stripe_read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
gp = NULL;
if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0)
return (NULL);
if (md.md_version > G_STRIPE_VERSION) {
printf("geom_stripe.ko module is too old to handle %s.\n",
pp->name);
return (NULL);
}
/*
* Backward compatibility:
*/
/* There was no md_provider field in earlier versions of metadata. */
if (md.md_version < 2)
bzero(md.md_provider, sizeof(md.md_provider));
/* There was no md_provsize field in earlier versions of metadata. */
if (md.md_version < 3)
md.md_provsize = pp->mediasize;
if (md.md_provider[0] != '\0' &&
!g_compare_names(md.md_provider, pp->name))
return (NULL);
if (md.md_provsize != pp->mediasize)
return (NULL);
/*
* Let's check if device already exists.
*/
sc = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC)
continue;
if (strcmp(md.md_name, sc->sc_name) != 0)
continue;
if (md.md_id != sc->sc_id)
continue;
break;
}
if (gp != NULL) {
G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
error = g_stripe_add_disk(sc, pp, md.md_no);
if (error != 0) {
G_STRIPE_DEBUG(0,
"Cannot add disk %s to %s (error=%d).", pp->name,
gp->name, error);
return (NULL);
}
} else {
gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC);
if (gp == NULL) {
G_STRIPE_DEBUG(0, "Cannot create device %s.",
md.md_name);
return (NULL);
}
sc = gp->softc;
G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
error = g_stripe_add_disk(sc, pp, md.md_no);
if (error != 0) {
G_STRIPE_DEBUG(0,
"Cannot add disk %s to %s (error=%d).", pp->name,
gp->name, error);
g_stripe_destroy(sc, 1);
return (NULL);
}
}
return (gp);
}
static void
g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp)
{
u_int attached, no;
struct g_stripe_metadata md;
struct g_provider *pp;
struct g_stripe_softc *sc;
struct g_geom *gp;
struct sbuf *sb;
off_t *stripesize;
const char *name;
char param[16];
int *nargs;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument.", "nargs");
return;
}
if (*nargs <= 2) {
gctl_error(req, "Too few arguments.");
return;
}
strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic));
md.md_version = G_STRIPE_VERSION;
name = gctl_get_asciiparam(req, "arg0");
if (name == NULL) {
gctl_error(req, "No 'arg%u' argument.", 0);
return;
}
strlcpy(md.md_name, name, sizeof(md.md_name));
md.md_id = arc4random();
md.md_no = 0;
md.md_all = *nargs - 1;
stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
if (stripesize == NULL) {
gctl_error(req, "No '%s' argument.", "stripesize");
return;
}
md.md_stripesize = (uint32_t)*stripesize;
bzero(md.md_provider, sizeof(md.md_provider));
/* This field is not important here. */
md.md_provsize = 0;
/* Check all providers are valid */
for (no = 1; no < *nargs; no++) {
snprintf(param, sizeof(param), "arg%u", no);
pp = gctl_get_provider(req, param);
if (pp == NULL)
return;
}
gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL);
if (gp == NULL) {
gctl_error(req, "Can't configure %s.", md.md_name);
return;
}
sc = gp->softc;
sb = sbuf_new_auto();
sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
for (attached = 0, no = 1; no < *nargs; no++) {
snprintf(param, sizeof(param), "arg%u", no);
pp = gctl_get_provider(req, param);
if (pp == NULL) {
name = gctl_get_asciiparam(req, param);
MPASS(name != NULL);
sbuf_printf(sb, " %s", name);
continue;
}
if (g_stripe_add_disk(sc, pp, no - 1) != 0) {
G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.",
no, pp->name, gp->name);
sbuf_printf(sb, " %s", pp->name);
continue;
}
attached++;
}
sbuf_finish(sb);
if (md.md_all != attached) {
g_stripe_destroy(gp->softc, 1);
gctl_error(req, "%s", sbuf_data(sb));
}
sbuf_delete(sb);
}
static struct g_stripe_softc *
g_stripe_find_device(struct g_class *mp, const char *name)
{
struct g_stripe_softc *sc;
struct g_geom *gp;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (strcmp(sc->sc_name, name) == 0)
return (sc);
}
return (NULL);
}
static void
g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
struct g_stripe_softc *sc;
int *force, *nargs, error;
const char *name;
char param[16];
u_int i;
g_topology_assert();
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "No '%s' argument.", "nargs");
return;
}
if (*nargs <= 0) {
gctl_error(req, "Missing device(s).");
return;
}
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force == NULL) {
gctl_error(req, "No '%s' argument.", "force");
return;
}
for (i = 0; i < (u_int)*nargs; i++) {
snprintf(param, sizeof(param), "arg%u", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%u' argument.", i);
return;
}
sc = g_stripe_find_device(mp, name);
if (sc == NULL) {
gctl_error(req, "No such device: %s.", name);
return;
}
error = g_stripe_destroy(sc, *force);
if (error != 0) {
gctl_error(req, "Cannot destroy device %s (error=%d).",
sc->sc_name, error);
return;
}
}
}
static void
g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "No '%s' argument.", "version");
return;
}
if (*version != G_STRIPE_VERSION) {
gctl_error(req, "Userland and kernel parts are out of sync.");
return;
}
if (strcmp(verb, "create") == 0) {
g_stripe_ctl_create(req, mp);
return;
} else if (strcmp(verb, "destroy") == 0 ||
strcmp(verb, "stop") == 0) {
g_stripe_ctl_destroy(req, mp);
return;
}
gctl_error(req, "Unknown verb.");
}
static void
g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_stripe_softc *sc;
sc = gp->softc;
if (sc == NULL)
return;
if (pp != NULL) {
/* Nothing here. */
} else if (cp != NULL) {
sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
(u_int)cp->index);
} else {
sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
sbuf_printf(sb, "%s<Stripesize>%ju</Stripesize>\n", indent,
(uintmax_t)sc->sc_stripesize);
sbuf_printf(sb, "%s<Type>", indent);
switch (sc->sc_type) {
case G_STRIPE_TYPE_AUTOMATIC:
sbuf_cat(sb, "AUTOMATIC");
break;
case G_STRIPE_TYPE_MANUAL:
sbuf_cat(sb, "MANUAL");
break;
default:
sbuf_cat(sb, "UNKNOWN");
break;
}
sbuf_cat(sb, "</Type>\n");
sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
indent, sc->sc_ndisks, g_stripe_nvalid(sc));
sbuf_printf(sb, "%s<State>", indent);
if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
sbuf_cat(sb, "UP");
else
sbuf_cat(sb, "DOWN");
sbuf_cat(sb, "</State>\n");
}
}
DECLARE_GEOM_CLASS(g_stripe_class, g_stripe);
MODULE_VERSION(geom_stripe, 0);
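For reference, the bit arithmetic in g_stripe_start() above maps a request's byte offset to a component disk and an offset within that disk; because the stripe size is a power of two, the modulo and divide are replaced by a mask and a shift. A minimal sketch of the same mapping, using hypothetical geometry values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical geometry: 4 disks, 64 KB stripes (stripebits = 16). */
	const uint64_t stripesize = 64 * 1024;
	const unsigned stripebits = 16;
	const unsigned ndisks = 4;
	const uint64_t bio_offset = 1000000;	/* arbitrary request offset */

	uint64_t nstripe = bio_offset >> stripebits;	/* stripe number */
	unsigned no = nstripe % ndisks;			/* disk number */
	uint64_t start = bio_offset & (stripesize - 1);	/* offset in stripe */
	uint64_t offset = ((nstripe / ndisks) << stripebits) + start;

	/* Prints "disk 3, offset 213568" for the values above. */
	printf("disk %u, offset %ju\n", no, (uintmax_t)offset);
	return (0);
}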
diff --git a/sys/geom/uzip/g_uzip.c b/sys/geom/uzip/g_uzip.c
index bbb6b3c215f8..333f16910c5a 100644
--- a/sys/geom/uzip/g_uzip.c
+++ b/sys/geom/uzip/g_uzip.c
@@ -1,992 +1,992 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004 Max Khon
* Copyright (c) 2014 Juniper Networks, Inc.
* Copyright (c) 2006-2016 Maxim Sobolev <sobomax@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_geom.h"
#include "opt_zstdio.h"
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/kthread.h>
#include <geom/geom.h>
#include <geom/uzip/g_uzip.h>
#include <geom/uzip/g_uzip_cloop.h>
#include <geom/uzip/g_uzip_softc.h>
#include <geom/uzip/g_uzip_dapi.h>
#include <geom/uzip/g_uzip_zlib.h>
#include <geom/uzip/g_uzip_lzma.h>
#ifdef ZSTDIO
#include <geom/uzip/g_uzip_zstd.h>
#endif
#include <geom/uzip/g_uzip_wrkthr.h>
MALLOC_DEFINE(M_GEOM_UZIP, "geom_uzip", "GEOM UZIP data structures");
FEATURE(geom_uzip, "GEOM read-only compressed disks support");
struct g_uzip_blk {
uint64_t offset;
uint32_t blen;
unsigned char last:1;
unsigned char padded:1;
#define BLEN_UNDEF UINT32_MAX
};
#ifndef ABS
#define ABS(a) ((a) < 0 ? -(a) : (a))
#endif
#define BLK_IN_RANGE(mcn, bcn, ilen) \
(((bcn) != BLEN_UNDEF) && ( \
((ilen) >= 0 && (mcn >= bcn) && (mcn <= ((intmax_t)(bcn) + (ilen)))) || \
((ilen) < 0 && (mcn <= bcn) && (mcn >= ((intmax_t)(bcn) + (ilen)))) \
))
#ifdef GEOM_UZIP_DEBUG
# define GEOM_UZIP_DBG_DEFAULT 3
#else
# define GEOM_UZIP_DBG_DEFAULT 0
#endif
#define GUZ_DBG_ERR 1
#define GUZ_DBG_INFO 2
#define GUZ_DBG_IO 3
#define GUZ_DBG_TOC 4
#define GUZ_DEV_SUFX ".uzip"
#define GUZ_DEV_NAME(p) (p GUZ_DEV_SUFX)
static char g_uzip_attach_to[MAXPATHLEN] = {"*"};
static char g_uzip_noattach_to[MAXPATHLEN] = {GUZ_DEV_NAME("*")};
TUNABLE_STR("kern.geom.uzip.attach_to", g_uzip_attach_to,
sizeof(g_uzip_attach_to));
TUNABLE_STR("kern.geom.uzip.noattach_to", g_uzip_noattach_to,
sizeof(g_uzip_noattach_to));
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, uzip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_UZIP stuff");
static u_int g_uzip_debug = GEOM_UZIP_DBG_DEFAULT;
SYSCTL_UINT(_kern_geom_uzip, OID_AUTO, debug, CTLFLAG_RWTUN, &g_uzip_debug, 0,
"Debug level (0-4)");
static u_int g_uzip_debug_block = BLEN_UNDEF;
SYSCTL_UINT(_kern_geom_uzip, OID_AUTO, debug_block, CTLFLAG_RWTUN,
&g_uzip_debug_block, 0, "Debug operations around specific cluster#");
#define DPRINTF(lvl, a) \
if ((lvl) <= g_uzip_debug) { \
printf a; \
}
#define DPRINTF_BLK(lvl, cn, a) \
if ((lvl) <= g_uzip_debug || \
BLK_IN_RANGE(cn, g_uzip_debug_block, 8) || \
BLK_IN_RANGE(cn, g_uzip_debug_block, -8)) { \
printf a; \
}
#define DPRINTF_BRNG(lvl, bcn, ecn, a) \
KASSERT(bcn < ecn, ("DPRINTF_BRNG: invalid range (%ju, %ju)", \
(uintmax_t)bcn, (uintmax_t)ecn)); \
if (((lvl) <= g_uzip_debug) || \
BLK_IN_RANGE(g_uzip_debug_block, bcn, \
(intmax_t)ecn - (intmax_t)bcn)) { \
printf a; \
}
#define UZIP_CLASS_NAME "UZIP"
/*
* Maximum allowed valid block size (to prevent foot-shooting)
*/
-#define MAX_BLKSZ (MAXPHYS)
+#define MAX_BLKSZ (maxphys)
static char CLOOP_MAGIC_START[] = "#!/bin/sh\n";
static void g_uzip_read_done(struct bio *bp);
static void g_uzip_do(struct g_uzip_softc *, struct bio *bp);
static void
g_uzip_softc_free(struct g_geom *gp)
{
struct g_uzip_softc *sc = gp->softc;
DPRINTF(GUZ_DBG_INFO, ("%s: %d requests, %d cached\n",
gp->name, sc->req_total, sc->req_cached));
mtx_lock(&sc->queue_mtx);
sc->wrkthr_flags |= GUZ_SHUTDOWN;
wakeup(sc);
while (!(sc->wrkthr_flags & GUZ_EXITING)) {
msleep(sc->procp, &sc->queue_mtx, PRIBIO, "guzfree",
hz / 10);
}
mtx_unlock(&sc->queue_mtx);
sc->dcp->free(sc->dcp);
free(sc->toc, M_GEOM_UZIP);
mtx_destroy(&sc->queue_mtx);
mtx_destroy(&sc->last_mtx);
free(sc->last_buf, M_GEOM_UZIP);
free(sc, M_GEOM_UZIP);
gp->softc = NULL;
}
static int
g_uzip_cached(struct g_geom *gp, struct bio *bp)
{
struct g_uzip_softc *sc;
off_t ofs;
size_t blk, blkofs, usz;
sc = gp->softc;
ofs = bp->bio_offset + bp->bio_completed;
blk = ofs / sc->blksz;
mtx_lock(&sc->last_mtx);
if (blk == sc->last_blk) {
blkofs = ofs % sc->blksz;
usz = sc->blksz - blkofs;
if (bp->bio_resid < usz)
usz = bp->bio_resid;
memcpy(bp->bio_data + bp->bio_completed, sc->last_buf + blkofs,
usz);
sc->req_cached++;
mtx_unlock(&sc->last_mtx);
DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: offset=%jd: got %jd bytes "
"from cache\n", __func__, gp->name, bp, (intmax_t)ofs,
(intmax_t)usz));
bp->bio_completed += usz;
bp->bio_resid -= usz;
if (bp->bio_resid == 0) {
g_io_deliver(bp, 0);
return (1);
}
} else
mtx_unlock(&sc->last_mtx);
return (0);
}
#define BLK_ENDS(sc, bi) ((sc)->toc[(bi)].offset + \
(sc)->toc[(bi)].blen)
#define BLK_IS_CONT(sc, bi) (BLK_ENDS((sc), (bi) - 1) == \
(sc)->toc[(bi)].offset)
#define BLK_IS_NIL(sc, bi) ((sc)->toc[(bi)].blen == 0)
#define TOFF_2_BOFF(sc, pp, bi) ((sc)->toc[(bi)].offset - \
(sc)->toc[(bi)].offset % (pp)->sectorsize)
#define TLEN_2_BLEN(sc, pp, bp, ei) roundup(BLK_ENDS((sc), (ei)) - \
(bp)->bio_offset, (pp)->sectorsize)
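/*
 * TOFF_2_BOFF() aligns a cluster's starting byte down to the parent
 * provider's sector boundary and TLEN_2_BLEN() rounds the resulting read
 * length up to a sector multiple, so compressed data is always fetched with
 * sector-sized I/O.
 */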
static int
g_uzip_request(struct g_geom *gp, struct bio *bp)
{
struct g_uzip_softc *sc;
struct bio *bp2;
struct g_consumer *cp;
struct g_provider *pp;
off_t ofs, start_blk_ofs;
size_t i, start_blk, end_blk, zsize;
if (g_uzip_cached(gp, bp) != 0)
return (1);
sc = gp->softc;
cp = LIST_FIRST(&gp->consumer);
pp = cp->provider;
ofs = bp->bio_offset + bp->bio_completed;
start_blk = ofs / sc->blksz;
KASSERT(start_blk < sc->nblocks, ("start_blk out of range"));
end_blk = howmany(ofs + bp->bio_resid, sc->blksz);
KASSERT(end_blk <= sc->nblocks, ("end_blk out of range"));
for (; BLK_IS_NIL(sc, start_blk) && start_blk < end_blk; start_blk++) {
/* Fill in any leading Nil blocks */
start_blk_ofs = ofs % sc->blksz;
zsize = MIN(sc->blksz - start_blk_ofs, bp->bio_resid);
DPRINTF_BLK(GUZ_DBG_IO, start_blk, ("%s/%s: %p/%ju: "
"filling %ju zero bytes\n", __func__, gp->name, gp,
(uintmax_t)bp->bio_completed, (uintmax_t)zsize));
bzero(bp->bio_data + bp->bio_completed, zsize);
bp->bio_completed += zsize;
bp->bio_resid -= zsize;
ofs += zsize;
}
if (start_blk == end_blk) {
KASSERT(bp->bio_resid == 0, ("bp->bio_resid is invalid"));
/*
* No non-Nil data is left, complete request immediately.
*/
DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: all done returning %ju "
"bytes\n", __func__, gp->name, gp,
(uintmax_t)bp->bio_completed));
g_io_deliver(bp, 0);
return (1);
}
for (i = start_blk + 1; i < end_blk; i++) {
/* Trim discontinuous areas if any */
if (!BLK_IS_CONT(sc, i)) {
end_blk = i;
break;
}
}
DPRINTF_BRNG(GUZ_DBG_IO, start_blk, end_blk, ("%s/%s: %p: "
"start=%u (%ju[%jd]), end=%u (%ju)\n", __func__, gp->name, bp,
(u_int)start_blk, (uintmax_t)sc->toc[start_blk].offset,
(intmax_t)sc->toc[start_blk].blen,
(u_int)end_blk, (uintmax_t)BLK_ENDS(sc, end_blk - 1)));
bp2 = g_clone_bio(bp);
if (bp2 == NULL) {
g_io_deliver(bp, ENOMEM);
return (1);
}
bp2->bio_done = g_uzip_read_done;
bp2->bio_offset = TOFF_2_BOFF(sc, pp, start_blk);
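	/*
	 * Shrink the cluster range until the backing read fits into maxphys,
	 * but always read at least one cluster.
	 */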
while (1) {
bp2->bio_length = TLEN_2_BLEN(sc, pp, bp2, end_blk - 1);
- if (bp2->bio_length <= MAXPHYS) {
+ if (bp2->bio_length <= maxphys) {
break;
}
if (end_blk == (start_blk + 1)) {
break;
}
end_blk--;
}
DPRINTF(GUZ_DBG_IO, ("%s/%s: bp2->bio_length = %jd, "
"bp2->bio_offset = %jd\n", __func__, gp->name,
(intmax_t)bp2->bio_length, (intmax_t)bp2->bio_offset));
bp2->bio_data = malloc(bp2->bio_length, M_GEOM_UZIP, M_NOWAIT);
if (bp2->bio_data == NULL) {
g_destroy_bio(bp2);
g_io_deliver(bp, ENOMEM);
return (1);
}
DPRINTF_BRNG(GUZ_DBG_IO, start_blk, end_blk, ("%s/%s: %p: "
"reading %jd bytes from offset %jd\n", __func__, gp->name, bp,
(intmax_t)bp2->bio_length, (intmax_t)bp2->bio_offset));
g_io_request(bp2, cp);
return (0);
}
static void
g_uzip_read_done(struct bio *bp)
{
struct bio *bp2;
struct g_geom *gp;
struct g_uzip_softc *sc;
bp2 = bp->bio_parent;
gp = bp2->bio_to->geom;
sc = gp->softc;
mtx_lock(&sc->queue_mtx);
bioq_disksort(&sc->bio_queue, bp);
mtx_unlock(&sc->queue_mtx);
wakeup(sc);
}
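/*
 * memcmp()-style helper: returns 0 when every byte of "memory" equals "val"
 * (used below to detect all-zero padding), non-zero otherwise.
 */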
static int
g_uzip_memvcmp(const void *memory, unsigned char val, size_t size)
{
const u_char *mm;
mm = (const u_char *)memory;
return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0 ? 0 : 1;
}
static void
g_uzip_do(struct g_uzip_softc *sc, struct bio *bp)
{
struct bio *bp2;
struct g_provider *pp;
struct g_consumer *cp;
struct g_geom *gp;
char *data, *data2;
off_t ofs;
size_t blk, blkofs, len, ulen, firstblk;
int err;
bp2 = bp->bio_parent;
gp = bp2->bio_to->geom;
cp = LIST_FIRST(&gp->consumer);
pp = cp->provider;
bp2->bio_error = bp->bio_error;
if (bp2->bio_error != 0)
goto done;
/* Make sure there's forward progress. */
if (bp->bio_completed == 0) {
bp2->bio_error = ECANCELED;
goto done;
}
ofs = bp2->bio_offset + bp2->bio_completed;
firstblk = blk = ofs / sc->blksz;
blkofs = ofs % sc->blksz;
data = bp->bio_data + sc->toc[blk].offset % pp->sectorsize;
data2 = bp2->bio_data + bp2->bio_completed;
while (bp->bio_completed && bp2->bio_resid) {
if (blk > firstblk && !BLK_IS_CONT(sc, blk)) {
DPRINTF_BLK(GUZ_DBG_IO, blk, ("%s/%s: %p: backref'ed "
"cluster #%u requested, looping around\n",
__func__, gp->name, bp2, (u_int)blk));
goto done;
}
ulen = MIN(sc->blksz - blkofs, bp2->bio_resid);
len = sc->toc[blk].blen;
DPRINTF(GUZ_DBG_IO, ("%s/%s: %p/%ju: data2=%p, ulen=%u, "
"data=%p, len=%u\n", __func__, gp->name, gp,
bp->bio_completed, data2, (u_int)ulen, data, (u_int)len));
if (len == 0) {
/* All zero block: no cache update */
zero_block:
bzero(data2, ulen);
} else if (len <= bp->bio_completed) {
mtx_lock(&sc->last_mtx);
err = sc->dcp->decompress(sc->dcp, gp->name, data,
len, sc->last_buf);
if (err != 0 && sc->toc[blk].last != 0) {
/*
* Last block decompression has failed, check
* if it's just zero padding.
*/
if (g_uzip_memvcmp(data, '\0', len) == 0) {
sc->toc[blk].blen = 0;
sc->last_blk = -1;
mtx_unlock(&sc->last_mtx);
len = 0;
goto zero_block;
}
}
if (err != 0) {
sc->last_blk = -1;
mtx_unlock(&sc->last_mtx);
bp2->bio_error = EILSEQ;
DPRINTF(GUZ_DBG_ERR, ("%s/%s: decompress"
"(%p, %ju, %ju) failed\n", __func__,
gp->name, sc->dcp, (uintmax_t)blk,
(uintmax_t)len));
goto done;
}
sc->last_blk = blk;
memcpy(data2, sc->last_buf + blkofs, ulen);
mtx_unlock(&sc->last_mtx);
err = sc->dcp->rewind(sc->dcp, gp->name);
if (err != 0) {
bp2->bio_error = EILSEQ;
DPRINTF(GUZ_DBG_ERR, ("%s/%s: rewind(%p) "
"failed\n", __func__, gp->name, sc->dcp));
goto done;
}
data += len;
} else
break;
data2 += ulen;
bp2->bio_completed += ulen;
bp2->bio_resid -= ulen;
bp->bio_completed -= len;
blkofs = 0;
blk++;
}
done:
/* Finish processing the request. */
free(bp->bio_data, M_GEOM_UZIP);
g_destroy_bio(bp);
if (bp2->bio_error != 0 || bp2->bio_resid == 0)
g_io_deliver(bp2, bp2->bio_error);
else
g_uzip_request(gp, bp2);
}
static void
g_uzip_start(struct bio *bp)
{
struct g_provider *pp;
struct g_geom *gp;
struct g_uzip_softc *sc;
pp = bp->bio_to;
gp = pp->geom;
DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: cmd=%d, offset=%jd, length=%jd, "
"buffer=%p\n", __func__, gp->name, bp, bp->bio_cmd,
(intmax_t)bp->bio_offset, (intmax_t)bp->bio_length, bp->bio_data));
sc = gp->softc;
sc->req_total++;
if (bp->bio_cmd == BIO_GETATTR) {
struct bio *bp2;
struct g_consumer *cp;
struct g_geom *gp;
struct g_provider *pp;
/* pass on MNT:* requests and ignore others */
if (strncmp(bp->bio_attribute, "MNT:", 4) == 0) {
bp2 = g_clone_bio(bp);
if (bp2 == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
bp2->bio_done = g_std_done;
pp = bp->bio_to;
gp = pp->geom;
cp = LIST_FIRST(&gp->consumer);
g_io_request(bp2, cp);
return;
}
}
if (bp->bio_cmd != BIO_READ) {
g_io_deliver(bp, EOPNOTSUPP);
return;
}
bp->bio_resid = bp->bio_length;
bp->bio_completed = 0;
g_uzip_request(gp, bp);
}
static void
g_uzip_orphan(struct g_consumer *cp)
{
struct g_geom *gp;
g_topology_assert();
G_VALID_CONSUMER(cp);
gp = cp->geom;
g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, gp->name);
g_wither_geom(gp, ENXIO);
/*
* We can safely free the softc now if there are no accesses,
* otherwise g_uzip_access() will do that after the last close.
*/
if ((cp->acr + cp->acw + cp->ace) == 0)
g_uzip_softc_free(gp);
}
static void
g_uzip_spoiled(struct g_consumer *cp)
{
g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, cp->geom->name);
cp->flags |= G_CF_ORPHAN;
g_uzip_orphan(cp);
}
static int
g_uzip_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_geom *gp;
struct g_consumer *cp;
int error;
gp = pp->geom;
cp = LIST_FIRST(&gp->consumer);
KASSERT (cp != NULL, ("g_uzip_access but no consumer"));
if (cp->acw + dw > 0)
return (EROFS);
error = g_access(cp, dr, dw, de);
/*
* Free the softc if all providers have been closed and this geom
* is being removed.
*/
if (error == 0 && (gp->flags & G_GEOM_WITHER) != 0 &&
(cp->acr + cp->acw + cp->ace) == 0)
g_uzip_softc_free(gp);
return (error);
}
static int
g_uzip_parse_toc(struct g_uzip_softc *sc, struct g_provider *pp,
struct g_geom *gp)
{
uint32_t i, j, backref_to;
uint64_t max_offset, min_offset;
struct g_uzip_blk *last_blk;
min_offset = sizeof(struct cloop_header) +
(sc->nblocks + 1) * sizeof(uint64_t);
max_offset = sc->toc[0].offset - 1;
last_blk = &sc->toc[0];
for (i = 0; i < sc->nblocks; i++) {
/* First do some bounds checking */
if ((sc->toc[i].offset < min_offset) ||
(sc->toc[i].offset > pp->mediasize)) {
goto error_offset;
}
DPRINTF_BLK(GUZ_DBG_IO, i, ("%s: cluster #%u "
"offset=%ju max_offset=%ju\n", gp->name,
(u_int)i, (uintmax_t)sc->toc[i].offset,
(uintmax_t)max_offset));
backref_to = BLEN_UNDEF;
if (sc->toc[i].offset < max_offset) {
/*
* For the backref'ed blocks search already parsed
* TOC entries for the matching offset and copy the
* size from matched entry.
*/
for (j = 0; j <= i; j++) {
if (sc->toc[j].offset == sc->toc[i].offset &&
!BLK_IS_NIL(sc, j)) {
break;
}
if (j != i) {
continue;
}
DPRINTF(GUZ_DBG_ERR, ("%s: cannot match "
"backref'ed offset at cluster #%u\n",
gp->name, i));
return (-1);
}
sc->toc[i].blen = sc->toc[j].blen;
backref_to = j;
} else {
last_blk = &sc->toc[i];
/*
* For the "normal blocks" seek forward until we hit
* block whose offset is larger than ours and assume
* it's going to be the next one.
*/
for (j = i + 1; j < sc->nblocks + 1; j++) {
if (sc->toc[j].offset > max_offset) {
break;
}
}
sc->toc[i].blen = sc->toc[j].offset -
sc->toc[i].offset;
if (BLK_ENDS(sc, i) > pp->mediasize) {
DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u "
"extends past media boundary (%ju > %ju)\n",
gp->name, (u_int)i,
(uintmax_t)BLK_ENDS(sc, i),
(intmax_t)pp->mediasize));
return (-1);
}
KASSERT(max_offset <= sc->toc[i].offset, (
"%s: max_offset is incorrect: %ju",
gp->name, (uintmax_t)max_offset));
max_offset = BLK_ENDS(sc, i) - 1;
}
DPRINTF_BLK(GUZ_DBG_TOC, i, ("%s: cluster #%u, original %u "
"bytes, in %u bytes", gp->name, i, sc->blksz,
sc->toc[i].blen));
if (backref_to != BLEN_UNDEF) {
DPRINTF_BLK(GUZ_DBG_TOC, i, (" (->#%u)",
(u_int)backref_to));
}
DPRINTF_BLK(GUZ_DBG_TOC, i, ("\n"));
}
last_blk->last = 1;
/* Do a second pass to validate block lengths */
for (i = 0; i < sc->nblocks; i++) {
if (sc->toc[i].blen > sc->dcp->max_blen) {
if (sc->toc[i].last == 0) {
DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u "
"length (%ju) exceeds "
"max_blen (%ju)\n", gp->name, i,
(uintmax_t)sc->toc[i].blen,
(uintmax_t)sc->dcp->max_blen));
return (-1);
}
DPRINTF(GUZ_DBG_INFO, ("%s: cluster #%u extra "
"padding is detected, trimmed to %ju\n",
gp->name, i, (uintmax_t)sc->dcp->max_blen));
sc->toc[i].blen = sc->dcp->max_blen;
sc->toc[i].padded = 1;
}
}
return (0);
error_offset:
DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u: invalid offset %ju, "
"min_offset=%ju mediasize=%jd\n", gp->name, (u_int)i,
sc->toc[i].offset, min_offset, pp->mediasize));
return (-1);
}
static struct g_geom *
g_uzip_taste(struct g_class *mp, struct g_provider *pp, int flags)
{
int error;
uint32_t i, total_offsets, offsets_read, blk;
void *buf;
struct cloop_header *header;
struct g_consumer *cp;
struct g_geom *gp;
struct g_provider *pp2;
struct g_uzip_softc *sc;
struct g_geom_alias *gap;
enum {
G_UZIP = 1,
G_ULZMA,
G_ZSTD,
} type;
char cloop_version;
g_trace(G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name);
g_topology_assert();
/* Skip providers that are already open for writing. */
if (pp->acw > 0)
return (NULL);
if ((fnmatch(g_uzip_attach_to, pp->name, 0) != 0) ||
(fnmatch(g_uzip_noattach_to, pp->name, 0) == 0)) {
DPRINTF(GUZ_DBG_INFO, ("%s(%s,%s), ignoring\n", __func__,
mp->name, pp->name));
return (NULL);
}
buf = NULL;
/*
* Create geom instance.
*/
gp = g_new_geomf(mp, GUZ_DEV_NAME("%s"), pp->name);
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0)
goto e0;
error = g_access(cp, 1, 0, 0);
if (error)
goto e1;
g_topology_unlock();
/*
* Read cloop header, look for CLOOP magic, perform
* other validity checks.
*/
DPRINTF(GUZ_DBG_INFO, ("%s: media sectorsize %u, mediasize %jd\n",
gp->name, pp->sectorsize, (intmax_t)pp->mediasize));
buf = g_read_data(cp, 0, pp->sectorsize, NULL);
if (buf == NULL)
goto e2;
header = (struct cloop_header *) buf;
if (strncmp(header->magic, CLOOP_MAGIC_START,
sizeof(CLOOP_MAGIC_START) - 1) != 0) {
DPRINTF(GUZ_DBG_ERR, ("%s: no CLOOP magic\n", gp->name));
goto e3;
}
cloop_version = header->magic[CLOOP_OFS_VERSN];
switch (header->magic[CLOOP_OFS_COMPR]) {
case CLOOP_COMP_LZMA:
case CLOOP_COMP_LZMA_DDP:
type = G_ULZMA;
if (cloop_version < CLOOP_MINVER_LZMA) {
DPRINTF(GUZ_DBG_ERR, ("%s: image version too old\n",
gp->name));
goto e3;
}
DPRINTF(GUZ_DBG_INFO, ("%s: GEOM_UZIP_LZMA image found\n",
gp->name));
break;
case CLOOP_COMP_LIBZ:
case CLOOP_COMP_LIBZ_DDP:
type = G_UZIP;
if (cloop_version < CLOOP_MINVER_ZLIB) {
DPRINTF(GUZ_DBG_ERR, ("%s: image version too old\n",
gp->name));
goto e3;
}
DPRINTF(GUZ_DBG_INFO, ("%s: GEOM_UZIP_ZLIB image found\n",
gp->name));
break;
case CLOOP_COMP_ZSTD:
case CLOOP_COMP_ZSTD_DDP:
if (cloop_version < CLOOP_MINVER_ZSTD) {
DPRINTF(GUZ_DBG_ERR, ("%s: image version too old\n",
gp->name));
goto e3;
}
#ifdef ZSTDIO
DPRINTF(GUZ_DBG_INFO, ("%s: GEOM_UZIP_ZSTD image found.\n",
gp->name));
type = G_ZSTD;
#else
DPRINTF(GUZ_DBG_ERR, ("%s: GEOM_UZIP_ZSTD image found, but "
"this kernel was configured with Zstd disabled.\n",
gp->name));
goto e3;
#endif
break;
default:
DPRINTF(GUZ_DBG_ERR, ("%s: unsupported image type\n",
gp->name));
goto e3;
}
/*
* Initialize softc and read offsets.
*/
sc = malloc(sizeof(*sc), M_GEOM_UZIP, M_WAITOK | M_ZERO);
gp->softc = sc;
sc->blksz = ntohl(header->blksz);
sc->nblocks = ntohl(header->nblocks);
if (sc->blksz % 512 != 0) {
printf("%s: block size (%u) should be multiple of 512.\n",
gp->name, sc->blksz);
goto e4;
}
if (sc->blksz > MAX_BLKSZ) {
- printf("%s: block size (%u) should not be larger than %d.\n",
+ printf("%s: block size (%u) should not be larger than %lu.\n",
gp->name, sc->blksz, MAX_BLKSZ);
goto e4;
}
total_offsets = sc->nblocks + 1;
if (sizeof(struct cloop_header) +
total_offsets * sizeof(uint64_t) > pp->mediasize) {
printf("%s: media too small for %u blocks\n",
gp->name, sc->nblocks);
goto e4;
}
sc->toc = malloc(total_offsets * sizeof(struct g_uzip_blk),
M_GEOM_UZIP, M_WAITOK | M_ZERO);
offsets_read = MIN(total_offsets,
(pp->sectorsize - sizeof(*header)) / sizeof(uint64_t));
for (i = 0; i < offsets_read; i++) {
sc->toc[i].offset = be64toh(((uint64_t *) (header + 1))[i]);
sc->toc[i].blen = BLEN_UNDEF;
}
DPRINTF(GUZ_DBG_INFO, ("%s: %u offsets in the first sector\n",
gp->name, offsets_read));
/*
* The following invalidates the "header" pointer into the first
* block's "buf."
*/
header = NULL;
for (blk = 1; offsets_read < total_offsets; blk++) {
uint32_t nread;
free(buf, M_GEOM);
buf = g_read_data(
cp, blk * pp->sectorsize, pp->sectorsize, NULL);
if (buf == NULL)
goto e5;
nread = MIN(total_offsets - offsets_read,
pp->sectorsize / sizeof(uint64_t));
DPRINTF(GUZ_DBG_TOC, ("%s: %u offsets read from sector %d\n",
gp->name, nread, blk));
for (i = 0; i < nread; i++) {
sc->toc[offsets_read + i].offset =
be64toh(((uint64_t *) buf)[i]);
sc->toc[offsets_read + i].blen = BLEN_UNDEF;
}
offsets_read += nread;
}
free(buf, M_GEOM);
buf = NULL;
offsets_read -= 1;
DPRINTF(GUZ_DBG_INFO, ("%s: done reading %u block offsets from %u "
"sectors\n", gp->name, offsets_read, blk));
if (sc->nblocks != offsets_read) {
DPRINTF(GUZ_DBG_ERR, ("%s: read %s offsets than expected "
"blocks\n", gp->name,
sc->nblocks < offsets_read ? "more" : "less"));
goto e5;
}
switch (type) {
case G_UZIP:
sc->dcp = g_uzip_zlib_ctor(sc->blksz);
break;
case G_ULZMA:
sc->dcp = g_uzip_lzma_ctor(sc->blksz);
break;
#ifdef ZSTDIO
case G_ZSTD:
sc->dcp = g_uzip_zstd_ctor(sc->blksz);
break;
#endif
default:
goto e5;
}
/*
* The last+1 block was not always initialized by earlier versions of
* mkuzip(8). However, *if* it is initialized, the difference between
* its offset and the prior block's offset represents the length of the
* final real compressed block, and this is significant to the
* decompressor.
*/
if (cloop_version >= CLOOP_MINVER_RELIABLE_LASTBLKSZ &&
sc->toc[sc->nblocks].offset != 0) {
if (sc->toc[sc->nblocks].offset > pp->mediasize) {
DPRINTF(GUZ_DBG_ERR,
("%s: bogus n+1 offset %ju > mediasize %ju\n",
gp->name, (uintmax_t)sc->toc[sc->nblocks].offset,
(uintmax_t)pp->mediasize));
goto e6;
}
} else {
sc->toc[sc->nblocks].offset = pp->mediasize;
}
/* Massage TOC (table of contents), make sure it is sound */
if (g_uzip_parse_toc(sc, pp, gp) != 0) {
DPRINTF(GUZ_DBG_ERR, ("%s: TOC error\n", gp->name));
goto e6;
}
mtx_init(&sc->last_mtx, "geom_uzip cache", NULL, MTX_DEF);
mtx_init(&sc->queue_mtx, "geom_uzip wrkthread", NULL, MTX_DEF);
bioq_init(&sc->bio_queue);
sc->last_blk = -1;
sc->last_buf = malloc(sc->blksz, M_GEOM_UZIP, M_WAITOK);
sc->req_total = 0;
sc->req_cached = 0;
sc->uzip_do = &g_uzip_do;
error = kproc_create(g_uzip_wrkthr, sc, &sc->procp, 0, 0, "%s",
gp->name);
if (error != 0) {
goto e7;
}
g_topology_lock();
pp2 = g_new_providerf(gp, "%s", gp->name);
pp2->sectorsize = 512;
pp2->mediasize = (off_t)sc->nblocks * sc->blksz;
pp2->stripesize = pp->stripesize;
pp2->stripeoffset = pp->stripeoffset;
LIST_FOREACH(gap, &pp->aliases, ga_next)
g_provider_add_alias(pp2, GUZ_DEV_NAME("%s"), gap->ga_alias);
g_error_provider(pp2, 0);
g_access(cp, -1, 0, 0);
DPRINTF(GUZ_DBG_INFO, ("%s: taste ok (%d, %ju), (%ju, %ju), %x\n",
gp->name, pp2->sectorsize, (uintmax_t)pp2->mediasize,
(uintmax_t)pp2->stripeoffset, (uintmax_t)pp2->stripesize, pp2->flags));
DPRINTF(GUZ_DBG_INFO, ("%s: %u x %u blocks\n", gp->name, sc->nblocks,
sc->blksz));
return (gp);
e7:
free(sc->last_buf, M_GEOM);
mtx_destroy(&sc->queue_mtx);
mtx_destroy(&sc->last_mtx);
e6:
sc->dcp->free(sc->dcp);
e5:
free(sc->toc, M_GEOM);
e4:
free(gp->softc, M_GEOM_UZIP);
e3:
if (buf != NULL) {
free(buf, M_GEOM);
}
e2:
g_topology_lock();
g_access(cp, -1, 0, 0);
e1:
g_detach(cp);
e0:
g_destroy_consumer(cp);
g_destroy_geom(gp);
return (NULL);
}
static int
g_uzip_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
{
struct g_provider *pp;
KASSERT(gp != NULL, ("NULL geom"));
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, gp->name);
g_topology_assert();
if (gp->softc == NULL) {
DPRINTF(GUZ_DBG_ERR, ("%s(%s): gp->softc == NULL\n", __func__,
gp->name));
return (ENXIO);
}
pp = LIST_FIRST(&gp->provider);
KASSERT(pp != NULL, ("NULL provider"));
if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
return (EBUSY);
g_wither_geom(gp, ENXIO);
g_uzip_softc_free(gp);
return (0);
}
static struct g_class g_uzip_class = {
.name = UZIP_CLASS_NAME,
.version = G_VERSION,
.taste = g_uzip_taste,
.destroy_geom = g_uzip_destroy_geom,
.start = g_uzip_start,
.orphan = g_uzip_orphan,
.access = g_uzip_access,
.spoiled = g_uzip_spoiled,
};
DECLARE_GEOM_CLASS(g_uzip_class, g_uzip);
MODULE_DEPEND(g_uzip, xz, 1, 1, 1);
MODULE_DEPEND(g_uzip, zlib, 1, 1, 1);
MODULE_VERSION(geom_uzip, 0);
diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h
index f6a367db59ed..2bd70875f6cd 100644
--- a/sys/geom/vinum/geom_vinum_var.h
+++ b/sys/geom/vinum/geom_vinum_var.h
@@ -1,393 +1,393 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2004, 2007 Lukas Ertl
* Copyright (c) 1997, 1998, 1999
* Nan Yang Computer Services Limited. All rights reserved.
*
* Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
* Parts written by Greg Lehey.
*
* This software is distributed under the so-called ``Berkeley
* License'':
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Nan Yang Computer
* Services Limited.
* 4. Neither the name of the Company nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided ``as is'', and any express or implied
* warranties, including, but not limited to, the implied warranties of
* merchantability and fitness for a particular purpose are disclaimed.
* In no event shall the company or contributors be liable for any
* direct, indirect, incidental, special, exemplary, or consequential
* damages (including, but not limited to, procurement of substitute
* goods or services; loss of use, data, or profits; or business
* interruption) however caused and on any theory of liability, whether
* in contract, strict liability, or tort (including negligence or
* otherwise) arising in any way out of the use of this software, even if
* advised of the possibility of such damage.
*
* $FreeBSD$
*/
#ifndef _GEOM_VINUM_VAR_H_
#define _GEOM_VINUM_VAR_H_
/*
* Slice header
*
* Vinum drives start with this structure:
*
*\                                          Sector
* |--------------------------------------|
* |   PDP-11 memorial boot block         |      0
* |--------------------------------------|
* |   Disk label, maybe                  |      1
* |--------------------------------------|
* |   Slice definition  (vinum_hdr)      |      8
* |--------------------------------------|
* |                                      |
* |   Configuration info, first copy     |      9
* |                                      |
* |--------------------------------------|
* |                                      |
* |   Configuration info, second copy    |      9 + size of config
* |                                      |
* |--------------------------------------|
*/
/* Sizes and offsets of our information. */
#define GV_HDR_OFFSET 4096 /* Offset of vinum header. */
#define GV_HDR_LEN 512 /* Size of vinum header. */
#define GV_CFG_OFFSET 4608 /* Offset of first config copy. */
#define GV_CFG_LEN 65536 /* Size of config copy. */
/* This is where the actual data starts. */
#define GV_DATA_START (GV_CFG_LEN * 2 + GV_CFG_OFFSET)
/* #define GV_DATA_START (GV_CFG_LEN * 2 + GV_HDR_LEN) */
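/* With the sizes above: GV_CFG_OFFSET = GV_HDR_OFFSET + GV_HDR_LEN
 * (4096 + 512 = 4608) and GV_DATA_START = 4608 + 2 * 65536 = 135680 bytes,
 * i.e. sector 265 with 512-byte sectors. */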
#define GV_MAXDRIVENAME 32 /* Maximum length of a device name. */
#define GV_MAXSDNAME 64 /* Maximum length of a subdisk name. */
#define GV_MAXPLEXNAME 64 /* Maximum length of a plex name. */
#define GV_MAXVOLNAME 64 /* Maximum length of a volume name. */
/* Command line flags. */
#define GV_FLAG_R 0x01
#define GV_FLAG_S 0x02
#define GV_FLAG_V 0x04
#define GV_FLAG_VV 0x08
#define GV_FLAG_F 0x10
/* Object types. */
#define GV_TYPE_VOL 1
#define GV_TYPE_PLEX 2
#define GV_TYPE_SD 3
#define GV_TYPE_DRIVE 4
/* State changing flags. */
#define GV_SETSTATE_FORCE 0x1
#define GV_SETSTATE_CONFIG 0x2
/* Subdisk state bitmaps for plexes. */
#define GV_SD_DOWNSTATE 0x01 /* Subdisk is down. */
#define GV_SD_STALESTATE 0x02 /* Subdisk is stale. */
#define GV_SD_INITSTATE 0x04 /* Subdisk is initializing. */
#define GV_SD_UPSTATE 0x08 /* Subdisk is up. */
/* Synchronization/initialization request sizes. */
#define GV_MIN_SYNCSIZE 512
-#define GV_MAX_SYNCSIZE MAXPHYS
+#define GV_MAX_SYNCSIZE maxphys
#define GV_DFLT_SYNCSIZE 65536
/* Flags for BIOs, as they are processed within vinum. */
#define GV_BIO_GROW 0x01
#define GV_BIO_MALLOC 0x02
#define GV_BIO_ONHOLD 0x04
#define GV_BIO_SYNCREQ 0x08
#define GV_BIO_INIT 0x10
#define GV_BIO_REBUILD 0x20
#define GV_BIO_CHECK 0x40
#define GV_BIO_PARITY 0x80
#define GV_BIO_INTERNAL \
(GV_BIO_SYNCREQ | GV_BIO_INIT | GV_BIO_REBUILD | GV_BIO_CHECK | GV_BIO_GROW)
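/* GV_BIO_INTERNAL groups the flags marking BIOs generated inside gvinum
 * itself (sync, init, rebuild, check, grow), as opposed to regular user I/O. */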
/* Error codes to be used within gvinum. */
#define GV_ERR_SETSTATE (-1) /* Error setting state. */
#define GV_ERR_BADSIZE (-2) /* Object has wrong size. */
#define GV_ERR_INVTYPE (-3) /* Invalid object type. */
#define GV_ERR_CREATE (-4) /* Error creating gvinum object. */
#define GV_ERR_ISBUSY (-5) /* Object is busy. */
#define GV_ERR_ISATTACHED (-6) /* Object is attached to another. */
#define GV_ERR_INVFLAG (-7) /* Invalid flag passed. */
#define GV_ERR_INVSTATE (-8) /* Invalid state. */
#define GV_ERR_NOTFOUND (-9) /* Object not found. */
#define GV_ERR_NAMETAKEN (-10) /* Object name is taken. */
#define GV_ERR_NOSPACE (-11) /* No space left on drive/subdisk. */
#define GV_ERR_BADOFFSET (-12) /* Invalid offset specified. */
#define GV_ERR_INVNAME (-13) /* Invalid object name. */
#define GV_ERR_PLEXORG (-14) /* Invalid plex organization. */
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in
* vinum. We use the host name just to identify this system, and 32 bytes
* should be ample for that purpose.
*/
#define GV_HOSTNAME_LEN 32
struct gv_label {
char sysname[GV_HOSTNAME_LEN]; /* System name at creation time. */
char name[GV_MAXDRIVENAME]; /* Our name of the drive. */
struct timeval date_of_birth; /* The time it was created ... */
struct timeval last_update; /* ... and the time of last update. */
off_t drive_size; /* Total size incl. headers. */
};
/* The 'header' of each valid vinum drive. */
struct gv_hdr {
uint64_t magic;
#define GV_OLD_MAGIC 0x494E2056494E4F00LL
#define GV_OLD_NOMAGIC 0x4E4F2056494E4F00LL
#define GV_MAGIC 0x56494E554D2D3100LL
#define GV_NOMAGIC 0x56494E554D2D2D00LL
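/* Read most-significant byte first, the magics spell out ASCII "IN VINO" /
 * "NO VINO" (old format) and "VINUM-1" / "VINUM--" (current format). */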
uint64_t config_length;
struct gv_label label;
};
/* A single freelist entry of a drive. */
struct gv_freelist {
off_t size; /* Size of this free slot. */
off_t offset; /* Offset on the drive. */
LIST_ENTRY(gv_freelist) freelist;
};
/*
* Since we share structures between userland and kernel, we need this helper
* struct instead of struct bio_queue_head and friends. Maybe I'll find a proper
* solution some day.
*/
struct gv_bioq {
struct bio *bp;
TAILQ_ENTRY(gv_bioq) queue;
};
#define GV_EVENT_DRIVE_TASTED 1
#define GV_EVENT_DRIVE_LOST 2
#define GV_EVENT_THREAD_EXIT 3
#define GV_EVENT_CREATE_DRIVE 4
#define GV_EVENT_CREATE_VOLUME 5
#define GV_EVENT_CREATE_PLEX 6
#define GV_EVENT_CREATE_SD 7
#define GV_EVENT_SAVE_CONFIG 8
#define GV_EVENT_RM_VOLUME 9
#define GV_EVENT_RM_PLEX 10
#define GV_EVENT_RM_SD 11
#define GV_EVENT_RM_DRIVE 12
#define GV_EVENT_SET_SD_STATE 13
#define GV_EVENT_SET_DRIVE_STATE 14
#define GV_EVENT_SET_VOL_STATE 15
#define GV_EVENT_SET_PLEX_STATE 16
#define GV_EVENT_RESET_CONFIG 17
#define GV_EVENT_PARITY_REBUILD 18
#define GV_EVENT_PARITY_CHECK 19
#define GV_EVENT_START_PLEX 20
#define GV_EVENT_START_VOLUME 21
#define GV_EVENT_ATTACH_PLEX 22
#define GV_EVENT_ATTACH_SD 23
#define GV_EVENT_DETACH_PLEX 24
#define GV_EVENT_DETACH_SD 25
#define GV_EVENT_RENAME_VOL 26
#define GV_EVENT_RENAME_PLEX 27
#define GV_EVENT_RENAME_SD 28
#define GV_EVENT_RENAME_DRIVE 29
#define GV_EVENT_MOVE_SD 30
#define GV_EVENT_SETUP_OBJECTS 31
#ifdef _KERNEL
struct gv_event {
int type;
void *arg1;
void *arg2;
intmax_t arg3;
intmax_t arg4;
TAILQ_ENTRY(gv_event) events;
};
/* This struct contains the main vinum config. */
struct gv_softc {
/* Linked lists of all objects in our setup. */
LIST_HEAD(,gv_drive) drives; /* All drives. */
LIST_HEAD(,gv_plex) plexes; /* All plexes. */
LIST_HEAD(,gv_sd) subdisks; /* All subdisks. */
LIST_HEAD(,gv_volume) volumes; /* All volumes. */
TAILQ_HEAD(,gv_event) equeue; /* Event queue. */
struct mtx equeue_mtx; /* Event queue lock. */
struct mtx bqueue_mtx; /* BIO queue lock. */
struct mtx config_mtx; /* Configuration lock. */
struct bio_queue_head *bqueue_down; /* BIO queue incoming
requests. */
struct bio_queue_head *bqueue_up; /* BIO queue for completed
requests. */
struct g_geom *geom; /* Pointer to our VINUM geom. */
struct proc *worker; /* Worker process. */
};
#endif
/* softc for a drive. */
struct gv_drive {
char name[GV_MAXDRIVENAME]; /* The name of this drive. */
char device[GV_MAXDRIVENAME]; /* Associated device. */
int state; /* The state of this drive. */
#define GV_DRIVE_DOWN 0
#define GV_DRIVE_UP 1
off_t size; /* Size of this drive. */
off_t avail; /* Available space. */
int sdcount; /* Number of subdisks. */
int flags;
#define GV_DRIVE_REFERENCED 0x01 /* The drive isn't really existing,
but was referenced by a subdisk
during taste. */
#define GV_DRIVE_ORPHANED 0x02 /* The drive was orphaned. */
struct gv_hdr *hdr; /* The drive header. */
struct g_consumer *consumer; /* Consumer attached to this drive. */
int active; /* Number of active requests. */
int freelist_entries; /* Count of freelist entries. */
LIST_HEAD(,gv_freelist) freelist; /* List of freelist entries. */
LIST_HEAD(,gv_sd) subdisks; /* Subdisks on this drive. */
LIST_ENTRY(gv_drive) drive; /* Entry in the vinum config. */
struct gv_softc *vinumconf; /* Pointer to the vinum conf. */
};
/* softc for a subdisk. */
struct gv_sd {
char name[GV_MAXSDNAME]; /* The name of this subdisk. */
off_t size; /* The size of this subdisk. */
off_t drive_offset; /* Offset in the underlying drive. */
off_t plex_offset; /* Offset in the associated plex. */
int state; /* The state of this subdisk. */
#define GV_SD_DOWN 0
#define GV_SD_STALE 1
#define GV_SD_INITIALIZING 2
#define GV_SD_REVIVING 3
#define GV_SD_UP 4
off_t initialized; /* Count of initialized bytes. */
int init_size; /* Initialization read/write size. */
int init_error; /* Flag error on initialization. */
int flags;
#define GV_SD_NEWBORN 0x01 /* Subdisk is created by user. */
#define GV_SD_TASTED 0x02 /* Subdisk is created during taste. */
#define GV_SD_CANGOUP 0x04 /* Subdisk can go up immediately. */
#define GV_SD_GROW 0x08 /* Subdisk is added to striped plex. */
char drive[GV_MAXDRIVENAME]; /* Name of underlying drive. */
char plex[GV_MAXPLEXNAME]; /* Name of associated plex. */
struct gv_drive *drive_sc; /* Pointer to underlying drive. */
struct gv_plex *plex_sc; /* Pointer to associated plex. */
LIST_ENTRY(gv_sd) from_drive; /* Subdisk list of underlying drive. */
LIST_ENTRY(gv_sd) in_plex; /* Subdisk list of associated plex. */
LIST_ENTRY(gv_sd) sd; /* Entry in the vinum config. */
struct gv_softc *vinumconf; /* Pointer to the vinum config. */
};
/* softc for a plex. */
struct gv_plex {
char name[GV_MAXPLEXNAME]; /* The name of the plex. */
off_t size; /* The size of the plex. */
int state; /* The plex state. */
#define GV_PLEX_DOWN 0
#define GV_PLEX_INITIALIZING 1
#define GV_PLEX_DEGRADED 2
#define GV_PLEX_GROWABLE 3
#define GV_PLEX_UP 4
int org; /* The plex organisation. */
#define GV_PLEX_DISORG 0
#define GV_PLEX_CONCAT 1
#define GV_PLEX_STRIPED 2
#define GV_PLEX_RAID5 4
int stripesize; /* The stripe size of the plex. */
char volume[GV_MAXVOLNAME]; /* Name of associated volume. */
struct gv_volume *vol_sc; /* Pointer to associated volume. */
int sddetached; /* Number of detached subdisks. */
int sdcount; /* Number of subdisks in this plex. */
int sddown; /* Number of subdisks that are down. */
int flags;
#define GV_PLEX_ADDED 0x01 /* Added to an existing volume. */
#define GV_PLEX_SYNCING 0x02 /* Plex is syncing from another plex. */
#define GV_PLEX_NEWBORN 0x20 /* The plex was just created. */
#define GV_PLEX_REBUILDING 0x40 /* The plex is rebuilding. */
#define GV_PLEX_GROWING 0x80 /* The plex is growing. */
off_t synced; /* Count of synced bytes. */
TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */
LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */
LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */
LIST_ENTRY(gv_plex) plex; /* Entry in the vinum config. */
#ifdef _KERNEL
struct bio_queue_head *bqueue; /* BIO queue. */
struct bio_queue_head *wqueue; /* Waiting BIO queue. */
struct bio_queue_head *rqueue; /* Rebuild waiting BIO queue. */
#else
char *bpad, *wpad, *rpad; /* Padding for userland. */
#endif
struct gv_softc *vinumconf; /* Pointer to the vinum config. */
};
/* softc for a volume. */
struct gv_volume {
char name[GV_MAXVOLNAME]; /* The name of the volume. */
off_t size; /* The size of the volume. */
int plexcount; /* Number of plexes. */
int state; /* The state of the volume. */
#define GV_VOL_DOWN 0
#define GV_VOL_UP 1
int flags;
#define GV_VOL_NEWBORN 0x08 /* The volume was just created. */
LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */
LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */
struct g_provider *provider; /* Provider of this volume. */
#ifdef _KERNEL
struct bio_queue_head *wqueue; /* BIO delayed request queue. */
#else
char *wpad; /* Padding for userland. */
#endif
struct gv_plex *last_read_plex;
struct gv_softc *vinumconf; /* Pointer to the vinum config. */
};
#endif /* !_GEOM_VINUM_VAR_H */
diff --git a/sys/geom/virstor/g_virstor.c b/sys/geom/virstor/g_virstor.c
index 98892bcd30e4..e27d92b509d4 100644
--- a/sys/geom/virstor/g_virstor.c
+++ b/sys/geom/virstor/g_virstor.c
@@ -1,1878 +1,1878 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006-2007 Ivan Voras <ivoras@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* Implementation notes:
* - "Components" are wrappers around providers that make up the
* virtual storage (i.e. a virstor has "physical" components)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/virstor/g_virstor.h>
#include <geom/virstor/g_virstor_md.h>
FEATURE(g_virstor, "GEOM virtual storage support");
/* Declare malloc(9) label */
static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data");
/* GEOM class methods */
static g_init_t g_virstor_init;
static g_fini_t g_virstor_fini;
static g_taste_t g_virstor_taste;
static g_ctl_req_t g_virstor_config;
static g_ctl_destroy_geom_t g_virstor_destroy_geom;
/* Declare & initialize class structure ("geom class") */
struct g_class g_virstor_class = {
.name = G_VIRSTOR_CLASS_NAME,
.version = G_VERSION,
.init = g_virstor_init,
.fini = g_virstor_fini,
.taste = g_virstor_taste,
.ctlreq = g_virstor_config,
.destroy_geom = g_virstor_destroy_geom
/* The .dumpconf and the rest are only usable for a geom instance, so
* they will be set when such instance is created. */
};
/* Declare sysctl's and loader tunables */
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"GEOM_GVIRSTOR information");
static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */
SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug,
0, "Debug level (2=production, 5=normal, 15=excessive)");
static u_int g_virstor_chunk_watermark = 100;
SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN,
&g_virstor_chunk_watermark, 0,
"Minimum number of free chunks before issuing administrative warning");
static u_int g_virstor_component_watermark = 1;
SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN,
&g_virstor_component_watermark, 0,
"Minimum number of free components before issuing administrative warning");
static int read_metadata(struct g_consumer *, struct g_virstor_metadata *);
static void write_metadata(struct g_consumer *, struct g_virstor_metadata *);
static int clear_metadata(struct g_virstor_component *);
static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *,
struct g_virstor_metadata *);
static struct g_geom *create_virstor_geom(struct g_class *,
struct g_virstor_metadata *);
static void virstor_check_and_run(struct g_virstor_softc *);
static u_int virstor_valid_components(struct g_virstor_softc *);
static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t,
boolean_t);
static void remove_component(struct g_virstor_softc *,
struct g_virstor_component *, boolean_t);
static void bioq_dismantle(struct bio_queue_head *);
static int allocate_chunk(struct g_virstor_softc *,
struct g_virstor_component **, u_int *, u_int *);
static void delay_destroy_consumer(void *, int);
static void dump_component(struct g_virstor_component *comp);
#if 0
static void dump_me(struct virstor_map_entry *me, unsigned int nr);
#endif
static void virstor_ctl_stop(struct gctl_req *, struct g_class *);
static void virstor_ctl_add(struct gctl_req *, struct g_class *);
static void virstor_ctl_remove(struct gctl_req *, struct g_class *);
static struct g_virstor_softc * virstor_find_geom(const struct g_class *,
const char *);
static void update_metadata(struct g_virstor_softc *);
static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *,
u_int, u_int);
static void g_virstor_orphan(struct g_consumer *);
static int g_virstor_access(struct g_provider *, int, int, int);
static void g_virstor_start(struct bio *);
static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *,
struct g_consumer *, struct g_provider *);
static void g_virstor_done(struct bio *);
static void invalid_call(void);
/*
* Initialise GEOM class (per-class callback)
*/
static void
g_virstor_init(struct g_class *mp __unused)
{
/* Catch map struct size mismatch at compile time; Map entries must
- * fit into MAXPHYS exactly, with no wasted space. */
- CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS);
+ * fit into maxphys exactly, with no wasted space. */
+ MPASS(VIRSTOR_MAP_BLOCK_ENTRIES * VIRSTOR_MAP_ENTRY_SIZE == maxphys);
/* Init UMA zones, TAILQ's, other global vars */
}
/*
* Finalise GEOM class (per-class callback)
*/
static void
g_virstor_fini(struct g_class *mp __unused)
{
/* Deinit UMA zones & global vars */
}
/*
* Config (per-class callback)
*/
static void
g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb)
{
uint32_t *version;
g_topology_assert();
version = gctl_get_paraml(req, "version", sizeof(*version));
if (version == NULL) {
gctl_error(req, "Failed to get 'version' argument");
return;
}
if (*version != G_VIRSTOR_VERSION) {
gctl_error(req, "Userland and kernel versions out of sync");
return;
}
g_topology_unlock();
if (strcmp(verb, "add") == 0)
virstor_ctl_add(req, cp);
else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0)
virstor_ctl_stop(req, cp);
else if (strcmp(verb, "remove") == 0)
virstor_ctl_remove(req, cp);
else
gctl_error(req, "unknown verb: '%s'", verb);
g_topology_lock();
}
/*
* "stop" verb from userland
*/
static void
virstor_ctl_stop(struct gctl_req *req, struct g_class *cp)
{
int *force, *nargs;
int i;
nargs = gctl_get_paraml(req, "nargs", sizeof *nargs);
if (nargs == NULL) {
gctl_error(req, "Error fetching argument '%s'", "nargs");
return;
}
if (*nargs < 1) {
gctl_error(req, "Invalid number of arguments");
return;
}
force = gctl_get_paraml(req, "force", sizeof *force);
if (force == NULL) {
gctl_error(req, "Error fetching argument '%s'", "force");
return;
}
g_topology_lock();
for (i = 0; i < *nargs; i++) {
char param[8];
const char *name;
struct g_virstor_softc *sc;
int error;
snprintf(param, sizeof(param), "arg%d", i);
name = gctl_get_asciiparam(req, param);
if (name == NULL) {
gctl_error(req, "No 'arg%d' argument", i);
g_topology_unlock();
return;
}
sc = virstor_find_geom(cp, name);
if (sc == NULL) {
gctl_error(req, "Don't know anything about '%s'", name);
g_topology_unlock();
return;
}
LOG_MSG(LVL_INFO, "Stopping %s by the userland command",
sc->geom->name);
update_metadata(sc);
if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) {
LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d",
sc->geom->name, error);
}
}
g_topology_unlock();
}
/*
* "add" verb from userland - add new component(s) to the structure.
* This will be done all at once in here, without going through the
* .taste function for new components.
*/
static void
virstor_ctl_add(struct gctl_req *req, struct g_class *cp)
{
/* Note: while this is going on, I/O is being done on
* the g_up and g_down threads. The idea is to make changes
* to softc members in a way that can atomically activate
* them all at once. */
struct g_virstor_softc *sc;
int *hardcode, *nargs;
const char *geom_name; /* geom to add a component to */
struct g_consumer *fcp;
struct g_virstor_bio_q *bq;
u_int added;
int error;
int i;
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "Error fetching argument '%s'", "nargs");
return;
}
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments");
return;
}
hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode));
if (hardcode == NULL) {
gctl_error(req, "Error fetching argument '%s'", "hardcode");
return;
}
/* Find "our" geom */
geom_name = gctl_get_asciiparam(req, "arg0");
if (geom_name == NULL) {
gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)");
return;
}
sc = virstor_find_geom(cp, geom_name);
if (sc == NULL) {
gctl_error(req, "Don't know anything about '%s'", geom_name);
return;
}
if (virstor_valid_components(sc) != sc->n_components) {
LOG_MSG(LVL_ERROR, "Cannot add components to incomplete "
"virstor %s", sc->geom->name);
gctl_error(req, "Virstor %s is incomplete", sc->geom->name);
return;
}
fcp = sc->components[0].gcons;
added = 0;
g_topology_lock();
for (i = 1; i < *nargs; i++) {
struct g_virstor_metadata md;
char aname[8];
struct g_provider *pp;
struct g_consumer *cp;
u_int nc;
u_int j;
snprintf(aname, sizeof aname, "arg%d", i);
pp = gctl_get_provider(req, aname);
if (pp == NULL) {
/* This is the most common error so be verbose about it */
if (added != 0) {
gctl_error(req, "Invalid provider. (added"
" %u components)", added);
update_metadata(sc);
}
g_topology_unlock();
return;
}
cp = g_new_consumer(sc->geom);
if (cp == NULL) {
gctl_error(req, "Cannot create consumer");
g_topology_unlock();
return;
}
error = g_attach(cp, pp);
if (error != 0) {
gctl_error(req, "Cannot attach a consumer to %s",
pp->name);
g_destroy_consumer(cp);
g_topology_unlock();
return;
}
if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) {
error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
if (error != 0) {
gctl_error(req, "Access request failed for %s",
pp->name);
g_destroy_consumer(cp);
g_topology_unlock();
return;
}
}
if (fcp->provider->sectorsize != pp->sectorsize) {
gctl_error(req, "Sector size doesn't fit for %s",
pp->name);
g_destroy_consumer(cp);
g_topology_unlock();
return;
}
for (j = 0; j < sc->n_components; j++) {
if (strcmp(sc->components[j].gcons->provider->name,
pp->name) == 0) {
gctl_error(req, "Component %s already in %s",
pp->name, sc->geom->name);
g_destroy_consumer(cp);
g_topology_unlock();
return;
}
}
sc->components = realloc(sc->components,
sizeof(*sc->components) * (sc->n_components + 1),
M_GVIRSTOR, M_WAITOK);
nc = sc->n_components;
sc->components[nc].gcons = cp;
sc->components[nc].sc = sc;
sc->components[nc].index = nc;
sc->components[nc].chunk_count = cp->provider->mediasize /
sc->chunk_size;
sc->components[nc].chunk_next = 0;
sc->components[nc].chunk_reserved = 0;
if (sc->components[nc].chunk_count < 4) {
gctl_error(req, "Provider too small: %s",
cp->provider->name);
g_destroy_consumer(cp);
g_topology_unlock();
return;
}
fill_metadata(sc, &md, nc, *hardcode);
write_metadata(cp, &md);
/* The new component becomes visible when n_components is
* incremented */
sc->n_components++;
added++;
}
/* This call to update_metadata() is critical. In case there's a
* power failure in the middle of it and some components are updated
* while others are not, there will be trouble on next .taste() iff
* a non-updated component is detected first */
update_metadata(sc);
g_topology_unlock();
LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added,
sc->geom->name);
/* Fire off BIOs previously queued because there wasn't any
* physical space left. If the BIOs still can't be satisfied
* they will again be added to the end of the queue (during
* which the mutex will be recursed) */
bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK);
bq->bio = NULL;
mtx_lock(&sc->delayed_bio_q_mtx);
/* First, insert a sentinel to the queue end, so we don't
* end up in an infinite loop if there's still no free
* space available. */
STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage);
while (!STAILQ_EMPTY(&sc->delayed_bio_q)) {
bq = STAILQ_FIRST(&sc->delayed_bio_q);
if (bq->bio != NULL) {
g_virstor_start(bq->bio);
STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
free(bq, M_GVIRSTOR);
} else {
STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
free(bq, M_GVIRSTOR);
break;
}
}
mtx_unlock(&sc->delayed_bio_q_mtx);
}
/*
* Find a geom handled by the class
*/
static struct g_virstor_softc *
virstor_find_geom(const struct g_class *cp, const char *name)
{
struct g_geom *gp;
LIST_FOREACH(gp, &cp->geom, geom) {
if (strcmp(name, gp->name) == 0)
return (gp->softc);
}
return (NULL);
}
/*
* Update metadata on all components to reflect the current state
* of these fields:
* - chunk_next
* - flags
* - md_count
* Expects things to be set up so write_metadata() can work, i.e.
* the topology lock must be held.
*/
static void
update_metadata(struct g_virstor_softc *sc)
{
struct g_virstor_metadata md;
u_int n;
if (virstor_valid_components(sc) != sc->n_components)
return; /* Incomplete device */
LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s",
sc->geom->name);
/* Update metadata on components */
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
sc->geom->class->name, sc->geom->name);
g_topology_assert();
for (n = 0; n < sc->n_components; n++) {
read_metadata(sc->components[n].gcons, &md);
md.chunk_next = sc->components[n].chunk_next;
md.flags = sc->components[n].flags;
md.md_count = sc->n_components;
write_metadata(sc->components[n].gcons, &md);
}
}
/*
* Fills metadata (struct md) from information stored in softc and the nc'th
* component of virstor
*/
static void
fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md,
u_int nc, u_int hardcode)
{
struct g_virstor_component *c;
bzero(md, sizeof *md);
c = &sc->components[nc];
strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic);
md->md_version = G_VIRSTOR_VERSION;
strncpy(md->md_name, sc->geom->name, sizeof md->md_name);
md->md_id = sc->id;
md->md_virsize = sc->virsize;
md->md_chunk_size = sc->chunk_size;
md->md_count = sc->n_components;
if (hardcode) {
strncpy(md->provider, c->gcons->provider->name,
sizeof md->provider);
}
md->no = nc;
md->provsize = c->gcons->provider->mediasize;
md->chunk_count = c->chunk_count;
md->chunk_next = c->chunk_next;
md->chunk_reserved = c->chunk_reserved;
md->flags = c->flags;
}
/*
* Remove a component from virstor device.
* Can only be done if the component is unallocated.
*/
static void
virstor_ctl_remove(struct gctl_req *req, struct g_class *cp)
{
/* As this is executed in parallel to I/O, operations on virstor
* structures must be as atomic as possible. */
struct g_virstor_softc *sc;
int *nargs;
const char *geom_name;
u_int removed;
int i;
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "Error fetching argument '%s'", "nargs");
return;
}
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments");
return;
}
/* Find "our" geom */
geom_name = gctl_get_asciiparam(req, "arg0");
if (geom_name == NULL) {
gctl_error(req, "Error fetching argument '%s'",
"geom_name (arg0)");
return;
}
sc = virstor_find_geom(cp, geom_name);
if (sc == NULL) {
gctl_error(req, "Don't know anything about '%s'", geom_name);
return;
}
if (virstor_valid_components(sc) != sc->n_components) {
LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete "
"virstor %s", sc->geom->name);
gctl_error(req, "Virstor %s is incomplete", sc->geom->name);
return;
}
removed = 0;
for (i = 1; i < *nargs; i++) {
char param[8];
const char *prov_name;
int j, found;
struct g_virstor_component *newcomp, *compbak;
snprintf(param, sizeof(param), "arg%d", i);
prov_name = gctl_get_asciiparam(req, param);
if (prov_name == NULL) {
gctl_error(req, "Error fetching argument '%s'", param);
return;
}
if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
prov_name += sizeof(_PATH_DEV) - 1;
found = -1;
for (j = 0; j < sc->n_components; j++) {
if (strcmp(sc->components[j].gcons->provider->name,
prov_name) == 0) {
found = j;
break;
}
}
if (found == -1) {
LOG_MSG(LVL_ERROR, "No %s component in %s",
prov_name, sc->geom->name);
continue;
}
compbak = sc->components;
newcomp = malloc(sc->n_components * sizeof(*sc->components),
M_GVIRSTOR, M_WAITOK | M_ZERO);
bcopy(sc->components, newcomp, found * sizeof(*sc->components));
bcopy(&sc->components[found + 1], newcomp + found,
found * sizeof(*sc->components));
if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) {
LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be "
"removed from %s",
prov_name, sc->geom->name);
free(newcomp, M_GVIRSTOR);
/* We'll consider this non-fatal error */
continue;
}
/* Renumerate unallocated components */
for (j = 0; j < sc->n_components-1; j++) {
if ((sc->components[j].flags &
VIRSTOR_PROVIDER_ALLOCATED) == 0) {
sc->components[j].index = j;
}
}
/* This is the critical section. If a component allocation
* event happens while both variables are not yet set,
* there will be trouble. Something will panic on encountering
* NULL sc->components[x].gcomp member.
* Luckily, component allocation happens very rarely and
* removing components is an abnormal action in any case. */
sc->components = newcomp;
sc->n_components--;
/* End critical section */
g_topology_lock();
if (clear_metadata(&compbak[found]) != 0) {
LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear "
"metadata on %s", prov_name);
}
g_detach(compbak[found].gcons);
g_destroy_consumer(compbak[found].gcons);
g_topology_unlock();
free(compbak, M_GVIRSTOR);
removed++;
}
/* This call to update_metadata() is critical. In case there's a
* power failure in the middle of it and some components are updated
* while others are not, there will be trouble on next .taste() iff
* a non-updated component is detected first */
g_topology_lock();
update_metadata(sc);
g_topology_unlock();
LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed,
sc->geom->name);
}
/*
* Clear metadata sector on component
*/
static int
clear_metadata(struct g_virstor_component *comp)
{
char *buf;
int error;
LOG_MSG(LVL_INFO, "Clearing metadata on %s",
comp->gcons->provider->name);
g_topology_assert();
error = g_access(comp->gcons, 0, 1, 0);
if (error != 0)
return (error);
buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR,
M_WAITOK | M_ZERO);
error = g_write_data(comp->gcons,
comp->gcons->provider->mediasize -
comp->gcons->provider->sectorsize,
buf,
comp->gcons->provider->sectorsize);
free(buf, M_GVIRSTOR);
g_access(comp->gcons, 0, -1, 0);
return (error);
}
/*
* Destroy geom forcibly.
*/
static int
g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp,
struct g_geom *gp)
{
struct g_virstor_softc *sc;
int exitval;
sc = gp->softc;
KASSERT(sc != NULL, ("%s: NULL sc", __func__));
exitval = 0;
LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name,
gp->softc);
if (sc != NULL) {
#ifdef INVARIANTS
char *buf;
int error;
off_t off;
int isclean, count;
int n;
LOG_MSG(LVL_INFO, "INVARIANTS detected");
LOG_MSG(LVL_INFO, "Verifying allocation "
"table for %s", sc->geom->name);
count = 0;
for (n = 0; n < sc->chunk_count; n++) {
if (sc->map[n].flags || VIRSTOR_MAP_ALLOCATED != 0)
count++;
}
LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks",
sc->geom->name, count);
n = off = count = 0;
isclean = 1;
if (virstor_valid_components(sc) != sc->n_components) {
/* This is an incomplete virstor device (not all
* components have been found) */
LOG_MSG(LVL_ERROR, "Device %s is incomplete",
sc->geom->name);
goto bailout;
}
error = g_access(sc->components[0].gcons, 1, 0, 0);
KASSERT(error == 0, ("%s: g_access failed (%d)", __func__,
error));
/* Compare the whole on-disk allocation table with what's
* currently in memory */
while (n < sc->chunk_count) {
buf = g_read_data(sc->components[0].gcons, off,
sc->sectorsize, &error);
KASSERT(buf != NULL, ("g_read_data returned NULL (%d) "
"for read at %jd", error, off));
if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) {
LOG_MSG(LVL_ERROR, "ERROR in allocation table, "
"entry %d, offset %jd", n, off);
isclean = 0;
count++;
}
n += sc->me_per_sector;
off += sc->sectorsize;
g_free(buf);
}
error = g_access(sc->components[0].gcons, -1, 0, 0);
KASSERT(error == 0, ("%s: g_access failed (%d) on exit",
__func__, error));
if (isclean != 1) {
LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s "
"(%d sectors don't match, max %zu allocations)",
sc->geom->name, count,
count * sc->me_per_sector);
} else {
LOG_MSG(LVL_INFO, "Allocation table ok for %s",
sc->geom->name);
}
bailout:
#endif
update_metadata(sc);
virstor_geom_destroy(sc, FALSE, FALSE);
exitval = EAGAIN;
} else
exitval = 0;
return (exitval);
}
/*
* Taste event (per-class callback)
* Examines a provider and creates geom instances if needed
*/
static struct g_geom *
g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags)
{
struct g_virstor_metadata md;
struct g_geom *gp;
struct g_consumer *cp;
struct g_virstor_softc *sc;
int error;
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
g_topology_assert();
LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name);
/* We need a dummy geom to attach a consumer to the given provider */
gp = g_new_geomf(mp, "virstor:taste.helper");
gp->start = (void *)invalid_call; /* XXX: hacked up so the */
gp->access = (void *)invalid_call; /* compiler doesn't complain. */
gp->orphan = (void *)invalid_call; /* I really want these to fail. */
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error == 0) {
error = read_metadata(cp, &md);
g_detach(cp);
}
g_destroy_consumer(cp);
g_destroy_geom(gp);
if (error != 0)
return (NULL);
if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0)
return (NULL);
if (md.md_version != G_VIRSTOR_VERSION) {
LOG_MSG(LVL_ERROR, "Kernel module version invalid "
"to handle %s (%s) : %d should be %d",
md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION);
return (NULL);
}
if (md.provsize != pp->mediasize)
return (NULL);
/* If the provider name is hardcoded, use the offered provider only
* if it's been offered with its proper name (the one used in
* the label command). */
if (md.provider[0] != '\0' &&
!g_compare_names(md.provider, pp->name))
return (NULL);
/* Iterate all geoms this class already knows about to see if a new
* geom instance of this class needs to be created (in case the provider
* is first from a (possibly) multi-consumer geom) or it just needs
* to be added to an existing instance. */
sc = NULL;
gp = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc == NULL)
continue;
if (strcmp(md.md_name, sc->geom->name) != 0)
continue;
if (md.md_id != sc->id)
continue;
break;
}
if (gp != NULL) { /* We found an existing geom instance; add to it */
LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name);
error = add_provider_to_geom(sc, pp, &md);
if (error != 0) {
LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
pp->name, md.md_name, error);
return (NULL);
}
} else { /* New geom instance needs to be created */
gp = create_virstor_geom(mp, &md);
if (gp == NULL) {
LOG_MSG(LVL_ERROR, "Error creating new instance of "
"class %s: %s", mp->name, md.md_name);
LOG_MSG(LVL_DEBUG, "Error creating %s at %s",
md.md_name, pp->name);
return (NULL);
}
sc = gp->softc;
LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name,
md.md_name);
error = add_provider_to_geom(sc, pp, &md);
if (error != 0) {
LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
pp->name, md.md_name, error);
virstor_geom_destroy(sc, TRUE, FALSE);
return (NULL);
}
}
return (gp);
}
/*
* Destroys the consumer passed to it in the arguments. Used as a callback
* on the g_event queue.
*/
static void
delay_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *c = arg;
KASSERT(c != NULL, ("%s: invalid consumer", __func__));
LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay",
c->provider->name);
g_detach(c);
g_destroy_consumer(c);
}
/*
* Remove a component (consumer) from the geom instance; if it's the first
* component being removed, orphan the provider to announce that the geom
* is being dismantled.
*/
static void
remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp,
boolean_t delay)
{
struct g_consumer *c;
KASSERT(comp->gcons != NULL, ("Component with no consumer in %s",
sc->geom->name));
c = comp->gcons;
comp->gcons = NULL;
KASSERT(c->provider != NULL, ("%s: no provider", __func__));
LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name,
sc->geom->name);
if (sc->provider != NULL) {
LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name);
g_wither_provider(sc->provider, ENXIO);
sc->provider = NULL;
}
if (c->acr > 0 || c->acw > 0 || c->ace > 0)
return;
if (delay) {
/* Destroy consumer after it's tasted */
g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL);
} else {
g_detach(c);
g_destroy_consumer(c);
}
}
/*
* Destroy geom - called internally
* See g_virstor_destroy_geom for the other one
*/
static int
virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force,
boolean_t delay)
{
struct g_provider *pp;
struct g_geom *gp;
u_int n;
g_topology_assert();
if (sc == NULL)
return (ENXIO);
pp = sc->provider;
if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
LOG_MSG(force ? LVL_WARNING : LVL_ERROR,
"Device %s is still open.", pp->name);
if (!force)
return (EBUSY);
}
for (n = 0; n < sc->n_components; n++) {
if (sc->components[n].gcons != NULL)
remove_component(sc, &sc->components[n], delay);
}
gp = sc->geom;
gp->softc = NULL;
KASSERT(sc->provider == NULL, ("Provider still exists for %s",
gp->name));
/* XXX: This might or might not work, since we're called with
* the topology lock held. Also, it might panic the kernel if
* the error'd BIO is in softupdates code. */
mtx_lock(&sc->delayed_bio_q_mtx);
while (!STAILQ_EMPTY(&sc->delayed_bio_q)) {
struct g_virstor_bio_q *bq;
bq = STAILQ_FIRST(&sc->delayed_bio_q);
bq->bio->bio_error = ENOSPC;
g_io_deliver(bq->bio, EIO);
STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
free(bq, M_GVIRSTOR);
}
mtx_unlock(&sc->delayed_bio_q_mtx);
mtx_destroy(&sc->delayed_bio_q_mtx);
free(sc->map, M_GVIRSTOR);
free(sc->components, M_GVIRSTOR);
bzero(sc, sizeof *sc);
free(sc, M_GVIRSTOR);
pp = LIST_FIRST(&gp->provider); /* We only offer one provider */
if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name);
g_wither_geom(gp, ENXIO);
return (0);
}
/*
* Utility function: read metadata & decode. Wants topology lock to be
* held.
*/
static int
read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
{
struct g_provider *pp;
char *buf;
int error;
g_topology_assert();
error = g_access(cp, 1, 0, 0);
if (error != 0)
return (error);
pp = cp->provider;
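/* The on-disk metadata occupies the provider's last sector; the topology
* lock is dropped around the actual media read, since g_read_data() may
* sleep. */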
g_topology_unlock();
buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
&error);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (buf == NULL)
return (error);
virstor_metadata_decode(buf, md);
g_free(buf);
return (0);
}
/**
* Utility function: encode & write metadata. Assumes topology lock is
* held.
*
* There is no useful way of recovering from errors in this function
* short of panicking the kernel. If the metadata cannot be written,
* the most we can do is notify the operator and hope they spot it and
* replace the broken drive.
*/
static void
write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
{
struct g_provider *pp;
char *buf;
int error;
KASSERT(cp != NULL && md != NULL && cp->provider != NULL,
("Something's fishy in %s", __func__));
LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name);
g_topology_assert();
error = g_access(cp, 0, 1, 0);
if (error != 0) {
LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d",
cp->provider->name, error);
return;
}
pp = cp->provider;
buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK);
bzero(buf, pp->sectorsize);
virstor_metadata_encode(md, buf);
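/* As in read_metadata(), the metadata sector is the provider's last
* sector; drop the topology lock around the media write. */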
g_topology_unlock();
error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf,
pp->sectorsize);
g_topology_lock();
g_access(cp, 0, -1, 0);
free(buf, M_GVIRSTOR);
if (error != 0)
LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s",
error, cp->provider->name);
}
/*
* Create a new instance of this GEOM class and initialise its softc
*/
static struct g_geom *
create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md)
{
struct g_geom *gp;
struct g_virstor_softc *sc;
LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)",
md->md_name, md->md_id);
if (md->md_count < 1 || md->md_chunk_size < 1 ||
md->md_virsize < md->md_chunk_size) {
/* This is a bogus configuration, and probably means the data is
* somehow corrupted. Panic, maybe? */
LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s",
md->md_name);
return (NULL);
}
/* Check if it's already created */
LIST_FOREACH(gp, &mp->geom, geom) {
sc = gp->softc;
if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) {
LOG_MSG(LVL_WARNING, "Geom %s already exists",
md->md_name);
if (sc->id != md->md_id) {
LOG_MSG(LVL_ERROR,
"Some stale or invalid components "
"exist for virstor device named %s. "
"You will need to <CLEAR> all stale "
"components and maybe reconfigure "
"the virstor device. Tune "
"kern.geom.virstor.debug sysctl up "
"for more information.",
sc->geom->name);
}
return (NULL);
}
}
gp = g_new_geomf(mp, "%s", md->md_name);
gp->softc = NULL; /* to circumvent races that test softc */
gp->start = g_virstor_start;
gp->spoiled = g_virstor_orphan;
gp->orphan = g_virstor_orphan;
gp->access = g_virstor_access;
gp->dumpconf = g_virstor_dumpconf;
sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO);
sc->id = md->md_id;
sc->n_components = md->md_count;
sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count,
M_GVIRSTOR, M_WAITOK | M_ZERO);
sc->chunk_size = md->md_chunk_size;
sc->virsize = md->md_virsize;
STAILQ_INIT(&sc->delayed_bio_q);
mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx",
"gvirstor", MTX_DEF | MTX_RECURSE);
sc->geom = gp;
sc->provider = NULL; /* virstor_check_and_run will create it */
gp->softc = sc;
LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name);
return (gp);
}
/*
* Add provider to a GEOM class instance
*/
static int
add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp,
struct g_virstor_metadata *md)
{
struct g_virstor_component *component;
struct g_consumer *cp, *fcp;
struct g_geom *gp;
int error;
if (md->no >= sc->n_components)
return (EINVAL);
/* "Current" compontent */
component = &(sc->components[md->no]);
if (component->gcons != NULL)
return (EEXIST);
gp = sc->geom;
fcp = LIST_FIRST(&gp->consumer);
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
g_destroy_consumer(cp);
return (error);
}
if (fcp != NULL) {
if (fcp->provider->sectorsize != pp->sectorsize) {
/* TODO: this can be made to work */
LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid "
"sector size (%d)", pp->name, sc->geom->name,
pp->sectorsize);
return (EINVAL);
}
if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) {
/* Replicate access permissions from first "live" consumer
* to the new one */
error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
}
}
/* Bring up a new component */
cp->private = component;
component->gcons = cp;
component->sc = sc;
component->index = md->no;
component->chunk_count = md->chunk_count;
component->chunk_next = md->chunk_next;
component->chunk_reserved = md->chunk_reserved;
component->flags = md->flags;
LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name);
virstor_check_and_run(sc);
return (0);
}
/*
* Check if everything's ready to create the geom provider & device entry;
* if so, create and start the provider.
* Called ultimately by .taste, from the g_event thread.
*/
static void
virstor_check_and_run(struct g_virstor_softc *sc)
{
off_t off;
size_t n, count;
int index;
int error;
if (virstor_valid_components(sc) != sc->n_components)
return;
if (virstor_valid_components(sc) == 0) {
/* This is actually a candidate for panic() */
LOG_MSG(LVL_ERROR, "No valid components for %s?",
sc->provider->name);
return;
}
sc->sectorsize = sc->components[0].gcons->provider->sectorsize;
/* Initialise allocation map from the first consumer */
sc->chunk_count = sc->virsize / sc->chunk_size;
if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) {
LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes",
sc->provider->name,
sc->chunk_count * (off_t)sc->chunk_size);
}
sc->map_size = sc->chunk_count * sizeof *(sc->map);
/* The following allocation is on the order of 4 MB - 8 MB */
sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK);
KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s",
__func__, sc->map_size, sc->provider->name));
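/* Illustrative sizing (assumed figures): with a 4 TB virtual size and
* 4 MB chunks, chunk_count is about 1M and the 8-byte map entries take
* roughly 8 MB, which is where the estimate above comes from. */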
sc->map_sectors = sc->map_size / sc->sectorsize;
count = 0;
for (n = 0; n < sc->n_components; n++)
count += sc->components[n].chunk_count;
LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual "
"(%zu KB chunks)",
sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024);
error = g_access(sc->components[0].gcons, 1, 0, 0);
if (error != 0) {
LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to "
"read allocation map for %s",
sc->components[0].gcons->provider->name,
sc->geom->name);
return;
}
/* Read in the allocation map */
LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name,
sc->components[0].gcons->provider->name);
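/* The map occupies the first map_size bytes of the first component and
* is read in maxphys-sized, sector-aligned pieces. */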
off = count = n = 0;
while (count < sc->map_size) {
struct g_virstor_map_entry *mapbuf;
size_t bs;
- bs = MIN(MAXPHYS, sc->map_size - count);
+ bs = MIN(maxphys, sc->map_size - count);
if (bs % sc->sectorsize != 0) {
/* Check for alignment errors */
bs = rounddown(bs, sc->sectorsize);
if (bs == 0)
break;
LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned "
"for %s on %s", sc->geom->name,
sc->components[0].gcons->provider->name);
}
mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error);
if (mapbuf == NULL) {
free(sc->map, M_GVIRSTOR);
LOG_MSG(LVL_ERROR, "Error reading allocation map "
"for %s from %s (offset %ju) (error %d)",
sc->geom->name,
sc->components[0].gcons->provider->name,
off, error);
return;
}
bcopy(mapbuf, &sc->map[n], bs);
off += bs;
count += bs;
n += bs / sizeof *(sc->map);
g_free(mapbuf);
}
g_access(sc->components[0].gcons, -1, 0, 0);
LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name);
/* find first component with allocatable chunks */
index = -1;
for (n = 0; n < sc->n_components; n++) {
if (sc->components[n].chunk_next <
sc->components[n].chunk_count) {
index = n;
break;
}
}
if (index == -1)
/* not found? set it to the last component and handle it
* later */
index = sc->n_components - 1;
if (index >= sc->n_components - g_virstor_component_watermark - 1) {
LOG_MSG(LVL_WARNING, "Device %s running out of components "
"(%d/%u: %s)", sc->geom->name,
index+1,
sc->n_components,
sc->components[index].gcons->provider->name);
}
sc->curr_component = index;
if (sc->components[index].chunk_next >=
sc->components[index].chunk_count - g_virstor_chunk_watermark) {
LOG_MSG(LVL_WARNING,
"Component %s of %s is running out of free space "
"(%u chunks left)",
sc->components[index].gcons->provider->name,
sc->geom->name, sc->components[index].chunk_count -
sc->components[index].chunk_next);
}
sc->me_per_sector = sc->sectorsize / sizeof *(sc->map);
if (sc->sectorsize % sizeof *(sc->map) != 0) {
LOG_MSG(LVL_ERROR,
"%s: Map entries don't fit exactly in a sector (%s)",
__func__, sc->geom->name);
return;
}
/* Recalculate allocated chunks in components & at the same time
* verify map data is sane. We could trust metadata on this, but
* we want to make sure. */
for (n = 0; n < sc->n_components; n++)
sc->components[n].chunk_next = sc->components[n].chunk_reserved;
for (n = 0; n < sc->chunk_count; n++) {
if (sc->map[n].provider_no >= sc->n_components ||
sc->map[n].provider_chunk >=
sc->components[sc->map[n].provider_no].chunk_count) {
LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s",
__func__, (u_int)n, sc->geom->name);
LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u"
" provider_chunk: %u, chunk_count: %u", __func__,
sc->map[n].provider_no, sc->n_components,
sc->map[n].provider_chunk,
sc->components[sc->map[n].provider_no].chunk_count);
return;
}
if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED)
sc->components[sc->map[n].provider_no].chunk_next++;
}
sc->provider = g_new_providerf(sc->geom, "virstor/%s",
sc->geom->name);
sc->provider->sectorsize = sc->sectorsize;
sc->provider->mediasize = sc->virsize;
g_error_provider(sc->provider, 0);
LOG_MSG(LVL_INFO, "%s activated", sc->provider->name);
LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting "
"chunk %u", sc->provider->name, sc->curr_component,
sc->components[sc->curr_component].chunk_next);
}
/*
* Returns the count of active (attached) components in this geom instance
*/
static u_int
virstor_valid_components(struct g_virstor_softc *sc)
{
unsigned int nc, i;
nc = 0;
KASSERT(sc != NULL, ("%s: softc is NULL", __func__));
KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__));
for (i = 0; i < sc->n_components; i++)
if (sc->components[i].gcons != NULL)
nc++;
return (nc);
}
/*
* Called when one of the geom's consumers gets orphaned
*/
static void
g_virstor_orphan(struct g_consumer *cp)
{
struct g_virstor_softc *sc;
struct g_virstor_component *comp;
struct g_geom *gp;
g_topology_assert();
gp = cp->geom;
sc = gp->softc;
if (sc == NULL)
return;
comp = cp->private;
KASSERT(comp != NULL, ("%s: No component in private part of consumer",
__func__));
remove_component(sc, comp, FALSE);
if (LIST_EMPTY(&gp->consumer))
virstor_geom_destroy(sc, TRUE, FALSE);
}
/*
* Called to notify the geom when it is opened, and with what intent
*/
static int
g_virstor_access(struct g_provider *pp, int dr, int dw, int de)
{
struct g_consumer *c, *c2, *tmp;
struct g_virstor_softc *sc;
struct g_geom *gp;
int error;
KASSERT(pp != NULL, ("%s: NULL provider", __func__));
gp = pp->geom;
KASSERT(gp != NULL, ("%s: NULL geom", __func__));
sc = gp->softc;
/* Grab an exclusive bit to propagate on our consumers on first open */
if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
de++;
/* ... drop it on close */
if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) {
de--;
if (sc != NULL)
update_metadata(sc);
}
error = ENXIO;
LIST_FOREACH_SAFE(c, &gp->consumer, consumer, tmp) {
error = g_access(c, dr, dw, de);
if (error != 0)
goto fail;
if (c->acr == 0 && c->acw == 0 && c->ace == 0 &&
c->flags & G_CF_ORPHAN) {
g_detach(c);
g_destroy_consumer(c);
}
}
if (sc != NULL && LIST_EMPTY(&gp->consumer))
virstor_geom_destroy(sc, TRUE, FALSE);
return (error);
fail:
/* Backout earlier changes */
LIST_FOREACH(c2, &gp->consumer, consumer) {
if (c2 == c)
break;
g_access(c2, -dr, -dw, -de);
}
return (error);
}
/*
* Generate XML dump of current state
*/
static void
g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_virstor_softc *sc;
g_topology_assert();
sc = gp->softc;
if (sc == NULL || pp != NULL)
return;
if (cp != NULL) {
/* For each component */
struct g_virstor_component *comp;
comp = cp->private;
if (comp == NULL)
return;
sbuf_printf(sb, "%s<ComponentIndex>%u</ComponentIndex>\n",
indent, comp->index);
sbuf_printf(sb, "%s<ChunkCount>%u</ChunkCount>\n",
indent, comp->chunk_count);
sbuf_printf(sb, "%s<ChunksUsed>%u</ChunksUsed>\n",
indent, comp->chunk_next);
sbuf_printf(sb, "%s<ChunksReserved>%u</ChunksReserved>\n",
indent, comp->chunk_reserved);
sbuf_printf(sb, "%s<StorageFree>%u%%</StorageFree>\n",
indent,
comp->chunk_next > 0 ? 100 -
((comp->chunk_next + comp->chunk_reserved) * 100) /
comp->chunk_count : 100);
} else {
/* For the whole thing */
u_int count, used, i;
off_t size;
count = used = size = 0;
for (i = 0; i < sc->n_components; i++) {
if (sc->components[i].gcons != NULL) {
count += sc->components[i].chunk_count;
used += sc->components[i].chunk_next +
sc->components[i].chunk_reserved;
size += sc->components[i].gcons->
provider->mediasize;
}
}
sbuf_printf(sb, "%s<Status>"
"Components=%u, Online=%u</Status>\n", indent,
sc->n_components, virstor_valid_components(sc));
sbuf_printf(sb, "%s<State>%u%% physical free</State>\n",
indent, 100-(used * 100) / count);
sbuf_printf(sb, "%s<ChunkSize>%zu</ChunkSize>\n", indent,
sc->chunk_size);
sbuf_printf(sb, "%s<PhysicalFree>%u%%</PhysicalFree>\n",
indent, used > 0 ? 100 - (used * 100) / count : 100);
sbuf_printf(sb, "%s<ChunkPhysicalCount>%u</ChunkPhysicalCount>\n",
indent, count);
sbuf_printf(sb, "%s<ChunkVirtualCount>%zu</ChunkVirtualCount>\n",
indent, sc->chunk_count);
sbuf_printf(sb, "%s<PhysicalBacking>%zu%%</PhysicalBacking>\n",
indent,
(count * 100) / sc->chunk_count);
sbuf_printf(sb, "%s<PhysicalBackingSize>%jd</PhysicalBackingSize>\n",
indent, size);
sbuf_printf(sb, "%s<VirtualSize>%jd</VirtualSize>\n", indent,
sc->virsize);
}
}
/*
* GEOM .done handler
* Can't use standard handler because one requested IO may
* fork into additional data IOs
*/
static void
g_virstor_done(struct bio *b)
{
struct g_virstor_softc *sc;
struct bio *parent_b;
parent_b = b->bio_parent;
sc = parent_b->bio_to->geom->softc;
if (b->bio_error != 0) {
LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s",
b->bio_error, b->bio_offset, b->bio_length,
b->bio_to->name);
if (parent_b->bio_error == 0)
parent_b->bio_error = b->bio_error;
}
parent_b->bio_inbed++;
parent_b->bio_completed += b->bio_completed;
if (parent_b->bio_children == parent_b->bio_inbed) {
parent_b->bio_completed = parent_b->bio_length;
g_io_deliver(parent_b, parent_b->bio_error);
}
g_destroy_bio(b);
}
/*
* I/O starts here
* Called in g_down thread
*/
static void
g_virstor_start(struct bio *b)
{
struct g_virstor_softc *sc;
struct g_virstor_component *comp;
struct bio *cb;
struct g_provider *pp;
char *addr;
off_t offset, length;
struct bio_queue_head bq;
size_t chunk_size; /* cached for convenience */
u_int count;
pp = b->bio_to;
sc = pp->geom->softc;
KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__,
b->bio_to->error, b->bio_to->name));
LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__);
switch (b->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
break;
default:
g_io_deliver(b, EOPNOTSUPP);
return;
}
LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length);
bioq_init(&bq);
chunk_size = sc->chunk_size;
addr = b->bio_data;
offset = b->bio_offset; /* virtual offset and length */
length = b->bio_length;
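/* Split the request on chunk boundaries; each piece is mapped through
* sc->map and is either zero-filled (read from an unallocated chunk),
* cloned to its backing component, or, for writes to unallocated
* chunks, triggers allocation of a physical chunk first. */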
while (length > 0) {
size_t chunk_index, in_chunk_offset, in_chunk_length;
struct virstor_map_entry *me;
chunk_index = offset / chunk_size; /* round downwards */
in_chunk_offset = offset % chunk_size;
in_chunk_length = min(length, chunk_size - in_chunk_offset);
LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)",
b->bio_cmd == BIO_READ ? "R" : "W",
offset, length,
chunk_index, in_chunk_offset, in_chunk_length);
me = &sc->map[chunk_index];
if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) {
if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
/* Reads from unallocated chunks return zeroed
* buffers */
if (b->bio_cmd == BIO_READ)
bzero(addr, in_chunk_length);
} else {
comp = &sc->components[me->provider_no];
cb = g_clone_bio(b);
if (cb == NULL) {
bioq_dismantle(&bq);
if (b->bio_error == 0)
b->bio_error = ENOMEM;
g_io_deliver(b, b->bio_error);
return;
}
cb->bio_to = comp->gcons->provider;
cb->bio_done = g_virstor_done;
cb->bio_offset =
(off_t)me->provider_chunk * (off_t)chunk_size
+ in_chunk_offset;
cb->bio_length = in_chunk_length;
cb->bio_data = addr;
cb->bio_caller1 = comp;
bioq_disksort(&bq, cb);
}
} else { /* handle BIO_WRITE */
KASSERT(b->bio_cmd == BIO_WRITE,
("%s: Unknown command %d", __func__,
b->bio_cmd));
if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
/* We have a virtual chunk, represented by
* the "me" entry, but it's not yet allocated
* (tied to a physical chunk), so do that now. */
struct virstor_map_entry *data_me;
u_int phys_chunk, comp_no;
off_t s_offset;
int error;
error = allocate_chunk(sc, &comp, &comp_no,
&phys_chunk);
if (error != 0) {
/* We cannot allocate a physical chunk
* to satisfy this request, so we'll
* delay it until we can...
* XXX: this will prevent the fs from
* being umounted! */
struct g_virstor_bio_q *biq;
biq = malloc(sizeof *biq, M_GVIRSTOR,
M_NOWAIT);
if (biq == NULL) {
bioq_dismantle(&bq);
if (b->bio_error == 0)
b->bio_error = ENOMEM;
g_io_deliver(b, b->bio_error);
return;
}
biq->bio = b;
mtx_lock(&sc->delayed_bio_q_mtx);
STAILQ_INSERT_TAIL(&sc->delayed_bio_q,
biq, linkage);
mtx_unlock(&sc->delayed_bio_q_mtx);
LOG_MSG(LVL_WARNING, "Delaying BIO "
"(size=%ju) until free physical "
"space can be found on %s",
b->bio_length,
sc->provider->name);
return;
}
LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s "
"for %s",
phys_chunk,
comp->gcons->provider->name,
sc->provider->name);
me->provider_no = comp_no;
me->provider_chunk = phys_chunk;
me->flags |= VIRSTOR_MAP_ALLOCATED;
cb = g_clone_bio(b);
if (cb == NULL) {
me->flags &= ~VIRSTOR_MAP_ALLOCATED;
me->provider_no = 0;
me->provider_chunk = 0;
bioq_dismantle(&bq);
if (b->bio_error == 0)
b->bio_error = ENOMEM;
g_io_deliver(b, b->bio_error);
return;
}
/* The allocation table is stored contiguously
* at the start of the drive. We need to
* calculate the offset of the sector that holds
* this map entry both on the drive and in the
* map array.
* s_offset will end up pointing to the drive
* sector. */
s_offset = chunk_index * sizeof *me;
s_offset = rounddown(s_offset, sc->sectorsize);
/* data_me points to map entry sector
* in memory (analogous to s_offset) */
data_me = &sc->map[rounddown(chunk_index,
sc->me_per_sector)];
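/* Example (assumed geometry): with 512-byte sectors
* and 8-byte map entries, me_per_sector is 64, so
* chunk_index 1000 yields s_offset 1000*8 = 8000,
* rounded down to 7680, and data_me = &sc->map[960]. */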
/* Commit sector with map entry to storage */
cb->bio_to = sc->components[0].gcons->provider;
cb->bio_done = g_virstor_done;
cb->bio_offset = s_offset;
cb->bio_data = (char *)data_me;
cb->bio_length = sc->sectorsize;
cb->bio_caller1 = &sc->components[0];
bioq_disksort(&bq, cb);
}
comp = &sc->components[me->provider_no];
cb = g_clone_bio(b);
if (cb == NULL) {
bioq_dismantle(&bq);
if (b->bio_error == 0)
b->bio_error = ENOMEM;
g_io_deliver(b, b->bio_error);
return;
}
/* Finally, handle the data */
cb->bio_to = comp->gcons->provider;
cb->bio_done = g_virstor_done;
cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size +
in_chunk_offset;
cb->bio_length = in_chunk_length;
cb->bio_data = addr;
cb->bio_caller1 = comp;
bioq_disksort(&bq, cb);
}
addr += in_chunk_length;
length -= in_chunk_length;
offset += in_chunk_length;
}
/* Fire off bio's here */
count = 0;
for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
bioq_remove(&bq, cb);
LOG_REQ(LVL_MOREDEBUG, cb, "Firing request");
comp = cb->bio_caller1;
cb->bio_caller1 = NULL;
LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju",
cb->bio_offset, cb->bio_length);
g_io_request(cb, comp->gcons);
count++;
}
if (count == 0) { /* We handled everything locally */
b->bio_completed = b->bio_length;
g_io_deliver(b, 0);
}
}
/*
* Allocate a chunk from a physical provider. Returns physical component,
* chunk index relative to the component and the component's index.
*/
static int
allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp,
u_int *comp_no_p, u_int *chunk)
{
u_int comp_no;
KASSERT(sc->curr_component < sc->n_components,
("%s: Invalid curr_component: %u", __func__, sc->curr_component));
comp_no = sc->curr_component;
*comp = &sc->components[comp_no];
dump_component(*comp);
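/* Chunks are handed out linearly from the current component; when it
* fills up, allocation moves on to the next component. */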
if ((*comp)->chunk_next >= (*comp)->chunk_count) {
/* This component is full. Allocate next component */
if (comp_no >= sc->n_components-1) {
LOG_MSG(LVL_ERROR, "All physical space allocated for %s",
sc->geom->name);
return (-1);
}
(*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT;
sc->curr_component = ++comp_no;
*comp = &sc->components[comp_no];
if (comp_no >= sc->n_components - g_virstor_component_watermark-1)
LOG_MSG(LVL_WARNING, "Device %s running out of components "
"(switching to %u/%u: %s)", sc->geom->name,
comp_no+1, sc->n_components,
(*comp)->gcons->provider->name);
/* Take care not to overwrite reserved chunks */
if ( (*comp)->chunk_reserved > 0 &&
(*comp)->chunk_next < (*comp)->chunk_reserved)
(*comp)->chunk_next = (*comp)->chunk_reserved;
(*comp)->flags |=
VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT;
dump_component(*comp);
*comp_no_p = comp_no;
*chunk = (*comp)->chunk_next++;
} else {
*comp_no_p = comp_no;
*chunk = (*comp)->chunk_next++;
}
return (0);
}
/* Dump a component */
static void
dump_component(struct g_virstor_component *comp)
{
if (g_virstor_debug < LVL_DEBUG2)
return;
printf("Component %d: %s\n", comp->index, comp->gcons->provider->name);
printf(" chunk_count: %u\n", comp->chunk_count);
printf(" chunk_next: %u\n", comp->chunk_next);
printf(" flags: %u\n", comp->flags);
}
#if 0
/* Dump a map entry */
static void
dump_me(struct virstor_map_entry *me, unsigned int nr)
{
if (g_virstor_debug < LVL_DEBUG)
return;
printf("VIRT. CHUNK #%d: ", nr);
if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0)
printf("(unallocated)\n");
else
printf("allocated at provider %u, provider_chunk %u\n",
me->provider_no, me->provider_chunk);
}
#endif
/*
* Dismantle bio_queue and destroy its components
*/
static void
bioq_dismantle(struct bio_queue_head *bq)
{
struct bio *b;
for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) {
bioq_remove(bq, b);
g_destroy_bio(b);
}
}
/*
* The function that shouldn't be called.
* When this is called, the stack is already garbled because of
* argument mismatch. There's nothing to do now but panic, which is
* incidentally the whole purpose of this function.
* Motivation: to guard against accidentally calling geom methods when
* they shouldn't be called. (see g_..._taste)
*/
static void
invalid_call(void)
{
panic("invalid_call() has just been called. Something's fishy here.");
}
DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */
MODULE_VERSION(geom_virstor, 0);
diff --git a/sys/geom/virstor/g_virstor.h b/sys/geom/virstor/g_virstor.h
index 9f2886bafc41..0d918ac70a0c 100644
--- a/sys/geom/virstor/g_virstor.h
+++ b/sys/geom/virstor/g_virstor.h
@@ -1,111 +1,111 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006-2007 Ivan Voras <ivoras@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _G_VIRSTOR_H_
#define _G_VIRSTOR_H_
#define G_VIRSTOR_CLASS_NAME "VIRSTOR"
#define VIRSTOR_MAP_ALLOCATED 1
struct virstor_map_entry {
uint16_t flags;
uint16_t provider_no;
uint32_t provider_chunk;
};
#define VIRSTOR_MAP_ENTRY_SIZE (sizeof(struct virstor_map_entry))
-#define VIRSTOR_MAP_BLOCK_ENTRIES (MAXPHYS / VIRSTOR_MAP_ENTRY_SIZE)
-/* Struct size is guarded by CTASSERT in main source */
+#define VIRSTOR_MAP_BLOCK_ENTRIES (maxphys / VIRSTOR_MAP_ENTRY_SIZE)
+/* Struct size is guarded by MPASS in main source */
#ifdef _KERNEL
#define LOG_MSG(lvl, ...) \
_GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), NULL, __VA_ARGS__)
#define LOG_MESSAGE LOG_MSG
#define LOG_REQ(lvl, bp, ...) \
_GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), (bp), __VA_ARGS__)
#define LOG_REQUEST LOG_REQ
/* "critical" system announcements (e.g. "geom is up") */
#define LVL_ANNOUNCE 0
/* errors */
#define LVL_ERROR 1
/* warnings */
#define LVL_WARNING 2
/* info, noncritical for system operation (user doesn't have to see it) */
#define LVL_INFO 5
/* debug info */
#define LVL_DEBUG 10
/* more debug info */
#define LVL_DEBUG2 12
/* superfluous debug info (large volumes of data) */
#define LVL_MOREDEBUG 15
/* Component data */
struct g_virstor_component {
struct g_consumer *gcons;
struct g_virstor_softc *sc;
unsigned int index; /* Component index in array */
unsigned int chunk_count;
unsigned int chunk_next;
unsigned int chunk_reserved;
unsigned int flags;
};
/* Internal geom instance data */
struct g_virstor_softc {
struct g_geom *geom;
struct g_provider *provider;
struct g_virstor_component *components;
u_int n_components;
u_int curr_component; /* Component currently used */
uint32_t id; /* Unique ID of this geom */
off_t virsize; /* Total size of virstor */
off_t sectorsize;
size_t chunk_size;
size_t chunk_count; /* governs map_size */
struct virstor_map_entry *map;
size_t map_size; /* (in bytes) */
size_t map_sectors; /* Size of map in sectors */
size_t me_per_sector; /* # map entries in a sector */
STAILQ_HEAD(, g_virstor_bio_q) delayed_bio_q; /* Queue of delayed BIOs */
struct mtx delayed_bio_q_mtx;
};
/* "delayed BIOs" Queue element */
struct g_virstor_bio_q {
struct bio *bio;
STAILQ_ENTRY(g_virstor_bio_q) linkage;
};
#endif /* _KERNEL */
#endif /* !_G_VIRSTOR_H_ */
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
index 07db75ae753d..abd04b47023b 100644
--- a/sys/kern/kern_mib.c
+++ b/sys/kern/kern_mib.c
@@ -1,717 +1,738 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
* project, to make these variables more userfriendly.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_posix.h"
#include "opt_config.h"
#include <sys/param.h>
#include <sys/boot.h>
#include <sys/elf.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/unistd.h>
SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Sysctl internal magic");
SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, 0,
"High kernel, proc, limits &c");
SYSCTL_ROOT_NODE(CTL_VM, vm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Virtual memory");
SYSCTL_ROOT_NODE(CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"File system");
SYSCTL_ROOT_NODE(CTL_NET, net, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Network, (see socket.h)");
SYSCTL_ROOT_NODE(CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Debugging");
SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Sizeof various things");
SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"hardware");
SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"machine dependent");
SYSCTL_NODE(_machdep, OID_AUTO, mitigations, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Machine dependent platform mitigations.");
SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"user-level");
SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"p1003_1b, (see p1003_1b.h)");
SYSCTL_ROOT_NODE(OID_AUTO, compat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Compatibility code");
SYSCTL_ROOT_NODE(OID_AUTO, security, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Security");
#ifdef REGRESSION
SYSCTL_ROOT_NODE(OID_AUTO, regression, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Regression test MIB");
#endif
SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE,
kern_ident, 0, "Kernel identifier");
SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, BSD, "Operating system revision");
SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE,
version, 0, "Kernel version");
SYSCTL_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD|CTLFLAG_MPSAFE,
compiler_version, 0, "Version of compiler used to compile kernel");
SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE|
CTLFLAG_CAPRD, ostype, 0, "Operating system type");
SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&maxproc, 0, "Maximum number of processes");
SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
&maxprocperuid, 0, "Maximum processes allowed per userid");
SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&maxusers, 0, "Hint for kernel tuning");
SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, ARG_MAX, "Maximum bytes of argument to execve(2)");
SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, _POSIX_VERSION, "Version of POSIX attempting to comply to");
SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN |
CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &ngroups_max, 0,
"Maximum number of supplemental groups a user can belong to");
SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, 1, "Whether job control is available");
#ifdef _POSIX_SAVED_IDS
SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, 1, "Whether saved set-group/user ID is available");
#else
SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available");
#endif
char kernelname[MAXPATHLEN] = PATH_KERNEL; /* XXX bloat */
SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE,
kernelname, sizeof kernelname, "Name of kernel file booted");
-SYSCTL_INT(_kern, KERN_MAXPHYS, maxphys, CTLFLAG_RD | CTLFLAG_CAPRD,
- SYSCTL_NULL_INT_PTR, MAXPHYS, "Maximum block I/O access size");
+#ifdef COMPAT_FREEBSD12
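+/*
+ * Compatibility shim: kern.maxphys used to be exported as an int, while
+ * maxphys is now a u_long tunable.  If the caller's buffer can hold a
+ * long (or the value does not fit in an int), use the long handler;
+ * otherwise hand the value back as an int for old FreeBSD 12 consumers
+ * that expect an int-sized kern.maxphys.
+ */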
+static int
+sysctl_maxphys(SYSCTL_HANDLER_ARGS)
+{
+ u_long lvalue;
+ int ivalue;
+
+ lvalue = maxphys;
+ if (sizeof(int) == sizeof(u_long) || req->oldlen >= sizeof(u_long))
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ if (lvalue > INT_MAX)
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ ivalue = lvalue;
+ return (sysctl_handle_int(oidp, &ivalue, 0, req));
+}
+SYSCTL_PROC(_kern, KERN_MAXPHYS, maxphys, CTLTYPE_LONG | CTLFLAG_RDTUN |
+ CTLFLAG_NOFETCH | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_maxphys, "UL", "Maximum block I/O access size");
+#else
+SYSCTL_ULONG(_kern, KERN_MAXPHYS, maxphys,
+ CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD,
+ &maxphys, 0, "Maximum block I/O access size");
+#endif
SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD,
&mp_ncpus, 0, "Number of active CPUs");
SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, BYTE_ORDER, "System byte order");
SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD,
SYSCTL_NULL_INT_PTR, PAGE_SIZE, "System memory page size");
static int
sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
{
char buf[256];
size_t len;
len = MIN(req->oldlen, sizeof(buf));
read_random(buf, len);
return (SYSCTL_OUT(req, buf, len));
}
SYSCTL_PROC(_kern, KERN_ARND, arandom,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0,
sysctl_kern_arnd, "", "arc4rand");
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
u_long val, p;
p = SIZE_T_MAX >> PAGE_SHIFT;
if (physmem < p)
p = physmem;
val = ctob(p);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_PHYSMEM, physmem,
CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
sysctl_hw_physmem, "LU",
"Amount of physical memory (in bytes)");
static int
sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
{
u_long val, p;
p = SIZE_T_MAX >> PAGE_SHIFT;
if (realmem < p)
p = realmem;
val = ctob(p);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_REALMEM, realmem,
CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
sysctl_hw_realmem, "LU",
"Amount of memory (in bytes) reported by the firmware");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
u_long val, p, p1;
p1 = physmem - vm_wire_count();
p = SIZE_T_MAX >> PAGE_SHIFT;
if (p1 < p)
p = p1;
val = ctob(p);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_USERMEM, usermem,
CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
sysctl_hw_usermem, "LU",
"Amount of memory (in bytes) which is not wired");
SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0,
"Amount of physical memory (in pages)");
u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
static int
sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
{
int error;
size_t len;
#ifdef SCTL_MASK32
int i;
uint32_t pagesizes32[MAXPAGESIZES];
if (req->flags & SCTL_MASK32) {
/*
* Recreate the "pagesizes" array with 32-bit elements.
* Truncate any page size greater than UINT32_MAX to zero,
* which assumes that page sizes are powers of two.
*/
for (i = 0; i < MAXPAGESIZES; i++)
pagesizes32[i] = (uint32_t)pagesizes[i];
len = sizeof(pagesizes32);
if (len > req->oldlen && req->oldptr != NULL)
len = req->oldlen;
error = SYSCTL_OUT(req, pagesizes32, len);
} else
#endif
{
len = sizeof(pagesizes);
if (len > req->oldlen && req->oldptr != NULL)
len = req->oldlen;
error = SYSCTL_OUT(req, pagesizes, len);
}
return (error);
}
SYSCTL_PROC(_hw, OID_AUTO, pagesizes,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_hw_pagesizes, "S,pagesizes",
"Supported page sizes");
int adaptive_machine_arch = 1;
SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW,
&adaptive_machine_arch, 1,
"Adapt reported machine architecture to the ABI of the binary");
static const char *
proc_machine_arch(struct proc *p)
{
if (p->p_sysent->sv_machine_arch != NULL)
return (p->p_sysent->sv_machine_arch(p));
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(p, SV_ILP32))
return (MACHINE_ARCH32);
#endif
return (MACHINE_ARCH);
}
static int
sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS)
{
const char *machine_arch;
if (adaptive_machine_arch)
machine_arch = proc_machine_arch(curproc);
else
machine_arch = MACHINE_ARCH;
return (SYSCTL_OUT(req, machine_arch, strlen(machine_arch) + 1));
}
SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine_arch, "A",
"System architecture");
#ifndef MACHINE_ARCHES
#ifdef COMPAT_FREEBSD32
#define MACHINE_ARCHES MACHINE_ARCH " " MACHINE_ARCH32
#else
#define MACHINE_ARCHES MACHINE_ARCH
#endif
#endif
SYSCTL_STRING(_kern, OID_AUTO, supported_archs, CTLFLAG_RD | CTLFLAG_MPSAFE,
MACHINE_ARCHES, 0, "Supported architectures for binaries");
static int
sysctl_hostname(SYSCTL_HANDLER_ARGS)
{
struct prison *pr, *cpr;
size_t pr_offset;
char tmpname[MAXHOSTNAMELEN];
int descend, error, len;
/*
* This function can set: hostname domainname hostuuid.
* Keep that in mind when comments say "hostname".
*/
pr_offset = (size_t)arg1;
len = arg2;
KASSERT(len <= sizeof(tmpname),
("length %d too long for %s", len, __func__));
pr = req->td->td_ucred->cr_prison;
if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
return (EPERM);
/*
* Make a local copy of hostname to get/set so we don't have to hold
* the jail mutex during the sysctl copyin/copyout activities.
*/
mtx_lock(&pr->pr_mtx);
bcopy((char *)pr + pr_offset, tmpname, len);
mtx_unlock(&pr->pr_mtx);
error = sysctl_handle_string(oidp, tmpname, len, req);
if (req->newptr != NULL && error == 0) {
/*
* Copy the locally set hostname to all jails that share
* this host info.
*/
sx_slock(&allprison_lock);
while (!(pr->pr_flags & PR_HOST))
pr = pr->pr_parent;
mtx_lock(&pr->pr_mtx);
bcopy(tmpname, (char *)pr + pr_offset, len);
FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
if (cpr->pr_flags & PR_HOST)
descend = 0;
else
bcopy(tmpname, (char *)cpr + pr_offset, len);
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
}
return (error);
}
SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
(void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN,
sysctl_hostname, "A", "Hostname");
SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
(void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN,
sysctl_hostname, "A", "Name of the current YP/NIS domain");
SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
(void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN,
sysctl_hostname, "A", "Host UUID");
static int regression_securelevel_nonmonotonic = 0;
#ifdef REGRESSION
SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
&regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
#endif
static int
sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
{
struct prison *pr, *cpr;
int descend, error, level;
pr = req->td->td_ucred->cr_prison;
/*
* Reading the securelevel is easy, since the current jail's level
* is known to be at least as secure as any higher levels. Perform
* a lockless read since the securelevel is an integer.
*/
level = pr->pr_securelevel;
error = sysctl_handle_int(oidp, &level, 0, req);
if (error || !req->newptr)
return (error);
/* Permit update only if the new securelevel exceeds the old. */
sx_slock(&allprison_lock);
mtx_lock(&pr->pr_mtx);
if (!regression_securelevel_nonmonotonic &&
level < pr->pr_securelevel) {
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
return (EPERM);
}
pr->pr_securelevel = level;
/*
* Set all child jails to be at least this level, but do not lower
* them (even if regression_securelevel_nonmonotonic).
*/
FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) {
if (cpr->pr_securelevel < level)
cpr->pr_securelevel = level;
}
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
return (error);
}
SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0,
sysctl_kern_securelvl, "I",
"Current secure level");
#ifdef INCLUDE_CONFIG_FILE
/* Actual kernel configuration options. */
extern char kernconfstring[];
SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD | CTLFLAG_MPSAFE,
kernconfstring, 0, "Kernel configuration file");
#endif
static int
sysctl_hostid(SYSCTL_HANDLER_ARGS)
{
struct prison *pr, *cpr;
u_long tmpid;
int descend, error;
/*
* Like sysctl_hostname, except it operates on a u_long
* instead of a string, and is used only for hostid.
*/
pr = req->td->td_ucred->cr_prison;
if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
return (EPERM);
tmpid = pr->pr_hostid;
error = sysctl_handle_long(oidp, &tmpid, 0, req);
if (req->newptr != NULL && error == 0) {
sx_slock(&allprison_lock);
while (!(pr->pr_flags & PR_HOST))
pr = pr->pr_parent;
mtx_lock(&pr->pr_mtx);
pr->pr_hostid = tmpid;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
if (cpr->pr_flags & PR_HOST)
descend = 0;
else
cpr->pr_hostid = tmpid;
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
}
return (error);
}
SYSCTL_PROC(_kern, KERN_HOSTID, hostid,
CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
NULL, 0, sysctl_hostid, "LU", "Host ID");
static struct mtx bootid_lk;
MTX_SYSINIT(bootid_lock, &bootid_lk, "bootid generator lock", MTX_DEF);
static int
sysctl_bootid(SYSCTL_HANDLER_ARGS)
{
static uint8_t boot_id[16];
static bool initialized = false;
mtx_lock(&bootid_lk);
if (!initialized) {
if (!is_random_seeded()) {
mtx_unlock(&bootid_lk);
return (ENXIO);
}
arc4random_buf(boot_id, sizeof(boot_id));
initialized = true;
}
mtx_unlock(&bootid_lk);
return (SYSCTL_OUT(req, boot_id, sizeof(boot_id)));
}
SYSCTL_PROC(_kern, OID_AUTO, boot_id,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
NULL, 0, sysctl_bootid, "", "Random boot ID");
/*
* The osrelease string is copied from the global (osrelease in vers.c) into
* prison0 by a sysinit and is inherited by child jails if not changed at jail
* creation, so we always return the copy from the current prison data.
*/
static int
sysctl_osrelease(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
pr = req->td->td_ucred->cr_prison;
return (SYSCTL_OUT(req, pr->pr_osrelease, strlen(pr->pr_osrelease) + 1));
}
SYSCTL_PROC(_kern, KERN_OSRELEASE, osrelease,
CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, sysctl_osrelease, "A", "Operating system release");
/*
* The osreldate number is copied from the global (osreldate in vers.c) into
* prison0 by a sysinit and is inherited by child jails if not changed at jail
* creation, so we always return the value from the current prison data.
*/
static int
sysctl_osreldate(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
pr = req->td->td_ucred->cr_prison;
return (SYSCTL_OUT(req, &pr->pr_osreldate, sizeof(pr->pr_osreldate)));
}
/*
* NOTICE: The *userland* release date is available in
* /usr/include/osreldate.h
*/
SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate,
CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, sysctl_osreldate, "I", "Kernel release date");
/*
* The build-id is copied from the ELF section .note.gnu.build-id. The linker
* script defines two variables to expose the beginning and end. LLVM
* currently uses a SHA-1 hash, but other formats can be supported by checking
* the length of the section.
*/
extern char __build_id_start[];
extern char __build_id_end[];
#define BUILD_ID_HEADER_LEN 0x10
#define BUILD_ID_HASH_MAXLEN 0x14
static int
sysctl_build_id(SYSCTL_HANDLER_ARGS)
{
uintptr_t sectionlen = (uintptr_t)(__build_id_end - __build_id_start);
int hashlen;
char buf[2*BUILD_ID_HASH_MAXLEN+1];
/*
* The ELF note section has a four byte length for the vendor name,
* four byte length for the value, and a four byte vendor specific
* type. The name for the build id is "GNU\0". We skip the first 16
* bytes to read the build hash. We will return the remaining bytes up
* to 20 (SHA-1) hash size. If the hash happens to be a custom number
* of bytes we will pad the value with zeros, as the section should be
* four byte aligned.
*/
if (sectionlen <= BUILD_ID_HEADER_LEN ||
sectionlen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN)) {
return (ENOENT);
}
hashlen = sectionlen - BUILD_ID_HEADER_LEN;
for (int i = 0; i < hashlen; i++) {
uint8_t c = __build_id_start[i+BUILD_ID_HEADER_LEN];
snprintf(&buf[2*i], 3, "%02x", c);
}
return (SYSCTL_OUT(req, buf, strlen(buf) + 1));
}
SYSCTL_PROC(_kern, OID_AUTO, build_id,
CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, sysctl_build_id, "A", "Operating system build-id");
SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"Kernel Features");
#ifdef COMPAT_FREEBSD4
FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
#endif
#ifdef COMPAT_FREEBSD5
FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
#endif
#ifdef COMPAT_FREEBSD6
FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
#endif
#ifdef COMPAT_FREEBSD7
FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
#endif
/*
* This is really cheating. These actually live in the libc, something
* which I'm not quite sure is a good idea anyway, but in order for
* getnext and friends to actually work, we define dummies here.
*
* XXXRW: These probably should be CTLFLAG_CAPRD.
*/
SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
"", 0, "PATH that finds all the standard utilities");
SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Max ibase/obase values in bc(1)");
SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Max array size in bc(1)");
SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Max scale value in bc(1)");
SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Max string length in bc(1)");
SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "");
SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Max length (bytes) of a text-processing utility's input line");
SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Maximum number of repeats of a regexp permitted");
SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0,
"The version of POSIX 1003.2 with which the system attempts to comply");
SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether C development supports the C bindings option");
SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports the C development utilities option");
SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "");
SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN development utilities");
SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN runtime utilities");
SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports creation of locales");
SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports software development utilities");
SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Whether system supports the user portability utilities");
SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of streams a process may have open at one time");
SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of types supported for timezone names");
static char localbase[MAXPATHLEN] = "";
SYSCTL_STRING(_user, USER_LOCALBASE, localbase, CTLFLAG_RWTUN,
localbase, sizeof(localbase), "Prefix used to install and locate add-on packages");
#include <sys/vnode.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct vnode), "sizeof(struct vnode)");
SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct proc), "sizeof(struct proc)");
static int
sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS)
{
int error, pm;
pm = pid_max;
error = sysctl_handle_int(oidp, &pm, 0, req);
if (error || !req->newptr)
return (error);
sx_xlock(&proctree_lock);
sx_xlock(&allproc_lock);
/*
* Only permit values less than PID_MAX.
* As a safety measure, do not allow pid_max to be limited too much.
*/
if (pm < 300 || pm > PID_MAX)
error = EINVAL;
else
pid_max = pm;
sx_xunlock(&allproc_lock);
sx_xunlock(&proctree_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT |
CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
0, 0, sysctl_kern_pid_max, "I", "Maximum allowed pid");
#include <sys/bio.h>
#include <sys/buf.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct bio), "sizeof(struct bio)");
SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct buf), "sizeof(struct buf)");
#include <sys/user.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
/* Used by kernel debuggers. */
const int pcb_size = sizeof(struct pcb);
SYSCTL_INT(_debug_sizeof, OID_AUTO, pcb, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof(struct pcb), "sizeof(struct pcb)");
/* XXX compatibility, remove for 6.0 */
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
&__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
"compatibility for kern.fallback_elf_brand");
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index 6e7ceeb11e0c..2b0f4d14b41e 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -1,230 +1,231 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/uio.h>
#include <geom/geom.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
int
physio(struct cdev *dev, struct uio *uio, int ioflag)
{
struct cdevsw *csw;
struct buf *pbuf;
struct bio *bp;
struct vm_page **pages;
char *base, *sa;
u_int iolen, poff;
int error, i, npages, maxpages;
vm_prot_t prot;
csw = dev->si_devsw;
npages = 0;
sa = NULL;
/* check if character device is being destroyed */
if (csw == NULL)
return (ENXIO);
/* XXX: sanity check */
if(dev->si_iosize_max < PAGE_SIZE) {
printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n",
devtoname(dev), dev->si_iosize_max);
dev->si_iosize_max = DFLTPHYS;
}
/*
* If the driver does not want I/O to be split, that means that we
* need to reject any requests that will not fit into one buffer.
*/
if (dev->si_flags & SI_NOSPLIT &&
- (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS ||
+ (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > maxphys ||
uio->uio_iovcnt > 1)) {
/*
* Tell the user why his I/O was rejected.
*/
if (uio->uio_resid > dev->si_iosize_max)
uprintf("%s: request size=%zd > si_iosize_max=%d; "
"cannot split request\n", devtoname(dev),
uio->uio_resid, dev->si_iosize_max);
- if (uio->uio_resid > MAXPHYS)
- uprintf("%s: request size=%zd > MAXPHYS=%d; "
+ if (uio->uio_resid > maxphys)
+ uprintf("%s: request size=%zd > maxphys=%lu; "
"cannot split request\n", devtoname(dev),
- uio->uio_resid, MAXPHYS);
+ uio->uio_resid, maxphys);
if (uio->uio_iovcnt > 1)
uprintf("%s: request vectors=%d > 1; "
"cannot split request\n", devtoname(dev),
uio->uio_iovcnt);
return (EFBIG);
}
/*
* Keep the process UPAGES from being swapped out. A process swapped
* out while holding pbufs, which are also used by the swapper, may
* deadlock.
*/
PHOLD(curproc);
bp = g_alloc_bio();
if (uio->uio_segflg != UIO_USERSPACE) {
pbuf = NULL;
pages = NULL;
} else if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
pbuf = NULL;
- maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1;
+ maxpages = btoc(MIN(uio->uio_resid, maxphys)) + 1;
pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK);
} else {
pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
+ MPASS((pbuf->b_flags & B_MAXPHYS) != 0);
sa = pbuf->b_data;
- maxpages = btoc(MAXPHYS);
+ maxpages = btoc(maxphys);
pages = pbuf->b_pages;
}
prot = VM_PROT_READ;
if (uio->uio_rw == UIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
error = 0;
for (i = 0; i < uio->uio_iovcnt; i++) {
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
if (uio->uio_rw == UIO_READ) {
racct_add_force(curproc, RACCT_READBPS,
uio->uio_iov[i].iov_len);
racct_add_force(curproc, RACCT_READIOPS, 1);
} else {
racct_add_force(curproc, RACCT_WRITEBPS,
uio->uio_iov[i].iov_len);
racct_add_force(curproc, RACCT_WRITEIOPS, 1);
}
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
while (uio->uio_iov[i].iov_len) {
g_reset_bio(bp);
if (uio->uio_rw == UIO_READ) {
bp->bio_cmd = BIO_READ;
curthread->td_ru.ru_inblock++;
} else {
bp->bio_cmd = BIO_WRITE;
curthread->td_ru.ru_oublock++;
}
bp->bio_offset = uio->uio_offset;
base = uio->uio_iov[i].iov_base;
bp->bio_length = uio->uio_iov[i].iov_len;
if (bp->bio_length > dev->si_iosize_max)
bp->bio_length = dev->si_iosize_max;
- if (bp->bio_length > MAXPHYS)
- bp->bio_length = MAXPHYS;
+ if (bp->bio_length > maxphys)
+ bp->bio_length = maxphys;
/*
* Make sure the pbuf can map the request.
- * The pbuf has kvasize = MAXPHYS, so a request
- * larger than MAXPHYS - PAGE_SIZE must be
+ * The pbuf has kvasize = maxphys, so a request
+ * larger than maxphys - PAGE_SIZE must be
* page aligned or it will be fragmented.
*/
poff = (vm_offset_t)base & PAGE_MASK;
if (pbuf && bp->bio_length + poff > pbuf->b_kvasize) {
if (dev->si_flags & SI_NOSPLIT) {
uprintf("%s: request ptr %p is not "
"on a page boundary; cannot split "
"request\n", devtoname(dev),
base);
error = EFBIG;
goto doerror;
}
bp->bio_length = pbuf->b_kvasize;
if (poff != 0)
bp->bio_length -= PAGE_SIZE;
}
bp->bio_bcount = bp->bio_length;
bp->bio_dev = dev;
if (pages) {
if ((npages = vm_fault_quick_hold_pages(
&curproc->p_vmspace->vm_map,
(vm_offset_t)base, bp->bio_length,
prot, pages, maxpages)) < 0) {
error = EFAULT;
goto doerror;
}
if (pbuf && sa) {
pmap_qenter((vm_offset_t)sa,
pages, npages);
bp->bio_data = sa + poff;
} else {
bp->bio_ma = pages;
bp->bio_ma_n = npages;
bp->bio_ma_offset = poff;
bp->bio_data = unmapped_buf;
bp->bio_flags |= BIO_UNMAPPED;
}
} else
bp->bio_data = base;
csw->d_strategy(bp);
if (uio->uio_rw == UIO_READ)
biowait(bp, "physrd");
else
biowait(bp, "physwr");
if (pages) {
if (pbuf)
pmap_qremove((vm_offset_t)sa, npages);
vm_page_unhold_pages(pages, npages);
}
iolen = bp->bio_length - bp->bio_resid;
if (iolen == 0 && !(bp->bio_flags & BIO_ERROR))
goto doerror; /* EOF */
uio->uio_iov[i].iov_len -= iolen;
uio->uio_iov[i].iov_base =
(char *)uio->uio_iov[i].iov_base + iolen;
uio->uio_resid -= iolen;
uio->uio_offset += iolen;
if (bp->bio_flags & BIO_ERROR) {
error = bp->bio_error;
goto doerror;
}
}
}
doerror:
if (pbuf)
uma_zfree(pbuf_zone, pbuf);
else if (pages)
free(pages, M_DEVBUF);
g_destroy_bio(bp);
PRELE(curproc);
return (error);
}
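Each pass through the inner loop above clamps one bio to the smallest of the remaining iovec length, the device's si_iosize_max and the run-time maxphys, and, when a mapped pbuf is used, shrinks it by one more page if the user buffer is not page aligned so the request still fits the pbuf's KVA. A minimal userland sketch of that arithmetic; chunk_len() is a hypothetical helper, and the page size, device limit and request values below are illustrative assumptions, not values taken from a real device:

#include <stdint.h>
#include <stdio.h>

#define PG_SIZE 4096UL			/* illustrative; the real PAGE_SIZE is MD */
#define PG_MASK (PG_SIZE - 1)

static size_t
min_sz(size_t a, size_t b)
{
	return (a < b ? a : b);
}

/*
 * Mirror of the per-chunk clamping in physio(): limit the transfer to
 * si_iosize_max and maxphys, then, for a mapped pbuf whose kvasize is
 * maxphys, drop one page when the user buffer is not page aligned.
 */
static size_t
chunk_len(size_t resid, size_t iosize_max, size_t maxphys_val, uintptr_t base)
{
	size_t len, poff;

	len = min_sz(resid, min_sz(iosize_max, maxphys_val));
	poff = base & PG_MASK;
	if (len + poff > maxphys_val) {
		len = maxphys_val;
		if (poff != 0)
			len -= PG_SIZE;
	}
	return (len);
}

int
main(void)
{
	/* Hypothetical request: 3 MiB starting at a non-page-aligned address. */
	size_t resid = 3UL << 20, done = 0, len;
	uintptr_t base = 0x456;		/* only the page offset matters here */

	while (resid > 0) {
		len = chunk_len(resid, 1UL << 20, 1UL << 20, base + done);
		printf("chunk: %zu bytes at request offset %zu\n", len, done);
		done += len;
		resid -= len;
	}
	return (0);
}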
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 3f6bbf816138..e0b9b0e261d4 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1,1336 +1,1336 @@
/*-
* Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
* Copyright (c) 1998, David Greenman. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktls.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
#define EXT_FLAG_SYNC EXT_FLAG_VENDOR1
#define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2
#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3
/*
* Structure describing a single sendfile(2) I/O, which may consist of
* several underlying pager I/Os.
*
* The syscall context allocates the structure and initializes 'nios'
* to 1. As sendfile_swapin() runs through pages and starts asynchronous
* paging operations, it increments 'nios'.
*
* Every I/O completion calls sendfile_iodone(), which decrements 'nios',
* and the syscall also calls sendfile_iodone() after allocating all mbufs,
* linking them and sending them to the socket. Whoever reaches zero 'nios'
* is responsible for calling pru_ready on the socket, to notify it of the
* readiness of the data.
*/
struct sf_io {
volatile u_int nios;
u_int error;
int npages;
struct socket *so;
struct mbuf *m;
vm_object_t obj;
vm_pindex_t pindex0;
#ifdef KERN_TLS
struct ktls_session *tls;
#endif
vm_page_t pa[];
};
/*
* Structure used to track requests with SF_SYNC flag.
*/
struct sendfile_sync {
struct mtx mtx;
struct cv cv;
unsigned count;
bool waiting;
};
static void
sendfile_sync_destroy(struct sendfile_sync *sfs)
{
KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs));
cv_destroy(&sfs->cv);
mtx_destroy(&sfs->mtx);
free(sfs, M_SENDFILE);
}
static void
sendfile_sync_signal(struct sendfile_sync *sfs)
{
mtx_lock(&sfs->mtx);
KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs));
if (--sfs->count == 0) {
if (!sfs->waiting) {
/* The sendfile() waiter was interrupted by a signal. */
sendfile_sync_destroy(sfs);
return;
} else {
cv_signal(&sfs->cv);
}
}
mtx_unlock(&sfs->mtx);
}
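The sendfile_sync machinery above is a plain completion counter: the submitting thread bumps 'count' once per buffer it hands off with SF_SYNC, each buffer's free routine calls sendfile_sync_signal(), and dropping the counter to zero wakes the waiter. A minimal userland analogue of the same pattern using pthreads, for illustration only; struct demo_sync and its functions are hypothetical, and the sketch omits the kernel's teardown path where the last completion frees the structure after the waiter was interrupted:

#include <pthread.h>
#include <stdbool.h>

/*
 * Userland analogue of struct sendfile_sync: an in-flight counter plus a
 * condition variable. Completions decrement the counter; the waiter sleeps
 * until it reaches zero.
 */
struct demo_sync {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	unsigned	count;
	bool		waiting;
};

static void
demo_sync_signal(struct demo_sync *s)		/* called by each completion */
{
	pthread_mutex_lock(&s->mtx);
	if (--s->count == 0 && s->waiting)
		pthread_cond_signal(&s->cv);
	pthread_mutex_unlock(&s->mtx);
}

static void
demo_sync_wait(struct demo_sync *s)		/* called by the submitter */
{
	pthread_mutex_lock(&s->mtx);
	s->waiting = true;
	while (s->count != 0)
		pthread_cond_wait(&s->cv, &s->mtx);
	pthread_mutex_unlock(&s->mtx);
}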
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
static void
sfstat_init(const void *unused)
{
COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
struct sfstat s;
COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
if (req->newptr)
COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat,
CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
sfstat_sysctl, "I",
"sendfile statistics");
static void
sendfile_free_mext(struct mbuf *m)
{
struct sf_buf *sf;
vm_page_t pg;
int flags;
KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
sf = m->m_ext.ext_arg1;
pg = sf_buf_page(sf);
flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0;
sf_buf_free(sf);
vm_page_release(pg, flags);
if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
struct sendfile_sync *sfs = m->m_ext.ext_arg2;
sendfile_sync_signal(sfs);
}
}
static void
sendfile_free_mext_pg(struct mbuf *m)
{
vm_page_t pg;
int flags, i;
bool cache_last;
M_ASSERTEXTPG(m);
cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0;
for (i = 0; i < m->m_epg_npgs; i++) {
if (cache_last && i == m->m_epg_npgs - 1)
flags = 0;
pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
vm_page_release(pg, flags);
}
if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
struct sendfile_sync *sfs = m->m_ext.ext_arg1;
sendfile_sync_signal(sfs);
}
}
/*
* Helper function to calculate how much data to put into page i of n.
* Only first and last pages are special.
*/
static inline off_t
xfsize(int i, int n, off_t off, off_t len)
{
if (i == 0)
return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
return ((off + len) & PAGE_MASK);
return (PAGE_SIZE);
}
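For example, with 4 KiB pages a 5000-byte range starting at file offset 100 spans n = 2 pages: xfsize(0, 2, 100, 5000) returns 4096 - 100 = 3996 and xfsize(1, 2, 100, 5000) returns (100 + 5000) & PAGE_MASK = 1004, which add up to the full 5000 bytes. A minimal userland restatement of the helper; the 4 KiB page size is an illustrative assumption, the kernel uses the machine PAGE_SIZE/PAGE_MASK:

#include <sys/types.h>		/* off_t */
#include <stdint.h>
#include <stdio.h>

#define PG_SIZE 4096L
#define PG_MASK (PG_SIZE - 1)

/* Userland restatement of xfsize(): bytes that land in page i of n. */
static off_t
xfsize_demo(int i, int n, off_t off, off_t len)
{
	off_t first = PG_SIZE - (off & PG_MASK);

	if (i == 0)
		return (first < len ? first : len);
	if (i == n - 1 && ((off + len) & PG_MASK) > 0)
		return ((off + len) & PG_MASK);
	return (PG_SIZE);
}

int
main(void)
{
	off_t off = 100, len = 5000, sum = 0;
	int n = (int)((len + (off & PG_MASK) + PG_SIZE - 1) / PG_SIZE);

	for (int i = 0; i < n; i++) {
		off_t x = xfsize_demo(i, n, off, len);
		printf("page %d: %jd bytes\n", i, (intmax_t)x);
		sum += x;
	}
	printf("total %jd of %jd\n", (intmax_t)sum, (intmax_t)len);
	return (0);
}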
/*
* Helper function to get the offset within the object for page i.
*/
static inline vm_ooffset_t
vmoff(int i, off_t off)
{
if (i == 0)
return ((vm_ooffset_t)off);
return (trunc_page(off + i * PAGE_SIZE));
}
/*
* Helper function used when allocation of a page or sf_buf failed.
* Pretend we don't have enough space: subtract the xfsize() of
* all pages that failed.
*/
static inline void
fixspace(int old, int new, off_t off, int *space)
{
KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
/* Subtract last one. */
*space -= xfsize(old - 1, old, off, *space);
old--;
if (new == old)
/* There was only one page. */
return;
/* Subtract first one. */
if (new == 0) {
*space -= xfsize(0, old, off, *space);
new++;
}
/* Rest of pages are full sized. */
*space -= (old - new) * PAGE_SIZE;
KASSERT(*space >= 0, ("%s: space went backwards", __func__));
}
/*
* Wait for all in-flight I/Os to complete; we must not unwire pages
* under them.
*/
static void
sendfile_iowait(struct sf_io *sfio, const char *wmesg)
{
while (atomic_load_int(&sfio->nios) != 1)
pause(wmesg, 1);
}
/*
* I/O completion callback.
*/
static void
sendfile_iodone(void *arg, vm_page_t *pa, int count, int error)
{
struct sf_io *sfio = arg;
struct socket *so;
int i;
if (error != 0)
sfio->error = error;
/*
* Restore the valid page pointers. They are already
* unbusied, but still wired.
*
* XXXKIB since pages are only wired, and we do not
* own the object lock, other users might have
* invalidated them in the meantime. Similarly, after we
* unbusied the swapped-in pages, they can become
* invalid under us.
*/
MPASS(count == 0 || pa[0] != bogus_page);
for (i = 0; i < count; i++) {
if (pa[i] == bogus_page) {
sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] =
pa[i] = vm_page_relookup(sfio->obj,
pa[0]->pindex + i);
KASSERT(pa[i] != NULL,
("%s: page %p[%d] disappeared",
__func__, pa, i));
} else {
vm_page_xunbusy_unchecked(pa[i]);
}
}
if (!refcount_release(&sfio->nios))
return;
#ifdef INVARIANTS
for (i = 1; i < sfio->npages; i++) {
if (sfio->pa[i] == NULL)
break;
KASSERT(vm_page_wired(sfio->pa[i]),
("sfio %p page %d %p not wired", sfio, i, sfio->pa[i]));
if (i == 0)
continue;
KASSERT(sfio->pa[0]->object == sfio->pa[i]->object,
("sfio %p page %d %p wrong owner %p %p", sfio, i,
sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object));
KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex,
("sfio %p page %d %p wrong index %jx %jx", sfio, i,
sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex,
(uintmax_t)sfio->pa[i]->pindex));
}
#endif
vm_object_pip_wakeup(sfio->obj);
if (sfio->m == NULL) {
/*
* Either I/O operation failed, or we failed to allocate
* buffers, or we bailed out on first busy page, or we
* succeeded filling the request without any I/Os. Anyway,
* pru_send hadn't been executed - nothing had been sent
* to the socket yet.
*/
MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
free(sfio, M_SENDFILE);
return;
}
#if defined(KERN_TLS) && defined(INVARIANTS)
if ((sfio->m->m_flags & M_EXTPG) != 0)
KASSERT(sfio->tls == sfio->m->m_epg_tls,
("TLS session mismatch"));
else
KASSERT(sfio->tls == NULL,
("non-ext_pgs mbuf with TLS session"));
#endif
so = sfio->so;
CURVNET_SET(so->so_vnet);
if (__predict_false(sfio->error)) {
/*
* The I/O operation failed. The state of data in the socket
* is now inconsistent, and all we can do is tear it down.
* The protocol abort method tears down the protocol state,
* frees all ready mbufs and detaches the not-ready ones.
* We free the mbufs corresponding to this I/O manually.
*
* The socket is marked with EIO and made available for
* read, so that the application receives EIO on the next
* syscall and eventually closes the socket.
*/
so->so_proto->pr_usrreqs->pru_abort(so);
so->so_error = EIO;
mb_free_notready(sfio->m, sfio->npages);
#ifdef KERN_TLS
} else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) {
/*
* I/O operation is complete, but we still need to
* encrypt. We cannot do this in the interrupt thread
* of the disk controller, so forward the mbufs to a
* different thread.
*
* Donate the socket reference held by sfio rather
* than explicitly invoking soref().
*/
ktls_enqueue(sfio->m, so, sfio->npages);
goto out_with_ref;
#endif
} else
(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
sfio->npages);
SOCK_LOCK(so);
sorele(so);
#ifdef KERN_TLS
out_with_ref:
#endif
CURVNET_RESTORE();
free(sfio, M_SENDFILE);
}
/*
* Iterate through pages vector and request paging for non-valid pages.
*/
static int
sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
off_t len, int npages, int rhpages, int flags)
{
vm_page_t *pa;
int a, count, count1, grabbed, i, j, rv;
pa = sfio->pa;
*nios = 0;
flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
sfio->pindex0 = OFF_TO_IDX(off);
/*
* First grab all the pages and wire them. Note that we grab
* only required pages. Readahead pages are dealt with later.
*/
grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off),
VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
if (grabbed < npages) {
for (int i = grabbed; i < npages; i++)
pa[i] = NULL;
npages = grabbed;
rhpages = 0;
}
for (i = 0; i < npages;) {
/* Skip valid pages. */
if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
xfsize(i, npages, off, len))) {
vm_page_xunbusy(pa[i]);
SFSTAT_INC(sf_pages_valid);
i++;
continue;
}
/*
* Next page is invalid. Check if it belongs to pager. It
* may not be there, which is a regular situation for shmem
* pager. For vnode pager this happens only in case of
* a sparse file.
*
* An important feature of vm_pager_has_page() is the hint
* stored in 'a': how many pages we can page in after
* this page in a single I/O.
*/
VM_OBJECT_RLOCK(obj);
if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
&a)) {
VM_OBJECT_RUNLOCK(obj);
pmap_zero_page(pa[i]);
vm_page_valid(pa[i]);
MPASS(pa[i]->dirty == 0);
vm_page_xunbusy(pa[i]);
i++;
continue;
}
VM_OBJECT_RUNLOCK(obj);
/*
* We want to pagein as many pages as possible, limited only
* by the 'a' hint and actual request.
*/
count = min(a + 1, npages - i);
/*
* We should not page in on top of a valid page because
* there might still be an unfinished write tracked by
* e.g. a buffer, so we substitute any valid pages with
* the bogus one.
*
* We must not leave around xbusy pages which are not
* part of the run passed to vm_pager_getpages(),
* otherwise the pager might deadlock waiting for the
* busy status of the page, e.g. if it constitutes the
* buffer needed to validate another page.
*
* First trim the valid pages off the end of the run,
* then replace the remaining valid pages with the
* bogus one.
*/
count1 = count;
for (j = i + count - 1; j > i; j--) {
if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
xfsize(j, npages, off, len))) {
vm_page_xunbusy(pa[j]);
SFSTAT_INC(sf_pages_valid);
count--;
} else {
break;
}
}
/*
* The last page in the run pa[i + count - 1] is
* guaranteed to be invalid by the trim above, so it
* is not replaced with bogus, thus -1 in the loop end
* condition.
*/
MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
for (j = i + 1; j < i + count - 1; j++) {
if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
xfsize(j, npages, off, len))) {
vm_page_xunbusy(pa[j]);
SFSTAT_INC(sf_pages_valid);
SFSTAT_INC(sf_pages_bogus);
pa[j] = bogus_page;
}
}
refcount_acquire(&sfio->nios);
rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
i + count == npages ? &rhpages : NULL,
&sendfile_iodone, sfio);
if (__predict_false(rv != VM_PAGER_OK)) {
sendfile_iowait(sfio, "sferrio");
/*
* Recover the remaining pages before returning EIO.
* Pages from 0 to npages are wired.
* Pages from (i + count1) to npages are busied.
*/
for (j = 0; j < npages; j++) {
if (j >= i + count1)
vm_page_xunbusy(pa[j]);
KASSERT(pa[j] != NULL && pa[j] != bogus_page,
("%s: page %p[%d] I/O recovery failure",
__func__, pa, j));
vm_page_unwire(pa[j], PQ_INACTIVE);
}
return (EIO);
}
SFSTAT_INC(sf_iocnt);
SFSTAT_ADD(sf_pages_read, count);
if (i + count == npages)
SFSTAT_ADD(sf_rhpages_read, rhpages);
i += count1;
(*nios)++;
}
if (*nios == 0 && npages != 0)
SFSTAT_INC(sf_noiocnt);
return (0);
}
static int
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
int *bsize)
{
struct vattr va;
vm_object_t obj;
struct vnode *vp;
struct shmfd *shmfd;
int error;
vp = *vp_res = NULL;
obj = NULL;
shmfd = *shmfd_res = NULL;
*bsize = 0;
/*
* The file descriptor must be a regular file and have a
* backing VM object.
*/
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
vn_lock(vp, LK_SHARED | LK_RETRY);
if (vp->v_type != VREG) {
error = EINVAL;
goto out;
}
*bsize = vp->v_mount->mnt_stat.f_iosize;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0)
goto out;
*obj_size = va.va_size;
obj = vp->v_object;
if (obj == NULL) {
error = EINVAL;
goto out;
}
} else if (fp->f_type == DTYPE_SHM) {
error = 0;
shmfd = fp->f_data;
obj = shmfd->shm_object;
*obj_size = shmfd->shm_size;
} else {
error = EINVAL;
goto out;
}
VM_OBJECT_WLOCK(obj);
if ((obj->flags & OBJ_DEAD) != 0) {
VM_OBJECT_WUNLOCK(obj);
error = EBADF;
goto out;
}
/*
* Temporarily increase the backing VM object's reference
* count so that a forced reclamation of its vnode does not
* immediately destroy it.
*/
vm_object_reference_locked(obj);
VM_OBJECT_WUNLOCK(obj);
*obj_res = obj;
*vp_res = vp;
*shmfd_res = shmfd;
out:
if (vp != NULL)
VOP_UNLOCK(vp);
return (error);
}
static int
sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
struct socket **so)
{
int error;
*sock_fp = NULL;
*so = NULL;
/*
* The socket must be a stream socket and connected.
*/
error = getsock_cap(td, s, &cap_send_rights,
sock_fp, NULL, NULL);
if (error != 0)
return (error);
*so = (*sock_fp)->f_data;
if ((*so)->so_type != SOCK_STREAM)
return (EINVAL);
/*
* SCTP one-to-one style sockets currently don't work with
* sendfile(). So indicate EINVAL for now.
*/
if ((*so)->so_proto->pr_protocol == IPPROTO_SCTP)
return (EINVAL);
if (SOLISTENING(*so))
return (ENOTCONN);
return (0);
}
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
struct thread *td)
{
struct file *sock_fp;
struct vnode *vp;
struct vm_object *obj;
vm_page_t pga;
struct socket *so;
#ifdef KERN_TLS
struct ktls_session *tls;
#endif
struct mbuf *m, *mh, *mhtail;
struct sf_buf *sf;
struct shmfd *shmfd;
struct sendfile_sync *sfs;
struct vattr va;
off_t off, sbytes, rem, obj_size;
int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
#ifdef KERN_TLS
int tls_enq_cnt;
#endif
bool use_ext_pgs;
obj = NULL;
so = NULL;
m = mh = NULL;
sfs = NULL;
#ifdef KERN_TLS
tls = NULL;
#endif
hdrlen = sbytes = 0;
softerr = 0;
use_ext_pgs = false;
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
if (error != 0)
return (error);
error = sendfile_getsock(td, sockfd, &sock_fp, &so);
if (error != 0)
goto out;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error != 0)
goto out;
#endif
SFSTAT_INC(sf_syscalls);
SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
if (flags & SF_SYNC) {
sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
cv_init(&sfs->cv, "sendfile");
sfs->waiting = true;
}
rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
/*
* Protect against multiple writers to the socket.
*
* XXXRW: Historically this has assumed non-interruptibility, so now
* we implement that, but possibly shouldn't.
*/
(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
#ifdef KERN_TLS
tls = ktls_hold(so->so_snd.sb_tls_info);
#endif
/*
* Loop through the pages of the file, starting with the requested
* offset. Get a file page (do I/O if necessary), map the file page
* into an sf_buf, attach an mbuf header to the sf_buf, and queue
* it on the socket.
* This is done in two loops. The inner loop turns as many pages
* as it can, up to the available socket buffer space, into mbufs
* without blocking, to have them bulk delivered into the socket
* send buffer.
* The outer loop checks the state and available space of the socket
* and takes care of the overall progress.
*/
for (off = offset; rem > 0; ) {
struct sf_io *sfio;
vm_page_t *pa;
struct mbuf *m0, *mtail;
int nios, space, npages, rhpages;
mtail = NULL;
/*
* Check the socket state for ongoing connection,
* no errors and space in socket buffer.
* If space is low allow for the remainder of the
* file to be processed if it fits the socket buffer.
* Otherwise block in waiting for sufficient space
* to proceed, or if the socket is nonblocking, return
* to userland with EAGAIN while reporting how far
* we've come.
* We wait until the socket buffer has significant free
* space to do bulk sends. This makes good use of file
* system read ahead and allows packet segmentation
* offloading hardware to take over lots of work. If
* we were not careful here we would send off only one
* sfbuf at a time.
*/
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
} else if (so->so_error) {
error = so->so_error;
so->so_error = 0;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOTCONN;
goto done;
}
space = sbspace(&so->so_snd);
if (space < rem &&
(space <= 0 ||
space < so->so_snd.sb_lowat)) {
if (so->so_state & SS_NBIO) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EAGAIN;
goto done;
}
/*
* sbwait drops the lock while sleeping.
* When we loop back to retry_space the
* state may have changed and we retest
* for it.
*/
error = sbwait(&so->so_snd);
/*
* An error from sbwait usually indicates that we've
* been interrupted by a signal. If we've sent anything
* then return bytes sent, otherwise return the error.
*/
if (error != 0) {
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
goto retry_space;
}
SOCKBUF_UNLOCK(&so->so_snd);
/*
* At the beginning of the first loop check if any headers
* are specified and copy them into mbufs. Reduce space in
* the socket buffer by the size of the header mbuf chain.
* Clear hdr_uio here and hdrlen at the end of the first loop.
*/
if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
hdr_uio->uio_td = td;
hdr_uio->uio_rw = UIO_WRITE;
#ifdef KERN_TLS
if (tls != NULL)
mh = m_uiotombuf(hdr_uio, M_WAITOK, space,
tls->params.max_frame_len, M_EXTPG);
else
#endif
mh = m_uiotombuf(hdr_uio, M_WAITOK,
space, 0, 0);
hdrlen = m_length(mh, &mhtail);
space -= hdrlen;
/*
* If header consumed all the socket buffer space,
* don't waste CPU cycles and jump to the end.
*/
if (space == 0) {
sfio = NULL;
nios = 0;
goto prepend_header;
}
hdr_uio = NULL;
}
if (vp != NULL) {
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto done;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0 || off >= va.va_size) {
VOP_UNLOCK(vp);
goto done;
}
if (va.va_size != obj_size) {
obj_size = va.va_size;
rem = nbytes ?
omin(nbytes + offset, obj_size) : obj_size;
rem -= off;
}
}
if (space > rem)
space = rem;
else if (space > PAGE_SIZE) {
/*
* Use page boundaries when possible for large
* requests.
*/
if (off & PAGE_MASK)
space -= (PAGE_SIZE - (off & PAGE_MASK));
space = trunc_page(space);
if (off & PAGE_MASK)
space += (PAGE_SIZE - (off & PAGE_MASK));
}
npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
/*
* Calculate maximum allowed number of pages for readahead
* at this iteration. If SF_USER_READAHEAD was set, we don't
* do any heuristics and use exactly the value supplied by
* application. Otherwise, we allow readahead up to "rem".
* If application wants more, let it be, but there is no
- * reason to go above MAXPHYS. Also check against "obj_size",
+ * reason to go above maxphys. Also check against "obj_size",
* since vm_pager_has_page() can hint beyond EOF.
*/
if (flags & SF_USER_READAHEAD) {
rhpages = SF_READAHEAD(flags);
} else {
rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
npages;
rhpages += SF_READAHEAD(flags);
}
- rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages);
+ rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages);
rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
npages, rhpages);
sfio = malloc(sizeof(struct sf_io) +
npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
refcount_init(&sfio->nios, 1);
sfio->obj = obj;
sfio->error = 0;
sfio->m = NULL;
#ifdef KERN_TLS
/*
* This doesn't use ktls_hold() because sfio->m will
* also have a reference on 'tls' that will be valid
* for all of sfio's lifetime.
*/
sfio->tls = tls;
#endif
vm_object_pip_add(obj, 1);
error = sendfile_swapin(obj, sfio, &nios, off, space, npages,
rhpages, flags);
if (error != 0) {
if (vp != NULL)
VOP_UNLOCK(vp);
sendfile_iodone(sfio, NULL, 0, error);
goto done;
}
/*
* Loop and construct maximum sized mbuf chain to be bulk
* dumped into socket buffer.
*/
pa = sfio->pa;
/*
* Use unmapped mbufs if enabled for TCP. Unmapped
* bufs are restricted to TCP as that is what has been
* tested. In particular, unmapped mbufs have not
* been tested with UNIX-domain sockets.
*
* TLS frames always require unmapped mbufs.
*/
if ((mb_use_ext_pgs &&
so->so_proto->pr_protocol == IPPROTO_TCP)
#ifdef KERN_TLS
|| tls != NULL
#endif
) {
use_ext_pgs = true;
#ifdef KERN_TLS
if (tls != NULL)
max_pgs = num_pages(tls->params.max_frame_len);
else
#endif
max_pgs = MBUF_PEXT_MAX_PGS;
/* Start at last index, to wrap on first use. */
ext_pgs_idx = max_pgs - 1;
}
for (int i = 0; i < npages; i++) {
/*
* If a page wasn't grabbed successfully, then
* trim the array. Can happen only with SF_NODISKIO.
*/
if (pa[i] == NULL) {
SFSTAT_INC(sf_busy);
fixspace(npages, i, off, &space);
npages = i;
softerr = EBUSY;
break;
}
pga = pa[i];
if (pga == bogus_page)
pga = vm_page_relookup(obj, sfio->pindex0 + i);
if (use_ext_pgs) {
off_t xfs;
ext_pgs_idx++;
if (ext_pgs_idx == max_pgs) {
m0 = mb_alloc_ext_pgs(M_WAITOK,
sendfile_free_mext_pg);
if (flags & SF_NOCACHE) {
m0->m_ext.ext_flags |=
EXT_FLAG_NOCACHE;
/*
* See comment below regarding
* ignoring SF_NOCACHE for the
* last page.
*/
if ((npages - i <= max_pgs) &&
((off + space) & PAGE_MASK) &&
(rem > space || rhpages > 0))
m0->m_ext.ext_flags |=
EXT_FLAG_CACHE_LAST;
}
if (sfs != NULL) {
m0->m_ext.ext_flags |=
EXT_FLAG_SYNC;
m0->m_ext.ext_arg1 = sfs;
mtx_lock(&sfs->mtx);
sfs->count++;
mtx_unlock(&sfs->mtx);
}
ext_pgs_idx = 0;
/* Append to mbuf chain. */
if (mtail != NULL)
mtail->m_next = m0;
else
m = m0;
mtail = m0;
m0->m_epg_1st_off =
vmoff(i, off) & PAGE_MASK;
}
if (nios) {
mtail->m_flags |= M_NOTREADY;
m0->m_epg_nrdy++;
}
m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga);
m0->m_epg_npgs++;
xfs = xfsize(i, npages, off, space);
m0->m_epg_last_len = xfs;
MBUF_EXT_PGS_ASSERT_SANITY(m0);
mtail->m_len += xfs;
mtail->m_ext.ext_size += PAGE_SIZE;
continue;
}
/*
* Get a sendfile buf. When allocating the
* first buffer for the mbuf chain, we usually
* wait as long as necessary, but this wait
* can be interrupted. For subsequent
* buffers, do not sleep, since several
* threads might exhaust the buffers and then
* deadlock.
*/
sf = sf_buf_alloc(pga,
m != NULL ? SFB_NOWAIT : SFB_CATCH);
if (sf == NULL) {
SFSTAT_INC(sf_allocfail);
sendfile_iowait(sfio, "sfnosf");
for (int j = i; j < npages; j++)
vm_page_unwire(pa[j], PQ_INACTIVE);
if (m == NULL)
softerr = ENOBUFS;
fixspace(npages, i, off, &space);
npages = i;
break;
}
m0 = m_get(M_WAITOK, MT_DATA);
m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
m0->m_ext.ext_size = PAGE_SIZE;
m0->m_ext.ext_arg1 = sf;
m0->m_ext.ext_type = EXT_SFBUF;
m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
m0->m_ext.ext_free = sendfile_free_mext;
/*
* SF_NOCACHE sets the page as being freed upon send.
* However, we ignore it for the last page in 'space',
* if the page is truncated, and we got more data to
* send (rem > space), or if we have readahead
* configured (rhpages > 0).
*/
if ((flags & SF_NOCACHE) &&
(i != npages - 1 ||
!((off + space) & PAGE_MASK) ||
!(rem > space || rhpages > 0)))
m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
if (sfs != NULL) {
m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
m0->m_ext.ext_arg2 = sfs;
mtx_lock(&sfs->mtx);
sfs->count++;
mtx_unlock(&sfs->mtx);
}
m0->m_ext.ext_count = 1;
m0->m_flags |= (M_EXT | M_RDONLY);
if (nios)
m0->m_flags |= M_NOTREADY;
m0->m_data = (char *)sf_buf_kva(sf) +
(vmoff(i, off) & PAGE_MASK);
m0->m_len = xfsize(i, npages, off, space);
/* Append to mbuf chain. */
if (mtail != NULL)
mtail->m_next = m0;
else
m = m0;
mtail = m0;
}
if (vp != NULL)
VOP_UNLOCK(vp);
/* Keep track of bytes processed. */
off += space;
rem -= space;
/*
* Prepend header, if any. Save pointer to first mbuf
* with a page.
*/
if (hdrlen) {
prepend_header:
m0 = mhtail->m_next = m;
m = mh;
mh = NULL;
} else
m0 = m;
if (m == NULL) {
KASSERT(softerr, ("%s: m NULL, no error", __func__));
error = softerr;
sendfile_iodone(sfio, NULL, 0, 0);
goto done;
}
/* Add the buffer chain to the socket buffer. */
KASSERT(m_length(m, NULL) == space + hdrlen,
("%s: mlen %u space %d hdrlen %d",
__func__, m_length(m, NULL), space, hdrlen));
CURVNET_SET(so->so_vnet);
#ifdef KERN_TLS
if (tls != NULL)
ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP);
#endif
if (nios == 0) {
/*
* If sendfile_swapin() didn't initiate any I/Os,
* which happens if all data is cached in VM, or if
* the header consumed all socket buffer space and
* sfio is NULL, then we can send data right now
* without the PRUS_NOTREADY flag.
*/
if (sfio != NULL)
sendfile_iodone(sfio, NULL, 0, 0);
#ifdef KERN_TLS
if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
error = (*so->so_proto->pr_usrreqs->pru_send)
(so, PRUS_NOTREADY, m, NULL, NULL, td);
soref(so);
ktls_enqueue(m, so, tls_enq_cnt);
} else
#endif
error = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, NULL, NULL, td);
} else {
sfio->so = so;
sfio->m = m0;
sfio->npages = npages;
soref(so);
error = (*so->so_proto->pr_usrreqs->pru_send)
(so, PRUS_NOTREADY, m, NULL, NULL, td);
sendfile_iodone(sfio, NULL, 0, 0);
}
CURVNET_RESTORE();
m = NULL; /* pru_send always consumes */
if (error)
goto done;
sbytes += space + hdrlen;
if (hdrlen)
hdrlen = 0;
if (softerr) {
error = softerr;
goto done;
}
}
/*
* Send trailers. Wimp out and use writev(2).
*/
if (trl_uio != NULL) {
sbunlock(&so->so_snd);
error = kern_writev(td, sockfd, trl_uio);
if (error == 0)
sbytes += td->td_retval[0];
goto out;
}
done:
sbunlock(&so->so_snd);
out:
/*
* If there was no error we have to clear td->td_retval[0]
* because it may have been set by writev.
*/
if (error == 0) {
td->td_retval[0] = 0;
}
if (sent != NULL) {
(*sent) = sbytes;
}
if (obj != NULL)
vm_object_deallocate(obj);
if (so)
fdrop(sock_fp, td);
if (m)
m_freem(m);
if (mh)
m_freem(mh);
if (sfs != NULL) {
mtx_lock(&sfs->mtx);
if (sfs->count != 0)
error = cv_wait_sig(&sfs->cv, &sfs->mtx);
if (sfs->count == 0) {
sendfile_sync_destroy(sfs);
} else {
sfs->waiting = false;
mtx_unlock(&sfs->mtx);
}
}
#ifdef KERN_TLS
if (tls != NULL)
ktls_free(tls);
#endif
if (error == ERESTART)
error = EINTR;
return (error);
}
static int
sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
struct sf_hdtr hdtr;
struct uio *hdr_uio, *trl_uio;
struct file *fp;
off_t sbytes;
int error;
/*
* File offset must be positive. If it goes beyond EOF
* we send only the header/trailer and no payload data.
*/
if (uap->offset < 0)
return (EINVAL);
sbytes = 0;
hdr_uio = trl_uio = NULL;
if (uap->hdtr != NULL) {
error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
if (error != 0)
goto out;
if (hdtr.headers != NULL) {
error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
&hdr_uio);
if (error != 0)
goto out;
#ifdef COMPAT_FREEBSD4
/*
* In FreeBSD < 5.0 the nbytes to send also included
* the header. If compat is specified subtract the
* header size from nbytes.
*/
if (compat) {
if (uap->nbytes > hdr_uio->uio_resid)
uap->nbytes -= hdr_uio->uio_resid;
else
uap->nbytes = 0;
}
#endif
}
if (hdtr.trailers != NULL) {
error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
&trl_uio);
if (error != 0)
goto out;
}
}
AUDIT_ARG_FD(uap->fd);
/*
* sendfile(2) can start at any offset within a file so we require
* CAP_READ+CAP_SEEK = CAP_PREAD.
*/
if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
goto out;
error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
uap->nbytes, &sbytes, uap->flags, td);
fdrop(fp, td);
if (uap->sbytes != NULL)
copyout(&sbytes, uap->sbytes, sizeof(off_t));
out:
free(hdr_uio, M_IOV);
free(trl_uio, M_IOV);
return (error);
}
/*
* sendfile(2)
*
* int sendfile(int fd, int s, off_t offset, size_t nbytes,
* struct sf_hdtr *hdtr, off_t *sbytes, int flags)
*
* Send a file specified by 'fd' and starting at 'offset' to a socket
* specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
* 0. Optionally add a header and/or trailer to the socket output. If
* specified, write the total number of bytes sent into *sbytes.
*/
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
return (sendfile(td, uap, 0));
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
struct sendfile_args args;
args.fd = uap->fd;
args.s = uap->s;
args.offset = uap->offset;
args.nbytes = uap->nbytes;
args.hdtr = uap->hdtr;
args.sbytes = uap->sbytes;
args.flags = uap->flags;
return (sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */
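From userland, the entry points above are reached through the sendfile(2) signature documented before sys_sendfile(). A minimal sketch of a caller; send_whole_file() is a hypothetical helper, and it assumes the surrounding program already created 'sock' as a connected SOCK_STREAM descriptor:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Send an entire regular file over a connected stream socket with
 * sendfile(2): nbytes == 0 means "until EOF", no header/trailer chain,
 * no flags. Returns 0 on success, -1 on error.
 */
static int
send_whole_file(const char *path, int sock)
{
	off_t sbytes = 0;
	int fd, error;

	fd = open(path, O_RDONLY);
	if (fd == -1)
		return (-1);
	error = sendfile(fd, sock, 0, 0, NULL, &sbytes, 0);
	if (error == -1)
		perror("sendfile");
	else
		printf("sent %jd bytes\n", (intmax_t)sbytes);
	close(fd);
	return (error);
}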
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 032edd4229e1..39ec48a32cb3 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -1,324 +1,338 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1980, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)param.c 8.3 (Berkeley) 8/20/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_param.h"
#include "opt_msgbuf.h"
+#include "opt_maxphys.h"
#include "opt_maxusers.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/msgbuf.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
/*
* System parameter formulae.
*/
#ifndef HZ
# if defined(__mips__) || defined(__arm__)
# define HZ 100
# else
# define HZ 1000
# endif
# ifndef HZ_VM
# define HZ_VM 100
# endif
#else
# ifndef HZ_VM
# define HZ_VM HZ
# endif
#endif
#define NPROC (20 + 16 * maxusers)
#ifndef NBUF
#define NBUF 0
#endif
#ifndef MAXFILES
#define MAXFILES (40 + 32 * maxusers)
#endif
static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS);
int hz; /* system clock's frequency */
int tick; /* usec per tick (1000000 / hz) */
struct bintime tick_bt; /* bintime per tick (1s / hz) */
sbintime_t tick_sbt;
int maxusers; /* base tunable */
int maxproc; /* maximum # of processes */
int maxprocperuid; /* max # of procs per user */
int maxfiles; /* sys. wide open files limit */
int maxfilesperproc; /* per-proc open files limit */
int msgbufsize; /* size of kernel message buffer */
-int nbuf;
+int nbuf; /* number of bcache bufs */
int bio_transient_maxcnt;
int ngroups_max; /* max # groups per process */
int nswbuf;
pid_t pid_max = PID_MAX;
-long maxswzone; /* max swmeta KVA storage */
-long maxbcache; /* max buffer cache KVA storage */
-long maxpipekva; /* Limit on pipe KVA */
+u_long maxswzone; /* max swmeta KVA storage */
+u_long maxbcache; /* max buffer cache KVA storage */
+u_long maxpipekva; /* Limit on pipe KVA */
+u_long maxphys; /* max raw I/O transfer size */
int vm_guest = VM_GUEST_NO; /* Running as virtual machine guest? */
u_long maxtsiz; /* max text size */
u_long dfldsiz; /* initial data size limit */
u_long maxdsiz; /* max data size */
u_long dflssiz; /* initial stack size limit */
u_long maxssiz; /* max stack size */
u_long sgrowsiz; /* amount to grow stack */
SYSCTL_INT(_kern, OID_AUTO, hz, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &hz, 0,
"Number of clock ticks per second");
SYSCTL_INT(_kern, OID_AUTO, nbuf, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nbuf, 0,
"Number of buffers in the buffer cache");
SYSCTL_INT(_kern, OID_AUTO, nswbuf, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nswbuf, 0,
"Number of swap buffers");
SYSCTL_INT(_kern, OID_AUTO, msgbufsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &msgbufsize, 0,
"Size of the kernel message buffer");
SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxswzone, 0,
"Maximum memory for swap metadata");
SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxbcache, 0,
"Maximum value of vfs.maxbufspace");
SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&bio_transient_maxcnt, 0,
"Maximum number of transient BIOs mappings");
SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxtsiz, 0,
"Maximum text size");
SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &dfldsiz, 0,
"Initial data size limit");
SYSCTL_ULONG(_kern, OID_AUTO, maxdsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxdsiz, 0,
"Maximum data size");
SYSCTL_ULONG(_kern, OID_AUTO, dflssiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &dflssiz, 0,
"Initial stack size limit");
SYSCTL_ULONG(_kern, OID_AUTO, maxssiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxssiz, 0,
"Maximum stack size");
SYSCTL_ULONG(_kern, OID_AUTO, sgrowsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &sgrowsiz, 0,
"Amount to grow stack on a stack fault");
SYSCTL_PROC(_kern, OID_AUTO, vm_guest,
CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, NULL, 0,
sysctl_kern_vm_guest, "A",
"Virtual machine guest detected?");
/*
* The elements of this array are ordered based upon the values of the
* corresponding enum VM_GUEST members.
*/
static const char *const vm_guest_sysctl_names[] = {
[VM_GUEST_NO] = "none",
[VM_GUEST_VM] = "generic",
[VM_GUEST_XEN] = "xen",
[VM_GUEST_HV] = "hv",
[VM_GUEST_VMWARE] = "vmware",
[VM_GUEST_KVM] = "kvm",
[VM_GUEST_BHYVE] = "bhyve",
[VM_GUEST_VBOX] = "vbox",
[VM_GUEST_PARALLELS] = "parallels",
[VM_LAST] = NULL
};
CTASSERT(nitems(vm_guest_sysctl_names) - 1 == VM_LAST);
/*
* Boot time overrides that are not scaled against main memory
*/
void
init_param1(void)
{
#if !defined(__mips__) && !defined(__arm64__)
TUNABLE_INT_FETCH("kern.kstack_pages", &kstack_pages);
#endif
hz = -1;
TUNABLE_INT_FETCH("kern.hz", &hz);
if (hz == -1)
hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ;
tick = 1000000 / hz;
tick_sbt = SBT_1S / hz;
tick_bt = sbttobt(tick_sbt);
/*
* Arrange for ticks to wrap 10 minutes after boot to help catch
* sign problems sooner.
*/
ticks = INT_MAX - (hz * 10 * 60);
vn_lock_pair_pause_max = hz / 100;
if (vn_lock_pair_pause_max == 0)
vn_lock_pair_pause_max = 1;
#ifdef VM_SWZONE_SIZE_MAX
maxswzone = VM_SWZONE_SIZE_MAX;
#endif
TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone);
#ifdef VM_BCACHE_SIZE_MAX
maxbcache = VM_BCACHE_SIZE_MAX;
#endif
TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache);
msgbufsize = MSGBUF_SIZE;
TUNABLE_INT_FETCH("kern.msgbufsize", &msgbufsize);
maxtsiz = MAXTSIZ;
TUNABLE_ULONG_FETCH("kern.maxtsiz", &maxtsiz);
dfldsiz = DFLDSIZ;
TUNABLE_ULONG_FETCH("kern.dfldsiz", &dfldsiz);
maxdsiz = MAXDSIZ;
TUNABLE_ULONG_FETCH("kern.maxdsiz", &maxdsiz);
dflssiz = DFLSSIZ;
TUNABLE_ULONG_FETCH("kern.dflssiz", &dflssiz);
maxssiz = MAXSSIZ;
TUNABLE_ULONG_FETCH("kern.maxssiz", &maxssiz);
sgrowsiz = SGROWSIZ;
TUNABLE_ULONG_FETCH("kern.sgrowsiz", &sgrowsiz);
/*
* Let the administrator set {NGROUPS_MAX}, but disallow values
* less than NGROUPS_MAX which would violate POSIX.1-2008 or
* greater than INT_MAX-1 which would result in overflow.
*/
ngroups_max = NGROUPS_MAX;
TUNABLE_INT_FETCH("kern.ngroups", &ngroups_max);
if (ngroups_max < NGROUPS_MAX)
ngroups_max = NGROUPS_MAX;
/*
* Only allow lowering the maximum pid.
* Prevent setting up a non-bootable system if pid_max is too low.
*/
TUNABLE_INT_FETCH("kern.pid_max", &pid_max);
if (pid_max > PID_MAX)
pid_max = PID_MAX;
else if (pid_max < 300)
pid_max = 300;
TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
}
/*
* Boot time overrides that are scaled against main memory
*/
void
init_param2(long physpages)
{
/* Base parameters */
maxusers = MAXUSERS;
TUNABLE_INT_FETCH("kern.maxusers", &maxusers);
if (maxusers == 0) {
maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE);
if (maxusers < 32)
maxusers = 32;
#ifdef VM_MAX_AUTOTUNE_MAXUSERS
if (maxusers > VM_MAX_AUTOTUNE_MAXUSERS)
maxusers = VM_MAX_AUTOTUNE_MAXUSERS;
#endif
/*
* Scale down the rate at which maxusers grows once
* we hit 384.
*/
if (maxusers > 384)
maxusers = 384 + ((maxusers - 384) / 8);
}
/*
* The following can be overridden after boot via sysctl. Note:
* unless overridden, these macros are ultimately based on maxusers.
* Limit maxproc so that kmap entries cannot be exhausted by
* processes.
*/
maxproc = NPROC;
TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
if (maxproc > (physpages / 12))
maxproc = physpages / 12;
if (maxproc > pid_max)
maxproc = pid_max;
maxprocperuid = (maxproc * 9) / 10;
/*
* The default limit for maxfiles is 1/8 of the number of
* physical pages, but not less than MAXFILES (which scales
* with maxusers). At most it can be 1/4 of the number of
* physical pages.
*/
maxfiles = imax(MAXFILES, physpages / 8);
TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
if (maxfiles > (physpages / 4))
maxfiles = physpages / 4;
maxfilesperproc = (maxfiles / 10) * 9;
TUNABLE_INT_FETCH("kern.maxfilesperproc", &maxfilesperproc);
/*
* Cannot be changed after boot.
*/
nbuf = NBUF;
TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
+ maxphys = MAXPHYS;
+ TUNABLE_ULONG_FETCH("kern.maxphys", &maxphys);
+ if (maxphys == 0) {
+ maxphys = MAXPHYS;
+ } else if (__bitcountl(maxphys) != 1) { /* power of two */
+ if (flsl(maxphys) == NBBY * sizeof(maxphys))
+ maxphys = MAXPHYS;
+ else
+ maxphys = 1UL << flsl(maxphys);
+ }
+ if (maxphys < PAGE_SIZE)
+ maxphys = MAXPHYS;
/*
* Physical buffers are pre-allocated buffers (struct buf) that
* are used as temporary holders for I/O, such as paging I/O.
*/
TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
/*
* The default for maxpipekva is min(1/64 of the kernel address space,
* max(1/64 of main memory, 512KB)). See sys_pipe.c for more details.
*/
- maxpipekva = (physpages / 64) * PAGE_SIZE;
+ maxpipekva = ptoa(physpages / 64);
TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
if (maxpipekva < 512 * 1024)
maxpipekva = 512 * 1024;
if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64)
maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
64;
}
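The kern.maxphys handling added above accepts the loader tunable only when it is a power of two and at least PAGE_SIZE: a zero value falls back to the compile-time MAXPHYS, a non-power-of-two value is rounded up to the next power of two, and anything that cannot be rounded (top bit already set) or ends up below a page also falls back to MAXPHYS. A minimal userland sketch of the same rounding; sanitize_maxphys() is a hypothetical restatement, __builtin_popcountl stands in for the kernel's __bitcountl(), and the DEMO_* constants are illustrative assumptions:

#include <stddef.h>
#include <stdio.h>
#include <strings.h>		/* flsl(3) */

#define DEMO_MAXPHYS	(128 * 1024UL)	/* illustrative compile-time default */
#define DEMO_PAGESIZE	4096UL		/* illustrative page size */
#define DEMO_NBBY	8

/* Userland restatement of the kern.maxphys sanitation in init_param2(). */
static unsigned long
sanitize_maxphys(unsigned long tunable)
{
	unsigned long v = tunable;

	if (v == 0) {
		v = DEMO_MAXPHYS;
	} else if (__builtin_popcountl(v) != 1) {	/* not a power of two */
		if (flsl((long)v) == DEMO_NBBY * (int)sizeof(v))
			v = DEMO_MAXPHYS;	/* rounding up would overflow */
		else
			v = 1UL << flsl((long)v); /* next power of two */
	}
	if (v < DEMO_PAGESIZE)
		v = DEMO_MAXPHYS;
	return (v);
}

int
main(void)
{
	unsigned long samples[] = { 0, 1000, 65536, 1000000, 1UL << 20 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("kern.maxphys=%lu -> %lu\n", samples[i],
		    sanitize_maxphys(samples[i]));
	return (0);
}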
/*
* Sysctl stringifying handler for kern.vm_guest.
*/
static int
sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT_STR(req, vm_guest_sysctl_names[vm_guest]));
}
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 18a9f8aeac7a..c91f17794599 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -1,2987 +1,2994 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
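The facility implemented below is consumed through the POSIX AIO API (aio_read(2), aio_error(2), aio_return(2) and friends). A minimal userland consumer, for orientation; the file name is an illustrative assumption, and real code would prefer aio_suspend(2) or kqueue over the polling loop shown here:

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[4096];
	struct aiocb cb;
	ssize_t n;
	int fd;

	fd = open("/etc/motd", O_RDONLY);	/* illustrative file */
	if (fd == -1)
		return (1);
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_offset = 0;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	if (aio_read(&cb) == -1)
		return (1);
	while (aio_error(&cb) == EINPROGRESS)
		usleep(1000);			/* poll for completion */
	n = aio_return(&cb);
	printf("read %zd bytes asynchronously\n", n);
	close(fd);
	return (0);
}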
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <geom/geom.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
* overflow. (XXX will be removed soon.)
*/
static u_long jobrefid;
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
#endif
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC 256
#endif
#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
#endif
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO 16
#endif
FEATURE(aio, "Asynchronous I/O");
SYSCTL_DECL(_p1003_1b);
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Async IO management");
static int enable_aio_unsafe = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
"Permit asynchronous IO on all file types, not just known-safe types");
static unsigned int unsafe_warningcnt = 1;
SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
&unsafe_warningcnt, 0,
"Warnings that will be triggered upon failed IO requests on unsafe files");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel processes to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel processes for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0,
"Preferred number of ready kernel processes for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
static int num_unmapped_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
0,
"Number of aio requests presently handled by unmapped I/O buffers");
/* Number of async I/O processes in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0,
"Maximum active aio requests per process");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process");
/*
* Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
* sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
* vfs.aio.aio_listio_max.
*/
SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
0, "Maximum aio requests for a single lio_listio call");
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
#endif
/*
* Below is a key of the locks used to protect each member of struct
* kaiocb, aioliojob and kaioinfo and any backends.
*
* * - need not be protected
* a - locked by kaioinfo lock
* b - locked by backend lock, the backend lock can be null in some cases,
* for example, BIO belongs to this type, in this case, proc lock is
* reused.
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* If the routine that services an AIO request blocks while running in an
* AIO kernel process it can starve other I/O requests. BIO requests
* queued via aio_qbio() complete asynchronously and do not use AIO kernel
* processes at all. Socket I/O requests use a separate pool of
* kprocs and also force non-blocking I/O. Other file I/O requests
* use the generic fo_read/fo_write operations which can block. The
* fsync and mlock operations can also block while executing. Ideally
* none of these requests would block while executing.
*
* Note that the service routines cannot toggle O_NONBLOCK in the file
* structure directly while handling a request due to races with
* userland threads.
*/
/* jobflags */
#define KAIOCB_QUEUEING 0x01
#define KAIOCB_CANCELLED 0x02
#define KAIOCB_CANCELLING 0x04
#define KAIOCB_CHECKSYNC 0x08
#define KAIOCB_CLEARED 0x10
#define KAIOCB_FINISHED 0x20
/*
* AIO process info
*/
#define AIOP_FREE 0x1 /* proc on free queue */
struct aioproc {
int aioprocflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aioproc) list; /* (c) list of processes */
struct proc *aioproc; /* (*) the AIO proc */
};
/*
* data-structure for lio signal management
*/
struct aioliojob {
int lioj_flags; /* (a) listio flags */
int lioj_count; /* (a) listio job count */
int lioj_finished_count; /* (a) listio jobs finished */
struct sigevent lioj_signal; /* (a) signal on all I/O done */
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
struct knlist klist; /* (a) list of knotes */
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_count; /* (a) size of AIO queue */
int kaio_buffer_count; /* (a) number of bio buffers */
TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */
TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */
TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */
struct task kaio_task; /* (*) task to kick aio processes */
struct task kaio_sync_task; /* (*) task to schedule fsync jobs */
};
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*/
struct aiocb_ops {
int (*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob);
long (*fetch_status)(struct aiocb *ujob);
long (*fetch_error)(struct aiocb *ujob);
int (*store_status)(struct aiocb *ujob, long status);
int (*store_error)(struct aiocb *ujob, long error);
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct kaiocb *job);
static void aio_process_rw(struct kaiocb *job);
static void aio_process_sync(struct kaiocb *job);
static void aio_process_mlock(struct kaiocb *job);
static void aio_schedule_fsync(void *context, int pending);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *ujob,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int aio_queue_file(struct file *fp, struct kaiocb *job);
static void aio_biowakeup(struct bio *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp);
static int aio_qbio(struct proc *p, struct kaiocb *job);
static void aio_daemon(void *param);
static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
static bool aio_clear_cancel_function_locked(struct kaiocb *job);
static int aio_kick(struct proc *userp);
static void aio_kick_nowait(struct proc *userp);
static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
static int filt_lioattach(struct knote *kn);
static void filt_liodetach(struct knote *kn);
static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
* kaio Per process async io info
* aiop async io process data
* aiocb async io jobs
* aiolio list io jobs
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
/* kqueue filters for aio */
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
static eventhandler_tag exit_tag, exec_tag;
TASKQUEUE_DEFINE_THREAD(aiod_kick);
/*
* Main operations function for use as a kernel module.
*/
static int
aio_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
aio_onceonly();
break;
case MOD_SHUTDOWN:
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
NULL, EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
ki->kaio_flags = 0;
ki->kaio_active_count = 0;
ki->kaio_count = 0;
ki->kaio_buffer_count = 0;
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_syncqueue);
TAILQ_INIT(&ki->kaio_syncready);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
aio_newproc(NULL);
}
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
struct thread *td;
int error;
error = sigev_findtd(p, sigev, &td);
if (error)
return (error);
if (!KSI_ONQ(ksi)) {
ksiginfo_set_sigev(ksi, sigev);
ksi->ksi_code = SI_ASYNCIO;
ksi->ksi_flags |= KSI_EXT | KSI_INS;
tdsendsignal(p, td, ksi->ksi_signo, ksi);
}
PROC_UNLOCK(p);
return (error);
}
/*
* Free a completed job entry. The job must already have finished
* (KAIOCB_FINISHED); the AIO lock is held on entry and is dropped and
* reacquired while the entry's knotes, file reference, and credential
* are released.
*/
static int
aio_free_entry(struct kaiocb *job)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct proc *p;
p = job->userproc;
MPASS(curproc == p);
ki = p->p_aioinfo;
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(job->jobflags & KAIOCB_FINISHED);
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, job, plist);
TAILQ_REMOVE(&ki->kaio_all, job, allist);
lj = job->lio;
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
}
}
/* job is going away, we need to destroy any knotes */
knlist_delete(&job->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&job->ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* a kaiocb from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
if (job->fd_file)
fdrop(job->fd_file, curthread);
crfree(job->cred);
uma_zfree(aiocb_zone, job);
AIO_LOCK(ki);
return (0);
}
static void
aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp __unused)
{
aio_proc_rundown(arg, p);
}
static int
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
{
aio_cancel_fn_t *func;
int cancelled;
AIO_LOCK_ASSERT(ki, MA_OWNED);
if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
return (0);
MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
job->jobflags |= KAIOCB_CANCELLED;
func = job->cancel_fn;
/*
* If there is no cancel routine, just leave the job marked as
* cancelled. The job should be in active use by a caller who
* will either complete it normally or notice the cancellation when
* it fails to install a cancel routine, and complete it then.
*/
if (func == NULL)
return (0);
/*
* Set the CANCELLING flag so that aio_complete() will defer
* completions of this job. This prevents the job from being
* freed out from under the cancel callback. After the
* callback any deferred completion (whether from the callback
* or any other source) will be completed.
*/
job->jobflags |= KAIOCB_CANCELLING;
AIO_UNLOCK(ki);
func(job);
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_CANCELLING;
if (job->jobflags & KAIOCB_FINISHED) {
cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(p, job);
} else {
/*
* The cancel callback might have scheduled an
* operation to cancel this request, but it is
* only counted as cancelled if the request is
* cancelled when the callback returns.
*/
cancelled = 0;
}
return (cancelled);
}
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct kaiocb *job, *jobn;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
restart:
/*
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
aio_cancel_job(p, ki, job);
}
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
goto restart;
}
/* Free all completed I/O requests. */
while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(job);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
}
AIO_UNLOCK(ki);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct kaiocb *
aio_selectjob(struct aioproc *aiop)
{
struct kaiocb *job;
struct kaioinfo *ki;
struct proc *userp;
mtx_assert(&aio_job_mtx, MA_OWNED);
restart:
TAILQ_FOREACH(job, &aio_jobs, list) {
userp = job->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < max_aio_per_proc) {
TAILQ_REMOVE(&aio_jobs, job, list);
if (!aio_clear_cancel_function(job))
goto restart;
/* Account for currently active jobs. */
ki->kaio_active_count++;
break;
}
}
return (job);
}
/*
* Move all data to a permanent storage device. This code
* simulates the fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp);
vn_finished_write(mp);
drop:
return (error);
}
/*
* The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
* does the I/O request for the non-bio version of the operations. The normal
* vn operations are used, and this code should work in all instances for every
* type of file, including pipes, sockets, fifos, and regular files.
*
* XXX I don't think it works well for sockets, pipes, and fifos.
*/
static void
aio_process_rw(struct kaiocb *job)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
struct uio auio;
struct iovec aiov;
ssize_t cnt;
long msgsnd_st, msgsnd_end;
long msgrcv_st, msgrcv_end;
long oublock_st, oublock_end;
long inblock_st, inblock_end;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
job->uaiocb.aio_lio_opcode == LIO_WRITE,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = job->cred;
cb = &job->uaiocb;
fp = job->fd_file;
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
msgrcv_st = td->td_ru.ru_msgrcv;
msgsnd_st = td->td_ru.ru_msgsnd;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
msgrcv_end = td->td_ru.ru_msgrcv;
msgsnd_end = td->td_ru.ru_msgsnd;
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
job->msgrcv = msgrcv_end - msgrcv_st;
job->msgsnd = msgsnd_end - msgsnd_st;
job->inblock = inblock_end - inblock_st;
job->outblock = oublock_end - oublock_st;
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
PROC_LOCK(job->userproc);
kern_psignal(job->userproc, SIGPIPE);
PROC_UNLOCK(job->userproc);
}
}
cnt -= auio.uio_resid;
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, cnt, 0);
}
static void
aio_process_sync(struct kaiocb *job)
{
struct thread *td = curthread;
struct ucred *td_savedcred = td->td_ucred;
struct file *fp = job->fd_file;
int error = 0;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
td->td_ucred = job->cred;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, 0, 0);
}
static void
aio_process_mlock(struct kaiocb *job)
{
struct aiocb *cb = &job->uaiocb;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
error = kern_mlock(job->userproc, job->cred,
__DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
aio_complete(job, error != 0 ? -1 : 0, error);
}
static void
aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
{
struct aioliojob *lj;
struct kaioinfo *ki;
struct kaiocb *sjob, *sjobn;
int lj_done;
bool schedule_fsync;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = job->lio;
lj_done = 0;
if (lj) {
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
MPASS(job->jobflags & KAIOCB_FINISHED);
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
KNOTE_LOCKED(&job->klist, 1);
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
if (job->jobflags & KAIOCB_CHECKSYNC) {
schedule_fsync = false;
TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
if (job->fd_file != sjob->fd_file ||
job->seqno >= sjob->seqno)
continue;
if (--sjob->pending > 0)
continue;
TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
if (!aio_clear_cancel_function_locked(sjob))
continue;
TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
schedule_fsync = true;
}
if (schedule_fsync)
taskqueue_enqueue(taskqueue_aiod_kick,
&ki->kaio_sync_task);
}
if (ki->kaio_flags & KAIO_WAKEUP) {
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
static void
aio_schedule_fsync(void *context, int pending)
{
struct kaioinfo *ki;
struct kaiocb *job;
ki = context;
AIO_LOCK(ki);
while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
job = TAILQ_FIRST(&ki->kaio_syncready);
TAILQ_REMOVE(&ki->kaio_syncready, job, list);
AIO_UNLOCK(ki);
aio_schedule(job, aio_process_sync);
AIO_LOCK(ki);
}
AIO_UNLOCK(ki);
}
bool
aio_cancel_cleared(struct kaiocb *job)
{
/*
* The caller should hold the same queue lock that was held when
* aio_clear_cancel_function() was called and this flag was set,
* ensuring that this check sees an up-to-date value. However,
* there is no way to assert that.
*/
return ((job->jobflags & KAIOCB_CLEARED) != 0);
}
static bool
aio_clear_cancel_function_locked(struct kaiocb *job)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
MPASS(job->cancel_fn != NULL);
if (job->jobflags & KAIOCB_CANCELLING) {
job->jobflags |= KAIOCB_CLEARED;
return (false);
}
job->cancel_fn = NULL;
return (true);
}
bool
aio_clear_cancel_function(struct kaiocb *job)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_clear_cancel_function_locked(job);
AIO_UNLOCK(ki);
return (ret);
}
static bool
aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
if (job->jobflags & KAIOCB_CANCELLED)
return (false);
job->cancel_fn = func;
return (true);
}
bool
aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_set_cancel_function_locked(job, func);
AIO_UNLOCK(ki);
return (ret);
}
void
aio_complete(struct kaiocb *job, long status, int error)
{
struct kaioinfo *ki;
struct proc *userp;
job->uaiocb._aiocb_private.error = error;
job->uaiocb._aiocb_private.status = status;
userp = job->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
KASSERT(!(job->jobflags & KAIOCB_FINISHED),
("duplicate aio_complete"));
job->jobflags |= KAIOCB_FINISHED;
if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(userp, job);
}
AIO_UNLOCK(ki);
}
void
aio_cancel(struct kaiocb *job)
{
aio_complete(job, -1, ECANCELED);
}
void
aio_switch_vmspace(struct kaiocb *job)
{
vmspace_switch_aio(job->userproc->p_vmspace);
}
/*
* The AIO daemon, most of the actual work is done in aio_process_*,
* but the setup (and address space mgmt) is done in this routine.
*/
static void
aio_daemon(void *_id)
{
struct kaiocb *job;
struct aioproc *aiop;
struct kaioinfo *ki;
struct proc *p;
struct vmspace *myvm;
struct thread *td = curthread;
int id = (intptr_t)_id;
/*
* Grab an extra reference on the daemon's vmspace so that it
* doesn't get freed by jobs that switch to a different
* vmspace.
*/
p = td->td_proc;
myvm = vmspace_acquire_ref(p);
KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aioproc = p;
aiop->aioprocflags = 0;
/*
* Wakeup parent process. (Parent sleeps to keep from blasting away
* and creating too many daemons.)
*/
sema_post(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* Take daemon off of free queue
*/
if (aiop->aioprocflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((job = aio_selectjob(aiop)) != NULL) {
mtx_unlock(&aio_job_mtx);
ki = job->userproc->p_aioinfo;
job->handle_fn(job);
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
}
/*
* Disconnect from user address space.
*/
if (p->p_vmspace != myvm) {
mtx_unlock(&aio_job_mtx);
vmspace_switch_aio(myvm);
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid a race; we only sleep if
* no job can be selected.
*/
continue;
}
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aioprocflags |= AIOP_FREE;
/*
* If daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
(aiop->aioprocflags & AIOP_FREE) &&
num_aio_procs > target_aio_procs)
break;
}
TAILQ_REMOVE(&aio_freeproc, aiop, list);
num_aio_procs--;
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
vmspace_free(myvm);
KASSERT(p->p_vmspace == myvm,
("AIOD: bad vmspace for exiting daemon"));
KASSERT(refcount_load(&myvm->vm_refcnt) > 1,
("AIOD: bad vm refcnt for exiting daemon: %d",
refcount_load(&myvm->vm_refcnt)));
kproc_exit(0);
}
/*
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
aio_newproc(int *start)
{
int error;
struct proc *p;
int id;
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
(*start)--;
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead bio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qbio(struct proc *p, struct kaiocb *job)
{
struct aiocb *cb;
struct file *fp;
struct bio *bp;
struct buf *pbuf;
struct vnode *vp;
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
int error, ref, poff;
vm_prot_t prot;
cb = &job->uaiocb;
fp = job->fd_file;
if (!(cb->aio_lio_opcode == LIO_WRITE ||
cb->aio_lio_opcode == LIO_READ))
return (-1);
if (fp == NULL || fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
if (vp->v_type != VCHR)
return (-1);
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
ref = 0;
csw = devvn_refthread(vp, &dev, &ref);
if (csw == NULL)
return (ENXIO);
if ((csw->d_flags & D_DISK) == 0) {
error = -1;
goto unref;
}
if (cb->aio_nbytes > dev->si_iosize_max) {
error = -1;
goto unref;
}
ki = p->p_aioinfo;
poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
- if (cb->aio_nbytes > MAXPHYS) {
+ if (cb->aio_nbytes > maxphys) {
error = -1;
goto unref;
}
pbuf = NULL;
+ job->pages = malloc(sizeof(vm_page_t) * (atop(round_page(
+ cb->aio_nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
} else {
- if (cb->aio_nbytes > MAXPHYS - poff) {
+ if (cb->aio_nbytes > maxphys) {
error = -1;
goto unref;
}
if (ki->kaio_buffer_count >= max_buf_aio) {
error = EAGAIN;
goto unref;
}
job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
BUF_KERNPROC(pbuf);
AIO_LOCK(ki);
ki->kaio_buffer_count++;
AIO_UNLOCK(ki);
+ job->pages = pbuf->b_pages;
}
job->bp = bp = g_alloc_bio();
bp->bio_length = cb->aio_nbytes;
bp->bio_bcount = cb->aio_nbytes;
bp->bio_done = aio_biowakeup;
bp->bio_offset = cb->aio_offset;
bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
bp->bio_dev = dev;
bp->bio_caller1 = (void *)job;
prot = VM_PROT_READ;
if (cb->aio_lio_opcode == LIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)cb->aio_buf, bp->bio_length, prot, job->pages,
- nitems(job->pages));
+ atop(maxphys) + 1);
if (job->npages < 0) {
error = EFAULT;
goto doerror;
}
if (pbuf != NULL) {
pmap_qenter((vm_offset_t)pbuf->b_data,
job->pages, job->npages);
bp->bio_data = pbuf->b_data + poff;
atomic_add_int(&num_buf_aio, 1);
} else {
bp->bio_ma = job->pages;
bp->bio_ma_n = job->npages;
bp->bio_ma_offset = poff;
bp->bio_data = unmapped_buf;
bp->bio_flags |= BIO_UNMAPPED;
atomic_add_int(&num_unmapped_aio, 1);
}
/* Perform transfer. */
csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
if (pbuf != NULL) {
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
uma_zfree(pbuf_zone, pbuf);
job->pbuf = NULL;
+ } else {
+ free(job->pages, M_TEMP);
}
g_destroy_bio(bp);
job->bp = NULL;
unref:
dev_relthread(dev, ref);
return (error);
}
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb *ojob;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, kjob, sizeof(struct oaiocb));
if (error)
return (error);
ojob = (struct oaiocb *)kjob;
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}
#endif
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
return (copyin(ujob, kjob, sizeof(struct aiocb)));
}
static long
aiocb_fetch_status(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.status));
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
return (suword(&ujob->_aiocb_private.status, status));
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb_ops = {
.aio_copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb_ops_osigevent = {
.aio_copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#endif
/*
* Queue a new AIO request. The choice between the threaded path and the
* direct bio path for VCHR devices is made here.
*/
int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct file *fp;
struct kaiocb *job;
struct kaioinfo *ki;
struct kevent kev;
int opcode;
int error;
int fd, kqfd;
int jid;
u_short evflags;
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
ops->store_status(ujob, -1);
ops->store_error(ujob, 0);
ops->store_kernelinfo(ujob, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= max_aio_queue_per_proc) {
ops->store_error(ujob, EAGAIN);
return (EAGAIN);
}
job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
knlist_init_mtx(&job->klist, AIO_MTX(ki));
error = ops->aio_copyin(ujob, &job->uaiocb);
if (error) {
ops->store_error(ujob, error);
uma_zfree(aiocb_zone, job);
return (error);
}
if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(ujob, EINVAL);
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
ksiginfo_init(&job->ksi);
/* Save userspace address of the job info. */
job->ujob = ujob;
/* Get the opcode. */
if (type != LIO_NOP)
job->uaiocb.aio_lio_opcode = type;
opcode = job->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capability
* should be.
*/
fd = job->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
error = fget_write(td, fd, &cap_pwrite_rights, &fp);
break;
case LIO_READ:
error = fget_read(td, fd, &cap_pread_rights, &fp);
break;
case LIO_SYNC:
error = fget(td, fd, &cap_fsync_rights, &fp);
break;
case LIO_MLOCK:
fp = NULL;
break;
case LIO_NOP:
error = fget(td, fd, &cap_no_rights, &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
job->uaiocb.aio_offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
error = EINVAL;
goto aqueue_fail;
}
job->fd_file = fp;
mtx_lock(&aio_job_mtx);
jid = jobrefid++;
job->seqno = jobseqno++;
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(ujob, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
return (0);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
error = EINVAL;
goto aqueue_fail;
}
kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
memset(&kev, 0, sizeof(kev));
kev.ident = (uintptr_t)job->ujob;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
kev.data = (intptr_t)job;
kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, M_WAITOK);
if (error)
goto aqueue_fail;
no_kqueue:
ops->store_error(ujob, EINPROGRESS);
job->uaiocb._aiocb_private.error = EINPROGRESS;
job->userproc = p;
job->cred = crhold(td->td_ucred);
job->jobflags = KAIOCB_QUEUEING;
job->lio = lj;
if (opcode == LIO_MLOCK) {
aio_schedule(job, aio_process_mlock);
error = 0;
} else if (fp->f_ops->fo_aio_queue == NULL)
error = aio_queue_file(fp, job);
else
error = fo_aio_queue(fp, job);
if (error)
goto aqueue_fail;
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_QUEUEING;
TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
ki->kaio_count++;
if (lj)
lj->lioj_count++;
atomic_add_int(&num_queue_count, 1);
if (job->jobflags & KAIOCB_FINISHED) {
/*
* The queue callback completed the request synchronously.
* The bulk of the completion is deferred in that case
* until this point.
*/
aio_bio_done_notify(p, job);
} else
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
AIO_UNLOCK(ki);
return (0);
aqueue_fail:
knlist_delete(&job->klist, curthread, 0);
if (fp)
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
static void
aio_cancel_daemon_job(struct kaiocb *job)
{
mtx_lock(&aio_job_mtx);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&aio_jobs, job, list);
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
}
void
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
{
mtx_lock(&aio_job_mtx);
if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
return;
}
job->handle_fn = func;
TAILQ_INSERT_TAIL(&aio_jobs, job, list);
aio_kick_nowait(job->userproc);
mtx_unlock(&aio_job_mtx);
}
static void
aio_cancel_sync(struct kaiocb *job)
{
struct kaioinfo *ki;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
aio_cancel(job);
}
int
aio_queue_file(struct file *fp, struct kaiocb *job)
{
struct kaioinfo *ki;
struct kaiocb *job2;
struct vnode *vp;
struct mount *mp;
int error;
bool safe;
ki = job->userproc->p_aioinfo;
error = aio_qbio(job->userproc, job);
if (error >= 0)
return (error);
safe = false;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vp->v_type == VREG || vp->v_type == VDIR) {
mp = fp->f_vnode->v_mount;
if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
safe = true;
}
}
if (!(safe || enable_aio_unsafe)) {
counted_warning(&unsafe_warningcnt,
"is attempting to use unsafe AIO requests");
return (EOPNOTSUPP);
}
switch (job->uaiocb.aio_lio_opcode) {
case LIO_READ:
case LIO_WRITE:
aio_schedule(job, aio_process_rw);
error = 0;
break;
case LIO_SYNC:
AIO_LOCK(ki);
TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
if (job2->fd_file == job->fd_file &&
job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
job2->seqno < job->seqno) {
job2->jobflags |= KAIOCB_CHECKSYNC;
job->pending++;
}
}
if (job->pending != 0) {
if (!aio_set_cancel_function_locked(job,
aio_cancel_sync)) {
AIO_UNLOCK(ki);
aio_cancel(job);
return (0);
}
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
return (0);
}
AIO_UNLOCK(ki);
aio_schedule(job, aio_process_sync);
error = 0;
break;
default:
error = EINVAL;
}
return (error);
}
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
}
}
static int
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
int error, ret = 0;
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
num_aio_resv_start++;
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
}
} else {
ret = -1;
}
return (ret);
}
static void
aio_kick_helper(void *context, int pending)
{
struct proc *userp = context;
mtx_lock(&aio_job_mtx);
while (--pending >= 0) {
if (aio_kick(userp))
break;
}
mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call; as a side effect, kernel resources
* are released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
long status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_done, plist) {
if (job->ujob == ujob)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else {
error = EINVAL;
AIO_UNLOCK(ki);
}
return (error);
}
int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
/*
* Allow a process to wake up when any of the I/O requests are completed.
*/
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *firstjob, *job;
int error, i, timo;
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
firstjob = NULL;
error = 0;
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
if (job->ujob == ujoblist[i]) {
if (firstjob == NULL)
firstjob = job;
if (job->jobflags & KAIOCB_FINISHED)
goto RETURN;
}
}
}
/* All tasks were finished. */
if (firstjob == NULL)
break;
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
free(ujoblist, M_AIOS);
return (error);
}
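/*
 * A minimal userland sketch of the aio_suspend(2) path implemented above:
 * one read is queued with aio_read(2) and the caller sleeps until it
 * completes or a five second timeout expires. The input path and buffer
 * size are arbitrary placeholders.
 */
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int
main(void)
{
	static char buf[4096];
	struct aiocb cb;
	const struct aiocb *list[1];
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	int fd;

	if ((fd = open("/var/log/messages", O_RDONLY)) == -1)
		err(1, "open");
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	if (aio_read(&cb) == -1)
		err(1, "aio_read");

	/* Block until the request finishes; EAGAIN means the timeout expired. */
	list[0] = &cb;
	if (aio_suspend(list, 1, &ts) == -1)
		err(1, "aio_suspend");
	printf("read %zd bytes\n", aio_return(&cb));
	close(fd);
	return (0);
}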
/*
* aio_cancel cancels any non-bio aio operations not currently in progress.
*/
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct kaiocb *job, *jobn;
struct file *fp;
int error;
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, &cap_no_rights, &fp);
if (error)
return (error);
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vn_isdisk(vp)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
}
AIO_LOCK(ki);
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
if ((uap->fd == job->uaiocb.aio_fildes) &&
((uap->aiocbp == NULL) ||
(uap->aiocbp == job->ujob))) {
if (aio_cancel_job(p, ki, job)) {
cancelled++;
} else {
notcancelled++;
}
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
done:
fdrop(fp, td);
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
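/*
 * A userland sketch of the aio_cancel(2) path above. Whether a request can
 * still be cancelled depends on how far it has progressed, so all three
 * return values are handled; the output path is an arbitrary placeholder
 * and the job is always reaped with aio_return(2) afterwards.
 */
#include <aio.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[65536];
	struct aiocb cb;
	int fd;

	if ((fd = open("/tmp/aio_cancel_demo", O_CREAT | O_RDWR | O_TRUNC,
	    0644)) == -1)
		err(1, "open");
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	if (aio_write(&cb) == -1)
		err(1, "aio_write");

	switch (aio_cancel(fd, &cb)) {
	case AIO_CANCELED:
		/* The job was cancelled; aio_error() now reports ECANCELED. */
		printf("cancelled; aio_error() returns %d\n", aio_error(&cb));
		break;
	case AIO_NOTCANCELED:
		/* Too far along to cancel; wait for it to finish. */
		while (aio_error(&cb) == EINPROGRESS)
			usleep(1000);
		break;
	case AIO_ALLDONE:
		printf("request had already completed\n");
		break;
	default:
		err(1, "aio_cancel");
	}
	(void)aio_return(&cb);	/* always reap to release kernel resources */
	close(fd);
	return (0);
}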
/*
* aio_error is implemented at the kernel level for compatibility purposes
* only. For a user-mode async implementation, it would be best to do it in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
int status;
ki = p->p_aioinfo;
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
}
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
if (job->ujob == ujob) {
if (job->jobflags & KAIOCB_FINISHED)
td->td_retval[0] =
job->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
}
}
AIO_UNLOCK(ki);
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(ujob);
if (status == -1) {
td->td_retval[0] = ops->fetch_error(ujob);
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
int
sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
/* syscall - asynchronous read from a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
/* syscall - asynchronous write to a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
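/*
 * A minimal userland sketch of the aio_write(2)/aio_error(2)/aio_return(2)
 * cycle served by the code above: queue one write, poll until aio_error()
 * stops returning EINPROGRESS, then reap the result. The output path is an
 * arbitrary placeholder.
 */
#include <aio.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[] = "hello from POSIX AIO\n";
	struct aiocb cb;
	int error, fd;

	if ((fd = open("/tmp/aio_write_demo", O_CREAT | O_WRONLY | O_TRUNC,
	    0644)) == -1)
		err(1, "open");
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf) - 1;
	cb.aio_offset = 0;
	if (aio_write(&cb) == -1)
		err(1, "aio_write");

	/* aio_error() returns EINPROGRESS until the job completes. */
	while ((error = aio_error(&cb)) == EINPROGRESS)
		usleep(1000);
	if (error != 0)
		errx(1, "aio_write failed: %s", strerror(error));
	printf("wrote %zd bytes\n", aio_return(&cb));
	close(fd);
	return (0);
}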
int
sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
}
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *job;
struct kaioinfo *ki;
struct aioliojob *lj;
struct kevent kev;
int error;
int nagain, nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
lj = uma_zalloc(aiolio_zone, M_WAITOK);
lj->lioj_flags = 0;
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
/*
* Setup signal.
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
memset(&kev, 0, sizeof(kev));
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td,
M_WAITOK);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
* Add an extra aiocb count to prevent the lio from being freed by
* other threads doing aio_waitcomplete or aio_return, and to prevent
* the event from being sent until we have queued all tasks.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
/*
* Get pointers to the list of I/O requests.
*/
nagain = 0;
nerror = 0;
for (i = 0; i < nent; i++) {
job = acb_list[i];
if (job != NULL) {
error = aio_aqueue(td, job, lj, LIO_NOP, ops);
if (error == EAGAIN)
nagain++;
else if (error != 0)
nerror++;
}
}
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
if (nerror)
return (EIO);
else if (nagain)
return (EAGAIN);
else
return (error);
}
/* syscall - list directed I/O (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
/* syscall - list directed I/O (REALTIME) */
int
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
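/*
 * A minimal userland sketch of lio_listio(2) as implemented above: two
 * reads at different offsets are submitted as a single batch and, because
 * LIO_WAIT is used, the call returns only when both have completed. The
 * input path is an arbitrary placeholder.
 */
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf0[4096], buf1[4096];
	struct aiocb cb0, cb1;
	struct aiocb *list[2] = { &cb0, &cb1 };
	int fd;

	if ((fd = open("/boot/loader.conf", O_RDONLY)) == -1)
		err(1, "open");
	memset(&cb0, 0, sizeof(cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = buf0;
	cb0.aio_nbytes = sizeof(buf0);
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;
	cb1 = cb0;
	cb1.aio_buf = buf1;
	cb1.aio_offset = sizeof(buf0);

	/* With LIO_WAIT the sigevent argument is ignored and may be NULL. */
	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
		err(1, "lio_listio");
	/* Each request must still be reaped to free its kernel resources. */
	printf("read %zd and %zd bytes\n", aio_return(&cb0), aio_return(&cb1));
	close(fd);
	return (0);
}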
static void
aio_biowakeup(struct bio *bp)
{
struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
struct proc *userp;
struct kaioinfo *ki;
size_t nbytes;
int error, nblks;
/* Release mapping into kernel space. */
userp = job->userproc;
ki = userp->p_aioinfo;
- if (job->pbuf) {
+ vm_page_unhold_pages(job->pages, job->npages);
+ if (job->pbuf != NULL) {
pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
uma_zfree(pbuf_zone, job->pbuf);
job->pbuf = NULL;
atomic_subtract_int(&num_buf_aio, 1);
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
- } else
+ } else {
+ free(job->pages, M_TEMP);
atomic_subtract_int(&num_unmapped_aio, 1);
- vm_page_unhold_pages(job->pages, job->npages);
+ }
bp = job->bp;
job->bp = NULL;
nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
error = 0;
if (bp->bio_flags & BIO_ERROR)
error = bp->bio_error;
nblks = btodb(nbytes);
if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
job->outblock += nblks;
else
job->inblock += nblks;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, nbytes, 0);
g_destroy_bio(bp);
}
/* syscall - wait for the next completion of an aio request */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *job;
struct aiocb *ujob;
long error, status;
int timo;
ops->store_aiocb(ujobp, NULL);
if (ts == NULL) {
timo = 0;
} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
timo = -1;
} else {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
error = 0;
job = NULL;
AIO_LOCK(ki);
while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
if (timo == -1) {
error = EWOULDBLOCK;
break;
}
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiowc", timo);
if (timo && error == ERESTART)
error = EINTR;
if (error)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
ujob = job->ujob;
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_aiocb(ujobp, ujob);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
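/*
 * A minimal sketch of the FreeBSD-specific aio_waitcomplete(2) interface
 * implemented above: it reaps whichever outstanding request finishes first
 * and returns that request's aio_return(2) value directly, so no per-request
 * polling is needed. The input path and sizes are arbitrary placeholders.
 */
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf0[4096], buf1[4096];
	struct aiocb cb0, cb1, *done;
	ssize_t len;
	int fd, i;

	if ((fd = open("/etc/rc.conf", O_RDONLY)) == -1)
		err(1, "open");
	memset(&cb0, 0, sizeof(cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = buf0;
	cb0.aio_nbytes = sizeof(buf0);
	cb1 = cb0;
	cb1.aio_buf = buf1;
	cb1.aio_offset = sizeof(buf0);
	if (aio_read(&cb0) == -1 || aio_read(&cb1) == -1)
		err(1, "aio_read");

	/* A NULL timeout blocks until some request completes. */
	for (i = 0; i < 2; i++) {
		if ((len = aio_waitcomplete(&done, NULL)) == -1)
			err(1, "aio_waitcomplete");
		printf("request at offset %jd returned %zd bytes\n",
		    (intmax_t)done->aio_offset, len);
	}
	close(fd);
	return (0);
}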
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
struct aiocb_ops *ops)
{
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
}
int
sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
}
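/*
 * A sketch of aio_fsync(2) as handled by kern_aio_fsync() above: the op
 * must be O_SYNC (O_DSYNC is not accepted), and only the control block's
 * descriptor and sigevent are meaningful. The helper below is hypothetical
 * and assumes fd is an already-open file with AIO writes outstanding.
 */
#include <aio.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Queue an aio_fsync() for fd and wait for it to finish (hypothetical helper). */
static void
aio_fsync_and_wait(int fd)
{
	struct aiocb sync_cb;
	int error;

	memset(&sync_cb, 0, sizeof(sync_cb));
	sync_cb.aio_fildes = fd;	/* only the descriptor (and sigevent) matter */
	if (aio_fsync(O_SYNC, &sync_cb) == -1)
		err(1, "aio_fsync");
	/* The sync job completes only after earlier AIO writes on fd finish. */
	while ((error = aio_error(&sync_cb)) == EINPROGRESS)
		usleep(1000);
	if (error != 0)
		errx(1, "aio_fsync failed: %s", strerror(error));
	(void)aio_return(&sync_cb);
}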
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
struct kaiocb *job;
job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
/*
* The job pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_aio = job;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&job->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_aio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct kaiocb *job = kn->kn_ptr.p_aio;
kn->kn_data = job->uaiocb._aiocb_private.error;
if (!(job->jobflags & KAIOCB_FINISHED))
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
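/*
 * A minimal userland sketch of the EVFILT_AIO notification path served by
 * filt_aio() above: the request carries SIGEV_KEVENT in its sigevent, the
 * kernel registers the knote itself (userland cannot set EV_FLAG1), and the
 * completion event's ident is the userland aiocb pointer with udata carrying
 * sigev_value. The input path and buffer size are arbitrary placeholders.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[4096];
	struct aiocb cb, *done;
	struct kevent ev;
	int fd, kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	if ((fd = open("/etc/passwd", O_RDONLY)) == -1)
		err(1, "open");
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;	/* shows up as ev.udata */
	if (aio_read(&cb) == -1)
		err(1, "aio_read");

	/* Wait for the completion event; no explicit EV_SET is needed. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent");
	done = (struct aiocb *)ev.ident;
	printf("read %zd bytes\n", aio_return(done));
	close(fd);
	close(kq);
	return (0);
}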
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
struct aioliojob *lj;
lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
/*
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_liodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_lio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
#endif
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
CP(*osig, *nsig, sigev_notify);
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
#endif
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword32(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb32_ops = {
.aio_copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb32_ops_osigevent = {
.aio_copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#endif
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct aiocb **ujoblist;
uint32_t *ujoblist32;
int error, i;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
ujoblist32 = (uint32_t *)ujoblist;
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
sizeof(ujoblist32[0]));
if (error == 0) {
for (i = uap->nent - 1; i >= 0; i--)
ujoblist[i] = PTRIN(ujoblist32[i]);
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
}
free(ujoblist, M_AIOS);
return (error);
}
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_read(struct thread *td,
struct freebsd6_freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_write(struct thread *td,
struct freebsd6_freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops));
}
int
freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
&aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_lio_listio(struct thread *td,
struct freebsd6_freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 706e97106c67..2f18221e9270 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,5478 +1,5507 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004 Poul-Henning Kamp
* Copyright (c) 1994,1997 John S. Dyson
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* this file contains a new buffer I/O scheme implementing a coherent
* VM object and buffer cache scheme. Pains have been taken to make
* sure that the performance degradation associated with schemes such
* as this is not realized.
*
* Author: John S. Dyson
* Significant help during the development and debugging phases
* had been provided by David Greenman, also of the FreeBSD core team.
*
* see man buf(9) for more info.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bitset.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/fail.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/syscallsubr.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
#include <geom/geom.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/swap_pager.h>
static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
struct bio_ops bioops; /* I/O operation notification */
struct buf_ops buf_ops_bio = {
.bop_name = "buf_ops_bio",
.bop_write = bufwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush,
};
struct bufqueue {
struct mtx_padalign bq_lock;
TAILQ_HEAD(, buf) bq_queue;
uint8_t bq_index;
uint16_t bq_subqueue;
int bq_len;
} __aligned(CACHE_LINE_SIZE);
#define BQ_LOCKPTR(bq) (&(bq)->bq_lock)
#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq)))
#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq)))
#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
struct bufdomain {
struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
struct bufqueue bd_dirtyq;
struct bufqueue *bd_cleanq;
struct mtx_padalign bd_run_lock;
/* Constants */
long bd_maxbufspace;
long bd_hibufspace;
long bd_lobufspace;
long bd_bufspacethresh;
int bd_hifreebuffers;
int bd_lofreebuffers;
int bd_hidirtybuffers;
int bd_lodirtybuffers;
int bd_dirtybufthresh;
int bd_lim;
/* atomics */
int bd_wanted;
int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers;
int __aligned(CACHE_LINE_SIZE) bd_running;
long __aligned(CACHE_LINE_SIZE) bd_bufspace;
int __aligned(CACHE_LINE_SIZE) bd_freebuffers;
} __aligned(CACHE_LINE_SIZE);
#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock)
#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd)))
#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd)))
#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock)
#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd)))
#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd)))
#define BD_DOMAIN(bd) (bd - bdomain)
-static struct buf *buf; /* buffer header pool */
-extern struct buf *swbuf; /* Swap buffer header pool. */
+static char *buf; /* buffer header pool */
+static struct buf *
+nbufp(unsigned i)
+{
+ return ((struct buf *)(buf + (sizeof(struct buf) +
+ sizeof(vm_page_t) * atop(maxbcachebuf)) * i));
+}
+
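/*
 * A minimal userland sketch of the byte-stride indexing that nbufp() above
 * performs: each record is a fixed header followed by a page-pointer array
 * whose length is only known at run time, so element i lives at
 * base + stride * i.  The toy struct and sizes below are assumptions for
 * illustration only.
 */
#include <stdio.h>
#include <stdlib.h>

struct hdr {				/* stand-in for struct buf */
	int	id;
	int	npages;
};

/* Per-record stride: header plus a trailing array of "page" pointers. */
static size_t
record_size(int pages_per_record)
{
	return (sizeof(struct hdr) + sizeof(void *) * pages_per_record);
}

static struct hdr *
record_at(char *base, int pages_per_record, unsigned i)
{
	return ((struct hdr *)(base + record_size(pages_per_record) * i));
}

int
main(void)
{
	int pages_per_record = 4, nrecords = 8;
	char *pool = calloc(nrecords, record_size(pages_per_record));

	for (int i = 0; i < nrecords; i++)
		record_at(pool, pages_per_record, i)->id = i;
	printf("record 5 id=%d\n", record_at(pool, pages_per_record, 5)->id);
	free(pool);
	return (0);
}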
caddr_t __read_mostly unmapped_buf;
/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
vm_page_t m);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_range(struct buf *bp);
static void vfs_vmio_invalidate(struct buf *bp);
static void vfs_vmio_truncate(struct buf *bp, int npages);
static void vfs_vmio_extend(struct buf *bp, int npages, int size);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
void (*)(struct buf *));
static int buf_flush(struct vnode *vp, struct bufdomain *, int);
static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
static void buf_daemon(void);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
static void bufkva_free(struct buf *);
static int buf_import(void *, void **, int, int, int);
static void buf_release(void *, void **, int);
static void maxbcachebuf_adjust(void);
static inline struct bufdomain *bufdomain(struct buf *);
static void bq_remove(struct bufqueue *bq, struct buf *bp);
static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
static int buf_recycle(struct bufdomain *, bool kva);
static void bq_init(struct bufqueue *bq, int qindex, int cpu,
const char *lockname);
static void bd_init(struct bufdomain *bd);
static int bd_flushall(struct bufdomain *bd);
static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
"Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
"Amount of presently outstanding async buffer io");
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
static counter_u64_t bufkvaspace;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
"Kernel virtual memory used for buffers");
static long maxbufspace;
SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
__offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
"Maximum allowed value of bufspace (including metadata)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
"Amount of malloced memory for buffers");
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
0, "Maximum amount of malloced memory for buffers");
static long lobufspace;
SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
__offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
"Minimum amount of buffers we want to have");
long hibufspace;
SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
__offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
"Maximum allowed value of bufspace (excluding metadata)");
long bufspacethresh;
SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
__offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
"Bufspace consumed before waking the daemon to free some");
static counter_u64_t buffreekvacnt;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
"Number of times we have freed the KVA space from some buffer");
static counter_u64_t bufdefragcnt;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
"Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
"Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
"Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS,
&altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS,
&recursiveflushes, 0, "Number of flushes skipped due to being recursive");
static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
"Number of buffers that are dirty (have unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
__offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
"How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
__offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
"When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
__offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
"Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
"Number of free buffers");
static int lofreebuffers;
SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
__offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
"Target number of free buffers");
static int hifreebuffers;
SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
__offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
"Threshold for clean buffer recycling");
static counter_u64_t getnewbufcalls;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
&getnewbufcalls, "Number of calls to getnewbuf");
static counter_u64_t getnewbufrestarts;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
&getnewbufrestarts,
"Number of times getnewbuf has had to restart a buffer acquisition");
static counter_u64_t mappingrestarts;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
&mappingrestarts,
"Number of times getblk has had to restart a buffer mapping for "
"unmapped buffer");
static counter_u64_t numbufallocfails;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
&numbufallocfails, "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
static counter_u64_t notbufdflushes;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS,
&barrierwrites, 0, "Number of barrier writes");
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
&unmapped_buf_allowed, 0,
"Permit the use of unmapped i/o");
int maxbcachebuf = MAXBCACHEBUF;
SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
"Maximum size of a buffer cache block");
/*
* This lock synchronizes access to bd_request.
*/
static struct mtx_padalign __exclusive_cache_line bdlock;
/*
* This lock protects the runningbufreq and synchronizes runningbufwakeup and
* waitrunningbufspace().
*/
static struct mtx_padalign __exclusive_cache_line rbreqlock;
/*
* Lock that protects bdirtywait.
*/
static struct mtx_padalign __exclusive_cache_line bdirtylock;
/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
* active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
* is idling.
*/
static int bd_request;
/*
* Request for the buf daemon to write more buffers than is indicated by
* lodirtybuffers. This may be necessary to push out excess dependencies or
* defragment the address space where a simple count of the number of dirty
* buffers is insufficient to characterize the demand for flushing them.
*/
static int bd_speedupreq;
/*
* Synchronization (sleep/wakeup) variable for active buffer space requests.
* Set when wait starts, cleared prior to wakeup().
* Used in runningbufwakeup() and waitrunningbufspace().
*/
static int runningbufreq;
/*
* Synchronization for bwillwrite() waiters.
*/
static int bdirtywait;
/*
* Definitions for the buffer free lists.
*/
#define QUEUE_NONE 0 /* on no queue */
#define QUEUE_EMPTY 1 /* empty buffer headers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
#define QUEUE_SENTINEL 4 /* not a queue index, but a mark for the sentinel */
/* Maximum number of buffer domains. */
#define BUF_DOMAINS 8
struct bufdomainset bdlodirty; /* Domains > lodirty */
struct bufdomainset bdhidirty; /* Domains > hidirty */
/* Configured number of clean queues. */
static int __read_mostly buf_domains;
BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
struct bufqueue __exclusive_cache_line bqempty;
/*
* per-cpu empty buffer cache.
*/
uma_zone_t buf_zone;
/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
*/
const char *buf_wmesg = BUF_WMESG;
static int
sysctl_runningspace(SYSCTL_HANDLER_ARGS)
{
long value;
int error;
value = *(long *)arg1;
error = sysctl_handle_long(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
mtx_lock(&rbreqlock);
if (arg1 == &hirunningspace) {
if (value < lorunningspace)
error = EINVAL;
else
hirunningspace = value;
} else {
KASSERT(arg1 == &lorunningspace,
("%s: unknown arg1", __func__));
if (value > hirunningspace)
error = EINVAL;
else
lorunningspace = value;
}
mtx_unlock(&rbreqlock);
return (error);
}
static int
sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
{
int error;
int value;
int i;
value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
*(int *)arg1 = value;
for (i = 0; i < buf_domains; i++)
*(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
value / buf_domains;
return (error);
}
static int
sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
{
long value;
int error;
int i;
value = *(long *)arg1;
error = sysctl_handle_long(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
*(long *)arg1 = value;
for (i = 0; i < buf_domains; i++)
*(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
value / buf_domains;
return (error);
}
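/*
 * A minimal sketch of the offsetof()-driven fan-out used by the two handlers
 * above: a global tunable is split evenly across an array of per-domain
 * structures by writing through a byte offset into each element.  The struct
 * and values below are made up for illustration.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct domain {				/* stand-in for struct bufdomain */
	long	lo;
	long	hi;
};

#define NDOMAINS	4
static struct domain domains[NDOMAINS];

/* Write value/ndomains into the field at byte offset 'off' of each domain. */
static void
fanout_long(long value, size_t off, int ndomains)
{
	for (int i = 0; i < ndomains; i++)
		*(long *)(uintptr_t)((uintptr_t)&domains[i] + off) =
		    value / ndomains;
}

int
main(void)
{
	fanout_long(1000, offsetof(struct domain, hi), NDOMAINS);
	printf("domains[2].hi = %ld\n", domains[2].hi);	/* prints 250 */
	return (0);
}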
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
long lvalue;
int ivalue;
int i;
lvalue = 0;
for (i = 0; i < buf_domains; i++)
lvalue += bdomain[i].bd_bufspace;
if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
return (sysctl_handle_long(oidp, &lvalue, 0, req));
if (lvalue > INT_MAX)
/* On overflow, still write out a long to trigger ENOMEM. */
return (sysctl_handle_long(oidp, &lvalue, 0, req));
ivalue = lvalue;
return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
#else
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
long lvalue;
int i;
lvalue = 0;
for (i = 0; i < buf_domains; i++)
lvalue += bdomain[i].bd_bufspace;
return (sysctl_handle_long(oidp, &lvalue, 0, req));
}
#endif
static int
sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
{
int value;
int i;
value = 0;
for (i = 0; i < buf_domains; i++)
value += bdomain[i].bd_numdirtybuffers;
return (sysctl_handle_int(oidp, &value, 0, req));
}
/*
* bdirtywakeup:
*
* Wakeup any bwillwrite() waiters.
*/
static void
bdirtywakeup(void)
{
mtx_lock(&bdirtylock);
if (bdirtywait) {
bdirtywait = 0;
wakeup(&bdirtywait);
}
mtx_unlock(&bdirtylock);
}
/*
* bd_clear:
*
* Clear a domain from the appropriate bitsets when dirtybuffers
* is decremented.
*/
static void
bd_clear(struct bufdomain *bd)
{
mtx_lock(&bdirtylock);
if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
mtx_unlock(&bdirtylock);
}
/*
* bd_set:
*
* Set a domain in the appropriate bitsets when dirtybuffers
* is incremented.
*/
static void
bd_set(struct bufdomain *bd)
{
mtx_lock(&bdirtylock);
if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
mtx_unlock(&bdirtylock);
}
/*
* bdirtysub:
*
* Decrement the numdirtybuffers count by one and wakeup any
* threads blocked in bwillwrite().
*/
static void
bdirtysub(struct buf *bp)
{
struct bufdomain *bd;
int num;
bd = bufdomain(bp);
num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
bdirtywakeup();
if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
bd_clear(bd);
}
/*
* bdirtyadd:
*
* Increment the numdirtybuffers count by one and wakeup the buf
* daemon if needed.
*/
static void
bdirtyadd(struct buf *bp)
{
struct bufdomain *bd;
int num;
/*
* Only do the wakeup once as we cross the boundary. The
* buf daemon will keep running until the condition clears.
*/
bd = bufdomain(bp);
num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
bd_wakeup();
if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
bd_set(bd);
}
/*
* bufspace_daemon_wakeup:
*
* Wakeup the daemons responsible for freeing clean bufs.
*/
static void
bufspace_daemon_wakeup(struct bufdomain *bd)
{
/*
* avoid the lock if the daemon is running.
*/
if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
BD_RUN_LOCK(bd);
atomic_store_int(&bd->bd_running, 1);
wakeup(&bd->bd_running);
BD_RUN_UNLOCK(bd);
}
}
/*
* bufspace_daemon_wait:
*
* Sleep until the domain falls below a limit or one second passes.
*/
static void
bufspace_daemon_wait(struct bufdomain *bd)
{
/*
* Re-check our limits and sleep. bd_running must be
* cleared prior to checking the limits to avoid missed
* wakeups. The waker will adjust one of bufspace or
* freebuffers prior to checking bd_running.
*/
BD_RUN_LOCK(bd);
atomic_store_int(&bd->bd_running, 0);
if (bd->bd_bufspace < bd->bd_bufspacethresh &&
bd->bd_freebuffers > bd->bd_lofreebuffers) {
msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
"-", hz);
} else {
/* Avoid spurious wakeups while running. */
atomic_store_int(&bd->bd_running, 1);
BD_RUN_UNLOCK(bd);
}
}
/*
* bufspace_adjust:
*
* Adjust the reported bufspace for a KVA managed buffer, possibly
* waking any waiters.
*/
static void
bufspace_adjust(struct buf *bp, int bufsize)
{
struct bufdomain *bd;
long space;
int diff;
KASSERT((bp->b_flags & B_MALLOC) == 0,
("bufspace_adjust: malloc buf %p", bp));
bd = bufdomain(bp);
diff = bufsize - bp->b_bufsize;
if (diff < 0) {
atomic_subtract_long(&bd->bd_bufspace, -diff);
} else if (diff > 0) {
space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
/* Wake up the daemon on the transition. */
if (space < bd->bd_bufspacethresh &&
space + diff >= bd->bd_bufspacethresh)
bufspace_daemon_wakeup(bd);
}
bp->b_bufsize = bufsize;
}
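/*
 * A sketch of the "wake only on the transition" pattern used by
 * bufspace_adjust() and bufspace_reserve() above: the fetch-add returns the
 * old value, so exactly the thread that crosses the threshold issues the
 * single wakeup.  The C11-atomics version below is illustrative; the names
 * are not kernel APIs.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long space;
static long thresh = 100;

static void
daemon_wakeup(void)
{
	printf("wakeup issued\n");
}

static void
space_add(long diff)
{
	long old;

	old = atomic_fetch_add(&space, diff);
	/* Only the addition that crosses the threshold wakes the daemon. */
	if (old < thresh && old + diff >= thresh)
		daemon_wakeup();
}

int
main(void)
{
	space_add(60);		/* below threshold, no wakeup */
	space_add(60);		/* crosses 100, one wakeup */
	space_add(60);		/* already above, no wakeup */
	return (0);
}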
/*
* bufspace_reserve:
*
* Reserve bufspace before calling allocbuf(). Metadata has a
* different space limit than data.
*/
static int
bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
{
long limit, new;
long space;
if (metadata)
limit = bd->bd_maxbufspace;
else
limit = bd->bd_hibufspace;
space = atomic_fetchadd_long(&bd->bd_bufspace, size);
new = space + size;
if (new > limit) {
atomic_subtract_long(&bd->bd_bufspace, size);
return (ENOSPC);
}
/* Wake up the daemon on the transition. */
if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
bufspace_daemon_wakeup(bd);
return (0);
}
/*
* bufspace_release:
*
* Release reserved bufspace after bufspace_adjust() has consumed it.
*/
static void
bufspace_release(struct bufdomain *bd, int size)
{
atomic_subtract_long(&bd->bd_bufspace, size);
}
/*
* bufspace_wait:
*
* Wait for bufspace, acting as the buf daemon if a locked vnode is
* supplied. bd_wanted must be set prior to polling for space. The
* operation must be re-tried on return.
*/
static void
bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
int slpflag, int slptimeo)
{
struct thread *td;
int error, fl, norunbuf;
if ((gbflags & GB_NOWAIT_BD) != 0)
return;
td = curthread;
BD_LOCK(bd);
while (bd->bd_wanted) {
if (vp != NULL && vp->v_type != VCHR &&
(td->td_pflags & TDP_BUFNEED) == 0) {
BD_UNLOCK(bd);
/*
* getblk() is called with a vnode locked, and
* some majority of the dirty buffers may as
* well belong to the vnode. Flushing the
* buffers there would make progress that
* cannot be achieved by the buf_daemon, which
* cannot lock the vnode.
*/
norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
(td->td_pflags & TDP_NORUNNINGBUF);
/*
* Play bufdaemon. The getnewbuf() function
* may be called while the thread owns lock
* for another dirty buffer for the same
* vnode, which makes it impossible to use
* VOP_FSYNC() there, due to the buffer lock
* recursion.
*/
td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
fl = buf_flush(vp, bd, flushbufqtarget);
td->td_pflags &= norunbuf;
BD_LOCK(bd);
if (fl != 0)
continue;
if (bd->bd_wanted == 0)
break;
}
error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
(PRIBIO + 4) | slpflag, "newbuf", slptimeo);
if (error != 0)
break;
}
BD_UNLOCK(bd);
}
/*
* bufspace_daemon:
*
* buffer space management daemon. Tries to maintain some marginal
* amount of free buffer space so that requesting processes neither
* block nor work to reclaim buffers.
*/
static void
bufspace_daemon(void *arg)
{
struct bufdomain *bd;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
SHUTDOWN_PRI_LAST + 100);
bd = arg;
for (;;) {
kthread_suspend_check();
/*
* Free buffers from the clean queue until we meet our
* targets.
*
* Theory of operation: The buffer cache is most efficient
* when some free buffer headers and space are always
* available to getnewbuf(). This daemon attempts to prevent
* the excessive blocking and synchronization associated
* with shortfall. It goes through three phases according
* to demand:
*
* 1) The daemon wakes up voluntarily once per-second
* during idle periods when the counters are below
* the wakeup thresholds (bufspacethresh, lofreebuffers).
*
* 2) The daemon wakes up as we cross the thresholds
* ahead of any potential blocking. This may bounce
* slightly according to the rate of consumption and
* release.
*
* 3) The daemon and consumers are starved for working
* clean buffers. This is the 'bufspace' sleep below
* which will inefficiently trade bufs with bqrelse
* until we return to condition 2.
*/
while (bd->bd_bufspace > bd->bd_lobufspace ||
bd->bd_freebuffers < bd->bd_hifreebuffers) {
if (buf_recycle(bd, false) != 0) {
if (bd_flushall(bd))
continue;
/*
* Speedup dirty if we've run out of clean
* buffers. This is possible in particular
* because softdep may hold many bufs locked
* pending writes to other bufs which are
* marked for delayed write, exhausting
* clean space until they are written.
*/
bd_speedup();
BD_LOCK(bd);
if (bd->bd_wanted) {
msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
PRIBIO|PDROP, "bufspace", hz/10);
} else
BD_UNLOCK(bd);
}
maybe_yield();
}
bufspace_daemon_wait(bd);
}
}
/*
* bufmallocadjust:
*
* Adjust the reported bufspace for a malloc managed buffer, possibly
* waking any waiters.
*/
static void
bufmallocadjust(struct buf *bp, int bufsize)
{
int diff;
KASSERT((bp->b_flags & B_MALLOC) != 0,
("bufmallocadjust: non-malloc buf %p", bp));
diff = bufsize - bp->b_bufsize;
if (diff < 0)
atomic_subtract_long(&bufmallocspace, -diff);
else
atomic_add_long(&bufmallocspace, diff);
bp->b_bufsize = bufsize;
}
/*
* runningwakeup:
*
* Wake up processes that are waiting on asynchronous writes to fall
* below lorunningspace.
*/
static void
runningwakeup(void)
{
mtx_lock(&rbreqlock);
if (runningbufreq) {
runningbufreq = 0;
wakeup(&runningbufreq);
}
mtx_unlock(&rbreqlock);
}
/*
* runningbufwakeup:
*
* Decrement the outstanding write count accordingly.
*/
void
runningbufwakeup(struct buf *bp)
{
long space, bspace;
bspace = bp->b_runningbufspace;
if (bspace == 0)
return;
space = atomic_fetchadd_long(&runningbufspace, -bspace);
KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
space, bspace));
bp->b_runningbufspace = 0;
/*
* Only acquire the lock and wakeup on the transition from exceeding
* the threshold to falling below it.
*/
if (space < lorunningspace)
return;
if (space - bspace > lorunningspace)
return;
runningwakeup();
}
/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
* running. This routine is used in async-write situations to
* prevent creating huge backups of pending writes to a device.
* Only asynchronous writes are governed by this function.
*
* This does NOT turn an async write into a sync write. It waits
* for earlier writes to complete and generally returns before the
* caller's write has reached the device.
*/
void
waitrunningbufspace(void)
{
mtx_lock(&rbreqlock);
while (runningbufspace > hirunningspace) {
runningbufreq = 1;
msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
}
mtx_unlock(&rbreqlock);
}
/*
* vfs_buf_test_cache:
*
* Called when a buffer is extended. This function clears the B_CACHE
* bit if the newly extended portion of the buffer does not contain
* valid data.
*/
static __inline void
vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
vm_offset_t size, vm_page_t m)
{
/*
* This function and its results are protected by higher level
* synchronization requiring vnode and buf locks to page in and
* validate pages.
*/
if (bp->b_flags & B_CACHE) {
int base = (foff + off) & PAGE_MASK;
if (vm_page_is_valid(m, base, size) == 0)
bp->b_flags &= ~B_CACHE;
}
}
/* Wake up the buffer daemon if necessary */
static void
bd_wakeup(void)
{
mtx_lock(&bdlock);
if (bd_request == 0) {
bd_request = 1;
wakeup(&bd_request);
}
mtx_unlock(&bdlock);
}
/*
* Adjust the maxbcachebuf tunable.
*/
static void
maxbcachebuf_adjust(void)
{
int i;
/*
* maxbcachebuf must be a power of 2 >= MAXBSIZE.
*/
i = 2;
while (i * 2 <= maxbcachebuf)
i *= 2;
maxbcachebuf = i;
if (maxbcachebuf < MAXBSIZE)
maxbcachebuf = MAXBSIZE;
- if (maxbcachebuf > MAXPHYS)
- maxbcachebuf = MAXPHYS;
+ if (maxbcachebuf > maxphys)
+ maxbcachebuf = maxphys;
if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
printf("maxbcachebuf=%d\n", maxbcachebuf);
}
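/*
 * A sketch of the rounding done by maxbcachebuf_adjust() above: the tunable
 * is rounded down to a power of two and then clamped to a [minimum, maximum]
 * range.  The bounds used in main() are example numbers, not the kernel's.
 */
#include <stdio.h>

static int
round_down_pow2_clamped(int value, int minval, int maxval)
{
	int i;

	/* Largest power of two that does not exceed 'value'. */
	i = 2;
	while (i * 2 <= value)
		i *= 2;
	if (i < minval)
		i = minval;
	if (i > maxval)
		i = maxval;
	return (i);
}

int
main(void)
{
	/* 65536..131071 all round down to 65536 before clamping. */
	printf("%d\n", round_down_pow2_clamped(100000, 4096, 1 << 20));
	/* Small values are pulled up to the minimum, here 4096. */
	printf("%d\n", round_down_pow2_clamped(100, 4096, 1 << 20));
	return (0);
}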
/*
* bd_speedup - speedup the buffer cache flushing code
*/
void
bd_speedup(void)
{
int needwake;
mtx_lock(&bdlock);
needwake = 0;
if (bd_speedupreq == 0 || bd_request == 0)
needwake = 1;
bd_speedupreq = 1;
bd_request = 1;
if (needwake)
wakeup(&bd_request);
mtx_unlock(&bdlock);
}
#ifdef __i386__
#define TRANSIENT_DENOM 5
#else
#define TRANSIENT_DENOM 10
#endif
/*
* Calculating buffer cache scaling values and reserve space for buffer
* headers. This is called during low level kernel initialization and
* may be called more than once. We CANNOT write to the memory area
* being reserved at this time.
*/
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
int tuned_nbuf;
long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
/*
* physmem_est is in pages. Convert it to kilobytes (assumes
* PAGE_SIZE is >= 1K)
*/
physmem_est = physmem_est * (PAGE_SIZE / 1024);
maxbcachebuf_adjust();
/*
* The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
* For the first 64MB of ram nominally allocate sufficient buffers to
* cover 1/4 of our ram. Beyond the first 64MB allocate additional
* buffers to cover 1/10 of our ram over 64MB. When auto-sizing
* the buffer cache we limit the eventual kva reservation to
* maxbcache bytes.
*
* factor represents the 1/4 x ram conversion.
*/
if (nbuf == 0) {
int factor = 4 * BKVASIZE / 1024;
nbuf = 50;
if (physmem_est > 4096)
nbuf += min((physmem_est - 4096) / factor,
65536 / factor);
if (physmem_est > 65536)
nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
32 * 1024 * 1024 / (factor * 5));
if (maxbcache && nbuf > maxbcache / BKVASIZE)
nbuf = maxbcache / BKVASIZE;
tuned_nbuf = 1;
} else
tuned_nbuf = 0;
/* XXX Avoid unsigned long overflows later on with maxbufspace. */
maxbuf = (LONG_MAX / 3) / BKVASIZE;
if (nbuf > maxbuf) {
if (!tuned_nbuf)
printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
maxbuf);
nbuf = maxbuf;
}
/*
* Ideal allocation size for the transient bio submap is 10%
* of the maximal space buffer map. This roughly corresponds
* to the amount of the buffer mapped for typical UFS load.
*
* Clip the buffer map to reserve space for the transient
* BIOs, if its extent is bigger than 90% (80% on i386) of the
* maximum buffer map extent on the platform.
*
* The fall-back to maxbuf in case maxbcache is unset
* allows the buffer KVA not to be trimmed on the architectures
* with ample KVA space.
*/
if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
buf_sz = (long)nbuf * BKVASIZE;
if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
(TRANSIENT_DENOM - 1)) {
/*
* There is more KVA than memory. Do not
* adjust buffer map size, and assign the rest
* of maxbuf to transient map.
*/
biotmap_sz = maxbuf_sz - buf_sz;
} else {
/*
* Buffer map spans all KVA we could afford on
* this platform. Give 10% (20% on i386) of
* the buffer map to the transient bio map.
*/
biotmap_sz = buf_sz / TRANSIENT_DENOM;
buf_sz -= biotmap_sz;
}
- if (biotmap_sz / INT_MAX > MAXPHYS)
+ if (biotmap_sz / INT_MAX > maxphys)
bio_transient_maxcnt = INT_MAX;
else
- bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ bio_transient_maxcnt = biotmap_sz / maxphys;
/*
* Artificially limit to 1024 simultaneous in-flight I/Os
* using the transient mapping.
*/
if (bio_transient_maxcnt > 1024)
bio_transient_maxcnt = 1024;
if (tuned_nbuf)
nbuf = buf_sz / BKVASIZE;
}
if (nswbuf == 0) {
nswbuf = min(nbuf / 4, 256);
if (nswbuf < NSWBUF_MIN)
nswbuf = NSWBUF_MIN;
}
/*
* Reserve space for the buffer cache buffers
*/
- buf = (void *)v;
- v = (caddr_t)(buf + nbuf);
+ buf = (char *)v;
+ v = (caddr_t)buf + (sizeof(struct buf) + sizeof(vm_page_t) *
+ atop(maxbcachebuf)) * nbuf;
- return(v);
+ return (v);
}
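/*
 * A worked example of the nbuf auto-sizing above, assuming BKVASIZE is the
 * usual 16 KiB and an estimate of 1 GiB of physical memory; the kernel code
 * remains authoritative, this only reproduces the arithmetic in userland.
 */
#include <stdio.h>

#define BKVASIZE_KB	16	/* assumed: BKVASIZE of 16 KiB, in KiB units */

static long
lmin_(long a, long b)
{
	return (a < b ? a : b);
}

/* Reproduce the nbuf auto-sizing arithmetic for a RAM estimate in KiB. */
static long
tune_nbuf(long physmem_kb)
{
	long factor, nbuf;

	factor = 4 * BKVASIZE_KB;	/* the "1/4 x ram" conversion */
	nbuf = 50;
	if (physmem_kb > 4096)
		nbuf += lmin_((physmem_kb - 4096) / factor, 65536 / factor);
	if (physmem_kb > 65536)
		nbuf += lmin_((physmem_kb - 65536) * 2 / (factor * 5),
		    32 * 1024 * 1024 / (factor * 5));
	return (nbuf);
}

int
main(void)
{
	/* 1 GiB of RAM: 50 + 1024 + 6144 = 7218 buffer headers. */
	printf("nbuf = %ld\n", tune_nbuf(1024L * 1024));
	return (0);
}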
/* Initialize the buffer subsystem. Called before use of any buffers. */
void
bufinit(void)
{
struct buf *bp;
int i;
KASSERT(maxbcachebuf >= MAXBSIZE,
("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
MAXBSIZE));
bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
- unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
+ unmapped_buf = (caddr_t)kva_alloc(maxphys);
/* finally, initialize each buffer header and stick on empty q */
for (i = 0; i < nbuf; i++) {
- bp = &buf[i];
- bzero(bp, sizeof *bp);
+ bp = nbufp(i);
+ bzero(bp, sizeof(*bp) + sizeof(vm_page_t) * atop(maxbcachebuf));
bp->b_flags = B_INVAL;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = QUEUE_NONE;
bp->b_domain = -1;
bp->b_subqueue = mp_maxid + 1;
bp->b_xflags = 0;
bp->b_data = bp->b_kvabase = unmapped_buf;
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
bq_insert(&bqempty, bp, false);
}
/*
* maxbufspace is the absolute maximum amount of buffer space we are
* allowed to reserve in KVM and in real terms. The absolute maximum
* is nominally used by metadata. hibufspace is the nominal maximum
* used by most other requests. The differential is required to
* ensure that metadata deadlocks don't occur.
*
* maxbufspace is based on BKVASIZE. Allocating buffers larger than
* this may result in KVM fragmentation which is not handled optimally
* by the system. XXX This is less true with vmem. We could use
* PAGE_SIZE.
*/
maxbufspace = (long)nbuf * BKVASIZE;
hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
lobufspace = (hibufspace / 20) * 19; /* 95% */
bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
/*
* Note: The 16 MiB upper limit for hirunningspace was chosen
* arbitrarily and may need further tuning. It corresponds to
* 128 outstanding write IO requests (if IO size is 128 KiB),
* which fits with many RAID controllers' tagged queuing limits.
* The lower 1 MiB limit is the historical upper limit for
* hirunningspace.
*/
hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
16 * 1024 * 1024), 1024 * 1024);
lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
* allocation, we don't want the malloced region to grow uncontrolled.
* The malloc scheme improves memory utilization significantly on
* average (small) directories.
*/
maxbufmallocspace = hibufspace / 20;
/*
* Reduce the chance of a deadlock occurring by limiting the number
* of delayed-write dirty buffers we allow to stack up.
*/
hidirtybuffers = nbuf / 4 + 20;
dirtybufthresh = hidirtybuffers * 9 / 10;
/*
* To support extreme low-memory systems, make sure hidirtybuffers
* cannot eat up all available buffer space. This occurs when our
* minimum cannot be met. We try to size hidirtybuffers to 3/4 our
* buffer space assuming BKVASIZE'd buffers.
*/
while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
lodirtybuffers = hidirtybuffers / 2;
/*
* lofreebuffers should be sufficient to avoid stalling waiting on
* buf headers under heavy utilization. The bufs in per-cpu caches
* are counted as free but will be unavailable to threads executing
* on other cpus.
*
* hifreebuffers is the free target for the bufspace daemon. This
* should be set appropriately to limit work per-iteration.
*/
lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
hifreebuffers = (3 * lofreebuffers) / 2;
numfreebuffers = nbuf;
/* Setup the kva and free list allocators. */
vmem_set_reclaim(buffer_arena, bufkva_reclaim);
- buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+ buf_zone = uma_zcache_create("buf free cache",
+ sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf),
NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
/*
* Size the clean queue according to the amount of buffer space.
* One queue per 256MB up to the max. More queues give better
* concurrency but less accurate LRU.
*/
buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
for (i = 0 ; i < buf_domains; i++) {
struct bufdomain *bd;
bd = &bdomain[i];
bd_init(bd);
bd->bd_freebuffers = nbuf / buf_domains;
bd->bd_hifreebuffers = hifreebuffers / buf_domains;
bd->bd_lofreebuffers = lofreebuffers / buf_domains;
bd->bd_bufspace = 0;
bd->bd_maxbufspace = maxbufspace / buf_domains;
bd->bd_hibufspace = hibufspace / buf_domains;
bd->bd_lobufspace = lobufspace / buf_domains;
bd->bd_bufspacethresh = bufspacethresh / buf_domains;
bd->bd_numdirtybuffers = 0;
bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
/* Don't allow more than 2% of bufs in the per-cpu caches. */
bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
}
getnewbufcalls = counter_u64_alloc(M_WAITOK);
getnewbufrestarts = counter_u64_alloc(M_WAITOK);
mappingrestarts = counter_u64_alloc(M_WAITOK);
numbufallocfails = counter_u64_alloc(M_WAITOK);
notbufdflushes = counter_u64_alloc(M_WAITOK);
buffreekvacnt = counter_u64_alloc(M_WAITOK);
bufdefragcnt = counter_u64_alloc(M_WAITOK);
bufkvaspace = counter_u64_alloc(M_WAITOK);
}
#ifdef INVARIANTS
static inline void
vfs_buf_check_mapped(struct buf *bp)
{
KASSERT(bp->b_kvabase != unmapped_buf,
("mapped buf: b_kvabase was not updated %p", bp));
KASSERT(bp->b_data != unmapped_buf,
("mapped buf: b_data was not updated %p", bp));
KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
- MAXPHYS, ("b_data + b_offset unmapped %p", bp));
+ maxphys, ("b_data + b_offset unmapped %p", bp));
}
static inline void
vfs_buf_check_unmapped(struct buf *bp)
{
KASSERT(bp->b_data == unmapped_buf,
("unmapped buf: corrupted b_data %p", bp));
}
#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
#else
#define BUF_CHECK_MAPPED(bp) do {} while (0)
#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
#endif
static int
isbufbusy(struct buf *bp)
{
if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
return (1);
return (0);
}
/*
* Shutdown the system cleanly to prepare for reboot, halt, or power off.
*/
void
bufshutdown(int show_busybufs)
{
static int first_buf_printf = 1;
struct buf *bp;
- int iter, nbusy, pbusy;
+ int i, iter, nbusy, pbusy;
#ifndef PREEMPTION
int subiter;
#endif
/*
* Sync filesystems for shutdown
*/
wdog_kern_pat(WD_LASTVAL);
kern_sync(curthread);
/*
* With soft updates, some buffers that are
* written will be remarked as dirty until other
* buffers are written.
*/
for (iter = pbusy = 0; iter < 20; iter++) {
nbusy = 0;
- for (bp = &buf[nbuf]; --bp >= buf; )
+ for (i = nbuf - 1; i >= 0; i--) {
+ bp = nbufp(i);
if (isbufbusy(bp))
nbusy++;
+ }
if (nbusy == 0) {
if (first_buf_printf)
printf("All buffers synced.");
break;
}
if (first_buf_printf) {
printf("Syncing disks, buffers remaining... ");
first_buf_printf = 0;
}
printf("%d ", nbusy);
if (nbusy < pbusy)
iter = 0;
pbusy = nbusy;
wdog_kern_pat(WD_LASTVAL);
kern_sync(curthread);
#ifdef PREEMPTION
/*
* Spin for a while to allow interrupt threads to run.
*/
DELAY(50000 * iter);
#else
/*
* Context switch several times to allow interrupt
* threads to run.
*/
for (subiter = 0; subiter < 50 * iter; subiter++) {
thread_lock(curthread);
mi_switch(SW_VOL);
DELAY(1000);
}
#endif
}
printf("\n");
/*
* Count only busy local buffers to prevent forcing
* a fsck if we're just a client of a wedged NFS server
*/
nbusy = 0;
- for (bp = &buf[nbuf]; --bp >= buf; ) {
+ for (i = nbuf - 1; i >= 0; i--) {
+ bp = nbufp(i);
if (isbufbusy(bp)) {
#if 0
/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
if (bp->b_dev == NULL) {
TAILQ_REMOVE(&mountlist,
bp->b_vp->v_mount, mnt_list);
continue;
}
#endif
nbusy++;
if (show_busybufs > 0) {
printf(
"%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
nbusy, bp, bp->b_vp, bp->b_flags,
(intmax_t)bp->b_blkno,
(intmax_t)bp->b_lblkno);
BUF_LOCKPRINTINFO(bp);
if (show_busybufs > 1)
vn_printf(bp->b_vp,
"vnode content: ");
}
}
}
if (nbusy) {
/*
* Failed to sync all blocks. Indicate this and don't
* unmount filesystems (thus forcing an fsck on reboot).
*/
printf("Giving up on %d buffers\n", nbusy);
DELAY(5000000); /* 5 seconds */
} else {
if (!first_buf_printf)
printf("Final sync complete\n");
/*
* Unmount filesystems
*/
if (!KERNEL_PANICKED())
vfs_unmountall();
}
swapoff_all();
DELAY(100000); /* wait for console output to finish */
}
static void
bpmap_qenter(struct buf *bp)
{
BUF_CHECK_MAPPED(bp);
/*
* bp->b_data is relative to bp->b_offset, but
* bp->b_offset may be offset into the first page.
*/
bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
static inline struct bufdomain *
bufdomain(struct buf *bp)
{
return (&bdomain[bp->b_domain]);
}
static struct bufqueue *
bufqueue(struct buf *bp)
{
switch (bp->b_qindex) {
case QUEUE_NONE:
/* FALLTHROUGH */
case QUEUE_SENTINEL:
return (NULL);
case QUEUE_EMPTY:
return (&bqempty);
case QUEUE_DIRTY:
return (&bufdomain(bp)->bd_dirtyq);
case QUEUE_CLEAN:
return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
default:
break;
}
panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
}
/*
* Return the locked bufqueue that bp is a member of.
*/
static struct bufqueue *
bufqueue_acquire(struct buf *bp)
{
struct bufqueue *bq, *nbq;
/*
* bp can be pushed from a per-cpu queue to the
* cleanq while we're waiting on the lock. Retry
* if the queues don't match.
*/
bq = bufqueue(bp);
BQ_LOCK(bq);
for (;;) {
nbq = bufqueue(bp);
if (bq == nbq)
break;
BQ_UNLOCK(bq);
BQ_LOCK(nbq);
bq = nbq;
}
return (bq);
}
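/*
 * A sketch of the lock-and-revalidate loop in bufqueue_acquire() above: the
 * object can migrate to another queue while we sleep on the first queue's
 * lock, so after acquiring a lock the membership is re-read and the loop
 * retries until it is stable.  The pthread-based types are illustrative only,
 * and the plain read of obj->owner is a simplification of the kernel's
 * buf/queue locking protocol.
 */
#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_mutex_t	lock;
	const char	*name;
};

struct object {
	struct queue	*owner;		/* may change while unlocked */
};

/* Return with the lock of the queue that currently owns 'obj' held. */
static struct queue *
queue_acquire(struct object *obj)
{
	struct queue *q, *nq;

	q = obj->owner;
	pthread_mutex_lock(&q->lock);
	for (;;) {
		nq = obj->owner;
		if (q == nq)
			break;
		/* Owner changed while we slept on the lock; retry. */
		pthread_mutex_unlock(&q->lock);
		pthread_mutex_lock(&nq->lock);
		q = nq;
	}
	return (q);
}

int
main(void)
{
	struct queue a = { PTHREAD_MUTEX_INITIALIZER, "a" };
	struct object o = { .owner = &a };
	struct queue *q;

	q = queue_acquire(&o);
	printf("locked queue %s\n", q->name);
	pthread_mutex_unlock(&q->lock);
	return (0);
}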
/*
* binsfree:
*
* Insert the buffer into the appropriate free list. Requires a
* locked buffer on entry and buffer is unlocked before return.
*/
static void
binsfree(struct buf *bp, int qindex)
{
struct bufdomain *bd;
struct bufqueue *bq;
KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
("binsfree: Invalid qindex %d", qindex));
BUF_ASSERT_XLOCKED(bp);
/*
* Handle delayed bremfree() processing.
*/
if (bp->b_flags & B_REMFREE) {
if (bp->b_qindex == qindex) {
bp->b_flags |= B_REUSE;
bp->b_flags &= ~B_REMFREE;
BUF_UNLOCK(bp);
return;
}
bq = bufqueue_acquire(bp);
bq_remove(bq, bp);
BQ_UNLOCK(bq);
}
bd = bufdomain(bp);
if (qindex == QUEUE_CLEAN) {
if (bd->bd_lim != 0)
bq = &bd->bd_subq[PCPU_GET(cpuid)];
else
bq = bd->bd_cleanq;
} else
bq = &bd->bd_dirtyq;
bq_insert(bq, bp, true);
}
/*
* buf_free:
*
* Free a buffer to the buf zone once it no longer has valid contents.
*/
static void
buf_free(struct buf *bp)
{
if (bp->b_flags & B_REMFREE)
bremfreef(bp);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 1");
if (bp->b_rcred != NOCRED) {
crfree(bp->b_rcred);
bp->b_rcred = NOCRED;
}
if (bp->b_wcred != NOCRED) {
crfree(bp->b_wcred);
bp->b_wcred = NOCRED;
}
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
bufkva_free(bp);
atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
+ MPASS((bp->b_flags & B_MAXPHYS) == 0);
BUF_UNLOCK(bp);
uma_zfree(buf_zone, bp);
}
/*
* buf_import:
*
* Import bufs into the uma cache from the buf list. The system still
* expects a static array of bufs and much of the synchronization
* around bufs assumes type stable storage. As a result, UMA is used
* only as a per-cpu cache of bufs still maintained on a global list.
*/
static int
buf_import(void *arg, void **store, int cnt, int domain, int flags)
{
struct buf *bp;
int i;
BQ_LOCK(&bqempty);
for (i = 0; i < cnt; i++) {
bp = TAILQ_FIRST(&bqempty.bq_queue);
if (bp == NULL)
break;
bq_remove(&bqempty, bp);
store[i] = bp;
}
BQ_UNLOCK(&bqempty);
return (i);
}
/*
* buf_release:
*
* Release bufs from the uma cache back to the buffer queues.
*/
static void
buf_release(void *arg, void **store, int cnt)
{
struct bufqueue *bq;
struct buf *bp;
int i;
bq = &bqempty;
BQ_LOCK(bq);
for (i = 0; i < cnt; i++) {
bp = store[i];
/* Inline bq_insert() to batch locking. */
TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
bp->b_flags &= ~(B_AGE | B_REUSE);
bq->bq_len++;
bp->b_qindex = bq->bq_index;
}
BQ_UNLOCK(bq);
}
/*
* buf_alloc:
*
* Allocate an empty buffer header.
*/
static struct buf *
buf_alloc(struct bufdomain *bd)
{
struct buf *bp;
int freebufs, error;
/*
* We can only run out of bufs in the buf zone if the average buf
* is less than BKVASIZE. In this case the actual wait/block will
* come from buf_recycle() failing to flush one of these small bufs.
*/
bp = NULL;
freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
if (freebufs > 0)
bp = uma_zalloc(buf_zone, M_NOWAIT);
if (bp == NULL) {
atomic_add_int(&bd->bd_freebuffers, 1);
bufspace_daemon_wakeup(bd);
counter_u64_add(numbufallocfails, 1);
return (NULL);
}
/*
* Wake-up the bufspace daemon on transition below threshold.
*/
if (freebufs == bd->bd_lofreebuffers)
bufspace_daemon_wakeup(bd);
error = BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp,
error));
(void)error;
KASSERT(bp->b_vp == NULL,
("bp: %p still has vnode %p.", bp, bp->b_vp));
KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
("invalid buffer %p flags %#x", bp, bp->b_flags));
KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
KASSERT(bp->b_npages == 0,
("bp: %p still has %d vm pages\n", bp, bp->b_npages));
KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+ MPASS((bp->b_flags & B_MAXPHYS) == 0);
bp->b_domain = BD_DOMAIN(bd);
bp->b_flags = 0;
bp->b_ioflags = 0;
bp->b_xflags = 0;
bp->b_vflags = 0;
bp->b_vp = NULL;
bp->b_blkno = bp->b_lblkno = 0;
bp->b_offset = NOOFFSET;
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_bufobj = NULL;
bp->b_data = bp->b_kvabase = unmapped_buf;
bp->b_fsprivate1 = NULL;
bp->b_fsprivate2 = NULL;
bp->b_fsprivate3 = NULL;
LIST_INIT(&bp->b_dep);
return (bp);
}
/*
* buf_recycle:
*
* Free a buffer from the given bufqueue. kva controls whether the
* freed buf must own some kva resources. This is used for
* defragmenting.
*/
static int
buf_recycle(struct bufdomain *bd, bool kva)
{
struct bufqueue *bq;
struct buf *bp, *nbp;
if (kva)
counter_u64_add(bufdefragcnt, 1);
nbp = NULL;
bq = bd->bd_cleanq;
BQ_LOCK(bq);
KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
("buf_recycle: Locks don't match"));
nbp = TAILQ_FIRST(&bq->bq_queue);
/*
* Run scan, possibly freeing data and/or kva mappings on the fly
* depending on whether kva reclamation was requested.
*/
while ((bp = nbp) != NULL) {
/*
* Calculate next bp (we can only use it if we do not
* release the bqlock).
*/
nbp = TAILQ_NEXT(bp, b_freelist);
/*
* If we are defragging then we need a buffer with
* some kva to reclaim.
*/
if (kva && bp->b_kvasize == 0)
continue;
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
continue;
/*
* Implement a second chance algorithm for frequently
* accessed buffers.
*/
if ((bp->b_flags & B_REUSE) != 0) {
TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
bp->b_flags &= ~B_REUSE;
BUF_UNLOCK(bp);
continue;
}
/*
* Skip buffers with background writes in progress.
*/
if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
BUF_UNLOCK(bp);
continue;
}
KASSERT(bp->b_qindex == QUEUE_CLEAN,
("buf_recycle: inconsistent queue %d bp %p",
bp->b_qindex, bp));
KASSERT(bp->b_domain == BD_DOMAIN(bd),
("getnewbuf: queue domain %d doesn't match request %d",
bp->b_domain, (int)BD_DOMAIN(bd)));
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
*/
bq_remove(bq, bp);
BQ_UNLOCK(bq);
/*
* Requeue the background write buffer with error and
* restart the scan.
*/
if ((bp->b_vflags & BV_BKGRDERR) != 0) {
bqrelse(bp);
BQ_LOCK(bq);
nbp = TAILQ_FIRST(&bq->bq_queue);
continue;
}
bp->b_flags |= B_INVAL;
brelse(bp);
return (0);
}
bd->bd_wanted = 1;
BQ_UNLOCK(bq);
return (ENOBUFS);
}
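/*
 * A sketch of the second-chance scan buf_recycle() performs on the clean
 * queue: a recently reused entry has its flag cleared and is requeued at the
 * tail, while an unreferenced entry is evicted.  This is a generic userland
 * illustration, not the kernel's locking-aware version.
 */
#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
	int			id;
	bool			reused;
	TAILQ_ENTRY(entry)	link;
};

TAILQ_HEAD(equeue, entry);

/* Evict the first entry that has not been reused since the last scan. */
static struct entry *
second_chance_evict(struct equeue *q)
{
	struct entry *e;

	while ((e = TAILQ_FIRST(q)) != NULL) {
		if (e->reused) {
			/* Give it a second chance: clear and requeue. */
			e->reused = false;
			TAILQ_REMOVE(q, e, link);
			TAILQ_INSERT_TAIL(q, e, link);
			continue;
		}
		TAILQ_REMOVE(q, e, link);
		return (e);
	}
	return (NULL);
}

int
main(void)
{
	struct equeue q = TAILQ_HEAD_INITIALIZER(q);
	struct entry e[3] = {
		{ .id = 0, .reused = true },
		{ .id = 1, .reused = false },
		{ .id = 2, .reused = false },
	};

	for (int i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&q, &e[i], link);
	printf("evicted %d\n", second_chance_evict(&q)->id);	/* prints 1 */
	return (0);
}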
/*
* bremfree:
*
* Mark the buffer for removal from the appropriate free list.
*
*/
void
bremfree(struct buf *bp)
{
CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT((bp->b_flags & B_REMFREE) == 0,
("bremfree: buffer %p already marked for delayed removal.", bp));
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfree: buffer %p not on a queue.", bp));
BUF_ASSERT_XLOCKED(bp);
bp->b_flags |= B_REMFREE;
}
/*
* bremfreef:
*
* Force an immediate removal from a free list. Used only in nfs when
* it abuses the b_freelist pointer.
*/
void
bremfreef(struct buf *bp)
{
struct bufqueue *bq;
bq = bufqueue_acquire(bp);
bq_remove(bq, bp);
BQ_UNLOCK(bq);
}
static void
bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname)
{
mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF);
TAILQ_INIT(&bq->bq_queue);
bq->bq_len = 0;
bq->bq_index = qindex;
bq->bq_subqueue = subqueue;
}
static void
bd_init(struct bufdomain *bd)
{
int i;
bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
for (i = 0; i <= mp_maxid; i++)
bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
"bufq clean subqueue lock");
mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF);
}
/*
* bq_remove:
*
* Removes a buffer from the free list, must be called with the
* correct qlock held.
*/
static void
bq_remove(struct bufqueue *bq, struct buf *bp)
{
CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_qindex != QUEUE_NONE,
("bq_remove: buffer %p not on a queue.", bp));
KASSERT(bufqueue(bp) == bq,
("bq_remove: Remove buffer %p from wrong queue.", bp));
BQ_ASSERT_LOCKED(bq);
if (bp->b_qindex != QUEUE_EMPTY) {
BUF_ASSERT_XLOCKED(bp);
}
KASSERT(bq->bq_len >= 1,
("queue %d underflow", bp->b_qindex));
TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
bq->bq_len--;
bp->b_qindex = QUEUE_NONE;
bp->b_flags &= ~(B_REMFREE | B_REUSE);
}
static void
bd_flush(struct bufdomain *bd, struct bufqueue *bq)
{
struct buf *bp;
BQ_ASSERT_LOCKED(bq);
if (bq != bd->bd_cleanq) {
BD_LOCK(bd);
while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) {
TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp,
b_freelist);
bp->b_subqueue = bd->bd_cleanq->bq_subqueue;
}
bd->bd_cleanq->bq_len += bq->bq_len;
bq->bq_len = 0;
}
if (bd->bd_wanted) {
bd->bd_wanted = 0;
wakeup(&bd->bd_wanted);
}
if (bq != bd->bd_cleanq)
BD_UNLOCK(bd);
}
static int
bd_flushall(struct bufdomain *bd)
{
struct bufqueue *bq;
int flushed;
int i;
if (bd->bd_lim == 0)
return (0);
flushed = 0;
for (i = 0; i <= mp_maxid; i++) {
bq = &bd->bd_subq[i];
if (bq->bq_len == 0)
continue;
BQ_LOCK(bq);
bd_flush(bd, bq);
BQ_UNLOCK(bq);
flushed++;
}
return (flushed);
}
static void
bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock)
{
struct bufdomain *bd;
if (bp->b_qindex != QUEUE_NONE)
panic("bq_insert: free buffer %p onto another queue?", bp);
bd = bufdomain(bp);
if (bp->b_flags & B_AGE) {
/* Place this buf directly on the real queue. */
if (bq->bq_index == QUEUE_CLEAN)
bq = bd->bd_cleanq;
BQ_LOCK(bq);
TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist);
} else {
BQ_LOCK(bq);
TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
}
bp->b_flags &= ~(B_AGE | B_REUSE);
bq->bq_len++;
bp->b_qindex = bq->bq_index;
bp->b_subqueue = bq->bq_subqueue;
/*
* Unlock before we notify so that we don't wakeup a waiter that
* fails a trylock on the buf and sleeps again.
*/
if (unlock)
BUF_UNLOCK(bp);
if (bp->b_qindex == QUEUE_CLEAN) {
/*
* Flush the per-cpu queue and notify any waiters.
*/
if (bd->bd_wanted || (bq != bd->bd_cleanq &&
bq->bq_len >= bd->bd_lim))
bd_flush(bd, bq);
}
BQ_UNLOCK(bq);
}
/*
* bufkva_free:
*
* Free the kva allocation for a buffer.
*
*/
static void
bufkva_free(struct buf *bp)
{
#ifdef INVARIANTS
if (bp->b_kvasize == 0) {
KASSERT(bp->b_kvabase == unmapped_buf &&
bp->b_data == unmapped_buf,
("Leaked KVA space on %p", bp));
} else if (buf_mapped(bp))
BUF_CHECK_MAPPED(bp);
else
BUF_CHECK_UNMAPPED(bp);
#endif
if (bp->b_kvasize == 0)
return;
vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
counter_u64_add(bufkvaspace, -bp->b_kvasize);
counter_u64_add(buffreekvacnt, 1);
bp->b_data = bp->b_kvabase = unmapped_buf;
bp->b_kvasize = 0;
}
/*
* bufkva_alloc:
*
* Allocate the buffer KVA and set b_kvasize and b_kvabase.
*/
static int
bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
{
vm_offset_t addr;
int error;
KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
("Invalid gbflags 0x%x in %s", gbflags, __func__));
+ MPASS((bp->b_flags & B_MAXPHYS) == 0);
+ KASSERT(maxsize <= maxbcachebuf,
+ ("bufkva_alloc kva too large %d %u", maxsize, maxbcachebuf));
bufkva_free(bp);
addr = 0;
error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
if (error != 0) {
/*
* Buffer map is too fragmented. Request the caller
* to defragment the map.
*/
return (error);
}
bp->b_kvabase = (caddr_t)addr;
bp->b_kvasize = maxsize;
counter_u64_add(bufkvaspace, bp->b_kvasize);
if ((gbflags & GB_UNMAPPED) != 0) {
bp->b_data = unmapped_buf;
BUF_CHECK_UNMAPPED(bp);
} else {
bp->b_data = bp->b_kvabase;
BUF_CHECK_MAPPED(bp);
}
return (0);
}
/*
* bufkva_reclaim:
*
* Reclaim buffer KVA by freeing buffers that hold KVA. This is a vmem
* reclamation callback, invoked when an allocation would otherwise fail.
*/
static void
bufkva_reclaim(vmem_t *vmem, int flags)
{
bool done;
int q;
int i;
done = false;
for (i = 0; i < 5; i++) {
for (q = 0; q < buf_domains; q++)
if (buf_recycle(&bdomain[q], true) != 0)
done = true;
if (done)
break;
}
return;
}
/*
* Attempt to initiate asynchronous I/O on read-ahead blocks. We must
* clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set,
* the buffer is valid and we do not have to do anything.
*/
static void
breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt,
struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *))
{
struct buf *rabp;
struct thread *td;
int i;
td = curthread;
for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
if (inmem(vp, *rablkno))
continue;
rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
if ((rabp->b_flags & B_CACHE) != 0) {
brelse(rabp);
continue;
}
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
racct_add_buf(curproc, rabp, 0);
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
td->td_ru.ru_inblock++;
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
if ((flags & GB_CKHASH) != 0) {
rabp->b_flags |= B_CKHASH;
rabp->b_ckhashcalc = ckhashfunc;
}
rabp->b_ioflags &= ~BIO_ERROR;
rabp->b_iocmd = BIO_READ;
if (rabp->b_rcred == NOCRED && cred != NOCRED)
rabp->b_rcred = crhold(cred);
vfs_busy_pages(rabp, 0);
BUF_KERNPROC(rabp);
rabp->b_iooffset = dbtob(rabp->b_blkno);
bstrategy(rabp);
}
}
/*
* Entry point for bread() and breadn() via #defines in sys/buf.h.
*
* Get a buffer with the specified data. Look in the cache first. We
* must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
* is set, the buffer is valid and we do not have to do anything, see
* getblk(). Also starts asynchronous I/O on read-ahead blocks.
*
* Always return a NULL buffer pointer (in bpp) when returning an error.
*
* The blkno parameter is the logical block being requested. Normally
* the mapping of logical block number to disk block address is done
* by calling VOP_BMAP(). However, if the mapping is already known, the
* disk block address can be passed using the dblkno parameter. If the
* disk block address is not known, then the same value should be passed
* for blkno and dblkno.
*/
int
breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size,
daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags,
void (*ckhashfunc)(struct buf *), struct buf **bpp)
{
struct buf *bp;
struct thread *td;
int error, readwait, rv;
CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
td = curthread;
/*
* Can only return NULL if the GB_LOCK_NOWAIT or GB_NOSPARSE flags
* are specified.
*/
error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp);
if (error != 0) {
*bpp = NULL;
return (error);
}
KASSERT(blkno == bp->b_lblkno,
("getblkx returned buffer for blkno %jd instead of blkno %jd",
(intmax_t)bp->b_lblkno, (intmax_t)blkno));
flags &= ~GB_NOSPARSE;
*bpp = bp;
/*
* If not found in cache, do some I/O
*/
readwait = 0;
if ((bp->b_flags & B_CACHE) == 0) {
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
racct_add_buf(td->td_proc, bp, 0);
PROC_UNLOCK(td->td_proc);
}
#endif /* RACCT */
td->td_ru.ru_inblock++;
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
if ((flags & GB_CKHASH) != 0) {
bp->b_flags |= B_CKHASH;
bp->b_ckhashcalc = ckhashfunc;
}
if ((flags & GB_CVTENXIO) != 0)
bp->b_xflags |= BX_CVTENXIO;
bp->b_ioflags &= ~BIO_ERROR;
if (bp->b_rcred == NOCRED && cred != NOCRED)
bp->b_rcred = crhold(cred);
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
++readwait;
}
/*
* Attempt to initiate asynchronous I/O on read-ahead blocks.
*/
breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc);
rv = 0;
if (readwait) {
rv = bufwait(bp);
if (rv != 0) {
brelse(bp);
*bpp = NULL;
}
}
return (rv);
}
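/*
 * A minimal usage sketch (not part of the surrounding code): a typical
 * consumer reads a logical block through the bread() wrapper from sys/buf.h,
 * which funnels into breadn_flags() above. The function name and the
 * example_lbn/example_size parameters below are hypothetical placeholders;
 * error handling follows the contract documented above (*bpp is NULL on
 * error, so there is nothing to release).
 */
#if 0	/* illustrative example only, not compiled */
static int
example_read_block(struct vnode *vp, daddr_t example_lbn, int example_size)
{
	struct buf *bp;
	int error;

	/* bread() looks in the cache first and issues I/O only on a miss. */
	error = bread(vp, example_lbn, example_size, NOCRED, &bp);
	if (error != 0)
		return (error);

	/* ... inspect bp->b_data ... */

	brelse(bp);		/* return the buffer to the clean queue */
	return (0);
}
#endif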
/*
* Write, release buffer on completion. (Done by iodone
* if async). Do not bother writing anything if the buffer
* is invalid.
*
* Note that we set B_CACHE here, indicating that buffer is
* fully valid and thus cacheable. This is true even of NFS
* now so we set it generally. This could be set either here
* or in biodone() since the I/O is synchronous. We put it
* here.
*/
int
bufwrite(struct buf *bp)
{
int oldflags;
struct vnode *vp;
long space;
int vp_md;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
bp->b_flags |= B_INVAL | B_RELBUF;
bp->b_flags &= ~B_CACHE;
brelse(bp);
return (ENXIO);
}
if (bp->b_flags & B_INVAL) {
brelse(bp);
return (0);
}
if (bp->b_flags & B_BARRIER)
atomic_add_long(&barrierwrites, 1);
oldflags = bp->b_flags;
KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
("FFS background buffer should not get here %p", bp));
vp = bp->b_vp;
if (vp)
vp_md = vp->v_vflag & VV_MD;
else
vp_md = 0;
/*
* Mark the buffer clean. Increment the bufobj write count
* before bundirty() call, to prevent other thread from seeing
* empty dirty list and zero counter for writes in progress,
* falsely indicating that the bufobj is clean.
*/
bufobj_wref(bp->b_bufobj);
bundirty(bp);
bp->b_flags &= ~B_DONE;
bp->b_ioflags &= ~BIO_ERROR;
bp->b_flags |= B_CACHE;
bp->b_iocmd = BIO_WRITE;
vfs_busy_pages(bp, 1);
/*
* Normal bwrites pipeline writes
*/
bp->b_runningbufspace = bp->b_bufsize;
space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
racct_add_buf(curproc, bp, 1);
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
curthread->td_ru.ru_oublock++;
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
buf_track(bp, __func__);
bstrategy(bp);
if ((oldflags & B_ASYNC) == 0) {
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
} else if (space > hirunningspace) {
/*
* don't allow the async write to saturate the I/O
* system. We will not deadlock here because
* we are blocking waiting for I/O that is already in-progress
* to complete. We do not block here if it is the update
* or syncer daemon trying to clean up as that can lead
* to deadlock.
*/
if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
waitrunningbufspace();
}
return (0);
}
void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
struct buf *nbp;
if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
altbufferflushes++;
} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
BO_LOCK(bo);
/*
* Try to find a buffer to flush.
*/
TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
if ((nbp->b_vflags & BV_BKGRDINPROG) ||
BUF_LOCK(nbp,
LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
if (bp == nbp)
panic("bdwrite: found ourselves");
BO_UNLOCK(bo);
/* Don't call buf_countdeps() with the bo lock held. */
if (buf_countdeps(nbp, 0)) {
BO_LOCK(bo);
BUF_UNLOCK(nbp);
continue;
}
if (nbp->b_flags & B_CLUSTEROK) {
vfs_bio_awrite(nbp);
} else {
bremfree(nbp);
bawrite(nbp);
}
dirtybufferflushes++;
break;
}
if (nbp == NULL)
BO_UNLOCK(bo);
}
}
/*
* Delayed write. (Buffer is marked dirty). Do not bother writing
* anything if the buffer is marked invalid.
*
* Note that since the buffer must be completely valid, we can safely
* set B_CACHE. In fact, we have to set B_CACHE here rather than in
* biodone() in order to prevent getblk from writing the buffer
* out synchronously.
*/
void
bdwrite(struct buf *bp)
{
struct thread *td = curthread;
struct vnode *vp;
struct bufobj *bo;
CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
KASSERT((bp->b_flags & B_BARRIER) == 0,
("Barrier request in delayed write %p", bp));
if (bp->b_flags & B_INVAL) {
brelse(bp);
return;
}
/*
* If we have too many dirty buffers, don't create any more.
* If we are wildly over our limit, then force a complete
* cleanup. Otherwise, just keep the situation from getting
* out of control. Note that we have to avoid a recursive
* disaster and not try to clean up after our own cleanup!
*/
vp = bp->b_vp;
bo = bp->b_bufobj;
if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
td->td_pflags |= TDP_INBDFLUSH;
BO_BDFLUSH(bo, bp);
td->td_pflags &= ~TDP_INBDFLUSH;
} else
recursiveflushes++;
bdirty(bp);
/*
* Set B_CACHE, indicating that the buffer is fully valid. This is
* true even of NFS now.
*/
bp->b_flags |= B_CACHE;
/*
* This bmap keeps the system from needing to do the bmap later,
* perhaps when the system is attempting to do a sync. Since it
* is likely that the indirect block, or whatever other data
* structure the filesystem needs, is still in memory now, it is
* a good thing to do this. Note also that if the pageout daemon
* is requesting a sync, there might not be enough memory to do
* the bmap then, so it is important to do it now.
*/
if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
}
buf_track(bp, __func__);
/*
* Set the *dirty* buffer range based upon the VM system dirty
* pages.
*
* Mark the buffer pages as clean. We need to do this here to
* satisfy the vnode_pager and the pageout daemon, so that it
* thinks that the pages have been "cleaned". Note that since
* the pages are in a delayed write buffer -- the VFS layer
* "will" see that the pages get written out on the next sync,
* or perhaps the cluster will be completed.
*/
vfs_clean_pages_dirty_buf(bp);
bqrelse(bp);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
* due to the softdep code.
*/
}
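/*
 * A minimal sketch of the delayed-write pattern described above: get the
 * buffer, modify its contents, then bdwrite() it so that the update daemon
 * or buf daemon pushes it to disk later. The function name and the
 * example_lbn/example_size parameters are hypothetical placeholders.
 */
#if 0	/* illustrative example only, not compiled */
static void
example_delayed_update(struct vnode *vp, daddr_t example_lbn, int example_size)
{
	struct buf *bp;

	bp = getblk(vp, example_lbn, example_size, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		/*
		 * Contents are not valid; a real caller would read the
		 * block in (or zero-fill it) before doing a partial update.
		 */
	}

	/* ... modify bp->b_data ... */

	bdwrite(bp);	/* marks B_DELWRI, sets B_CACHE and releases bp */
}
#endif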
/*
* bdirty:
*
* Turn buffer into delayed write request. We must clear BIO_READ and
* B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
* itself to properly update it in the dirty/clean lists. We mark it
* B_DONE to ensure that any asynchronization of the buffer properly
* clears B_DONE ( else a panic will occur later ).
*
* bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
* might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
* should only be called if the buffer is known-good.
*
* Since the buffer is not on a queue, we do not update the numfreebuffers
* count.
*
* The buffer must be on QUEUE_NONE.
*/
void
bdirty(struct buf *bp)
{
CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
bp->b_flags &= ~(B_RELBUF);
bp->b_iocmd = BIO_WRITE;
if ((bp->b_flags & B_DELWRI) == 0) {
bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
reassignbuf(bp);
bdirtyadd(bp);
}
}
/*
* bundirty:
*
* Clear B_DELWRI for buffer.
*
* Since the buffer is not on a queue, we do not update the numfreebuffers
* count.
*
* The buffer must be on QUEUE_NONE.
*/
void
bundirty(struct buf *bp)
{
CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
if (bp->b_flags & B_DELWRI) {
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp);
bdirtysub(bp);
}
/*
* Since it is now being written, we can clear its deferred write flag.
*/
bp->b_flags &= ~B_DEFERRED;
}
/*
* bawrite:
*
* Asynchronous write. Start output on a buffer, but do not wait for
* it to complete. The buffer is released when the output completes.
*
* bwrite() ( or the VOP routine anyway ) is responsible for handling
* B_INVAL buffers. Not us.
*/
void
bawrite(struct buf *bp)
{
bp->b_flags |= B_ASYNC;
(void) bwrite(bp);
}
/*
* babarrierwrite:
*
* Asynchronous barrier write. Start output on a buffer, but do not
* wait for it to complete. Place a write barrier after this write so
* that this buffer and all buffers written before it are committed to
* the disk before any buffers written after this write are committed
* to the disk. The buffer is released when the output completes.
*/
void
babarrierwrite(struct buf *bp)
{
bp->b_flags |= B_ASYNC | B_BARRIER;
(void) bwrite(bp);
}
/*
* bbarrierwrite:
*
* Synchronous barrier write. Start output on a buffer and wait for
* it to complete. Place a write barrier after this write so that
* this buffer and all buffers written before it are committed to
* the disk before any buffers written after this write are committed
* to the disk. The buffer is released when the output completes.
*/
int
bbarrierwrite(struct buf *bp)
{
bp->b_flags |= B_BARRIER;
return (bwrite(bp));
}
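/*
 * A hypothetical ordering sketch for the barrier writes above: per the
 * documented semantics, a barrier write guarantees that the barrier buffer
 * and everything written before it reach stable storage before anything
 * written after it. The function name and the logbp/commitbp buffers are
 * placeholders and are assumed to be locked and ready for bwrite().
 */
#if 0	/* illustrative example only, not compiled */
static void
example_ordered_commit(struct buf *logbp, struct buf *commitbp)
{
	babarrierwrite(logbp);	/* log block and all earlier writes ... */
	bawrite(commitbp);	/* ... land before this later commit block */
}
#endif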
/*
* bwillwrite:
*
* Called prior to the locking of any vnodes when we are expecting to
* write. We do not want to starve the buffer cache with too many
* dirty buffers so we block here. By blocking prior to the locking
* of any vnodes we attempt to avoid the situation where a locked vnode
* prevents the various system daemons from flushing related buffers.
*/
void
bwillwrite(void)
{
if (buf_dirty_count_severe()) {
mtx_lock(&bdirtylock);
while (buf_dirty_count_severe()) {
bdirtywait = 1;
msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
"flswai", 0);
}
mtx_unlock(&bdirtylock);
}
}
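/*
 * A minimal sketch of how a write path is expected to use bwillwrite():
 * throttle on the dirty-buffer count before any vnode locks are taken, then
 * proceed with the locked write. The function name is hypothetical and the
 * VOP_WRITE() step is only a placeholder for the caller's real work.
 */
#if 0	/* illustrative example only, not compiled */
static int
example_write_path(struct vnode *vp)
{
	int error;

	if (vp->v_type == VREG)
		bwillwrite();		/* may sleep; no vnode locks held yet */

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		return (error);
	/* ... VOP_WRITE(...) ... */
	VOP_UNLOCK(vp);
	return (0);
}
#endif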
/*
* Return true if we have too many dirty buffers.
*/
int
buf_dirty_count_severe(void)
{
return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
}
/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
* buffer will be stashed in the appropriate bufqueue[] allowing it
* to be accessed later as a cache entity or reused for other purposes.
*/
void
brelse(struct buf *bp)
{
struct mount *v_mnt;
int qindex;
/*
* Many functions erroneously call brelse with a NULL bp under rare
* error conditions. Simply return when called with a NULL bp.
*/
if (bp == NULL)
return;
CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
("brelse: non-VMIO buffer marked NOREUSE"));
if (BUF_LOCKRECURSED(bp)) {
/*
* Do not process, in particular, do not handle the
* B_INVAL/B_RELBUF and do not release to free list.
*/
BUF_UNLOCK(bp);
return;
}
if (bp->b_flags & B_MANAGED) {
bqrelse(bp);
return;
}
if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
BO_LOCK(bp->b_bufobj);
bp->b_vflags &= ~BV_BKGRDERR;
BO_UNLOCK(bp->b_bufobj);
bdirty(bp);
}
if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
(bp->b_flags & B_INVALONERR)) {
/*
* Forced invalidation of dirty buffer contents, to be used
* after a failed write in the rare case that the loss of the
* contents is acceptable. The buffer is invalidated and
* freed.
*/
bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
bp->b_flags &= ~(B_ASYNC | B_CACHE);
}
if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
(bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
!(bp->b_flags & B_INVAL)) {
/*
* Failed write, redirty. All errors except ENXIO (which
* means the device is gone) are treated as being
* transient.
*
* XXX Treating EIO as transient is not correct; the
* contract with the local storage device drivers is that
* they will only return EIO once the I/O is no longer
* retriable. Network I/O also respects this through the
* guarantees of TCP and/or the internal retries of NFS.
* ENOMEM might be transient, but we also have no way of
* knowing when it is ok to retry/reschedule. In general,
* this entire case should be made obsolete through better
* error handling/recovery and resource scheduling.
*
* Do this also for buffers that failed with ENXIO, but have
* non-empty dependencies - the soft updates code might need
* to access the buffer to untangle them.
*
* Must clear BIO_ERROR to prevent pages from being scrapped.
*/
bp->b_ioflags &= ~BIO_ERROR;
bdirty(bp);
} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
(bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
/*
* Either a failed read I/O, or we were asked to free or not
* cache the buffer, or we failed to write to a device that's
* no longer present.
*/
bp->b_flags |= B_INVAL;
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI)
bdirtysub(bp);
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
allocbuf(bp, 0);
if (bp->b_vp)
brelvp(bp);
}
}
/*
* We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate()
* is called with B_DELWRI set, the underlying pages may wind up
* getting freed causing a previous write (bdwrite()) to get 'lost'
* because pages associated with a B_DELWRI bp are marked clean.
*
* We still allow the B_INVAL case to call vfs_vmio_truncate(), even
* if B_DELWRI is set.
*/
if (bp->b_flags & B_DELWRI)
bp->b_flags &= ~B_RELBUF;
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
* constituted, not even NFS buffers now. Two flags affect this. If
* B_INVAL, the struct buf is invalidated but the VM object is kept
* around ( i.e. so it is trivial to reconstitute the buffer later ).
*
* If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
* invalidated. BIO_ERROR cannot be set for a failed write unless the
* buffer is also B_INVAL because it hits the re-dirtying code above.
*
* Normally we can do this whether a buffer is B_DELWRI or not. If
* the buffer is an NFS buffer, it is tracking piecemeal writes or
* the commit state and we cannot afford to lose the buffer. If the
* buffer has a background write in progress, we need to keep it
* around to prevent it from being reconstituted and starting a second
* background write.
*/
v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL;
if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
(bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
(v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 ||
vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) {
vfs_vmio_invalidate(bp);
allocbuf(bp, 0);
}
if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
(bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
allocbuf(bp, 0);
bp->b_flags &= ~B_NOREUSE;
if (bp->b_vp != NULL)
brelvp(bp);
}
/*
* If the buffer has junk contents signal it and eventually
* clean up B_DELWRI and disassociate the vnode so that gbincore()
* doesn't find it.
*/
if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
(bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
bp->b_flags |= B_INVAL;
if (bp->b_flags & B_INVAL) {
if (bp->b_flags & B_DELWRI)
bundirty(bp);
if (bp->b_vp)
brelvp(bp);
}
buf_track(bp, __func__);
/* buffers with no memory */
if (bp->b_bufsize == 0) {
buf_free(bp);
return;
}
/* buffers with junk contents */
if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
(bp->b_ioflags & BIO_ERROR)) {
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 2");
qindex = QUEUE_CLEAN;
bp->b_flags |= B_AGE;
/* remaining buffers */
} else if (bp->b_flags & B_DELWRI)
qindex = QUEUE_DIRTY;
else
qindex = QUEUE_CLEAN;
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
panic("brelse: not dirty");
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT);
bp->b_xflags &= ~(BX_CVTENXIO);
/* binsfree unlocks bp. */
binsfree(bp, qindex);
}
/*
* Release a buffer back to the appropriate queue but do not try to free
* it. The buffer is expected to be used again soon.
*
* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
* biodone() to requeue an async I/O on completion. It is also used when
* known good buffers need to be requeued but we think we may need the data
* again soon.
*
* XXX we should be able to leave the B_RELBUF hint set on completion.
*/
void
bqrelse(struct buf *bp)
{
int qindex;
CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
qindex = QUEUE_NONE;
if (BUF_LOCKRECURSED(bp)) {
/* do not release to free list */
BUF_UNLOCK(bp);
return;
}
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
bp->b_xflags &= ~(BX_CVTENXIO);
if (bp->b_flags & B_MANAGED) {
if (bp->b_flags & B_REMFREE)
bremfreef(bp);
goto out;
}
/* buffers with stale but valid contents */
if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
BV_BKGRDERR)) == BV_BKGRDERR) {
BO_LOCK(bp->b_bufobj);
bp->b_vflags &= ~BV_BKGRDERR;
BO_UNLOCK(bp->b_bufobj);
qindex = QUEUE_DIRTY;
} else {
if ((bp->b_flags & B_DELWRI) == 0 &&
(bp->b_xflags & BX_VNDIRTY))
panic("bqrelse: not dirty");
if ((bp->b_flags & B_NOREUSE) != 0) {
brelse(bp);
return;
}
qindex = QUEUE_CLEAN;
}
buf_track(bp, __func__);
/* binsfree unlocks bp. */
binsfree(bp, qindex);
return;
out:
buf_track(bp, __func__);
/* unlock */
BUF_UNLOCK(bp);
}
/*
* Complete I/O to a VMIO backed page. Validate the pages as appropriate,
* restore bogus pages.
*/
static void
vfs_vmio_iodone(struct buf *bp)
{
vm_ooffset_t foff;
vm_page_t m;
vm_object_t obj;
struct vnode *vp __unused;
int i, iosize, resid;
bool bogus;
obj = bp->b_bufobj->bo_object;
KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages,
("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
blockcount_read(&obj->paging_in_progress), bp->b_npages));
vp = bp->b_vp;
VNPASS(vp->v_holdcnt > 0, vp);
VNPASS(vp->v_object != NULL, vp);
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_vmio_iodone: bp %p has no buffer offset", bp));
bogus = false;
iosize = bp->b_bcount - bp->b_resid;
for (i = 0; i < bp->b_npages; i++) {
resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
if (resid > iosize)
resid = iosize;
/*
* cleanup bogus pages, restoring the originals
*/
m = bp->b_pages[i];
if (m == bogus_page) {
bogus = true;
m = vm_page_relookup(obj, OFF_TO_IDX(foff));
if (m == NULL)
panic("biodone: page disappeared!");
bp->b_pages[i] = m;
} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
/*
* In the write case, the valid and clean bits are
* already changed correctly ( see bdwrite() ), so we
* only need to do this here in the read case.
*/
KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
resid)) == 0, ("vfs_vmio_iodone: page %p "
"has unexpected dirty bits", m));
vfs_page_set_valid(bp, foff, m);
}
KASSERT(OFF_TO_IDX(foff) == m->pindex,
("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
(intmax_t)foff, (uintmax_t)m->pindex));
vm_page_sunbusy(m);
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
vm_object_pip_wakeupn(obj, bp->b_npages);
if (bogus && buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
}
}
/*
* Perform page invalidation when a buffer is released. The fully invalid
* pages will be reclaimed later in vfs_vmio_truncate().
*/
static void
vfs_vmio_invalidate(struct buf *bp)
{
vm_object_t obj;
vm_page_t m;
int flags, i, resid, poffset, presid;
if (buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
} else
BUF_CHECK_UNMAPPED(bp);
/*
* Get the base offset and length of the buffer. Note that
* in the VMIO case if the buffer block size is not
* page-aligned then the b_data pointer may not be page-aligned.
* But our b_pages[] array *IS* page aligned.
*
* block sizes less than DEV_BSIZE (usually 512) are not
* supported due to the page granularity bits (m->valid,
* m->dirty, etc...).
*
* See man buf(9) for more information
*/
flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
obj = bp->b_bufobj->bo_object;
resid = bp->b_bufsize;
poffset = bp->b_offset & PAGE_MASK;
VM_OBJECT_WLOCK(obj);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
if (m == bogus_page)
panic("vfs_vmio_invalidate: Unexpected bogus page.");
bp->b_pages[i] = NULL;
presid = resid > (PAGE_SIZE - poffset) ?
(PAGE_SIZE - poffset) : resid;
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_busy_acquire(m, VM_ALLOC_SBUSY);
if (pmap_page_wired_mappings(m) == 0)
vm_page_set_invalid(m, poffset, presid);
vm_page_sunbusy(m);
vm_page_release_locked(m, flags);
resid -= presid;
poffset = 0;
}
VM_OBJECT_WUNLOCK(obj);
bp->b_npages = 0;
}
/*
* Page-granular truncation of an existing VMIO buffer.
*/
static void
vfs_vmio_truncate(struct buf *bp, int desiredpages)
{
vm_object_t obj;
vm_page_t m;
int flags, i;
if (bp->b_npages == desiredpages)
return;
if (buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
(desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
} else
BUF_CHECK_UNMAPPED(bp);
/*
* The object lock is needed only if we will attempt to free pages.
*/
flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
if ((bp->b_flags & B_DIRECT) != 0) {
flags |= VPR_TRYFREE;
obj = bp->b_bufobj->bo_object;
VM_OBJECT_WLOCK(obj);
} else {
obj = NULL;
}
for (i = desiredpages; i < bp->b_npages; i++) {
m = bp->b_pages[i];
KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
bp->b_pages[i] = NULL;
if (obj != NULL)
vm_page_release_locked(m, flags);
else
vm_page_release(m, flags);
}
if (obj != NULL)
VM_OBJECT_WUNLOCK(obj);
bp->b_npages = desiredpages;
}
/*
* Byte granular extension of VMIO buffers.
*/
static void
vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
{
/*
* We are growing the buffer, possibly in a
* byte-granular fashion.
*/
vm_object_t obj;
vm_offset_t toff;
vm_offset_t tinc;
vm_page_t m;
/*
* Step 1, bring in the VM pages from the object, allocating
* them if necessary. We must clear B_CACHE if these pages
* are not valid for the range covered by the buffer.
*/
obj = bp->b_bufobj->bo_object;
if (bp->b_npages < desiredpages) {
+ KASSERT(desiredpages <= atop(maxbcachebuf),
+ ("vfs_vmio_extend past maxbcachebuf %p %d %u",
+ bp, desiredpages, maxbcachebuf));
+
/*
* We must allocate system pages since blocking
* here could interfere with paging I/O, no
* matter which process we are.
*
* Only exclusive busy can be tested here.
* Blocking on shared busy might lead to
* deadlocks once allocbuf() is called after
* pages are vfs_busy_pages().
*/
(void)vm_page_grab_pages_unlocked(obj,
OFF_TO_IDX(bp->b_offset) + bp->b_npages,
VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY |
VM_ALLOC_NOBUSY | VM_ALLOC_WIRED,
&bp->b_pages[bp->b_npages], desiredpages - bp->b_npages);
bp->b_npages = desiredpages;
}
/*
* Step 2. We've loaded the pages into the buffer,
* we have to figure out if we can still have B_CACHE
* set. Note that B_CACHE is set according to the
* byte-granular range ( bcount and size ), not the
* aligned range ( newbsize ).
*
* The VM test is against m->valid, which is DEV_BSIZE
* aligned. Needless to say, the validity of the data
* needs to also be DEV_BSIZE aligned. Note that this
* fails with NFS if the server or some other client
* extends the file's EOF. If our buffer is resized,
* B_CACHE may remain set! XXX
*/
toff = bp->b_bcount;
tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
while ((bp->b_flags & B_CACHE) && toff < size) {
vm_pindex_t pi;
if (tinc > (size - toff))
tinc = size - toff;
pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
m = bp->b_pages[pi];
vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
toff += tinc;
tinc = PAGE_SIZE;
}
/*
* Step 3, fixup the KVA pmap.
*/
if (buf_mapped(bp))
bpmap_qenter(bp);
else
BUF_CHECK_UNMAPPED(bp);
}
/*
* Check to see if a block at a particular lbn is available for a clustered
* write.
*/
static int
vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
{
struct buf *bpa;
int match;
match = 0;
/* If the buf isn't in core skip it */
if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
return (0);
/* If the buf is busy we don't want to wait for it */
if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
return (0);
/* Only cluster with valid clusterable delayed write buffers */
if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
(B_DELWRI | B_CLUSTEROK))
goto done;
if (bpa->b_bufsize != size)
goto done;
/*
* Check to see if it is in the expected place on disk and that the
* block has been mapped.
*/
if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
match = 1;
done:
BUF_UNLOCK(bpa);
return (match);
}
/*
* vfs_bio_awrite:
*
* Implement clustered async writes for clearing out B_DELWRI buffers.
* This is much better than the old way of writing only one buffer at
* a time. Note that we may not be presented with the buffers in the
* correct order, so we search for the cluster in both directions.
*/
int
vfs_bio_awrite(struct buf *bp)
{
struct bufobj *bo;
int i;
int j;
daddr_t lblkno = bp->b_lblkno;
struct vnode *vp = bp->b_vp;
int ncl;
int nwritten;
int size;
int maxcl;
int gbflags;
bo = &vp->v_bufobj;
gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
/*
* Right now we support clustered writing only to regular files. If
* we find a clusterable block we could be in the middle of a cluster
* rather than at the beginning.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
(bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
size = vp->v_mount->mnt_stat.f_iosize;
- maxcl = MAXPHYS / size;
+ maxcl = maxphys / size;
BO_RLOCK(bo);
for (i = 1; i < maxcl; i++)
if (vfs_bio_clcheck(vp, size, lblkno + i,
bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
break;
for (j = 1; i + j <= maxcl && j <= lblkno; j++)
if (vfs_bio_clcheck(vp, size, lblkno - j,
bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
break;
BO_RUNLOCK(bo);
--j;
ncl = i + j;
/*
* this is a possible cluster write
*/
if (ncl != 1) {
BUF_UNLOCK(bp);
nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
gbflags);
return (nwritten);
}
}
bremfree(bp);
bp->b_flags |= B_ASYNC;
/*
* default (old) behavior, writing out only one block
*
* XXX returns b_bufsize instead of b_bcount for nwritten?
*/
nwritten = bp->b_bufsize;
(void) bwrite(bp);
return (nwritten);
}
/*
* getnewbuf_kva:
*
* Allocate KVA for an empty buf header according to gbflags.
*/
static int
getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
{
if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
/*
* In order to keep fragmentation sane we only allocate kva
* in BKVASIZE chunks. XXX with vmem we can do page size.
*/
maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
if (maxsize != bp->b_kvasize &&
bufkva_alloc(bp, maxsize, gbflags))
return (ENOSPC);
}
return (0);
}
/*
* getnewbuf:
*
* Find and initialize a new buffer header, freeing up existing buffers
* in the bufqueues as necessary. The new buffer is returned locked.
*
* We block if:
* We have insufficient buffer headers
* We have insufficient buffer space
* buffer_arena is too fragmented ( space reservation fails )
* If we have to flush dirty buffers ( but we try to avoid this )
*
* The caller is responsible for releasing the reserved bufspace after
* allocbuf() is called.
*/
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
{
struct bufdomain *bd;
struct buf *bp;
bool metadata, reserved;
bp = NULL;
KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
if (!unmapped_buf_allowed)
gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
vp->v_type == VCHR)
metadata = true;
else
metadata = false;
if (vp == NULL)
bd = &bdomain[0];
else
bd = &bdomain[vp->v_bufobj.bo_domain];
counter_u64_add(getnewbufcalls, 1);
reserved = false;
do {
if (reserved == false &&
bufspace_reserve(bd, maxsize, metadata) != 0) {
counter_u64_add(getnewbufrestarts, 1);
continue;
}
reserved = true;
if ((bp = buf_alloc(bd)) == NULL) {
counter_u64_add(getnewbufrestarts, 1);
continue;
}
if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
return (bp);
break;
} while (buf_recycle(bd, false) == 0);
if (reserved)
bufspace_release(bd, maxsize);
if (bp != NULL) {
bp->b_flags |= B_INVAL;
brelse(bp);
}
bufspace_wait(bd, vp, gbflags, slpflag, slptimeo);
return (NULL);
}
/*
* buf_daemon:
*
* buffer flushing daemon. Buffers are normally flushed by the
* update daemon but if it cannot keep up this process starts to
* take the load in an attempt to prevent getnewbuf() from blocking.
*/
static struct kproc_desc buf_kp = {
"bufdaemon",
buf_daemon,
&bufdaemonproc
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
static int
buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
{
int flushed;
flushed = flushbufqueues(vp, bd, target, 0);
if (flushed == 0) {
/*
* Could not find any buffers without rollback
* dependencies, so just write the first one
* in the hopes of eventually making progress.
*/
if (vp != NULL && target > 2)
target /= 2;
flushbufqueues(vp, bd, target, 1);
}
return (flushed);
}
static void
buf_daemon()
{
struct bufdomain *bd;
int speedupreq;
int lodirty;
int i;
/*
* This process needs to be suspended prior to shutdown sync.
*/
EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
SHUTDOWN_PRI_LAST + 100);
/*
* Start the buf clean daemons as children threads.
*/
for (i = 0 ; i < buf_domains; i++) {
int error;
error = kthread_add((void (*)(void *))bufspace_daemon,
&bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
if (error)
panic("error %d spawning bufspace daemon", error);
}
/*
* This process is allowed to take the buffer cache to the limit
*/
curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
mtx_lock(&bdlock);
for (;;) {
bd_request = 0;
mtx_unlock(&bdlock);
kthread_suspend_check();
/*
* Save speedupreq for this pass and reset to capture new
* requests.
*/
speedupreq = bd_speedupreq;
bd_speedupreq = 0;
/*
* Flush each domain sequentially according to its level and
* the speedup request.
*/
for (i = 0; i < buf_domains; i++) {
bd = &bdomain[i];
if (speedupreq)
lodirty = bd->bd_numdirtybuffers / 2;
else
lodirty = bd->bd_lodirtybuffers;
while (bd->bd_numdirtybuffers > lodirty) {
if (buf_flush(NULL, bd,
bd->bd_numdirtybuffers - lodirty) == 0)
break;
kern_yield(PRI_USER);
}
}
/*
* Only clear bd_request if we have reached our low water
* mark. The buf_daemon normally waits 1 second and
* then incrementally flushes any dirty buffers that have
* built up, within reason.
*
* If we were unable to hit our low water mark and couldn't
* find any flushable buffers, we sleep for a short period
* to avoid endless loops on unlockable buffers.
*/
mtx_lock(&bdlock);
if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
/*
* We reached our low water mark, reset the
* request and sleep until we are needed again.
* The sleep is just so the suspend code works.
*/
bd_request = 0;
/*
* Do an extra wakeup in case dirty threshold
* changed via sysctl and the explicit transition
* out of shortfall was missed.
*/
bdirtywakeup();
if (runningbufspace <= lorunningspace)
runningwakeup();
msleep(&bd_request, &bdlock, PVM, "psleep", hz);
} else {
/*
* We couldn't find any flushable dirty buffers but
* still have too many dirty buffers, we
* have to sleep and try again. (rare)
*/
msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
}
}
}
/*
* flushbufqueues:
*
* Try to flush a buffer in the dirty queue. We must be careful to
* free up B_INVAL buffers instead of writing them, which NFS is
* particularly sensitive to.
*/
static int flushwithdeps = 0;
SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS,
&flushwithdeps, 0,
"Number of buffers flushed with dependecies that require rollbacks");
static int
flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
int flushdeps)
{
struct bufqueue *bq;
struct buf *sentinel;
struct vnode *vp;
struct mount *mp;
struct buf *bp;
int hasdeps;
int flushed;
int error;
bool unlock;
flushed = 0;
bq = &bd->bd_dirtyq;
bp = NULL;
sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
sentinel->b_qindex = QUEUE_SENTINEL;
BQ_LOCK(bq);
TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
BQ_UNLOCK(bq);
while (flushed != target) {
maybe_yield();
BQ_LOCK(bq);
bp = TAILQ_NEXT(sentinel, b_freelist);
if (bp != NULL) {
TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel,
b_freelist);
} else {
BQ_UNLOCK(bq);
break;
}
/*
* Skip sentinels inserted by other invocations of
* flushbufqueues(), taking care to not reorder them.
*
* Only flush the buffers that belong to the
* vnode locked by the curthread.
*/
if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
bp->b_vp != lvp)) {
BQ_UNLOCK(bq);
continue;
}
error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
BQ_UNLOCK(bq);
if (error != 0)
continue;
/*
* BKGRDINPROG can only be set with the buf and bufobj
* locks both held. We tolerate a race to clear it here.
*/
if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
(bp->b_flags & B_DELWRI) == 0) {
BUF_UNLOCK(bp);
continue;
}
if (bp->b_flags & B_INVAL) {
bremfreef(bp);
brelse(bp);
flushed++;
continue;
}
if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
if (flushdeps == 0) {
BUF_UNLOCK(bp);
continue;
}
hasdeps = 1;
} else
hasdeps = 0;
/*
* We must hold the lock on a vnode before writing
* one of its buffers. Otherwise we may confuse, or
* in the case of a snapshot vnode, deadlock the
* system.
*
* The lock order here is the reverse of the normal order
* of vnode lock followed by buf lock. This is ok because
* the NOWAIT will prevent deadlock.
*/
vp = bp->b_vp;
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
BUF_UNLOCK(bp);
continue;
}
if (lvp == NULL) {
unlock = true;
error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
} else {
ASSERT_VOP_LOCKED(vp, "getbuf");
unlock = false;
error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
vn_lock(vp, LK_TRYUPGRADE);
}
if (error == 0) {
CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
if (curproc == bufdaemonproc) {
vfs_bio_awrite(bp);
} else {
bremfree(bp);
bwrite(bp);
counter_u64_add(notbufdflushes, 1);
}
vn_finished_write(mp);
if (unlock)
VOP_UNLOCK(vp);
flushwithdeps += hasdeps;
flushed++;
/*
* Sleeping on runningbufspace while holding
* vnode lock leads to deadlock.
*/
if (curproc == bufdaemonproc &&
runningbufspace > hirunningspace)
waitrunningbufspace();
continue;
}
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
BQ_LOCK(bq);
TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
BQ_UNLOCK(bq);
free(sentinel, M_TEMP);
return (flushed);
}
/*
* Check to see if a block is currently memory resident.
*/
struct buf *
incore(struct bufobj *bo, daddr_t blkno)
{
return (gbincore_unlocked(bo, blkno));
}
/*
* Returns true if no I/O is needed to access the
* associated VM object. This is like incore except
* it also hunts around in the VM system for the data.
*/
bool
inmem(struct vnode * vp, daddr_t blkno)
{
vm_object_t obj;
vm_offset_t toff, tinc, size;
vm_page_t m, n;
vm_ooffset_t off;
int valid;
ASSERT_VOP_LOCKED(vp, "inmem");
if (incore(&vp->v_bufobj, blkno))
return (true);
if (vp->v_mount == NULL)
return (false);
obj = vp->v_object;
if (obj == NULL)
return (false);
size = PAGE_SIZE;
if (size > vp->v_mount->mnt_stat.f_iosize)
size = vp->v_mount->mnt_stat.f_iosize;
off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
m = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff));
recheck:
if (m == NULL)
return (false);
tinc = size;
if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
/*
* Consider page validity only if page mapping didn't change
* during the check.
*/
valid = vm_page_is_valid(m,
(vm_offset_t)((toff + off) & PAGE_MASK), tinc);
n = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff));
if (m != n) {
m = n;
goto recheck;
}
if (!valid)
return (false);
}
return (true);
}
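/*
 * A small sketch of the intended use of incore()/inmem(): skip issuing
 * read-ahead for blocks whose data is already resident, which is the same
 * check breada() performs earlier in this file. The function name and the
 * rablkno parameter are hypothetical placeholders.
 */
#if 0	/* illustrative example only, not compiled */
static bool
example_need_readahead(struct vnode *vp, daddr_t rablkno)
{
	ASSERT_VOP_LOCKED(vp, "example_need_readahead");

	/* Data already cached in a buffer or in the VM object: no I/O. */
	return (!inmem(vp, rablkno));
}
#endif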
/*
* Set the dirty range for a buffer based on the status of the dirty
* bits in the pages comprising the buffer. The range is limited
* to the size of the buffer.
*
* Tell the VM system that the pages associated with this buffer
* are clean. This is used for delayed writes where the data is
* going to go to disk eventually without additional VM intervention.
*
* Note that while we only really need to clean through to b_bcount, we
* just go ahead and clean through to b_bufsize.
*/
static void
vfs_clean_pages_dirty_buf(struct buf *bp)
{
vm_ooffset_t foff, noff, eoff;
vm_page_t m;
int i;
if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
return;
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_clean_pages_dirty_buf: no buffer offset"));
vfs_busy_pages_acquire(bp);
vfs_setdirty_range(bp);
for (i = 0; i < bp->b_npages; i++) {
noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
eoff = noff;
if (eoff > bp->b_offset + bp->b_bufsize)
eoff = bp->b_offset + bp->b_bufsize;
m = bp->b_pages[i];
vfs_page_set_validclean(bp, foff, m);
/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
foff = noff;
}
vfs_busy_pages_release(bp);
}
static void
vfs_setdirty_range(struct buf *bp)
{
vm_offset_t boffset;
vm_offset_t eoffset;
int i;
/*
* test the pages to see if they have been modified directly
* by users through the VM system.
*/
for (i = 0; i < bp->b_npages; i++)
vm_page_test_dirty(bp->b_pages[i]);
/*
* Calculate the encompassing dirty range, boffset and eoffset,
* (eoffset - boffset) bytes.
*/
for (i = 0; i < bp->b_npages; i++) {
if (bp->b_pages[i]->dirty)
break;
}
boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
for (i = bp->b_npages - 1; i >= 0; --i) {
if (bp->b_pages[i]->dirty) {
break;
}
}
eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
/*
* Fit it to the buffer.
*/
if (eoffset > bp->b_bcount)
eoffset = bp->b_bcount;
/*
* If we have a good dirty range, merge with the existing
* dirty range.
*/
if (boffset < eoffset) {
if (bp->b_dirtyoff > boffset)
bp->b_dirtyoff = boffset;
if (bp->b_dirtyend < eoffset)
bp->b_dirtyend = eoffset;
}
}
/*
* Allocate the KVA mapping for an existing buffer.
* If an unmapped buffer is provided but a mapped buffer is requested, also
* take care to properly set up the mappings between pages and KVA.
*/
static void
bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
{
int bsize, maxsize, need_mapping, need_kva;
off_t offset;
need_mapping = bp->b_data == unmapped_buf &&
(gbflags & GB_UNMAPPED) == 0;
need_kva = bp->b_kvabase == unmapped_buf &&
bp->b_data == unmapped_buf &&
(gbflags & GB_KVAALLOC) != 0;
if (!need_mapping && !need_kva)
return;
BUF_CHECK_UNMAPPED(bp);
if (need_mapping && bp->b_kvabase != unmapped_buf) {
/*
* Buffer is not mapped, but the KVA was already
* reserved at the time of the instantiation. Use the
* allocated space.
*/
goto has_addr;
}
/*
* Calculate the amount of the address space we would reserve
* if the buffer was mapped.
*/
bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
offset = blkno * bsize;
maxsize = size + (offset & PAGE_MASK);
maxsize = imax(maxsize, bsize);
while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
if ((gbflags & GB_NOWAIT_BD) != 0) {
/*
* XXXKIB: defragmentation cannot
* succeed, not sure what else to do.
*/
panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
}
counter_u64_add(mappingrestarts, 1);
bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
}
has_addr:
if (need_mapping) {
/* b_offset is handled by bpmap_qenter. */
bp->b_data = bp->b_kvabase;
BUF_CHECK_MAPPED(bp);
bpmap_qenter(bp);
}
}
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
int flags)
{
struct buf *bp;
int error;
error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp);
if (error != 0)
return (NULL);
return (bp);
}
/*
* getblkx:
*
* Get a block given a specified block and offset into a file/device.
* The buffer's B_DONE bit will be cleared on return, making it almost
* ready for an I/O initiation. B_INVAL may or may not be set on
* return. The caller should clear B_INVAL prior to initiating a
* READ.
*
* For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
* an existing buffer.
*
* For a VMIO buffer, B_CACHE is modified according to the backing VM.
* If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
* and then cleared based on the backing VM. If the previous buffer is
* non-0-sized but invalid, B_CACHE will be cleared.
*
* If getblk() must create a new buffer, the new buffer is returned with
* both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
* case it is returned with B_INVAL clear and B_CACHE set based on the
* backing VM.
*
* getblk() also forces a bwrite() for any B_DELWRI buffer whose
* B_CACHE bit is clear.
*
* What this means, basically, is that the caller should use B_CACHE to
* determine whether the buffer is fully valid or not and should clear
* B_INVAL prior to issuing a read. If the caller intends to validate
* the buffer by loading its data area with something, the caller needs
* to clear B_INVAL. If the caller does this without issuing an I/O,
* the caller should set B_CACHE ( as an optimization ), else the caller
* should issue the I/O and biodone() will set B_CACHE if the I/O was
* a write attempt or if it was a successful read. If the caller
* intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
* prior to issuing the READ. biodone() will *not* clear B_INVAL.
*
* The blkno parameter is the logical block being requested. Normally
* the mapping of logical block number to disk block address is done
* by calling VOP_BMAP(). However, if the mapping is already known, the
* disk block address can be passed using the dblkno parameter. If the
* disk block address is not known, then the same value should be passed
* for blkno and dblkno.
*/
int
getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag,
int slptimeo, int flags, struct buf **bpp)
{
struct buf *bp;
struct bufobj *bo;
daddr_t d_blkno;
int bsize, error, maxsize, vmio;
off_t offset;
CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
ASSERT_VOP_LOCKED(vp, "getblk");
if (size > maxbcachebuf)
panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
maxbcachebuf);
if (!unmapped_buf_allowed)
flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
bo = &vp->v_bufobj;
d_blkno = dblkno;
/* Attempt lockless lookup first. */
bp = gbincore_unlocked(bo, blkno);
if (bp == NULL)
goto newbuf_unlocked;
error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0,
0);
if (error != 0)
goto loop;
/* Verify buf identity has not changed since lookup. */
if (bp->b_bufobj == bo && bp->b_lblkno == blkno)
goto foundbuf_fastpath;
/* It changed, fallback to locked lookup. */
BUF_UNLOCK_RAW(bp);
loop:
BO_RLOCK(bo);
bp = gbincore(bo, blkno);
if (bp != NULL) {
int lockflags;
/*
* Buffer is in-core. If the buffer is not busy nor managed,
* it must be on a queue.
*/
lockflags = LK_EXCLUSIVE | LK_INTERLOCK |
((flags & GB_LOCK_NOWAIT) ? LK_NOWAIT : LK_SLEEPFAIL);
error = BUF_TIMELOCK(bp, lockflags,
BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
/*
* If we slept and got the lock we have to restart in case
* the buffer changed identities.
*/
if (error == ENOLCK)
goto loop;
/* We timed out or were interrupted. */
else if (error != 0)
return (error);
foundbuf_fastpath:
/* If recursed, assume caller knows the rules. */
if (BUF_LOCKRECURSED(bp))
goto end;
/*
* The buffer is locked. B_CACHE is cleared if the buffer is
* invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
* and for a VMIO buffer B_CACHE is adjusted according to the
* backing VM cache.
*/
if (bp->b_flags & B_INVAL)
bp->b_flags &= ~B_CACHE;
else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
bp->b_flags |= B_CACHE;
if (bp->b_flags & B_MANAGED)
MPASS(bp->b_qindex == QUEUE_NONE);
else
bremfree(bp);
/*
* check for size inconsistencies for the non-VMIO case.
*/
if (bp->b_bcount != size) {
if ((bp->b_flags & B_VMIO) == 0 ||
(size > bp->b_kvasize)) {
if (bp->b_flags & B_DELWRI) {
bp->b_flags |= B_NOCACHE;
bwrite(bp);
} else {
if (LIST_EMPTY(&bp->b_dep)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
bp->b_flags |= B_NOCACHE;
bwrite(bp);
}
}
goto loop;
}
}
/*
* Handle the case of unmapped buffer which should
* become mapped, or the buffer for which KVA
* reservation is requested.
*/
bp_unmapped_get_kva(bp, blkno, size, flags);
/*
* If the size is inconsistent in the VMIO case, we can resize
* the buffer. This might lead to B_CACHE getting set or
* cleared. If the size has not changed, B_CACHE remains
* unchanged from its previous state.
*/
allocbuf(bp, size);
KASSERT(bp->b_offset != NOOFFSET,
("getblk: no buffer offset"));
/*
* A buffer with B_DELWRI set and B_CACHE clear must
* be committed before we can return the buffer in
* order to prevent the caller from issuing a read
* ( due to B_CACHE not being set ) and overwriting
* it.
*
* Most callers, including NFS and FFS, need this to
* operate properly either because they assume they
* can issue a read if B_CACHE is not set, or because
* ( for example ) an uncached B_DELWRI might loop due
* to softupdates re-dirtying the buffer. In the latter
* case, B_CACHE is set after the first write completes,
* preventing further loops.
* NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
* above while extending the buffer, we cannot allow the
* buffer to remain with B_CACHE set after the write
* completes or it will represent a corrupt state. To
* deal with this we set B_NOCACHE to scrap the buffer
* after the write.
*
* We might be able to do something fancy, like setting
* B_CACHE in bwrite() except if B_DELWRI is already set,
* so the below call doesn't set B_CACHE, but that gets real
* confusing. This is much easier.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
bp->b_flags |= B_NOCACHE;
bwrite(bp);
goto loop;
}
bp->b_flags &= ~B_DONE;
} else {
/*
* Buffer is not in-core, create new buffer. The buffer
* returned by getnewbuf() is locked. Note that the returned
* buffer is also considered valid (not marked B_INVAL).
*/
BO_RUNLOCK(bo);
newbuf_unlocked:
/*
* If the user does not want us to create the buffer, bail out
* here.
*/
if (flags & GB_NOCREAT)
return (EEXIST);
bsize = vn_isdisk(vp) ? DEV_BSIZE : bo->bo_bsize;
KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
offset = blkno * bsize;
vmio = vp->v_object != NULL;
if (vmio) {
maxsize = size + (offset & PAGE_MASK);
} else {
maxsize = size;
/* Do not allow non-VMIO notmapped buffers. */
flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
}
maxsize = imax(maxsize, bsize);
if ((flags & GB_NOSPARSE) != 0 && vmio &&
!vn_isdisk(vp)) {
error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0);
KASSERT(error != EOPNOTSUPP,
("GB_NOSPARSE from fs not supporting bmap, vp %p",
vp));
if (error != 0)
return (error);
if (d_blkno == -1)
return (EJUSTRETURN);
}
bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
if (bp == NULL) {
if (slpflag || slptimeo)
return (ETIMEDOUT);
/*
* XXX This is here until the sleep path is diagnosed
* enough to work under very low memory conditions.
*
* There's an issue on low memory, 4BSD+non-preempt
* systems (eg MIPS routers with 32MB RAM) where buffer
* exhaustion occurs without sleeping for buffer
* reclamation. This just sticks in a loop and
* constantly attempts to allocate a buffer, which
* hits exhaustion and tries to wakeup bufdaemon.
* This never happens because we never yield.
*
* The real solution is to identify and fix these cases
* so we aren't effectively busy-waiting in a loop
* until the reclamation path has cycles to run.
*/
kern_yield(PRI_USER);
goto loop;
}
/*
* This code is used to make sure that a buffer is not
* created while the getnewbuf routine is blocked.
* This can be a problem whether the vnode is locked or not.
* If the buffer is created out from under us, we have to
* throw away the one we just created.
*
* Note: this must occur before we associate the buffer
* with the vp especially considering limitations in
* the splay tree implementation when dealing with duplicate
* lblkno's.
*/
BO_LOCK(bo);
if (gbincore(bo, blkno)) {
BO_UNLOCK(bo);
bp->b_flags |= B_INVAL;
bufspace_release(bufdomain(bp), maxsize);
brelse(bp);
goto loop;
}
/*
* Insert the buffer into the hash, so that it can
* be found by incore.
*/
bp->b_lblkno = blkno;
bp->b_blkno = d_blkno;
bp->b_offset = offset;
bgetvp(vp, bp);
BO_UNLOCK(bo);
/*
* set B_VMIO bit. allocbuf() the buffer bigger. Since the
* buffer size starts out as 0, B_CACHE will be set by
* allocbuf() for the VMIO case prior to it testing the
* backing store for validity.
*/
if (vmio) {
bp->b_flags |= B_VMIO;
KASSERT(vp->v_object == bp->b_bufobj->bo_object,
("ARGH! different b_bufobj->bo_object %p %p %p\n",
bp, vp->v_object, bp->b_bufobj->bo_object));
} else {
bp->b_flags &= ~B_VMIO;
KASSERT(bp->b_bufobj->bo_object == NULL,
("ARGH! has b_bufobj->bo_object %p %p\n",
bp, bp->b_bufobj->bo_object));
BUF_CHECK_MAPPED(bp);
}
allocbuf(bp, size);
bufspace_release(bufdomain(bp), maxsize);
bp->b_flags &= ~B_DONE;
}
CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
end:
buf_track(bp, __func__);
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
*bpp = bp;
return (0);
}
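/*
 * A minimal sketch of the B_CACHE contract documented above: after getblk(),
 * a caller that intends to read must clear B_INVAL and BIO_ERROR before
 * initiating the I/O, and may rely on B_CACHE to skip the read entirely.
 * This is a stripped-down version of what breadn_flags() does earlier in
 * this file (credentials and accounting omitted); the function name and the
 * example_lbn/example_size parameters are hypothetical placeholders.
 */
#if 0	/* illustrative example only, not compiled */
static int
example_getblk_read(struct vnode *vp, daddr_t example_lbn, int example_size)
{
	struct buf *bp;

	bp = getblk(vp, example_lbn, example_size, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		return (bufwait(bp));	/* caller still owns bp on return */
	}
	return (0);			/* cache hit, contents already valid */
}
#endif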
/*
* Get an empty, disassociated buffer of given size. The buffer is initially
* set to B_INVAL.
*/
struct buf *
geteblk(int size, int flags)
{
struct buf *bp;
int maxsize;
maxsize = (size + BKVAMASK) & ~BKVAMASK;
while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
if ((flags & GB_NOWAIT_BD) &&
(curthread->td_pflags & TDP_BUFNEED) != 0)
return (NULL);
}
allocbuf(bp, size);
bufspace_release(bufdomain(bp), maxsize);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
return (bp);
}
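/*
 * A short sketch of using geteblk() for scratch space that is not backed by
 * any vnode: the buffer comes back locked with B_INVAL set, so a plain
 * brelse() releases it for reuse when the caller is done. The function name
 * and example_size parameter are hypothetical placeholders.
 */
#if 0	/* illustrative example only, not compiled */
static void
example_scratch_buffer(int example_size)
{
	struct buf *bp;

	bp = geteblk(example_size, 0);	/* sleeps until a buffer is free */

	/* ... use bp->b_data as example_size bytes of scratch memory ... */

	brelse(bp);			/* B_INVAL causes it to be recycled */
}
#endif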
/*
* Truncate the backing store for a non-vmio buffer.
*/
static void
vfs_nonvmio_truncate(struct buf *bp, int newbsize)
{
if (bp->b_flags & B_MALLOC) {
/*
* malloced buffers are not shrunk
*/
if (newbsize == 0) {
bufmallocadjust(bp, 0);
free(bp->b_data, M_BIOBUF);
bp->b_data = bp->b_kvabase;
bp->b_flags &= ~B_MALLOC;
}
return;
}
vm_hold_free_pages(bp, newbsize);
bufspace_adjust(bp, newbsize);
}
/*
* Extend the backing for a non-VMIO buffer.
*/
static void
vfs_nonvmio_extend(struct buf *bp, int newbsize)
{
caddr_t origbuf;
int origbufsize;
/*
* We only use malloced memory on the first allocation
* and revert to page-allocated memory when the buffer
* grows.
*
* There is a potential smp race here that could lead
* to bufmallocspace slightly passing the max. It
* is probably extremely rare and not worth worrying
* over.
*/
if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
bufmallocspace < maxbufmallocspace) {
bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
bp->b_flags |= B_MALLOC;
bufmallocadjust(bp, newbsize);
return;
}
/*
* If the buffer is growing on its other-than-first
* allocation then we revert to the page-allocation
* scheme.
*/
origbuf = NULL;
origbufsize = 0;
if (bp->b_flags & B_MALLOC) {
origbuf = bp->b_data;
origbufsize = bp->b_bufsize;
bp->b_data = bp->b_kvabase;
bufmallocadjust(bp, 0);
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
}
vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
(vm_offset_t) bp->b_data + newbsize);
if (origbuf != NULL) {
bcopy(origbuf, bp->b_data, origbufsize);
free(origbuf, M_BIOBUF);
}
bufspace_adjust(bp, newbsize);
}
/*
* This code constitutes the buffer memory from either anonymous system
* memory (in the case of non-VMIO operations) or from an associated
* VM object (in the case of VMIO operations). This code is able to
* resize a buffer up or down.
*
* Note that this code is tricky, and has many complications to resolve
* deadlock or inconsistent data situations. Tread lightly!!!
* There are B_CACHE and B_DELWRI interactions that must be dealt with by
* the caller. Calling this code willy nilly can result in the loss of data.
*
* allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
* B_CACHE for the non-VMIO case.
*/
int
allocbuf(struct buf *bp, int size)
{
int newbsize;
if (bp->b_bcount == size)
return (1);
if (bp->b_kvasize != 0 && bp->b_kvasize < size)
panic("allocbuf: buffer too small");
newbsize = roundup2(size, DEV_BSIZE);
if ((bp->b_flags & B_VMIO) == 0) {
if ((bp->b_flags & B_MALLOC) == 0)
newbsize = round_page(newbsize);
/*
* Just get anonymous memory from the kernel. Don't
* mess with B_CACHE.
*/
if (newbsize < bp->b_bufsize)
vfs_nonvmio_truncate(bp, newbsize);
else if (newbsize > bp->b_bufsize)
vfs_nonvmio_extend(bp, newbsize);
} else {
int desiredpages;
desiredpages = (size == 0) ? 0 :
num_pages((bp->b_offset & PAGE_MASK) + newbsize);
if (bp->b_flags & B_MALLOC)
panic("allocbuf: VMIO buffer can't be malloced");
/*
* Set B_CACHE initially if buffer is 0 length or will become
* 0-length.
*/
if (size == 0 || bp->b_bufsize == 0)
bp->b_flags |= B_CACHE;
if (newbsize < bp->b_bufsize)
vfs_vmio_truncate(bp, desiredpages);
/* XXX This looks as if it should be newbsize > b_bufsize */
else if (size > bp->b_bcount)
vfs_vmio_extend(bp, desiredpages, size);
bufspace_adjust(bp, newbsize);
}
bp->b_bcount = size; /* requested buffer size. */
return (1);
}
extern int inflight_transient_maps;
static struct bio_queue nondump_bios;
void
biodone(struct bio *bp)
{
struct mtx *mtxp;
void (*done)(struct bio *);
vm_offset_t start, end;
biotrack(bp, __func__);
/*
* Avoid completing I/O when dumping after a panic since that may
* result in a deadlock in the filesystem or pager code. Note that
* this doesn't affect dumps that were started manually since we aim
* to keep the system usable after it has been resumed.
*/
if (__predict_false(dumping && SCHEDULER_STOPPED())) {
TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue);
return;
}
if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
bp->bio_flags |= BIO_UNMAPPED;
start = trunc_page((vm_offset_t)bp->bio_data);
end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
bp->bio_data = unmapped_buf;
pmap_qremove(start, atop(end - start));
vmem_free(transient_arena, start, end - start);
atomic_add_int(&inflight_transient_maps, -1);
}
done = bp->bio_done;
if (done == NULL) {
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
bp->bio_flags |= BIO_DONE;
wakeup(bp);
mtx_unlock(mtxp);
} else
done(bp);
}
/*
* Wait for a BIO to finish.
*/
int
biowait(struct bio *bp, const char *wchan)
{
struct mtx *mtxp;
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
while ((bp->bio_flags & BIO_DONE) == 0)
msleep(bp, mtxp, PRIBIO, wchan, 0);
mtx_unlock(mtxp);
if (bp->bio_error != 0)
return (bp->bio_error);
if (!(bp->bio_flags & BIO_ERROR))
return (0);
return (EIO);
}
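/*
 * Illustrative sketch, not part of this file: the synchronous consumer
 * pattern that biodone()/biowait() implement when bio_done is left NULL,
 * much as geom's g_read_data() does internally.  "cp" (a geom consumer),
 * "ptr", "offset" and "length" are assumed to exist in the caller.
 */
#if 0
	struct bio *bip;
	int error;

	bip = g_alloc_bio();
	bip->bio_cmd = BIO_READ;
	bip->bio_offset = offset;
	bip->bio_length = length;
	bip->bio_data = ptr;
	bip->bio_done = NULL;		/* biodone() will just wakeup(bip) */
	g_io_request(bip, cp);
	error = biowait(bip, "exbiow");
	g_destroy_bio(bip);
#endif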
void
biofinish(struct bio *bp, struct devstat *stat, int error)
{
if (error) {
bp->bio_error = error;
bp->bio_flags |= BIO_ERROR;
}
if (stat != NULL)
devstat_end_transaction_bio(stat, bp);
biodone(bp);
}
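/*
 * Illustrative sketch, not part of this file: a driver completing a bio
 * through biofinish(), which folds the error/devstat bookkeeping into one
 * call.  "sc" is a hypothetical softc and sc_devstat its devstat(9) handle.
 */
#if 0
	if (sc->sc_detached)
		biofinish(bp, sc->sc_devstat, ENXIO);	/* device went away */
	else
		biofinish(bp, sc->sc_devstat, 0);	/* normal completion */
#endif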
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
void
biotrack_buf(struct bio *bp, const char *location)
{
buf_track(bp->bio_track_bp, location);
}
#endif
/*
* bufwait:
*
* Wait for buffer I/O completion, returning error status. The buffer
* is left locked and B_DONE on return. B_EINTR is converted into an EINTR
* error and cleared.
*/
int
bufwait(struct buf *bp)
{
if (bp->b_iocmd == BIO_READ)
bwait(bp, PRIBIO, "biord");
else
bwait(bp, PRIBIO, "biowr");
if (bp->b_flags & B_EINTR) {
bp->b_flags &= ~B_EINTR;
return (EINTR);
}
if (bp->b_ioflags & BIO_ERROR) {
return (bp->b_error ? bp->b_error : EIO);
} else {
return (0);
}
}
/*
* bufdone:
*
* Finish I/O on a buffer, optionally calling a completion function.
* This is usually called from an interrupt so process blocking is
* not allowed.
*
* bufdone is also responsible for setting B_CACHE in a B_VMIO bp.
* In a non-VMIO bp, B_CACHE will be set on the next getblk()
* assuming B_INVAL is clear.
*
* For the VMIO case, we set B_CACHE if the op was a read and no
* read error occurred, or if the op was a write. B_CACHE is never
* set if the buffer is invalid or otherwise uncacheable.
*
* bufdone does not mess with B_INVAL, allowing the I/O routine or the
* initiator to leave B_INVAL set to brelse the buffer out of existence
* in the biodone routine.
*/
void
bufdone(struct buf *bp)
{
struct bufobj *dropobj;
void (*biodone)(struct buf *);
buf_track(bp, __func__);
CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
dropobj = NULL;
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
runningbufwakeup(bp);
if (bp->b_iocmd == BIO_WRITE)
dropobj = bp->b_bufobj;
/* call optional completion function if requested */
if (bp->b_iodone != NULL) {
biodone = bp->b_iodone;
bp->b_iodone = NULL;
(*biodone) (bp);
if (dropobj)
bufobj_wdrop(dropobj);
return;
}
if (bp->b_flags & B_VMIO) {
/*
* Set B_CACHE if the op was a normal read and no error
* occurred. B_CACHE is set for writes in the b*write()
* routines.
*/
if (bp->b_iocmd == BIO_READ &&
!(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
!(bp->b_ioflags & BIO_ERROR))
bp->b_flags |= B_CACHE;
vfs_vmio_iodone(bp);
}
if (!LIST_EMPTY(&bp->b_dep))
buf_complete(bp);
if ((bp->b_flags & B_CKHASH) != 0) {
KASSERT(bp->b_iocmd == BIO_READ,
("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd));
KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp));
(*bp->b_ckhashcalc)(bp);
}
/*
* For asynchronous completions, release the buffer now. The brelse
* will do a wakeup there if necessary - so no need to do a wakeup
* here in the async case. The sync case always needs to do a wakeup.
*/
if (bp->b_flags & B_ASYNC) {
if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
(bp->b_ioflags & BIO_ERROR))
brelse(bp);
else
bqrelse(bp);
} else
bdone(bp);
if (dropobj)
bufobj_wdrop(dropobj);
}
/*
* This routine is called in lieu of iodone in the case of
* incomplete I/O. This keeps the busy status for pages
* consistent.
*/
void
vfs_unbusy_pages(struct buf *bp)
{
int i;
vm_object_t obj;
vm_page_t m;
runningbufwakeup(bp);
if (!(bp->b_flags & B_VMIO))
return;
obj = bp->b_bufobj->bo_object;
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
if (m == bogus_page) {
m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i);
if (!m)
panic("vfs_unbusy_pages: page missing\n");
bp->b_pages[i] = m;
if (buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
} else
BUF_CHECK_UNMAPPED(bp);
}
vm_page_sunbusy(m);
}
vm_object_pip_wakeupn(obj, bp->b_npages);
}
/*
* vfs_page_set_valid:
*
* Set the valid bits in a page based on the supplied offset. The
* range is restricted to the buffer's size.
*
* This routine is typically called after a read completes.
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
{
vm_ooffset_t eoff;
/*
* Compute the end offset, eoff, such that [off, eoff) does not span a
* page boundary and eoff is not greater than the end of the buffer.
* The end of the buffer, in this case, is our file EOF, not the
* allocation size of the buffer.
*/
eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
/*
* Set valid range. This is typically the entire buffer and thus the
* entire page.
*/
if (eoff > off)
vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
}
/*
* vfs_page_set_validclean:
*
* Set the valid bits and clear the dirty bits in a page based on the
* supplied offset. The range is restricted to the buffer's size.
*/
static void
vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
{
vm_ooffset_t soff, eoff;
/*
* Start and end offsets in buffer. eoff - soff may not cross a
* page boundary or cross the end of the buffer. The end of the
* buffer, in this case, is our file EOF, not the allocation size
* of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
/*
* Set valid range. This is typically the entire buffer and thus the
* entire page.
*/
if (eoff > soff) {
vm_page_set_validclean(
m,
(vm_offset_t) (soff & PAGE_MASK),
(vm_offset_t) (eoff - soff)
);
}
}
/*
* Acquire a shared busy on all pages in the buf.
*/
void
vfs_busy_pages_acquire(struct buf *bp)
{
int i;
for (i = 0; i < bp->b_npages; i++)
vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY);
}
void
vfs_busy_pages_release(struct buf *bp)
{
int i;
for (i = 0; i < bp->b_npages; i++)
vm_page_sunbusy(bp->b_pages[i]);
}
/*
* This routine is called before a device strategy routine.
* It is used to tell the VM system that paging I/O is in
* progress, and treat the pages associated with the buffer
* almost as being exclusive busy. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
* inconsistent.
*
* Since I/O has not been initiated yet, certain buffer flags
* such as BIO_ERROR or B_INVAL may be in an inconsistent state
* and should be ignored.
*/
void
vfs_busy_pages(struct buf *bp, int clear_modify)
{
vm_object_t obj;
vm_ooffset_t foff;
vm_page_t m;
int i;
bool bogus;
if (!(bp->b_flags & B_VMIO))
return;
obj = bp->b_bufobj->bo_object;
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_busy_pages: no buffer offset"));
if ((bp->b_flags & B_CLUSTER) == 0) {
vm_object_pip_add(obj, bp->b_npages);
vfs_busy_pages_acquire(bp);
}
if (bp->b_bufsize != 0)
vfs_setdirty_range(bp);
bogus = false;
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
vm_page_assert_sbusied(m);
/*
* When readying a buffer for a read (i.e.,
* clear_modify == 0), it is important to do
* bogus_page replacement for valid pages in
* partially instantiated buffers. Partially
* instantiated buffers can, in turn, occur when
* reconstituting a buffer from its VM backing store
* base. We only have to do this if B_CACHE is
* clear (which causes the I/O to occur in the
* first place). The replacement prevents the read
* I/O from overwriting potentially dirty VM-backed
* pages. XXX bogus page replacement is, uh, bogus.
* It may not work properly with small-block devices.
* We need to find a better way.
*/
if (clear_modify) {
pmap_remove_write(m);
vfs_page_set_validclean(bp, foff, m);
} else if (vm_page_all_valid(m) &&
(bp->b_flags & B_CACHE) == 0) {
bp->b_pages[i] = bogus_page;
bogus = true;
}
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bogus && buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
}
}
/*
* vfs_bio_set_valid:
*
* Set the range within the buffer to valid. The range is
* relative to the beginning of the buffer, b_offset. Note that
* b_offset itself may be offset from the beginning of the first
* page.
*/
void
vfs_bio_set_valid(struct buf *bp, int base, int size)
{
int i, n;
vm_page_t m;
if (!(bp->b_flags & B_VMIO))
return;
/*
* Fixup base to be relative to beginning of first page.
* Set initial n to be the maximum number of bytes in the
* first page that can be validated.
*/
base += (bp->b_offset & PAGE_MASK);
n = PAGE_SIZE - (base & PAGE_MASK);
/*
* Busy may not be strictly necessary here because the pages are
* unlikely to be fully valid and the vnode lock will synchronize
* their access via getpages. It is grabbed for consistency with
* other page validation.
*/
vfs_busy_pages_acquire(bp);
for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
m = bp->b_pages[i];
if (n > size)
n = size;
vm_page_set_valid_range(m, base & PAGE_MASK, n);
base += n;
size -= n;
n = PAGE_SIZE;
}
vfs_busy_pages_release(bp);
}
/*
* vfs_bio_clrbuf:
*
* If the specified buffer is a non-VMIO buffer, clear the entire
* buffer. If the specified buffer is a VMIO buffer, clear and
* validate only the previously invalid portions of the buffer.
* This routine essentially fakes an I/O, so we need to clear
* BIO_ERROR and B_INVAL.
*
* Note that while we only theoretically need to clear through b_bcount,
* we go ahead and clear through b_bufsize.
*/
void
vfs_bio_clrbuf(struct buf *bp)
{
int i, j, mask, sa, ea, slide;
if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
clrbuf(bp);
return;
}
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
vfs_busy_pages_acquire(bp);
sa = bp->b_offset & PAGE_MASK;
slide = 0;
for (i = 0; i < bp->b_npages; i++, sa = 0) {
slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
ea = slide & PAGE_MASK;
if (ea == 0)
ea = PAGE_SIZE;
if (bp->b_pages[i] == bogus_page)
continue;
j = sa / DEV_BSIZE;
mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
if ((bp->b_pages[i]->valid & mask) == mask)
continue;
if ((bp->b_pages[i]->valid & mask) == 0)
pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
else {
for (; sa < ea; sa += DEV_BSIZE, j++) {
if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
pmap_zero_page_area(bp->b_pages[i],
sa, DEV_BSIZE);
}
}
}
vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE,
roundup2(ea - sa, DEV_BSIZE));
}
vfs_busy_pages_release(bp);
bp->b_resid = 0;
}
void
vfs_bio_bzero_buf(struct buf *bp, int base, int size)
{
vm_page_t m;
int i, n;
if (buf_mapped(bp)) {
BUF_CHECK_MAPPED(bp);
bzero(bp->b_data + base, size);
} else {
BUF_CHECK_UNMAPPED(bp);
n = PAGE_SIZE - (base & PAGE_MASK);
for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
m = bp->b_pages[i];
if (n > size)
n = size;
pmap_zero_page_area(m, base & PAGE_MASK, n);
base += n;
size -= n;
n = PAGE_SIZE;
}
}
}
/*
* Update buffer flags based on I/O request parameters, optionally releasing the
* buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM,
* where they may be placed on a page queue (VMIO) or freed immediately (direct
* I/O). Otherwise the buffer is released to the cache.
*/
static void
b_io_dismiss(struct buf *bp, int ioflag, bool release)
{
KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0,
("buf %p non-VMIO noreuse", bp));
if ((ioflag & IO_DIRECT) != 0)
bp->b_flags |= B_DIRECT;
if ((ioflag & IO_EXT) != 0)
bp->b_xflags |= BX_ALTDATA;
if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) {
bp->b_flags |= B_RELBUF;
if ((ioflag & IO_NOREUSE) != 0)
bp->b_flags |= B_NOREUSE;
if (release)
brelse(bp);
} else if (release)
bqrelse(bp);
}
void
vfs_bio_brelse(struct buf *bp, int ioflag)
{
b_io_dismiss(bp, ioflag, true);
}
void
vfs_bio_set_flags(struct buf *bp, int ioflag)
{
b_io_dismiss(bp, ioflag, false);
}
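/*
 * Illustrative sketch, not part of this file: how a filesystem read path
 * typically hands the release decision to b_io_dismiss() via
 * vfs_bio_brelse().  "lbn", "bsize", "blkoffset", "xfersize", "uio" and
 * "ioflag" are assumed to come from the caller.
 */
#if 0
	error = bread(vp, lbn, bsize, NOCRED, &bp);
	if (error != 0)
		return (error);		/* bread() released the buffer */
	error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
	vfs_bio_brelse(bp, ioflag);	/* bqrelse() or brelse() as appropriate */
#endif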
/*
* vm_hold_load_pages and vm_hold_free_pages get pages into
* a buffer's address space. The pages are anonymous and are
* not associated with a file object.
*/
static void
vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
{
vm_offset_t pg;
vm_page_t p;
int index;
BUF_CHECK_MAPPED(bp);
to = round_page(to);
from = round_page(from);
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ MPASS((bp->b_flags & B_MAXPHYS) == 0);
+ KASSERT(to - from <= maxbcachebuf,
+ ("vm_hold_load_pages too large %p %#jx %#jx %u",
+ bp, (uintmax_t)from, (uintmax_t)to, maxbcachebuf));
for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
/*
* note: must allocate system pages since blocking here
* could interfere with paging I/O, no matter which
* process we are.
*/
p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) |
VM_ALLOC_WAITOK);
pmap_qenter(pg, &p, 1);
bp->b_pages[index] = p;
}
bp->b_npages = index;
}
/* Return pages associated with this buf to the vm system */
static void
vm_hold_free_pages(struct buf *bp, int newbsize)
{
vm_offset_t from;
vm_page_t p;
int index, newnpages;
BUF_CHECK_MAPPED(bp);
from = round_page((vm_offset_t)bp->b_data + newbsize);
newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
if (bp->b_npages > newnpages)
pmap_qremove(from, bp->b_npages - newnpages);
for (index = newnpages; index < bp->b_npages; index++) {
p = bp->b_pages[index];
bp->b_pages[index] = NULL;
vm_page_unwire_noq(p);
vm_page_free(p);
}
bp->b_npages = newnpages;
}
/*
* Map an IO request into kernel virtual address space.
*
* All requests are (re)mapped into kernel VA space.
* Notice that we use b_bufsize for the size of the buffer
* to be mapped. b_bcount might be modified by the driver.
*
* Note that even if the caller determines that the address space should
* be valid, a race or a smaller file mapped into a larger space may
* actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
* check the return value.
*
* This function only works with pager buffers.
*/
int
vmapbuf(struct buf *bp, void *uaddr, size_t len, int mapbuf)
{
vm_prot_t prot;
int pidx;
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
prot = VM_PROT_READ;
if (bp->b_iocmd == BIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
- if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
- (vm_offset_t)uaddr, len, prot, bp->b_pages,
- btoc(MAXPHYS))) < 0)
+ pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)uaddr, len, prot, bp->b_pages, PBUF_PAGES);
+ if (pidx < 0)
return (-1);
bp->b_bufsize = len;
bp->b_npages = pidx;
bp->b_offset = ((vm_offset_t)uaddr) & PAGE_MASK;
if (mapbuf || !unmapped_buf_allowed) {
pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
bp->b_data = bp->b_kvabase + bp->b_offset;
} else
bp->b_data = unmapped_buf;
- return(0);
+ return (0);
}
/*
* Free the io map PTEs associated with this IO operation.
* We also invalidate the TLB entries and restore the original b_addr.
*
* This function only works with pager buffers.
*/
void
vunmapbuf(struct buf *bp)
{
int npages;
npages = bp->b_npages;
if (buf_mapped(bp))
pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
vm_page_unhold_pages(bp->b_pages, npages);
bp->b_data = unmapped_buf;
}
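/*
 * Illustrative sketch, not part of this file: wiring a user buffer for a
 * driver-initiated transfer with a pager buf, in the style of
 * cam_periph_mapmem().  "udata" and "ulen" are a hypothetical user address
 * and length already validated by the caller.
 */
#if 0
	struct buf *pbp;

	pbp = uma_zalloc(pbuf_zone, M_WAITOK);	/* pbufs carry B_MAXPHYS */
	pbp->b_iocmd = BIO_READ;		/* device writes to memory */
	if (vmapbuf(pbp, udata, ulen, 1) < 0) {
		uma_zfree(pbuf_zone, pbp);
		return (EFAULT);
	}
	/* ... issue the transfer against pbp->b_data / pbp->b_pages ... */
	vunmapbuf(pbp);
	uma_zfree(pbuf_zone, pbp);
#endif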
void
bdone(struct buf *bp)
{
struct mtx *mtxp;
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
bp->b_flags |= B_DONE;
wakeup(bp);
mtx_unlock(mtxp);
}
void
bwait(struct buf *bp, u_char pri, const char *wchan)
{
struct mtx *mtxp;
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
while ((bp->b_flags & B_DONE) == 0)
msleep(bp, mtxp, pri, wchan, 0);
mtx_unlock(mtxp);
}
int
bufsync(struct bufobj *bo, int waitfor)
{
return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread));
}
void
bufstrategy(struct bufobj *bo, struct buf *bp)
{
int i __unused;
struct vnode *vp;
vp = bp->b_vp;
KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
i = VOP_STRATEGY(vp, bp);
KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
}
/*
* Initialize a struct bufobj before use. Memory is assumed zero filled.
*/
void
bufobj_init(struct bufobj *bo, void *private)
{
static volatile int bufobj_cleanq;
bo->bo_domain =
atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
rw_init(BO_LOCKPTR(bo), "bufobj interlock");
bo->bo_private = private;
TAILQ_INIT(&bo->bo_clean.bv_hd);
TAILQ_INIT(&bo->bo_dirty.bv_hd);
}
void
bufobj_wrefl(struct bufobj *bo)
{
KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
ASSERT_BO_WLOCKED(bo);
bo->bo_numoutput++;
}
void
bufobj_wref(struct bufobj *bo)
{
KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
BO_LOCK(bo);
bo->bo_numoutput++;
BO_UNLOCK(bo);
}
void
bufobj_wdrop(struct bufobj *bo)
{
KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
BO_LOCK(bo);
KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
bo->bo_flag &= ~BO_WWAIT;
wakeup(&bo->bo_numoutput);
}
BO_UNLOCK(bo);
}
int
bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
{
int error;
KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
ASSERT_BO_WLOCKED(bo);
error = 0;
while (bo->bo_numoutput) {
bo->bo_flag |= BO_WWAIT;
error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
slpflag | (PRIBIO + 1), "bo_wwait", timeo);
if (error)
break;
}
return (error);
}
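/*
 * Illustrative sketch, not part of this file: the write-count protocol.
 * bufwrite() takes a reference via bufobj_wref() before issuing the I/O
 * and bufdone() releases it via bufobj_wdrop(); draining the in-flight
 * writes (as a typical fsync implementation does) then looks like this.
 */
#if 0
	struct bufobj *bo = &vp->v_bufobj;
	int error;

	BO_LOCK(bo);
	error = bufobj_wwait(bo, 0, 0);	/* sleeps until bo_numoutput == 0 */
	BO_UNLOCK(bo);
#endif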
/*
* Set bio_data or bio_ma for struct bio from the struct buf.
*/
void
bdata2bio(struct buf *bp, struct bio *bip)
{
if (!buf_mapped(bp)) {
KASSERT(unmapped_buf_allowed, ("unmapped"));
bip->bio_ma = bp->b_pages;
bip->bio_ma_n = bp->b_npages;
bip->bio_data = unmapped_buf;
bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
bip->bio_flags |= BIO_UNMAPPED;
KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
PAGE_SIZE == bp->b_npages,
("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
(long long)bip->bio_length, bip->bio_ma_n));
} else {
bip->bio_data = bp->b_data;
bip->bio_ma = NULL;
}
}
/*
* The MIPS pmap code currently doesn't handle aliased pages.
* The VIPT caches may not handle page aliasing themselves, leading
* to data corruption.
*
* As such, this code makes a system extremely unhappy if said
* system doesn't support unaliasing the above situation in hardware.
* Some "recent" systems (eg some mips24k/mips74k cores) don't enable
* this feature at build time, so it has to be handled in software.
*
* Once the MIPS pmap/cache code grows to support this function on
* earlier chips, it should be flipped back off.
*/
#ifdef __mips__
static int buf_pager_relbuf = 1;
#else
static int buf_pager_relbuf = 0;
#endif
SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
&buf_pager_relbuf, 0,
"Make buffer pager release buffers after reading");
/*
* The buffer pager. It uses buffer reads to validate pages.
*
* In contrast to the generic local pager from vm/vnode_pager.c, this
* pager correctly and easily handles volumes where the underlying
* device block size is greater than the machine page size. The
* buffer cache transparently extends the requested page run to be
* aligned at the block boundary, and does the necessary bogus page
* replacements in the addends to avoid obliterating already valid
* pages.
*
* The only non-trivial issue is that the exclusive busy state for
* pages, which is assumed by the vm_pager_getpages() interface, is
* incompatible with the VMIO buffer cache's desire to share-busy the
* pages. This function performs a trivial downgrade of the pages'
* state before reading buffers, and a less trivial upgrade from the
* shared-busy to excl-busy state after the read.
*/
int
vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
vbg_get_blksize_t get_blksize)
{
vm_page_t m;
vm_object_t object;
struct buf *bp;
struct mount *mp;
daddr_t lbn, lbnp;
vm_ooffset_t la, lb, poff, poffe;
long bsize;
int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
bool redo, lpart;
object = vp->v_object;
mp = vp->v_mount;
error = 0;
la = IDX_TO_OFF(ma[count - 1]->pindex);
if (la >= object->un_pager.vnp.vnp_size)
return (VM_PAGER_BAD);
/*
* Change the meaning of la from where the last requested page starts
* to where it ends, because that's the end of the requested region
* and the start of the potential read-ahead region.
*/
la += PAGE_SIZE;
lpart = la > object->un_pager.vnp.vnp_size;
bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
/*
* Calculate read-ahead, behind and total pages.
*/
pgsin = count;
lb = IDX_TO_OFF(ma[0]->pindex);
pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
pgsin += pgsin_b;
if (rbehind != NULL)
*rbehind = pgsin_b;
pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
PAGE_SIZE) - la);
pgsin += pgsin_a;
if (rahead != NULL)
*rahead = pgsin_a;
VM_CNT_INC(v_vnodein);
VM_CNT_ADD(v_vnodepgsin, pgsin);
br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
!= 0) ? GB_UNMAPPED : 0;
again:
for (i = 0; i < count; i++) {
if (ma[i] != bogus_page)
vm_page_busy_downgrade(ma[i]);
}
lbnp = -1;
for (i = 0; i < count; i++) {
m = ma[i];
if (m == bogus_page)
continue;
/*
* Pages are shared busy and the object lock is not
* owned, which together allow for the pages'
* invalidation. The racy test for validity avoids
* useless creation of the buffer for the most typical
* case when invalidation is not used in redo or for
* parallel read. The shared->excl upgrade loop at
* the end of the function catches the race in a
* reliable way (protected by the object lock).
*/
if (vm_page_all_valid(m))
continue;
poff = IDX_TO_OFF(m->pindex);
poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
for (; poff < poffe; poff += bsize) {
lbn = get_lblkno(vp, poff);
if (lbn == lbnp)
goto next_page;
lbnp = lbn;
bsize = get_blksize(vp, lbn);
error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
br_flags, &bp);
if (error != 0)
goto end_pages;
if (bp->b_rcred == curthread->td_ucred) {
crfree(bp->b_rcred);
bp->b_rcred = NOCRED;
}
if (LIST_EMPTY(&bp->b_dep)) {
/*
* Invalidation clears m->valid, but
* may leave B_CACHE flag if the
* buffer existed at the invalidation
* time. In this case, recycle the
* buffer to do real read on next
* bread() after redo.
*
* Otherwise B_RELBUF is not strictly
* necessary; enable it anyway to reduce
* buffer cache pressure.
*/
if (buf_pager_relbuf ||
!vm_page_all_valid(m))
bp->b_flags |= B_RELBUF;
bp->b_flags &= ~B_NOCACHE;
brelse(bp);
} else {
bqrelse(bp);
}
}
KASSERT(1 /* racy, enable for debugging */ ||
vm_page_all_valid(m) || i == count - 1,
("buf %d %p invalid", i, m));
if (i == count - 1 && lpart) {
if (!vm_page_none_valid(m) &&
!vm_page_all_valid(m))
vm_page_zero_invalid(m, TRUE);
}
next_page:;
}
end_pages:
redo = false;
for (i = 0; i < count; i++) {
if (ma[i] == bogus_page)
continue;
if (vm_page_busy_tryupgrade(ma[i]) == 0) {
vm_page_sunbusy(ma[i]);
ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex,
VM_ALLOC_NORMAL);
}
/*
* Since the pages were only sbusy while neither the
* buffer nor the object lock was held by us, or
* reallocated while vm_page_grab() slept for busy
* relinquish, they could have been invalidated.
* Recheck the valid bits and re-read as needed.
*
* Note that the last page is made fully valid in the
* read loop, and partial validity for the page at
* index count - 1 could mean that the page was
* invalidated or removed, so we must restart for
* safety as well.
*/
if (!vm_page_all_valid(ma[i]))
redo = true;
}
if (redo && error == 0)
goto again;
return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
}
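/*
 * Illustrative sketch, not part of this file: how a filesystem with a
 * constant logical block size could plug its VOP_GETPAGES into
 * vfs_bio_getpages().  The "myfs_" names are hypothetical.
 */
#if 0
static daddr_t
myfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

	return (off / vp->v_mount->mnt_stat.f_iosize);
}

static int
myfs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{

	return (vp->v_mount->mnt_stat.f_iosize);
}

static int
myfs_getpages(struct vop_getpages_args *ap)
{

	return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
	    ap->a_rbehind, ap->a_rahead, myfs_gbp_getblkno,
	    myfs_gbp_getblksz));
}
#endif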
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
/* DDB command to show buffer data */
DB_SHOW_COMMAND(buffer, db_show_buffer)
{
/* get args */
struct buf *bp = (struct buf *)addr;
#ifdef FULL_BUF_TRACKING
uint32_t i, j;
#endif
if (!have_addr) {
db_printf("usage: show buffer <addr>\n");
return;
}
db_printf("buf at %p\n", bp);
db_printf("b_flags = 0x%b, b_xflags=0x%b\n",
(u_int)bp->b_flags, PRINT_BUF_FLAGS,
(u_int)bp->b_xflags, PRINT_BUF_XFLAGS);
db_printf("b_vflags=0x%b b_ioflags0x%b\n",
(u_int)bp->b_vflags, PRINT_BUF_VFLAGS,
(u_int)bp->b_ioflags, PRINT_BIO_FLAGS);
db_printf(
"b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
"b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, "
"b_vp = %p, b_dep = %p\n",
bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
(intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first);
db_printf("b_kvabase = %p, b_kvasize = %d\n",
bp->b_kvabase, bp->b_kvasize);
if (bp->b_npages) {
int i;
db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m;
m = bp->b_pages[i];
if (m != NULL)
db_printf("(%p, 0x%lx, 0x%lx)", m->object,
(u_long)m->pindex,
(u_long)VM_PAGE_TO_PHYS(m));
else
db_printf("( ??? )");
if ((i + 1) < bp->b_npages)
db_printf(",");
}
db_printf("\n");
}
BUF_LOCKPRINTINFO(bp);
#if defined(FULL_BUF_TRACKING)
db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
for (j = 1; j <= BUF_TRACKING_SIZE; j++) {
if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL)
continue;
db_printf(" %2u: %s\n", j,
bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
}
#elif defined(BUF_TRACKING)
db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
#endif
db_printf(" ");
}
DB_SHOW_COMMAND(bufqueues, bufqueues)
{
struct bufdomain *bd;
struct buf *bp;
long total;
int i, j, cnt;
db_printf("bqempty: %d\n", bqempty.bq_len);
for (i = 0; i < buf_domains; i++) {
bd = &bdomain[i];
db_printf("Buf domain %d\n", i);
db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers);
db_printf("\n");
db_printf("\tbufspace\t%ld\n", bd->bd_bufspace);
db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace);
db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace);
db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
db_printf("\n");
db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
db_printf("\n");
total = 0;
TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist)
total += bp->b_bufsize;
db_printf("\tcleanq count\t%d (%ld)\n",
bd->bd_cleanq->bq_len, total);
total = 0;
TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist)
total += bp->b_bufsize;
db_printf("\tdirtyq count\t%d (%ld)\n",
bd->bd_dirtyq.bq_len, total);
db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
db_printf("\tlim\t\t%d\n", bd->bd_lim);
db_printf("\tCPU ");
for (j = 0; j <= mp_maxid; j++)
db_printf("%d, ", bd->bd_subq[j].bq_len);
db_printf("\n");
cnt = 0;
total = 0;
- for (j = 0; j < nbuf; j++)
- if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
+ for (j = 0; j < nbuf; j++) {
+ bp = nbufp(j);
+ if (bp->b_domain == i && BUF_ISLOCKED(bp)) {
cnt++;
- total += buf[j].b_bufsize;
+ total += bp->b_bufsize;
}
+ }
db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
cnt = 0;
total = 0;
- for (j = 0; j < nbuf; j++)
- if (buf[j].b_domain == i) {
+ for (j = 0; j < nbuf; j++) {
+ bp = nbufp(j);
+ if (bp->b_domain == i) {
cnt++;
- total += buf[j].b_bufsize;
+ total += bp->b_bufsize;
}
+ }
db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
}
}
DB_SHOW_COMMAND(lockedbufs, lockedbufs)
{
struct buf *bp;
int i;
for (i = 0; i < nbuf; i++) {
- bp = &buf[i];
+ bp = nbufp(i);
if (BUF_ISLOCKED(bp)) {
db_show_buffer((uintptr_t)bp, 1, 0, NULL);
db_printf("\n");
if (db_pager_quit)
break;
}
}
}
DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
{
struct vnode *vp;
struct buf *bp;
if (!have_addr) {
db_printf("usage: show vnodebufs <addr>\n");
return;
}
vp = (struct vnode *)addr;
db_printf("Clean buffers:\n");
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
db_show_buffer((uintptr_t)bp, 1, 0, NULL);
db_printf("\n");
}
db_printf("Dirty buffers:\n");
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
db_show_buffer((uintptr_t)bp, 1, 0, NULL);
db_printf("\n");
}
}
DB_COMMAND(countfreebufs, db_coundfreebufs)
{
struct buf *bp;
int i, used = 0, nfree = 0;
if (have_addr) {
db_printf("usage: countfreebufs\n");
return;
}
for (i = 0; i < nbuf; i++) {
- bp = &buf[i];
+ bp = nbufp(i);
if (bp->b_qindex == QUEUE_EMPTY)
nfree++;
else
used++;
}
db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
nfree + used);
db_printf("numfreebuffers is %d\n", numfreebuffers);
}
#endif /* DDB */
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 5dfbb9f113be..6b77adf5df34 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -1,1083 +1,1085 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
* Modifications/enhancements:
* Copyright (c) 1995 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_debug_cluster.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>
#if defined(CLUSTERDEBUG)
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
"Debug VFS clustering code");
#endif
static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
static uma_zone_t cluster_pbuf_zone;
static void cluster_init(void *);
static struct cluster_save *cluster_collectbufs(struct vnode *vp,
struct buf *last_bp, int gbflags);
static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
struct buf *fbp);
static void cluster_callback(struct buf *);
static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
"Cluster write-behind; 0: disable, 1: enable, 2: backed off");
static int read_max = 64;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
"Cluster read-ahead max block count");
static int read_min = 1;
SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
"Cluster read min block count");
SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL);
static void
cluster_init(void *dummy)
{
cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2);
}
/*
* Read data to a buf, including read-ahead if we find this to be beneficial.
* cluster_read replaces bread.
*/
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
struct ucred *cred, long totread, int seqcount, int gbflags,
struct buf **bpp)
{
struct buf *bp, *rbp, *reqbp;
struct bufobj *bo;
struct thread *td;
daddr_t blkno, origblkno;
int maxra, racluster;
int error, ncontig;
int i;
error = 0;
td = curthread;
bo = &vp->v_bufobj;
if (!unmapped_buf_allowed)
gbflags &= ~GB_UNMAPPED;
/*
* Try to limit the amount of read-ahead by a few
* ad-hoc parameters. This needs work!!!
*/
racluster = vp->v_mount->mnt_iosize_max / size;
maxra = seqcount;
maxra = min(read_max, maxra);
maxra = min(nbuf/8, maxra);
if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
maxra = (filesize / size) - lblkno;
/*
* get the requested block
*/
error = getblkx(vp, lblkno, lblkno, size, 0, 0, gbflags, &bp);
if (error != 0) {
*bpp = NULL;
return (error);
}
gbflags &= ~GB_NOSPARSE;
origblkno = lblkno;
*bpp = reqbp = bp;
/*
* if it is in the cache, then check to see if the reads have been
* sequential. If they have, then try some read-ahead, otherwise
* back-off on prospective read-aheads.
*/
if (bp->b_flags & B_CACHE) {
if (!seqcount) {
return 0;
} else if ((bp->b_flags & B_RAM) == 0) {
return 0;
} else {
bp->b_flags &= ~B_RAM;
BO_RLOCK(bo);
for (i = 1; i < maxra; i++) {
/*
* Stop if the buffer does not exist or it
* is invalid (about to go away?)
*/
rbp = gbincore(&vp->v_bufobj, lblkno+i);
if (rbp == NULL || (rbp->b_flags & B_INVAL))
break;
/*
* Set another read-ahead mark so we know
* to check again. (If we can lock the
* buffer without waiting)
*/
if ((((i % racluster) == (racluster - 1)) ||
(i == (maxra - 1)))
&& (0 == BUF_LOCK(rbp,
LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
rbp->b_flags |= B_RAM;
BUF_UNLOCK(rbp);
}
}
BO_RUNLOCK(bo);
if (i >= maxra) {
return 0;
}
lblkno += i;
}
reqbp = bp = NULL;
/*
* If it isn't in the cache, then get a chunk from
* disk if sequential, otherwise just get the block.
*/
} else {
off_t firstread = bp->b_offset;
int nblks;
long minread;
KASSERT(bp->b_offset != NOOFFSET,
("cluster_read: no buffer offset"));
ncontig = 0;
/*
* Adjust totread if needed
*/
minread = read_min * size;
if (minread > totread)
totread = minread;
/*
* Compute the total number of blocks that we should read
* synchronously.
*/
if (firstread + totread > filesize)
totread = filesize - firstread;
nblks = howmany(totread, size);
if (nblks > racluster)
nblks = racluster;
/*
* Now compute the number of contiguous blocks.
*/
if (nblks > 1) {
error = VOP_BMAP(vp, lblkno, NULL,
&blkno, &ncontig, NULL);
/*
* If this failed to map just do the original block.
*/
if (error || blkno == -1)
ncontig = 0;
}
/*
* If we have contiguous data available do a cluster
* otherwise just read the requested block.
*/
if (ncontig) {
/* Account for our first block. */
ncontig = min(ncontig + 1, nblks);
if (ncontig < nblks)
nblks = ncontig;
bp = cluster_rbuild(vp, filesize, lblkno,
blkno, size, nblks, gbflags, bp);
lblkno += (bp->b_bufsize / size);
} else {
bp->b_flags |= B_RAM;
bp->b_iocmd = BIO_READ;
lblkno += 1;
}
}
/*
* handle the synchronous read so that it is available ASAP.
*/
if (bp) {
if ((bp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(bp, 0);
}
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
racct_add_buf(td->td_proc, bp, 0);
PROC_UNLOCK(td->td_proc);
}
#endif /* RACCT */
td->td_ru.ru_inblock++;
}
/*
* If we have been doing sequential I/O, then do some read-ahead.
*/
while (lblkno < (origblkno + maxra)) {
error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
if (error)
break;
if (blkno == -1)
break;
/*
* We could throttle ncontig here by maxra but we might as
* well read the data if it is contiguous. We're throttled
* by racluster anyway.
*/
if (ncontig) {
ncontig = min(ncontig + 1, racluster);
rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
size, ncontig, gbflags, NULL);
lblkno += (rbp->b_bufsize / size);
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
continue;
}
} else {
rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
lblkno += 1;
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
continue;
}
rbp->b_flags |= B_ASYNC | B_RAM;
rbp->b_iocmd = BIO_READ;
rbp->b_blkno = blkno;
}
if (rbp->b_flags & B_CACHE) {
rbp->b_flags &= ~B_ASYNC;
bqrelse(rbp);
continue;
}
if ((rbp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(rbp, 0);
}
rbp->b_flags &= ~B_INVAL;
rbp->b_ioflags &= ~BIO_ERROR;
if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
racct_add_buf(td->td_proc, rbp, 0);
PROC_UNLOCK(td->td_proc);
}
#endif /* RACCT */
td->td_ru.ru_inblock++;
}
if (reqbp) {
/*
* Like bread, always brelse() the buffer when
* returning an error.
*/
error = bufwait(reqbp);
if (error != 0) {
brelse(reqbp);
*bpp = NULL;
}
}
return (error);
}
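/*
 * Illustrative sketch, not part of this file: a filesystem read path
 * choosing between cluster_read() and plain bread().  "filesize", "lbn",
 * "bsize", "uio" and "seqcount" are assumed to come from the caller.
 */
#if 0
	if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
		error = cluster_read(vp, filesize, lbn, bsize, NOCRED,
		    uio->uio_resid, seqcount, 0, &bp);
	else
		error = bread(vp, lbn, bsize, NOCRED, &bp);
#endif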
/*
* If blocks are contiguous on disk, use this to provide clustered
* read ahead. We will read as many blocks as possible sequentially
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
struct buf *bp, *tbp;
daddr_t bn;
off_t off;
long tinc, tsize;
int i, inc, j, k, toff;
KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
("cluster_rbuild: size %ld != f_iosize %jd\n",
size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
/*
* avoid a division
*/
while ((u_quad_t) size * (lbn + run) > filesize) {
--run;
}
if (fbp) {
tbp = fbp;
tbp->b_iocmd = BIO_READ;
} else {
tbp = getblk(vp, lbn, size, 0, 0, gbflags);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_flags |= B_ASYNC | B_RAM;
tbp->b_iocmd = BIO_READ;
}
tbp->b_blkno = blkno;
if ((tbp->b_flags & B_MALLOC) ||
((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
return tbp;
bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT);
if (bp == NULL)
return tbp;
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
/*
* We are synthesizing a buffer out of vm_page_t's, but
* if the block size is not page aligned then the starting
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
if ((gbflags & GB_UNMAPPED) != 0) {
bp->b_data = unmapped_buf;
} else {
bp->b_data = (char *)((vm_offset_t)bp->b_data |
((vm_offset_t)tbp->b_data & PAGE_MASK));
}
bp->b_iocmd = BIO_READ;
bp->b_iodone = cluster_callback;
bp->b_blkno = blkno;
bp->b_lblkno = lbn;
bp->b_offset = tbp->b_offset;
KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
pbgetvp(vp, bp);
TAILQ_INIT(&bp->b_cluster.cluster_head);
bp->b_bcount = 0;
bp->b_bufsize = 0;
bp->b_npages = 0;
inc = btodb(size);
for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
if (i == 0) {
vm_object_pip_add(tbp->b_bufobj->bo_object,
tbp->b_npages);
vfs_busy_pages_acquire(tbp);
} else {
if ((bp->b_npages * PAGE_SIZE) +
round_page(size) > vp->v_mount->mnt_iosize_max) {
break;
}
tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
(gbflags & GB_UNMAPPED));
/* Don't wait around for locked bufs. */
if (tbp == NULL)
break;
/*
* Stop scanning if the buffer is fully valid
* (marked B_CACHE), or locked (may be doing a
* background write), or if the buffer is not
* VMIO backed. The clustering code can only deal
* with VMIO-backed buffers. The bo lock is not
* required for the BKGRDINPROG check since it
* can not be set without the buf lock.
*/
if ((tbp->b_vflags & BV_BKGRDINPROG) ||
(tbp->b_flags & B_CACHE) ||
(tbp->b_flags & B_VMIO) == 0) {
bqrelse(tbp);
break;
}
/*
* The buffer must be completely invalid in order to
* take part in the cluster. If it is partially valid
* then we stop.
*/
off = tbp->b_offset;
tsize = size;
for (j = 0; tsize > 0; j++) {
toff = off & PAGE_MASK;
tinc = tsize;
if (toff + tinc > PAGE_SIZE)
tinc = PAGE_SIZE - toff;
if (vm_page_trysbusy(tbp->b_pages[j]) == 0)
break;
if ((tbp->b_pages[j]->valid &
vm_page_bits(toff, tinc)) != 0) {
vm_page_sunbusy(tbp->b_pages[j]);
break;
}
vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
off += tinc;
tsize -= tinc;
}
if (tsize > 0) {
clean_sbusy:
vm_object_pip_wakeupn(tbp->b_bufobj->bo_object,
j);
for (k = 0; k < j; k++)
vm_page_sunbusy(tbp->b_pages[k]);
bqrelse(tbp);
break;
}
/*
* Set a read-ahead mark as appropriate
*/
if ((fbp && (i == 1)) || (i == (run - 1)))
tbp->b_flags |= B_RAM;
/*
* Set the buffer up for an async read (XXX should
* we do this only if we do not wind up brelse()ing?).
* Set the block number if it isn't set, otherwise
* if it is make sure it matches the block number we
* expect.
*/
tbp->b_flags |= B_ASYNC;
tbp->b_iocmd = BIO_READ;
if (tbp->b_blkno == tbp->b_lblkno) {
tbp->b_blkno = bn;
} else if (tbp->b_blkno != bn) {
goto clean_sbusy;
}
}
/*
* XXX fbp from caller may not be B_ASYNC, but we are going
* to biodone() it in cluster_callback() anyway
*/
BUF_KERNPROC(tbp);
TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
tbp, b_cluster.cluster_entry);
for (j = 0; j < tbp->b_npages; j += 1) {
vm_page_t m;
m = tbp->b_pages[j];
if ((bp->b_npages == 0) ||
(bp->b_pages[bp->b_npages-1] != m)) {
bp->b_pages[bp->b_npages] = m;
bp->b_npages++;
}
if (vm_page_all_valid(m))
tbp->b_pages[j] = bogus_page;
}
/*
* Don't inherit tbp->b_bufsize as it may be larger due to
* a non-page-aligned size. Instead just aggregate using
* 'size'.
*/
if (tbp->b_bcount != size)
printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
if (tbp->b_bufsize != size)
printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
bp->b_bcount += size;
bp->b_bufsize += size;
}
/*
* Fully valid pages in the cluster are already good and do not need
* to be re-read from disk. Replace the page with bogus_page
*/
for (j = 0; j < bp->b_npages; j++) {
if (vm_page_all_valid(bp->b_pages[j]))
bp->b_pages[j] = bogus_page;
}
if (bp->b_bufsize > bp->b_kvasize)
panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
bp->b_bufsize, bp->b_kvasize);
if (buf_mapped(bp)) {
pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
(vm_page_t *)bp->b_pages, bp->b_npages);
}
return (bp);
}
/*
* Cleanup after a clustered read or write.
* This is complicated by the fact that any of the buffers might have
* extra memory (if there were no empty buffer headers at allocbuf time)
* that we will need to shift around.
*/
static void
cluster_callback(struct buf *bp)
{
struct buf *nbp, *tbp;
int error = 0;
/*
* Must propagate errors to all the components.
*/
if (bp->b_ioflags & BIO_ERROR)
error = bp->b_error;
if (buf_mapped(bp)) {
pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
bp->b_npages);
}
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
*/
for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
tbp; tbp = nbp) {
nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
if (error) {
tbp->b_ioflags |= BIO_ERROR;
tbp->b_error = error;
} else {
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
tbp->b_flags &= ~B_INVAL;
tbp->b_ioflags &= ~BIO_ERROR;
/*
* XXX the bdwrite()/bqrelse() issued during
* cluster building clears B_RELBUF (see bqrelse()
* comment). If direct I/O was specified, we have
* to restore it here to allow the buffer and VM
* to be freed.
*/
if (tbp->b_flags & B_DIRECT)
tbp->b_flags |= B_RELBUF;
}
bufdone(tbp);
}
pbrelvp(bp);
uma_zfree(cluster_pbuf_zone, bp);
}
/*
* cluster_wbuild_wb:
*
* Implement modified write build for cluster.
*
* write_behind = 0 write behind disabled
* write_behind = 1 write behind normal (default)
* write_behind = 2 write behind backed-off
*/
static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
int gbflags)
{
int r = 0;
switch (write_behind) {
case 2:
if (start_lbn < len)
break;
start_lbn -= len;
/* FALLTHROUGH */
case 1:
r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
/* FALLTHROUGH */
default:
/* FALLTHROUGH */
break;
}
return (r);
}
/*
* Do clustered write for FFS.
*
* Four cases:
* 1. Write is not sequential (write asynchronously)
* Write is sequential:
* 2. beginning of cluster - begin cluster
* 3. middle of a cluster - add to cluster
* 4. end of a cluster - asynchronously write cluster
*/
void
cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
int gbflags)
{
daddr_t lbn;
int maxclen, cursize;
int lblocksize;
int async;
if (!unmapped_buf_allowed)
gbflags &= ~GB_UNMAPPED;
if (vp->v_type == VREG) {
async = DOINGASYNC(vp);
lblocksize = vp->v_mount->mnt_stat.f_iosize;
} else {
async = 0;
lblocksize = bp->b_bufsize;
}
lbn = bp->b_lblkno;
KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
/* Initialize vnode to beginning of file. */
if (lbn == 0)
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
(bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
if (vp->v_clen != 0) {
/*
* Next block is not sequential.
*
* If we are not writing at end of file, the process
* seeked to another point in the file since its last
* write, or we have reached our maximum cluster size,
* then push the previous cluster. Otherwise try
* reallocating to make it sequential.
*
* Change to algorithm: only push previous cluster if
* it was sequential from the point of view of the
* seqcount heuristic, otherwise leave the buffer
* intact so we can potentially optimize the I/O
* later on in the buf_daemon or update daemon
* flush.
*/
cursize = vp->v_lastw - vp->v_cstart + 1;
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, lblocksize,
vp->v_cstart, cursize, gbflags);
}
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
buflist = cluster_collectbufs(vp, bp, gbflags);
if (buflist == NULL) {
/*
* Cluster build failed so just write
* it now.
*/
bawrite(bp);
return;
}
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
/*
* Failed, push the previous cluster
* if *really* writing sequentially
* in the logical file (seqcount > 1),
* otherwise delay it in the hopes that
* the low level disk driver can
* optimize the write ordering.
*/
for (bpp = buflist->bs_children;
bpp < endbp; bpp++)
brelse(*bpp);
free(buflist, M_SEGMENT);
if (seqcount > 1) {
cluster_wbuild_wb(vp,
lblocksize, vp->v_cstart,
cursize, gbflags);
}
} else {
/*
* Succeeded, keep building cluster.
*/
for (bpp = buflist->bs_children;
bpp <= endbp; bpp++)
bdwrite(*bpp);
free(buflist, M_SEGMENT);
vp->v_lastw = lbn;
vp->v_lasta = bp->b_blkno;
return;
}
}
}
/*
* Consider beginning a cluster. If at end of file, make
* cluster as large as possible, otherwise find size of
* existing cluster.
*/
if ((vp->v_type == VREG) &&
((u_quad_t) bp->b_offset + lblocksize) != filesize &&
(bp->b_blkno == bp->b_lblkno) &&
(VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
bp->b_blkno == -1)) {
bawrite(bp);
vp->v_clen = 0;
vp->v_lasta = bp->b_blkno;
vp->v_cstart = lbn + 1;
vp->v_lastw = lbn;
return;
}
vp->v_clen = maxclen;
if (!async && maxclen == 0) { /* I/O not contiguous */
vp->v_cstart = lbn + 1;
bawrite(bp);
} else { /* Wait for rest of cluster */
vp->v_cstart = lbn;
bdwrite(bp);
}
} else if (lbn == vp->v_cstart + vp->v_clen) {
/*
* At end of cluster, write it out if seqcount tells us we
* are operating sequentially, otherwise let the buf or
* update daemon handle it.
*/
bdwrite(bp);
if (seqcount > 1) {
cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
vp->v_clen + 1, gbflags);
}
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
/*
* We are low on memory, get it going NOW
*/
bawrite(bp);
} else {
/*
* In the middle of a cluster, so just delay the I/O for now.
*/
bdwrite(bp);
}
vp->v_lastw = lbn;
vp->v_lasta = bp->b_blkno;
}
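/*
 * Illustrative sketch, not part of this file: a filesystem write path
 * handing a fully written logical block to cluster_write() when clustered
 * writes are enabled on the mount.  "filesize" and "seqcount" are assumed
 * to come from the caller.
 */
#if 0
	if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
		bp->b_flags |= B_CLUSTEROK;
		cluster_write(vp, bp, filesize, seqcount, 0);
	} else {
		bawrite(bp);
	}
#endif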
/*
* This is an awful lot like cluster_rbuild...wish they could be combined.
* The last lbn argument is the current block on which I/O is being
* performed. Check to see that it doesn't fall in the middle of
* the current block (if last_bp == NULL).
*/
int
cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
int gbflags)
{
struct buf *bp, *tbp;
struct bufobj *bo;
int i, j;
int totalwritten = 0;
int dbsize = btodb(size);
if (!unmapped_buf_allowed)
gbflags &= ~GB_UNMAPPED;
bo = &vp->v_bufobj;
while (len > 0) {
/*
* If the buffer is not delayed-write (i.e. dirty), or it
* is delayed-write but either locked or inval, it cannot
* partake in the clustered write.
*/
BO_LOCK(bo);
if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
(tbp->b_vflags & BV_BKGRDINPROG)) {
BO_UNLOCK(bo);
++start_lbn;
--len;
continue;
}
if (BUF_LOCK(tbp,
LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
++start_lbn;
--len;
continue;
}
if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
BUF_UNLOCK(tbp);
++start_lbn;
--len;
continue;
}
bremfree(tbp);
tbp->b_flags &= ~B_DONE;
/*
* Extra memory in the buffer, punt on this buffer.
* XXX we could handle this in most cases, but we would
* have to push the extra memory down to after our max
* possible cluster size and then potentially pull it back
* up if the cluster was terminated prematurely--too much
* hassle.
*/
if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
(B_CLUSTEROK | B_VMIO)) ||
(tbp->b_bcount != tbp->b_bufsize) ||
(tbp->b_bcount != size) ||
(len == 1) ||
((bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT)) == NULL)) {
totalwritten += tbp->b_bufsize;
bawrite(tbp);
++start_lbn;
--len;
continue;
}
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
/*
* We got a pbuf to make the cluster in,
* so initialize it.
*/
TAILQ_INIT(&bp->b_cluster.cluster_head);
bp->b_bcount = 0;
bp->b_bufsize = 0;
bp->b_npages = 0;
if (tbp->b_wcred != NOCRED)
bp->b_wcred = crhold(tbp->b_wcred);
bp->b_blkno = tbp->b_blkno;
bp->b_lblkno = tbp->b_lblkno;
bp->b_offset = tbp->b_offset;
/*
* We are synthesizing a buffer out of vm_page_t's, but
* if the block size is not page aligned then the starting
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
if ((gbflags & GB_UNMAPPED) == 0 ||
(tbp->b_flags & B_VMIO) == 0) {
bp->b_data = (char *)((vm_offset_t)bp->b_data |
((vm_offset_t)tbp->b_data & PAGE_MASK));
} else {
bp->b_data = unmapped_buf;
}
bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
B_NEEDCOMMIT));
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
/*
* From this location in the file, scan forward to see
* if there are buffers with adjacent data that need to
* be written as well.
*/
for (i = 0; i < len; ++i, ++start_lbn) {
if (i != 0) { /* If not the first buffer */
/*
* If the adjacent data is not even in core it
* can't need to be written.
*/
BO_LOCK(bo);
if ((tbp = gbincore(bo, start_lbn)) == NULL ||
(tbp->b_vflags & BV_BKGRDINPROG)) {
BO_UNLOCK(bo);
break;
}
/*
* If it IS in core, but has different
* characteristics, or is locked (which
* means it could be undergoing a background
* I/O or be in a weird state), then don't
* cluster with it.
*/
if (BUF_LOCK(tbp,
LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
BO_LOCKPTR(bo)))
break;
if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
B_INVAL | B_DELWRI | B_NEEDCOMMIT))
!= (B_DELWRI | B_CLUSTEROK |
(bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
tbp->b_wcred != bp->b_wcred) {
BUF_UNLOCK(tbp);
break;
}
/*
* Check that the combined cluster
* would make sense with regard to pages
* and would not be too large
*/
if ((tbp->b_bcount != size) ||
((bp->b_blkno + (dbsize * i)) !=
tbp->b_blkno) ||
((tbp->b_npages + bp->b_npages) >
(vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
BUF_UNLOCK(tbp);
break;
}
/*
* Ok, it's passed all the tests,
* so remove it from the free list
* and mark it busy. We will use it.
*/
bremfree(tbp);
tbp->b_flags &= ~B_DONE;
} /* end of code for non-first buffers only */
/*
* If the IO is via the VM then we do some
* special VM hackery (yuck). Since the buffer's
* block size may not be page-aligned it is possible
* for a page to be shared between two buffers. We
* have to get rid of the duplication when building
* the cluster.
*/
if (tbp->b_flags & B_VMIO) {
vm_page_t m;
if (i == 0) {
vfs_busy_pages_acquire(tbp);
} else { /* if not first buffer */
for (j = 0; j < tbp->b_npages; j += 1) {
m = tbp->b_pages[j];
if (vm_page_trysbusy(m) == 0) {
for (j--; j >= 0; j--)
vm_page_sunbusy(
tbp->b_pages[j]);
bqrelse(tbp);
goto finishcluster;
}
}
}
vm_object_pip_add(tbp->b_bufobj->bo_object,
tbp->b_npages);
for (j = 0; j < tbp->b_npages; j += 1) {
m = tbp->b_pages[j];
if ((bp->b_npages == 0) ||
(bp->b_pages[bp->b_npages - 1] != m)) {
bp->b_pages[bp->b_npages] = m;
bp->b_npages++;
}
}
}
bp->b_bcount += size;
bp->b_bufsize += size;
/*
* If any of the clustered buffers have their
* B_BARRIER flag set, transfer that request to
* the cluster.
*/
bp->b_flags |= (tbp->b_flags & B_BARRIER);
tbp->b_flags &= ~(B_DONE | B_BARRIER);
tbp->b_flags |= B_ASYNC;
tbp->b_ioflags &= ~BIO_ERROR;
tbp->b_iocmd = BIO_WRITE;
bundirty(tbp);
reassignbuf(tbp); /* put on clean list */
bufobj_wref(tbp->b_bufobj);
BUF_KERNPROC(tbp);
buf_track(tbp, __func__);
TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
tbp, b_cluster.cluster_entry);
}
finishcluster:
if (buf_mapped(bp)) {
pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
(vm_page_t *)bp->b_pages, bp->b_npages);
}
if (bp->b_bufsize > bp->b_kvasize)
panic(
"cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
bp->b_bufsize, bp->b_kvasize);
totalwritten += bp->b_bufsize;
bp->b_dirtyoff = 0;
bp->b_dirtyend = bp->b_bufsize;
bawrite(bp);
len -= i;
}
return totalwritten;
}
/*
* Collect together all the buffers in a cluster.
* Plus add one additional buffer.
*/
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
{
struct cluster_save *buflist;
struct buf *bp;
daddr_t lbn;
int i, j, len, error;
len = vp->v_lastw - vp->v_cstart + 1;
buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
M_SEGMENT, M_WAITOK);
buflist->bs_nchildren = 0;
buflist->bs_children = (struct buf **) (buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
gbflags, &bp);
if (error != 0) {
/*
* If read fails, release collected buffers
* and return failure.
*/
for (j = 0; j < i; j++)
brelse(buflist->bs_children[j]);
free(buflist, M_SEGMENT);
return (NULL);
}
buflist->bs_children[i] = bp;
if (bp->b_blkno == bp->b_lblkno)
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
NULL, NULL);
}
buflist->bs_children[i] = bp = last_bp;
if (bp->b_blkno == bp->b_lblkno)
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
buflist->bs_nchildren = i + 1;
return (buflist);
}
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index d9f44e1dc6b9..4b96d9522ce3 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -1,1594 +1,1594 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed
* to Berkeley by John Heidemann of the UCLA Ficus project.
*
* Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/rwlock.h>
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <security/audit/audit.h>
#include <sys/priv.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
static int vop_nolookup(struct vop_lookup_args *);
static int vop_norename(struct vop_rename_args *);
static int vop_nostrategy(struct vop_strategy_args *);
static int get_next_dirent(struct vnode *vp, struct dirent **dpp,
char *dirbuf, int dirbuflen, off_t *off,
char **cpos, int *len, int *eofflag,
struct thread *td);
static int dirent_exists(struct vnode *vp, const char *dirname,
struct thread *td);
#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
static int vop_stdis_text(struct vop_is_text_args *ap);
static int vop_stdunset_text(struct vop_unset_text_args *ap);
static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap);
static int vop_stdstat(struct vop_stat_args *ap);
/*
* This vnode table stores what we want to do if the filesystem doesn't
* implement a particular VOP.
*
* If there is no specific entry here, we will return EOPNOTSUPP.
*
* Note that every filesystem has to implement either vop_access
* or vop_accessx; failing to do so will result in immediate crash
* due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
* which calls vop_stdaccess() etc.
*/
struct vop_vector default_vnodeops = {
.vop_default = NULL,
.vop_bypass = VOP_EOPNOTSUPP,
.vop_access = vop_stdaccess,
.vop_accessx = vop_stdaccessx,
.vop_advise = vop_stdadvise,
.vop_advlock = vop_stdadvlock,
.vop_advlockasync = vop_stdadvlockasync,
.vop_advlockpurge = vop_stdadvlockpurge,
.vop_allocate = vop_stdallocate,
.vop_bmap = vop_stdbmap,
.vop_close = VOP_NULL,
.vop_fsync = VOP_NULL,
.vop_stat = vop_stdstat,
.vop_fdatasync = vop_stdfdatasync,
.vop_getpages = vop_stdgetpages,
.vop_getpages_async = vop_stdgetpages_async,
.vop_getwritemount = vop_stdgetwritemount,
.vop_inactive = VOP_NULL,
.vop_need_inactive = vop_stdneed_inactive,
.vop_ioctl = vop_stdioctl,
.vop_kqfilter = vop_stdkqfilter,
.vop_islocked = vop_stdislocked,
.vop_lock1 = vop_stdlock,
.vop_lookup = vop_nolookup,
.vop_open = VOP_NULL,
.vop_pathconf = VOP_EINVAL,
.vop_poll = vop_nopoll,
.vop_putpages = vop_stdputpages,
.vop_readlink = VOP_EINVAL,
.vop_read_pgcache = vop_stdread_pgcache,
.vop_rename = vop_norename,
.vop_revoke = VOP_PANIC,
.vop_strategy = vop_nostrategy,
.vop_unlock = vop_stdunlock,
.vop_vptocnp = vop_stdvptocnp,
.vop_vptofh = vop_stdvptofh,
.vop_unp_bind = vop_stdunp_bind,
.vop_unp_connect = vop_stdunp_connect,
.vop_unp_detach = vop_stdunp_detach,
.vop_is_text = vop_stdis_text,
.vop_set_text = vop_stdset_text,
.vop_unset_text = vop_stdunset_text,
.vop_add_writecount = vop_stdadd_writecount,
.vop_copy_file_range = vop_stdcopy_file_range,
};
VFS_VOP_VECTOR_REGISTER(default_vnodeops);
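/*
 * Illustrative sketch, not part of this change: a filesystem that only
 * overrides a handful of operations normally points vop_default at
 * default_vnodeops, so anything it leaves unset falls back to the table
 * above instead of hitting the EOPNOTSUPP bypass. The myfs_* names are
 * hypothetical.
 */
#if 0
static struct vop_vector myfs_vnodeops = {
	.vop_default = &default_vnodeops,
	.vop_lookup = myfs_lookup,	/* hypothetical */
	.vop_read = myfs_read,		/* hypothetical */
	.vop_write = myfs_write,	/* hypothetical */
	/* close, fsync, lock, pathconf, ... all come from the defaults */
};
VFS_VOP_VECTOR_REGISTER(myfs_vnodeops);
#endif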
/*
* Series of placeholder functions for various error returns for
* VOPs.
*/
int
vop_eopnotsupp(struct vop_generic_args *ap)
{
/*
printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
*/
return (EOPNOTSUPP);
}
int
vop_ebadf(struct vop_generic_args *ap)
{
return (EBADF);
}
int
vop_enotty(struct vop_generic_args *ap)
{
return (ENOTTY);
}
int
vop_einval(struct vop_generic_args *ap)
{
return (EINVAL);
}
int
vop_enoent(struct vop_generic_args *ap)
{
return (ENOENT);
}
int
vop_eagain(struct vop_generic_args *ap)
{
return (EAGAIN);
}
int
vop_null(struct vop_generic_args *ap)
{
return (0);
}
/*
* Helper function to panic on some bad VOPs in some filesystems.
*/
int
vop_panic(struct vop_generic_args *ap)
{
panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
}
/*
* vop_std<something> and vop_no<something> are default functions for use by
* filesystems that need the "default reasonable" implementation for a
* particular operation.
*
* The documentation for the operations they implement, if any exists, is in
* the VOP_<SOMETHING>(9) manpage (all uppercase).
*/
/*
* Default vop for filesystems that do not support name lookup
*/
static int
vop_nolookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
*ap->a_vpp = NULL;
return (ENOTDIR);
}
/*
* vop_norename:
*
* Handle unlock and reference counting for arguments of vop_rename
* for filesystems that do not implement rename operation.
*/
static int
vop_norename(struct vop_rename_args *ap)
{
vop_rename_fail(ap);
return (EOPNOTSUPP);
}
/*
* vop_nostrategy:
*
* Strategy routine for VFS devices that have none.
*
* BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
* routine; typically this is done for a BIO_READ strategy call.
* B_INVAL is typically assumed to already be clear prior to a write and
* should not be cleared manually unless you just made the buffer
* invalid. BIO_ERROR should be cleared either way.
*/
static int
vop_nostrategy (struct vop_strategy_args *ap)
{
printf("No strategy for buffer at %p\n", ap->a_bp);
vn_printf(ap->a_vp, "vnode ");
ap->a_bp->b_ioflags |= BIO_ERROR;
ap->a_bp->b_error = EOPNOTSUPP;
bufdone(ap->a_bp);
return (EOPNOTSUPP);
}
static int
get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
int dirbuflen, off_t *off, char **cpos, int *len,
int *eofflag, struct thread *td)
{
int error, reclen;
struct uio uio;
struct iovec iov;
struct dirent *dp;
KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
if (*len == 0) {
iov.iov_base = dirbuf;
iov.iov_len = dirbuflen;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = *off;
uio.uio_resid = dirbuflen;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
*eofflag = 0;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
NULL, NULL);
if (error)
return (error);
*off = uio.uio_offset;
*cpos = dirbuf;
*len = (dirbuflen - uio.uio_resid);
if (*len == 0)
return (ENOENT);
}
dp = (struct dirent *)(*cpos);
reclen = dp->d_reclen;
*dpp = dp;
/* Check for a malformed directory entry. */
if (reclen < DIRENT_MINSIZE)
return (EINVAL);
*cpos += reclen;
*len -= reclen;
return (0);
}
/*
* Check if a named file exists in a given directory vnode.
*/
static int
dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
{
char *dirbuf, *cpos;
int error, eofflag, dirbuflen, len, found;
off_t off;
struct dirent *dp;
struct vattr va;
KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
found = 0;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error)
return (found);
dirbuflen = DEV_BSIZE;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
off = 0;
len = 0;
do {
error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
&cpos, &len, &eofflag, td);
if (error)
goto out;
if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
strcmp(dp->d_name, dirname) == 0) {
found = 1;
goto out;
}
} while (len > 0 || !eofflag);
out:
free(dirbuf, M_TEMP);
return (found);
}
int
vop_stdaccess(struct vop_access_args *ap)
{
KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
VAPPEND)) == 0, ("invalid bit in accmode"));
return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
}
int
vop_stdaccessx(struct vop_accessx_args *ap)
{
int error;
accmode_t accmode = ap->a_accmode;
error = vfs_unixify_accmode(&accmode);
if (error != 0)
return (error);
if (accmode == 0)
return (0);
return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
}
/*
* Advisory record locking support
*/
int
vop_stdadvlock(struct vop_advlock_args *ap)
{
struct vnode *vp;
struct vattr vattr;
int error;
vp = ap->a_vp;
if (ap->a_fl->l_whence == SEEK_END) {
/*
* The NFSv4 server must avoid doing a vn_lock() here, since it
* can deadlock the nfsd threads, due to a LOR. Fortunately
* the NFSv4 server always uses SEEK_SET and this code is
* only required for the SEEK_END case.
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
VOP_UNLOCK(vp);
if (error)
return (error);
} else
vattr.va_size = 0;
return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
}
int
vop_stdadvlockasync(struct vop_advlockasync_args *ap)
{
struct vnode *vp;
struct vattr vattr;
int error;
vp = ap->a_vp;
if (ap->a_fl->l_whence == SEEK_END) {
/* The size argument is only needed for SEEK_END. */
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
VOP_UNLOCK(vp);
if (error)
return (error);
} else
vattr.va_size = 0;
return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
}
int
vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
{
struct vnode *vp;
vp = ap->a_vp;
lf_purgelocks(vp, &vp->v_lockf);
return (0);
}
/*
* vop_stdpathconf:
*
* Standard implementation of POSIX pathconf, to get information about limits
* for a filesystem.
* Override per filesystem for the case where the filesystem has smaller
* limits.
*/
int
vop_stdpathconf(ap)
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
int *a_retval;
} */ *ap;
{
switch (ap->a_name) {
case _PC_ASYNC_IO:
*ap->a_retval = _POSIX_ASYNCHRONOUS_IO;
return (0);
case _PC_PATH_MAX:
*ap->a_retval = PATH_MAX;
return (0);
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
case _PC_CAP_PRESENT:
case _PC_INF_PRESENT:
case _PC_MAC_PRESENT:
*ap->a_retval = 0;
return (0);
default:
return (EINVAL);
}
/* NOTREACHED */
}
/*
* Standard lock, unlock and islocked functions.
*/
int
vop_stdlock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
char *file;
int line;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
struct mtx *ilk;
ilk = VI_MTX(vp);
return (lockmgr_lock_flags(vp->v_vnlock, ap->a_flags,
&ilk->lock_object, ap->a_file, ap->a_line));
}
/* See above. */
int
vop_stdunlock(ap)
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
return (lockmgr_unlock(vp->v_vnlock));
}
/* See above. */
int
vop_stdislocked(ap)
struct vop_islocked_args /* {
struct vnode *a_vp;
} */ *ap;
{
return (lockstatus(ap->a_vp->v_vnlock));
}
/*
* Variants of the above set.
*
* Differences are:
* - shared locking disablement is not supported
* - v_vnlock pointer is not honored
*/
int
vop_lock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
char *file;
int line;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
int flags = ap->a_flags;
struct mtx *ilk;
MPASS(vp->v_vnlock == &vp->v_lock);
if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0))
goto other;
switch (flags & LK_TYPE_MASK) {
case LK_SHARED:
return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line));
case LK_EXCLUSIVE:
return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line));
}
other:
ilk = VI_MTX(vp);
return (lockmgr_lock_flags(&vp->v_lock, flags,
&ilk->lock_object, ap->a_file, ap->a_line));
}
int
vop_unlock(ap)
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
MPASS(vp->v_vnlock == &vp->v_lock);
return (lockmgr_unlock(&vp->v_lock));
}
int
vop_islocked(ap)
struct vop_islocked_args /* {
struct vnode *a_vp;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
MPASS(vp->v_vnlock == &vp->v_lock);
return (lockstatus(&vp->v_lock));
}
/*
* Return true for select/poll.
*/
int
vop_nopoll(ap)
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
if (ap->a_events & ~POLLSTANDARD)
return (POLLNVAL);
return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
* Implement poll for local filesystems that support it.
*/
int
vop_stdpoll(ap)
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
if (ap->a_events & ~POLLSTANDARD)
return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
* Return our mount point, as we will take charge of the writes.
*/
int
vop_stdgetwritemount(ap)
struct vop_getwritemount_args /* {
struct vnode *a_vp;
struct mount **a_mpp;
} */ *ap;
{
struct mount *mp;
struct mount_pcpu *mpcpu;
struct vnode *vp;
/*
* Note that having a reference does not prevent forced unmount from
* setting ->v_mount to NULL after the lock gets released. This is of
* no consequence for typical consumers (most notably vn_start_write)
* since in this case the vnode is VIRF_DOOMED. Unmount might have
* progressed far enough that its completion is only delayed by the
* reference obtained here. The consumer only needs to concern itself
* with releasing it.
*/
vp = ap->a_vp;
mp = vp->v_mount;
if (mp == NULL) {
*(ap->a_mpp) = NULL;
return (0);
}
if (vfs_op_thread_enter(mp, mpcpu)) {
if (mp == vp->v_mount) {
vfs_mp_count_add_pcpu(mpcpu, ref, 1);
vfs_op_thread_exit(mp, mpcpu);
} else {
vfs_op_thread_exit(mp, mpcpu);
mp = NULL;
}
} else {
MNT_ILOCK(mp);
if (mp == vp->v_mount) {
MNT_REF(mp);
MNT_IUNLOCK(mp);
} else {
MNT_IUNLOCK(mp);
mp = NULL;
}
}
*(ap->a_mpp) = mp;
return (0);
}
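/*
 * Illustrative consumer sketch, not part of this change: the reference
 * returned above is dropped with vfs_rel(), and a NULL mount simply means
 * there is nothing to release. The vnode "vp" below is hypothetical.
 */
#if 0
struct mount *mp;

if (VOP_GETWRITEMOUNT(vp, &mp) == 0 && mp != NULL) {
	/* ... use mp while the reference is held ... */
	vfs_rel(mp);
}
#endif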
/*
* If the file system doesn't implement VOP_BMAP, then return sensible defaults:
* - Return the vnode's bufobj instead of any underlying device's bufobj
* - Calculate the physical block number as if there were equal size
* consecutive blocks, but
* - Report no contiguous runs of blocks.
*/
int
vop_stdbmap(ap)
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct bufobj **a_bop;
daddr_t *a_bnp;
int *a_runp;
int *a_runb;
} */ *ap;
{
if (ap->a_bop != NULL)
*ap->a_bop = &ap->a_vp->v_bufobj;
if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
if (ap->a_runp != NULL)
*ap->a_runp = 0;
if (ap->a_runb != NULL)
*ap->a_runb = 0;
return (0);
}
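/*
 * Worked example, illustrative and not part of this change: with
 * mnt_stat.f_iosize = 16384 and DEV_BSIZE = 512, btodb(16384) == 32,
 * so logical block a_bn maps to *a_bnp == a_bn * 32. The file is thus
 * treated as a dense run of equal-size blocks on the vnode's own
 * bufobj, and *a_runp/*a_runb of 0 advertises no contiguous run on
 * either side.
 */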
int
vop_stdfsync(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
int a_waitfor;
struct thread *a_td;
} */ *ap;
{
return (vn_fsync_buf(ap->a_vp, ap->a_waitfor));
}
static int
vop_stdfdatasync(struct vop_fdatasync_args *ap)
{
return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td));
}
int
vop_stdfdatasync_buf(struct vop_fdatasync_args *ap)
{
return (vn_fsync_buf(ap->a_vp, MNT_WAIT));
}
/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
int
vop_stdgetpages(ap)
struct vop_getpages_args /* {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
int *a_rbehind;
int *a_rahead;
} */ *ap;
{
return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL);
}
static int
vop_stdgetpages_async(struct vop_getpages_async_args *ap)
{
int error;
error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
ap->a_rahead);
if (ap->a_iodone != NULL)
ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
return (error);
}
int
vop_stdkqfilter(struct vop_kqfilter_args *ap)
{
return vfs_kqfilter(ap);
}
/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
int
vop_stdputpages(ap)
struct vop_putpages_args /* {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
int a_sync;
int *a_rtvals;
} */ *ap;
{
return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
ap->a_sync, ap->a_rtvals);
}
int
vop_stdvptofh(struct vop_vptofh_args *ap)
{
return (EOPNOTSUPP);
}
int
vop_stdvptocnp(struct vop_vptocnp_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vnode **dvp = ap->a_vpp;
struct ucred *cred;
char *buf = ap->a_buf;
size_t *buflen = ap->a_buflen;
char *dirbuf, *cpos;
int i, error, eofflag, dirbuflen, flags, locked, len, covered;
off_t off;
ino_t fileno;
struct vattr va;
struct nameidata nd;
struct thread *td;
struct dirent *dp;
struct vnode *mvp;
i = *buflen;
error = 0;
covered = 0;
td = curthread;
cred = td->td_ucred;
if (vp->v_type != VDIR)
return (ENOENT);
error = VOP_GETATTR(vp, &va, cred);
if (error)
return (error);
VREF(vp);
locked = VOP_ISLOCKED(vp);
VOP_UNLOCK(vp);
NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
"..", vp, td);
flags = FREAD;
error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
if (error) {
vn_lock(vp, locked | LK_RETRY);
return (error);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
mvp = *dvp = nd.ni_vp;
if (vp->v_mount != (*dvp)->v_mount &&
((*dvp)->v_vflag & VV_ROOT) &&
((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
*dvp = (*dvp)->v_mount->mnt_vnodecovered;
VREF(mvp);
VOP_UNLOCK(mvp);
vn_close(mvp, FREAD, cred, td);
VREF(*dvp);
vn_lock(*dvp, LK_SHARED | LK_RETRY);
covered = 1;
}
fileno = va.va_fileid;
dirbuflen = DEV_BSIZE;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
if ((*dvp)->v_type != VDIR) {
error = ENOENT;
goto out;
}
off = 0;
len = 0;
do {
/* call VOP_READDIR of parent */
error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
&cpos, &len, &eofflag, td);
if (error)
goto out;
if ((dp->d_type != DT_WHT) &&
(dp->d_fileno == fileno)) {
if (covered) {
VOP_UNLOCK(*dvp);
vn_lock(mvp, LK_SHARED | LK_RETRY);
if (dirent_exists(mvp, dp->d_name, td)) {
error = ENOENT;
VOP_UNLOCK(mvp);
vn_lock(*dvp, LK_SHARED | LK_RETRY);
goto out;
}
VOP_UNLOCK(mvp);
vn_lock(*dvp, LK_SHARED | LK_RETRY);
}
i -= dp->d_namlen;
if (i < 0) {
error = ENOMEM;
goto out;
}
if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
error = ENOENT;
} else {
bcopy(dp->d_name, buf + i, dp->d_namlen);
error = 0;
}
goto out;
}
} while (len > 0 || !eofflag);
error = ENOENT;
out:
free(dirbuf, M_TEMP);
if (!error) {
*buflen = i;
vref(*dvp);
}
if (covered) {
vput(*dvp);
vrele(mvp);
} else {
VOP_UNLOCK(mvp);
vn_close(mvp, FREAD, cred, td);
}
vn_lock(vp, locked | LK_RETRY);
return (error);
}
int
vop_stdallocate(struct vop_allocate_args *ap)
{
#ifdef __notyet__
struct statfs *sfs;
off_t maxfilesize = 0;
#endif
struct iovec aiov;
struct vattr vattr, *vap;
struct uio auio;
off_t fsize, len, cur, offset;
uint8_t *buf;
struct thread *td;
struct vnode *vp;
size_t iosize;
int error;
buf = NULL;
error = 0;
td = curthread;
vap = &vattr;
vp = ap->a_vp;
len = *ap->a_len;
offset = *ap->a_offset;
error = VOP_GETATTR(vp, vap, td->td_ucred);
if (error != 0)
goto out;
fsize = vap->va_size;
iosize = vap->va_blocksize;
if (iosize == 0)
iosize = BLKDEV_IOSIZE;
- if (iosize > MAXPHYS)
- iosize = MAXPHYS;
+ if (iosize > maxphys)
+ iosize = maxphys;
buf = malloc(iosize, M_TEMP, M_WAITOK);
#ifdef __notyet__
/*
* Check if the filesystem sets f_maxfilesize; if not use
* VOP_SETATTR to perform the check.
*/
sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
error = VFS_STATFS(vp->v_mount, sfs, td);
if (error == 0)
maxfilesize = sfs->f_maxfilesize;
free(sfs, M_STATFS);
if (error != 0)
goto out;
if (maxfilesize) {
if (offset > maxfilesize || len > maxfilesize ||
offset + len > maxfilesize) {
error = EFBIG;
goto out;
}
} else
#endif
if (offset + len > vap->va_size) {
/*
* Test offset + len against the filesystem's maxfilesize.
*/
VATTR_NULL(vap);
vap->va_size = offset + len;
error = VOP_SETATTR(vp, vap, td->td_ucred);
if (error != 0)
goto out;
VATTR_NULL(vap);
vap->va_size = fsize;
error = VOP_SETATTR(vp, vap, td->td_ucred);
if (error != 0)
goto out;
}
for (;;) {
/*
* Read and write back anything below the nominal file
* size. There's currently no way outside the filesystem
* to know whether this area is sparse or not.
*/
cur = iosize;
if ((offset % iosize) != 0)
cur -= (offset % iosize);
if (cur > len)
cur = len;
if (offset < fsize) {
aiov.iov_base = buf;
aiov.iov_len = cur;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = offset;
auio.uio_resid = cur;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
error = VOP_READ(vp, &auio, 0, td->td_ucred);
if (error != 0)
break;
if (auio.uio_resid > 0) {
bzero(buf + cur - auio.uio_resid,
auio.uio_resid);
}
} else {
bzero(buf, cur);
}
aiov.iov_base = buf;
aiov.iov_len = cur;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = offset;
auio.uio_resid = cur;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
if (error != 0)
break;
len -= cur;
offset += cur;
if (len == 0)
break;
if (should_yield())
break;
}
out:
*ap->a_len = len;
*ap->a_offset = offset;
free(buf, M_TEMP);
return (error);
}
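/*
 * Worked example, illustrative and not part of this change: with
 * iosize = 65536, offset = 1000 and len = 200000, the loop issues
 * chunks of 64536 (realigning to the iosize boundary), 65536, 65536
 * and finally 4392 bytes. Chunks below fsize are read back and
 * rewritten unchanged, chunks beyond it are written as zeros, and
 * should_yield() lets the caller restart with the updated
 * *a_offset/*a_len.
 */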
int
vop_stdadvise(struct vop_advise_args *ap)
{
struct vnode *vp;
struct bufobj *bo;
daddr_t startn, endn;
off_t bstart, bend, start, end;
int bsize, error;
vp = ap->a_vp;
switch (ap->a_advice) {
case POSIX_FADV_WILLNEED:
/*
* Do nothing for now. Filesystems should provide a
* custom method which starts an asynchronous read of
* the requested region.
*/
error = 0;
break;
case POSIX_FADV_DONTNEED:
error = 0;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (VN_IS_DOOMED(vp)) {
VOP_UNLOCK(vp);
break;
}
/*
* Round to block boundaries (and later possibly further to
* page boundaries). Applications cannot reasonably be aware
* of the boundaries, and the rounding must expand at both
* extremities to cover enough. It still doesn't cover
* read-ahead. For partial blocks, this gives unnecessary
* discarding of buffers but is efficient enough since the
* pages usually remain in VMIO for some time.
*/
bsize = vp->v_bufobj.bo_bsize;
bstart = rounddown(ap->a_start, bsize);
bend = roundup(ap->a_end, bsize);
/*
* Deactivate pages in the specified range from the backing VM
* object. Pages that are resident in the buffer cache will
* remain wired until their corresponding buffers are released
* below.
*/
if (vp->v_object != NULL) {
start = trunc_page(bstart);
end = round_page(bend);
VM_OBJECT_RLOCK(vp->v_object);
vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
OFF_TO_IDX(end));
VM_OBJECT_RUNLOCK(vp->v_object);
}
bo = &vp->v_bufobj;
BO_RLOCK(bo);
startn = bstart / bsize;
endn = bend / bsize;
error = bnoreuselist(&bo->bo_clean, bo, startn, endn);
if (error == 0)
error = bnoreuselist(&bo->bo_dirty, bo, startn, endn);
BO_RUNLOCK(bo);
VOP_UNLOCK(vp);
break;
default:
error = EINVAL;
break;
}
return (error);
}
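/*
 * Worked example, illustrative and not part of this change, for
 * POSIX_FADV_DONTNEED: with bo_bsize = 32768, a_start = 100000 and
 * a_end = 200000, bstart = rounddown(100000, 32768) = 98304 and
 * bend = roundup(200000, 32768) = 229376, so buffers 3 through 7
 * (98304/32768 .. 229376/32768) are passed to bnoreuselist() and,
 * with PAGE_SIZE = 4096, pages 24 .. 56 (OFF_TO_IDX of 98304 and
 * 229376) are handed to vm_object_page_noreuse(). The discarded
 * region is a superset of the byte range the application named.
 */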
int
vop_stdunp_bind(struct vop_unp_bind_args *ap)
{
ap->a_vp->v_unpcb = ap->a_unpcb;
return (0);
}
int
vop_stdunp_connect(struct vop_unp_connect_args *ap)
{
*ap->a_unpcb = ap->a_vp->v_unpcb;
return (0);
}
int
vop_stdunp_detach(struct vop_unp_detach_args *ap)
{
ap->a_vp->v_unpcb = NULL;
return (0);
}
static int
vop_stdis_text(struct vop_is_text_args *ap)
{
return (ap->a_vp->v_writecount < 0);
}
int
vop_stdset_text(struct vop_set_text_args *ap)
{
struct vnode *vp;
struct mount *mp;
int error;
vp = ap->a_vp;
VI_LOCK(vp);
if (vp->v_writecount > 0) {
error = ETXTBSY;
} else {
/*
* If requested by fs, keep a use reference to the
* vnode until the last text reference is released.
*/
mp = vp->v_mount;
if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 &&
vp->v_writecount == 0) {
VNPASS((vp->v_iflag & VI_TEXT_REF) == 0, vp);
vp->v_iflag |= VI_TEXT_REF;
vrefl(vp);
}
vp->v_writecount--;
error = 0;
}
VI_UNLOCK(vp);
return (error);
}
static int
vop_stdunset_text(struct vop_unset_text_args *ap)
{
struct vnode *vp;
int error;
bool last;
vp = ap->a_vp;
last = false;
VI_LOCK(vp);
if (vp->v_writecount < 0) {
if ((vp->v_iflag & VI_TEXT_REF) != 0 &&
vp->v_writecount == -1) {
last = true;
vp->v_iflag &= ~VI_TEXT_REF;
}
vp->v_writecount++;
error = 0;
} else {
error = EINVAL;
}
VI_UNLOCK(vp);
if (last)
vunref(vp);
return (error);
}
static int
vop_stdadd_writecount(struct vop_add_writecount_args *ap)
{
struct vnode *vp;
struct mount *mp;
int error;
vp = ap->a_vp;
VI_LOCK_FLAGS(vp, MTX_DUPOK);
if (vp->v_writecount < 0) {
error = ETXTBSY;
} else {
VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
("neg writecount increment %d", ap->a_inc));
if (vp->v_writecount == 0) {
mp = vp->v_mount;
if (mp != NULL && (mp->mnt_kern_flag & MNTK_NOMSYNC) == 0)
vlazy(vp);
}
vp->v_writecount += ap->a_inc;
error = 0;
}
VI_UNLOCK(vp);
return (error);
}
int
vop_stdneed_inactive(struct vop_need_inactive_args *ap)
{
return (1);
}
int
vop_stdioctl(struct vop_ioctl_args *ap)
{
struct vnode *vp;
struct vattr va;
off_t *offp;
int error;
switch (ap->a_command) {
case FIOSEEKDATA:
case FIOSEEKHOLE:
vp = ap->a_vp;
error = vn_lock(vp, LK_SHARED);
if (error != 0)
return (EBADF);
if (vp->v_type == VREG)
error = VOP_GETATTR(vp, &va, ap->a_cred);
else
error = ENOTTY;
if (error == 0) {
offp = ap->a_data;
if (*offp < 0 || *offp >= va.va_size)
error = ENXIO;
else if (ap->a_command == FIOSEEKHOLE)
*offp = va.va_size;
}
VOP_UNLOCK(vp);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* vfs default ops
* used to fill the vfs function table to get reasonable default return values.
*/
int
vfs_stdroot (mp, flags, vpp)
struct mount *mp;
int flags;
struct vnode **vpp;
{
return (EOPNOTSUPP);
}
int
vfs_stdstatfs (mp, sbp)
struct mount *mp;
struct statfs *sbp;
{
return (EOPNOTSUPP);
}
int
vfs_stdquotactl (mp, cmds, uid, arg)
struct mount *mp;
int cmds;
uid_t uid;
void *arg;
{
return (EOPNOTSUPP);
}
int
vfs_stdsync(mp, waitfor)
struct mount *mp;
int waitfor;
{
struct vnode *vp, *mvp;
struct thread *td;
int error, lockreq, allerror = 0;
td = curthread;
lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
if (waitfor != MNT_WAIT)
lockreq |= LK_NOWAIT;
/*
* Force stale buffer cache information to be flushed.
*/
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
VI_UNLOCK(vp);
continue;
}
if ((error = vget(vp, lockreq)) != 0) {
if (error == ENOENT) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
continue;
}
error = VOP_FSYNC(vp, waitfor, td);
if (error)
allerror = error;
vput(vp);
}
return (allerror);
}
int
vfs_stdnosync (mp, waitfor)
struct mount *mp;
int waitfor;
{
return (0);
}
static int
vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
{
int error;
error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
ap->a_outcred, ap->a_fsizetd);
return (error);
}
int
vfs_stdvget (mp, ino, flags, vpp)
struct mount *mp;
ino_t ino;
int flags;
struct vnode **vpp;
{
return (EOPNOTSUPP);
}
int
vfs_stdfhtovp (mp, fhp, flags, vpp)
struct mount *mp;
struct fid *fhp;
int flags;
struct vnode **vpp;
{
return (EOPNOTSUPP);
}
int
vfs_stdinit (vfsp)
struct vfsconf *vfsp;
{
return (0);
}
int
vfs_stduninit (vfsp)
struct vfsconf *vfsp;
{
return(0);
}
int
vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)
struct mount *mp;
int cmd;
struct vnode *filename_vp;
int attrnamespace;
const char *attrname;
{
if (filename_vp != NULL)
VOP_UNLOCK(filename_vp);
return (EOPNOTSUPP);
}
int
vfs_stdsysctl(mp, op, req)
struct mount *mp;
fsctlop_t op;
struct sysctl_req *req;
{
return (EOPNOTSUPP);
}
static vop_bypass_t *
bp_by_off(struct vop_vector *vop, struct vop_generic_args *a)
{
return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset));
}
int
vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a)
{
vop_bypass_t *bp;
int prev_stops, rc;
bp = bp_by_off(vop, a);
MPASS(bp != NULL);
prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT);
rc = bp(a);
sigallowstop(prev_stops);
return (rc);
}
static int
vop_stdstat(struct vop_stat_args *a)
{
struct vattr vattr;
struct vattr *vap;
struct vnode *vp;
struct stat *sb;
int error;
u_short mode;
vp = a->a_vp;
sb = a->a_sb;
error = vop_stat_helper_pre(a);
if (error != 0)
return (error);
vap = &vattr;
/*
* Initialize defaults for new and unusual fields, so that file
* systems which don't support these fields don't need to know
* about them.
*/
vap->va_birthtime.tv_sec = -1;
vap->va_birthtime.tv_nsec = 0;
vap->va_fsid = VNOVAL;
vap->va_rdev = NODEV;
error = VOP_GETATTR(vp, vap, a->a_active_cred);
if (error)
goto out;
/*
* Zero the spare stat fields
*/
bzero(sb, sizeof *sb);
/*
* Copy from vattr table
*/
if (vap->va_fsid != VNOVAL)
sb->st_dev = vap->va_fsid;
else
sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
sb->st_ino = vap->va_fileid;
mode = vap->va_mode;
switch (vap->va_type) {
case VREG:
mode |= S_IFREG;
break;
case VDIR:
mode |= S_IFDIR;
break;
case VBLK:
mode |= S_IFBLK;
break;
case VCHR:
mode |= S_IFCHR;
break;
case VLNK:
mode |= S_IFLNK;
break;
case VSOCK:
mode |= S_IFSOCK;
break;
case VFIFO:
mode |= S_IFIFO;
break;
default:
error = EBADF;
goto out;
}
sb->st_mode = mode;
sb->st_nlink = vap->va_nlink;
sb->st_uid = vap->va_uid;
sb->st_gid = vap->va_gid;
sb->st_rdev = vap->va_rdev;
if (vap->va_size > OFF_MAX) {
error = EOVERFLOW;
goto out;
}
sb->st_size = vap->va_size;
sb->st_atim.tv_sec = vap->va_atime.tv_sec;
sb->st_atim.tv_nsec = vap->va_atime.tv_nsec;
sb->st_mtim.tv_sec = vap->va_mtime.tv_sec;
sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec;
sb->st_ctim.tv_sec = vap->va_ctime.tv_sec;
sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec;
sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec;
sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec;
/*
* According to www.opengroup.org, the meaning of st_blksize is
* "a filesystem-specific preferred I/O block size for this
* object. In some filesystem types, this may vary from file
* to file"
* Use minimum/default of PAGE_SIZE (e.g. for VCHR).
*/
sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
sb->st_flags = vap->va_flags;
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
sb->st_gen = vap->va_gen;
out:
return (vop_stat_helper_post(a, error));
}
static int
vop_stdread_pgcache(struct vop_read_pgcache_args *ap __unused)
{
return (EJUSTRETURN);
}
diff --git a/sys/mips/ingenic/jz4780_mmc.c b/sys/mips/ingenic/jz4780_mmc.c
index e6359e03435f..f7622f4709a1 100644
--- a/sys/mips/ingenic/jz4780_mmc.c
+++ b/sys/mips/ingenic/jz4780_mmc.c
@@ -1,1004 +1,1004 @@
/*-
* Copyright (c) 2015 Alexander Kabaev <kan@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/resource.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <dev/extres/clk/clk.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcbrvar.h>
#include <mips/ingenic/jz4780_regs.h>
#undef JZ_MMC_DEBUG
#define JZ_MSC_MEMRES 0
#define JZ_MSC_IRQRES 1
#define JZ_MSC_RESSZ 2
#define JZ_MSC_DMA_SEGS 128
-#define JZ_MSC_DMA_MAX_SIZE MAXPHYS
+#define JZ_MSC_DMA_MAX_SIZE maxphys
#define JZ_MSC_INT_ERR_BITS (JZ_INT_CRC_RES_ERR | JZ_INT_CRC_READ_ERR | \
JZ_INT_CRC_WRITE_ERR | JZ_INT_TIMEOUT_RES | \
JZ_INT_TIMEOUT_READ)
static int jz4780_mmc_pio_mode = 0;
TUNABLE_INT("hw.jz.mmc.pio_mode", &jz4780_mmc_pio_mode);
struct jz4780_mmc_dma_desc {
uint32_t dma_next;
uint32_t dma_phys;
uint32_t dma_len;
uint32_t dma_cmd;
};
struct jz4780_mmc_softc {
bus_space_handle_t sc_bsh;
bus_space_tag_t sc_bst;
device_t sc_dev;
clk_t sc_clk;
int sc_bus_busy;
int sc_resid;
int sc_timeout;
struct callout sc_timeoutc;
struct mmc_host sc_host;
struct mmc_request * sc_req;
struct mtx sc_mtx;
struct resource * sc_res[JZ_MSC_RESSZ];
uint32_t sc_intr_seen;
uint32_t sc_intr_mask;
uint32_t sc_intr_wait;
void * sc_intrhand;
uint32_t sc_cmdat;
/* Fields required for DMA access. */
bus_addr_t sc_dma_desc_phys;
bus_dmamap_t sc_dma_map;
bus_dma_tag_t sc_dma_tag;
void * sc_dma_desc;
bus_dmamap_t sc_dma_buf_map;
bus_dma_tag_t sc_dma_buf_tag;
int sc_dma_inuse;
int sc_dma_map_err;
uint32_t sc_dma_ctl;
};
static struct resource_spec jz4780_mmc_res_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE },
{ -1, 0, 0 }
};
static int jz4780_mmc_probe(device_t);
static int jz4780_mmc_attach(device_t);
static int jz4780_mmc_detach(device_t);
static int jz4780_mmc_setup_dma(struct jz4780_mmc_softc *);
static int jz4780_mmc_reset(struct jz4780_mmc_softc *);
static void jz4780_mmc_intr(void *);
static int jz4780_mmc_enable_clock(struct jz4780_mmc_softc *);
static int jz4780_mmc_config_clock(struct jz4780_mmc_softc *, uint32_t);
static int jz4780_mmc_update_ios(device_t, device_t);
static int jz4780_mmc_request(device_t, device_t, struct mmc_request *);
static int jz4780_mmc_get_ro(device_t, device_t);
static int jz4780_mmc_acquire_host(device_t, device_t);
static int jz4780_mmc_release_host(device_t, device_t);
#define JZ_MMC_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define JZ_MMC_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define JZ_MMC_READ_2(_sc, _reg) \
bus_space_read_2((_sc)->sc_bst, (_sc)->sc_bsh, _reg)
#define JZ_MMC_WRITE_2(_sc, _reg, _value) \
bus_space_write_2((_sc)->sc_bst, (_sc)->sc_bsh, _reg, _value)
#define JZ_MMC_READ_4(_sc, _reg) \
bus_space_read_4((_sc)->sc_bst, (_sc)->sc_bsh, _reg)
#define JZ_MMC_WRITE_4(_sc, _reg, _value) \
bus_space_write_4((_sc)->sc_bst, (_sc)->sc_bsh, _reg, _value)
static int
jz4780_mmc_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (!ofw_bus_is_compatible(dev, "ingenic,jz4780-mmc"))
return (ENXIO);
if (device_get_unit(dev) > 0) /* XXXKAN */
return (ENXIO);
device_set_desc(dev, "Ingenic JZ4780 Integrated MMC/SD controller");
return (BUS_PROBE_DEFAULT);
}
static int
jz4780_mmc_attach(device_t dev)
{
struct jz4780_mmc_softc *sc;
struct sysctl_ctx_list *ctx;
struct sysctl_oid_list *tree;
device_t child;
ssize_t len;
pcell_t prop;
phandle_t node;
sc = device_get_softc(dev);
sc->sc_dev = dev;
sc->sc_req = NULL;
if (bus_alloc_resources(dev, jz4780_mmc_res_spec, sc->sc_res) != 0) {
device_printf(dev, "cannot allocate device resources\n");
return (ENXIO);
}
sc->sc_bst = rman_get_bustag(sc->sc_res[JZ_MSC_MEMRES]);
sc->sc_bsh = rman_get_bushandle(sc->sc_res[JZ_MSC_MEMRES]);
if (bus_setup_intr(dev, sc->sc_res[JZ_MSC_IRQRES],
INTR_TYPE_MISC | INTR_MPSAFE, NULL, jz4780_mmc_intr, sc,
&sc->sc_intrhand)) {
bus_release_resources(dev, jz4780_mmc_res_spec, sc->sc_res);
device_printf(dev, "cannot setup interrupt handler\n");
return (ENXIO);
}
sc->sc_timeout = 10;
ctx = device_get_sysctl_ctx(dev);
tree = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
SYSCTL_ADD_INT(ctx, tree, OID_AUTO, "req_timeout", CTLFLAG_RW,
&sc->sc_timeout, 0, "Request timeout in seconds");
mtx_init(&sc->sc_mtx, device_get_nameunit(sc->sc_dev), "jz4780_mmc",
MTX_DEF);
callout_init_mtx(&sc->sc_timeoutc, &sc->sc_mtx, 0);
/* Reset controller. */
if (jz4780_mmc_reset(sc) != 0) {
device_printf(dev, "cannot reset the controller\n");
goto fail;
}
if (jz4780_mmc_pio_mode == 0 && jz4780_mmc_setup_dma(sc) != 0) {
device_printf(sc->sc_dev, "Couldn't setup DMA!\n");
jz4780_mmc_pio_mode = 1;
}
if (bootverbose)
device_printf(sc->sc_dev, "DMA status: %s\n",
jz4780_mmc_pio_mode ? "disabled" : "enabled");
node = ofw_bus_get_node(dev);
/* Determine max operating frequency */
sc->sc_host.f_max = 24000000;
len = OF_getencprop(node, "max-frequency", &prop, sizeof(prop));
if (len / sizeof(prop) == 1)
sc->sc_host.f_max = prop;
sc->sc_host.f_min = sc->sc_host.f_max / 128;
sc->sc_host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340;
sc->sc_host.caps = MMC_CAP_HSPEED;
sc->sc_host.mode = mode_sd;
/*
* Check for bus-width property, default to both 4 and 8 bit
* if no bus width is specified.
*/
len = OF_getencprop(node, "bus-width", &prop, sizeof(prop));
if (len / sizeof(prop) != 1)
sc->sc_host.caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA;
else if (prop == 8)
sc->sc_host.caps |= MMC_CAP_8_BIT_DATA;
else if (prop == 4)
sc->sc_host.caps |= MMC_CAP_4_BIT_DATA;
/* Activate the module clock. */
if (jz4780_mmc_enable_clock(sc) != 0) {
device_printf(dev, "cannot activate mmc clock\n");
goto fail;
}
child = device_add_child(dev, "mmc", -1);
if (child == NULL) {
device_printf(dev, "attaching MMC bus failed!\n");
goto fail;
}
if (device_probe_and_attach(child) != 0) {
device_printf(dev, "attaching MMC child failed!\n");
device_delete_child(dev, child);
goto fail;
}
return (0);
fail:
callout_drain(&sc->sc_timeoutc);
mtx_destroy(&sc->sc_mtx);
bus_teardown_intr(dev, sc->sc_res[JZ_MSC_IRQRES], sc->sc_intrhand);
bus_release_resources(dev, jz4780_mmc_res_spec, sc->sc_res);
if (sc->sc_clk != NULL)
clk_release(sc->sc_clk);
return (ENXIO);
}
static int
jz4780_mmc_detach(device_t dev)
{
return (EBUSY);
}
static int
jz4780_mmc_enable_clock(struct jz4780_mmc_softc *sc)
{
int err;
err = clk_get_by_ofw_name(sc->sc_dev, 0, "mmc", &sc->sc_clk);
if (err == 0)
err = clk_enable(sc->sc_clk);
if (err == 0)
err = clk_set_freq(sc->sc_clk, sc->sc_host.f_max, 0);
if (err != 0)
clk_release(sc->sc_clk);
return (err);
}
static void
jz4780_mmc_dma_desc_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
{
struct jz4780_mmc_softc *sc;
sc = (struct jz4780_mmc_softc *)arg;
if (err) {
sc->sc_dma_map_err = err;
return;
}
sc->sc_dma_desc_phys = segs[0].ds_addr;
}
static int
jz4780_mmc_setup_dma(struct jz4780_mmc_softc *sc)
{
int dma_desc_size, error;
/* Allocate the DMA descriptor memory. */
dma_desc_size = sizeof(struct jz4780_mmc_dma_desc) * JZ_MSC_DMA_SEGS;
error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 1, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
dma_desc_size, 1, dma_desc_size, 0, NULL, NULL, &sc->sc_dma_tag);
if (error)
return (error);
error = bus_dmamem_alloc(sc->sc_dma_tag, &sc->sc_dma_desc,
BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->sc_dma_map);
if (error)
return (error);
error = bus_dmamap_load(sc->sc_dma_tag, sc->sc_dma_map,
sc->sc_dma_desc, dma_desc_size, jz4780_mmc_dma_desc_cb, sc, 0);
if (error)
return (error);
if (sc->sc_dma_map_err)
return (sc->sc_dma_map_err);
/* Create the DMA map for data transfers. */
error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 1, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
JZ_MSC_DMA_MAX_SIZE * JZ_MSC_DMA_SEGS, JZ_MSC_DMA_SEGS,
JZ_MSC_DMA_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL,
&sc->sc_dma_buf_tag);
if (error)
return (error);
error = bus_dmamap_create(sc->sc_dma_buf_tag, 0,
&sc->sc_dma_buf_map);
if (error)
return (error);
return (0);
}
static void
jz4780_mmc_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
{
struct jz4780_mmc_dma_desc *dma_desc;
struct jz4780_mmc_softc *sc;
uint32_t dma_desc_phys;
int i;
sc = (struct jz4780_mmc_softc *)arg;
sc->sc_dma_map_err = err;
dma_desc = sc->sc_dma_desc;
dma_desc_phys = sc->sc_dma_desc_phys;
/* Note nsegs is guaranteed to be zero if err is non-zero. */
for (i = 0; i < nsegs; i++) {
dma_desc[i].dma_phys = segs[i].ds_addr;
dma_desc[i].dma_len = segs[i].ds_len;
if (i < (nsegs - 1)) {
dma_desc_phys += sizeof(struct jz4780_mmc_dma_desc);
dma_desc[i].dma_next = dma_desc_phys;
dma_desc[i].dma_cmd = (i << 16) | JZ_DMA_LINK;
} else {
dma_desc[i].dma_next = 0;
dma_desc[i].dma_cmd = (i << 16) | JZ_DMA_ENDI;
}
#ifdef JZ_MMC_DEBUG
device_printf(sc->sc_dev, "%d: desc %#x phys %#x len %d next %#x cmd %#x\n",
i, dma_desc_phys - sizeof(struct jz4780_mmc_dma_desc),
dma_desc[i].dma_phys, dma_desc[i].dma_len,
dma_desc[i].dma_next, dma_desc[i].dma_cmd);
#endif
}
}
static int
jz4780_mmc_prepare_dma(struct jz4780_mmc_softc *sc)
{
bus_dmasync_op_t sync_op;
int error;
struct mmc_command *cmd;
uint32_t off;
cmd = sc->sc_req->cmd;
if (cmd->data->len > JZ_MSC_DMA_MAX_SIZE * JZ_MSC_DMA_SEGS)
return (EFBIG);
error = bus_dmamap_load(sc->sc_dma_buf_tag, sc->sc_dma_buf_map,
cmd->data->data, cmd->data->len, jz4780_mmc_dma_cb, sc,
BUS_DMA_NOWAIT);
if (error)
return (error);
if (sc->sc_dma_map_err)
return (sc->sc_dma_map_err);
sc->sc_dma_inuse = 1;
if (cmd->data->flags & MMC_DATA_WRITE)
sync_op = BUS_DMASYNC_PREWRITE;
else
sync_op = BUS_DMASYNC_PREREAD;
bus_dmamap_sync(sc->sc_dma_buf_tag, sc->sc_dma_buf_map, sync_op);
bus_dmamap_sync(sc->sc_dma_tag, sc->sc_dma_map, BUS_DMASYNC_PREWRITE);
/* Configure default DMA parameters */
sc->sc_dma_ctl = JZ_MODE_SEL | JZ_INCR_64 | JZ_DMAEN;
/* Enable unaligned buffer handling */
off = (uintptr_t)cmd->data->data & 3;
if (off != 0)
sc->sc_dma_ctl |= (off << JZ_AOFST_S) | JZ_ALIGNEN;
return (0);
}
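/*
 * Illustrative note, not part of this change: the controller can
 * compensate for data buffers that are not 4-byte aligned. For
 * example, a buffer whose address ends in ...2 gives off = 2, so the
 * DMAC control word gets JZ_ALIGNEN plus an offset of 2 in its
 * JZ_AOFST field; an aligned buffer leaves both cleared and the
 * transfer runs in the default mode.
 */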
static void
jz4780_mmc_start_dma(struct jz4780_mmc_softc *sc)
{
/* Set the address of the first descriptor */
JZ_MMC_WRITE_4(sc, JZ_MSC_DMANDA, sc->sc_dma_desc_phys);
/* Enable and start the dma engine */
JZ_MMC_WRITE_4(sc, JZ_MSC_DMAC, sc->sc_dma_ctl);
}
static int
jz4780_mmc_reset(struct jz4780_mmc_softc *sc)
{
int timeout;
/* Stop the clock */
JZ_MMC_WRITE_4(sc, JZ_MSC_CTRL, JZ_CLOCK_STOP);
timeout = 1000;
while (--timeout > 0) {
if ((JZ_MMC_READ_4(sc, JZ_MSC_STAT) & JZ_CLK_EN) == 0)
break;
DELAY(100);
}
if (timeout == 0) {
device_printf(sc->sc_dev, "Failed to stop clk.\n");
return (ETIMEDOUT);
}
/* Reset */
JZ_MMC_WRITE_4(sc, JZ_MSC_CTRL, JZ_RESET);
timeout = 10;
while (--timeout > 0) {
if ((JZ_MMC_READ_4(sc, JZ_MSC_STAT) & JZ_IS_RESETTING) == 0)
break;
DELAY(1000);
}
if (timeout == 0) {
/*
* The X1000 never clears the resetting bit.
* Ignore for now.
*/
}
/* Set the timeouts. */
JZ_MMC_WRITE_4(sc, JZ_MSC_RESTO, 0xffff);
JZ_MMC_WRITE_4(sc, JZ_MSC_RDTO, 0xffffffff);
/* Mask all interrupt initially */
JZ_MMC_WRITE_4(sc, JZ_MSC_IMASK, 0xffffffff);
/* Clear pending interrupts. */
JZ_MMC_WRITE_4(sc, JZ_MSC_IFLG, 0xffffffff);
/* Remember interrupts we always want */
sc->sc_intr_mask = JZ_MSC_INT_ERR_BITS;
return (0);
}
static void
jz4780_mmc_req_done(struct jz4780_mmc_softc *sc)
{
struct mmc_command *cmd;
struct mmc_request *req;
bus_dmasync_op_t sync_op;
cmd = sc->sc_req->cmd;
/* Reset the controller in case of errors */
if (cmd->error != MMC_ERR_NONE)
jz4780_mmc_reset(sc);
/* Unmap DMA if necessary */
if (sc->sc_dma_inuse == 1) {
if (cmd->data->flags & MMC_DATA_WRITE)
sync_op = BUS_DMASYNC_POSTWRITE;
else
sync_op = BUS_DMASYNC_POSTREAD;
bus_dmamap_sync(sc->sc_dma_buf_tag, sc->sc_dma_buf_map,
sync_op);
bus_dmamap_sync(sc->sc_dma_tag, sc->sc_dma_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->sc_dma_buf_tag, sc->sc_dma_buf_map);
}
req = sc->sc_req;
callout_stop(&sc->sc_timeoutc);
sc->sc_req = NULL;
sc->sc_resid = 0;
sc->sc_dma_inuse = 0;
sc->sc_dma_map_err = 0;
sc->sc_intr_wait = 0;
sc->sc_intr_seen = 0;
req->done(req);
}
static void
jz4780_mmc_read_response(struct jz4780_mmc_softc *sc)
{
struct mmc_command *cmd;
int i;
cmd = sc->sc_req->cmd;
if (cmd->flags & MMC_RSP_PRESENT) {
if (cmd->flags & MMC_RSP_136) {
uint16_t val;
val = JZ_MMC_READ_2(sc, JZ_MSC_RES);
for (i = 0; i < 4; i++) {
cmd->resp[i] = val << 24;
val = JZ_MMC_READ_2(sc, JZ_MSC_RES);
cmd->resp[i] |= val << 8;
val = JZ_MMC_READ_2(sc, JZ_MSC_RES);
cmd->resp[i] |= val >> 8;
}
} else {
cmd->resp[0] = JZ_MMC_READ_2(sc, JZ_MSC_RES) << 24;
cmd->resp[0] |= JZ_MMC_READ_2(sc, JZ_MSC_RES) << 8;
cmd->resp[0] |= JZ_MMC_READ_2(sc, JZ_MSC_RES) & 0xff;
}
}
}
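/*
 * Illustrative note, not part of this change: the response FIFO is
 * 16 bits wide, so each 32-bit response word above is reassembled from
 * overlapping halfword reads. For the short response, if the three
 * halfwords read are h0, h1 and h2, then resp[0] ends up as
 * ((h0 & 0xff) << 24) | (h1 << 8) | (h2 & 0xff): bits 31:24 come from
 * the low byte of h0, bits 23:8 from h1, and bits 7:0 from the low
 * byte of h2.
 */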
static void
jz4780_mmc_req_ok(struct jz4780_mmc_softc *sc)
{
struct mmc_command *cmd;
cmd = sc->sc_req->cmd;
/* Has all the data been transferred? */
if (cmd->data != NULL && (sc->sc_resid << 2) < cmd->data->len)
cmd->error = MMC_ERR_FAILED;
jz4780_mmc_req_done(sc);
}
static void
jz4780_mmc_timeout(void *arg)
{
struct jz4780_mmc_softc *sc;
sc = (struct jz4780_mmc_softc *)arg;
if (sc->sc_req != NULL) {
device_printf(sc->sc_dev, "controller timeout, rint %#x stat %#x\n",
JZ_MMC_READ_4(sc, JZ_MSC_IFLG), JZ_MMC_READ_4(sc, JZ_MSC_STAT));
sc->sc_req->cmd->error = MMC_ERR_TIMEOUT;
jz4780_mmc_req_done(sc);
} else
device_printf(sc->sc_dev,
"Spurious timeout - no active request\n");
}
static int
jz4780_mmc_pio_transfer(struct jz4780_mmc_softc *sc, struct mmc_data *data)
{
uint32_t mask, *buf;
int i, write;
buf = (uint32_t *)data->data;
write = (data->flags & MMC_DATA_WRITE) ? 1 : 0;
mask = write ? JZ_DATA_FIFO_FULL : JZ_DATA_FIFO_EMPTY;
for (i = sc->sc_resid; i < (data->len >> 2); i++) {
if ((JZ_MMC_READ_4(sc, JZ_MSC_STAT) & mask))
return (1);
if (write)
JZ_MMC_WRITE_4(sc, JZ_MSC_TXFIFO, buf[i]);
else
buf[i] = JZ_MMC_READ_4(sc, JZ_MSC_RXFIFO);
sc->sc_resid = i + 1;
}
/* Done with pio transfer, shut FIFO interrupts down */
mask = JZ_MMC_READ_4(sc, JZ_MSC_IMASK);
mask |= (JZ_INT_TXFIFO_WR_REQ | JZ_INT_RXFIFO_RD_REQ);
JZ_MMC_WRITE_4(sc, JZ_MSC_IMASK, mask);
return (0);
}
static void
jz4780_mmc_intr(void *arg)
{
struct jz4780_mmc_softc *sc;
struct mmc_data *data;
uint32_t rint;
sc = (struct jz4780_mmc_softc *)arg;
JZ_MMC_LOCK(sc);
rint = JZ_MMC_READ_4(sc, JZ_MSC_IFLG);
#if defined(JZ_MMC_DEBUG)
device_printf(sc->sc_dev, "rint: %#x, stat: %#x\n",
rint, JZ_MMC_READ_4(sc, JZ_MSC_STAT));
if (sc->sc_dma_inuse == 1 && (sc->sc_intr_seen & JZ_INT_DMAEND) == 0)
device_printf(sc->sc_dev, "\tdmada %#x dmanext %#x dmac %#x"
" dmalen %d dmacmd %#x\n",
JZ_MMC_READ_4(sc, JZ_MSC_DMADA),
JZ_MMC_READ_4(sc, JZ_MSC_DMANDA),
JZ_MMC_READ_4(sc, JZ_MSC_DMAC),
JZ_MMC_READ_4(sc, JZ_MSC_DMALEN),
JZ_MMC_READ_4(sc, JZ_MSC_DMACMD));
#endif
if (sc->sc_req == NULL) {
device_printf(sc->sc_dev,
"Spurious interrupt - no active request, rint: 0x%08X\n",
rint);
goto end;
}
if (rint & JZ_MSC_INT_ERR_BITS) {
#if defined(JZ_MMC_DEBUG)
device_printf(sc->sc_dev, "controller error, rint %#x stat %#x\n",
rint, JZ_MMC_READ_4(sc, JZ_MSC_STAT));
#endif
if (rint & (JZ_INT_TIMEOUT_RES | JZ_INT_TIMEOUT_READ))
sc->sc_req->cmd->error = MMC_ERR_TIMEOUT;
else
sc->sc_req->cmd->error = MMC_ERR_FAILED;
jz4780_mmc_req_done(sc);
goto end;
}
data = sc->sc_req->cmd->data;
/* Check for command response */
if (rint & JZ_INT_END_CMD_RES) {
jz4780_mmc_read_response(sc);
if (sc->sc_dma_inuse == 1)
jz4780_mmc_start_dma(sc);
}
if (data != NULL) {
if (sc->sc_dma_inuse == 1 && (rint & JZ_INT_DMAEND))
sc->sc_resid = data->len >> 2;
else if (sc->sc_dma_inuse == 0 &&
(rint & (JZ_INT_TXFIFO_WR_REQ | JZ_INT_RXFIFO_RD_REQ)))
jz4780_mmc_pio_transfer(sc, data);
}
sc->sc_intr_seen |= rint;
if ((sc->sc_intr_seen & sc->sc_intr_wait) == sc->sc_intr_wait)
jz4780_mmc_req_ok(sc);
end:
JZ_MMC_WRITE_4(sc, JZ_MSC_IFLG, rint);
JZ_MMC_UNLOCK(sc);
}
static int
jz4780_mmc_request(device_t bus, device_t child, struct mmc_request *req)
{
struct jz4780_mmc_softc *sc;
struct mmc_command *cmd;
uint32_t cmdat, iwait;
int blksz;
sc = device_get_softc(bus);
JZ_MMC_LOCK(sc);
if (sc->sc_req != NULL) {
JZ_MMC_UNLOCK(sc);
return (EBUSY);
}
/* Start with template value */
cmdat = sc->sc_cmdat;
iwait = JZ_INT_END_CMD_RES;
/* Configure response format */
cmd = req->cmd;
switch (MMC_RSP(cmd->flags)) {
case MMC_RSP_R1:
case MMC_RSP_R1B:
cmdat |= JZ_RES_R1;
break;
case MMC_RSP_R2:
cmdat |= JZ_RES_R2;
break;
case MMC_RSP_R3:
cmdat |= JZ_RES_R3;
break;
};
if (cmd->opcode == MMC_GO_IDLE_STATE)
cmdat |= JZ_INIT;
if (cmd->flags & MMC_RSP_BUSY) {
cmdat |= JZ_BUSY;
iwait |= JZ_INT_PRG_DONE;
}
sc->sc_req = req;
sc->sc_resid = 0;
cmd->error = MMC_ERR_NONE;
if (cmd->data != NULL) {
cmdat |= JZ_DATA_EN;
if (cmd->data->flags & MMC_DATA_MULTI) {
cmdat |= JZ_AUTO_CMD12;
iwait |= JZ_INT_AUTO_CMD12_DONE;
}
if (cmd->data->flags & MMC_DATA_WRITE) {
cmdat |= JZ_WRITE;
iwait |= JZ_INT_PRG_DONE;
}
if (cmd->data->flags & MMC_DATA_STREAM)
cmdat |= JZ_STREAM;
else
iwait |= JZ_INT_DATA_TRAN_DONE;
blksz = min(cmd->data->len, MMC_SECTOR_SIZE);
JZ_MMC_WRITE_4(sc, JZ_MSC_BLKLEN, blksz);
JZ_MMC_WRITE_4(sc, JZ_MSC_NOB, cmd->data->len / blksz);
/* Attempt to setup DMA for this transaction */
if (jz4780_mmc_pio_mode == 0)
jz4780_mmc_prepare_dma(sc);
if (sc->sc_dma_inuse != 0) {
/* Wait for DMA completion interrupt */
iwait |= JZ_INT_DMAEND;
} else {
iwait |= (cmd->data->flags & MMC_DATA_WRITE) ?
JZ_INT_TXFIFO_WR_REQ : JZ_INT_RXFIFO_RD_REQ;
JZ_MMC_WRITE_4(sc, JZ_MSC_DMAC, 0);
}
}
sc->sc_intr_seen = 0;
sc->sc_intr_wait = iwait;
JZ_MMC_WRITE_4(sc, JZ_MSC_IMASK, ~(sc->sc_intr_mask | iwait));
#if defined(JZ_MMC_DEBUG)
device_printf(sc->sc_dev,
"REQUEST: CMD%u arg %#x flags %#x cmdat %#x sc_intr_wait = %#x\n",
cmd->opcode, cmd->arg, cmd->flags, cmdat, sc->sc_intr_wait);
#endif
JZ_MMC_WRITE_4(sc, JZ_MSC_ARG, cmd->arg);
JZ_MMC_WRITE_4(sc, JZ_MSC_CMD, cmd->opcode);
JZ_MMC_WRITE_4(sc, JZ_MSC_CMDAT, cmdat);
JZ_MMC_WRITE_4(sc, JZ_MSC_CTRL, JZ_START_OP | JZ_CLOCK_START);
callout_reset(&sc->sc_timeoutc, sc->sc_timeout * hz,
jz4780_mmc_timeout, sc);
JZ_MMC_UNLOCK(sc);
return (0);
}
static int
jz4780_mmc_read_ivar(device_t bus, device_t child, int which,
uintptr_t *result)
{
struct jz4780_mmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
*(int *)result = sc->sc_host.ios.bus_mode;
break;
case MMCBR_IVAR_BUS_WIDTH:
*(int *)result = sc->sc_host.ios.bus_width;
break;
case MMCBR_IVAR_CHIP_SELECT:
*(int *)result = sc->sc_host.ios.chip_select;
break;
case MMCBR_IVAR_CLOCK:
*(int *)result = sc->sc_host.ios.clock;
break;
case MMCBR_IVAR_F_MIN:
*(int *)result = sc->sc_host.f_min;
break;
case MMCBR_IVAR_F_MAX:
*(int *)result = sc->sc_host.f_max;
break;
case MMCBR_IVAR_HOST_OCR:
*(int *)result = sc->sc_host.host_ocr;
break;
case MMCBR_IVAR_MODE:
*(int *)result = sc->sc_host.mode;
break;
case MMCBR_IVAR_OCR:
*(int *)result = sc->sc_host.ocr;
break;
case MMCBR_IVAR_POWER_MODE:
*(int *)result = sc->sc_host.ios.power_mode;
break;
case MMCBR_IVAR_RETUNE_REQ:
*(int *)result = retune_req_none;
break;
case MMCBR_IVAR_VDD:
*(int *)result = sc->sc_host.ios.vdd;
break;
case MMCBR_IVAR_VCCQ:
*result = sc->sc_host.ios.vccq;
break;
case MMCBR_IVAR_CAPS:
*(int *)result = sc->sc_host.caps;
break;
case MMCBR_IVAR_TIMING:
*(int *)result = sc->sc_host.ios.timing;
break;
case MMCBR_IVAR_MAX_DATA:
*(int *)result = 65535;
break;
case MMCBR_IVAR_MAX_BUSY_TIMEOUT:
*(int *)result = 1000000; /* 1s max */
break;
}
return (0);
}
static int
jz4780_mmc_write_ivar(device_t bus, device_t child, int which,
uintptr_t value)
{
struct jz4780_mmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
sc->sc_host.ios.bus_mode = value;
break;
case MMCBR_IVAR_BUS_WIDTH:
sc->sc_host.ios.bus_width = value;
break;
case MMCBR_IVAR_CHIP_SELECT:
sc->sc_host.ios.chip_select = value;
break;
case MMCBR_IVAR_CLOCK:
sc->sc_host.ios.clock = value;
break;
case MMCBR_IVAR_MODE:
sc->sc_host.mode = value;
break;
case MMCBR_IVAR_OCR:
sc->sc_host.ocr = value;
break;
case MMCBR_IVAR_POWER_MODE:
sc->sc_host.ios.power_mode = value;
break;
case MMCBR_IVAR_VDD:
sc->sc_host.ios.vdd = value;
break;
case MMCBR_IVAR_VCCQ:
sc->sc_host.ios.vccq = value;
break;
case MMCBR_IVAR_TIMING:
sc->sc_host.ios.timing = value;
break;
/* These are read-only */
case MMCBR_IVAR_CAPS:
case MMCBR_IVAR_HOST_OCR:
case MMCBR_IVAR_F_MIN:
case MMCBR_IVAR_F_MAX:
case MMCBR_IVAR_MAX_DATA:
return (EINVAL);
}
return (0);
}
static int
jz4780_mmc_disable_clock(struct jz4780_mmc_softc *sc)
{
int timeout;
JZ_MMC_WRITE_4(sc, JZ_MSC_CTRL, JZ_CLOCK_STOP);
for (timeout = 1000; timeout > 0; timeout--)
if ((JZ_MMC_READ_4(sc, JZ_MSC_STAT) & JZ_CLK_EN) == 0)
return (0);
return (ETIMEDOUT);
}
static int
jz4780_mmc_config_clock(struct jz4780_mmc_softc *sc, uint32_t freq)
{
uint64_t rate;
uint32_t clk_freq;
int err, div;
err = jz4780_mmc_disable_clock(sc);
if (err != 0)
return (err);
clk_get_freq(sc->sc_clk, &rate);
clk_freq = (uint32_t)rate;
div = 0;
while (clk_freq > freq) {
div++;
clk_freq >>= 1;
}
if (div >= 7)
div = 7;
#if defined(JZ_MMC_DEBUG)
if (div != JZ_MMC_READ_4(sc, JZ_MSC_CLKRT))
device_printf(sc->sc_dev,
"UPDATE_IOS: clk -> %u\n", clk_freq);
#endif
JZ_MMC_WRITE_4(sc, JZ_MSC_CLKRT, div);
return (0);
}
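/*
 * Worked example, illustrative and not part of this change: with a
 * 48 MHz module clock and a requested card clock of 400 kHz (the
 * SD/MMC identification rate), the loop halves clk_freq seven times
 * (48 MHz -> 375 kHz), so CLKRT is written with 7 and the card sees
 * 48 MHz >> 7 = 375 kHz, the closest divided rate not above the
 * request. A 25 MHz request would stop after one halving (div = 1,
 * 24 MHz).
 */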
static int
jz4780_mmc_update_ios(device_t bus, device_t child)
{
struct jz4780_mmc_softc *sc;
struct mmc_ios *ios;
int error;
sc = device_get_softc(bus);
ios = &sc->sc_host.ios;
if (ios->clock) {
/* Set the MMC clock. */
error = jz4780_mmc_config_clock(sc, ios->clock);
if (error != 0)
return (error);
}
/* Set the bus width. */
switch (ios->bus_width) {
case bus_width_1:
sc->sc_cmdat &= ~(JZ_BUS_WIDTH_M);
sc->sc_cmdat |= JZ_BUS_1BIT;
break;
case bus_width_4:
sc->sc_cmdat &= ~(JZ_BUS_WIDTH_M);
sc->sc_cmdat |= JZ_BUS_4BIT;
break;
case bus_width_8:
sc->sc_cmdat &= ~(JZ_BUS_WIDTH_M);
sc->sc_cmdat |= JZ_BUS_8BIT;
break;
}
return (0);
}
static int
jz4780_mmc_get_ro(device_t bus, device_t child)
{
return (0);
}
static int
jz4780_mmc_acquire_host(device_t bus, device_t child)
{
struct jz4780_mmc_softc *sc;
int error;
sc = device_get_softc(bus);
JZ_MMC_LOCK(sc);
while (sc->sc_bus_busy) {
error = msleep(sc, &sc->sc_mtx, PCATCH, "mmchw", 0);
if (error != 0) {
JZ_MMC_UNLOCK(sc);
return (error);
}
}
sc->sc_bus_busy++;
JZ_MMC_UNLOCK(sc);
return (0);
}
static int
jz4780_mmc_release_host(device_t bus, device_t child)
{
struct jz4780_mmc_softc *sc;
sc = device_get_softc(bus);
JZ_MMC_LOCK(sc);
sc->sc_bus_busy--;
wakeup(sc);
JZ_MMC_UNLOCK(sc);
return (0);
}
static device_method_t jz4780_mmc_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, jz4780_mmc_probe),
DEVMETHOD(device_attach, jz4780_mmc_attach),
DEVMETHOD(device_detach, jz4780_mmc_detach),
/* Bus interface */
DEVMETHOD(bus_read_ivar, jz4780_mmc_read_ivar),
DEVMETHOD(bus_write_ivar, jz4780_mmc_write_ivar),
/* MMC bridge interface */
DEVMETHOD(mmcbr_update_ios, jz4780_mmc_update_ios),
DEVMETHOD(mmcbr_request, jz4780_mmc_request),
DEVMETHOD(mmcbr_get_ro, jz4780_mmc_get_ro),
DEVMETHOD(mmcbr_acquire_host, jz4780_mmc_acquire_host),
DEVMETHOD(mmcbr_release_host, jz4780_mmc_release_host),
DEVMETHOD_END
};
static devclass_t jz4780_mmc_devclass;
static driver_t jz4780_mmc_driver = {
"jzmmc",
jz4780_mmc_methods,
sizeof(struct jz4780_mmc_softc),
};
DRIVER_MODULE(jzmmc, simplebus, jz4780_mmc_driver, jz4780_mmc_devclass, NULL,
NULL);
MMC_DECLARE_BRIDGE(jzmmc);
diff --git a/sys/net/if.c b/sys/net/if.c
index b6248ff051ab..c82c1694bc8a 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -1,4554 +1,4554 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
* $FreeBSD$
*/
#include "opt_bpf.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/domainset.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <sys/epoch.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/refcount.h>
#include <sys/module.h>
#include <sys/rwlock.h>
#include <sys/sockio.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/taskqueue.h>
#include <sys/domain.h>
#include <sys/jail.h>
#include <sys/priv.h>
#include <machine/stdarg.h>
#include <vm/uma.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/if_vlan_var.h>
#include <net/radix.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/vnet.h>
#if defined(INET) || defined(INET6)
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_carp.h>
#ifdef INET
#include <net/debugnet.h>
#include <netinet/if_ether.h>
#endif /* INET */
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#endif /* INET6 */
#endif /* INET || INET6 */
#include <security/mac/mac_framework.h>
/*
* Consumers of struct ifreq such as tcpdump assume no pad between ifr_name
* and ifr_ifru when it is used in SIOCGIFCONF.
*/
_Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");
__read_mostly epoch_t net_epoch_preempt;
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>
struct ifreq_buffer32 {
uint32_t length; /* (size_t) */
uint32_t buffer; /* (void *) */
};
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*/
struct ifreq32 {
char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct ifreq_buffer32 ifru_buffer;
short ifru_flags[2];
short ifru_index;
int ifru_jid;
int ifru_metric;
int ifru_mtu;
int ifru_phys;
int ifru_media;
uint32_t ifru_data;
int ifru_cap[2];
u_int ifru_fib;
u_char ifru_vlan_pcp;
} ifr_ifru;
};
CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32));
CTASSERT(__offsetof(struct ifreq, ifr_ifru) ==
__offsetof(struct ifreq32, ifr_ifru));
struct ifgroupreq32 {
char ifgr_name[IFNAMSIZ];
u_int ifgr_len;
union {
char ifgru_group[IFNAMSIZ];
uint32_t ifgru_groups;
} ifgr_ifgru;
};
struct ifmediareq32 {
char ifm_name[IFNAMSIZ];
int ifm_current;
int ifm_mask;
int ifm_status;
int ifm_active;
int ifm_count;
uint32_t ifm_ulist; /* (int *) */
};
#define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32)
#define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32)
#define _CASE_IOC_IFGROUPREQ_32(cmd) \
_IOC_NEWTYPE((cmd), struct ifgroupreq32): case
#else /* !COMPAT_FREEBSD32 */
#define _CASE_IOC_IFGROUPREQ_32(cmd)
#endif /* !COMPAT_FREEBSD32 */
#define CASE_IOC_IFGROUPREQ(cmd) \
_CASE_IOC_IFGROUPREQ_32(cmd) \
(cmd)
union ifreq_union {
struct ifreq ifr;
#ifdef COMPAT_FREEBSD32
struct ifreq32 ifr32;
#endif
};
union ifgroupreq_union {
struct ifgroupreq ifgr;
#ifdef COMPAT_FREEBSD32
struct ifgroupreq32 ifgr32;
#endif
};
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Generic link-management");
SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
&ifqmaxlen, 0, "max send queue size");
/* Log link state change events */
static int log_link_state_change = 1;
SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
&log_link_state_change, 0,
"log interface link state change events");
/* Log promiscuous mode change events */
static int log_promisc_mode_change = 1;
SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN,
&log_promisc_mode_change, 1,
"log promiscuous mode change events");
/* Interface description */
static unsigned int ifdescr_maxlen = 1024;
SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
&ifdescr_maxlen, 0,
"administrative maximum length for interface description");
static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
/* global sx for non-critical path ifdescr */
static struct sx ifdescr_sx;
SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
void (*ng_ether_link_state_p)(struct ifnet *ifp, int state);
void (*lagg_linkstate_p)(struct ifnet *ifp, int state);
/* These are external hooks for CARP. */
void (*carp_linkstate_p)(struct ifnet *ifp);
void (*carp_demote_adj_p)(int, char *);
int (*carp_master_p)(struct ifaddr *);
#if defined(INET) || defined(INET6)
int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *sa);
int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);
int (*carp_attach_p)(struct ifaddr *, int);
void (*carp_detach_p)(struct ifaddr *, bool);
#endif
#ifdef INET
int (*carp_iamatch_p)(struct ifaddr *, uint8_t **);
#endif
#ifdef INET6
struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
const struct in6_addr *taddr);
#endif
struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
/*
* XXX: Style; these should be sorted alphabetically, and unprototyped
* static functions should be prototyped. Currently they are sorted by
* declaration order.
*/
static void if_attachdomain(void *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, caddr_t);
static void *if_grow(void);
static void if_input_default(struct ifnet *, struct mbuf *);
static int if_requestencap_default(struct ifnet *, struct if_encap_req *);
static void if_route(struct ifnet *, int flag, int fam);
static int if_setflag(struct ifnet *, int, int, int *, int);
static int if_transmit(struct ifnet *ifp, struct mbuf *m);
static void if_unroute(struct ifnet *, int flag, int fam);
static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void do_link_state_change(void *, int);
static int if_getgroup(struct ifgroupreq *, struct ifnet *);
static int if_getgroupmembers(struct ifgroupreq *);
static void if_delgroups(struct ifnet *);
static void if_attach_internal(struct ifnet *, int, struct if_clone *);
static int if_detach_internal(struct ifnet *, int, struct if_clone **);
static void if_siocaddmulti(void *, int);
static void if_link_ifnet(struct ifnet *);
static bool if_unlink_ifnet(struct ifnet *, bool);
#ifdef VIMAGE
static int if_vmove(struct ifnet *, struct vnet *);
#endif
#ifdef INET6
/*
* XXX: declared here to avoid including many inet6-related files;
* should this be more generalized?
*/
extern void nd6_setmtu(struct ifnet *);
#endif
/* ipsec helper hooks */
VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
VNET_DEFINE(int, if_index);
int ifqmaxlen = IFQ_MAXLEN;
VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);
VNET_DEFINE_STATIC(int, if_indexlim) = 8;
/* Table of ifnet by index. */
VNET_DEFINE(struct ifnet **, ifindex_table);
#define V_if_indexlim VNET(if_indexlim)
#define V_ifindex_table VNET(ifindex_table)
/*
* The global network interface list (V_ifnet) and related state (such as
* if_index, if_indexlim, and ifindex_table) are protected by an sxlock.
* This may be acquired to stabilise the list, or we may rely on NET_EPOCH.
*/
struct sx ifnet_sxlock;
SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);
/*
* The allocation of network interfaces is a rather non-atomic affair; we
* need to select an index before we are ready to expose the interface for
* use, so we use this pointer value to indicate the reservation.
*/
#define IFNET_HOLD (void *)(uintptr_t)(-1)
#ifdef VIMAGE
#define VNET_IS_SHUTTING_DOWN(_vnet) \
((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE)
#endif
static if_com_alloc_t *if_com_alloc[256];
static if_com_free_t *if_com_free[256];
static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
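/*
 * Look up an ifnet by index.  Out-of-range indexes and slots that are
 * merely reserved (IFNET_HOLD) yield NULL.
 */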
struct ifnet *
ifnet_byindex(u_short idx)
{
struct ifnet *ifp;
if (__predict_false(idx > V_if_index))
return (NULL);
ifp = *(struct ifnet * const volatile *)(V_ifindex_table + idx);
return (__predict_false(ifp == IFNET_HOLD) ? NULL : ifp);
}
struct ifnet *
ifnet_byindex_ref(u_short idx)
{
struct ifnet *ifp;
NET_EPOCH_ASSERT();
ifp = ifnet_byindex(idx);
if (ifp == NULL || (ifp->if_flags & IFF_DYING))
return (NULL);
if_ref(ifp);
return (ifp);
}
/*
* Allocate an ifindex array entry; return the new index on success, or
* USHRT_MAX if the table must first be grown, in which case *old is set
* to the previous table for the caller to free (after an epoch) before
* retrying.
*/
static u_short
ifindex_alloc(void **old)
{
u_short idx;
IFNET_WLOCK_ASSERT();
/*
* Try to find an empty slot below V_if_index. If we fail, take the
* next slot.
*/
for (idx = 1; idx <= V_if_index; idx++) {
if (V_ifindex_table[idx] == NULL)
break;
}
/* Catch if_index overflow. */
if (idx >= V_if_indexlim) {
*old = if_grow();
return (USHRT_MAX);
}
if (idx > V_if_index)
V_if_index = idx;
return (idx);
}
static void
ifindex_free_locked(u_short idx)
{
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx] = NULL;
while (V_if_index > 0 &&
V_ifindex_table[V_if_index] == NULL)
V_if_index--;
}
static void
ifindex_free(u_short idx)
{
IFNET_WLOCK();
ifindex_free_locked(idx);
IFNET_WUNLOCK();
}
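/* Publish an ifnet pointer (or the IFNET_HOLD marker) at the given index. */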
static void
ifnet_setbyindex(u_short idx, struct ifnet *ifp)
{
V_ifindex_table[idx] = ifp;
}
struct ifaddr *
ifaddr_byindex(u_short idx)
{
struct ifnet *ifp;
struct ifaddr *ifa = NULL;
NET_EPOCH_ASSERT();
ifp = ifnet_byindex(idx);
if (ifp != NULL && (ifa = ifp->if_addr) != NULL)
ifa_ref(ifa);
return (ifa);
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
static void
vnet_if_init(const void *unused __unused)
{
void *old;
CK_STAILQ_INIT(&V_ifnet);
CK_STAILQ_INIT(&V_ifg_head);
IFNET_WLOCK();
old = if_grow(); /* create initial table */
IFNET_WUNLOCK();
epoch_wait_preempt(net_epoch_preempt);
free(old, M_IFNET);
vnet_if_clone_init();
}
VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
NULL);
#ifdef VIMAGE
static void
vnet_if_uninit(const void *unused __unused)
{
VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
"not empty", __func__, __LINE__, &V_ifnet));
VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
"not empty", __func__, __LINE__, &V_ifg_head));
free((caddr_t)V_ifindex_table, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
vnet_if_uninit, NULL);
#endif
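/*
 * Insert an ifnet into the per-vnet interface list (and, under VIMAGE,
 * bump the vnet's interface count).
 */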
static void
if_link_ifnet(struct ifnet *ifp)
{
IFNET_WLOCK();
CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
curvnet->vnet_ifcnt++;
#endif
IFNET_WUNLOCK();
}
static bool
if_unlink_ifnet(struct ifnet *ifp, bool vmove)
{
struct ifnet *iter;
int found = 0;
IFNET_WLOCK();
CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
if (iter == ifp) {
CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link);
if (!vmove)
ifp->if_flags |= IFF_DYING;
found = 1;
break;
}
#ifdef VIMAGE
curvnet->vnet_ifcnt--;
#endif
IFNET_WUNLOCK();
return (found);
}
#ifdef VIMAGE
static void
vnet_if_return(const void *unused __unused)
{
struct ifnet *ifp, *nifp;
struct ifnet **pending;
int found, i;
i = 0;
/*
* We need to protect our access to the V_ifnet tailq. Ordinarily we'd
* enter NET_EPOCH, but that's not possible, because if_vmove() calls
* if_detach_internal(), which waits for NET_EPOCH callbacks to
* complete. We can't do that from within NET_EPOCH.
*
* However, we can instead use the IFNET_xLOCK, which is the V_ifnet
* read/write lock. We cannot hold the lock while we call if_vmove(),
* though, as that presents a LOR w.r.t. ifnet_sx, in_multi_sx and the iflib
* ctx lock.
*/
IFNET_WLOCK();
pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt,
M_IFNET, M_WAITOK | M_ZERO);
/* Return all inherited interfaces to their parent vnets. */
CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
if (ifp->if_home_vnet != ifp->if_vnet) {
found = if_unlink_ifnet(ifp, true);
MPASS(found);
pending[i++] = ifp;
}
}
IFNET_WUNLOCK();
for (int j = 0; j < i; j++) {
if_vmove(pending[j], pending[j]->if_home_vnet);
}
free(pending, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
vnet_if_return, NULL);
#endif
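/*
 * Double the ifindex table.  The allocation is done with the ifnet lock
 * dropped; if another thread grew the table in the meantime the new
 * allocation is discarded.  The previous table, if any, is returned so
 * the caller can free it once an epoch has passed.
 */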
static void *
if_grow(void)
{
int oldlim;
u_int n;
struct ifnet **e;
void *old;
old = NULL;
IFNET_WLOCK_ASSERT();
oldlim = V_if_indexlim;
IFNET_WUNLOCK();
n = (oldlim << 1) * sizeof(*e);
e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
IFNET_WLOCK();
if (V_if_indexlim != oldlim) {
free(e, M_IFNET);
return (NULL);
}
if (V_ifindex_table != NULL) {
memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
old = V_ifindex_table;
}
V_if_indexlim <<= 1;
V_ifindex_table = e;
return (old);
}
/*
* Allocate a struct ifnet and an index for an interface. A layer 2
* common structure will also be allocated if an allocation routine is
* registered for the passed type.
*/
struct ifnet *
if_alloc_domain(u_char type, int numa_domain)
{
struct ifnet *ifp;
u_short idx;
void *old;
KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
if (numa_domain == IF_NODOM)
ifp = malloc(sizeof(struct ifnet), M_IFNET,
M_WAITOK | M_ZERO);
else
ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET,
DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO);
restart:
IFNET_WLOCK();
idx = ifindex_alloc(&old);
if (__predict_false(idx == USHRT_MAX)) {
IFNET_WUNLOCK();
epoch_wait_preempt(net_epoch_preempt);
free(old, M_IFNET);
goto restart;
}
ifnet_setbyindex(idx, IFNET_HOLD);
IFNET_WUNLOCK();
ifp->if_index = idx;
ifp->if_type = type;
ifp->if_alloctype = type;
ifp->if_numa_domain = numa_domain;
#ifdef VIMAGE
ifp->if_vnet = curvnet;
#endif
if (if_com_alloc[type] != NULL) {
ifp->if_l2com = if_com_alloc[type](type, ifp);
if (ifp->if_l2com == NULL) {
free(ifp, M_IFNET);
ifindex_free(idx);
return (NULL);
}
}
IF_ADDR_LOCK_INIT(ifp);
TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp);
ifp->if_afdata_initialized = 0;
IF_AFDATA_LOCK_INIT(ifp);
CK_STAILQ_INIT(&ifp->if_addrhead);
CK_STAILQ_INIT(&ifp->if_multiaddrs);
CK_STAILQ_INIT(&ifp->if_groups);
#ifdef MAC
mac_ifnet_init(ifp);
#endif
ifq_init(&ifp->if_snd, ifp);
refcount_init(&ifp->if_refcount, 1); /* Index reference. */
for (int i = 0; i < IFCOUNTERS; i++)
ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
ifp->if_get_counter = if_get_counter_default;
ifp->if_pcp = IFNET_PCP_NONE;
ifnet_setbyindex(ifp->if_index, ifp);
return (ifp);
}
struct ifnet *
if_alloc_dev(u_char type, device_t dev)
{
int numa_domain;
if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0)
return (if_alloc_domain(type, IF_NODOM));
return (if_alloc_domain(type, numa_domain));
}
struct ifnet *
if_alloc(u_char type)
{
return (if_alloc_domain(type, IF_NODOM));
}
/*
* Do the actual work of freeing a struct ifnet, and layer 2 common
* structure. This call is made when the last reference to an
* interface is released.
*/
static void
if_free_internal(struct ifnet *ifp)
{
KASSERT((ifp->if_flags & IFF_DYING),
("if_free_internal: interface not dying"));
if (if_com_free[ifp->if_alloctype] != NULL)
if_com_free[ifp->if_alloctype](ifp->if_l2com,
ifp->if_alloctype);
#ifdef MAC
mac_ifnet_destroy(ifp);
#endif /* MAC */
IF_AFDATA_DESTROY(ifp);
IF_ADDR_LOCK_DESTROY(ifp);
ifq_delete(&ifp->if_snd);
for (int i = 0; i < IFCOUNTERS; i++)
counter_u64_free(ifp->if_counters[i]);
free(ifp->if_description, M_IFDESCR);
free(ifp->if_hw_addr, M_IFADDR);
free(ifp, M_IFNET);
}
static void
if_destroy(epoch_context_t ctx)
{
struct ifnet *ifp;
ifp = __containerof(ctx, struct ifnet, if_epoch_ctx);
if_free_internal(ifp);
}
/*
* Deregister an interface and free the associated storage.
*/
void
if_free(struct ifnet *ifp)
{
ifp->if_flags |= IFF_DYING; /* XXX: Locking */
CURVNET_SET_QUIET(ifp->if_vnet);
IFNET_WLOCK();
KASSERT(ifp == ifnet_byindex(ifp->if_index),
("%s: freeing unallocated ifnet", ifp->if_xname));
ifindex_free_locked(ifp->if_index);
IFNET_WUNLOCK();
if (refcount_release(&ifp->if_refcount))
NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx);
CURVNET_RESTORE();
}
/*
* Interfaces to keep an ifnet type-stable despite the possibility of the
* driver calling if_free(). If there are additional references, we defer
* freeing the underlying data structure.
*/
void
if_ref(struct ifnet *ifp)
{
/* We don't assert the ifnet list lock here, but arguably should. */
refcount_acquire(&ifp->if_refcount);
}
void
if_rele(struct ifnet *ifp)
{
if (!refcount_release(&ifp->if_refcount))
return;
NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx);
}
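/*
 * Initialize an interface send queue: set up its mutex, default queue
 * length and the embedded ALTQ state.
 */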
void
ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
{
mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
if (ifq->ifq_maxlen == 0)
ifq->ifq_maxlen = ifqmaxlen;
ifq->altq_type = 0;
ifq->altq_disc = NULL;
ifq->altq_flags &= ALTQF_CANTCHANGE;
ifq->altq_tbr = NULL;
ifq->altq_ifp = ifp;
}
void
ifq_delete(struct ifaltq *ifq)
{
mtx_destroy(&ifq->ifq_mtx);
}
/*
* Perform generic interface initialization tasks and attach the interface
* to the list of "active" interfaces. If vmove flag is set on entry
* to if_attach_internal(), perform only a limited subset of initialization
* tasks, given that we are moving an ifnet which has already been fully
* initialized from one vnet to another.
*
* Note that if_detach_internal() removes group membership unconditionally
* even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
* Thus, when if_vmove() is applied to a cloned interface, group membership
* is lost while a cloned one always joins a group whose name is
* ifc->ifc_name. To recover this after if_detach_internal() and
* if_attach_internal(), the cloner should be specified to
* if_attach_internal() via ifc. If it is non-NULL, if_attach_internal()
* attempts to join a group whose name is ifc->ifc_name.
*
* XXX:
* - The decision to return void and thus require this function to
* succeed is questionable.
* - We should probably do more sanity checking. For instance we don't
* do anything to ensure if_xname is unique or non-empty.
*/
void
if_attach(struct ifnet *ifp)
{
if_attach_internal(ifp, 0, NULL);
}
/*
* Compute the least common TSO limit.
*/
void
if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
{
/*
* 1) If there is no limit currently, take the limit from
* the network adapter.
*
* 2) If the network adapter has a limit below the current
* limit, apply it.
*/
if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
pmax->tsomaxbytes = ifp->if_hw_tsomax;
}
if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
}
if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
}
}
/*
* Update TSO limit of a network adapter.
*
* Returns zero if no change. Else non-zero.
*/
int
if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
{
int retval = 0;
if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
ifp->if_hw_tsomax = pmax->tsomaxbytes;
retval++;
}
if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
retval++;
}
if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
retval++;
}
return (retval);
}
static void
if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc)
{
unsigned socksize, ifasize;
int namelen, masklen;
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
ifp->if_xname);
#ifdef VIMAGE
ifp->if_vnet = curvnet;
if (ifp->if_home_vnet == NULL)
ifp->if_home_vnet = curvnet;
#endif
if_addgroup(ifp, IFG_ALL);
/* Restore group membership for cloned interfaces. */
if (vmove && ifc != NULL)
if_clone_addgroup(ifp, ifc);
getmicrotime(&ifp->if_lastchange);
ifp->if_epoch = time_uptime;
KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
(ifp->if_transmit != NULL && ifp->if_qflush != NULL),
("transmit and qflush must both either be set or both be NULL"));
if (ifp->if_transmit == NULL) {
ifp->if_transmit = if_transmit;
ifp->if_qflush = if_qflush;
}
if (ifp->if_input == NULL)
ifp->if_input = if_input_default;
if (ifp->if_requestencap == NULL)
ifp->if_requestencap = if_requestencap_default;
if (!vmove) {
#ifdef MAC
mac_ifnet_create(ifp);
#endif
/*
* Create a Link Level name for this device.
*/
namelen = strlen(ifp->if_xname);
/*
* Always save enough space for any possible name so we
* can do a rename in place later.
*/
masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
socksize = masklen + ifp->if_addrlen;
if (socksize < sizeof(*sdl))
socksize = sizeof(*sdl);
socksize = roundup2(socksize, sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = ifa_alloc(ifasize, M_WAITOK);
sdl = (struct sockaddr_dl *)(ifa + 1);
sdl->sdl_len = socksize;
sdl->sdl_family = AF_LINK;
bcopy(ifp->if_xname, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
ifp->if_addr = ifa;
ifa->ifa_ifp = ifp;
ifa->ifa_addr = (struct sockaddr *)sdl;
sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
ifa->ifa_netmask = (struct sockaddr *)sdl;
sdl->sdl_len = masklen;
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
/* Reliably crash if used uninitialized. */
ifp->if_broadcastaddr = NULL;
if (ifp->if_type == IFT_ETHER) {
ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
M_WAITOK | M_ZERO);
}
#if defined(INET) || defined(INET6)
/* Use defaults for TSO, if nothing is set */
if (ifp->if_hw_tsomax == 0 &&
ifp->if_hw_tsomaxsegcount == 0 &&
ifp->if_hw_tsomaxsegsize == 0) {
/*
* The TSO defaults need to be such that an
* NFS mbuf list of 35 mbufs totalling just
* below 64K works and that a chain of mbufs
* can be defragged into at most 32 segments:
*/
ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
ifp->if_hw_tsomaxsegcount = 35;
ifp->if_hw_tsomaxsegsize = 2048; /* 2K */
/* XXX some drivers set IFCAP_TSO after ethernet attach */
if (ifp->if_capabilities & IFCAP_TSO) {
if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
ifp->if_hw_tsomax,
ifp->if_hw_tsomaxsegcount,
ifp->if_hw_tsomaxsegsize);
}
}
#endif
}
#ifdef VIMAGE
else {
/*
* Update the interface index in the link layer address
* of the interface.
*/
for (ifa = ifp->if_addr; ifa != NULL;
ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
if (ifa->ifa_addr->sa_family == AF_LINK) {
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_index = ifp->if_index;
}
}
}
#endif
if_link_ifnet(ifp);
if (domain_init_status >= 2)
if_attachdomain1(ifp);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
}
static void
if_epochalloc(void *dummy __unused)
{
net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT);
}
SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL);
static void
if_attachdomain(void *dummy)
{
struct ifnet *ifp;
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
if_attachdomain1(ifp);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
if_attachdomain, NULL);
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
/*
* Since dp->dom_ifattach calls malloc() with M_WAITOK, we
* cannot fully lock the ifp->if_afdata initialization.
*/
IF_AFDATA_LOCK(ifp);
if (ifp->if_afdata_initialized >= domain_init_status) {
IF_AFDATA_UNLOCK(ifp);
log(LOG_WARNING, "%s called more than once on %s\n",
__func__, ifp->if_xname);
return;
}
ifp->if_afdata_initialized = domain_init_status;
IF_AFDATA_UNLOCK(ifp);
/* address family dependent data region */
bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_ifattach)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
}
/*
* Remove any unicast or broadcast network addresses from an interface.
*/
void
if_purgeaddrs(struct ifnet *ifp)
{
struct ifaddr *ifa;
while (1) {
struct epoch_tracker et;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_LINK)
break;
}
NET_EPOCH_EXIT(et);
if (ifa == NULL)
break;
#ifdef INET
/* XXX: Ugly!! ad hoc just for INET */
if (ifa->ifa_addr->sa_family == AF_INET) {
struct ifaliasreq ifr;
bzero(&ifr, sizeof(ifr));
ifr.ifra_addr = *ifa->ifa_addr;
if (ifa->ifa_dstaddr)
ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
NULL) == 0)
continue;
}
#endif /* INET */
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6) {
in6_purgeaddr(ifa);
/* ifp_addrhead is already updated */
continue;
}
#endif /* INET6 */
IF_ADDR_WLOCK(ifp);
CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
IF_ADDR_WUNLOCK(ifp);
ifa_free(ifa);
}
}
/*
* Remove any multicast network addresses from an interface when an ifnet
* is going away.
*/
static void
if_purgemaddrs(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
IF_ADDR_WLOCK(ifp);
while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
if_delmulti_locked(ifp, ifma, 1);
}
IF_ADDR_WUNLOCK(ifp);
}
/*
* Detach an interface, removing it from the list of "active" interfaces.
* If vmove flag is set on entry to if_detach_internal(), perform only a
* limited subset of cleanup tasks, given that we are moving an ifnet from
* one vnet to another, where it must be fully operational.
*
* XXXRW: There are some significant questions about event ordering, and
* how to prevent things from starting to use the interface during detach.
*/
void
if_detach(struct ifnet *ifp)
{
bool found;
CURVNET_SET_QUIET(ifp->if_vnet);
found = if_unlink_ifnet(ifp, false);
if (found)
if_detach_internal(ifp, 0, NULL);
CURVNET_RESTORE();
}
/*
* The vmove flag, if set, indicates that we are called from a callpath
* that is moving an interface to a different vnet instance.
*
* The shutdown flag, if set, indicates that we are called in the
* process of shutting down a vnet instance. Currently only the
* vnet_if_return SYSUNINIT function sets it. Note: we can be called
* on a vnet instance shutdown without this flag being set, e.g., when
* the cloned interfaces are destroyed as the first step of teardown.
*/
static int
if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp)
{
struct ifaddr *ifa;
int i;
struct domain *dp;
#ifdef VIMAGE
bool shutdown;
shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
#endif
/*
* At this point we know the interface still was on the ifnet list
* and we removed it so we are in a stable state.
*/
epoch_wait_preempt(net_epoch_preempt);
/*
* Ensure all pending EPOCH(9) callbacks have been executed. This
* fixes issues about late destruction of multicast options
* which lead to leave group calls, which in turn access the
* belonging ifnet structure:
*/
epoch_drain_callbacks(net_epoch_preempt);
/*
* In any case (destroy or vmove) detach us from the groups
* and remove/wait for pending events on the taskq.
* XXX-BZ in theory an interface could still enqueue a taskq change?
*/
if_delgroups(ifp);
taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask);
/*
* Check if this is a cloned interface or not. This must be done even
* when shutting down, as an if_vmove_reclaim() would move the ifp and
* if_clone_addgroup() would otherwise end up with a corrupted string
* from a gibberish pointer.
*/
if (vmove && ifcp != NULL)
*ifcp = if_clone_findifc(ifp);
if_down(ifp);
#ifdef VIMAGE
/*
* On VNET shutdown abort here as the stack teardown will do all
* the work top-down for us.
*/
if (shutdown) {
/* Give interface users the chance to clean up. */
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
/*
* In case of a vmove we are done here without error.
* If we signaled an error it would lead to the same
* abort as if we had not found the ifnet anymore.
* if_detach() calls us in void context and does not care
* about an early abort notification, so life is splendid :)
*/
goto finish_vnet_shutdown;
}
#endif
/*
* At this point we are not tearing down a VNET and are either
* going to destroy or vmove the interface and have to cleanup
* accordingly.
*/
/*
* Remove routes and flush queues.
*/
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
if_purgeaddrs(ifp);
#ifdef INET
in_ifdetach(ifp);
#endif
#ifdef INET6
/*
* Remove all IPv6 kernel structs related to ifp. This should be done
* before removing routing entries below, since IPv6 interface direct
* routes are expected to be removed by the IPv6-specific kernel API.
* Otherwise, the kernel will detect the inconsistency and complain about it.
*/
in6_ifdetach(ifp);
#endif
if_purgemaddrs(ifp);
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
if (!vmove) {
/*
* Prevent further calls into the device driver via ifnet.
*/
if_dead(ifp);
/*
* Clean up all addresses.
*/
IF_ADDR_WLOCK(ifp);
if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) {
ifa = CK_STAILQ_FIRST(&ifp->if_addrhead);
CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
IF_ADDR_WUNLOCK(ifp);
ifa_free(ifa);
} else
IF_ADDR_WUNLOCK(ifp);
}
rt_flushifroutes(ifp);
#ifdef VIMAGE
finish_vnet_shutdown:
#endif
/*
* We cannot hold the lock over dom_ifdetach calls as they might
* sleep, for example trying to drain a callout, thus opening up a
* theoretical race with re-attaching.
*/
IF_AFDATA_LOCK(ifp);
i = ifp->if_afdata_initialized;
ifp->if_afdata_initialized = 0;
IF_AFDATA_UNLOCK(ifp);
for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
(*dp->dom_ifdetach)(ifp,
ifp->if_afdata[dp->dom_family]);
ifp->if_afdata[dp->dom_family] = NULL;
}
}
return (0);
}
#ifdef VIMAGE
/*
* if_vmove() performs a limited version of if_detach() in current
* vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
* An attempt is made to shrink if_index in current vnet, find an
* unused if_index in the target vnet (calling if_grow() if necessary),
* and finally find an unused if_xname for the target vnet.
*/
static int
if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
{
struct if_clone *ifc;
#ifdef DEV_BPF
u_int bif_dlt, bif_hdrlen;
#endif
void *old;
int rc;
#ifdef DEV_BPF
/*
* if_detach_internal() will call the eventhandler to notify
* interface departure. That will detach if_bpf. We need to
* save the dlt and hdrlen so we can re-attach it later.
*/
bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
#endif
/*
* Detach from current vnet, but preserve LLADDR info, do not
* mark as dead etc. so that the ifnet can be reattached later.
* If we cannot find it, we lost the race to someone else.
*/
rc = if_detach_internal(ifp, 1, &ifc);
if (rc != 0)
return (rc);
/*
* Unlink the ifnet from ifindex_table[] in current vnet, and shrink
* the if_index for that vnet if possible.
*
* NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
* or we'd lock on one vnet and unlock on another.
*/
IFNET_WLOCK();
ifindex_free_locked(ifp->if_index);
IFNET_WUNLOCK();
/* Don't re-attach DYING interfaces. */
if (ifp->if_flags & IFF_DYING)
return (0);
/*
* Perform interface-specific reassignment tasks, if provided by
* the driver.
*/
if (ifp->if_reassign != NULL)
ifp->if_reassign(ifp, new_vnet, NULL);
/*
* Switch to the context of the target vnet.
*/
CURVNET_SET_QUIET(new_vnet);
restart:
IFNET_WLOCK();
ifp->if_index = ifindex_alloc(&old);
if (__predict_false(ifp->if_index == USHRT_MAX)) {
IFNET_WUNLOCK();
epoch_wait_preempt(net_epoch_preempt);
free(old, M_IFNET);
goto restart;
}
ifnet_setbyindex(ifp->if_index, ifp);
IFNET_WUNLOCK();
if_attach_internal(ifp, 1, ifc);
#ifdef DEV_BPF
if (ifp->if_bpf == NULL)
bpfattach(ifp, bif_dlt, bif_hdrlen);
#endif
CURVNET_RESTORE();
return (0);
}
/*
* Move an ifnet to or from another child prison/vnet, specified by the jail id.
*/
static int
if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
{
struct prison *pr;
struct ifnet *difp;
int error;
bool found;
bool shutdown;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Do not try to move the iface into the prison/vnet it is already in. */
if (pr->pr_vnet == ifp->if_vnet) {
prison_free(pr);
return (EEXIST);
}
/* Make sure the named iface does not exist in the destination prison/vnet. */
/* XXX Lock interfaces to avoid races. */
CURVNET_SET_QUIET(pr->pr_vnet);
difp = ifunit(ifname);
if (difp != NULL) {
CURVNET_RESTORE();
prison_free(pr);
return (EEXIST);
}
/* Make sure the VNET is stable. */
shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
if (shutdown) {
CURVNET_RESTORE();
prison_free(pr);
return (EBUSY);
}
CURVNET_RESTORE();
found = if_unlink_ifnet(ifp, true);
MPASS(found);
/* Move the interface into the child jail/vnet. */
error = if_vmove(ifp, pr->pr_vnet);
/* Report the new if_xname back to the userland on success. */
if (error == 0)
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (error);
}
static int
if_vmove_reclaim(struct thread *td, char *ifname, int jid)
{
struct prison *pr;
struct vnet *vnet_dst;
struct ifnet *ifp;
int error, found;
bool shutdown;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Make sure the named iface exists in the source prison/vnet. */
CURVNET_SET(pr->pr_vnet);
ifp = ifunit(ifname); /* XXX Lock to avoid races. */
if (ifp == NULL) {
CURVNET_RESTORE();
prison_free(pr);
return (ENXIO);
}
/* Do not try to move the iface into the prison/vnet it is already in. */
vnet_dst = TD_TO_VNET(td);
if (vnet_dst == ifp->if_vnet) {
CURVNET_RESTORE();
prison_free(pr);
return (EEXIST);
}
/* Make sure the VNET is stable. */
shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
if (shutdown) {
CURVNET_RESTORE();
prison_free(pr);
return (EBUSY);
}
/* Get interface back from child jail/vnet. */
found = if_unlink_ifnet(ifp, true);
MPASS(found);
error = if_vmove(ifp, vnet_dst);
CURVNET_RESTORE();
/* Report the new if_xname back to the userland on success. */
if (error == 0)
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (error);
}
#endif /* VIMAGE */
/*
* Add a group to an interface
*/
int
if_addgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
struct ifg_group *ifg = NULL;
struct ifg_member *ifgm;
int new = 0;
if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
groupname[strlen(groupname) - 1] <= '9')
return (EINVAL);
IFNET_WLOCK();
CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
IFNET_WUNLOCK();
return (EEXIST);
}
if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) {
IFNET_WUNLOCK();
return (ENOMEM);
}
if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, groupname))
break;
if (ifg == NULL) {
if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
free(ifgm, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
ifg->ifg_refcnt = 0;
CK_STAILQ_INIT(&ifg->ifg_members);
CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
new = 1;
}
ifg->ifg_refcnt++;
ifgl->ifgl_group = ifg;
ifgm->ifgm_ifp = ifp;
IF_ADDR_WLOCK(ifp);
CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_WUNLOCK(ifp);
IFNET_WUNLOCK();
if (new)
EVENTHANDLER_INVOKE(group_attach_event, ifg);
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Helper function to remove a group out of an interface. Expects the global
* ifnet lock to be write-locked, and drops it before returning.
*/
static void
_if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl,
const char *groupname)
{
struct ifg_member *ifgm;
bool freeifgl;
IFNET_WLOCK_ASSERT();
IF_ADDR_WLOCK(ifp);
CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
IF_ADDR_WUNLOCK(ifp);
CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) {
if (ifgm->ifgm_ifp == ifp) {
CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
ifg_member, ifgm_next);
break;
}
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group,
ifg_next);
freeifgl = true;
} else {
freeifgl = false;
}
IFNET_WUNLOCK();
epoch_wait_preempt(net_epoch_preempt);
if (freeifgl) {
EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
free(ifgm, M_TEMP);
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
}
/*
* Remove a group from an interface
*/
int
if_delgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
IFNET_WLOCK();
CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0)
break;
if (ifgl == NULL) {
IFNET_WUNLOCK();
return (ENOENT);
}
_if_delgroup_locked(ifp, ifgl, groupname);
return (0);
}
/*
* Remove an interface from all groups
*/
static void
if_delgroups(struct ifnet *ifp)
{
struct ifg_list *ifgl;
char groupname[IFNAMSIZ];
IFNET_WLOCK();
while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) {
strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
_if_delgroup_locked(ifp, ifgl, groupname);
IFNET_WLOCK();
}
IFNET_WUNLOCK();
}
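/*
 * Accessors for struct ifgroupreq that return the correct union member
 * for both native and 32-bit compat (ILP32) callers.
 */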
static char *
ifgr_group_get(void *ifgrp)
{
union ifgroupreq_union *ifgrup;
ifgrup = ifgrp;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]);
#endif
return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]);
}
static struct ifg_req *
ifgr_groups_get(void *ifgrp)
{
union ifgroupreq_union *ifgrup;
ifgrup = ifgrp;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
return ((struct ifg_req *)(uintptr_t)
ifgrup->ifgr32.ifgr_ifgru.ifgru_groups);
#endif
return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups);
}
/*
* Stores all groups from an interface in memory pointed to by ifgr.
*/
static int
if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
{
int len, error;
struct ifg_list *ifgl;
struct ifg_req ifgrq, *ifgp;
NET_EPOCH_ASSERT();
if (ifgr->ifgr_len == 0) {
CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
ifgr->ifgr_len += sizeof(struct ifg_req);
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr_groups_get(ifgr);
/* XXX: wire */
CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
if (len < sizeof(ifgrq))
return (EINVAL);
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
sizeof(ifgrq.ifgrq_group));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req))))
return (error);
len -= sizeof(ifgrq);
ifgp++;
}
return (0);
}
/*
* Stores all members of a group in memory pointed to by ifgr.
*/
static int
if_getgroupmembers(struct ifgroupreq *ifgr)
{
struct ifg_group *ifg;
struct ifg_member *ifgm;
struct ifg_req ifgrq, *ifgp;
int len, error;
IFNET_RLOCK();
CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0)
break;
if (ifg == NULL) {
IFNET_RUNLOCK();
return (ENOENT);
}
if (ifgr->ifgr_len == 0) {
CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
ifgr->ifgr_len += sizeof(ifgrq);
IFNET_RUNLOCK();
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr_groups_get(ifgr);
CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
if (len < sizeof(ifgrq)) {
IFNET_RUNLOCK();
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
sizeof(ifgrq.ifgrq_member));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IFNET_RUNLOCK();
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IFNET_RUNLOCK();
return (0);
}
/*
* Return counter values from counter(9)s stored in ifnet.
*/
uint64_t
if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
{
KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
return (counter_u64_fetch(ifp->if_counters[cnt]));
}
/*
* Increase an ifnet counter. Usually used for counters shared
* between the stack and a driver, but the function supports them all.
*/
void
if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
{
KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
counter_u64_add(ifp->if_counters[cnt], inc);
}
/*
* Copy data from ifnet to userland API structure if_data.
*/
void
if_data_copy(struct ifnet *ifp, struct if_data *ifd)
{
ifd->ifi_type = ifp->if_type;
ifd->ifi_physical = 0;
ifd->ifi_addrlen = ifp->if_addrlen;
ifd->ifi_hdrlen = ifp->if_hdrlen;
ifd->ifi_link_state = ifp->if_link_state;
ifd->ifi_vhid = 0;
ifd->ifi_datalen = sizeof(struct if_data);
ifd->ifi_mtu = ifp->if_mtu;
ifd->ifi_metric = ifp->if_metric;
ifd->ifi_baudrate = ifp->if_baudrate;
ifd->ifi_hwassist = ifp->if_hwassist;
ifd->ifi_epoch = ifp->if_epoch;
ifd->ifi_lastchange = ifp->if_lastchange;
ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
}
/*
* Initialization, destruction and refcounting functions for ifaddrs.
*/
struct ifaddr *
ifa_alloc(size_t size, int flags)
{
struct ifaddr *ifa;
KASSERT(size >= sizeof(struct ifaddr),
("%s: invalid size %zu", __func__, size));
ifa = malloc(size, M_IFADDR, M_ZERO | flags);
if (ifa == NULL)
return (NULL);
if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
goto fail;
if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
goto fail;
if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
goto fail;
if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
goto fail;
refcount_init(&ifa->ifa_refcnt, 1);
return (ifa);
fail:
/* free(NULL) is okay */
counter_u64_free(ifa->ifa_opackets);
counter_u64_free(ifa->ifa_ipackets);
counter_u64_free(ifa->ifa_obytes);
counter_u64_free(ifa->ifa_ibytes);
free(ifa, M_IFADDR);
return (NULL);
}
void
ifa_ref(struct ifaddr *ifa)
{
refcount_acquire(&ifa->ifa_refcnt);
}
static void
ifa_destroy(epoch_context_t ctx)
{
struct ifaddr *ifa;
ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
counter_u64_free(ifa->ifa_opackets);
counter_u64_free(ifa->ifa_ipackets);
counter_u64_free(ifa->ifa_obytes);
counter_u64_free(ifa->ifa_ibytes);
free(ifa, M_IFADDR);
}
void
ifa_free(struct ifaddr *ifa)
{
if (refcount_release(&ifa->ifa_refcnt))
NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx);
}
/*
* XXX: Because sockaddr_dl has deeper structure than the sockaddr
* structs used to represent other address families, it is necessary
* to perform a different comparison.
*/
#define sa_dl_equal(a1, a2) \
((((const struct sockaddr_dl *)(a1))->sdl_len == \
((const struct sockaddr_dl *)(a2))->sdl_len) && \
(bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \
CLLADDR((const struct sockaddr_dl *)(a2)), \
((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
NET_EPOCH_ASSERT();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (sa_equal(addr, ifa->ifa_addr)) {
goto done;
}
/* IP6 doesn't have broadcast */
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
goto done;
}
}
}
ifa = NULL;
done:
return (ifa);
}
int
ifa_ifwithaddr_check(const struct sockaddr *addr)
{
struct epoch_tracker et;
int rc;
NET_EPOCH_ENTER(et);
rc = (ifa_ifwithaddr(addr) != NULL);
NET_EPOCH_EXIT(et);
return (rc);
}
/*
* Locate an interface based on the broadcast address.
*/
/* ARGSUSED */
struct ifaddr *
ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
{
struct ifnet *ifp;
struct ifaddr *ifa;
NET_EPOCH_ASSERT();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
continue;
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
goto done;
}
}
}
ifa = NULL;
done:
return (ifa);
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
{
struct ifnet *ifp;
struct ifaddr *ifa;
NET_EPOCH_ASSERT();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
continue;
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
goto done;
}
}
}
ifa = NULL;
done:
return (ifa);
}
/*
* Find an interface on a specific network. If several match, the
* most specific one found is chosen.
*/
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
const char *addr_data = addr->sa_data, *cplim;
NET_EPOCH_ASSERT();
/*
* AF_LINK addresses can be looked up directly by their index number,
* so do that if we can.
*/
if (af == AF_LINK) {
const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr;
if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
return (ifaddr_byindex(sdl->sdl_index));
}
/*
* Scan through each interface, looking for ones that have addresses
* in this address family and the requested fib.
*/
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
continue;
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
const char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af)
next: continue;
if (af == AF_INET &&
ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
/*
* This is a bit broken as it doesn't
* take into account that the remote end may
* be a single node in the network we are
* looking for.
* The trouble is that we don't know the
* netmask for the remote end.
*/
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
goto done;
}
} else {
/*
* Scan all the bits in the ifa's address.
* If a bit disagrees with what we are
* looking for, mask it with the netmask
* to see if it really matters.
* (A byte at a time)
*/
if (ifa->ifa_netmask == 0)
continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len
+ (char *)ifa->ifa_netmask;
while (cp3 < cplim)
if ((*cp++ ^ *cp2++) & *cp3++)
goto next; /* next address! */
/*
* If the netmask of what we just found
* is more specific than what we had before
* (if we had one), or if the virtual status
* of the new prefix is better than that of the old one,
* then remember the new one before continuing
* to search for an even better one.
*/
if (ifa_maybe == NULL ||
ifa_preferred(ifa_maybe, ifa) ||
rn_refines((caddr_t)ifa->ifa_netmask,
(caddr_t)ifa_maybe->ifa_netmask)) {
ifa_maybe = ifa;
}
}
}
}
ifa = ifa_maybe;
ifa_maybe = NULL;
done:
return (ifa);
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
const char *cp, *cp2, *cp3;
char *cplim;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
if (af >= AF_MAX)
return (NULL);
NET_EPOCH_ASSERT();
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != af)
continue;
if (ifa_maybe == NULL)
ifa_maybe = ifa;
if (ifa->ifa_netmask == 0) {
if (sa_equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
sa_equal(addr, ifa->ifa_dstaddr)))
goto done;
continue;
}
if (ifp->if_flags & IFF_POINTOPOINT) {
if (sa_equal(addr, ifa->ifa_dstaddr))
goto done;
} else {
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++)
if ((*cp++ ^ *cp2++) & *cp3)
break;
if (cp3 == cplim)
goto done;
}
}
ifa = ifa_maybe;
done:
return (ifa);
}
/*
* See whether new ifa is better than current one:
* 1) A non-virtual one is preferred over virtual.
* 2) A virtual one in master state is preferred over any other state.
*
* Used in several address selecting functions.
*/
int
ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
{
return (cur->ifa_carp && (!next->ifa_carp ||
((*carp_master_p)(next) && !(*carp_master_p)(cur))));
}
struct sockaddr_dl *
link_alloc_sdl(size_t size, int flags)
{
return (malloc(size, M_TEMP, flags));
}
void
link_free_sdl(struct sockaddr *sa)
{
free(sa, M_TEMP);
}
/*
* Fills in given sdl with interface basic info.
* Returns pointer to filled sdl.
*/
struct sockaddr_dl *
link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
{
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)paddr;
memset(sdl, 0, sizeof(struct sockaddr_dl));
sdl->sdl_len = sizeof(struct sockaddr_dl);
sdl->sdl_family = AF_LINK;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = iftype;
return (sdl);
}
/*
* Mark an interface down and notify protocols of
* the transition.
*/
static void
if_unroute(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
ifp->if_flags &= ~flag;
getmicrotime(&ifp->if_lastchange);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
ifp->if_qflush(ifp);
if (ifp->if_carp)
(*carp_linkstate_p)(ifp);
rt_ifmsg(ifp);
}
/*
* Mark an interface up and notify protocols of
* the transition.
*/
static void
if_route(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
ifp->if_flags |= flag;
getmicrotime(&ifp->if_lastchange);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFUP, ifa->ifa_addr);
if (ifp->if_carp)
(*carp_linkstate_p)(ifp);
rt_ifmsg(ifp);
#ifdef INET6
in6_if_up(ifp);
#endif
}
void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */
void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */
struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
int (*vlan_tag_p)(struct ifnet *, uint16_t *);
int (*vlan_pcp_p)(struct ifnet *, uint16_t *);
int (*vlan_setcookie_p)(struct ifnet *, void *);
void *(*vlan_cookie_p)(struct ifnet *);
/*
* Handle a change in the interface link state. To avoid LORs
* between driver lock and upper layer locks, as well as possible
* recursions, we post an event to a taskqueue, and all the work
* is done in the static do_link_state_change().
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
/* Return if state hasn't changed. */
if (ifp->if_link_state == link_state)
return;
ifp->if_link_state = link_state;
/* XXXGL: reference ifp? */
taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
}
static void
do_link_state_change(void *arg, int pending)
{
struct ifnet *ifp;
int link_state;
ifp = arg;
link_state = ifp->if_link_state;
CURVNET_SET(ifp->if_vnet);
rt_ifmsg(ifp);
if (ifp->if_vlantrunk != NULL)
(*vlan_link_state_p)(ifp);
if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
ifp->if_l2com != NULL)
(*ng_ether_link_state_p)(ifp, link_state);
if (ifp->if_carp)
(*carp_linkstate_p)(ifp);
if (ifp->if_bridge)
ifp->if_bridge_linkstate(ifp);
if (ifp->if_lagg)
(*lagg_linkstate_p)(ifp, link_state);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
NULL);
if (pending > 1)
if_printf(ifp, "%d link states coalesced\n", pending);
if (log_link_state_change)
if_printf(ifp, "link state changed to %s\n",
(link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state);
CURVNET_RESTORE();
}
/*
* Mark an interface down and notify protocols of
* the transition.
*/
void
if_down(struct ifnet *ifp)
{
EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN);
if_unroute(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Mark an interface up and notify protocols of
* the transition.
*/
void
if_up(struct ifnet *ifp)
{
if_route(ifp, IFF_UP, AF_UNSPEC);
EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP);
}
/*
* Flush an interface queue.
*/
void
if_qflush(struct ifnet *ifp)
{
struct mbuf *m, *n;
struct ifaltq *ifq;
ifq = &ifp->if_snd;
IFQ_LOCK(ifq);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(ifq))
ALTQ_PURGE(ifq);
#endif
n = ifq->ifq_head;
while ((m = n) != NULL) {
n = m->m_nextpkt;
m_freem(m);
}
ifq->ifq_head = 0;
ifq->ifq_tail = 0;
ifq->ifq_len = 0;
IFQ_UNLOCK(ifq);
}
/*
* Map interface name to interface structure pointer, with or without
* returning a reference.
*/
struct ifnet *
ifunit_ref(const char *name)
{
struct epoch_tracker et;
struct ifnet *ifp;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
!(ifp->if_flags & IFF_DYING))
break;
}
if (ifp != NULL)
if_ref(ifp);
NET_EPOCH_EXIT(et);
return (ifp);
}
struct ifnet *
ifunit(const char *name)
{
struct epoch_tracker et;
struct ifnet *ifp;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
break;
}
NET_EPOCH_EXIT(et);
return (ifp);
}
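/*
 * Accessors for the ifru_buffer and ifru_data members of struct ifreq,
 * handling the 32-bit compat layout for ILP32 processes.
 */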
void *
ifr_buffer_get_buffer(void *data)
{
union ifreq_union *ifrup;
ifrup = data;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
return ((void *)(uintptr_t)
ifrup->ifr32.ifr_ifru.ifru_buffer.buffer);
#endif
return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer);
}
static void
ifr_buffer_set_buffer_null(void *data)
{
union ifreq_union *ifrup;
ifrup = data;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0;
else
#endif
ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL;
}
size_t
ifr_buffer_get_length(void *data)
{
union ifreq_union *ifrup;
ifrup = data;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
return (ifrup->ifr32.ifr_ifru.ifru_buffer.length);
#endif
return (ifrup->ifr.ifr_ifru.ifru_buffer.length);
}
static void
ifr_buffer_set_length(void *data, size_t len)
{
union ifreq_union *ifrup;
ifrup = data;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
ifrup->ifr32.ifr_ifru.ifru_buffer.length = len;
else
#endif
ifrup->ifr.ifr_ifru.ifru_buffer.length = len;
}
void *
ifr_data_get_ptr(void *ifrp)
{
union ifreq_union *ifrup;
ifrup = ifrp;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
return ((void *)(uintptr_t)
ifrup->ifr32.ifr_ifru.ifru_data);
#endif
return (ifrup->ifr.ifr_ifru.ifru_data);
}
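#if 0
/*
 * Illustrative sketch (hypothetical): an ioctl helper that uses the
 * accessor above instead of dereferencing ifr->ifr_data directly, so the
 * same code works for 32-bit processes on a 64-bit kernel, where the
 * embedded pointer is only 32 bits wide.  struct xx_args is made up.
 */
struct xx_args {
	uint32_t	flags;
	uint32_t	value;
};

static int
xx_copyin_args(struct ifreq *ifr, struct xx_args *ap)
{

	/* Correct for both native and COMPAT_FREEBSD32 callers. */
	return (copyin(ifr_data_get_ptr(ifr), ap, sizeof(*ap)));
}
#endif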
/*
* Hardware specific interface ioctls.
*/
int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
struct ifreq *ifr;
int error = 0, do_ifup = 0;
int new_flags, temp_flags;
size_t namelen, onamelen;
size_t descrlen;
char *descrbuf, *odescrbuf;
char new_name[IFNAMSIZ];
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifr = (struct ifreq *)data;
switch (cmd) {
case SIOCGIFINDEX:
ifr->ifr_index = ifp->if_index;
break;
case SIOCGIFFLAGS:
temp_flags = ifp->if_flags | ifp->if_drv_flags;
ifr->ifr_flags = temp_flags & 0xffff;
ifr->ifr_flagshigh = temp_flags >> 16;
break;
case SIOCGIFCAP:
ifr->ifr_reqcap = ifp->if_capabilities;
ifr->ifr_curcap = ifp->if_capenable;
break;
case SIOCGIFDATA:
{
struct if_data ifd;
/* Ensure uninitialised padding is not leaked. */
memset(&ifd, 0, sizeof(ifd));
if_data_copy(ifp, &ifd);
error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd));
break;
}
#ifdef MAC
case SIOCGIFMAC:
error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCGIFMETRIC:
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFPHYS:
/* XXXGL: did this ever work? */
ifr->ifr_phys = 0;
break;
case SIOCGIFDESCR:
error = 0;
sx_slock(&ifdescr_sx);
if (ifp->if_description == NULL)
error = ENOMSG;
else {
/* space for terminating nul */
descrlen = strlen(ifp->if_description) + 1;
if (ifr_buffer_get_length(ifr) < descrlen)
ifr_buffer_set_buffer_null(ifr);
else
error = copyout(ifp->if_description,
ifr_buffer_get_buffer(ifr), descrlen);
ifr_buffer_set_length(ifr, descrlen);
}
sx_sunlock(&ifdescr_sx);
break;
case SIOCSIFDESCR:
error = priv_check(td, PRIV_NET_SETIFDESCR);
if (error)
return (error);
/*
* Copy only (length-1) bytes to make sure that
* if_description is always nul terminated. The
* length parameter is supposed to include the
* terminating nul.
*/
if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
return (ENAMETOOLONG);
else if (ifr_buffer_get_length(ifr) == 0)
descrbuf = NULL;
else {
descrbuf = malloc(ifr_buffer_get_length(ifr),
M_IFDESCR, M_WAITOK | M_ZERO);
error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
ifr_buffer_get_length(ifr) - 1);
if (error) {
free(descrbuf, M_IFDESCR);
break;
}
}
sx_xlock(&ifdescr_sx);
odescrbuf = ifp->if_description;
ifp->if_description = descrbuf;
sx_xunlock(&ifdescr_sx);
getmicrotime(&ifp->if_lastchange);
free(odescrbuf, M_IFDESCR);
break;
case SIOCGIFFIB:
ifr->ifr_fib = ifp->if_fib;
break;
case SIOCSIFFIB:
error = priv_check(td, PRIV_NET_SETIFFIB);
if (error)
return (error);
if (ifr->ifr_fib >= rt_numfibs)
return (EINVAL);
ifp->if_fib = ifr->ifr_fib;
break;
case SIOCSIFFLAGS:
error = priv_check(td, PRIV_NET_SETIFFLAGS);
if (error)
return (error);
/*
* Currently, no driver owned flags pass the IFF_CANTCHANGE
* check, so we don't need special handling here yet.
*/
new_flags = (ifr->ifr_flags & 0xffff) |
(ifr->ifr_flagshigh << 16);
if (ifp->if_flags & IFF_UP &&
(new_flags & IFF_UP) == 0) {
if_down(ifp);
} else if (new_flags & IFF_UP &&
(ifp->if_flags & IFF_UP) == 0) {
do_ifup = 1;
}
/* See if permanently promiscuous mode bit is about to flip */
if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
if (new_flags & IFF_PPROMISC)
ifp->if_flags |= IFF_PROMISC;
else if (ifp->if_pcount == 0)
ifp->if_flags &= ~IFF_PROMISC;
if (log_promisc_mode_change)
if_printf(ifp, "permanently promiscuous mode %s\n",
((new_flags & IFF_PPROMISC) ?
"enabled" : "disabled"));
}
ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
(new_flags &~ IFF_CANTCHANGE);
if (ifp->if_ioctl) {
(void) (*ifp->if_ioctl)(ifp, cmd, data);
}
if (do_ifup)
if_up(ifp);
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFCAP:
error = priv_check(td, PRIV_NET_SETIFCAP);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
if (ifr->ifr_reqcap & ~ifp->if_capabilities)
return (EINVAL);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
#ifdef MAC
case SIOCSIFMAC:
error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCSIFNAME:
error = priv_check(td, PRIV_NET_SETIFNAME);
if (error)
return (error);
error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
NULL);
if (error != 0)
return (error);
if (new_name[0] == '\0')
return (EINVAL);
if (new_name[IFNAMSIZ-1] != '\0') {
new_name[IFNAMSIZ-1] = '\0';
if (strlen(new_name) == IFNAMSIZ-1)
return (EINVAL);
}
if (strcmp(new_name, ifp->if_xname) == 0)
break;
if (ifunit(new_name) != NULL)
return (EEXIST);
/*
* XXX: Locking. Nothing else seems to lock if_flags,
* and there are numerous other races with the
* ifunit() checks not being atomic with namespace
* changes (renames, vmoves, if_attach, etc).
*/
ifp->if_flags |= IFF_RENAMING;
/* Announce the departure of the interface. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
if_printf(ifp, "changing name to '%s'\n", new_name);
IF_ADDR_WLOCK(ifp);
strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
ifa = ifp->if_addr;
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
namelen = strlen(new_name);
onamelen = sdl->sdl_nlen;
/*
* Move the address if needed. This is safe because we
* allocate space for a name of length IFNAMSIZ when we
* create this in if_attach().
*/
if (namelen != onamelen) {
bcopy(sdl->sdl_data + onamelen,
sdl->sdl_data + namelen, sdl->sdl_alen);
}
bcopy(new_name, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
bzero(sdl->sdl_data, onamelen);
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
IF_ADDR_WUNLOCK(ifp);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
/* Announce the return of the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
ifp->if_flags &= ~IFF_RENAMING;
break;
#ifdef VIMAGE
case SIOCSIFVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error)
return (error);
error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
break;
#endif
case SIOCSIFMETRIC:
error = priv_check(td, PRIV_NET_SETIFMETRIC);
if (error)
return (error);
ifp->if_metric = ifr->ifr_metric;
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYS:
error = priv_check(td, PRIV_NET_SETIFPHYS);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
{
u_long oldmtu = ifp->if_mtu;
error = priv_check(td, PRIV_NET_SETIFMTU);
if (error)
return (error);
if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
return (EINVAL);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0) {
getmicrotime(&ifp->if_lastchange);
rt_ifmsg(ifp);
#ifdef INET
DEBUGNET_NOTIFY_MTU(ifp);
#endif
}
/*
* If the link MTU changed, do network layer specific procedure.
*/
if (ifp->if_mtu != oldmtu) {
#ifdef INET6
nd6_setmtu(ifp);
#endif
rt_updatemtu(ifp);
}
break;
}
case SIOCADDMULTI:
case SIOCDELMULTI:
if (cmd == SIOCADDMULTI)
error = priv_check(td, PRIV_NET_ADDMULTI);
else
error = priv_check(td, PRIV_NET_DELMULTI);
if (error)
return (error);
/* Don't allow group membership on non-multicast interfaces. */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
/* Don't let users screw up protocols' entries. */
if (ifr->ifr_addr.sa_family != AF_LINK)
return (EINVAL);
if (cmd == SIOCADDMULTI) {
struct epoch_tracker et;
struct ifmultiaddr *ifma;
/*
* Userland is only permitted to join groups once
* via the if_addmulti() KPI, because it cannot hold
* struct ifmultiaddr * between calls. It may also
* lose a race while we check if the membership
* already exists.
*/
NET_EPOCH_ENTER(et);
ifma = if_findmulti(ifp, &ifr->ifr_addr);
NET_EPOCH_EXIT(et);
if (ifma != NULL)
error = EADDRINUSE;
else
error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
} else {
error = if_delmulti(ifp, &ifr->ifr_addr);
}
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSIFMEDIA:
case SIOCSIFGENERIC:
error = priv_check(td, PRIV_NET_HWIOCTL);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCGIFSTATUS:
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
case SIOCGIFMEDIA:
case SIOCGIFXMEDIA:
case SIOCGIFGENERIC:
case SIOCGIFRSSKEY:
case SIOCGIFRSSHASH:
case SIOCGIFDOWNREASON:
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
break;
case SIOCSIFLLADDR:
error = priv_check(td, PRIV_NET_SETLLADDR);
if (error)
return (error);
error = if_setlladdr(ifp,
ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
break;
case SIOCGHWADDR:
error = if_gethwaddr(ifp, ifr);
break;
case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP):
error = priv_check(td, PRIV_NET_ADDIFGROUP);
if (error)
return (error);
if ((error = if_addgroup(ifp,
ifgr_group_get((struct ifgroupreq *)data))))
return (error);
break;
case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP):
{
struct epoch_tracker et;
NET_EPOCH_ENTER(et);
error = if_getgroup((struct ifgroupreq *)data, ifp);
NET_EPOCH_EXIT(et);
break;
}
case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP):
error = priv_check(td, PRIV_NET_DELIFGROUP);
if (error)
return (error);
if ((error = if_delgroup(ifp,
ifgr_group_get((struct ifgroupreq *)data))))
return (error);
break;
default:
error = ENOIOCTL;
break;
}
return (error);
}
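#if 0
/*
 * Illustrative sketch (hypothetical userland snippet, not kernel code):
 * exercising one of the requests handled above.  SIOCGIFMTU lands in the
 * SIOCGIFMTU case of ifhwioctl() and returns ifp->if_mtu in ifr_mtu.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
get_mtu(const char *ifname, int *mtu)
{
	struct ifreq ifr;
	int s;

	s = socket(AF_LOCAL, SOCK_DGRAM, 0);
	if (s < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCGIFMTU, &ifr) < 0) {
		close(s);
		return (-1);
	}
	*mtu = ifr.ifr_mtu;
	close(s);
	return (0);
}
#endif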
#ifdef COMPAT_FREEBSD32
struct ifconf32 {
int32_t ifc_len;
union {
uint32_t ifcu_buf;
uint32_t ifcu_req;
} ifc_ifcu;
};
#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32)
#endif
#ifdef COMPAT_FREEBSD32
static void
ifmr_init(struct ifmediareq *ifmr, caddr_t data)
{
struct ifmediareq32 *ifmr32;
ifmr32 = (struct ifmediareq32 *)data;
memcpy(ifmr->ifm_name, ifmr32->ifm_name,
sizeof(ifmr->ifm_name));
ifmr->ifm_current = ifmr32->ifm_current;
ifmr->ifm_mask = ifmr32->ifm_mask;
ifmr->ifm_status = ifmr32->ifm_status;
ifmr->ifm_active = ifmr32->ifm_active;
ifmr->ifm_count = ifmr32->ifm_count;
ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist;
}
static void
ifmr_update(const struct ifmediareq *ifmr, caddr_t data)
{
struct ifmediareq32 *ifmr32;
ifmr32 = (struct ifmediareq32 *)data;
ifmr32->ifm_current = ifmr->ifm_current;
ifmr32->ifm_mask = ifmr->ifm_mask;
ifmr32->ifm_status = ifmr->ifm_status;
ifmr32->ifm_active = ifmr->ifm_active;
ifmr32->ifm_count = ifmr->ifm_count;
}
#endif
/*
* Interface ioctls.
*/
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
#ifdef COMPAT_FREEBSD32
caddr_t saved_data = NULL;
struct ifmediareq ifmr;
struct ifmediareq *ifmrp = NULL;
#endif
struct ifnet *ifp;
struct ifreq *ifr;
int error;
int oif_flags;
#ifdef VIMAGE
bool shutdown;
#endif
CURVNET_SET(so->so_vnet);
#ifdef VIMAGE
/* Make sure the VNET is stable. */
shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet);
if (shutdown) {
CURVNET_RESTORE();
return (EBUSY);
}
#endif
switch (cmd) {
case SIOCGIFCONF:
error = ifconf(cmd, data);
goto out_noref;
#ifdef COMPAT_FREEBSD32
case SIOCGIFCONF32:
{
struct ifconf32 *ifc32;
struct ifconf ifc;
ifc32 = (struct ifconf32 *)data;
ifc.ifc_len = ifc32->ifc_len;
ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
error = ifconf(SIOCGIFCONF, (void *)&ifc);
if (error == 0)
ifc32->ifc_len = ifc.ifc_len;
goto out_noref;
}
#endif
}
#ifdef COMPAT_FREEBSD32
switch (cmd) {
case SIOCGIFMEDIA32:
case SIOCGIFXMEDIA32:
ifmrp = &ifmr;
ifmr_init(ifmrp, data);
cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
saved_data = data;
data = (caddr_t)ifmrp;
}
#endif
ifr = (struct ifreq *)data;
switch (cmd) {
#ifdef VIMAGE
case SIOCSIFRVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error == 0)
error = if_vmove_reclaim(td, ifr->ifr_name,
ifr->ifr_jid);
goto out_noref;
#endif
case SIOCIFCREATE:
case SIOCIFCREATE2:
error = priv_check(td, PRIV_NET_IFCREATE);
if (error == 0)
error = if_clone_create(ifr->ifr_name,
sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
ifr_data_get_ptr(ifr) : NULL);
goto out_noref;
case SIOCIFDESTROY:
error = priv_check(td, PRIV_NET_IFDESTROY);
if (error == 0)
error = if_clone_destroy(ifr->ifr_name);
goto out_noref;
case SIOCIFGCLONERS:
error = if_clone_list((struct if_clonereq *)data);
goto out_noref;
case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB):
error = if_getgroupmembers((struct ifgroupreq *)data);
goto out_noref;
#if defined(INET) || defined(INET6)
case SIOCSVH:
case SIOCGVH:
if (carp_ioctl_p == NULL)
error = EPROTONOSUPPORT;
else
error = (*carp_ioctl_p)(ifr, cmd, td);
goto out_noref;
#endif
}
ifp = ifunit_ref(ifr->ifr_name);
if (ifp == NULL) {
error = ENXIO;
goto out_noref;
}
error = ifhwioctl(cmd, ifp, data, td);
if (error != ENOIOCTL)
goto out_ref;
oif_flags = ifp->if_flags;
if (so->so_proto == NULL) {
error = EOPNOTSUPP;
goto out_ref;
}
/*
* Pass the request on to the socket control method, and if the
* latter returns EOPNOTSUPP, directly to the interface.
*
* Make an exception for the legacy SIOCSIF* requests. Drivers
* trust SIOCSIFADDR et al to come from an already privileged
* layer, and do not perform any credentials checks or input
* validation.
*/
error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
ifp, td));
if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
error = (*ifp->if_ioctl)(ifp, cmd, data);
if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
if (ifp->if_flags & IFF_UP)
in6_if_up(ifp);
#endif
}
out_ref:
if_rele(ifp);
out_noref:
#ifdef COMPAT_FREEBSD32
if (ifmrp != NULL) {
KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA),
("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx",
cmd));
data = saved_data;
ifmr_update(ifmrp, data);
}
#endif
CURVNET_RESTORE();
return (error);
}
/*
* The code common to handling reference counted flags,
* e.g., in ifpromisc() and if_allmulti().
* The "pflag" argument can specify a permanent mode flag to check,
* such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
*
* Only to be used on stack-owned flags, not driver-owned flags.
*/
static int
if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
{
struct ifreq ifr;
int error;
int oldflags, oldcount;
/* Sanity checks to catch programming errors */
KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
("%s: setting driver-owned flag %d", __func__, flag));
if (onswitch)
KASSERT(*refcount >= 0,
("%s: increment negative refcount %d for flag %d",
__func__, *refcount, flag));
else
KASSERT(*refcount > 0,
("%s: decrement non-positive refcount %d for flag %d",
__func__, *refcount, flag));
/* In case this mode is permanent, just touch refcount */
if (ifp->if_flags & pflag) {
*refcount += onswitch ? 1 : -1;
return (0);
}
/* Save ifnet parameters in case if_ioctl() fails */
oldcount = *refcount;
oldflags = ifp->if_flags;
/*
* See if we are not the only consumer and touching the refcount
* is enough.  Actually toggle the interface flag only if we are
* the first or last.
*/
if (onswitch) {
if ((*refcount)++)
return (0);
ifp->if_flags |= flag;
} else {
if (--(*refcount))
return (0);
ifp->if_flags &= ~flag;
}
/* Call down the driver since we've changed interface flags */
if (ifp->if_ioctl == NULL) {
error = EOPNOTSUPP;
goto recover;
}
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
if (error)
goto recover;
/* Notify userland that interface flags have changed */
rt_ifmsg(ifp);
return (0);
recover:
/* Recover after driver error */
*refcount = oldcount;
ifp->if_flags = oldflags;
return (error);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int error;
int oldflags = ifp->if_flags;
error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
&ifp->if_pcount, pswitch);
/* If promiscuous mode status has changed, log a message */
if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
log_promisc_mode_change)
if_printf(ifp, "promiscuous mode %s\n",
(ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
return (error);
}
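#if 0
/*
 * Illustrative sketch (hypothetical): the reference counting described
 * above lets independent consumers request promiscuous mode without
 * coordinating; only the first "on" and the last "off" reach the driver
 * via SIOCSIFFLAGS.  The "xx" consumer is made up.
 */
static int
xx_start_capture(struct ifnet *ifp)
{

	return (ifpromisc(ifp, 1));	/* first caller sets IFF_PROMISC */
}

static void
xx_stop_capture(struct ifnet *ifp)
{

	(void)ifpromisc(ifp, 0);	/* last caller clears IFF_PROMISC */
}
#endif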
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
struct ifconf *ifc = (struct ifconf *)data;
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr;
struct sbuf *sb;
int error, full = 0, valid_len, max_len;
- /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
- max_len = MAXPHYS - 1;
+ /* Limit initial buffer size to maxphys to avoid DoS from userspace. */
+ max_len = maxphys - 1;
/* Prevent hostile input from being able to crash the system */
if (ifc->ifc_len <= 0)
return (EINVAL);
again:
if (ifc->ifc_len <= max_len) {
max_len = ifc->ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
IFNET_RLOCK();
CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
struct epoch_tracker et;
int addrs;
/*
* Zero the ifr to make sure we don't disclose the contents
* of the stack.
*/
memset(&ifr, 0, sizeof(ifr));
if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
>= sizeof(ifr.ifr_name)) {
sbuf_delete(sb);
IFNET_RUNLOCK();
return (ENAMETOOLONG);
}
addrs = 0;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (prison_if(curthread->td_ucred, sa) != 0)
continue;
addrs++;
if (sa->sa_len <= sizeof(*sa)) {
if (sa->sa_len < sizeof(*sa)) {
memset(&ifr.ifr_ifru.ifru_addr, 0,
sizeof(ifr.ifr_ifru.ifru_addr));
memcpy(&ifr.ifr_ifru.ifru_addr, sa,
sa->sa_len);
} else
ifr.ifr_ifru.ifru_addr = *sa;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else {
sbuf_bcat(sb, &ifr,
offsetof(struct ifreq, ifr_addr));
max_len += offsetof(struct ifreq, ifr_addr);
sbuf_bcat(sb, sa, sa->sa_len);
max_len += sa->sa_len;
}
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
NET_EPOCH_EXIT(et);
if (addrs == 0) {
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
/*
* If we didn't allocate enough space (uncommon), try again. If
* we have already allocated as much space as we are allowed,
* return what we've got.
*/
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc->ifc_len = valid_len;
sbuf_finish(sb);
error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
sbuf_delete(sb);
return (error);
}
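#if 0
/*
 * Illustrative sketch (hypothetical userland snippet): the classic
 * SIOCGIFCONF buffer-sizing dance that the retry logic above serves.
 * The caller grows its buffer until the kernel reports a length clearly
 * smaller than what was offered.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdlib.h>

static int
dump_ifconf(int s)
{
	struct ifconf ifc;
	char *buf;
	size_t len;

	for (len = 4096;; len *= 2) {
		if ((buf = malloc(len)) == NULL)
			return (-1);
		ifc.ifc_len = (int)len;
		ifc.ifc_buf = buf;
		if (ioctl(s, SIOCGIFCONF, &ifc) < 0) {
			free(buf);
			return (-1);
		}
		/* A nearly full buffer may mean truncation; grow and retry. */
		if ((size_t)ifc.ifc_len + sizeof(struct ifreq) < len)
			break;
		free(buf);
	}
	/* ... walk the variable-length ifreq records in buf[0 .. ifc_len) ... */
	free(buf);
	return (0);
}
#endif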
/*
* Just like ifpromisc(), but for all-multicast-reception mode.
*/
int
if_allmulti(struct ifnet *ifp, int onswitch)
{
return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
}
struct ifmultiaddr *
if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
IF_ADDR_LOCK_ASSERT(ifp);
CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (sa->sa_family == AF_LINK) {
if (sa_dl_equal(ifma->ifma_addr, sa))
break;
} else {
if (sa_equal(ifma->ifma_addr, sa))
break;
}
}
return ifma;
}
/*
* Allocate a new ifmultiaddr and initialize based on passed arguments. We
* make copies of passed sockaddrs. The ifmultiaddr will not be added to
* the ifnet multicast address list here, so the caller must do that and
* other setup work (such as notifying the device driver). The reference
* count is initialized to 1.
*/
static struct ifmultiaddr *
if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
int mflags)
{
struct ifmultiaddr *ifma;
struct sockaddr *dupsa;
ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
M_ZERO);
if (ifma == NULL)
return (NULL);
dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(sa, dupsa, sa->sa_len);
ifma->ifma_addr = dupsa;
ifma->ifma_ifp = ifp;
ifma->ifma_refcount = 1;
ifma->ifma_protospec = NULL;
if (llsa == NULL) {
ifma->ifma_lladdr = NULL;
return (ifma);
}
dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(llsa, dupsa, llsa->sa_len);
ifma->ifma_lladdr = dupsa;
return (ifma);
}
/*
* if_freemulti: free ifmultiaddr structure and possibly attached related
* addresses. The caller is responsible for implementing reference
* counting, notifying the driver, handling routing messages, and releasing
* any dependent link layer state.
*/
#ifdef MCAST_VERBOSE
extern void kdb_backtrace(void);
#endif
static void
if_freemulti_internal(struct ifmultiaddr *ifma)
{
KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
ifma->ifma_refcount));
if (ifma->ifma_lladdr != NULL)
free(ifma->ifma_lladdr, M_IFMADDR);
#ifdef MCAST_VERBOSE
kdb_backtrace();
printf("%s freeing ifma: %p\n", __func__, ifma);
#endif
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
}
static void
if_destroymulti(epoch_context_t ctx)
{
struct ifmultiaddr *ifma;
ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx);
if_freemulti_internal(ifma);
}
void
if_freemulti(struct ifmultiaddr *ifma)
{
KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d",
ifma->ifma_refcount));
NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx);
}
/*
* Register an additional multicast address with a network interface.
*
* - If the address is already present, bump the reference count on the
* address and return.
* - If the address is not link-layer, look up a link layer address.
* - Allocate address structures for one or both addresses, and attach to the
* multicast address list on the interface. If automatically adding a link
* layer address, the protocol address will own a reference to the link
* layer address, to be freed when it is freed.
* - Notify the network device driver of an addition to the multicast address
* list.
*
* 'sa' points to caller-owned memory with the desired multicast address.
*
* 'retifma' will be used to return a pointer to the resulting multicast
* address reference, if desired.
*/
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
struct ifmultiaddr **retifma)
{
struct ifmultiaddr *ifma, *ll_ifma;
struct sockaddr *llsa;
struct sockaddr_dl sdl;
int error;
#ifdef INET
IN_MULTI_LIST_UNLOCK_ASSERT();
#endif
#ifdef INET6
IN6_MULTI_LIST_UNLOCK_ASSERT();
#endif
/*
* If the address is already present, return a new reference to it;
* otherwise, allocate storage and set up a new address.
*/
IF_ADDR_WLOCK(ifp);
ifma = if_findmulti(ifp, sa);
if (ifma != NULL) {
ifma->ifma_refcount++;
if (retifma != NULL)
*retifma = ifma;
IF_ADDR_WUNLOCK(ifp);
return (0);
}
/*
* The address isn't already present; resolve the protocol address
* into a link layer address, and then look that up, bump its
* refcount or allocate an ifma for that also.
* Most link layer resolving functions return address data which
* fits inside the default sockaddr_dl structure.  However, the
* callback can allocate another sockaddr structure; in that case
* we need to free it later.
*/
llsa = NULL;
ll_ifma = NULL;
if (ifp->if_resolvemulti != NULL) {
/* Provide called function with buffer size information */
sdl.sdl_len = sizeof(sdl);
llsa = (struct sockaddr *)&sdl;
error = ifp->if_resolvemulti(ifp, &llsa, sa);
if (error)
goto unlock_out;
}
/*
* Allocate the new address. Don't hook it up yet, as we may also
* need to allocate a link layer multicast address.
*/
ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
if (ifma == NULL) {
error = ENOMEM;
goto free_llsa_out;
}
/*
* If a link layer address is found, we'll need to see if it's
* already present in the address list, or allocate it as well.
* When this block finishes, the link layer address will be on the
* list.
*/
if (llsa != NULL) {
ll_ifma = if_findmulti(ifp, llsa);
if (ll_ifma == NULL) {
ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
if (ll_ifma == NULL) {
--ifma->ifma_refcount;
if_freemulti(ifma);
error = ENOMEM;
goto free_llsa_out;
}
ll_ifma->ifma_flags |= IFMA_F_ENQUEUED;
CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
} else
ll_ifma->ifma_refcount++;
ifma->ifma_llifma = ll_ifma;
}
/*
* We now have a new multicast address, ifma, and possibly a new or
* referenced link layer address. Add the primary address to the
* ifnet address list.
*/
ifma->ifma_flags |= IFMA_F_ENQUEUED;
CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
if (retifma != NULL)
*retifma = ifma;
/*
* Must generate the message while holding the lock so that 'ifma'
* pointer is still valid.
*/
rt_newmaddrmsg(RTM_NEWMADDR, ifma);
IF_ADDR_WUNLOCK(ifp);
/*
* We are certain we have added something, so call down to the
* interface to let them know about it.
*/
if (ifp->if_ioctl != NULL) {
if (THREAD_CAN_SLEEP())
(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
else
taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask);
}
if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
link_free_sdl(llsa);
return (0);
free_llsa_out:
if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
link_free_sdl(llsa);
unlock_out:
IF_ADDR_WUNLOCK(ifp);
return (error);
}
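#if 0
/*
 * Illustrative sketch (hypothetical): joining an Ethernet multicast group
 * directly by link-layer address, much like the SIOCADDMULTI handler in
 * ifhwioctl() above does.  The group address is just an example value and
 * a 6-byte Ethernet address is assumed.
 */
static int
xx_join_group(struct ifnet *ifp)
{
	static const uint8_t group[6] =
	    { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };	/* example address */
	struct sockaddr_dl sdl;
	struct ifmultiaddr *ifma;

	memset(&sdl, 0, sizeof(sdl));
	sdl.sdl_len = sizeof(sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_index = ifp->if_index;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = sizeof(group);
	memcpy(LLADDR(&sdl), group, sizeof(group));

	return (if_addmulti(ifp, (struct sockaddr *)&sdl, &ifma));
}
#endif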
static void
if_siocaddmulti(void *arg, int pending)
{
struct ifnet *ifp;
ifp = arg;
#ifdef DIAGNOSTIC
if (pending > 1)
if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending);
#endif
CURVNET_SET(ifp->if_vnet);
(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
CURVNET_RESTORE();
}
/*
* Delete a multicast group membership by network-layer group address.
*
* Returns ENOENT if the entry could not be found. If ifp no longer
* exists, results are undefined. This entry point should only be used
* from subsystems which do appropriate locking to hold ifp for the
* duration of the call.
* Network-layer protocol domains must use if_delmulti_ifma().
*/
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
int lastref;
KASSERT(ifp, ("%s: NULL ifp", __func__));
IF_ADDR_WLOCK(ifp);
lastref = 0;
ifma = if_findmulti(ifp, sa);
if (ifma != NULL)
lastref = if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_WUNLOCK(ifp);
if (ifma == NULL)
return (ENOENT);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
return (0);
}
/*
* Delete all multicast group membership for an interface.
* Should be used to quickly flush all multicast filters.
*/
void
if_delallmulti(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_WLOCK(ifp);
CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_WUNLOCK(ifp);
}
void
if_delmulti_ifma(struct ifmultiaddr *ifma)
{
if_delmulti_ifma_flags(ifma, 0);
}
/*
* Delete a multicast group membership by group membership pointer.
* Network-layer protocol domains must use this routine.
*
* It is safe to call this routine if the ifp disappeared.
*/
void
if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags)
{
struct ifnet *ifp;
int lastref;
MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma);
#ifdef INET
IN_MULTI_LIST_UNLOCK_ASSERT();
#endif
ifp = ifma->ifma_ifp;
#ifdef DIAGNOSTIC
if (ifp == NULL) {
printf("%s: ifma_ifp seems to be detached\n", __func__);
} else {
struct epoch_tracker et;
struct ifnet *oifp;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
NET_EPOCH_EXIT(et);
if (ifp != oifp)
ifp = NULL;
}
#endif
/*
* If and only if the ifnet instance exists: Acquire the address lock.
*/
if (ifp != NULL)
IF_ADDR_WLOCK(ifp);
lastref = if_delmulti_locked(ifp, ifma, flags);
if (ifp != NULL) {
/*
* If and only if the ifnet instance exists:
* Release the address lock.
* If the group was left: update the hardware hash filter.
*/
IF_ADDR_WUNLOCK(ifp);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
}
}
/*
* Perform deletion of network-layer and/or link-layer multicast address.
*
* Return 0 if the reference count was decremented.
* Return 1 if the final reference was released, indicating that the
* hardware hash filter should be reprogrammed.
*/
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
struct ifmultiaddr *ll_ifma;
if (ifp != NULL && ifma->ifma_ifp != NULL) {
KASSERT(ifma->ifma_ifp == ifp,
("%s: inconsistent ifp %p", __func__, ifp));
IF_ADDR_WLOCK_ASSERT(ifp);
}
ifp = ifma->ifma_ifp;
MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");
/*
* If the ifnet is detaching, null out references to ifnet,
* so that upper protocol layers will notice, and not attempt
* to obtain locks for an ifnet which no longer exists. The
* routing socket announcement must happen before the ifnet
* instance is detached from the system.
*/
if (detaching) {
#ifdef DIAGNOSTIC
printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
/*
* ifp may already be nulled out if we are being reentered
* to delete the ll_ifma.
*/
if (ifp != NULL) {
rt_newmaddrmsg(RTM_DELMADDR, ifma);
ifma->ifma_ifp = NULL;
}
}
if (--ifma->ifma_refcount > 0)
return 0;
if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) {
CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
}
/*
* If this ifma is a network-layer ifma, a link-layer ifma may
* have been associated with it. Release it first if so.
*/
ll_ifma = ifma->ifma_llifma;
if (ll_ifma != NULL) {
KASSERT(ifma->ifma_lladdr != NULL,
("%s: llifma w/o lladdr", __func__));
if (detaching)
ll_ifma->ifma_ifp = NULL; /* XXX */
if (--ll_ifma->ifma_refcount == 0) {
if (ifp != NULL) {
if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
ifma_link);
ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
}
}
if_freemulti(ll_ifma);
}
}
#ifdef INVARIANTS
if (ifp) {
struct ifmultiaddr *ifmatmp;
CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
MPASS(ifma != ifmatmp);
}
#endif
if_freemulti(ifma);
/*
* The last reference to this instance of struct ifmultiaddr
* was released; the hardware should be notified of this change.
*/
return 1;
}
/*
* Set the link layer address on an interface.
*
* At this time we only support certain types of interfaces,
* and we don't allow the length of the address to change.
*
* Set noinline to be dtrace-friendly
*/
__noinline int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
struct ifreq ifr;
ifa = ifp->if_addr;
if (ifa == NULL)
return (EINVAL);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if (sdl == NULL)
return (EINVAL);
if (len != sdl->sdl_alen) /* don't allow length to change */
return (EINVAL);
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_XETHER:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_IEEE8023ADLAG:
bcopy(lladdr, LLADDR(sdl), len);
break;
default:
return (ENODEV);
}
/*
* If the interface is already up, we need
* to re-init it in order to reprogram its
* address filter.
*/
if ((ifp->if_flags & IFF_UP) != 0) {
if (ifp->if_ioctl) {
ifp->if_flags &= ~IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
ifp->if_flags |= IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
}
}
EVENTHANDLER_INVOKE(iflladdr_event, ifp);
return (0);
}
/*
* Compat function for handling basic encapsulation requests.
* Unconverted stacks (FDDI, IB, ..) support the traditional
* output model: ARP (and other similar L2 protocols) is handled
* inside the output routine, and arpresolve()/nd6_resolve() return
* the MAC address instead of a full prepend.
*
* This function creates calculated header==MAC for IPv4/IPv6 and
* returns EAFNOSUPPORT (which is then handled in ARP code) for other
* address families.
*/
static int
if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
{
if (req->rtype != IFENCAP_LL)
return (EOPNOTSUPP);
if (req->bufsize < req->lladdr_len)
return (ENOMEM);
switch (req->family) {
case AF_INET:
case AF_INET6:
break;
default:
return (EAFNOSUPPORT);
}
/* Copy lladdr to storage as is */
memmove(req->buf, req->lladdr, req->lladdr_len);
req->bufsize = req->lladdr_len;
req->lladdr_off = 0;
return (0);
}
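#if 0
/*
 * Illustrative sketch (hypothetical): how a caller in the ARP/ND path
 * might ask an interface for its L2 encapsulation, with the default
 * handler above serving unconverted stacks.  Only the if_encap_req
 * fields referenced by if_requestencap_default() are filled in; the
 * helper name and exact field types are assumptions.
 */
static int
xx_get_encap(struct ifnet *ifp, int family, char *lladdr, int lladdr_len,
    u_char *buf, size_t buflen)
{
	struct if_encap_req ereq;

	memset(&ereq, 0, sizeof(ereq));
	ereq.rtype = IFENCAP_LL;
	ereq.family = family;
	ereq.lladdr = lladdr;
	ereq.lladdr_len = lladdr_len;
	ereq.buf = buf;
	ereq.bufsize = buflen;

	/* On success, ereq.bufsize and ereq.lladdr_off describe the header. */
	return (ifp->if_requestencap(ifp, &ereq));
}
#endif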
/*
* Tunnel interfaces can nest, and when misconfigured they may cause
* infinite recursion.  We prevent this by detecting loops.  A high
* nesting level may also exhaust the stack; we prevent this by
* imposing an upper limit.
*
* Return 0 if the tunnel nesting count is less than or equal to the
* limit.
*/
int
if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie,
int limit)
{
struct m_tag *mtag;
int count;
count = 1;
mtag = NULL;
while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) {
if (*(struct ifnet **)(mtag + 1) == ifp) {
log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
return (EIO);
}
count++;
}
if (count > limit) {
log(LOG_NOTICE,
"%s: if_output recursively called too many times(%d)\n",
if_name(ifp), count);
return (EIO);
}
mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT);
if (mtag == NULL)
return (ENOMEM);
*(struct ifnet **)(mtag + 1) = ifp;
m_tag_prepend(m, mtag);
return (0);
}
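#if 0
/*
 * Illustrative sketch (hypothetical): a tunnel output routine guarding
 * against nesting loops before re-entering the stack.  MTAG_XX_NESTING
 * and XX_MAX_NEST are made-up names; real tunnels (gif, gre, ...) use
 * their own m_tag cookie and a configurable limit.
 */
#define	MTAG_XX_NESTING	0x58580001	/* hypothetical m_tag cookie */
#define	XX_MAX_NEST	3

static int
xx_tunnel_output(struct ifnet *ifp, struct mbuf *m)
{
	int error;

	error = if_tunnel_check_nesting(ifp, m, MTAG_XX_NESTING, XX_MAX_NEST);
	if (error != 0) {
		m_freem(m);
		return (error);
	}
	/* ... encapsulate and hand the packet back to the stack ... */
	return (0);
}
#endif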
/*
* Get the link layer address that was read from the hardware at attach.
*
* This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type
* their component interfaces as IFT_IEEE8023ADLAG.
*/
int
if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr)
{
if (ifp->if_hw_addr == NULL)
return (ENODEV);
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_IEEE8023ADLAG:
bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen);
return (0);
default:
return (ENODEV);
}
}
/*
* The name argument must be a pointer to storage which will last as
* long as the interface does. For physical devices, the result of
* device_get_name(dev) is a good choice and for pseudo-devices a
* static string works well.
*/
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
ifp->if_dname = name;
ifp->if_dunit = unit;
if (unit != IF_DUNIT_NONE)
snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
else
strlcpy(ifp->if_xname, name, IFNAMSIZ);
}
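#if 0
/*
 * Illustrative sketch (hypothetical helper): typical attach-time usage.
 * For a real device the name comes from device_get_name(), whose storage
 * outlives the ifnet and so satisfies the lifetime requirement above.
 */
static void
xx_attach_name(device_t dev, struct ifnet *ifp)
{

	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
}
#endif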
int
if_printf(struct ifnet *ifp, const char *fmt, ...)
{
char if_fmt[256];
va_list ap;
snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt);
va_start(ap, fmt);
vlog(LOG_INFO, if_fmt, ap);
va_end(ap);
return (0);
}
void
if_start(struct ifnet *ifp)
{
(*(ifp)->if_start)(ifp);
}
/*
* Backwards compatibility interface for drivers
* that have not implemented it
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
IFQ_HANDOFF(ifp, m, error);
return (error);
}
static void
if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
{
m_freem(m);
}
int
if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
{
int active = 0;
IF_LOCK(ifq);
if (_IF_QFULL(ifq)) {
IF_UNLOCK(ifq);
if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
m_freem(m);
return (0);
}
if (ifp != NULL) {
if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
if (m->m_flags & (M_BCAST|M_MCAST))
if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
}
_IF_ENQUEUE(ifq, m);
IF_UNLOCK(ifq);
if (ifp != NULL && !active)
(*(ifp)->if_start)(ifp);
return (1);
}
void
if_register_com_alloc(u_char type,
if_com_alloc_t *a, if_com_free_t *f)
{
KASSERT(if_com_alloc[type] == NULL,
("if_register_com_alloc: %d already registered", type));
KASSERT(if_com_free[type] == NULL,
("if_register_com_alloc: %d free already registered", type));
if_com_alloc[type] = a;
if_com_free[type] = f;
}
void
if_deregister_com_alloc(u_char type)
{
KASSERT(if_com_alloc[type] != NULL,
("if_deregister_com_alloc: %d not registered", type));
KASSERT(if_com_free[type] != NULL,
("if_deregister_com_alloc: %d free not registered", type));
if_com_alloc[type] = NULL;
if_com_free[type] = NULL;
}
/* API for driver access to network stack owned ifnet.*/
uint64_t
if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
{
uint64_t oldbrate;
oldbrate = ifp->if_baudrate;
ifp->if_baudrate = baudrate;
return (oldbrate);
}
uint64_t
if_getbaudrate(if_t ifp)
{
return (((struct ifnet *)ifp)->if_baudrate);
}
int
if_setcapabilities(if_t ifp, int capabilities)
{
((struct ifnet *)ifp)->if_capabilities = capabilities;
return (0);
}
int
if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
{
((struct ifnet *)ifp)->if_capabilities |= setbit;
((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
return (0);
}
int
if_getcapabilities(if_t ifp)
{
return ((struct ifnet *)ifp)->if_capabilities;
}
int
if_setcapenable(if_t ifp, int capabilities)
{
((struct ifnet *)ifp)->if_capenable = capabilities;
return (0);
}
int
if_setcapenablebit(if_t ifp, int setcap, int clearcap)
{
if(setcap)
((struct ifnet *)ifp)->if_capenable |= setcap;
if(clearcap)
((struct ifnet *)ifp)->if_capenable &= ~clearcap;
return (0);
}
const char *
if_getdname(if_t ifp)
{
return ((struct ifnet *)ifp)->if_dname;
}
int
if_togglecapenable(if_t ifp, int togglecap)
{
((struct ifnet *)ifp)->if_capenable ^= togglecap;
return (0);
}
int
if_getcapenable(if_t ifp)
{
return ((struct ifnet *)ifp)->if_capenable;
}
/*
* This is largely undesirable because it ties ifnet to a device, but does
* provide flexibility for an embedded product vendor. Should be used with
* the understanding that it violates the interface boundaries, and should be
* a last resort only.
*/
int
if_setdev(if_t ifp, void *dev)
{
return (0);
}
int
if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
{
((struct ifnet *)ifp)->if_drv_flags |= set_flags;
((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
return (0);
}
int
if_getdrvflags(if_t ifp)
{
return ((struct ifnet *)ifp)->if_drv_flags;
}
int
if_setdrvflags(if_t ifp, int flags)
{
((struct ifnet *)ifp)->if_drv_flags = flags;
return (0);
}
int
if_setflags(if_t ifp, int flags)
{
ifp->if_flags = flags;
return (0);
}
int
if_setflagbits(if_t ifp, int set, int clear)
{
((struct ifnet *)ifp)->if_flags |= set;
((struct ifnet *)ifp)->if_flags &= ~clear;
return (0);
}
int
if_getflags(if_t ifp)
{
return ((struct ifnet *)ifp)->if_flags;
}
int
if_clearhwassist(if_t ifp)
{
((struct ifnet *)ifp)->if_hwassist = 0;
return (0);
}
int
if_sethwassistbits(if_t ifp, int toset, int toclear)
{
((struct ifnet *)ifp)->if_hwassist |= toset;
((struct ifnet *)ifp)->if_hwassist &= ~toclear;
return (0);
}
int
if_sethwassist(if_t ifp, int hwassist_bit)
{
((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
return (0);
}
int
if_gethwassist(if_t ifp)
{
return ((struct ifnet *)ifp)->if_hwassist;
}
int
if_setmtu(if_t ifp, int mtu)
{
((struct ifnet *)ifp)->if_mtu = mtu;
return (0);
}
int
if_getmtu(if_t ifp)
{
return ((struct ifnet *)ifp)->if_mtu;
}
int
if_getmtu_family(if_t ifp, int family)
{
struct domain *dp;
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_family == family && dp->dom_ifmtu != NULL)
return (dp->dom_ifmtu((struct ifnet *)ifp));
}
return (((struct ifnet *)ifp)->if_mtu);
}
/*
* Methods for drivers to access interface unicast and multicast
* link level addresses.  Drivers should not know about 'struct ifaddr'
* or 'struct ifmultiaddr'.
*/
u_int
if_lladdr_count(if_t ifp)
{
struct epoch_tracker et;
struct ifaddr *ifa;
u_int count;
count = 0;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_LINK)
count++;
NET_EPOCH_EXIT(et);
return (count);
}
u_int
if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
{
struct epoch_tracker et;
struct ifaddr *ifa;
u_int count;
MPASS(cb);
count = 0;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_LINK)
continue;
count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr,
count);
}
NET_EPOCH_EXIT(et);
return (count);
}
u_int
if_llmaddr_count(if_t ifp)
{
struct epoch_tracker et;
struct ifmultiaddr *ifma;
int count;
count = 0;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
if (ifma->ifma_addr->sa_family == AF_LINK)
count++;
NET_EPOCH_EXIT(et);
return (count);
}
u_int
if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
{
struct epoch_tracker et;
struct ifmultiaddr *ifma;
u_int count;
MPASS(cb);
count = 0;
NET_EPOCH_ENTER(et);
CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_LINK)
continue;
count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr,
count);
}
NET_EPOCH_EXIT(et);
return (count);
}
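#if 0
/*
 * Illustrative sketch (hypothetical): a driver programming its multicast
 * filter through the iterator above, without peeking into struct
 * ifmultiaddr.  The toy hash, the 6-byte Ethernet address assumption, and
 * the "xx" names are all made up.
 */
static u_int
xx_hash_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	uint32_t *hash = arg;
	const uint8_t *addr = (const uint8_t *)LLADDR(sdl);

	/* Toy hash for illustration: bucket by the last address byte. */
	hash[addr[5] & 3] |= 1u << ((addr[5] >> 2) & 31);
	return (1);
}

static void
xx_setmulti(struct ifnet *ifp)
{
	uint32_t hash[4] = { 0, 0, 0, 0 };

	(void)if_foreach_llmaddr(ifp, xx_hash_maddr, hash);
	/* ... write 'hash' into the hypothetical device's filter registers ... */
}
#endif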
int
if_setsoftc(if_t ifp, void *softc)
{
((struct ifnet *)ifp)->if_softc = softc;
return (0);
}
void *
if_getsoftc(if_t ifp)
{
return ((struct ifnet *)ifp)->if_softc;
}
void
if_setrcvif(struct mbuf *m, if_t ifp)
{
MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
m->m_pkthdr.rcvif = (struct ifnet *)ifp;
}
void
if_setvtag(struct mbuf *m, uint16_t tag)
{
m->m_pkthdr.ether_vtag = tag;
}
uint16_t
if_getvtag(struct mbuf *m)
{
return (m->m_pkthdr.ether_vtag);
}
int
if_sendq_empty(if_t ifp)
{
return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
}
struct ifaddr *
if_getifaddr(if_t ifp)
{
return ((struct ifnet *)ifp)->if_addr;
}
int
if_getamcount(if_t ifp)
{
return ((struct ifnet *)ifp)->if_amcount;
}
int
if_setsendqready(if_t ifp)
{
IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
return (0);
}
int
if_setsendqlen(if_t ifp, int tx_desc_count)
{
IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
return (0);
}
int
if_vlantrunkinuse(if_t ifp)
{
return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
}
int
if_input(if_t ifp, struct mbuf* sendmp)
{
(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
return (0);
}
struct mbuf *
if_dequeue(if_t ifp)
{
struct mbuf *m;
IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
return (m);
}
int
if_sendq_prepend(if_t ifp, struct mbuf *m)
{
IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
return (0);
}
int
if_setifheaderlen(if_t ifp, int len)
{
((struct ifnet *)ifp)->if_hdrlen = len;
return (0);
}
caddr_t
if_getlladdr(if_t ifp)
{
return (IF_LLADDR((struct ifnet *)ifp));
}
void *
if_gethandle(u_char type)
{
return (if_alloc(type));
}
void
if_bpfmtap(if_t ifh, struct mbuf *m)
{
struct ifnet *ifp = (struct ifnet *)ifh;
BPF_MTAP(ifp, m);
}
void
if_etherbpfmtap(if_t ifh, struct mbuf *m)
{
struct ifnet *ifp = (struct ifnet *)ifh;
ETHER_BPF_MTAP(ifp, m);
}
void
if_vlancap(if_t ifh)
{
struct ifnet *ifp = (struct ifnet *)ifh;
VLAN_CAPABILITIES(ifp);
}
int
if_sethwtsomax(if_t ifp, u_int if_hw_tsomax)
{
((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax;
return (0);
}
int
if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount)
{
((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount;
return (0);
}
int
if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
{
((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
return (0);
}
u_int
if_gethwtsomax(if_t ifp)
{
return (((struct ifnet *)ifp)->if_hw_tsomax);
}
u_int
if_gethwtsomaxsegcount(if_t ifp)
{
return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount);
}
u_int
if_gethwtsomaxsegsize(if_t ifp)
{
return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize);
}
void
if_setinitfn(if_t ifp, void (*init_fn)(void *))
{
((struct ifnet *)ifp)->if_init = init_fn;
}
void
if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
{
((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
}
void
if_setstartfn(if_t ifp, void (*start_fn)(if_t))
{
((struct ifnet *)ifp)->if_start = (void *)start_fn;
}
void
if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
{
((struct ifnet *)ifp)->if_transmit = start_fn;
}
void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
{
((struct ifnet *)ifp)->if_qflush = flush_fn;
}
void
if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
{
ifp->if_get_counter = fn;
}
/* Revisit these - these were originally inline functions. */
int
drbr_inuse_drv(if_t ifh, struct buf_ring *br)
{
return drbr_inuse(ifh, br);
}
struct mbuf*
drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
{
return drbr_dequeue(ifh, br);
}
int
drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
{
return drbr_needs_enqueue(ifh, br);
}
int
drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
{
return drbr_enqueue(ifh, br, m);
}
diff --git a/sys/powerpc/mambo/mambo_disk.c b/sys/powerpc/mambo/mambo_disk.c
index 3a9a53a581b6..935aeab636bc 100644
--- a/sys/powerpc/mambo/mambo_disk.c
+++ b/sys/powerpc/mambo/mambo_disk.c
@@ -1,279 +1,279 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Nathan Whitehorn. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <geom/geom_disk.h>
#include <powerpc/mambo/mambocall.h>
struct mambodisk_softc {
device_t dev;
struct mtx sc_mtx;
struct disk *disk;
struct proc *p;
struct bio_queue_head bio_queue;
int running;
int maxblocks;
};
#define MAMBO_DISK_READ 116
#define MAMBO_DISK_WRITE 117
#define MAMBO_DISK_INFO 118
#define MAMBO_INFO_STATUS 1
#define MAMBO_INFO_BLKSZ 2
#define MAMBO_INFO_DEVSZ 3
/* bus entry points */
static void mambodisk_identify(driver_t *driver, device_t parent);
static int mambodisk_probe(device_t dev);
static int mambodisk_attach(device_t dev);
/* disk routines */
static int mambodisk_open(struct disk *dp);
static int mambodisk_close(struct disk *dp);
static void mambodisk_strategy(struct bio *bp);
static void mambodisk_task(void *arg);
#define MBODISK_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define MBODISK_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define MBODISK_LOCK_INIT(_sc) \
mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->dev), \
"mambodisk", MTX_DEF)
#define MBODISK_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx);
#define MBODISK_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED);
#define MBODISK_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
static void
mambodisk_identify(driver_t *driver, device_t parent)
{
int i = 0;
for (i = 0; mambocall(MAMBO_DISK_INFO,MAMBO_INFO_DEVSZ,i) > 0; i++)
BUS_ADD_CHILD(parent,0,"mambodisk",i);
}
static int
mambodisk_probe(device_t dev)
{
device_set_desc(dev, "Mambo Simulated Block Device");
return (0);
}
static int
mambodisk_attach(device_t dev)
{
struct mambodisk_softc *sc;
struct disk *d;
intmax_t mb;
char unit;
sc = device_get_softc(dev);
sc->dev = dev;
MBODISK_LOCK_INIT(sc);
d = sc->disk = disk_alloc();
d->d_open = mambodisk_open;
d->d_close = mambodisk_close;
d->d_strategy = mambodisk_strategy;
d->d_name = "mambodisk";
d->d_drv1 = sc;
- d->d_maxsize = MAXPHYS; /* Maybe ask bridge? */
+ d->d_maxsize = maxphys; /* Maybe ask bridge? */
d->d_sectorsize = 512;
sc->maxblocks = mambocall(MAMBO_DISK_INFO,MAMBO_INFO_BLKSZ,d->d_unit)
/ 512;
d->d_unit = device_get_unit(dev);
d->d_mediasize = mambocall(MAMBO_DISK_INFO,MAMBO_INFO_DEVSZ,d->d_unit)
* 1024ULL; /* Mambo gives size in KB */
mb = d->d_mediasize >> 20; /* 1MiB == 1 << 20 */
unit = 'M';
if (mb >= 10240) { /* 1GiB = 1024 MiB */
unit = 'G';
mb /= 1024;
}
device_printf(dev, "%ju%cB, %d byte sectors\n", mb, unit,
d->d_sectorsize);
disk_create(d, DISK_VERSION);
bioq_init(&sc->bio_queue);
sc->running = 1;
kproc_create(&mambodisk_task, sc, &sc->p, 0, 0, "task: mambo hd");
return (0);
}
static int
mambodisk_detach(device_t dev)
{
struct mambodisk_softc *sc = device_get_softc(dev);
/* kill thread */
MBODISK_LOCK(sc);
sc->running = 0;
wakeup(sc);
MBODISK_UNLOCK(sc);
/* wait for thread to finish. XXX probably want timeout. -sorbo */
MBODISK_LOCK(sc);
while (sc->running != -1)
msleep(sc, &sc->sc_mtx, PRIBIO, "detach", 0);
MBODISK_UNLOCK(sc);
/* kill disk */
disk_destroy(sc->disk);
/* XXX destroy anything in queue */
MBODISK_LOCK_DESTROY(sc);
return (0);
}
static int
mambodisk_open(struct disk *dp)
{
return (0);
}
static int
mambodisk_close(struct disk *dp)
{
return (0);
}
static void
mambodisk_strategy(struct bio *bp)
{
struct mambodisk_softc *sc;
sc = (struct mambodisk_softc *)bp->bio_disk->d_drv1;
MBODISK_LOCK(sc);
bioq_disksort(&sc->bio_queue, bp);
wakeup(sc);
MBODISK_UNLOCK(sc);
}
static void
mambodisk_task(void *arg)
{
struct mambodisk_softc *sc = (struct mambodisk_softc*)arg;
struct bio *bp;
size_t sz;
int result;
daddr_t block, end;
device_t dev;
u_long unit;
dev = sc->dev;
unit = device_get_unit(dev);
while (sc->running) {
MBODISK_LOCK(sc);
do {
bp = bioq_first(&sc->bio_queue);
if (bp == NULL)
msleep(sc, &sc->sc_mtx, PRIBIO, "jobqueue", 0);
} while (bp == NULL && sc->running);
if (bp)
bioq_remove(&sc->bio_queue, bp);
MBODISK_UNLOCK(sc);
if (!sc->running)
break;
sz = sc->disk->d_sectorsize;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
for (block = bp->bio_pblkno; block < end;) {
u_long numblocks;
char *vaddr = bp->bio_data +
(block - bp->bio_pblkno) * sz;
numblocks = end - block;
if (numblocks > sc->maxblocks)
numblocks = sc->maxblocks;
if (bp->bio_cmd == BIO_READ) {
result = mambocall(MAMBO_DISK_READ, vaddr,
(u_long)block, (numblocks << 16) | unit);
} else if (bp->bio_cmd == BIO_WRITE) {
result = mambocall(MAMBO_DISK_WRITE, vaddr,
(u_long)block, (numblocks << 16) | unit);
} else {
result = 1;
}
if (result)
break;
block += numblocks;
}
if (block < end) {
bp->bio_error = EIO;
bp->bio_resid = (end - block) * sz;
bp->bio_flags |= BIO_ERROR;
}
biodone(bp);
}
/* tell parent we're done */
MBODISK_LOCK(sc);
sc->running = -1;
wakeup(sc);
MBODISK_UNLOCK(sc);
kproc_exit(0);
}
static device_method_t mambodisk_methods[] = {
DEVMETHOD(device_identify, mambodisk_identify),
DEVMETHOD(device_probe, mambodisk_probe),
DEVMETHOD(device_attach, mambodisk_attach),
DEVMETHOD(device_detach, mambodisk_detach),
{0, 0},
};
static driver_t mambodisk_driver = {
"mambodisk",
mambodisk_methods,
sizeof(struct mambodisk_softc),
};
static devclass_t mambodisk_devclass;
DRIVER_MODULE(mambodisk, mambo, mambodisk_driver, mambodisk_devclass, 0, 0);
diff --git a/sys/powerpc/mpc85xx/fsl_sata.c b/sys/powerpc/mpc85xx/fsl_sata.c
index 64fe1cafa38a..49e3846e79ca 100644
--- a/sys/powerpc/mpc85xx/fsl_sata.c
+++ b/sys/powerpc/mpc85xx/fsl_sata.c
@@ -1,1896 +1,1896 @@
/*-
* Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2017 Justin Hibbits <jhibbits@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/endian.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include "fsl_sata.h"
struct fsl_sata_channel;
struct fsl_sata_slot;
enum fsl_sata_err_type;
struct fsl_sata_cmd_tab;
/* local prototypes */
static int fsl_sata_init(device_t dev);
static int fsl_sata_deinit(device_t dev);
static int fsl_sata_suspend(device_t dev);
static int fsl_sata_resume(device_t dev);
static void fsl_sata_pm(void *arg);
static void fsl_sata_intr(void *arg);
static void fsl_sata_intr_main(struct fsl_sata_channel *ch, uint32_t istatus);
static void fsl_sata_begin_transaction(struct fsl_sata_channel *ch, union ccb *ccb);
static void fsl_sata_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
static void fsl_sata_execute_transaction(struct fsl_sata_slot *slot);
static void fsl_sata_timeout(void *arg);
static void fsl_sata_end_transaction(struct fsl_sata_slot *slot, enum fsl_sata_err_type et);
static int fsl_sata_setup_fis(struct fsl_sata_channel *ch, struct fsl_sata_cmd_tab *ctp, union ccb *ccb, int tag);
static void fsl_sata_dmainit(device_t dev);
static void fsl_sata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
static void fsl_sata_dmafini(device_t dev);
static void fsl_sata_slotsalloc(device_t dev);
static void fsl_sata_slotsfree(device_t dev);
static void fsl_sata_reset(struct fsl_sata_channel *ch);
static void fsl_sata_start(struct fsl_sata_channel *ch);
static void fsl_sata_stop(struct fsl_sata_channel *ch);
static void fsl_sata_issue_recovery(struct fsl_sata_channel *ch);
static void fsl_sata_process_read_log(struct fsl_sata_channel *ch, union ccb *ccb);
static void fsl_sata_process_request_sense(struct fsl_sata_channel *ch, union ccb *ccb);
static void fsl_sataaction(struct cam_sim *sim, union ccb *ccb);
static void fsl_satapoll(struct cam_sim *sim);
static MALLOC_DEFINE(M_FSL_SATA, "FSL SATA driver", "FSL SATA driver data buffers");
#define recovery_type spriv_field0
#define RECOVERY_NONE 0
#define RECOVERY_READ_LOG 1
#define RECOVERY_REQUEST_SENSE 2
#define recovery_slot spriv_field1
#define FSL_SATA_P_CQR 0x0
#define FSL_SATA_P_CAR 0x4
#define FSL_SATA_P_CCR 0x10
#define FSL_SATA_P_CER 0x18
#define FSL_SATA_P_DER 0x20
#define FSL_SATA_P_CHBA 0x24
#define FSL_SATA_P_HSTS 0x28
#define FSL_SATA_P_HSTS_HS_ON 0x80000000
#define FSL_SATA_P_HSTS_ME 0x00040000
#define FSL_SATA_P_HSTS_DLM 0x00001000
#define FSL_SATA_P_HSTS_FOT 0x00000200
#define FSL_SATA_P_HSTS_FOR 0x00000100
#define FSL_SATA_P_HSTS_FE 0x00000020
#define FSL_SATA_P_HSTS_PR 0x00000010
#define FSL_SATA_P_HSTS_SNTFU 0x00000004
#define FSL_SATA_P_HSTS_DE 0x00000002
#define FSL_SATA_P_HCTRL 0x2c
#define FSL_SATA_P_HCTRL_HC_ON 0x80000000
#define FSL_SATA_P_HCTRL_HC_FORCE_OFF 0x40000000
#define FSL_SATA_P_HCTRL_ENT 0x10000000
#define FSL_SATA_P_HCTRL_SNOOP 0x00000400
#define FSL_SATA_P_HCTRL_PM 0x00000200
#define FSL_SATA_P_HCTRL_FATAL 0x00000020
#define FSL_SATA_P_HCTRL_PHYRDY 0x00000010
#define FSL_SATA_P_HCTRL_SIG 0x00000008
#define FSL_SATA_P_HCTRL_SNTFY 0x00000004
#define FSL_SATA_P_HCTRL_DE 0x00000002
#define FSL_SATA_P_HCTRL_CC 0x00000001
#define FSL_SATA_P_HCTRL_INT_MASK 0x0000003f
#define FSL_SATA_P_CQPMP 0x30
#define FSL_SATA_P_SIG 0x34
#define FSL_SATA_P_ICC 0x38
#define FSL_SATA_P_ICC_ITC_M 0x1f000000
#define FSL_SATA_P_ICC_ITC_S 24
#define FSL_SATA_P_ICC_ITTCV_M 0x0007ffff
#define FSL_SATA_P_PCC 0x15c
#define FSL_SATA_P_PCC_SLUMBER 0x0000000c
#define FSL_SATA_P_PCC_PARTIAL 0x0000000a
#define FSL_SATA_PCC_LPB_EN 0x0000000e
#define FSL_SATA_MAX_SLOTS 16
/* FSL_SATA register defines */
#define FSL_SATA_P_SSTS 0x100
#define FSL_SATA_P_SERR 0x104
#define FSL_SATA_P_SCTL 0x108
#define FSL_SATA_P_SNTF 0x10c
/* Pessimistic estimate of the number of required S/G entries */
#define FSL_SATA_SG_ENTRIES 63
/* Command list. 16 commands; the first is 1 Kbyte aligned. */
#define FSL_SATA_CL_OFFSET 0
#define FSL_SATA_CL_SIZE 16
/* Command tables. Up to 32 commands, each 4-byte aligned. */
#define FSL_SATA_CT_OFFSET (FSL_SATA_CL_OFFSET + FSL_SATA_CL_SIZE * FSL_SATA_MAX_SLOTS)
#define FSL_SATA_CT_SIZE (96 + FSL_SATA_SG_ENTRIES * 16)
/* Total main work area. */
#define FSL_SATA_WORK_SIZE (FSL_SATA_CT_OFFSET + FSL_SATA_CT_SIZE * FSL_SATA_MAX_SLOTS)
#define FSL_SATA_MAX_XFER (64 * 1024 * 1024)
/* Some convenience macros for getting the CTP and CLP */
#define FSL_SATA_CTP_BUS(ch, slot) \
((ch->dma.work_bus + FSL_SATA_CT_OFFSET + (FSL_SATA_CT_SIZE * slot->slot)))
#define FSL_SATA_PRD_OFFSET(prd) (96 + (prd) * 16)
#define FSL_SATA_CTP(ch, slot) \
((struct fsl_sata_cmd_tab *)(ch->dma.work + FSL_SATA_CT_OFFSET + \
(FSL_SATA_CT_SIZE * slot->slot)))
#define FSL_SATA_CLP(ch, slot) \
((struct fsl_sata_cmd_list *) (ch->dma.work + FSL_SATA_CL_OFFSET + \
(FSL_SATA_CL_SIZE * slot->slot)))
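For orientation, a quick worked expansion of the layout constants above (plain arithmetic on the definitions already shown; nothing new is introduced):
/*
 * FSL_SATA_CT_OFFSET  = 16 * 16         = 256 bytes of command list,
 * FSL_SATA_CT_SIZE    = 96 + 63 * 16    = 1104 bytes per command table,
 * FSL_SATA_WORK_SIZE  = 256 + 16 * 1104 = 17920 bytes of per-channel workspace.
 */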
struct fsl_sata_dma_prd {
uint32_t dba;
uint32_t reserved;
uint32_t reserved2;
uint32_t dwc_flg; /* 0 based */
#define FSL_SATA_PRD_MASK 0x01fffffc /* max 32MB */
#define FSL_SATA_PRD_MAX (FSL_SATA_PRD_MASK + 4)
#define FSL_SATA_PRD_SNOOP 0x10000000
#define FSL_SATA_PRD_EXT 0x80000000
} __packed;
struct fsl_sata_cmd_tab {
uint8_t cfis[32];
uint8_t sfis[32];
uint8_t acmd[16];
uint8_t reserved[16];
struct fsl_sata_dma_prd prd_tab[FSL_SATA_SG_ENTRIES];
#define FSL_SATA_PRD_EXT_INDEX 15
#define FSL_SATA_PRD_MAX_DIRECT 16
} __packed;
struct fsl_sata_cmd_list {
uint32_t cda; /* word aligned */
uint16_t fis_length; /* length in bytes (aligned to words) */
uint16_t prd_length; /* PRD entries */
uint32_t ttl;
uint32_t cmd_flags;
#define FSL_SATA_CMD_TAG_MASK 0x001f
#define FSL_SATA_CMD_ATAPI 0x0020
#define FSL_SATA_CMD_BIST 0x0040
#define FSL_SATA_CMD_RESET 0x0080
#define FSL_SATA_CMD_QUEUED 0x0100
#define FSL_SATA_CMD_SNOOP 0x0200
#define FSL_SATA_CMD_VBIST 0x0400
#define FSL_SATA_CMD_WRITE 0x0800
} __packed;
/* misc defines */
#define ATA_IRQ_RID 0
#define ATA_INTR_FLAGS (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
struct ata_dmaslot {
bus_dmamap_t data_map; /* data DMA map */
int nsegs; /* Number of segs loaded */
};
/* structure holding DMA related information */
struct ata_dma {
bus_dma_tag_t work_tag; /* workspace DMA tag */
bus_dmamap_t work_map; /* workspace DMA map */
uint8_t *work; /* workspace */
bus_addr_t work_bus; /* bus address of work */
bus_dma_tag_t data_tag; /* data DMA tag */
};
enum fsl_sata_slot_states {
FSL_SATA_SLOT_EMPTY,
FSL_SATA_SLOT_LOADING,
FSL_SATA_SLOT_RUNNING,
FSL_SATA_SLOT_EXECUTING
};
struct fsl_sata_slot {
struct fsl_sata_channel *ch; /* Channel */
uint8_t slot; /* Number of this slot */
enum fsl_sata_slot_states state; /* Slot state */
union ccb *ccb; /* CCB occupying slot */
struct ata_dmaslot dma; /* DMA data of this slot */
struct callout timeout; /* Execution timeout */
uint32_t ttl;
};
struct fsl_sata_device {
int revision;
int mode;
u_int bytecount;
u_int atapi;
u_int tags;
u_int caps;
};
/* structure describing an ATA channel */
struct fsl_sata_channel {
device_t dev; /* Device handle */
int r_mid; /* Physical channel RID */
struct resource *r_mem; /* Memory of this channel */
struct resource *r_irq; /* Interrupt of this channel */
void *ih; /* Interrupt handle */
struct ata_dma dma; /* DMA data */
struct cam_sim *sim;
struct cam_path *path;
uint32_t caps; /* Controller capabilities */
int pm_level; /* power management level */
int devices; /* What is present */
int pm_present; /* PM presence reported */
union ccb *hold[FSL_SATA_MAX_SLOTS];
struct fsl_sata_slot slot[FSL_SATA_MAX_SLOTS];
uint32_t oslots; /* Occupied slots */
uint32_t rslots; /* Running slots */
uint32_t aslots; /* Slots with atomic commands */
uint32_t eslots; /* Slots in error */
uint32_t toslots; /* Slots in timeout */
int lastslot; /* Last used slot */
int taggedtarget; /* Last tagged target */
int numrslots; /* Number of running slots */
int numrslotspd[16];/* Number of running slots per dev */
int numtslots; /* Number of tagged slots */
int numtslotspd[16];/* Number of tagged slots per dev */
int numhslots; /* Number of held slots */
int recoverycmd; /* Our READ LOG active */
int fatalerr; /* Fatal error happened */
int resetting; /* Hard-reset in progress. */
int resetpolldiv; /* Hard-reset poll divider. */
union ccb *frozen; /* Frozen command */
struct callout pm_timer; /* Power management events */
struct callout reset_timer; /* Hard-reset timeout */
struct fsl_sata_device user[16]; /* User-specified settings */
struct fsl_sata_device curr[16]; /* Current settings */
struct mtx_padalign mtx; /* state lock */
STAILQ_HEAD(, ccb_hdr) doneq; /* queue of completed CCBs */
int batch; /* doneq is in use */
};
enum fsl_sata_err_type {
FSL_SATA_ERR_NONE, /* No error */
FSL_SATA_ERR_INVALID, /* Error detected by us before submitting. */
FSL_SATA_ERR_INNOCENT, /* Innocent victim. */
FSL_SATA_ERR_TFE, /* Task File Error. */
FSL_SATA_ERR_SATA, /* SATA error. */
FSL_SATA_ERR_TIMEOUT, /* Command execution timeout. */
FSL_SATA_ERR_NCQ, /* NCQ command error. CCB should be put on hold
* until READ LOG executed to reveal error. */
};
/* macros to hide bus space ugliness */
#define ATA_INL(res, offset) \
bus_read_4((res), (offset))
#define ATA_OUTL(res, offset, value) \
bus_write_4((res), (offset), (value))
static int
fsl_sata_probe(device_t dev)
{
if (!ofw_bus_is_compatible(dev, "fsl,pq-sata-v2") &&
!ofw_bus_is_compatible(dev, "fsl,pq-sata"))
return (ENXIO);
device_set_desc_copy(dev, "Freescale Integrated SATA Controller");
return (BUS_PROBE_DEFAULT);
}
static int
fsl_sata_attach(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
struct cam_devq *devq;
int rid, error, i, sata_rev = 0;
ch->dev = dev;
mtx_init(&ch->mtx, "FSL SATA channel lock", NULL, MTX_DEF);
ch->pm_level = 0;
resource_int_value(device_get_name(dev),
device_get_unit(dev), "pm_level", &ch->pm_level);
STAILQ_INIT(&ch->doneq);
if (ch->pm_level > 3)
callout_init_mtx(&ch->pm_timer, &ch->mtx, 0);
resource_int_value(device_get_name(dev),
device_get_unit(dev), "sata_rev", &sata_rev);
for (i = 0; i < 16; i++) {
ch->user[i].revision = sata_rev;
ch->user[i].mode = 0;
ch->user[i].bytecount = 8192;
ch->user[i].tags = FSL_SATA_MAX_SLOTS;
ch->user[i].caps = 0;
ch->curr[i] = ch->user[i];
if (ch->pm_level) {
ch->user[i].caps = CTS_SATA_CAPS_H_PMREQ |
CTS_SATA_CAPS_D_PMREQ;
}
ch->user[i].caps |= CTS_SATA_CAPS_H_AN;
}
ch->r_mid = 0;
if (!(ch->r_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&ch->r_mid, RF_ACTIVE)))
return (ENXIO);
rman_set_bustag(ch->r_mem, &bs_le_tag);
fsl_sata_dmainit(dev);
fsl_sata_slotsalloc(dev);
fsl_sata_init(dev);
rid = ATA_IRQ_RID;
if (!(ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&rid, RF_SHAREABLE | RF_ACTIVE))) {
device_printf(dev, "Unable to map interrupt\n");
error = ENXIO;
goto err0;
}
if ((bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
fsl_sata_intr, ch, &ch->ih))) {
device_printf(dev, "Unable to setup interrupt\n");
error = ENXIO;
goto err1;
}
mtx_lock(&ch->mtx);
/* Create the device queue for our SIM. */
devq = cam_simq_alloc(FSL_SATA_MAX_SLOTS);
if (devq == NULL) {
device_printf(dev, "Unable to allocate simq\n");
error = ENOMEM;
goto err1;
}
/* Construct SIM entry */
ch->sim = cam_sim_alloc(fsl_sataaction, fsl_satapoll, "fslsata", ch,
device_get_unit(dev), (struct mtx *)&ch->mtx, 2, FSL_SATA_MAX_SLOTS,
devq);
if (ch->sim == NULL) {
cam_simq_free(devq);
device_printf(dev, "unable to allocate sim\n");
error = ENOMEM;
goto err1;
}
if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
device_printf(dev, "unable to register xpt bus\n");
error = ENXIO;
goto err2;
}
if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(dev, "unable to create path\n");
error = ENXIO;
goto err3;
}
if (ch->pm_level > 3) {
callout_reset(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8,
fsl_sata_pm, ch);
}
mtx_unlock(&ch->mtx);
return (0);
err3:
xpt_bus_deregister(cam_sim_path(ch->sim));
err2:
cam_sim_free(ch->sim, /*free_devq*/TRUE);
err1:
mtx_unlock(&ch->mtx);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
err0:
bus_release_resource(dev, SYS_RES_MEMORY, ch->r_mid, ch->r_mem);
mtx_destroy(&ch->mtx);
return (error);
}
static int
fsl_sata_detach(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_async(AC_LOST_DEVICE, ch->path, NULL);
xpt_free_path(ch->path);
xpt_bus_deregister(cam_sim_path(ch->sim));
cam_sim_free(ch->sim, /*free_devq*/TRUE);
mtx_unlock(&ch->mtx);
if (ch->pm_level > 3)
callout_drain(&ch->pm_timer);
bus_teardown_intr(dev, ch->r_irq, ch->ih);
bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
fsl_sata_deinit(dev);
fsl_sata_slotsfree(dev);
fsl_sata_dmafini(dev);
bus_release_resource(dev, SYS_RES_MEMORY, ch->r_mid, ch->r_mem);
mtx_destroy(&ch->mtx);
return (0);
}
static int
fsl_sata_wait_register(struct fsl_sata_channel *ch, bus_size_t off,
unsigned int mask, unsigned int val, int t)
{
int timeout = 0;
uint32_t rval;
while (((rval = ATA_INL(ch->r_mem, off)) & mask) != val) {
if (timeout > t) {
return (EBUSY);
}
DELAY(1000);
timeout++;
}
return (0);
}
static int
fsl_sata_init(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
uint64_t work;
uint32_t r;
/* Disable port interrupts */
r = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL);
r &= ~FSL_SATA_P_HCTRL_HC_ON;
r |= FSL_SATA_P_HCTRL_HC_FORCE_OFF;
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, r & ~FSL_SATA_P_HCTRL_INT_MASK);
fsl_sata_wait_register(ch, FSL_SATA_P_HSTS,
FSL_SATA_P_HSTS_HS_ON, 0, 1000);
/* Setup work areas */
work = ch->dma.work_bus + FSL_SATA_CL_OFFSET;
ATA_OUTL(ch->r_mem, FSL_SATA_P_CHBA, work);
r &= ~FSL_SATA_P_HCTRL_ENT;
r &= ~FSL_SATA_P_HCTRL_PM;
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, r);
r = ATA_INL(ch->r_mem, FSL_SATA_P_PCC);
ATA_OUTL(ch->r_mem, FSL_SATA_P_PCC, r & ~FSL_SATA_PCC_LPB_EN);
ATA_OUTL(ch->r_mem, FSL_SATA_P_ICC, (1 << FSL_SATA_P_ICC_ITC_S));
fsl_sata_start(ch);
return (0);
}
static int
fsl_sata_deinit(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
uint32_t r;
/* Disable port interrupts. */
r = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL);
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, r & ~FSL_SATA_P_HCTRL_INT_MASK);
/* Reset command register. */
fsl_sata_stop(ch);
/* Allow everything, including partial and slumber modes. */
ATA_OUTL(ch->r_mem, FSL_SATA_P_SCTL, 0);
DELAY(100);
/* Disable PHY. */
ATA_OUTL(ch->r_mem, FSL_SATA_P_SCTL, ATA_SC_DET_DISABLE);
r = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL);
/* Turn off the controller. */
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, r & ~FSL_SATA_P_HCTRL_HC_ON);
return (0);
}
static int
fsl_sata_suspend(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
xpt_freeze_simq(ch->sim, 1);
while (ch->oslots)
msleep(ch, &ch->mtx, PRIBIO, "fsl_satasusp", hz/100);
fsl_sata_deinit(dev);
mtx_unlock(&ch->mtx);
return (0);
}
static int
fsl_sata_resume(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
mtx_lock(&ch->mtx);
fsl_sata_init(dev);
fsl_sata_reset(ch);
xpt_release_simq(ch->sim, TRUE);
mtx_unlock(&ch->mtx);
return (0);
}
devclass_t fsl_satach_devclass;
static device_method_t fsl_satach_methods[] = {
DEVMETHOD(device_probe, fsl_sata_probe),
DEVMETHOD(device_attach, fsl_sata_attach),
DEVMETHOD(device_detach, fsl_sata_detach),
DEVMETHOD(device_suspend, fsl_sata_suspend),
DEVMETHOD(device_resume, fsl_sata_resume),
DEVMETHOD_END
};
static driver_t fsl_satach_driver = {
"fslsata",
fsl_satach_methods,
sizeof(struct fsl_sata_channel)
};
DRIVER_MODULE(fsl_satach, simplebus, fsl_satach_driver, fsl_satach_devclass, NULL, NULL);
struct fsl_sata_dc_cb_args {
bus_addr_t maddr;
int error;
};
static void
fsl_sata_dmainit(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
struct fsl_sata_dc_cb_args dcba;
/* Command area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 1024, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
NULL, NULL, FSL_SATA_WORK_SIZE, 1, FSL_SATA_WORK_SIZE,
0, NULL, NULL, &ch->dma.work_tag))
goto error;
if (bus_dmamem_alloc(ch->dma.work_tag, (void **)&ch->dma.work,
BUS_DMA_ZERO, &ch->dma.work_map))
goto error;
if (bus_dmamap_load(ch->dma.work_tag, ch->dma.work_map, ch->dma.work,
FSL_SATA_WORK_SIZE, fsl_sata_dmasetupc_cb, &dcba, 0) || dcba.error) {
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
goto error;
}
ch->dma.work_bus = dcba.maddr;
/* Data area. */
if (bus_dma_tag_create(bus_get_dma_tag(dev), 4, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
NULL, NULL, FSL_SATA_MAX_XFER,
FSL_SATA_SG_ENTRIES - 1, FSL_SATA_PRD_MAX,
0, busdma_lock_mutex, &ch->mtx, &ch->dma.data_tag)) {
goto error;
}
if (bootverbose)
device_printf(dev, "work area: %p\n", ch->dma.work);
return;
error:
device_printf(dev, "WARNING - DMA initialization failed\n");
fsl_sata_dmafini(dev);
}
static void
fsl_sata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
{
struct fsl_sata_dc_cb_args *dcba = (struct fsl_sata_dc_cb_args *)xsc;
if (!(dcba->error = error))
dcba->maddr = segs[0].ds_addr;
}
static void
fsl_sata_dmafini(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
if (ch->dma.data_tag) {
bus_dma_tag_destroy(ch->dma.data_tag);
ch->dma.data_tag = NULL;
}
if (ch->dma.work_bus) {
bus_dmamap_unload(ch->dma.work_tag, ch->dma.work_map);
bus_dmamem_free(ch->dma.work_tag, ch->dma.work, ch->dma.work_map);
ch->dma.work_bus = 0;
ch->dma.work = NULL;
}
if (ch->dma.work_tag) {
bus_dma_tag_destroy(ch->dma.work_tag);
ch->dma.work_tag = NULL;
}
}
static void
fsl_sata_slotsalloc(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
int i;
/* Alloc and setup command/dma slots */
bzero(ch->slot, sizeof(ch->slot));
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
struct fsl_sata_slot *slot = &ch->slot[i];
slot->ch = ch;
slot->slot = i;
slot->state = FSL_SATA_SLOT_EMPTY;
slot->ccb = NULL;
callout_init_mtx(&slot->timeout, &ch->mtx, 0);
if (bus_dmamap_create(ch->dma.data_tag, 0, &slot->dma.data_map))
device_printf(ch->dev, "FAILURE - create data_map\n");
}
}
static void
fsl_sata_slotsfree(device_t dev)
{
struct fsl_sata_channel *ch = device_get_softc(dev);
int i;
/* Free all dma slots */
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
struct fsl_sata_slot *slot = &ch->slot[i];
callout_drain(&slot->timeout);
if (slot->dma.data_map) {
bus_dmamap_destroy(ch->dma.data_tag, slot->dma.data_map);
slot->dma.data_map = NULL;
}
}
}
static int
fsl_sata_phy_check_events(struct fsl_sata_channel *ch, u_int32_t serr)
{
if (((ch->pm_level == 0) && (serr & ATA_SE_PHY_CHANGED)) ||
((ch->pm_level != 0) && (serr & ATA_SE_EXCHANGED))) {
u_int32_t status = ATA_INL(ch->r_mem, FSL_SATA_P_SSTS);
union ccb *ccb;
if (bootverbose) {
if ((status & ATA_SS_DET_MASK) != ATA_SS_DET_NO_DEVICE)
device_printf(ch->dev, "CONNECT requested\n");
else
device_printf(ch->dev, "DISCONNECT requested\n");
}
/* Issue soft reset */
xpt_async(AC_BUS_RESET, ch->path, NULL);
if ((ccb = xpt_alloc_ccb_nowait()) == NULL)
return (0);
if (xpt_create_path(&ccb->ccb_h.path, NULL,
cam_sim_path(ch->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
xpt_free_ccb(ccb);
return (0);
}
xpt_rescan(ccb);
return (1);
}
return (0);
}
static void
fsl_sata_notify_events(struct fsl_sata_channel *ch, u_int32_t status)
{
struct cam_path *dpath;
int i;
ATA_OUTL(ch->r_mem, FSL_SATA_P_SNTF, status);
if (bootverbose)
device_printf(ch->dev, "SNTF 0x%04x\n", status);
for (i = 0; i < 16; i++) {
if ((status & (1 << i)) == 0)
continue;
if (xpt_create_path(&dpath, NULL,
xpt_path_path_id(ch->path), i, 0) == CAM_REQ_CMP) {
xpt_async(AC_SCSI_AEN, dpath, NULL);
xpt_free_path(dpath);
}
}
}
static void
fsl_sata_done(struct fsl_sata_channel *ch, union ccb *ccb)
{
mtx_assert(&ch->mtx, MA_OWNED);
if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
ch->batch == 0) {
xpt_done(ccb);
return;
}
STAILQ_INSERT_TAIL(&ch->doneq, &ccb->ccb_h, sim_links.stqe);
}
static void
fsl_sata_intr(void *arg)
{
struct fsl_sata_channel *ch = (struct fsl_sata_channel *)arg;
struct ccb_hdr *ccb_h;
uint32_t istatus;
STAILQ_HEAD(, ccb_hdr) tmp_doneq = STAILQ_HEAD_INITIALIZER(tmp_doneq);
/* Read interrupt statuses. */
istatus = ATA_INL(ch->r_mem, FSL_SATA_P_HSTS) & 0x7ffff;
if ((istatus & 0x3f) == 0)
return;
mtx_lock(&ch->mtx);
ch->batch = 1;
fsl_sata_intr_main(ch, istatus);
ch->batch = 0;
/*
* Move the completed CCBs to a local queue so that they can be
* processed below without holding the channel lock.
*/
STAILQ_CONCAT(&tmp_doneq, &ch->doneq);
mtx_unlock(&ch->mtx);
while ((ccb_h = STAILQ_FIRST(&tmp_doneq)) != NULL) {
STAILQ_REMOVE_HEAD(&tmp_doneq, sim_links.stqe);
xpt_done_direct((union ccb *)ccb_h);
}
/* Clear interrupt statuses. */
ATA_OUTL(ch->r_mem, FSL_SATA_P_HSTS, istatus & 0x3f);
}
static void
fsl_sata_pm(void *arg)
{
struct fsl_sata_channel *ch = (struct fsl_sata_channel *)arg;
uint32_t work;
if (ch->numrslots != 0)
return;
work = ATA_INL(ch->r_mem, FSL_SATA_P_PCC) & ~FSL_SATA_PCC_LPB_EN;
if (ch->pm_level == 4)
work |= FSL_SATA_P_PCC_PARTIAL;
else
work |= FSL_SATA_P_PCC_SLUMBER;
ATA_OUTL(ch->r_mem, FSL_SATA_P_PCC, work);
}
/* XXX: interrupt todo */
static void
fsl_sata_intr_main(struct fsl_sata_channel *ch, uint32_t istatus)
{
uint32_t cer, der, serr = 0, sntf = 0, ok, err;
enum fsl_sata_err_type et;
int i;
/* Complete all successful commands. */
ok = ATA_INL(ch->r_mem, FSL_SATA_P_CCR);
/* Mark all commands complete, to complete the interrupt. */
ATA_OUTL(ch->r_mem, FSL_SATA_P_CCR, ok);
if (ch->aslots == 0 && ok != 0) {
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (((ok >> i) & 1) && ch->slot[i].ccb != NULL)
fsl_sata_end_transaction(&ch->slot[i],
FSL_SATA_ERR_NONE);
}
}
/* Read command statuses. */
if (istatus & FSL_SATA_P_HSTS_SNTFU)
sntf = ATA_INL(ch->r_mem, FSL_SATA_P_SNTF);
/* XXX: Process PHY events */
serr = ATA_INL(ch->r_mem, FSL_SATA_P_SERR);
ATA_OUTL(ch->r_mem, FSL_SATA_P_SERR, serr);
if (istatus & (FSL_SATA_P_HSTS_PR)) {
if (serr) {
fsl_sata_phy_check_events(ch, serr);
}
}
/* Process command errors */
err = (istatus & (FSL_SATA_P_HSTS_FE | FSL_SATA_P_HSTS_DE));
cer = ATA_INL(ch->r_mem, FSL_SATA_P_CER);
ATA_OUTL(ch->r_mem, FSL_SATA_P_CER, cer);
der = ATA_INL(ch->r_mem, FSL_SATA_P_DER);
ATA_OUTL(ch->r_mem, FSL_SATA_P_DER, der);
/* On error, complete the rest of the commands with error statuses. */
if (err) {
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
fsl_sata_done(ch, fccb);
}
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (ch->slot[i].ccb == NULL)
continue;
if ((cer & (1 << i)) != 0)
et = FSL_SATA_ERR_TFE;
else if ((der & (1 << ch->slot[i].ccb->ccb_h.target_id)) != 0)
et = FSL_SATA_ERR_SATA;
else
et = FSL_SATA_ERR_INVALID;
fsl_sata_end_transaction(&ch->slot[i], et);
}
}
/* Process NOTIFY events */
if (sntf)
fsl_sata_notify_events(ch, sntf);
}
/* Must be called with channel locked. */
static int
fsl_sata_check_collision(struct fsl_sata_channel *ch, union ccb *ccb)
{
int t = ccb->ccb_h.target_id;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
/* Tagged command while we have no supported tag free. */
if (((~ch->oslots) & (0xffff >> (16 - ch->curr[t].tags))) == 0)
return (1);
/* Tagged command while untagged are active. */
if (ch->numrslotspd[t] != 0 && ch->numtslotspd[t] == 0)
return (1);
} else {
/* Untagged command while tagged are active. */
if (ch->numrslotspd[t] != 0 && ch->numtslotspd[t] != 0)
return (1);
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT))) {
/* Atomic command while anything active. */
if (ch->numrslots != 0)
return (1);
}
/* We have some atomic command running. */
if (ch->aslots != 0)
return (1);
return (0);
}
/* Must be called with channel locked. */
static void
fsl_sata_begin_transaction(struct fsl_sata_channel *ch, union ccb *ccb)
{
struct fsl_sata_slot *slot;
int tag, tags;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("fsl_sata_begin_transaction func_code=0x%x\n", ccb->ccb_h.func_code));
/* Choose empty slot. */
tags = FSL_SATA_MAX_SLOTS;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA))
tags = ch->curr[ccb->ccb_h.target_id].tags;
if (ch->lastslot + 1 < tags)
tag = ffs(~(ch->oslots >> (ch->lastslot + 1)));
else
tag = 0;
if (tag == 0 || tag + ch->lastslot >= tags)
tag = ffs(~ch->oslots) - 1;
else
tag += ch->lastslot;
ch->lastslot = tag;
/* Occupy chosen slot. */
slot = &ch->slot[tag];
slot->ccb = ccb;
slot->ttl = 0;
/* Stop PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3)
callout_stop(&ch->pm_timer);
/* Update channel stats. */
ch->oslots |= (1 << tag);
ch->numrslots++;
ch->numrslotspd[ccb->ccb_h.target_id]++;
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots++;
ch->numtslotspd[ccb->ccb_h.target_id]++;
ch->taggedtarget = ccb->ccb_h.target_id;
}
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & (CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT)))
ch->aslots |= (1 << tag);
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
slot->state = FSL_SATA_SLOT_LOADING;
bus_dmamap_load_ccb(ch->dma.data_tag, slot->dma.data_map, ccb,
fsl_sata_dmasetprd, slot, 0);
} else {
slot->dma.nsegs = 0;
fsl_sata_execute_transaction(slot);
}
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("fsl_sata_begin_transaction exit\n"));
}
/* Locked by busdma engine. */
static void
fsl_sata_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct fsl_sata_slot *slot = arg;
struct fsl_sata_channel *ch = slot->ch;
struct fsl_sata_cmd_tab *ctp;
struct fsl_sata_dma_prd *prd;
int i, j, len, extlen;
if (error) {
device_printf(ch->dev, "DMA load error %d\n", error);
fsl_sata_end_transaction(slot, FSL_SATA_ERR_INVALID);
return;
}
KASSERT(nsegs <= FSL_SATA_SG_ENTRIES - 1,
("too many DMA segment entries\n"));
/* Get a piece of the workspace for this request */
ctp = FSL_SATA_CTP(ch, slot);
/* Fill S/G table */
prd = &ctp->prd_tab[0];
for (i = 0, j = 0; i < nsegs; i++, j++) {
if (j == FSL_SATA_PRD_EXT_INDEX &&
FSL_SATA_PRD_MAX_DIRECT < nsegs) {
prd[j].dba = htole32(FSL_SATA_CTP_BUS(ch, slot) +
FSL_SATA_PRD_OFFSET(j+1));
j++;
extlen = 0;
}
len = segs[i].ds_len;
len = roundup2(len, sizeof(uint32_t));
prd[j].dba = htole32((uint32_t)segs[i].ds_addr);
prd[j].dwc_flg = htole32(FSL_SATA_PRD_SNOOP | len);
slot->ttl += len;
if (j > FSL_SATA_PRD_MAX_DIRECT)
extlen += len;
}
slot->dma.nsegs = j;
if (j > FSL_SATA_PRD_MAX_DIRECT)
prd[FSL_SATA_PRD_EXT_INDEX].dwc_flg =
htole32(FSL_SATA_PRD_SNOOP | FSL_SATA_PRD_EXT | extlen);
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
((slot->ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE));
fsl_sata_execute_transaction(slot);
}
/* Must be called with channel locked. */
static void
fsl_sata_execute_transaction(struct fsl_sata_slot *slot)
{
struct fsl_sata_channel *ch = slot->ch;
struct fsl_sata_cmd_tab *ctp;
struct fsl_sata_cmd_list *clp;
union ccb *ccb = slot->ccb;
int port = ccb->ccb_h.target_id & 0x0f;
int fis_size, i, softreset;
uint32_t tmp;
uint32_t cmd_flags = FSL_SATA_CMD_WRITE | FSL_SATA_CMD_SNOOP;
softreset = 0;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("fsl_sata_execute_transaction func_code=0x%x\n", ccb->ccb_h.func_code));
/* Get a piece of the workspace for this request */
ctp = FSL_SATA_CTP(ch, slot);
/* Setup the FIS for this request */
if (!(fis_size = fsl_sata_setup_fis(ch, ctp, ccb, slot->slot))) {
device_printf(ch->dev, "Setting up SATA FIS failed\n");
fsl_sata_end_transaction(slot, FSL_SATA_ERR_INVALID);
return;
}
/* Setup the command list entry */
clp = FSL_SATA_CLP(ch, slot);
clp->fis_length = htole16(fis_size);
clp->prd_length = htole16(slot->dma.nsegs);
/* Special handling for Soft Reset command. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL)) {
if (ccb->ataio.cmd.control & ATA_A_RESET) {
softreset = 1;
cmd_flags |= FSL_SATA_CMD_RESET;
} else {
/* Prepare FIS receive area for check. */
for (i = 0; i < 32; i++)
ctp->sfis[i] = 0xff;
softreset = 2;
}
}
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)
cmd_flags |= FSL_SATA_CMD_QUEUED;
clp->cmd_flags = htole32(cmd_flags |
(ccb->ccb_h.func_code == XPT_SCSI_IO ? FSL_SATA_CMD_ATAPI : 0) |
slot->slot);
clp->ttl = htole32(slot->ttl);
clp->cda = htole32(FSL_SATA_CTP_BUS(ch, slot));
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
/* Issue command to the controller. */
slot->state = FSL_SATA_SLOT_RUNNING;
ch->rslots |= (1 << slot->slot);
ATA_OUTL(ch->r_mem, FSL_SATA_P_CQPMP, port);
ATA_OUTL(ch->r_mem, FSL_SATA_P_CQR, (1 << slot->slot));
/* Device reset commands don't interrupt. Poll them. */
if (ccb->ccb_h.func_code == XPT_ATA_IO &&
(ccb->ataio.cmd.command == ATA_DEVICE_RESET || softreset)) {
int count, timeout = ccb->ccb_h.timeout * 100;
enum fsl_sata_err_type et = FSL_SATA_ERR_NONE;
for (count = 0; count < timeout; count++) {
DELAY(10);
tmp = 0;
if (softreset == 2) {
tmp = ATA_INL(ch->r_mem, FSL_SATA_P_SIG);
if (tmp != 0 && tmp != 0xffffffff)
break;
continue;
}
if ((ATA_INL(ch->r_mem, FSL_SATA_P_CCR) & (1 << slot->slot)) != 0)
break;
}
if (timeout && (count >= timeout)) {
device_printf(ch->dev, "Poll timeout on slot %d port %d (round %d)\n",
slot->slot, port, softreset);
device_printf(ch->dev, "hsts %08x cqr %08x ccr %08x ss %08x "
"rs %08x cer %08x der %08x serr %08x car %08x sig %08x\n",
ATA_INL(ch->r_mem, FSL_SATA_P_HSTS),
ATA_INL(ch->r_mem, FSL_SATA_P_CQR),
ATA_INL(ch->r_mem, FSL_SATA_P_CCR),
ATA_INL(ch->r_mem, FSL_SATA_P_SSTS), ch->rslots,
ATA_INL(ch->r_mem, FSL_SATA_P_CER),
ATA_INL(ch->r_mem, FSL_SATA_P_DER),
ATA_INL(ch->r_mem, FSL_SATA_P_SERR),
ATA_INL(ch->r_mem, FSL_SATA_P_CAR),
ATA_INL(ch->r_mem, FSL_SATA_P_SIG));
et = FSL_SATA_ERR_TIMEOUT;
}
fsl_sata_end_transaction(slot, et);
return;
}
/* Start command execution timeout */
callout_reset_sbt(&slot->timeout, SBT_1MS * ccb->ccb_h.timeout / 2,
0, fsl_sata_timeout, slot, 0);
return;
}
/* Must be called with channel locked. */
static void
fsl_sata_process_timeout(struct fsl_sata_channel *ch)
{
int i;
mtx_assert(&ch->mtx, MA_OWNED);
/* Handle the rest of the commands. */
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < FSL_SATA_SLOT_RUNNING)
continue;
fsl_sata_end_transaction(&ch->slot[i], FSL_SATA_ERR_TIMEOUT);
}
}
/* Must be called with channel locked. */
static void
fsl_sata_rearm_timeout(struct fsl_sata_channel *ch)
{
int i;
mtx_assert(&ch->mtx, MA_OWNED);
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
struct fsl_sata_slot *slot = &ch->slot[i];
/* Do we have a running request on slot? */
if (slot->state < FSL_SATA_SLOT_RUNNING)
continue;
if ((ch->toslots & (1 << i)) == 0)
continue;
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout / 2, 0,
fsl_sata_timeout, slot, 0);
}
}
/* Locked by callout mechanism. */
static void
fsl_sata_timeout(void *arg)
{
struct fsl_sata_slot *slot = arg;
struct fsl_sata_channel *ch = slot->ch;
device_t dev = ch->dev;
uint32_t sstatus;
/* Check for stale timeout. */
if (slot->state < FSL_SATA_SLOT_RUNNING)
return;
/* Check if slot was not being executed last time we checked. */
if (slot->state < FSL_SATA_SLOT_EXECUTING) {
/* Check if slot started executing. */
sstatus = ATA_INL(ch->r_mem, FSL_SATA_P_CAR);
if ((sstatus & (1 << slot->slot)) != 0)
slot->state = FSL_SATA_SLOT_EXECUTING;
callout_reset_sbt(&slot->timeout,
SBT_1MS * slot->ccb->ccb_h.timeout / 2, 0,
fsl_sata_timeout, slot, 0);
return;
}
device_printf(dev, "Timeout on slot %d port %d\n",
slot->slot, slot->ccb->ccb_h.target_id & 0x0f);
/* Handle frozen command. */
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
fsl_sata_done(ch, fccb);
}
if (ch->toslots == 0)
xpt_freeze_simq(ch->sim, 1);
ch->toslots |= (1 << slot->slot);
if ((ch->rslots & ~ch->toslots) == 0)
fsl_sata_process_timeout(ch);
else
device_printf(dev, " ... waiting for slots %08x\n",
ch->rslots & ~ch->toslots);
}
/* Must be called with channel locked. */
static void
fsl_sata_end_transaction(struct fsl_sata_slot *slot, enum fsl_sata_err_type et)
{
struct fsl_sata_channel *ch = slot->ch;
union ccb *ccb = slot->ccb;
struct fsl_sata_cmd_list *clp;
int lastto;
uint32_t sig;
bus_dmamap_sync(ch->dma.work_tag, ch->dma.work_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
clp = FSL_SATA_CLP(ch, slot);
/* Read result registers to the result struct */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
struct ata_res *res = &ccb->ataio.res;
if ((et == FSL_SATA_ERR_TFE) ||
(ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)) {
struct fsl_sata_cmd_tab *ctp = FSL_SATA_CTP(ch, slot);
uint8_t *fis = ctp->sfis;
res->status = fis[2];
res->error = fis[3];
res->lba_low = fis[4];
res->lba_mid = fis[5];
res->lba_high = fis[6];
res->device = fis[7];
res->lba_low_exp = fis[8];
res->lba_mid_exp = fis[9];
res->lba_high_exp = fis[10];
res->sector_count = fis[12];
res->sector_count_exp = fis[13];
if ((ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET) == 0) {
sig = ATA_INL(ch->r_mem, FSL_SATA_P_SIG);
res->lba_high = sig >> 24;
res->lba_mid = sig >> 16;
res->lba_low = sig >> 8;
res->sector_count = sig;
}
} else
bzero(res, sizeof(*res));
if ((ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) == 0 &&
(ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
ccb->ataio.resid =
ccb->ataio.dxfer_len - le32toh(clp->ttl);
}
} else {
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
ccb->csio.resid =
ccb->csio.dxfer_len - le32toh(clp->ttl);
}
}
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
(ccb->ccb_h.flags & CAM_DIR_IN) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(ch->dma.data_tag, slot->dma.data_map);
}
if (et != FSL_SATA_ERR_NONE)
ch->eslots |= (1 << slot->slot);
/* In case of error, freeze device for proper recovery. */
if ((et != FSL_SATA_ERR_NONE) && (!ch->recoverycmd) &&
!(ccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status |= CAM_DEV_QFRZN;
}
/* Set proper result status. */
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
switch (et) {
case FSL_SATA_ERR_NONE:
ccb->ccb_h.status |= CAM_REQ_CMP;
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.scsi_status = SCSI_STATUS_OK;
break;
case FSL_SATA_ERR_INVALID:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_INVALID;
break;
case FSL_SATA_ERR_INNOCENT:
ccb->ccb_h.status |= CAM_REQUEUE_REQ;
break;
case FSL_SATA_ERR_TFE:
case FSL_SATA_ERR_NCQ:
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else {
ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
}
break;
case FSL_SATA_ERR_SATA:
ch->fatalerr = 1;
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_UNCOR_PARITY;
break;
case FSL_SATA_ERR_TIMEOUT:
if (!ch->recoverycmd) {
xpt_freeze_simq(ch->sim, 1);
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
}
ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
break;
default:
ch->fatalerr = 1;
ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
}
/* Free slot. */
ch->oslots &= ~(1 << slot->slot);
ch->rslots &= ~(1 << slot->slot);
ch->aslots &= ~(1 << slot->slot);
slot->state = FSL_SATA_SLOT_EMPTY;
slot->ccb = NULL;
/* Update channel stats. */
ch->numrslots--;
ch->numrslotspd[ccb->ccb_h.target_id]--;
ATA_OUTL(ch->r_mem, FSL_SATA_P_CCR, 1 << slot->slot);
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA)) {
ch->numtslots--;
ch->numtslotspd[ccb->ccb_h.target_id]--;
}
/* Cancel timeout state if request completed normally. */
if (et != FSL_SATA_ERR_TIMEOUT) {
lastto = (ch->toslots == (1 << slot->slot));
ch->toslots &= ~(1 << slot->slot);
if (lastto)
xpt_release_simq(ch->sim, TRUE);
}
/* If it was the first request of a reset sequence and there was
 * no error, proceed to the second request. */
if ((ccb->ccb_h.func_code == XPT_ATA_IO) &&
(ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
(ccb->ataio.cmd.control & ATA_A_RESET) &&
et == FSL_SATA_ERR_NONE) {
ccb->ataio.cmd.control &= ~ATA_A_RESET;
fsl_sata_begin_transaction(ch, ccb);
return;
}
/* If it was our READ LOG command - process it. */
if (ccb->ccb_h.recovery_type == RECOVERY_READ_LOG) {
fsl_sata_process_read_log(ch, ccb);
/* If it was our REQUEST SENSE command - process it. */
} else if (ccb->ccb_h.recovery_type == RECOVERY_REQUEST_SENSE) {
fsl_sata_process_request_sense(ch, ccb);
/* If it was NCQ or ATAPI command error, put result on hold. */
} else if (et == FSL_SATA_ERR_NCQ ||
((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)) {
ch->hold[slot->slot] = ccb;
ch->numhslots++;
} else
fsl_sata_done(ch, ccb);
/* If we have no other active commands, ... */
if (ch->rslots == 0) {
/* if there was a fatal error, reset the port. */
if (ch->toslots != 0 || ch->fatalerr) {
fsl_sata_reset(ch);
} else {
/* if we have slots in error, we can reinit port. */
if (ch->eslots != 0) {
fsl_sata_stop(ch);
fsl_sata_start(ch);
}
/* if there are commands on hold, we can do READ LOG. */
if (!ch->recoverycmd && ch->numhslots)
fsl_sata_issue_recovery(ch);
}
/* If all the remaining commands are in timeout, give them a chance. */
} else if ((ch->rslots & ~ch->toslots) == 0 &&
et != FSL_SATA_ERR_TIMEOUT)
fsl_sata_rearm_timeout(ch);
/* Unfreeze frozen command. */
if (ch->frozen && !fsl_sata_check_collision(ch, ch->frozen)) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fsl_sata_begin_transaction(ch, fccb);
xpt_release_simq(ch->sim, TRUE);
}
/* Start PM timer. */
if (ch->numrslots == 0 && ch->pm_level > 3 &&
(ch->curr[ch->pm_present ? 15 : 0].caps & CTS_SATA_CAPS_D_PMREQ)) {
callout_schedule(&ch->pm_timer,
(ch->pm_level == 4) ? hz / 1000 : hz / 8);
}
}
static void
fsl_sata_issue_recovery(struct fsl_sata_channel *ch)
{
union ccb *ccb;
struct ccb_ataio *ataio;
struct ccb_scsiio *csio;
int i;
/* Find some held command. */
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (ch->hold[i])
break;
}
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
device_printf(ch->dev, "Unable to allocate recovery command\n");
completeall:
/* We can't do anything -- complete held commands. */
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (ch->hold[i] == NULL)
continue;
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_RESRC_UNAVAIL;
fsl_sata_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
fsl_sata_reset(ch);
return;
}
ccb->ccb_h = ch->hold[i]->ccb_h; /* Reuse old header. */
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
/* READ LOG */
ccb->ccb_h.recovery_type = RECOVERY_READ_LOG;
ccb->ccb_h.func_code = XPT_ATA_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
ataio = &ccb->ataio;
ataio->data_ptr = malloc(512, M_FSL_SATA, M_NOWAIT);
if (ataio->data_ptr == NULL) {
xpt_free_ccb(ccb);
device_printf(ch->dev,
"Unable to allocate memory for READ LOG command\n");
goto completeall;
}
ataio->dxfer_len = 512;
bzero(&ataio->cmd, sizeof(ataio->cmd));
ataio->cmd.flags = CAM_ATAIO_48BIT;
ataio->cmd.command = 0x2F; /* READ LOG EXT */
ataio->cmd.sector_count = 1;
ataio->cmd.sector_count_exp = 0;
ataio->cmd.lba_low = 0x10;
ataio->cmd.lba_mid = 0;
ataio->cmd.lba_mid_exp = 0;
} else {
/* REQUEST SENSE */
ccb->ccb_h.recovery_type = RECOVERY_REQUEST_SENSE;
ccb->ccb_h.recovery_slot = i;
ccb->ccb_h.func_code = XPT_SCSI_IO;
ccb->ccb_h.flags = CAM_DIR_IN;
ccb->ccb_h.status = 0;
ccb->ccb_h.timeout = 1000; /* 1s should be enough. */
csio = &ccb->csio;
csio->data_ptr = (void *)&ch->hold[i]->csio.sense_data;
csio->dxfer_len = ch->hold[i]->csio.sense_len;
csio->cdb_len = 6;
bzero(&csio->cdb_io, sizeof(csio->cdb_io));
csio->cdb_io.cdb_bytes[0] = 0x03;
csio->cdb_io.cdb_bytes[4] = csio->dxfer_len;
}
/* Freeze SIM while doing recovery. */
ch->recoverycmd = 1;
xpt_freeze_simq(ch->sim, 1);
fsl_sata_begin_transaction(ch, ccb);
}
static void
fsl_sata_process_read_log(struct fsl_sata_channel *ch, union ccb *ccb)
{
uint8_t *data;
struct ata_res *res;
int i;
ch->recoverycmd = 0;
data = ccb->ataio.data_ptr;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP &&
(data[0] & 0x80) == 0) {
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.func_code != XPT_ATA_IO)
continue;
if ((data[0] & 0x1F) == i) {
res = &ch->hold[i]->ataio.res;
res->status = data[2];
res->error = data[3];
res->lba_low = data[4];
res->lba_mid = data[5];
res->lba_high = data[6];
res->device = data[7];
res->lba_low_exp = data[8];
res->lba_mid_exp = data[9];
res->lba_high_exp = data[10];
res->sector_count = data[12];
res->sector_count_exp = data[13];
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_REQUEUE_REQ;
}
fsl_sata_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
} else {
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
device_printf(ch->dev, "Error while READ LOG EXT\n");
else if ((data[0] & 0x80) == 0) {
device_printf(ch->dev, "Non-queued command error in READ LOG EXT\n");
}
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
if (ch->hold[i]->ccb_h.func_code != XPT_ATA_IO)
continue;
fsl_sata_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
}
free(ccb->ataio.data_ptr, M_FSL_SATA);
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
static void
fsl_sata_process_request_sense(struct fsl_sata_channel *ch, union ccb *ccb)
{
int i;
ch->recoverycmd = 0;
i = ccb->ccb_h.recovery_slot;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
ch->hold[i]->ccb_h.status |= CAM_AUTOSNS_VALID;
} else {
ch->hold[i]->ccb_h.status &= ~CAM_STATUS_MASK;
ch->hold[i]->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
fsl_sata_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
xpt_free_ccb(ccb);
xpt_release_simq(ch->sim, TRUE);
}
static void
fsl_sata_start(struct fsl_sata_channel *ch)
{
u_int32_t cmd;
/* Clear SATA error register */
ATA_OUTL(ch->r_mem, FSL_SATA_P_SERR, 0xFFFFFFFF);
/* Clear any interrupts pending on this channel */
ATA_OUTL(ch->r_mem, FSL_SATA_P_HSTS, 0x3F);
ATA_OUTL(ch->r_mem, FSL_SATA_P_CER, 0xFFFF);
ATA_OUTL(ch->r_mem, FSL_SATA_P_DER, 0xFFFF);
/* Start operations on this channel */
cmd = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL);
cmd |= FSL_SATA_P_HCTRL_HC_ON | FSL_SATA_P_HCTRL_SNOOP;
cmd &= ~FSL_SATA_P_HCTRL_HC_FORCE_OFF;
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, cmd |
(ch->pm_present ? FSL_SATA_P_HCTRL_PM : 0));
fsl_sata_wait_register(ch, FSL_SATA_P_HSTS,
FSL_SATA_P_HSTS_PR, FSL_SATA_P_HSTS_PR, 500);
ATA_OUTL(ch->r_mem, FSL_SATA_P_HSTS,
ATA_INL(ch->r_mem, FSL_SATA_P_HSTS) & FSL_SATA_P_HSTS_PR);
}
static void
fsl_sata_stop(struct fsl_sata_channel *ch)
{
uint32_t cmd;
int i;
/* Kill all activity on this channel */
cmd = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL);
cmd &= ~FSL_SATA_P_HCTRL_HC_ON;
for (i = 0; i < 2; i++) {
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL, cmd);
if (fsl_sata_wait_register(ch, FSL_SATA_P_HSTS,
FSL_SATA_P_HSTS_HS_ON, 0, 500)) {
if (i != 0)
device_printf(ch->dev,
"stopping FSL SATA engine failed\n");
cmd |= FSL_SATA_P_HCTRL_HC_FORCE_OFF;
} else
break;
}
ch->eslots = 0;
}
static void
fsl_sata_reset(struct fsl_sata_channel *ch)
{
uint32_t ctrl;
int i;
xpt_freeze_simq(ch->sim, 1);
if (bootverbose)
device_printf(ch->dev, "FSL SATA reset...\n");
/* Requeue the frozen command. */
if (ch->frozen) {
union ccb *fccb = ch->frozen;
ch->frozen = NULL;
fccb->ccb_h.status = CAM_REQUEUE_REQ | CAM_RELEASE_SIMQ;
if (!(fccb->ccb_h.status & CAM_DEV_QFRZN)) {
xpt_freeze_devq(fccb->ccb_h.path, 1);
fccb->ccb_h.status |= CAM_DEV_QFRZN;
}
fsl_sata_done(ch, fccb);
}
/* Kill the engine and requeue all running commands. */
fsl_sata_stop(ch);
DELAY(1000); /* sleep for 1ms */
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
/* Do we have a running request on slot? */
if (ch->slot[i].state < FSL_SATA_SLOT_RUNNING)
continue;
/* XXX: Commands in loading state. */
fsl_sata_end_transaction(&ch->slot[i], FSL_SATA_ERR_INNOCENT);
}
for (i = 0; i < FSL_SATA_MAX_SLOTS; i++) {
if (!ch->hold[i])
continue;
fsl_sata_done(ch, ch->hold[i]);
ch->hold[i] = NULL;
ch->numhslots--;
}
if (ch->toslots != 0)
xpt_release_simq(ch->sim, TRUE);
ch->eslots = 0;
ch->toslots = 0;
ch->fatalerr = 0;
/* Tell the XPT about the event */
xpt_async(AC_BUS_RESET, ch->path, NULL);
/* Disable port interrupts */
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL,
ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL) & ~0x3f);
/* Reset and reconnect PHY. */
fsl_sata_start(ch);
if (fsl_sata_wait_register(ch, FSL_SATA_P_HSTS, 0x08, 0x08, 500)) {
if (bootverbose)
device_printf(ch->dev,
"FSL SATA reset: device not found\n");
ch->devices = 0;
/* Enable wanted port interrupts */
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL,
ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL) | FSL_SATA_P_HCTRL_PHYRDY);
xpt_release_simq(ch->sim, TRUE);
return;
}
if (bootverbose)
device_printf(ch->dev, "FSL SATA reset: device found\n");
ch->devices = 1;
/* Enable wanted port interrupts */
ctrl = ATA_INL(ch->r_mem, FSL_SATA_P_HCTRL) & ~0x3f;
ATA_OUTL(ch->r_mem, FSL_SATA_P_HCTRL,
ctrl | FSL_SATA_P_HCTRL_FATAL | FSL_SATA_P_HCTRL_PHYRDY |
FSL_SATA_P_HCTRL_SIG | FSL_SATA_P_HCTRL_SNTFY |
FSL_SATA_P_HCTRL_DE | FSL_SATA_P_HCTRL_CC);
xpt_release_simq(ch->sim, TRUE);
}
static int
fsl_sata_setup_fis(struct fsl_sata_channel *ch, struct fsl_sata_cmd_tab *ctp, union ccb *ccb, int tag)
{
uint8_t *fis = &ctp->cfis[0];
bzero(fis, 32);
fis[0] = 0x27; /* host to device */
fis[1] = (ccb->ccb_h.target_id & 0x0f);
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
fis[1] |= 0x80;
fis[2] = ATA_PACKET_CMD;
if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
fis[3] = ATA_F_DMA;
else {
fis[5] = ccb->csio.dxfer_len;
fis[6] = ccb->csio.dxfer_len >> 8;
}
fis[7] = ATA_D_LBA;
fis[15] = ATA_A_4BIT;
bcopy((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes,
ctp->acmd, ccb->csio.cdb_len);
bzero(ctp->acmd + ccb->csio.cdb_len, 32 - ccb->csio.cdb_len);
} else if ((ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) == 0) {
fis[1] |= 0x80;
fis[2] = ccb->ataio.cmd.command;
fis[3] = ccb->ataio.cmd.features;
fis[4] = ccb->ataio.cmd.lba_low;
fis[5] = ccb->ataio.cmd.lba_mid;
fis[6] = ccb->ataio.cmd.lba_high;
fis[7] = ccb->ataio.cmd.device;
fis[8] = ccb->ataio.cmd.lba_low_exp;
fis[9] = ccb->ataio.cmd.lba_mid_exp;
fis[10] = ccb->ataio.cmd.lba_high_exp;
fis[11] = ccb->ataio.cmd.features_exp;
if (ccb->ataio.cmd.flags & CAM_ATAIO_FPDMA) {
fis[12] = tag << 3;
fis[13] = 0;
} else {
fis[12] = ccb->ataio.cmd.sector_count;
fis[13] = ccb->ataio.cmd.sector_count_exp;
}
fis[15] = ATA_A_4BIT;
} else {
fis[15] = ccb->ataio.cmd.control;
}
return (20);
}
static int
fsl_sata_check_ids(struct fsl_sata_channel *ch, union ccb *ccb)
{
if (ccb->ccb_h.target_id > 15) {
ccb->ccb_h.status = CAM_TID_INVALID;
fsl_sata_done(ch, ccb);
return (-1);
}
if (ccb->ccb_h.target_lun != 0) {
ccb->ccb_h.status = CAM_LUN_INVALID;
fsl_sata_done(ch, ccb);
return (-1);
}
return (0);
}
static void
fsl_sataaction(struct cam_sim *sim, union ccb *ccb)
{
struct fsl_sata_channel *ch;
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("fsl_sataaction func_code=0x%x\n", ccb->ccb_h.func_code));
ch = (struct fsl_sata_channel *)cam_sim_softc(sim);
switch (ccb->ccb_h.func_code) {
/* Common cases first */
case XPT_ATA_IO: /* Execute the requested I/O operation */
case XPT_SCSI_IO:
if (fsl_sata_check_ids(ch, ccb))
return;
if (ch->devices == 0 ||
(ch->pm_present == 0 &&
ccb->ccb_h.target_id > 0 && ccb->ccb_h.target_id < 15)) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
break;
}
ccb->ccb_h.recovery_type = RECOVERY_NONE;
/* Check for command collision. */
if (fsl_sata_check_collision(ch, ccb)) {
/* Freeze command. */
ch->frozen = ccb;
/* We have only one frozen slot, so freeze simq also. */
xpt_freeze_simq(ch->sim, 1);
return;
}
fsl_sata_begin_transaction(ch, ccb);
return;
case XPT_ABORT: /* Abort the specified CCB */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_SET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
struct fsl_sata_device *d;
if (fsl_sata_check_ids(ch, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
d->revision = cts->xport_specific.sata.revision;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE)
d->mode = cts->xport_specific.sata.mode;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT)
d->bytecount = min(8192, cts->xport_specific.sata.bytecount);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_TAGS)
d->tags = min(FSL_SATA_MAX_SLOTS, cts->xport_specific.sata.tags);
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_PM)
ch->pm_present = cts->xport_specific.sata.pm_present;
if (cts->xport_specific.sata.valid & CTS_SATA_VALID_ATAPI)
d->atapi = cts->xport_specific.sata.atapi;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS:
/* Get default/user set transfer settings for the target */
{
struct ccb_trans_settings *cts = &ccb->cts;
struct fsl_sata_device *d;
uint32_t status;
if (fsl_sata_check_ids(ch, ccb))
return;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
d = &ch->curr[ccb->ccb_h.target_id];
else
d = &ch->user[ccb->ccb_h.target_id];
cts->protocol = PROTO_UNSPECIFIED;
cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
cts->transport = XPORT_SATA;
cts->transport_version = XPORT_VERSION_UNSPECIFIED;
cts->proto_specific.valid = 0;
cts->xport_specific.sata.valid = 0;
if (cts->type == CTS_TYPE_CURRENT_SETTINGS &&
(ccb->ccb_h.target_id == 15 ||
(ccb->ccb_h.target_id == 0 && !ch->pm_present))) {
status = ATA_INL(ch->r_mem, FSL_SATA_P_SSTS) & ATA_SS_SPD_MASK;
if (status & 0x0f0) {
cts->xport_specific.sata.revision =
(status & 0x0f0) >> 4;
cts->xport_specific.sata.valid |=
CTS_SATA_VALID_REVISION;
}
cts->xport_specific.sata.caps = d->caps & CTS_SATA_CAPS_D;
if (ch->pm_level) {
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_PMREQ;
}
cts->xport_specific.sata.caps |= CTS_SATA_CAPS_H_AN;
cts->xport_specific.sata.caps &=
ch->user[ccb->ccb_h.target_id].caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
} else {
cts->xport_specific.sata.revision = d->revision;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
cts->xport_specific.sata.caps = d->caps;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
}
cts->xport_specific.sata.mode = d->mode;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
cts->xport_specific.sata.bytecount = d->bytecount;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
cts->xport_specific.sata.pm_present = ch->pm_present;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_PM;
cts->xport_specific.sata.tags = d->tags;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_TAGS;
cts->xport_specific.sata.atapi = d->atapi;
cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
ccb->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_RESET_BUS: /* Reset the specified SCSI bus */
case XPT_RESET_DEV: /* Bus Device Reset the specified SCSI device */
fsl_sata_reset(ch);
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_TERM_IO: /* Terminate the I/O process */
/* XXX Implement */
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
cpi->version_num = 1; /* XXX??? */
cpi->hba_inquiry = PI_SDTR_ABLE;
cpi->hba_inquiry |= PI_TAG_ABLE;
#if 0
/*
* XXX: CAM tries to reset port 15 if it sees port multiplier
* support. Disable it for now.
*/
cpi->hba_inquiry |= PI_SATAPM;
#endif
cpi->target_sprt = 0;
cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED;
cpi->hba_eng_cnt = 0;
/*
* XXX: This should be 15, since hardware *does* support a port
* multiplier. See above.
*/
cpi->max_target = 0;
cpi->max_lun = 0;
cpi->initiator_id = 0;
cpi->bus_id = cam_sim_bus(sim);
cpi->base_transfer_speed = 150000;
strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strncpy(cpi->hba_vid, "FSL SATA", HBA_IDLEN);
strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->transport = XPORT_SATA;
cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
cpi->protocol = PROTO_ATA;
cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
- cpi->maxio = MAXPHYS;
+ cpi->maxio = maxphys;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
fsl_sata_done(ch, ccb);
}
static void
fsl_satapoll(struct cam_sim *sim)
{
struct fsl_sata_channel *ch = (struct fsl_sata_channel *)cam_sim_softc(sim);
uint32_t istatus;
/* Read interrupt statuses and process if any. */
istatus = ATA_INL(ch->r_mem, FSL_SATA_P_HSTS);
if (istatus != 0)
fsl_sata_intr_main(ch, istatus);
}
MODULE_VERSION(fsl_sata, 1);
MODULE_DEPEND(fsl_sata, cam, 1, 1, 1);
diff --git a/sys/sys/aio.h b/sys/sys/aio.h
index 6ab8bdb7f296..380c5bcf7cf9 100644
--- a/sys/sys/aio.h
+++ b/sys/sys/aio.h
@@ -1,266 +1,266 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*
* $FreeBSD$
*/
#ifndef _SYS_AIO_H_
#define _SYS_AIO_H_
#include <sys/types.h>
#include <sys/signal.h>
#ifdef _KERNEL
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/signalvar.h>
#endif
/*
* Returned by aio_cancel:
*/
#define AIO_CANCELED 0x1
#define AIO_NOTCANCELED 0x2
#define AIO_ALLDONE 0x3
/*
* LIO opcodes
*/
#define LIO_NOP 0x0
#define LIO_WRITE 0x1
#define LIO_READ 0x2
#ifdef _KERNEL
#define LIO_SYNC 0x3
#define LIO_MLOCK 0x4
#endif
/*
* LIO modes
*/
#define LIO_NOWAIT 0x0
#define LIO_WAIT 0x1
/*
* Maximum number of operations in a single lio_listio call
*/
#define AIO_LISTIO_MAX 16
#ifdef _KERNEL
/* Default values of tunables for the AIO worker pool. */
#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS 32
#endif
#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS 4
#endif
#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT (30 * hz)
#endif
#endif
/*
* Private members for aiocb -- don't access
* directly.
*/
struct __aiocb_private {
long status;
long error;
void *kernelinfo;
};
/*
* I/O control block
*/
typedef struct aiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
void *__spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
struct sigevent aio_sigevent; /* Signal to deliver */
} aiocb_t;
#ifdef _KERNEL
typedef void aio_cancel_fn_t(struct kaiocb *);
typedef void aio_handle_fn_t(struct kaiocb *);
/*
* Kernel version of an I/O control block.
*
* Locking key:
* * - need not be protected
* a - locked by kaioinfo lock
* b - locked by backend lock
* c - locked by aio_job_mtx
*/
struct kaiocb {
TAILQ_ENTRY(kaiocb) list; /* (b) backend-specific list of jobs */
TAILQ_ENTRY(kaiocb) plist; /* (a) lists of pending / done jobs */
TAILQ_ENTRY(kaiocb) allist; /* (a) list of all jobs in proc */
int jobflags; /* (a) job flags */
int inblock; /* (*) input blocks */
int outblock; /* (*) output blocks */
int msgsnd; /* (*) messages sent */
int msgrcv; /* (*) messages received */
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
struct aioliojob *lio; /* (*) optional lio job */
struct aiocb *ujob; /* (*) pointer in userspace of aiocb */
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) copy of user I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
uint64_t seqno; /* (*) job number */
aio_cancel_fn_t *cancel_fn; /* (a) backend cancel function */
aio_handle_fn_t *handle_fn; /* (c) backend handle function */
union { /* Backend-specific data fields */
struct { /* BIO backend */
struct bio *bp; /* (*) BIO pointer */
struct buf *pbuf; /* (*) buffer pointer */
- struct vm_page *pages[btoc(MAXPHYS)+1]; /* (*) */
int npages; /* (*) number of pages */
+ struct vm_page **pages; /* (*) */
};
struct { /* fsync() requests */
int pending; /* (a) number of pending I/O */
};
struct {
void *backend1;
void *backend2;
long backend3;
int backend4;
};
};
};
struct socket;
struct sockbuf;
/*
* AIO backends should permit cancellation of queued requests waiting to
* be serviced by installing a cancel routine while the request is
* queued. The cancellation routine should dequeue the request if
* necessary and cancel it. Care must be used to handle races between
* queueing and dequeueing requests and cancellation.
*
* When queueing a request somewhere such that it can be cancelled, the
* caller should:
*
* 1) Acquire lock that protects the associated queue.
* 2) Call aio_set_cancel_function() to install the cancel routine.
* 3) If that fails, the request has a pending cancel and should be
* cancelled via aio_cancel().
* 4) Queue the request.
*
* When dequeueing a request to service it or hand it off to somewhere else,
* the caller should:
*
* 1) Acquire the lock that protects the associated queue.
* 2) Dequeue the request.
* 3) Call aio_clear_cancel_function() to clear the cancel routine.
* 4) If that fails, the cancel routine is about to be called. The
* caller should ignore the request.
*
* The cancel routine should:
*
* 1) Acquire the lock that protects the associated queue.
* 2) Call aio_cancel_cleared() to determine if the request is already
* dequeued due to a race with the dequeueing thread.
* 3) If that fails, dequeue the request.
* 4) Cancel the request via aio_cancel().
*/
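As a minimal sketch of the steps above, using only the functions declared just below (the queue, its mutex, its initialization, and the my_backend_* names are hypothetical, invented purely for illustration and not part of this header):
static struct mtx my_backend_mtx;	/* hypothetical; mtx_init() omitted */
static TAILQ_HEAD(, kaiocb) my_backend_jobs =
    TAILQ_HEAD_INITIALIZER(my_backend_jobs);

static void
my_backend_cancel(struct kaiocb *job)
{
	mtx_lock(&my_backend_mtx);		/* 1) lock the queue */
	if (!aio_cancel_cleared(job))		/* 2) lost a race with dequeue? */
		TAILQ_REMOVE(&my_backend_jobs, job, list);	/* 3) dequeue */
	mtx_unlock(&my_backend_mtx);
	aio_cancel(job);			/* 4) cancel the request */
}

static void
my_backend_queue(struct kaiocb *job)
{
	mtx_lock(&my_backend_mtx);		/* 1) lock the queue */
	/* 2) install the cancel routine */
	if (!aio_set_cancel_function(job, my_backend_cancel)) {
		/* 3) a cancel is already pending; cancel it ourselves */
		mtx_unlock(&my_backend_mtx);
		aio_cancel(job);
		return;
	}
	TAILQ_INSERT_TAIL(&my_backend_jobs, job, list);	/* 4) queue it */
	mtx_unlock(&my_backend_mtx);
}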
bool aio_cancel_cleared(struct kaiocb *job);
void aio_cancel(struct kaiocb *job);
bool aio_clear_cancel_function(struct kaiocb *job);
void aio_complete(struct kaiocb *job, long status, int error);
void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func);
bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func);
void aio_switch_vmspace(struct kaiocb *job);
#else /* !_KERNEL */
struct timespec;
__BEGIN_DECLS
/*
* Asynchronously read from a file
*/
int aio_read(struct aiocb *);
/*
* Asynchronously write to a file
*/
int aio_write(struct aiocb *);
/*
* List I/O: asynchronously/synchronously read/write to/from a file.
* "lio_mode" specifies whether or not the I/O is synchronous.
* "acb_list" is an array of "nacb_listent" I/O control blocks.
* When all I/Os are complete, the optional signal "sig" is sent.
*/
int lio_listio(int, struct aiocb *__restrict const *__restrict, int,
struct sigevent *);
/*
* Get completion status.
* Returns EINPROGRESS until the I/O is complete.
* This routine does not block.
*/
int aio_error(const struct aiocb *);
/*
* Finish up I/O, releasing I/O resources, and return the value that
* would have been associated with a synchronous I/O request.
* This routine must be called once and only once for each
* I/O control block that has had I/O associated with it.
*/
ssize_t aio_return(struct aiocb *);
/*
* Cancel I/O
*/
int aio_cancel(int, struct aiocb *);
/*
* Suspend until all specified I/O is complete or the timeout expires.
*/
int aio_suspend(const struct aiocb * const[], int, const struct timespec *);
/*
* Asynchronous mlock
*/
int aio_mlock(struct aiocb *);
#if __BSD_VISIBLE
ssize_t aio_waitcomplete(struct aiocb **, struct timespec *);
#endif
int aio_fsync(int op, struct aiocb *aiocbp);
__END_DECLS
#endif /* !_KERNEL */
#endif /* !_SYS_AIO_H_ */
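For reference, a minimal userland example of the API declared above (illustrative only; the file path and polling interval are arbitrary): submit one asynchronous read, poll aio_error() until it stops returning EINPROGRESS, then collect the result exactly once with aio_return().

#include <aio.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct aiocb cb;
	ssize_t n;
	char buf[512];
	int fd;

	fd = open("/etc/motd", O_RDONLY);
	if (fd < 0)
		err(1, "open");

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	if (aio_read(&cb) != 0)
		err(1, "aio_read");

	/* aio_error() returns EINPROGRESS until the request completes. */
	while (aio_error(&cb) == EINPROGRESS)
		usleep(1000);

	/* aio_return() must be called exactly once per completed request. */
	n = aio_return(&cb);
	if (n == -1)
		err(1, "aio_return");
	printf("read %zd bytes\n", n);

	close(fd);
	return (0);
}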
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 64d70da056c9..50fa0f35491e 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -1,599 +1,599 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $FreeBSD$
*/
#ifndef _SYS_BUF_H_
#define _SYS_BUF_H_
#include <sys/bufobj.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/lockmgr.h>
#include <vm/uma.h>
struct bio;
struct buf;
struct bufobj;
struct mount;
struct vnode;
struct uio;
/*
* To avoid including <ufs/ffs/softdep.h>
*/
LIST_HEAD(workhead, worklist);
/*
* These are currently used only by the soft dependency code, hence
* are stored once in a global variable. If other subsystems wanted
* to use these hooks, a pointer to a set of bio_ops could be added
* to each buffer.
*/
extern struct bio_ops {
void (*io_start)(struct buf *);
void (*io_complete)(struct buf *);
void (*io_deallocate)(struct buf *);
int (*io_countdeps)(struct buf *, int);
} bioops;
struct vm_object;
struct vm_page;
typedef uint32_t b_xflags_t;
/*
* The buffer header describes an I/O operation in the kernel.
*
* NOTES:
* b_bufsize, b_bcount. b_bufsize is the allocation size of the
* buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
* originally requested buffer size and can serve as a bounds check
* against EOF. For most, but not all uses, b_bcount == b_bufsize.
*
* b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
* ranges of dirty data that need to be written to backing store.
* The range is typically clipped at b_bcount ( not b_bufsize ).
*
* b_resid. Number of bytes remaining in I/O. After an I/O operation
* completes, b_resid is usually 0 indicating 100% success.
*
* All fields are protected by the buffer lock except those marked:
* V - Protected by owning bufobj lock
* Q - Protected by the buf queue lock
* D - Protected by a dependency-implementation-specific lock
*/
struct buf {
struct bufobj *b_bufobj;
long b_bcount;
void *b_caller1;
caddr_t b_data;
int b_error;
uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */
uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */
off_t b_iooffset;
long b_resid;
void (*b_iodone)(struct buf *);
void (*b_ckhashcalc)(struct buf *);
uint64_t b_ckhash; /* B_CKHASH requested check-hash */
daddr_t b_blkno; /* Underlying physical block number. */
off_t b_offset; /* Offset into file. */
TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */
uint32_t b_vflags; /* (V) BV_* flags */
uint8_t b_qindex; /* (Q) buffer queue index */
uint8_t b_domain; /* (Q) buf domain this resides in */
uint16_t b_subqueue; /* (Q) per-cpu q if any */
uint32_t b_flags; /* B_* flags. */
b_xflags_t b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
int b_runningbufspace; /* when I/O is running, pipelining */
int b_kvasize; /* size of kva for buffer */
int b_dirtyoff; /* Offset in buffer of dirty region. */
int b_dirtyend; /* Offset of end of dirty region. */
caddr_t b_kvabase; /* base kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
struct vnode *b_vp; /* Device vnode. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
union {
TAILQ_ENTRY(buf) b_freelist; /* (Q) */
struct {
void (*b_pgiodone)(void *, vm_page_t *, int, int);
int b_pgbefore;
int b_pgafter;
};
};
union cluster_info {
TAILQ_HEAD(cluster_list_head, buf) cluster_head;
TAILQ_ENTRY(buf) cluster_entry;
} b_cluster;
- struct vm_page *b_pages[btoc(MAXPHYS)];
int b_npages;
struct workhead b_dep; /* (D) List of filesystem dependencies. */
void *b_fsprivate1;
void *b_fsprivate2;
void *b_fsprivate3;
#if defined(FULL_BUF_TRACKING)
#define BUF_TRACKING_SIZE 32
#define BUF_TRACKING_ENTRY(x) ((x) & (BUF_TRACKING_SIZE - 1))
const char *b_io_tracking[BUF_TRACKING_SIZE];
uint32_t b_io_tcnt;
#elif defined(BUF_TRACKING)
const char *b_io_tracking;
#endif
+ struct vm_page *b_pages[];
};
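/*
 * Illustrative sketch (not part of this header): with b_pages[] now a
 * flexible array member, the page pointers are allocated together with the
 * buf itself, so the allocation size depends on how many pages the buffer
 * may map (e.g. atop(MAXPHYS) entries when B_MAXPHYS is set, see below).
 * A hypothetical sizing helper:
 */
static __inline size_t
example_buf_alloc_size(int npages)
{
	return (sizeof(struct buf) + npages * sizeof(struct vm_page *));
}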
#define b_object b_bufobj->bo_object
/*
* These flags are kept in b_flags.
*
* Notes:
*
* B_ASYNC VOP calls on bp's are usually async whether or not
* B_ASYNC is set, but some subsystems, such as NFS, like
* to know what is best for the caller so they can
* optimize the I/O.
*
* B_PAGING Indicates that bp is being used by the paging system
* and that the bp is not linked into
* the b_vp's clean/dirty linked lists or ref counts.
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
* The situation where B_DELWRI is set and B_CACHE is
* clear MUST be committed to disk by getblk() so
* B_DELWRI can also be cleared. See the comments for
* getblk() in kern/vfs_bio.c. If B_CACHE is clear,
* the caller is expected to clear BIO_ERROR and B_INVAL,
* set BIO_READ, and initiate an I/O.
*
* The 'entire buffer' is defined to be the range from
* 0 through b_bcount.
*
* B_MALLOC Request that the buffer be allocated from the malloc
* pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
*
* B_CLUSTEROK This flag is typically set for B_DELWRI buffers
* by filesystems that allow clustering when the buffer
* is fully dirty and indicates that it may be clustered
* with other adjacent dirty buffers. Note the clustering
* may not be used with the stage 1 data write under NFS
* but may be used for the commit rpc portion.
*
* B_INVALONERR This flag is set on dirty buffers. It specifies that a
* write error should forcibly invalidate the buffer
* contents. This flag should be used with caution, as it
* discards data. It is incompatible with B_ASYNC.
*
* B_VMIO Indicates that the buffer is tied into a VM object.
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is
* always at least DEV_BSIZE aligned, though ).
*
* B_DIRECT Hint that we should attempt to completely free
* the pages underlying the buffer. B_DIRECT is
* sticky until the buffer is released and typically
* only has an effect when B_RELBUF is also set.
*
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_CKHASH 0x00000100 /* checksum hash calculated on read */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
#define B_NOREUSE 0x00000800 /* Contents not reused once released. */
#define B_REUSE 0x00001000 /* Contents reused, second chance. */
#define B_INVAL 0x00002000 /* Does not contain valid info. */
#define B_BARRIER 0x00004000 /* Write this and all preceding first. */
#define B_NOCACHE 0x00008000 /* Do not cache block after use. */
#define B_MALLOC 0x00010000 /* malloced b_data */
#define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */
#define B_INVALONERR 0x00040000 /* Invalidate on write error. */
#define B_00080000 0x00080000 /* Available flag. */
#define B_00100000 0x00100000 /* Available flag. */
-#define B_00200000 0x00200000 /* Available flag. */
+#define B_MAXPHYS 0x00200000 /* nitems(b_pages[]) = atop(MAXPHYS). */
#define B_RELBUF 0x00400000 /* Release VMIO buffer. */
#define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */
#define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */
#define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */
#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
#define B_MANAGED 0x08000000 /* Managed by FS. */
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
#define B_REMFREE 0x80000000 /* Delayed bremfree */
#define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
- "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \
+ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26maxphys\25b20" \
"\24b19\23invalonerr\22clusterok\21malloc\20nocache\17b14\16inval" \
"\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \
"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
/*
* These flags are kept in b_xflags.
*
* BX_FSPRIV reserves a set of eight flags that may be used by individual
* filesystems for their own purpose. Their specific definitions are
* found in the header files for each filesystem that uses them.
*/
#define BX_VNDIRTY 0x00000001 /* On vnode dirty list */
#define BX_VNCLEAN 0x00000002 /* On vnode clean list */
#define BX_CVTENXIO 0x00000004 /* Convert errors to ENXIO */
#define BX_BKGRDWRITE 0x00000010 /* Do writes in background */
#define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */
#define BX_ALTDATA 0x00000040 /* Holds extended data */
#define BX_FSPRIV 0x00FF0000 /* Filesystem-specific flags mask */
#define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\3cvtenxio" \
"\2clean\1dirty"
#define NOOFFSET (-1LL) /* No buffer offset calculated yet */
/*
* These flags are kept in b_vflags.
*/
#define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */
#define BV_BKGRDINPROG 0x00000002 /* Background write in progress */
#define BV_BKGRDWAIT 0x00000004 /* Background write waiting */
#define BV_BKGRDERR 0x00000008 /* Error from background write */
#define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned"
#ifdef _KERNEL
#ifndef NSWBUF_MIN
#define NSWBUF_MIN 16
#endif
/*
* Buffer locking
*/
extern const char *buf_wmesg; /* Default buffer lock message */
#define BUF_WMESG "bufwait"
#include <sys/proc.h> /* XXX for curthread */
#include <sys/mutex.h>
/*
* Initialize a lock.
*/
#define BUF_LOCKINIT(bp) \
lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW)
/*
* Get a lock, sleeping non-interruptibly until it becomes available.
*/
#define BUF_LOCK(bp, locktype, interlock) \
_lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \
LOCK_FILE, LOCK_LINE)
/*
* Get a lock, sleeping with the specified interruptibility and timeout.
*/
#define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \
_lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \
(interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \
LOCK_FILE, LOCK_LINE)
/*
* Release a lock. Only the acquiring process may free the lock unless
* it has been handed off to biodone.
*/
#define BUF_UNLOCK(bp) do { \
KASSERT(((bp)->b_flags & B_REMFREE) == 0, \
("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \
\
BUF_UNLOCK_RAW((bp)); \
} while (0)
#define BUF_UNLOCK_RAW(bp) do { \
(void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \
LOCK_FILE, LOCK_LINE); \
} while (0)
/*
* Check if a buffer lock is recursed.
*/
#define BUF_LOCKRECURSED(bp) \
lockmgr_recursed(&(bp)->b_lock)
/*
* Check if a buffer lock is currently held.
*/
#define BUF_ISLOCKED(bp) \
lockstatus(&(bp)->b_lock)
/*
* Free a buffer lock.
*/
#define BUF_LOCKFREE(bp) \
lockdestroy(&(bp)->b_lock)
/*
* Print information about a buffer lock.
*/
#define BUF_LOCKPRINTINFO(bp) \
lockmgr_printinfo(&(bp)->b_lock)
/*
* Buffer lock assertions.
*/
#if defined(INVARIANTS) && defined(INVARIANT_SUPPORT)
#define BUF_ASSERT_LOCKED(bp) \
_lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE)
#define BUF_ASSERT_SLOCKED(bp) \
_lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE)
#define BUF_ASSERT_XLOCKED(bp) \
_lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE)
#define BUF_ASSERT_UNLOCKED(bp) \
_lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE)
#else
#define BUF_ASSERT_LOCKED(bp)
#define BUF_ASSERT_SLOCKED(bp)
#define BUF_ASSERT_XLOCKED(bp)
#define BUF_ASSERT_UNLOCKED(bp)
#endif
#ifdef _SYS_PROC_H_ /* Avoid #include <sys/proc.h> pollution */
/*
* When initiating asynchronous I/O, change ownership of the lock to the
* kernel. Once done, the lock may legally be released by biodone. The
* original owning process can no longer acquire it recursively, but must
* wait until the I/O is completed and the lock has been freed by biodone.
*/
#define BUF_KERNPROC(bp) \
_lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE)
#endif
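/*
 * Illustrative sketch (not part of this header): the asynchronous-I/O
 * hand-off described above.  Assumes the buffer already has a bufobj and
 * that <sys/bio.h> is included for BIO_WRITE; bstrategy() is defined
 * further below.
 */
static __inline void
example_async_write(struct buf *bp)
{
	BUF_LOCK(bp, LK_EXCLUSIVE, NULL);	/* sleep until available */
	bp->b_iocmd = BIO_WRITE;
	bp->b_flags |= B_ASYNC;
	BUF_KERNPROC(bp);			/* lock now owned by the kernel */
	bstrategy(bp);				/* biodone() will release it */
}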
#endif /* _KERNEL */
struct buf_queue_head {
TAILQ_HEAD(buf_queue, buf) queue;
daddr_t last_pblkno;
struct buf *insert_point;
struct buf *switch_point;
};
/*
* This structure describes a clustered I/O.
*/
struct cluster_save {
long bs_bcount; /* Saved b_bcount. */
long bs_bufsize; /* Saved b_bufsize. */
int bs_nchildren; /* Number of associated buffers. */
struct buf **bs_children; /* List of associated buffers. */
};
#ifdef _KERNEL
static __inline int
bwrite(struct buf *bp)
{
KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp));
KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp));
KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL,
("bwrite: no bop_write bp=%p", bp));
return (BO_WRITE(bp->b_bufobj, bp));
}
static __inline void
bstrategy(struct buf *bp)
{
KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp));
KASSERT(bp->b_bufobj->bo_ops != NULL,
("bstrategy: no bo_ops bp=%p", bp));
KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL,
("bstrategy: no bop_strategy bp=%p", bp));
BO_STRATEGY(bp->b_bufobj, bp);
}
static __inline void
buf_start(struct buf *bp)
{
if (bioops.io_start)
(*bioops.io_start)(bp);
}
static __inline void
buf_complete(struct buf *bp)
{
if (bioops.io_complete)
(*bioops.io_complete)(bp);
}
static __inline void
buf_deallocate(struct buf *bp)
{
if (bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
}
static __inline int
buf_countdeps(struct buf *bp, int i)
{
if (bioops.io_countdeps)
return ((*bioops.io_countdeps)(bp, i));
else
return (0);
}
static __inline void
buf_track(struct buf *bp __unused, const char *location __unused)
{
#if defined(FULL_BUF_TRACKING)
bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location;
#elif defined(BUF_TRACKING)
bp->b_io_tracking = location;
#endif
}
#endif /* _KERNEL */
/*
* Zero out the buffer's data area.
*/
#define clrbuf(bp) { \
bzero((bp)->b_data, (u_int)(bp)->b_bcount); \
(bp)->b_resid = 0; \
}
/*
* Flags for getblk's last parameter.
*/
#define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */
#define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */
#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */
#define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */
#define GB_KVAALLOC 0x0010 /* But allocate KVA. */
#define GB_CKHASH 0x0020 /* If reading, calc checksum hash */
#define GB_NOSPARSE 0x0040 /* Do not instantiate holes */
#define GB_CVTENXIO 0x0080 /* Convert errors to ENXIO */
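/*
 * Illustrative sketch (not part of this header): look up or create the
 * buffer for a logical block, failing rather than sleeping if its buf
 * lock is contended.  getblk() itself is declared further below.
 */
static __inline struct buf *
example_getblk_nowait(struct vnode *vp, daddr_t lbn, int size)
{
	return (getblk(vp, lbn, size, 0, 0, GB_LOCK_NOWAIT));
}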
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
-extern long maxswzone; /* Max KVA for swap structures */
-extern long maxbcache; /* Max KVA for buffer cache */
+extern u_long maxswzone; /* Max KVA for swap structures */
+extern u_long maxbcache; /* Max KVA for buffer cache */
extern int maxbcachebuf; /* Max buffer cache block size */
extern long runningbufspace;
extern long hibufspace;
extern int dirtybufthresh;
extern int bdwriteskip;
extern int dirtybufferflushes;
extern int altbufferflushes;
extern int nswbuf; /* Number of swap I/O buffer headers. */
extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */
static inline int
buf_mapped(struct buf *bp)
{
return (bp->b_data != unmapped_buf);
}
void runningbufwakeup(struct buf *);
void waitrunningbufspace(void);
caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
void bufinit(void);
void bufshutdown(int);
void bdata2bio(struct buf *bp, struct bio *bip);
void bwillwrite(void);
int buf_dirty_count_severe(void);
void bremfree(struct buf *);
void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */
#define bread(vp, blkno, size, cred, bpp) \
breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, 0, \
NULL, bpp)
#define bread_gb(vp, blkno, size, cred, gbflags, bpp) \
breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, \
gbflags, NULL, bpp)
#define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \
breadn_flags(vp, blkno, blkno, size, rablkno, rabsize, cnt, cred, \
0, NULL, bpp)
int breadn_flags(struct vnode *, daddr_t, daddr_t, int, daddr_t *, int *,
int, struct ucred *, int, void (*)(struct buf *), struct buf **);
void bdwrite(struct buf *);
void bawrite(struct buf *);
void babarrierwrite(struct buf *);
int bbarrierwrite(struct buf *);
void bdirty(struct buf *);
void bundirty(struct buf *);
void bufstrategy(struct bufobj *, struct buf *);
void brelse(struct buf *);
void bqrelse(struct buf *);
int vfs_bio_awrite(struct buf *);
void vfs_busy_pages_acquire(struct buf *bp);
void vfs_busy_pages_release(struct buf *bp);
struct buf *incore(struct bufobj *, daddr_t);
bool inmem(struct vnode *, daddr_t);
struct buf *gbincore(struct bufobj *, daddr_t);
struct buf *gbincore_unlocked(struct bufobj *, daddr_t);
struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size,
int slpflag, int slptimeo, int flags, struct buf **bpp);
struct buf *geteblk(int, int);
int bufwait(struct buf *);
int bufwrite(struct buf *);
void bufdone(struct buf *);
void bd_speedup(void);
extern uma_zone_t pbuf_zone;
uma_zone_t pbuf_zsecond_create(const char *name, int max);
int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, long, int, int, struct buf **);
int cluster_wbuild(struct vnode *, long, daddr_t, int, int);
void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int);
void vfs_bio_brelse(struct buf *bp, int ioflags);
void vfs_bio_bzero_buf(struct buf *bp, int base, int size);
void vfs_bio_clrbuf(struct buf *);
void vfs_bio_set_flags(struct buf *bp, int ioflags);
void vfs_bio_set_valid(struct buf *, int base, int size);
void vfs_busy_pages(struct buf *, int clear_modify);
void vfs_unbusy_pages(struct buf *);
int vmapbuf(struct buf *, void *, size_t, int);
void vunmapbuf(struct buf *);
void brelvp(struct buf *);
void bgetvp(struct vnode *, struct buf *);
void pbgetbo(struct bufobj *bo, struct buf *bp);
void pbgetvp(struct vnode *, struct buf *);
void pbrelbo(struct buf *);
void pbrelvp(struct buf *);
int allocbuf(struct buf *bp, int size);
void reassignbuf(struct buf *);
void bwait(struct buf *, u_char, const char *);
void bdone(struct buf *);
typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t);
typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t);
int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count,
int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
vbg_get_blksize_t get_blksize);
#endif /* _KERNEL */
#endif /* !_SYS_BUF_H_ */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 577f0e1d6e07..00fb0c860e72 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -1,370 +1,370 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)param.h 8.3 (Berkeley) 4/4/95
* $FreeBSD$
*/
#ifndef _SYS_PARAM_H_
#define _SYS_PARAM_H_
#include <sys/_null.h>
#define BSD 199506 /* System version (year & month). */
#define BSD4_3 1
#define BSD4_4 1
/*
* __FreeBSD_version numbers are documented in the Porter's Handbook.
* If you bump the version for any reason, you should update the documentation
* there.
* Currently this lives here in the doc/ repository:
*
* head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
*
* scheme is: <major><two digit minor>Rxx
* 'R' is in the range 0 to 4 if this is a release branch or
* X.0-CURRENT before releng/X.0 is created, otherwise 'R' is
* in the range 5 to 9.
*/
#undef __FreeBSD_version
#define __FreeBSD_version 1300130 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
* which by definition is always true on FreeBSD. This macro is also defined
* on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
*
* It is tempting to use this macro in userland code when we want to enable
* kernel-specific routines, and in fact it's fine to do this in code that
* is part of FreeBSD itself. However, be aware that as presence of this
* macro is still not widespread (e.g. older FreeBSD versions, 3rd party
* compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
* external applications without also checking for __FreeBSD__ as an
* alternative.
*/
#undef __FreeBSD_kernel__
#define __FreeBSD_kernel__
#if defined(_KERNEL) || defined(IN_RTLD)
#define P_OSREL_SIGWAIT 700000
#define P_OSREL_SIGSEGV 700004
#define P_OSREL_MAP_ANON 800104
#define P_OSREL_MAP_FSTRICT 1100036
#define P_OSREL_SHUTDOWN_ENOTCONN 1100077
#define P_OSREL_MAP_GUARD 1200035
#define P_OSREL_WRFSBASE 1200041
#define P_OSREL_CK_CYLGRP 1200046
#define P_OSREL_VMTOTAL64 1200054
#define P_OSREL_CK_SUPERBLOCK 1300000
#define P_OSREL_CK_INODE 1300005
#define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070
#define P_OSREL_MAJOR(x) ((x) / 100000)
#endif
#ifndef LOCORE
#include <sys/types.h>
#endif
/*
* Machine-independent constants (some used in following include files).
* Redefined constants are from POSIX 1003.1 limits file.
*
* MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
*/
#include <sys/syslimits.h>
#define MAXCOMLEN 19 /* max command name remembered */
#define MAXINTERP PATH_MAX /* max interpreter file name length */
#define MAXLOGNAME 33 /* max login name length (incl. NUL) */
#define MAXUPRC CHILD_MAX /* max simultaneous processes */
#define NCARGS ARG_MAX /* max bytes for an exec function */
#define NGROUPS (NGROUPS_MAX+1) /* max number groups */
#define NOFILE OPEN_MAX /* max open files per process */
#define NOGROUP 65535 /* marker for empty group set member */
#define MAXHOSTNAMELEN 256 /* max hostname size */
#define SPECNAMELEN 255 /* max length of devicename */
/* More types and definitions used throughout the kernel. */
#ifdef _KERNEL
#include <sys/cdefs.h>
#include <sys/errno.h>
#ifndef LOCORE
#include <sys/time.h>
#include <sys/priority.h>
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
#endif
#ifndef _KERNEL
#ifndef LOCORE
/* Signals. */
#include <sys/signal.h>
#endif
#endif
/* Machine type dependent parameters. */
#include <machine/param.h>
#ifndef _KERNEL
#include <sys/limits.h>
#endif
#ifndef DEV_BSHIFT
#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */
#endif
#define DEV_BSIZE (1<<DEV_BSHIFT)
#ifndef BLKDEV_IOSIZE
#define BLKDEV_IOSIZE PAGE_SIZE /* default block device I/O size */
#endif
#ifndef DFLTPHYS
#define DFLTPHYS (64 * 1024) /* default max raw I/O transfer size */
#endif
#ifndef MAXPHYS
-#define MAXPHYS (128 * 1024) /* max raw I/O transfer size */
+#define MAXPHYS (1024 * 1024) /* max raw I/O transfer size */
#endif
#ifndef MAXDUMPPGS
#define MAXDUMPPGS (DFLTPHYS/PAGE_SIZE)
#endif
/*
* Constants related to network buffer management.
* MCLBYTES must be no larger than PAGE_SIZE.
*/
#ifndef MSIZE
#define MSIZE 256 /* size of an mbuf */
#endif
#ifndef MCLSHIFT
#define MCLSHIFT 11 /* convert bytes to mbuf clusters */
#endif /* MCLSHIFT */
#define MCLBYTES (1 << MCLSHIFT) /* size of an mbuf cluster */
#if PAGE_SIZE < 2048
#define MJUMPAGESIZE MCLBYTES
#elif PAGE_SIZE <= 8192
#define MJUMPAGESIZE PAGE_SIZE
#else
#define MJUMPAGESIZE (8 * 1024)
#endif
#define MJUM9BYTES (9 * 1024) /* jumbo cluster 9k */
#define MJUM16BYTES (16 * 1024) /* jumbo cluster 16k */
/*
* Some macros for units conversion
*/
/* clicks to bytes */
#ifndef ctob
#define ctob(x) ((x)<<PAGE_SHIFT)
#endif
/* bytes to clicks */
#ifndef btoc
#define btoc(x) (((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
#endif
/*
* btodb() is messy and perhaps slow because `bytes' may be an off_t. We
* want to shift an unsigned type to avoid sign extension and we don't
* want to widen `bytes' unnecessarily. Assume that the result fits in
* a daddr_t.
*/
#ifndef btodb
#define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \
(sizeof (bytes) > sizeof(long) \
? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
: (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
#endif
#ifndef dbtob
#define dbtob(db) /* calculates (db * DEV_BSIZE) */ \
((off_t)(db) << DEV_BSHIFT)
#endif
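/*
 * Worked example (illustrative, assuming PAGE_SHIFT == 12 and
 * DEV_BSHIFT == 9): with MAXPHYS raised to 1 MiB by this change,
 * btoc(MAXPHYS) == 1048576 >> 12 == 256 pages and
 * btodb(MAXPHYS) == 1048576 >> 9 == 2048 DEV_BSIZE blocks,
 * versus 32 pages and 256 blocks at the previous 128 KiB value.
 */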
#define PRIMASK 0x0ff
#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */
#define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */
#define NZERO 0 /* default "nice" */
#define NBBY 8 /* number of bits in a byte */
#define NBPW sizeof(int) /* number of bytes per word (integer) */
#define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */
#define NODEV (dev_t)(-1) /* non-existent device */
/*
* File system parameters and macros.
*
* MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes
* per block. MAXBSIZE may be made larger without affecting
* any existing filesystems as long as it does not exceed MAXPHYS,
* and may be made smaller at the risk of not being able to use
* filesystems which require a block size exceeding MAXBSIZE.
*
* MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must
* be >= MAXBSIZE and can be set differently for different
* architectures by defining it in <machine/param.h>.
* Making this larger allows NFS to do larger reads/writes.
*
* BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the
* minimum KVM memory reservation the kernel is willing to make.
* Filesystems can of course request smaller chunks. Actual
* backing memory uses a chunk size of a page (PAGE_SIZE).
* The default value here can be overridden on a per-architecture
* basis by defining it in <machine/param.h>.
*
* If you make BKVASIZE too small you risk seriously fragmenting
* the buffer KVM map which may slow things down a bit. If you
* make it too big the kernel will not be able to optimally use
* the KVM memory reserved for the buffer cache and will wind
* up with too-few buffers.
*
* The default is 16384, roughly 2x the block size used by a
* normal UFS filesystem.
*/
#define MAXBSIZE 65536 /* must be power of 2 */
#ifndef MAXBCACHEBUF
#define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
#endif
#ifndef BKVASIZE
#define BKVASIZE 16384 /* must be power of 2 */
#endif
#define BKVAMASK (BKVASIZE-1)
/*
* MAXPATHLEN defines the longest permissible path length after expanding
* symbolic links. It is used to allocate a temporary buffer from the buffer
* pool in which to do the name expansion, hence should be a power of two,
* and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the
* maximum number of symbolic links that may be expanded in a path name.
* It should be set high enough to allow all legitimate uses, but halt
* infinite loops reasonably quickly.
*/
#define MAXPATHLEN PATH_MAX
#define MAXSYMLINKS 32
/* Bit map related macros. */
#define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
#define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
#define isset(a,i) \
(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
#define isclr(a,i) \
((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
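/*
 * Worked example (illustrative): with NBBY == 8, setbit(a, 10) sets bit
 * 2 of a[1] (10 / 8 == 1, 10 % 8 == 2), and isset(a, 10) reads back that
 * same bit.
 */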
/* Macros for counting and rounding. */
#ifndef howmany
#define howmany(x, y) (((x)+((y)-1))/(y))
#endif
#define nitems(x) (sizeof((x)) / sizeof((x)[0]))
#define rounddown(x, y) (((x)/(y))*(y))
#define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is a power of two */
#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */
#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is a power of two */
#define powerof2(x) ((((x)-1)&(x))==0)
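/*
 * Worked example (illustrative): howmany(130, 64) == 3,
 * roundup(130, 64) == 192, rounddown2(130, 64) == 128, and
 * powerof2(64) is true while powerof2(130) is false.
 */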
/* Macros for min/max. */
#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))
#ifdef _KERNEL
/*
* Basic byte order function prototypes for non-inline functions.
*/
#ifndef LOCORE
#ifndef _BYTEORDER_PROTOTYPED
#define _BYTEORDER_PROTOTYPED
__BEGIN_DECLS
__uint32_t htonl(__uint32_t);
__uint16_t htons(__uint16_t);
__uint32_t ntohl(__uint32_t);
__uint16_t ntohs(__uint16_t);
__END_DECLS
#endif
#endif
#ifndef _BYTEORDER_FUNC_DEFINED
#define _BYTEORDER_FUNC_DEFINED
#define htonl(x) __htonl(x)
#define htons(x) __htons(x)
#define ntohl(x) __ntohl(x)
#define ntohs(x) __ntohs(x)
#endif /* !_BYTEORDER_FUNC_DEFINED */
#endif /* _KERNEL */
/*
* Scale factor for scaled integers used to count %cpu time and load avgs.
*
* The number of CPU `tick's that map to a unique `%age' can be expressed
* by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that
* can be calculated (assuming 32 bits) can be closely approximated using
* the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
*
* For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
* FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
*/
#define FSHIFT 11 /* bits to right of fixed binary point */
#define FSCALE (1<<FSHIFT)
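/*
 * Worked example (illustrative): with FSHIFT == 11, FSCALE == 2048, so a
 * load average of 1.00 is stored as the scaled integer 2048 and 0.50 as
 * 1024; dividing the stored value by FSCALE recovers the load average.
 */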
#define dbtoc(db) /* calculates devblks to pages */ \
((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
#define ctodb(db) /* calculates pages to devblks */ \
((db) << (PAGE_SHIFT - DEV_BSHIFT))
/*
* Old spelling of __containerof().
*/
#define member2struct(s, m, x) \
((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
/*
* Access a variable length array that has been declared as a fixed
* length array.
*/
#define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
#endif /* _SYS_PARAM_H_ */
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 3d9dfc8cbe28..5de12e5bc1e5 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -1,631 +1,633 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)systm.h 8.7 (Berkeley) 3/29/95
* $FreeBSD$
*/
#ifndef _SYS_SYSTM_H_
#define _SYS_SYSTM_H_
#include <sys/cdefs.h>
#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <sys/callout.h>
#include <sys/queue.h>
#include <sys/stdint.h> /* for people using printf mainly */
__NULLABILITY_PRAGMA_PUSH
#ifdef _KERNEL
extern int cold; /* nonzero if we are doing a cold boot */
extern int suspend_blocked; /* block suspend due to pending shutdown */
extern int rebooting; /* kern_reboot() has been called. */
extern const char *panicstr; /* panic message */
extern bool panicked;
#define KERNEL_PANICKED() __predict_false(panicked)
extern char version[]; /* system version */
extern char compiler_version[]; /* compiler version */
extern char copyright[]; /* system copyright */
extern int kstack_pages; /* number of kernel stack pages */
extern u_long pagesizes[]; /* supported page sizes */
extern long physmem; /* physical memory */
extern long realmem; /* 'real' memory */
extern char *rootdevnames[2]; /* names of possible root devices */
extern int boothowto; /* reboot flags, from console subsystem */
extern int bootverbose; /* nonzero to print verbose messages */
extern int maxusers; /* system tune hint */
extern int ngroups_max; /* max # of supplemental groups */
extern int vm_guest; /* Running as virtual machine guest? */
+extern u_long maxphys; /* max raw I/O transfer size */
+
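/*
 * Illustrative sketch (not part of this change): code that previously
 * clamped transfer sizes to the compile-time MAXPHYS constant can consult
 * the runtime variable instead.  The helper name is hypothetical.
 */
static __inline size_t
example_clamp_to_maxphys(size_t len)
{
	return (len > maxphys ? maxphys : len);
}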
/*
* Detected virtual machine guest types. The intention is to expand
* and/or add to the VM_GUEST_VM type if specific VM functionality is
* ever implemented (e.g. vendor-specific paravirtualization features).
* Keep in sync with vm_guest_sysctl_names[].
*/
enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV,
VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX,
VM_GUEST_PARALLELS, VM_LAST };
#ifdef INVARIANTS /* The option is always available */
#define VNASSERT(exp, vp, msg) do { \
if (__predict_false(!(exp))) { \
vn_printf(vp, "VNASSERT failed: %s not true at %s:%d (%s)\n",\
#exp, __FILE__, __LINE__, __func__); \
kassert_panic msg; \
} \
} while (0)
#define VNPASS(exp, vp) do { \
const char *_exp = #exp; \
VNASSERT(exp, vp, ("condition %s not met at %s:%d (%s)", \
_exp, __FILE__, __LINE__, __func__)); \
} while (0)
#define __assert_unreachable() \
panic("executing segment marked as unreachable at %s:%d (%s)\n", \
__FILE__, __LINE__, __func__)
#else
#define VNASSERT(exp, vp, msg) do { \
} while (0)
#define VNPASS(exp, vp) do { \
} while (0)
#define __assert_unreachable() __unreachable()
#endif
#ifndef CTASSERT /* Allow lint to override */
#define CTASSERT(x) _Static_assert(x, "compile-time assertion failed")
#endif
#endif /* KERNEL */
/*
* These functions need to be declared before the KASSERT macro is invoked in
* !KASSERT_PANIC_OPTIONAL builds, so their declarations are sort of out of
* place compared to other function definitions in this header. On the other
* hand, this header is a bit disorganized anyway.
*/
void panic(const char *, ...) __dead2 __printflike(1, 2);
void vpanic(const char *, __va_list) __dead2 __printflike(1, 0);
#if defined(_STANDALONE)
struct ucred;
/*
* Until we have more experience with KASSERTS that are called
* from the boot loader, they are off. The bootloader does this
* a little differently than the kernel (we just call printf atm).
* We avoid most of the common functions in the boot loader, so
* declare printf() here too.
*/
int printf(const char *, ...) __printflike(1, 2);
# define kassert_panic printf
#else /* !_STANDALONE */
# if defined(WITNESS) || defined(INVARIANT_SUPPORT)
# ifdef KASSERT_PANIC_OPTIONAL
void kassert_panic(const char *fmt, ...) __printflike(1, 2);
# else
# define kassert_panic panic
# endif /* KASSERT_PANIC_OPTIONAL */
# endif /* defined(WITNESS) || defined(INVARIANT_SUPPORT) */
#endif /* _STANDALONE */
#if defined(INVARIANTS) || defined(_STANDALONE)
#define KASSERT(exp,msg) do { \
if (__predict_false(!(exp))) \
kassert_panic msg; \
} while (0)
#else /* !INVARIANTS && !_STANDALONE */
#define KASSERT(exp,msg) do { \
} while (0)
#endif /* INVARIANTS || _STANDALONE */
/*
* Helpful macros for quickly coming up with assertions with informative
* panic messages.
*/
#define MPASS(ex) MPASS4(ex, #ex, __FILE__, __LINE__)
#define MPASS2(ex, what) MPASS4(ex, what, __FILE__, __LINE__)
#define MPASS3(ex, file, line) MPASS4(ex, #ex, file, line)
#define MPASS4(ex, what, file, line) \
KASSERT((ex), ("Assertion %s failed at %s:%d", what, file, line))
/*
* Align variables.
*/
#define __read_mostly __section(".data.read_mostly")
#define __read_frequently __section(".data.read_frequently")
#define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \
__section(".data.exclusive_cache_line")
#ifdef _KERNEL
#include <sys/param.h> /* MAXCPU */
#include <sys/pcpu.h> /* curthread */
#include <sys/kpilite.h>
/*
* Assert that a pointer can be loaded from memory atomically.
*
* This assertion enforces stronger alignment than necessary. For example,
* on some architectures, atomicity for unaligned loads will depend on
* whether or not the load spans multiple cache lines.
*/
#define ASSERT_ATOMIC_LOAD_PTR(var, msg) \
KASSERT(sizeof(var) == sizeof(void *) && \
((uintptr_t)&(var) & (sizeof(void *) - 1)) == 0, msg)
/*
* Assert that a thread is in critical(9) section.
*/
#define CRITICAL_ASSERT(td) \
KASSERT((td)->td_critnest >= 1, ("Not in critical section"));
/*
* If we have already panic'd and this is the thread that called
* panic(), then don't block on any mutexes but silently succeed.
* Otherwise, the kernel will deadlock since the scheduler isn't
* going to run the thread that holds any lock we need.
*/
#define SCHEDULER_STOPPED_TD(td) ({ \
MPASS((td) == curthread); \
__predict_false((td)->td_stopsched); \
})
#define SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread)
extern int osreldate;
extern const void *zero_region; /* address space maps to a zeroed page */
extern int unmapped_buf_allowed;
#ifdef __LP64__
#define IOSIZE_MAX iosize_max()
#define DEVFS_IOSIZE_MAX devfs_iosize_max()
#else
#define IOSIZE_MAX SSIZE_MAX
#define DEVFS_IOSIZE_MAX SSIZE_MAX
#endif
/*
* General function declarations.
*/
struct inpcb;
struct lock_object;
struct malloc_type;
struct mtx;
struct proc;
struct socket;
struct thread;
struct tty;
struct ucred;
struct uio;
struct _jmp_buf;
struct trapframe;
struct eventtimer;
int setjmp(struct _jmp_buf *) __returns_twice;
void longjmp(struct _jmp_buf *, int) __dead2;
int dumpstatus(vm_offset_t addr, off_t count);
int nullop(void);
int eopnotsupp(void);
int ureadc(int, struct uio *);
void hashdestroy(void *, struct malloc_type *, u_long);
void *hashinit(int count, struct malloc_type *type, u_long *hashmask);
void *hashinit_flags(int count, struct malloc_type *type,
u_long *hashmask, int flags);
#define HASH_NOWAIT 0x00000001
#define HASH_WAITOK 0x00000002
void *phashinit(int count, struct malloc_type *type, u_long *nentries);
void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries,
int flags);
void g_waitidle(void);
void cpu_flush_dcache(void *, size_t);
void cpu_rootconf(void);
void critical_enter_KBI(void);
void critical_exit_KBI(void);
void critical_exit_preempt(void);
void init_param1(void);
void init_param2(long physpages);
void init_static_kenv(char *, size_t);
void tablefull(const char *);
/*
* Allocate per-thread "current" state in the linuxkpi
*/
extern int (*lkpi_alloc_current)(struct thread *, int);
int linux_alloc_current_noop(struct thread *, int);
#if defined(KLD_MODULE) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET)
#define critical_enter() critical_enter_KBI()
#define critical_exit() critical_exit_KBI()
#else
static __inline void
critical_enter(void)
{
struct thread_lite *td;
td = (struct thread_lite *)curthread;
td->td_critnest++;
__compiler_membar();
}
static __inline void
critical_exit(void)
{
struct thread_lite *td;
td = (struct thread_lite *)curthread;
KASSERT(td->td_critnest != 0,
("critical_exit: td_critnest == 0"));
__compiler_membar();
td->td_critnest--;
__compiler_membar();
if (__predict_false(td->td_owepreempt))
critical_exit_preempt();
}
#endif
#ifdef EARLY_PRINTF
typedef void early_putc_t(int ch);
extern early_putc_t *early_putc;
#endif
int kvprintf(char const *, void (*)(int, void*), void *, int,
__va_list) __printflike(1, 0);
void log(int, const char *, ...) __printflike(2, 3);
void log_console(struct uio *);
void vlog(int, const char *, __va_list) __printflike(2, 0);
int asprintf(char **ret, struct malloc_type *mtp, const char *format,
...) __printflike(3, 4);
int printf(const char *, ...) __printflike(1, 2);
int snprintf(char *, size_t, const char *, ...) __printflike(3, 4);
int sprintf(char *buf, const char *, ...) __printflike(2, 3);
int uprintf(const char *, ...) __printflike(1, 2);
int vprintf(const char *, __va_list) __printflike(1, 0);
int vasprintf(char **ret, struct malloc_type *mtp, const char *format,
__va_list ap) __printflike(3, 0);
int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0);
int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0);
int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0);
int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3);
int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0);
long strtol(const char *, char **, int);
u_long strtoul(const char *, char **, int);
quad_t strtoq(const char *, char **, int);
u_quad_t strtouq(const char *, char **, int);
void tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4);
void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0);
void hexdump(const void *ptr, int length, const char *hdr, int flags);
#define HD_COLUMN_MASK 0xff
#define HD_DELIM_MASK 0xff00
#define HD_OMIT_COUNT (1 << 16)
#define HD_OMIT_HEX (1 << 17)
#define HD_OMIT_CHARS (1 << 18)
#define ovbcopy(f, t, l) bcopy((f), (t), (l))
void bcopy(const void * _Nonnull from, void * _Nonnull to, size_t len);
void bzero(void * _Nonnull buf, size_t len);
void explicit_bzero(void * _Nonnull, size_t);
int bcmp(const void *b1, const void *b2, size_t len);
void *memset(void * _Nonnull buf, int c, size_t len);
void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len);
void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n);
int memcmp(const void *b1, const void *b2, size_t len);
#ifdef KCSAN
void *kcsan_memset(void *, int, size_t);
void *kcsan_memcpy(void *, const void *, size_t);
void *kcsan_memmove(void *, const void *, size_t);
int kcsan_memcmp(const void *, const void *, size_t);
#define bcopy(from, to, len) kcsan_memmove((to), (from), (len))
#define bzero(buf, len) kcsan_memset((buf), 0, (len))
#define bcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len))
#define memset(buf, c, len) kcsan_memset((buf), (c), (len))
#define memcpy(to, from, len) kcsan_memcpy((to), (from), (len))
#define memmove(dest, src, n) kcsan_memmove((dest), (src), (n))
#define memcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len))
#else
#define bcopy(from, to, len) __builtin_memmove((to), (from), (len))
#define bzero(buf, len) __builtin_memset((buf), 0, (len))
#define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len))
#define memset(buf, c, len) __builtin_memset((buf), (c), (len))
#define memcpy(to, from, len) __builtin_memcpy((to), (from), (len))
#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n))
#define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len))
#endif
void *memset_early(void * _Nonnull buf, int c, size_t len);
#define bzero_early(buf, len) memset_early((buf), 0, (len))
void *memcpy_early(void * _Nonnull to, const void * _Nonnull from, size_t len);
void *memmove_early(void * _Nonnull dest, const void * _Nonnull src, size_t n);
#define bcopy_early(from, to, len) memmove_early((to), (from), (len))
#define copystr(src, dst, len, outlen) ({ \
size_t __r, __len, *__outlen; \
\
__len = (len); \
__outlen = (outlen); \
__r = strlcpy((dst), (src), __len); \
if (__outlen != NULL) \
*__outlen = ((__r >= __len) ? __len : __r + 1); \
((__r >= __len) ? ENAMETOOLONG : 0); \
})
int copyinstr(const void * __restrict udaddr,
void * _Nonnull __restrict kaddr, size_t len,
size_t * __restrict lencopied);
int copyin(const void * __restrict udaddr,
void * _Nonnull __restrict kaddr, size_t len);
int copyin_nofault(const void * __restrict udaddr,
void * _Nonnull __restrict kaddr, size_t len);
int copyout(const void * _Nonnull __restrict kaddr,
void * __restrict udaddr, size_t len);
int copyout_nofault(const void * _Nonnull __restrict kaddr,
void * __restrict udaddr, size_t len);
#ifdef KCSAN
int kcsan_copyin(const void *, void *, size_t);
int kcsan_copyinstr(const void *, void *, size_t, size_t *);
int kcsan_copyout(const void *, void *, size_t);
#define copyin(u, k, l) kcsan_copyin((u), (k), (l))
#define copyinstr(u, k, l, lc) kcsan_copyinstr((u), (k), (l), (lc))
#define copyout(k, u, l) kcsan_copyout((k), (u), (l))
#endif
int fubyte(volatile const void *base);
long fuword(volatile const void *base);
int fuword16(volatile const void *base);
int32_t fuword32(volatile const void *base);
int64_t fuword64(volatile const void *base);
int fueword(volatile const void *base, long *val);
int fueword32(volatile const void *base, int32_t *val);
int fueword64(volatile const void *base, int64_t *val);
int subyte(volatile void *base, int byte);
int suword(volatile void *base, long word);
int suword16(volatile void *base, int word);
int suword32(volatile void *base, int32_t word);
int suword64(volatile void *base, int64_t word);
uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval);
u_long casuword(volatile u_long *p, u_long oldval, u_long newval);
int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp,
uint32_t newval);
int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp,
u_long newval);
void realitexpire(void *);
int sysbeep(int hertz, int period);
void hardclock(int cnt, int usermode);
void hardclock_sync(int cpu);
void softclock(void *);
void statclock(int cnt, int usermode);
void profclock(int cnt, int usermode, uintfptr_t pc);
int hardclockintr(void);
void startprofclock(struct proc *);
void stopprofclock(struct proc *);
void cpu_startprofclock(void);
void cpu_stopprofclock(void);
void suspendclock(void);
void resumeclock(void);
sbintime_t cpu_idleclock(void);
void cpu_activeclock(void);
void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt);
void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq);
extern int cpu_disable_c2_sleep;
extern int cpu_disable_c3_sleep;
char *kern_getenv(const char *name);
void freeenv(char *env);
int getenv_int(const char *name, int *data);
int getenv_uint(const char *name, unsigned int *data);
int getenv_long(const char *name, long *data);
int getenv_ulong(const char *name, unsigned long *data);
int getenv_string(const char *name, char *data, int size);
int getenv_int64(const char *name, int64_t *data);
int getenv_uint64(const char *name, uint64_t *data);
int getenv_quad(const char *name, quad_t *data);
int getenv_bool(const char *name, bool *data);
bool getenv_is_true(const char *name);
bool getenv_is_false(const char *name);
int kern_setenv(const char *name, const char *value);
int kern_unsetenv(const char *name);
int testenv(const char *name);
int getenv_array(const char *name, void *data, int size, int *psize,
int type_size, bool allow_signed);
#define GETENV_UNSIGNED false /* negative numbers not allowed */
#define GETENV_SIGNED true /* negative numbers allowed */
typedef uint64_t (cpu_tick_f)(void);
void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var);
extern cpu_tick_f *cpu_ticks;
uint64_t cpu_tickrate(void);
uint64_t cputick2usec(uint64_t tick);
#include <sys/libkern.h>
/* Initialize the world */
void consinit(void);
void cpu_initclocks(void);
void cpu_initclocks_bsp(void);
void cpu_initclocks_ap(void);
void usrinfoinit(void);
/* Finalize the world */
void kern_reboot(int) __dead2;
void shutdown_nice(int);
/* Stubs for obsolete functions that used to be for interrupt management */
static __inline intrmask_t splhigh(void) { return 0; }
static __inline intrmask_t splimp(void) { return 0; }
static __inline intrmask_t splnet(void) { return 0; }
static __inline intrmask_t spltty(void) { return 0; }
static __inline void splx(intrmask_t ipl __unused) { return; }
/*
* Common `proc' functions are declared here so that proc.h can be included
* less often.
*/
int _sleep(const void * _Nonnull chan, struct lock_object *lock, int pri,
const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags);
#define msleep(chan, mtx, pri, wmesg, timo) \
_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \
tick_sbt * (timo), 0, C_HARDCLOCK)
#define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \
_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \
(flags))
int msleep_spin_sbt(const void * _Nonnull chan, struct mtx *mtx,
const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags);
#define msleep_spin(chan, mtx, wmesg, timo) \
msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \
0, C_HARDCLOCK)
int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr,
int flags);
#define pause(wmesg, timo) \
pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK)
#define pause_sig(wmesg, timo) \
pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH)
#define tsleep(chan, pri, wmesg, timo) \
_sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \
0, C_HARDCLOCK)
#define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \
_sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags))
void wakeup(const void *chan);
void wakeup_one(const void *chan);
void wakeup_any(const void *chan);
/*
* Common `struct cdev *' stuff are declared here to avoid #include poisoning
*/
struct cdev;
dev_t dev2udev(struct cdev *x);
const char *devtoname(struct cdev *cdev);
#ifdef __LP64__
size_t devfs_iosize_max(void);
size_t iosize_max(void);
#endif
int poll_no_poll(int events);
/* XXX: Should be void nanodelay(u_int nsec); */
void DELAY(int usec);
/* Root mount holdback API */
struct root_hold_token {
int flags;
const char *who;
TAILQ_ENTRY(root_hold_token) list;
};
struct root_hold_token *root_mount_hold(const char *identifier);
void root_mount_hold_token(const char *identifier, struct root_hold_token *h);
void root_mount_rel(struct root_hold_token *h);
int root_mounted(void);
/*
* Unit number allocation API. (kern/subr_unit.c)
*/
struct unrhdr;
struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex);
void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex);
void delete_unrhdr(struct unrhdr *uh);
void clear_unrhdr(struct unrhdr *uh);
void clean_unrhdr(struct unrhdr *uh);
void clean_unrhdrl(struct unrhdr *uh);
int alloc_unr(struct unrhdr *uh);
int alloc_unr_specific(struct unrhdr *uh, u_int item);
int alloc_unrl(struct unrhdr *uh);
void free_unr(struct unrhdr *uh, u_int item);
#ifndef __LP64__
#define UNR64_LOCKED
#endif
struct unrhdr64 {
uint64_t counter;
};
static __inline void
new_unrhdr64(struct unrhdr64 *unr64, uint64_t low)
{
unr64->counter = low;
}
#ifdef UNR64_LOCKED
uint64_t alloc_unr64(struct unrhdr64 *);
#else
static __inline uint64_t
alloc_unr64(struct unrhdr64 *unr64)
{
return (atomic_fetchadd_64(&unr64->counter, 1));
}
#endif
void intr_prof_stack_use(struct thread *td, struct trapframe *frame);
void counted_warning(unsigned *counter, const char *msg);
/*
* APIs to manage deprecation and obsolescence.
*/
struct device;
void _gone_in(int major, const char *msg);
void _gone_in_dev(struct device *dev, int major, const char *msg);
#ifdef NO_OBSOLETE_CODE
#define __gone_ok(m, msg) \
_Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version), \
"Obsolete code: " msg);
#else
#define __gone_ok(m, msg)
#endif
#define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg)
#define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg)
#endif /* _KERNEL */
__NULLABILITY_PRAGMA_POP
#endif /* !_SYS_SYSTM_H_ */
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index e3e14fac314f..0b3110c955f5 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1,2687 +1,2687 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_quota.h"
#include "opt_ufs.h"
#include "opt_ffs.h"
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/gsb_crc32.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <security/mac/mac_framework.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/gjournal.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <vm/vm.h>
#include <vm/uma.h>
#include <vm/vm_page.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
#include <ddb/ddb.h>
static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
VFS_SMR_DECLARE;
static int ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void ffs_oldfscompat_read(struct fs *, struct ufsmount *,
ufs2_daddr_t);
static void ffs_ifree(struct ufsmount *ump, struct inode *ip);
static int ffs_sync_lazy(struct mount *mp);
static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size);
static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size);
static vfs_init_t ffs_init;
static vfs_uninit_t ffs_uninit;
static vfs_extattrctl_t ffs_extattrctl;
static vfs_cmount_t ffs_cmount;
static vfs_unmount_t ffs_unmount;
static vfs_mount_t ffs_mount;
static vfs_statfs_t ffs_statfs;
static vfs_fhtovp_t ffs_fhtovp;
static vfs_sync_t ffs_sync;
static struct vfsops ufs_vfsops = {
.vfs_extattrctl = ffs_extattrctl,
.vfs_fhtovp = ffs_fhtovp,
.vfs_init = ffs_init,
.vfs_mount = ffs_mount,
.vfs_cmount = ffs_cmount,
.vfs_quotactl = ufs_quotactl,
.vfs_root = vfs_cache_root,
.vfs_cachedroot = ufs_root,
.vfs_statfs = ffs_statfs,
.vfs_sync = ffs_sync,
.vfs_uninit = ffs_uninit,
.vfs_unmount = ffs_unmount,
.vfs_vget = ffs_vget,
.vfs_susp_clean = process_deferred_inactive,
};
VFS_SET(ufs_vfsops, ufs, 0);
MODULE_VERSION(ufs, 1);
static b_strategy_t ffs_geom_strategy;
static b_write_t ffs_bufwrite;
static struct buf_ops ffs_ops = {
.bop_name = "FFS",
.bop_write = ffs_bufwrite,
.bop_strategy = ffs_geom_strategy,
.bop_sync = bufsync,
#ifdef NO_FFS_SNAPSHOT
.bop_bdflush = bufbdflush,
#else
.bop_bdflush = ffs_bdflush,
#endif
};
/*
* Note that userquota and groupquota options are not currently used
* by UFS/FFS code and generally mount(8) does not pass those options
* from userland, but they can be passed by loader(8) via
* vfs.root.mountfrom.options.
*/
static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
"noclusterw", "noexec", "export", "force", "from", "groupquota",
"multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir",
"nosymfollow", "sync", "union", "userquota", "untrusted", NULL };
static int ffs_enxio_enable = 1;
SYSCTL_DECL(_vfs_ffs);
SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN,
&ffs_enxio_enable, 0,
"enable mapping of other disk I/O errors to ENXIO");
/*
* Return buffer with the contents of block "offset" from the beginning of
* directory "ip". If "res" is non-zero, fill it in with a pointer to the
* remaining space in the directory.
*/
static int
ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp)
{
struct inode *ip;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn;
int bsize, error;
ip = VTOI(vp);
fs = ITOFS(ip);
lbn = lblkno(fs, offset);
bsize = blksize(fs, ip, lbn);
*bpp = NULL;
error = bread(vp, lbn, bsize, NOCRED, &bp);
if (error) {
return (error);
}
if (res)
*res = (char *)bp->b_data + blkoff(fs, offset);
*bpp = bp;
return (0);
}
/*
* Load up the contents of an inode and copy the appropriate pieces
* to the incore copy.
*/
static int
ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
{
struct ufs1_dinode *dip1;
struct ufs2_dinode *dip2;
int error;
if (I_IS_UFS1(ip)) {
dip1 = ip->i_din1;
*dip1 =
*((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino));
ip->i_mode = dip1->di_mode;
ip->i_nlink = dip1->di_nlink;
ip->i_effnlink = dip1->di_nlink;
ip->i_size = dip1->di_size;
ip->i_flags = dip1->di_flags;
ip->i_gen = dip1->di_gen;
ip->i_uid = dip1->di_uid;
ip->i_gid = dip1->di_gid;
return (0);
}
dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino));
if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 &&
!ffs_fsfail_cleanup(ITOUMP(ip), error)) {
printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt,
(intmax_t)ino);
return (error);
}
*ip->i_din2 = *dip2;
dip2 = ip->i_din2;
ip->i_mode = dip2->di_mode;
ip->i_nlink = dip2->di_nlink;
ip->i_effnlink = dip2->di_nlink;
ip->i_size = dip2->di_size;
ip->i_flags = dip2->di_flags;
ip->i_gen = dip2->di_gen;
ip->i_uid = dip2->di_uid;
ip->i_gid = dip2->di_gid;
return (0);
}
/*
* Verify that a filesystem block number is a valid data block.
* This routine is only called on untrusted filesystems.
*/
static int
ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize)
{
struct fs *fs;
struct ufsmount *ump;
ufs2_daddr_t end_daddr;
int cg, havemtx;
KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0,
("ffs_check_blkno called on a trusted file system"));
ump = VFSTOUFS(mp);
fs = ump->um_fs;
cg = dtog(fs, daddr);
end_daddr = daddr + numfrags(fs, blksize);
/*
* Verify that the block number is a valid data block. Also check
* that it does not point to an inode block or a superblock. Accept
* blocks that are unallocated (0) or part of snapshot metadata
* (BLK_NOCOPY or BLK_SNAP).
*
* Thus, the block must be in a valid range for the filesystem and
* either in the space before a backup superblock (except the first
* cylinder group where that space is used by the bootstrap code) or
* after the inode blocks and before the end of the cylinder group.
*/
if ((uint64_t)daddr <= BLK_SNAP ||
((uint64_t)end_daddr <= fs->fs_size &&
((cg > 0 && end_daddr <= cgsblock(fs, cg)) ||
(daddr >= cgdmin(fs, cg) &&
end_daddr <= cgbase(fs, cg) + fs->fs_fpg))))
return (0);
if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0)
UFS_LOCK(ump);
if (ppsratecheck(&ump->um_last_integritymsg,
&ump->um_secs_integritymsg, 1)) {
UFS_UNLOCK(ump);
uprintf("\n%s: inode %jd, out-of-range indirect block "
"number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr);
if (havemtx)
UFS_LOCK(ump);
} else if (!havemtx)
UFS_UNLOCK(ump);
return (EINTEGRITY);
}
/*
* Initiate a forcible unmount.
* Used to unmount filesystems whose underlying media has gone away.
*/
static void
ffs_fsfail_unmount(void *v, int pending)
{
struct fsfail_task *etp;
struct mount *mp;
etp = v;
/*
* Find our mount and get a ref on it, then try to unmount.
*/
mp = vfs_getvfs(&etp->fsid);
if (mp != NULL)
dounmount(mp, MNT_FORCE, curthread);
free(etp, M_UFSMNT);
}
/*
* On first ENXIO error, start a task that forcibly unmounts the filesystem.
*
* Return true if a cleanup is in progress.
*/
int
ffs_fsfail_cleanup(struct ufsmount *ump, int error)
{
int retval;
UFS_LOCK(ump);
retval = ffs_fsfail_cleanup_locked(ump, error);
UFS_UNLOCK(ump);
return (retval);
}
int
ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error)
{
struct fsfail_task *etp;
struct task *tp;
mtx_assert(UFS_MTX(ump), MA_OWNED);
if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) {
ump->um_flags |= UM_FSFAIL_CLEANUP;
/*
* Queue an async forced unmount.
*/
etp = ump->um_fsfail_task;
ump->um_fsfail_task = NULL;
if (etp != NULL) {
tp = &etp->task;
TASK_INIT(tp, 0, ffs_fsfail_unmount, etp);
taskqueue_enqueue(taskqueue_thread, tp);
printf("UFS: forcibly unmounting %s from %s\n",
ump->um_mountp->mnt_stat.f_mntfromname,
ump->um_mountp->mnt_stat.f_mntonname);
}
}
return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0);
}
/*
* Wrapper used during ENXIO cleanup to allocate empty buffers when
* the kernel is unable to read the real one. They are needed so that
* the soft updates code can use them to unwind its dependencies.
*/
int
ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno,
daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt,
struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *),
struct buf **bpp)
{
int error;
flags |= GB_CVTENXIO;
error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt,
cred, flags, ckhashfunc, bpp);
if (error != 0 && ffs_fsfail_cleanup(ump, error)) {
error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp);
KASSERT(error == 0, ("getblkx failed"));
vfs_bio_bzero_buf(*bpp, 0, size);
}
return (error);
}
static int
ffs_mount(struct mount *mp)
{
struct vnode *devvp, *odevvp;
struct thread *td;
struct ufsmount *ump = NULL;
struct fs *fs;
pid_t fsckpid = 0;
int error, error1, flags;
uint64_t mntorflags, saved_mnt_flag;
accmode_t accmode;
struct nameidata ndp;
char *fspec;
td = curthread;
if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
return (EINVAL);
if (uma_inode == NULL) {
uma_inode = uma_zcreate("FFS inode",
sizeof(struct inode), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
uma_ufs1 = uma_zcreate("FFS1 dinode",
sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
uma_ufs2 = uma_zcreate("FFS2 dinode",
sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
VFS_SMR_ZONE_SET(uma_inode);
}
vfs_deleteopt(mp->mnt_optnew, "groupquota");
vfs_deleteopt(mp->mnt_optnew, "userquota");
fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
if (error)
return (error);
mntorflags = 0;
if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0)
mntorflags |= MNT_UNTRUSTED;
if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
mntorflags |= MNT_ACLS;
if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) {
mntorflags |= MNT_SNAPSHOT;
/*
* Once we have set the MNT_SNAPSHOT flag, do not
* persist "snapshot" in the options list.
*/
vfs_deleteopt(mp->mnt_optnew, "snapshot");
vfs_deleteopt(mp->mnt_opt, "snapshot");
}
if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 &&
vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) {
/*
* Once we have set the restricted PID, do not
* persist "fsckpid" in the options list.
*/
vfs_deleteopt(mp->mnt_optnew, "fsckpid");
vfs_deleteopt(mp->mnt_opt, "fsckpid");
if (mp->mnt_flag & MNT_UPDATE) {
if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 &&
vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
vfs_mount_error(mp,
"Checker enable: Must be read-only");
return (EINVAL);
}
} else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
vfs_mount_error(mp,
"Checker enable: Must be read-only");
return (EINVAL);
}
/* Set to -1 if we are done */
if (fsckpid == 0)
fsckpid = -1;
}
if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) {
if (mntorflags & MNT_ACLS) {
vfs_mount_error(mp,
"\"acls\" and \"nfsv4acls\" options "
"are mutually exclusive");
return (EINVAL);
}
mntorflags |= MNT_NFS4ACLS;
}
MNT_ILOCK(mp);
mp->mnt_kern_flag &= ~MNTK_FPLOOKUP;
mp->mnt_flag |= mntorflags;
MNT_IUNLOCK(mp);
/*
* If updating, check whether changing from read-only to
* read/write; if there is no device name, that's all we do.
*/
if (mp->mnt_flag & MNT_UPDATE) {
ump = VFSTOUFS(mp);
fs = ump->um_fs;
odevvp = ump->um_odevvp;
devvp = ump->um_devvp;
if (fsckpid == -1 && ump->um_fsckpid > 0) {
if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
(error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0)
return (error);
g_topology_lock();
/*
* Return to normal read-only mode.
*/
error = g_access(ump->um_cp, 0, -1, 0);
g_topology_unlock();
ump->um_fsckpid = 0;
}
if (fs->fs_ronly == 0 &&
vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
/*
* Flush any dirty data and suspend filesystem.
*/
if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
return (error);
error = vfs_write_suspend_umnt(mp);
if (error != 0)
return (error);
/*
* Check for and optionally get rid of files open
* for writing.
*/
flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
if (MOUNTEDSOFTDEP(mp)) {
error = softdep_flushfiles(mp, flags, td);
} else {
error = ffs_flushfiles(mp, flags, td);
}
if (error) {
vfs_write_resume(mp, 0);
return (error);
}
if (fs->fs_pendingblocks != 0 ||
fs->fs_pendinginodes != 0) {
printf("WARNING: %s Update error: blocks %jd "
"files %d\n", fs->fs_fsmnt,
(intmax_t)fs->fs_pendingblocks,
fs->fs_pendinginodes);
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
fs->fs_clean = 1;
if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
fs->fs_ronly = 0;
fs->fs_clean = 0;
vfs_write_resume(mp, 0);
return (error);
}
if (MOUNTEDSOFTDEP(mp))
softdep_unmount(mp);
g_topology_lock();
/*
* Drop our write and exclusive access.
*/
g_access(ump->um_cp, 0, -1, -1);
g_topology_unlock();
fs->fs_ronly = 1;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
/*
* Allow the writers to note that filesystem
* is ro now.
*/
vfs_write_resume(mp, 0);
}
if ((mp->mnt_flag & MNT_RELOAD) &&
(error = ffs_reload(mp, td, 0)) != 0)
return (error);
if (fs->fs_ronly &&
!vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
/*
* If we are running a checker, do not allow upgrade.
*/
if (ump->um_fsckpid > 0) {
vfs_mount_error(mp,
"Active checker, cannot upgrade to write");
return (EINVAL);
}
/*
* If upgrade to read-write by non-root, then verify
* that user has necessary permissions on the device.
*/
vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(odevvp, VREAD | VWRITE,
td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
VOP_UNLOCK(odevvp);
if (error) {
return (error);
}
fs->fs_flags &= ~FS_UNCLEAN;
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if ((mp->mnt_flag & MNT_FORCE) ||
((fs->fs_flags &
(FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf("WARNING: %s was not properly "
"dismounted\n", fs->fs_fsmnt);
} else {
vfs_mount_error(mp,
"R/W mount of %s denied. %s.%s",
fs->fs_fsmnt,
"Filesystem is not clean - run fsck",
(fs->fs_flags & FS_SUJ) == 0 ? "" :
" Forced mount will invalidate"
" journal contents");
return (EPERM);
}
}
g_topology_lock();
/*
* Request exclusive write access.
*/
error = g_access(ump->um_cp, 0, 1, 1);
g_topology_unlock();
if (error)
return (error);
if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
return (error);
error = vfs_write_suspend_umnt(mp);
if (error != 0)
return (error);
fs->fs_ronly = 0;
MNT_ILOCK(mp);
saved_mnt_flag = MNT_RDONLY;
if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag &
MNT_ASYNC) != 0)
saved_mnt_flag |= MNT_ASYNC;
mp->mnt_flag &= ~saved_mnt_flag;
MNT_IUNLOCK(mp);
fs->fs_mtime = time_second;
/* check to see if we need to start softdep */
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, td->td_ucred))){
fs->fs_ronly = 1;
MNT_ILOCK(mp);
mp->mnt_flag |= saved_mnt_flag;
MNT_IUNLOCK(mp);
vfs_write_resume(mp, 0);
return (error);
}
fs->fs_clean = 0;
if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
fs->fs_ronly = 1;
MNT_ILOCK(mp);
mp->mnt_flag |= saved_mnt_flag;
MNT_IUNLOCK(mp);
vfs_write_resume(mp, 0);
return (error);
}
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
vfs_write_resume(mp, 0);
}
/*
* Soft updates is incompatible with "async",
* so if we are doing softupdates stop the user
* from setting the async flag in an update.
* Softdep_mount() clears it in an initial mount
* or ro->rw remount.
*/
if (MOUNTEDSOFTDEP(mp)) {
/* XXX: Reset too late ? */
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_ASYNC;
MNT_IUNLOCK(mp);
}
/*
* Keep MNT_ACLS flag if it is stored in superblock.
*/
if ((fs->fs_flags & FS_ACLS) != 0) {
/* XXX: Set too late ? */
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_ACLS;
MNT_IUNLOCK(mp);
}
if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
/* XXX: Set too late ? */
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_NFS4ACLS;
MNT_IUNLOCK(mp);
}
/*
* If this is a request from fsck to clean up the filesystem,
* then allow the specified pid to proceed.
*/
if (fsckpid > 0) {
if (ump->um_fsckpid != 0) {
vfs_mount_error(mp,
"Active checker already running on %s",
fs->fs_fsmnt);
return (EINVAL);
}
KASSERT(MOUNTEDSOFTDEP(mp) == 0,
("soft updates enabled on read-only file system"));
g_topology_lock();
/*
* Request write access.
*/
error = g_access(ump->um_cp, 0, 1, 0);
g_topology_unlock();
if (error) {
vfs_mount_error(mp,
"Checker activation failed on %s",
fs->fs_fsmnt);
return (error);
}
ump->um_fsckpid = fsckpid;
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
fs->fs_mtime = time_second;
fs->fs_fmod = 1;
fs->fs_clean = 0;
(void) ffs_sbupdate(ump, MNT_WAIT, 0);
}
/*
* If this is a snapshot request, take the snapshot.
*/
if (mp->mnt_flag & MNT_SNAPSHOT)
return (ffs_snapshot(mp, fspec));
/*
* Must not call namei() while owning busy ref.
*/
vfs_unbusy(mp);
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible disk device.
*/
NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
error = namei(&ndp);
if ((mp->mnt_flag & MNT_UPDATE) != 0) {
/*
* Unmount does not start if MNT_UPDATE is set. Mount
* update busies mp before setting MNT_UPDATE. We
* must be able to retain our busy ref successfully,
* without sleep.
*/
error1 = vfs_busy(mp, MBF_NOWAIT);
MPASS(error1 == 0);
}
if (error != 0)
return (error);
NDFREE(&ndp, NDF_ONLY_PNBUF);
devvp = ndp.ni_vp;
if (!vn_isdisk_error(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
accmode = VREAD;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
accmode |= VWRITE;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if (mp->mnt_flag & MNT_UPDATE) {
/*
* Update only
*
* If it's not the same vnode, or at least the same device,
* then it's not correct.
*/
if (devvp->v_rdev != ump->um_devvp->v_rdev)
error = EINVAL; /* needs translation */
vput(devvp);
if (error)
return (error);
} else {
/*
* New mount
*
* We need the name for the mount point (also used for
* "last mounted on") copied in. If an error occurs,
* the mount point is discarded by the upper level code.
* Note that vfs_mount_alloc() populates f_mntonname for us.
*/
if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
vrele(devvp);
return (error);
}
if (fsckpid > 0) {
KASSERT(MOUNTEDSOFTDEP(mp) == 0,
("soft updates enabled on read-only file system"));
ump = VFSTOUFS(mp);
fs = ump->um_fs;
g_topology_lock();
/*
* Request write access.
*/
error = g_access(ump->um_cp, 0, 1, 0);
g_topology_unlock();
if (error) {
printf("WARNING: %s: Checker activation "
"failed\n", fs->fs_fsmnt);
} else {
ump->um_fsckpid = fsckpid;
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
fs->fs_mtime = time_second;
fs->fs_clean = 0;
(void) ffs_sbupdate(ump, MNT_WAIT, 0);
}
}
}
MNT_ILOCK(mp);
/*
* This is racy versus lookup, see ufs_fplookup_vexec for details.
*/
if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0)
panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp);
if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0)
mp->mnt_kern_flag |= MNTK_FPLOOKUP;
MNT_IUNLOCK(mp);
vfs_mountedfrom(mp, fspec);
return (0);
}
/*
* Compatibility with old mount system call.
*/
static int
ffs_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
struct ufs_args args;
int error;
if (data == NULL)
return (EINVAL);
error = copyin(data, &args, sizeof args);
if (error)
return (error);
ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
ma = mount_arg(ma, "export", &args.export, sizeof(args.export));
error = kernel_mount(ma, flags);
return (error);
}
/*
* Reload all incore data for a filesystem (used after running fsck on
* the root filesystem and finding things to fix). If the 'force' flag
* is 0, the filesystem must be mounted read-only.
*
* Things to do to update the mount:
* 1) invalidate all cached meta-data.
* 2) re-read superblock from disk.
* 3) re-read summary information from disk.
* 4) invalidate all inactive vnodes.
* 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary
* writers, if requested.
* 6) invalidate all cached file data.
* 7) re-read inode data for all active vnodes.
*/
int
ffs_reload(struct mount *mp, struct thread *td, int flags)
{
struct vnode *vp, *mvp, *devvp;
struct inode *ip;
void *space;
struct buf *bp;
struct fs *fs, *newfs;
struct ufsmount *ump;
ufs2_daddr_t sblockloc;
int i, blks, error;
u_long size;
int32_t *lp;
ump = VFSTOUFS(mp);
MNT_ILOCK(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) {
MNT_IUNLOCK(mp);
return (EINVAL);
}
MNT_IUNLOCK(mp);
/*
* Step 1: invalidate all cached meta-data.
*/
devvp = VFSTOUFS(mp)->um_devvp;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
if (vinvalbuf(devvp, 0, 0, 0) != 0)
panic("ffs_reload: dirty1");
VOP_UNLOCK(devvp);
/*
* Step 2: re-read superblock from disk.
*/
fs = VFSTOUFS(mp)->um_fs;
if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
NOCRED, &bp)) != 0)
return (error);
newfs = (struct fs *)bp->b_data;
if ((newfs->fs_magic != FS_UFS1_MAGIC &&
newfs->fs_magic != FS_UFS2_MAGIC) ||
newfs->fs_bsize > MAXBSIZE ||
newfs->fs_bsize < sizeof(struct fs)) {
brelse(bp);
return (EIO); /* XXX needs translation */
}
/*
* Preserve the summary information, read-only status, and
* superblock location by copying these fields into our new
* superblock before using it to update the existing superblock.
*/
newfs->fs_si = fs->fs_si;
newfs->fs_ronly = fs->fs_ronly;
sblockloc = fs->fs_sblockloc;
bcopy(newfs, fs, (u_int)fs->fs_sbsize);
brelse(bp);
mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
UFS_LOCK(ump);
if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
printf("WARNING: %s: reload pending error: blocks %jd "
"files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
fs->fs_pendinginodes);
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
UFS_UNLOCK(ump);
/*
* Step 3: re-read summary information from disk.
*/
size = fs->fs_cssize;
blks = howmany(size, fs->fs_fsize);
if (fs->fs_contigsumsize > 0)
size += fs->fs_ncg * sizeof(int32_t);
size += fs->fs_ncg * sizeof(u_int8_t);
free(fs->fs_csp, M_UFSMNT);
space = malloc(size, M_UFSMNT, M_WAITOK);
fs->fs_csp = space;
for (i = 0; i < blks; i += fs->fs_frag) {
size = fs->fs_bsize;
if (i + fs->fs_frag > blks)
size = (blks - i) * fs->fs_fsize;
error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
NOCRED, &bp);
if (error)
return (error);
bcopy(bp->b_data, space, (u_int)size);
space = (char *)space + size;
brelse(bp);
}
/*
* We no longer know anything about clusters per cylinder group.
*/
if (fs->fs_contigsumsize > 0) {
fs->fs_maxcluster = lp = space;
for (i = 0; i < fs->fs_ncg; i++)
*lp++ = fs->fs_contigsumsize;
space = lp;
}
size = fs->fs_ncg * sizeof(u_int8_t);
fs->fs_contigdirs = (u_int8_t *)space;
bzero(fs->fs_contigdirs, size);
if ((flags & FFSR_UNSUSPEND) != 0) {
MNT_ILOCK(mp);
mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
wakeup(&mp->mnt_flag);
MNT_IUNLOCK(mp);
}
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
/*
* Skip syncer vnode.
*/
if (vp->v_type == VNON) {
VI_UNLOCK(vp);
continue;
}
/*
* Step 4: invalidate all cached file data.
*/
if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
if (vinvalbuf(vp, 0, 0, 0))
panic("ffs_reload: dirty2");
/*
* Step 5: re-read inode data for all active vnodes.
*/
ip = VTOI(vp);
error =
bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, NOCRED, &bp);
if (error) {
vput(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) {
brelse(bp);
vput(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
ip->i_effnlink = ip->i_nlink;
brelse(bp);
vput(vp);
}
return (0);
}
/*
* Common code for mount and mountroot
*/
static int
ffs_mountfs(odevvp, mp, td)
struct vnode *odevvp;
struct mount *mp;
struct thread *td;
{
struct ufsmount *ump;
struct fs *fs;
struct cdev *dev;
int error, i, len, ronly;
struct ucred *cred;
struct g_consumer *cp;
struct mount *nmp;
struct vnode *devvp;
struct fsfail_task *etp;
int candelete, canspeedup;
off_t loc;
fs = NULL;
ump = NULL;
cred = td ? td->td_ucred : NOCRED;
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
devvp = mntfs_allocvp(mp, odevvp);
VOP_UNLOCK(odevvp);
KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
dev = devvp->v_rdev;
if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
(uintptr_t)mp) == 0) {
mntfs_freevp(devvp);
return (EBUSY);
}
g_topology_lock();
error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
g_topology_unlock();
if (error != 0) {
atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
mntfs_freevp(devvp);
return (error);
}
dev_ref(dev);
devvp->v_bufobj.bo_ops = &ffs_ops;
BO_LOCK(&odevvp->v_bufobj);
odevvp->v_bufobj.bo_flag |= BO_NOBUFS;
BO_UNLOCK(&odevvp->v_bufobj);
if (dev->si_iosize_max != 0)
mp->mnt_iosize_max = dev->si_iosize_max;
- if (mp->mnt_iosize_max > MAXPHYS)
- mp->mnt_iosize_max = MAXPHYS;
+ if (mp->mnt_iosize_max > maxphys)
+ mp->mnt_iosize_max = maxphys;
if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
error = EINVAL;
vfs_mount_error(mp,
"Invalid sectorsize %d for superblock size %d",
cp->provider->sectorsize, SBLOCKSIZE);
goto out;
}
/* fetch the superblock and summary information */
loc = STDSB;
if ((mp->mnt_flag & MNT_ROOTFS) != 0)
loc = STDSB_NOHASHFAIL;
if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0)
goto out;
fs->fs_flags &= ~FS_UNCLEAN;
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if (ronly || (mp->mnt_flag & MNT_FORCE) ||
((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf("WARNING: %s was not properly dismounted\n",
fs->fs_fsmnt);
} else {
vfs_mount_error(mp, "R/W mount of %s denied. %s%s",
fs->fs_fsmnt, "Filesystem is not clean - run fsck.",
(fs->fs_flags & FS_SUJ) == 0 ? "" :
" Forced mount will invalidate journal contents");
error = EPERM;
goto out;
}
if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
(mp->mnt_flag & MNT_FORCE)) {
printf("WARNING: %s: lost blocks %jd files %d\n",
fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
fs->fs_pendinginodes);
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
}
if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
printf("WARNING: %s: mount pending error: blocks %jd "
"files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
fs->fs_pendinginodes);
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
if ((fs->fs_flags & FS_GJOURNAL) != 0) {
#ifdef UFS_GJOURNAL
/*
* Get journal provider name.
*/
len = 1024;
mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK);
if (g_io_getattr("GJOURNAL::provider", cp, &len,
mp->mnt_gjprovider) == 0) {
mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len,
M_UFSMNT, M_WAITOK);
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_GJOURNAL;
MNT_IUNLOCK(mp);
} else {
printf("WARNING: %s: GJOURNAL flag on fs "
"but no gjournal provider below\n",
mp->mnt_stat.f_mntonname);
free(mp->mnt_gjprovider, M_UFSMNT);
mp->mnt_gjprovider = NULL;
}
#else
printf("WARNING: %s: GJOURNAL flag on fs but no "
"UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname);
#endif
} else {
mp->mnt_gjprovider = NULL;
}
ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
ump->um_cp = cp;
ump->um_bo = &devvp->v_bufobj;
ump->um_fs = fs;
if (fs->fs_magic == FS_UFS1_MAGIC) {
ump->um_fstype = UFS1;
ump->um_balloc = ffs_balloc_ufs1;
} else {
ump->um_fstype = UFS2;
ump->um_balloc = ffs_balloc_ufs2;
}
ump->um_blkatoff = ffs_blkatoff;
ump->um_truncate = ffs_truncate;
ump->um_update = ffs_update;
ump->um_valloc = ffs_valloc;
ump->um_vfree = ffs_vfree;
ump->um_ifree = ffs_ifree;
ump->um_rdonly = ffs_rdonly;
ump->um_snapgone = ffs_snapgone;
if ((mp->mnt_flag & MNT_UNTRUSTED) != 0)
ump->um_check_blkno = ffs_check_blkno;
else
ump->um_check_blkno = NULL;
mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc);
fs->fs_ronly = ronly;
fs->fs_active = NULL;
mp->mnt_data = ump;
mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
nmp = NULL;
if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
(nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
if (nmp)
vfs_rel(nmp);
vfs_getnewfsid(mp);
}
mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
MNT_IUNLOCK(mp);
if ((fs->fs_flags & FS_MULTILABEL) != 0) {
#ifdef MAC
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_MULTILABEL;
MNT_IUNLOCK(mp);
#else
printf("WARNING: %s: multilabel flag on fs but "
"no MAC support\n", mp->mnt_stat.f_mntonname);
#endif
}
if ((fs->fs_flags & FS_ACLS) != 0) {
#ifdef UFS_ACL
MNT_ILOCK(mp);
if (mp->mnt_flag & MNT_NFS4ACLS)
printf("WARNING: %s: ACLs flag on fs conflicts with "
"\"nfsv4acls\" mount option; option ignored\n",
mp->mnt_stat.f_mntonname);
mp->mnt_flag &= ~MNT_NFS4ACLS;
mp->mnt_flag |= MNT_ACLS;
MNT_IUNLOCK(mp);
#else
printf("WARNING: %s: ACLs flag on fs but no ACLs support\n",
mp->mnt_stat.f_mntonname);
#endif
}
if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
#ifdef UFS_ACL
MNT_ILOCK(mp);
if (mp->mnt_flag & MNT_ACLS)
printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts "
"with \"acls\" mount option; option ignored\n",
mp->mnt_stat.f_mntonname);
mp->mnt_flag &= ~MNT_ACLS;
mp->mnt_flag |= MNT_NFS4ACLS;
MNT_IUNLOCK(mp);
#else
printf("WARNING: %s: NFSv4 ACLs flag on fs but no "
"ACLs support\n", mp->mnt_stat.f_mntonname);
#endif
}
if ((fs->fs_flags & FS_TRIM) != 0) {
len = sizeof(int);
if (g_io_getattr("GEOM::candelete", cp, &len,
&candelete) == 0) {
if (candelete)
ump->um_flags |= UM_CANDELETE;
else
printf("WARNING: %s: TRIM flag on fs but disk "
"does not support TRIM\n",
mp->mnt_stat.f_mntonname);
} else {
printf("WARNING: %s: TRIM flag on fs but disk does "
"not confirm that it supports TRIM\n",
mp->mnt_stat.f_mntonname);
}
if (((ump->um_flags) & UM_CANDELETE) != 0) {
ump->um_trim_tq = taskqueue_create("trim", M_WAITOK,
taskqueue_thread_enqueue, &ump->um_trim_tq);
taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
"%s trim", mp->mnt_stat.f_mntonname);
ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
&ump->um_trimlisthashsize);
}
}
len = sizeof(int);
if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) {
if (canspeedup)
ump->um_flags |= UM_CANSPEEDUP;
}
ump->um_mountp = mp;
ump->um_dev = dev;
ump->um_devvp = devvp;
ump->um_odevvp = odevvp;
ump->um_nindir = fs->fs_nindir;
ump->um_bptrtodb = fs->fs_fsbtodb;
ump->um_seqinc = fs->fs_frag;
for (i = 0; i < MAXQUOTAS; i++)
ump->um_quotas[i] = NULLVP;
#ifdef UFS_EXTATTR
ufs_extattr_uepm_init(&ump->um_extattr);
#endif
/*
* Set FS local "last mounted on" information (NULL pad)
*/
bzero(fs->fs_fsmnt, MAXMNTLEN);
strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
mp->mnt_stat.f_iosize = fs->fs_bsize;
if (mp->mnt_flag & MNT_ROOTFS) {
/*
* Root mount; update timestamp in mount structure.
* This will be used by the common root mount code
* to update the system clock.
*/
mp->mnt_time = fs->fs_time;
}
if (ronly == 0) {
fs->fs_mtime = time_second;
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, cred)) != 0) {
ffs_flushfiles(mp, FORCECLOSE, td);
goto out;
}
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
fs->fs_fmod = 1;
fs->fs_clean = 0;
(void) ffs_sbupdate(ump, MNT_WAIT, 0);
}
/*
* Initialize filesystem state information in mount struct.
*/
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE;
MNT_IUNLOCK(mp);
#ifdef UFS_EXTATTR
#ifdef UFS_EXTATTR_AUTOSTART
/*
*
* Auto-starting does the following:
* - check for /.attribute in the fs, and extattr_start if so
* - for each file in .attribute, enable that file with
* an attribute of the same name.
* Not clear how to report errors -- probably eat them.
* This would all happen while the filesystem was busy/not
* available, so would effectively be "atomic".
*/
(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO);
etp->fsid = mp->mnt_stat.f_fsid;
ump->um_fsfail_task = etp;
return (0);
out:
if (fs != NULL) {
free(fs->fs_csp, M_UFSMNT);
free(fs->fs_si, M_UFSMNT);
free(fs, M_UFSMNT);
}
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (ump) {
mtx_destroy(UFS_MTX(ump));
if (mp->mnt_gjprovider != NULL) {
free(mp->mnt_gjprovider, M_UFSMNT);
mp->mnt_gjprovider = NULL;
}
free(ump, M_UFSMNT);
mp->mnt_data = NULL;
}
BO_LOCK(&odevvp->v_bufobj);
odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
BO_UNLOCK(&odevvp->v_bufobj);
atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
mntfs_freevp(devvp);
dev_rel(dev);
return (error);
}
/*
* A read function for use by filesystem-layer routines.
*/
static int
ffs_use_bread(void *devfd, off_t loc, void **bufp, int size)
{
struct buf *bp;
int error;
KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp));
*bufp = malloc(size, M_UFSMNT, M_WAITOK);
if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED,
&bp)) != 0)
return (error);
bcopy(bp->b_data, *bufp, size);
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
return (0);
}
static int bigcgs = 0;
SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
/*
* Sanity checks for loading old filesystem superblocks.
* See ffs_oldfscompat_write below for unwound actions.
*
* XXX - Parts get retired eventually.
* Unfortunately new bits get added.
*/
static void
ffs_oldfscompat_read(fs, ump, sblockloc)
struct fs *fs;
struct ufsmount *ump;
ufs2_daddr_t sblockloc;
{
off_t maxfilesize;
/*
* If not yet done, update fs_flags location and value of fs_sblockloc.
*/
if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
fs->fs_flags = fs->fs_old_flags;
fs->fs_old_flags |= FS_FLAGS_UPDATED;
fs->fs_sblockloc = sblockloc;
}
/*
* If not yet done, update UFS1 superblock with new wider fields.
*/
if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
fs->fs_maxbsize = fs->fs_bsize;
fs->fs_time = fs->fs_old_time;
fs->fs_size = fs->fs_old_size;
fs->fs_dsize = fs->fs_old_dsize;
fs->fs_csaddr = fs->fs_old_csaddr;
fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
}
if (fs->fs_magic == FS_UFS1_MAGIC &&
fs->fs_old_inodefmt < FS_44INODEFMT) {
fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
fs->fs_qbmask = ~fs->fs_bmask;
fs->fs_qfmask = ~fs->fs_fmask;
}
if (fs->fs_magic == FS_UFS1_MAGIC) {
ump->um_savedmaxfilesize = fs->fs_maxfilesize;
maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
if (fs->fs_maxfilesize > maxfilesize)
fs->fs_maxfilesize = maxfilesize;
}
/* Compatibility for old filesystems */
if (fs->fs_avgfilesize <= 0)
fs->fs_avgfilesize = AVFILESIZ;
if (fs->fs_avgfpdir <= 0)
fs->fs_avgfpdir = AFPDIR;
if (bigcgs) {
fs->fs_save_cgsize = fs->fs_cgsize;
fs->fs_cgsize = fs->fs_bsize;
}
}
/*
* Unwinding superblock updates for old filesystems.
* See ffs_oldfscompat_read above for details.
*
* XXX - Parts get retired eventually.
* Unfortunately new bits get added.
*/
void
ffs_oldfscompat_write(fs, ump)
struct fs *fs;
struct ufsmount *ump;
{
/*
* Copy back UFS2 updated fields that UFS1 inspects.
*/
if (fs->fs_magic == FS_UFS1_MAGIC) {
fs->fs_old_time = fs->fs_time;
fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
fs->fs_maxfilesize = ump->um_savedmaxfilesize;
}
if (bigcgs) {
fs->fs_cgsize = fs->fs_save_cgsize;
fs->fs_save_cgsize = 0;
}
}
/*
* unmount system call
*/
static int
ffs_unmount(mp, mntflags)
struct mount *mp;
int mntflags;
{
struct thread *td;
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs;
int error, flags, susp;
#ifdef UFS_EXTATTR
int e_restart;
#endif
flags = 0;
td = curthread;
fs = ump->um_fs;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
susp = fs->fs_ronly == 0;
#ifdef UFS_EXTATTR
if ((error = ufs_extattr_stop(mp, td))) {
if (error != EOPNOTSUPP)
printf("WARNING: unmount %s: ufs_extattr_stop "
"returned errno %d\n", mp->mnt_stat.f_mntonname,
error);
e_restart = 0;
} else {
ufs_extattr_uepm_destroy(&ump->um_extattr);
e_restart = 1;
}
#endif
if (susp) {
error = vfs_write_suspend_umnt(mp);
if (error != 0)
goto fail1;
}
if (MOUNTEDSOFTDEP(mp))
error = softdep_flushfiles(mp, flags, td);
else
error = ffs_flushfiles(mp, flags, td);
if (error != 0 && !ffs_fsfail_cleanup(ump, error))
goto fail;
UFS_LOCK(ump);
if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
printf("WARNING: unmount %s: pending error: blocks %jd "
"files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
fs->fs_pendinginodes);
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
UFS_UNLOCK(ump);
if (MOUNTEDSOFTDEP(mp))
softdep_unmount(mp);
if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) {
fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
error = ffs_sbupdate(ump, MNT_WAIT, 0);
if (ffs_fsfail_cleanup(ump, error))
error = 0;
if (error != 0 && !ffs_fsfail_cleanup(ump, error)) {
fs->fs_clean = 0;
goto fail;
}
}
if (susp)
vfs_write_resume(mp, VR_START_WRITE);
if (ump->um_trim_tq != NULL) {
while (ump->um_trim_inflight != 0)
pause("ufsutr", hz);
taskqueue_drain_all(ump->um_trim_tq);
taskqueue_free(ump->um_trim_tq);
free (ump->um_trimhash, M_TRIM);
}
g_topology_lock();
if (ump->um_fsckpid > 0) {
/*
* Return to normal read-only mode.
*/
error = g_access(ump->um_cp, 0, -1, 0);
ump->um_fsckpid = 0;
}
g_vfs_close(ump->um_cp);
g_topology_unlock();
BO_LOCK(&ump->um_odevvp->v_bufobj);
ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
BO_UNLOCK(&ump->um_odevvp->v_bufobj);
atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
mntfs_freevp(ump->um_devvp);
vrele(ump->um_odevvp);
dev_rel(ump->um_dev);
mtx_destroy(UFS_MTX(ump));
if (mp->mnt_gjprovider != NULL) {
free(mp->mnt_gjprovider, M_UFSMNT);
mp->mnt_gjprovider = NULL;
}
free(fs->fs_csp, M_UFSMNT);
free(fs->fs_si, M_UFSMNT);
free(fs, M_UFSMNT);
if (ump->um_fsfail_task != NULL)
free(ump->um_fsfail_task, M_UFSMNT);
free(ump, M_UFSMNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
if (td->td_su == mp) {
td->td_su = NULL;
vfs_rel(mp);
}
return (error);
fail:
if (susp)
vfs_write_resume(mp, VR_START_WRITE);
fail1:
#ifdef UFS_EXTATTR
if (e_restart) {
ufs_extattr_uepm_init(&ump->um_extattr);
#ifdef UFS_EXTATTR_AUTOSTART
(void) ufs_extattr_autostart(mp, td);
#endif
}
#endif
return (error);
}
/*
* Flush out all the files in a filesystem.
*/
int
ffs_flushfiles(mp, flags, td)
struct mount *mp;
int flags;
struct thread *td;
{
struct ufsmount *ump;
int qerror, error;
ump = VFSTOUFS(mp);
qerror = 0;
#ifdef QUOTA
if (mp->mnt_flag & MNT_QUOTA) {
int i;
error = vflush(mp, 0, SKIPSYSTEM|flags, td);
if (error)
return (error);
for (i = 0; i < MAXQUOTAS; i++) {
error = quotaoff(td, mp, i);
if (error != 0) {
if ((flags & EARLYFLUSH) == 0)
return (error);
else
qerror = error;
}
}
/*
* Here we fall through to vflush again to ensure that
* we have gotten rid of all the system vnodes, unless
* quotas must not be closed.
*/
}
#endif
ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
return (error);
ffs_snapshot_unmount(mp);
flags |= FORCECLOSE;
/*
* Here we fall through to vflush again to ensure
* that we have gotten rid of all the system vnodes.
*/
}
/*
* Do not close system files if quotas were not closed, to be
* able to sync the remaining dquots. The freeblks softupdate
* workitems might hold a reference on a dquot, preventing
* quotaoff() from completing. Next round of
* softdep_flushworklist() iteration should process the
* blockers, allowing the next run of quotaoff() to finally
* flush held dquots.
*
* Otherwise, flush all the files.
*/
if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0)
return (error);
/*
* Flush filesystem metadata.
*/
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
VOP_UNLOCK(ump->um_devvp);
return (error);
}
/*
* Get filesystem statistics.
*/
static int
ffs_statfs(mp, sbp)
struct mount *mp;
struct statfs *sbp;
{
struct ufsmount *ump;
struct fs *fs;
ump = VFSTOUFS(mp);
fs = ump->um_fs;
if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_statfs");
sbp->f_version = STATFS_VERSION;
sbp->f_bsize = fs->fs_fsize;
sbp->f_iosize = fs->fs_bsize;
sbp->f_blocks = fs->fs_dsize;
UFS_LOCK(ump);
sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
sbp->f_bavail = freespace(fs, fs->fs_minfree) +
dbtofsb(fs, fs->fs_pendingblocks);
sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO;
sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
UFS_UNLOCK(ump);
sbp->f_namemax = UFS_MAXNAMLEN;
return (0);
}
static bool
sync_doupdate(struct inode *ip)
{
return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED |
IN_UPDATE)) != 0);
}
static int
ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused)
{
struct inode *ip;
/*
* Flags are safe to access because ->v_data invalidation
* is held off by listmtx.
*/
if (vp->v_type == VNON)
return (false);
ip = VTOI(vp);
if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0)
return (false);
return (true);
}
/*
* For a lazy sync, we only care about access times, quotas and the
* superblock. Other filesystem changes are already converted to
* cylinder group blocks or inode blocks updates and are written to
* disk by syncer.
*/
static int
ffs_sync_lazy(mp)
struct mount *mp;
{
struct vnode *mvp, *vp;
struct inode *ip;
struct thread *td;
int allerror, error;
allerror = 0;
td = curthread;
if ((mp->mnt_flag & MNT_NOATIME) != 0) {
#ifdef QUOTA
qsync(mp);
#endif
goto sbupdate;
}
MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) {
if (vp->v_type == VNON) {
VI_UNLOCK(vp);
continue;
}
ip = VTOI(vp);
/*
* The IN_ACCESS flag is converted to IN_MODIFIED by
* ufs_close() and ufs_getattr() by the calls to
* ufs_itimes_locked(), without subsequent UFS_UPDATE().
* Test also all the other timestamp flags too, to pick up
* any other cases that could be missed.
*/
if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) {
VI_UNLOCK(vp);
continue;
}
if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0)
continue;
#ifdef QUOTA
qsyncvp(vp);
#endif
if (sync_doupdate(ip))
error = ffs_update(vp, 0);
if (error != 0)
allerror = error;
vput(vp);
}
sbupdate:
if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 &&
(error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0)
allerror = error;
return (allerror);
}
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
* initiate the writing of the super block if it has been modified.
*
* Note: we are always called with the filesystem marked busy using
* vfs_busy().
*/
static int
ffs_sync(mp, waitfor)
struct mount *mp;
int waitfor;
{
struct vnode *mvp, *vp, *devvp;
struct thread *td;
struct inode *ip;
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs;
int error, count, lockreq, allerror = 0;
int suspend;
int suspended;
int secondary_writes;
int secondary_accwrites;
int softdep_deps;
int softdep_accdeps;
struct bufobj *bo;
suspend = 0;
suspended = 0;
td = curthread;
fs = ump->um_fs;
if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0)
panic("%s: ffs_sync: modification on read-only filesystem",
fs->fs_fsmnt);
if (waitfor == MNT_LAZY) {
if (!rebooting)
return (ffs_sync_lazy(mp));
waitfor = MNT_NOWAIT;
}
/*
* Write back each (modified) inode.
*/
lockreq = LK_EXCLUSIVE | LK_NOWAIT;
if (waitfor == MNT_SUSPEND) {
suspend = 1;
waitfor = MNT_WAIT;
}
if (waitfor == MNT_WAIT)
lockreq = LK_EXCLUSIVE;
lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
loop:
/* Grab snapshot of secondary write counts */
MNT_ILOCK(mp);
secondary_writes = mp->mnt_secondary_writes;
secondary_accwrites = mp->mnt_secondary_accwrites;
MNT_IUNLOCK(mp);
/* Grab snapshot of softdep dependency counts */
softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
/*
* Depend on the vnode interlock to keep things stable enough
* for a quick test. Since there might be hundreds of
* thousands of vnodes, we cannot afford even a subroutine
* call unless there's a good chance that we have work to do.
*/
if (vp->v_type == VNON) {
VI_UNLOCK(vp);
continue;
}
ip = VTOI(vp);
if ((ip->i_flag &
(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
vp->v_bufobj.bo_dirty.bv_cnt == 0) {
VI_UNLOCK(vp);
continue;
}
if ((error = vget(vp, lockreq)) != 0) {
if (error == ENOENT || error == ENOLCK) {
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
continue;
}
#ifdef QUOTA
qsyncvp(vp);
#endif
for (;;) {
error = ffs_syncvnode(vp, waitfor, 0);
if (error == ERELOOKUP)
continue;
if (error != 0)
allerror = error;
break;
}
vput(vp);
}
/*
* Force stale filesystem control information to be flushed.
*/
if (waitfor == MNT_WAIT || rebooting) {
if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
allerror = error;
if (ffs_fsfail_cleanup(ump, allerror))
allerror = 0;
/* Flushed work items may create new vnodes to clean */
if (allerror == 0 && count)
goto loop;
}
devvp = ump->um_devvp;
bo = &devvp->v_bufobj;
BO_LOCK(bo);
if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
BO_UNLOCK(bo);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(devvp, waitfor, td);
VOP_UNLOCK(devvp);
if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN))
error = ffs_sbupdate(ump, waitfor, 0);
if (error != 0)
allerror = error;
if (ffs_fsfail_cleanup(ump, allerror))
allerror = 0;
if (allerror == 0 && waitfor == MNT_WAIT)
goto loop;
} else if (suspend != 0) {
if (softdep_check_suspend(mp,
devvp,
softdep_deps,
softdep_accdeps,
secondary_writes,
secondary_accwrites) != 0) {
MNT_IUNLOCK(mp);
goto loop; /* More work needed */
}
mtx_assert(MNT_MTX(mp), MA_OWNED);
mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
MNT_IUNLOCK(mp);
suspended = 1;
} else
BO_UNLOCK(bo);
/*
* Write back modified superblock.
*/
if (fs->fs_fmod != 0 &&
(error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
allerror = error;
if (ffs_fsfail_cleanup(ump, allerror))
allerror = 0;
return (allerror);
}
int
ffs_vget(mp, ino, flags, vpp)
struct mount *mp;
ino_t ino;
int flags;
struct vnode **vpp;
{
return (ffs_vgetf(mp, ino, flags, vpp, 0));
}
int
ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
struct mount *mp;
ino_t ino;
int flags;
struct vnode **vpp;
int ffs_flags;
{
struct fs *fs;
struct inode *ip;
struct ufsmount *ump;
struct buf *bp;
struct vnode *vp;
daddr_t dbn;
int error;
MPASS((ffs_flags & FFSV_REPLACE) == 0 || (flags & LK_EXCLUSIVE) != 0);
error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
if (error != 0)
return (error);
if (*vpp != NULL) {
if ((ffs_flags & FFSV_REPLACE) == 0)
return (0);
vgone(*vpp);
vput(*vpp);
}
/*
* We must promote to an exclusive lock for vnode creation. This
* can happen if lookup is passed LOCKSHARED.
*/
if ((flags & LK_TYPE_MASK) == LK_SHARED) {
flags &= ~LK_TYPE_MASK;
flags |= LK_EXCLUSIVE;
}
/*
* We do not lock vnode creation as it is believed to be too
* expensive for such a rare case as simultaneous creation of a
* vnode for the same ino by different processes. We just allow them to race
* and check later to decide who wins. Let the race begin!
*/
ump = VFSTOUFS(mp);
fs = ump->um_fs;
ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO);
/* Allocate a new vnode/inode. */
error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ?
&ffs_vnodeops1 : &ffs_vnodeops2, &vp);
if (error) {
*vpp = NULL;
uma_zfree_smr(uma_inode, ip);
return (error);
}
/*
* FFS supports recursive locking.
*/
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
VN_LOCK_AREC(vp);
vp->v_data = ip;
vp->v_bufobj.bo_bsize = fs->fs_bsize;
ip->i_vnode = vp;
ip->i_ump = ump;
ip->i_number = ino;
ip->i_ea_refs = 0;
ip->i_nextclustercg = -1;
ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2;
ip->i_mode = 0; /* ensure error cases below throw away vnode */
#ifdef DIAGNOSTIC
ufs_init_trackers(ip);
#endif
#ifdef QUOTA
{
int i;
for (i = 0; i < MAXQUOTAS; i++)
ip->i_dquot[i] = NODQUOT;
}
#endif
if (ffs_flags & FFSV_FORCEINSMQ)
vp->v_vflag |= VV_FORCEINSMQ;
error = insmntque(vp, mp);
if (error != 0) {
uma_zfree_smr(uma_inode, ip);
*vpp = NULL;
return (error);
}
vp->v_vflag &= ~VV_FORCEINSMQ;
error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
if (error != 0)
return (error);
if (*vpp != NULL) {
/*
* Calls from ffs_valloc() (i.e. FFSV_REPLACE set)
* operate on an empty inode, which must not be found by
* other threads until it is fully filled in.  The vnode for
* the empty inode must not be re-inserted on the hash by
* another thread after we removed it at the beginning.
*/
MPASS((ffs_flags & FFSV_REPLACE) == 0);
return (0);
}
/* Read in the disk contents for the inode, copy into the inode. */
dbn = fsbtodb(fs, ino_to_fsba(fs, ino));
error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
NULL, NULL, 0, NOCRED, 0, NULL, &bp);
if (error != 0) {
/*
* The inode does not contain anything useful, so it would
* be misleading to leave it on its hash chain. With mode
* still zero, it will be unlinked and returned to the free
* list by vput().
*/
vgone(vp);
vput(vp);
*vpp = NULL;
return (error);
}
if (I_IS_UFS1(ip))
ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
else
ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) {
bqrelse(bp);
vgone(vp);
vput(vp);
*vpp = NULL;
return (error);
}
if (DOINGSOFTDEP(vp))
softdep_load_inodeblock(ip);
else
ip->i_effnlink = ip->i_nlink;
bqrelse(bp);
/*
* Initialize the vnode from the inode, check for aliases.
* Note that the underlying vnode may have changed.
*/
error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2,
&vp);
if (error) {
vgone(vp);
vput(vp);
*vpp = NULL;
return (error);
}
/*
* Finish inode initialization.
*/
if (vp->v_type != VFIFO) {
/* FFS supports shared locking for all files except fifos. */
VN_LOCK_ASHARE(vp);
}
/*
* Set up a generation number for this inode if it does not
* already have one. This should only happen on old filesystems.
*/
if (ip->i_gen == 0) {
while (ip->i_gen == 0)
ip->i_gen = arc4random();
if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
DIP_SET(ip, i_gen, ip->i_gen);
}
}
#ifdef MAC
if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
/*
* If this vnode is already allocated, and we're running
* multi-label, attempt to perform a label association
* from the extended attributes on the inode.
*/
error = mac_vnode_associate_extattr(mp, vp);
if (error) {
/* ufs_inactive will release ip->i_devvp ref. */
vgone(vp);
vput(vp);
*vpp = NULL;
return (error);
}
}
#endif
*vpp = vp;
return (0);
}
/*
* File handle to vnode
*
* Have to be really careful about stale file handles:
* - check that the inode number is valid
* - for UFS2 check that the inode number is initialized
* - call ffs_vget() to get the locked inode
* - check for an unallocated inode (i_mode == 0)
* - check that the given client host has export rights and return
* those rights via exflagsp and credanonp
*/
static int
ffs_fhtovp(mp, fhp, flags, vpp)
struct mount *mp;
struct fid *fhp;
int flags;
struct vnode **vpp;
{
struct ufid *ufhp;
struct ufsmount *ump;
struct fs *fs;
struct cg *cgp;
struct buf *bp;
ino_t ino;
u_int cg;
int error;
ufhp = (struct ufid *)fhp;
ino = ufhp->ufid_ino;
ump = VFSTOUFS(mp);
fs = ump->um_fs;
if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg)
return (ESTALE);
/*
* Need to check if inode is initialized because UFS2 does lazy
* initialization and nfs_fhtovp can offer arbitrary inode numbers.
*/
if (fs->fs_magic != FS_UFS2_MAGIC)
return (ufs_fhtovp(mp, ufhp, flags, vpp));
cg = ino_to_cg(fs, ino);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0)
return (error);
if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) {
brelse(bp);
return (ESTALE);
}
brelse(bp);
return (ufs_fhtovp(mp, ufhp, flags, vpp));
}
/*
* Initialize the filesystem.
*/
static int
ffs_init(vfsp)
struct vfsconf *vfsp;
{
ffs_susp_initialize();
softdep_initialize();
return (ufs_init(vfsp));
}
/*
* Undo the work of ffs_init().
*/
static int
ffs_uninit(vfsp)
struct vfsconf *vfsp;
{
int ret;
ret = ufs_uninit(vfsp);
softdep_uninitialize();
ffs_susp_uninitialize();
taskqueue_drain_all(taskqueue_thread);
return (ret);
}
/*
* Structure used to pass information from ffs_sbupdate to its
* helper routine ffs_use_bwrite.
*/
struct devfd {
struct ufsmount *ump;
struct buf *sbbp;
int waitfor;
int suspended;
int error;
};
/*
* Write a superblock and associated information back to disk.
*/
int
ffs_sbupdate(ump, waitfor, suspended)
struct ufsmount *ump;
int waitfor;
int suspended;
{
struct fs *fs;
struct buf *sbbp;
struct devfd devfd;
fs = ump->um_fs;
if (fs->fs_ronly == 1 &&
(ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
(MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0)
panic("ffs_sbupdate: write read-only filesystem");
/*
* We use the superblock's buf to serialize calls to ffs_sbupdate().
*/
sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
(int)fs->fs_sbsize, 0, 0, 0);
/*
* Initialize info needed for write function.
*/
devfd.ump = ump;
devfd.sbbp = sbbp;
devfd.waitfor = waitfor;
devfd.suspended = suspended;
devfd.error = 0;
return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite));
}
/*
* Write function for use by filesystem-layer routines.
*/
static int
ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size)
{
struct devfd *devfdp;
struct ufsmount *ump;
struct buf *bp;
struct fs *fs;
int error;
devfdp = devfd;
ump = devfdp->ump;
fs = ump->um_fs;
/*
* Writing the superblock summary information.
*/
if (loc != fs->fs_sblockloc) {
bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0);
bcopy(buf, bp->b_data, (u_int)size);
if (devfdp->suspended)
bp->b_flags |= B_VALIDSUSPWRT;
if (devfdp->waitfor != MNT_WAIT)
bawrite(bp);
else if ((error = bwrite(bp)) != 0)
devfdp->error = error;
return (0);
}
/*
* Writing the superblock itself. We need to do special checks for it.
*/
bp = devfdp->sbbp;
if (ffs_fsfail_cleanup(ump, devfdp->error))
devfdp->error = 0;
if (devfdp->error != 0) {
brelse(bp);
return (devfdp->error);
}
if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
(fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
fs->fs_sblockloc = SBLOCK_UFS1;
}
if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
(fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
fs->fs_sblockloc = SBLOCK_UFS2;
}
if (MOUNTEDSOFTDEP(ump->um_mountp))
softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
fs = (struct fs *)bp->b_data;
ffs_oldfscompat_write(fs, ump);
fs->fs_si = NULL;
/* Recalculate the superblock hash */
fs->fs_ckhash = ffs_calc_sbhash(fs);
if (devfdp->suspended)
bp->b_flags |= B_VALIDSUSPWRT;
if (devfdp->waitfor != MNT_WAIT)
bawrite(bp);
else if ((error = bwrite(bp)) != 0)
devfdp->error = error;
return (devfdp->error);
}
static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
int attrnamespace, const char *attrname)
{
#ifdef UFS_EXTATTR
return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
attrname));
#else
return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
attrname));
#endif
}
static void
ffs_ifree(struct ufsmount *ump, struct inode *ip)
{
if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
uma_zfree(uma_ufs1, ip->i_din1);
else if (ip->i_din2 != NULL)
uma_zfree(uma_ufs2, ip->i_din2);
uma_zfree_smr(uma_inode, ip);
}
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
"Do background writes (honoring the BV_BKGRDWRITE flag)?");
/*
* Complete a background write started from bwrite.
*/
static void
ffs_backgroundwritedone(struct buf *bp)
{
struct bufobj *bufobj;
struct buf *origbp;
#ifdef SOFTUPDATES
if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0)
softdep_handle_error(bp);
#endif
/*
* Find the original buffer that we are writing.
*/
bufobj = bp->b_bufobj;
BO_LOCK(bufobj);
if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
panic("backgroundwritedone: lost buffer");
/*
* We should mark the cylinder group buffer origbp as
* dirty so as not to lose the failed write.
*/
if ((bp->b_ioflags & BIO_ERROR) != 0)
origbp->b_vflags |= BV_BKGRDERR;
BO_UNLOCK(bufobj);
/*
* Process dependencies then return any unfinished ones.
*/
if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0)
buf_complete(bp);
#ifdef SOFTUPDATES
if (!LIST_EMPTY(&bp->b_dep))
softdep_move_dependencies(bp, origbp);
#endif
/*
* This buffer is marked B_NOCACHE so when it is released
* by biodone it will be tossed.
*/
bp->b_flags |= B_NOCACHE;
bp->b_flags &= ~B_CACHE;
pbrelvp(bp);
/*
* Prevent brelse() from trying to keep and re-dirty bp on
* errors. It causes b_bufobj dereference in
* bdirty()/reassignbuf(), and b_bufobj was cleared in
* pbrelvp() above.
*/
if ((bp->b_ioflags & BIO_ERROR) != 0)
bp->b_flags |= B_INVAL;
bufdone(bp);
BO_LOCK(bufobj);
/*
* Clear the BV_BKGRDINPROG flag in the original buffer
* and awaken it if it is waiting for the write to complete.
* If BV_BKGRDINPROG is not set in the original buffer it must
* have been released and re-instantiated - which is not legal.
*/
KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
("backgroundwritedone: lost buffer2"));
origbp->b_vflags &= ~BV_BKGRDINPROG;
if (origbp->b_vflags & BV_BKGRDWAIT) {
origbp->b_vflags &= ~BV_BKGRDWAIT;
wakeup(&origbp->b_xflags);
}
BO_UNLOCK(bufobj);
}
/*
* Write, release buffer on completion. (Done by iodone
* if async). Do not bother writing anything if the buffer
* is invalid.
*
* Note that we set B_CACHE here, indicating that the buffer is
* fully valid and thus cacheable. This is true even of NFS
* now so we set it generally. This could be set either here
* or in biodone() since the I/O is synchronous. We put it
* here.
*/
static int
ffs_bufwrite(struct buf *bp)
{
struct buf *newbp;
struct cg *cgp;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
if (bp->b_flags & B_INVAL) {
brelse(bp);
return (0);
}
if (!BUF_ISLOCKED(bp))
panic("bufwrite: buffer is not busy???");
/*
* If a background write is already in progress, delay
* writing this block if it is asynchronous. Otherwise
* wait for the background write to complete.
*/
BO_LOCK(bp->b_bufobj);
if (bp->b_vflags & BV_BKGRDINPROG) {
if (bp->b_flags & B_ASYNC) {
BO_UNLOCK(bp->b_bufobj);
bdwrite(bp);
return (0);
}
bp->b_vflags |= BV_BKGRDWAIT;
msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO,
"bwrbg", 0);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("bufwrite: still writing");
}
bp->b_vflags &= ~BV_BKGRDERR;
BO_UNLOCK(bp->b_bufobj);
/*
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
* copy so as to leave this buffer ready for further use.
*
* This optimization eats a lot of memory. If we have a page
* or buffer shortfall we can't do it.
*/
if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
(bp->b_flags & B_ASYNC) &&
!vm_page_count_severe() &&
!buf_dirty_count_severe()) {
KASSERT(bp->b_iodone == NULL,
("bufwrite: needs chained iodone (%p)", bp->b_iodone));
/* get a new block */
newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
if (newbp == NULL)
goto normal_write;
KASSERT(buf_mapped(bp), ("Unmapped cg"));
memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
BO_LOCK(bp->b_bufobj);
bp->b_vflags |= BV_BKGRDINPROG;
BO_UNLOCK(bp->b_bufobj);
newbp->b_xflags |=
(bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER;
newbp->b_lblkno = bp->b_lblkno;
newbp->b_blkno = bp->b_blkno;
newbp->b_offset = bp->b_offset;
newbp->b_iodone = ffs_backgroundwritedone;
newbp->b_flags |= B_ASYNC;
newbp->b_flags &= ~B_INVAL;
pbgetvp(bp->b_vp, newbp);
#ifdef SOFTUPDATES
/*
* Move over the dependencies. If there are rollbacks,
* leave the parent buffer dirtied as it will need to
* be written again.
*/
if (LIST_EMPTY(&bp->b_dep) ||
softdep_move_dependencies(bp, newbp) == 0)
bundirty(bp);
#else
bundirty(bp);
#endif
/*
* Initiate write on the copy, release the original. The
* BKGRDINPROG flag prevents it from going away until
* the background write completes. We have to recalculate
* its check hash in case the buffer gets freed and then
* reconstituted from the buffer cache during a later read.
*/
if ((bp->b_xflags & BX_CYLGRP) != 0) {
cgp = (struct cg *)bp->b_data;
cgp->cg_ckhash = 0;
cgp->cg_ckhash =
calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
}
bqrelse(bp);
bp = newbp;
} else
/* Mark the buffer clean */
bundirty(bp);
/* Let the normal bufwrite do the rest for us */
normal_write:
/*
* If we are writing a cylinder group, update its time.
*/
if ((bp->b_xflags & BX_CYLGRP) != 0) {
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = cgp->cg_time = time_second;
}
return (bufwrite(bp));
}
static void
ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
{
struct vnode *vp;
struct buf *tbp;
int error, nocopy;
/*
* This is the bufobj strategy for the private VCHR vnodes
* used by FFS to access the underlying storage device.
* We override the default bufobj strategy and thus bypass
* VOP_STRATEGY() for these vnodes.
*/
vp = bo2vnode(bo);
KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR ||
bp->b_vp->v_rdev == NULL ||
bp->b_vp->v_rdev->si_mountpt == NULL ||
VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL ||
vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp,
("ffs_geom_strategy() with wrong vp"));
if (bp->b_iocmd == BIO_WRITE) {
if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("ffs_geom_strategy: bad I/O");
nocopy = bp->b_flags & B_NOCOPY;
bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
vp->v_rdev->si_snapdata != NULL) {
if ((bp->b_flags & B_CLUSTER) != 0) {
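/*
 * Temporarily drop this cluster's runningbufspace
 * accounting while ffs_copyonwrite() (which may block)
 * is run on each component buffer; it is re-added below.
 */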
runningbufwakeup(bp);
TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
b_cluster.cluster_entry) {
error = ffs_copyonwrite(vp, tbp);
if (error != 0 &&
error != EOPNOTSUPP) {
bp->b_error = error;
bp->b_ioflags |= BIO_ERROR;
bp->b_flags &= ~B_BARRIER;
bufdone(bp);
return;
}
}
bp->b_runningbufspace = bp->b_bufsize;
atomic_add_long(&runningbufspace,
bp->b_runningbufspace);
} else {
error = ffs_copyonwrite(vp, bp);
if (error != 0 && error != EOPNOTSUPP) {
bp->b_error = error;
bp->b_ioflags |= BIO_ERROR;
bp->b_flags &= ~B_BARRIER;
bufdone(bp);
return;
}
}
}
#ifdef SOFTUPDATES
if ((bp->b_flags & B_CLUSTER) != 0) {
TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
b_cluster.cluster_entry) {
if (!LIST_EMPTY(&tbp->b_dep))
buf_start(tbp);
}
} else {
if (!LIST_EMPTY(&bp->b_dep))
buf_start(bp);
}
#endif
/*
* Check for metadata that needs check-hashes and update them.
*/
switch (bp->b_xflags & BX_FSPRIV) {
case BX_CYLGRP:
((struct cg *)bp->b_data)->cg_ckhash = 0;
((struct cg *)bp->b_data)->cg_ckhash =
calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
break;
case BX_SUPERBLOCK:
case BX_INODE:
case BX_INDIR:
case BX_DIR:
printf("Check-hash write is unimplemented!!!\n");
break;
case 0:
break;
default:
printf("multiple buffer types 0x%b\n",
(u_int)(bp->b_xflags & BX_FSPRIV),
PRINT_UFS_BUF_XFLAGS);
break;
}
}
if (bp->b_iocmd != BIO_READ && ffs_enxio_enable)
bp->b_xflags |= BX_CVTENXIO;
g_vfs_strategy(bo, bp);
}
int
ffs_own_mount(const struct mount *mp)
{
if (mp->mnt_op == &ufs_vfsops)
return (1);
return (0);
}
#ifdef DDB
#ifdef SOFTUPDATES
/* defined in ffs_softdep.c */
extern void db_print_ffs(struct ufsmount *ump);
DB_SHOW_COMMAND(ffs, db_show_ffs)
{
struct mount *mp;
struct ufsmount *ump;
if (have_addr) {
ump = VFSTOUFS((struct mount *)addr);
db_print_ffs(ump);
return;
}
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name))
db_print_ffs(VFSTOUFS(mp));
}
}
#endif /* SOFTUPDATES */
#endif /* DDB */
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 74642982acf1..3789a0217252 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1,3100 +1,3102 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* New Swap System
* Matthew Dillon
*
* Radix Bitmap 'blists'.
*
* - The new swapper uses the new radix bitmap code. This should scale
* to arbitrarily small or arbitrarily large swap spaces and an almost
* arbitrary degree of fragmentation.
*
* Features:
*
* - on the fly reallocation of swap during putpages. The new system
* does not try to keep previously allocated swap blocks for dirty
* pages.
*
* - on the fly deallocation of swap
*
* - No more garbage collection required. Unnecessarily allocated swap
* blocks only exist for dirty vm_page_t's now and these are already
* cycled (in a high-load system) by the pager. We also do on-the-fly
* removal of invalidated swap blocks when a page is destroyed
* or renamed.
*
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
*
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
* @(#)vm_swap.c 8.5 (Berkeley) 2/17/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/blist.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <geom/geom.h>
/*
* MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
* The 64-page limit is due to the radix code (kern/subr_blist.c).
*/
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 32
#endif
#if !defined(SWB_NPAGES)
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
#endif
#define SWAP_META_PAGES PCTRIE_COUNT
/*
* A swblk structure maps each page index within a
* SWAP_META_PAGES-aligned and sized range to the address of an
* on-disk swap block (or SWAPBLK_NONE). The collection of these
* mappings for an entire vm object is implemented as a pc-trie.
*/
struct swblk {
vm_pindex_t p;
daddr_t d[SWAP_META_PAGES];
};
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
static struct mtx sw_dev_mtx;
static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
static struct swdevt *swdevhd; /* Allocate from here next */
static int nswapdev; /* Number of swap devices */
int swap_pager_avail;
static struct sx swdev_syscall_lock; /* serialize swap(on|off) */
static __exclusive_cache_line u_long swap_reserved;
static u_long swap_total;
static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"VM swap stats");
SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
&swap_reserved, 0, sysctl_page_shift, "A",
"Amount of swap storage needed to back all allocated anonymous memory.");
SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
&swap_total, 0, sysctl_page_shift, "A",
"Total amount of available swap storage.");
static int overcommit = 0;
SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
static unsigned long swzone;
SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
"Actual size of swap metadata zone");
static unsigned long swap_maxpages;
SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
"Maximum amount of swap supported");
static COUNTER_U64_DEFINE_EARLY(swap_free_deferred);
SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
CTLFLAG_RD, &swap_free_deferred,
"Number of pages that deferred freeing swap space");
static COUNTER_U64_DEFINE_EARLY(swap_free_completed);
SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
CTLFLAG_RD, &swap_free_completed,
"Number of deferred frees completed");
/* bits from overcommit */
#define SWAP_RESERVE_FORCE_ON (1 << 0)
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
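/*
 * With SWAP_RESERVE_FORCE_ON, swap_reserve() fails (for unprivileged
 * processes) once reservations would exceed the available backing
 * store; SWAP_RESERVE_RLIMIT_ON additionally enforces the per-uid
 * RLIMIT_SWAP limit; SWAP_RESERVE_ALLOW_NONWIRED lets non-wired
 * physical memory count toward that backing store.  For example,
 * vm.overcommit = 3 (FORCE_ON | RLIMIT_ON) enables both hard checks.
 */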
static int
sysctl_page_shift(SYSCTL_HANDLER_ARGS)
{
uint64_t newval;
u_long value = *(u_long *)arg1;
newval = ((uint64_t)value) << PAGE_SHIFT;
return (sysctl_handle_64(oidp, &newval, 0, req));
}
static bool
swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc)
{
struct uidinfo *uip;
u_long prev;
uip = cred->cr_ruidinfo;
prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr);
if ((oc & SWAP_RESERVE_RLIMIT_ON) != 0 &&
prev + pincr > lim_cur(curthread, RLIMIT_SWAP) &&
priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) {
prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
KASSERT(prev >= pincr, ("negative vmsize for uid = %d\n", uip->ui_uid));
return (false);
}
return (true);
}
static void
swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred)
{
struct uidinfo *uip;
#ifdef INVARIANTS
u_long prev;
#endif
uip = cred->cr_ruidinfo;
#ifdef INVARIANTS
prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr);
KASSERT(prev >= pdecr, ("negative vmsize for uid = %d\n", uip->ui_uid));
#else
atomic_subtract_long(&uip->ui_vmsize, pdecr);
#endif
}
static void
swap_reserve_force_rlimit(u_long pincr, struct ucred *cred)
{
struct uidinfo *uip;
uip = cred->cr_ruidinfo;
atomic_add_long(&uip->ui_vmsize, pincr);
}
bool
swap_reserve(vm_ooffset_t incr)
{
return (swap_reserve_by_cred(incr, curthread->td_ucred));
}
bool
swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
{
u_long r, s, prev, pincr;
#ifdef RACCT
int error;
#endif
int oc;
static int curfail;
static struct timeval lastfail;
KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
(uintmax_t)incr));
#ifdef RACCT
if (RACCT_ENABLED()) {
PROC_LOCK(curproc);
error = racct_add(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
if (error != 0)
return (false);
}
#endif
pincr = atop(incr);
prev = atomic_fetchadd_long(&swap_reserved, pincr);
r = prev + pincr;
s = swap_total;
oc = atomic_load_int(&overcommit);
if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) {
s += vm_cnt.v_page_count - vm_cnt.v_free_reserved -
vm_wire_count();
}
if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s &&
priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) {
prev = atomic_fetchadd_long(&swap_reserved, -pincr);
KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail"));
goto out_error;
}
if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) {
prev = atomic_fetchadd_long(&swap_reserved, -pincr);
KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail"));
goto out_error;
}
return (true);
out_error:
if (ppsratecheck(&lastfail, &curfail, 1)) {
printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr);
}
#ifdef RACCT
if (RACCT_ENABLED()) {
PROC_LOCK(curproc);
racct_sub(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
}
#endif
return (false);
}
void
swap_reserve_force(vm_ooffset_t incr)
{
u_long pincr;
KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
(uintmax_t)incr));
#ifdef RACCT
if (RACCT_ENABLED()) {
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
}
#endif
pincr = atop(incr);
atomic_add_long(&swap_reserved, pincr);
swap_reserve_force_rlimit(pincr, curthread->td_ucred);
}
void
swap_release(vm_ooffset_t decr)
{
struct ucred *cred;
PROC_LOCK(curproc);
cred = curproc->p_ucred;
swap_release_by_cred(decr, cred);
PROC_UNLOCK(curproc);
}
void
swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
{
u_long pdecr;
#ifdef INVARIANTS
u_long prev;
#endif
KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__,
(uintmax_t)decr));
pdecr = atop(decr);
#ifdef INVARIANTS
prev = atomic_fetchadd_long(&swap_reserved, -pdecr);
KASSERT(prev >= pdecr, ("swap_reserved < decr"));
#else
atomic_subtract_long(&swap_reserved, pdecr);
#endif
swap_release_by_cred_rlimit(pdecr, cred);
#ifdef RACCT
if (racct_enable)
racct_sub_cred(cred, RACCT_SWAP, decr);
#endif
}
static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */
static int nsw_wcount_async; /* limit async write buffers */
static int nsw_wcount_async_max;/* assigned maximum */
static int nsw_cluster_max; /* maximum VOP I/O allowed */
static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
"Maximum running async swap ops");
static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
"Swap Fragmentation Info");
static struct sx sw_alloc_sx;
/*
* "named" and "unnamed" anon region objects. Try to reduce the overhead
* of searching a named list by hashing it just a little.
*/
#define NOBJLISTS 8
#define NOBJLIST(handle) \
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
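/*
 * For example, a handle whose low byte is 0x50 selects
 * swap_pager_object_list[5], since ((0x50 >> 4) & (NOBJLISTS - 1)) == 5.
 */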
static struct pagerlst swap_pager_object_list[NOBJLISTS];
static uma_zone_t swwbuf_zone;
static uma_zone_t swrbuf_zone;
static uma_zone_t swblk_zone;
static uma_zone_t swpctrie_zone;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
* calls hooked from other parts of the VM system and do not appear here.
* (see vm/swap_pager.h).
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
static void swap_pager_dealloc(vm_object_t object);
static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
int *);
static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
int *, pgo_getpages_iodone_t, void *);
static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
static void swap_pager_init(void);
static void swap_pager_unswapped(vm_page_t);
static void swap_pager_swapoff(struct swdevt *sp);
static void swap_pager_update_writecount(vm_object_t object,
vm_offset_t start, vm_offset_t end);
static void swap_pager_release_writecount(vm_object_t object,
vm_offset_t start, vm_offset_t end);
struct pagerops swappagerops = {
.pgo_init = swap_pager_init, /* early system initialization of pager */
.pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */
.pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
.pgo_getpages = swap_pager_getpages, /* pagein */
.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */
.pgo_putpages = swap_pager_putpages, /* pageout */
.pgo_haspage = swap_pager_haspage, /* get backing store status for page */
.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
.pgo_update_writecount = swap_pager_update_writecount,
.pgo_release_writecount = swap_pager_release_writecount,
};
/*
* swap_*() routines are externally accessible. swp_*() routines are
* internal.
*/
static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
"Maximum size of a swap block in pages");
static void swp_sizecheck(void);
static void swp_pager_async_iodone(struct buf *bp);
static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb);
static int swapongeom(struct vnode *);
static int swaponvp(struct thread *, struct vnode *, u_long);
static int swapoff_one(struct swdevt *sp, struct ucred *cred);
/*
* Swap bitmap functions
*/
static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages);
static daddr_t swp_pager_getswapspace(int *npages);
/*
* Metadata functions
*/
static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst,
vm_pindex_t pindex, vm_pindex_t count);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t);
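/*
 * The freerange helpers below batch the return of swap blocks:
 * contiguous freed blocks are accumulated in (*start, *num) and only
 * handed to swp_pager_freeswapspace() when a non-adjacent block
 * arrives or the caller flushes the final run.
 */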
static void
swp_pager_init_freerange(daddr_t *start, daddr_t *num)
{
*start = SWAPBLK_NONE;
*num = 0;
}
static void
swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr)
{
if (*start + *num == addr) {
(*num)++;
} else {
swp_pager_freeswapspace(*start, *num);
*start = addr;
*num = 1;
}
}
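/*
 * Allow the pagedaemon to allocate pctrie nodes from the UMA reserve
 * (M_USE_RESERVE below) so that swap metadata allocation is less
 * likely to fail while it is paging out.
 */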
static void *
swblk_trie_alloc(struct pctrie *ptree)
{
return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
M_USE_RESERVE : 0)));
}
static void
swblk_trie_free(struct pctrie *ptree, void *node)
{
uma_zfree(swpctrie_zone, node);
}
PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
* update the swap_pager_almost_full indication and warn when we are
* about to run out of swap space, using lowat/hiwat hysteresis.
*
* Clear swap_pager_full ( task killing ) indication when lowat is met.
*
* No restrictions on call
* This routine may not block.
*/
static void
swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
if (swap_pager_almost_full == 0) {
printf("swap_pager: out of swap space\n");
swap_pager_almost_full = 1;
}
} else {
swap_pager_full = 0;
if (swap_pager_avail > nswap_hiwat)
swap_pager_almost_full = 0;
}
}
/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
* before much else so be careful what you depend on. Most of the VM
* system has yet to be initialized at this point.
*/
static void
swap_pager_init(void)
{
/*
* Initialize object lists
*/
int i;
for (i = 0; i < NOBJLISTS; ++i)
TAILQ_INIT(&swap_pager_object_list[i]);
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
sx_init(&sw_alloc_sx, "swspsx");
sx_init(&swdev_syscall_lock, "swsysc");
}
/*
* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
*
* Expected to be started from pageout process once, prior to entering
* its main loop.
*/
void
swap_pager_swap_init(void)
{
unsigned long n, n2;
/*
* Number of in-transit swap bp operations. Don't
* exhaust the pbufs completely. Make sure we
* initialize workable values (0 will work for hysteresis
* but it isn't very efficient).
*
* The nsw_cluster_max is constrained by the bp->b_pages[]
- * array, which has MAXPHYS / PAGE_SIZE entries, and our locally
+ * array, which has maxphys / PAGE_SIZE entries, and our locally
* defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
* constrained by the swap device interleave stripe size.
*
* Currently we hardwire nsw_wcount_async to 4. This limit is
* designed to prevent other I/O from having high latencies due to
* our pageout I/O. The value 4 works well for one or two active swap
* devices but is probably a little low if you have more. Even so,
* a higher value would probably generate only a limited improvement
* with three or four active swap devices since the system does not
* typically have to pageout at extreme bandwidths. We will want
* at least 2 per swap device, and 4 is a pretty good value if you
* have one NFS swap device due to the command/ack latency over NFS.
* So it all works out pretty well.
*/
- nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
+ nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
nsw_wcount_async = 4;
nsw_wcount_async_max = nsw_wcount_async;
mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4);
swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2);
/*
* Initialize our zone, taking the user's requested size or
* estimating the number we need based on the number of pages
* in the system.
*/
n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
vm_cnt.v_page_count / 2;
swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0);
if (swpctrie_zone == NULL)
panic("failed to create swap pctrie zone.");
swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
NULL, NULL, _Alignof(struct swblk) - 1, 0);
if (swblk_zone == NULL)
panic("failed to create swap blk zone.");
n2 = n;
do {
if (uma_zone_reserve_kva(swblk_zone, n))
break;
/*
* if the allocation failed, try a zone two thirds the
* size of the previous attempt.
*/
n -= ((n + 2) / 3);
} while (n > 0);
/*
* Often uma_zone_reserve_kva() cannot reserve exactly the
* requested size. Account for the difference when
* calculating swap_maxpages.
*/
n = uma_zone_get_max(swblk_zone);
if (n < n2)
printf("Swap blk zone entries changed from %lu to %lu.\n",
n2, n);
/* absolute maximum we can handle assuming 100% efficiency */
swap_maxpages = n * SWAP_META_PAGES;
swzone = n * sizeof(struct swblk);
if (!uma_zone_reserve_kva(swpctrie_zone, n))
printf("Cannot reserve swap pctrie zone, "
"reduce kern.maxswzone.\n");
}
static vm_object_t
swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
vm_ooffset_t offset)
{
vm_object_t object;
if (cred != NULL) {
if (!swap_reserve_by_cred(size, cred))
return (NULL);
crhold(cred);
}
/*
* The un_pager.swp.swp_blks trie is initialized by
* vm_object_allocate() to ensure the correct order of
* visibility to other threads.
*/
object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset +
PAGE_MASK + size));
object->un_pager.swp.writemappings = 0;
object->handle = handle;
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
return (object);
}
/*
* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
* its metadata structures.
*
* This routine is called from the mmap and fork code to create a new
* OBJT_SWAP object.
*
* This routine must ensure that no live duplicate is created for
* the named object request, which is protected against by
* holding the sw_alloc_sx lock in case handle != NULL.
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset, struct ucred *cred)
{
vm_object_t object;
if (handle != NULL) {
/*
* Reference existing named region or allocate new one. There
* should not be a race here against swp_pager_meta_build()
* as called from vm_page_remove() with regard to the lookup
* of the handle.
*/
sx_xlock(&sw_alloc_sx);
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
if (object == NULL) {
object = swap_pager_alloc_init(handle, cred, size,
offset);
if (object != NULL) {
TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
object, pager_object_list);
}
}
sx_xunlock(&sw_alloc_sx);
} else {
object = swap_pager_alloc_init(handle, cred, size, offset);
}
return (object);
}
/*
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
*
* The swap backing for the object is destroyed. The code is
* designed such that we can reinstantiate it later, but this
* routine is typically called only when the entire object is
* about to be destroyed.
*
* The object must be locked.
*/
static void
swap_pager_dealloc(vm_object_t object)
{
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
/*
* Remove from list right away so lookups will fail if we block for
* pageout completion.
*/
if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) {
VM_OBJECT_WUNLOCK(object);
sx_xlock(&sw_alloc_sx);
TAILQ_REMOVE(NOBJLIST(object->handle), object,
pager_object_list);
sx_xunlock(&sw_alloc_sx);
VM_OBJECT_WLOCK(object);
}
vm_object_pip_wait(object, "swpdea");
/*
* Free all remaining metadata. We only bother to free it from
* the swap meta data. We do not attempt to free swapblk's still
* associated with vm_page_t's for this object. We do not care
* if paging is still in progress on some objects.
*/
swp_pager_meta_free_all(object);
object->handle = NULL;
object->type = OBJT_DEAD;
}
/************************************************************************
* SWAP PAGER BITMAP ROUTINES *
************************************************************************/
/*
* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
*
* Allocate swap for up to the requested number of pages. The
* starting swap block number (a page index) is returned or
* SWAPBLK_NONE if the allocation failed.
*
* Also has the side effect of advising that somebody made a mistake
* when they configured swap and didn't configure enough.
*
* This routine may not sleep.
*
* We allocate in round-robin fashion from the configured devices.
*/
static daddr_t
swp_pager_getswapspace(int *io_npages)
{
daddr_t blk;
struct swdevt *sp;
int mpages, npages;
KASSERT(*io_npages >= 1,
("%s: npages not positive", __func__));
blk = SWAPBLK_NONE;
mpages = *io_npages;
npages = imin(BLIST_MAX_ALLOC, mpages);
mtx_lock(&sw_dev_mtx);
sp = swdevhd;
while (!TAILQ_EMPTY(&swtailq)) {
if (sp == NULL)
sp = TAILQ_FIRST(&swtailq);
if ((sp->sw_flags & SW_CLOSING) == 0)
blk = blist_alloc(sp->sw_blist, &npages, mpages);
if (blk != SWAPBLK_NONE)
break;
sp = TAILQ_NEXT(sp, sw_list);
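/*
 * If we have wrapped back to the starting device, every
 * device has rejected a request of this size; retry the
 * loop asking for fewer pages before giving up.
 */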
if (swdevhd == sp) {
if (npages == 1)
break;
mpages = npages - 1;
npages >>= 1;
}
}
if (blk != SWAPBLK_NONE) {
*io_npages = npages;
blk += sp->sw_first;
sp->sw_used += npages;
swap_pager_avail -= npages;
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
} else {
if (swap_pager_full != 2) {
printf("swp_pager_getswapspace(%d): failed\n",
*io_npages);
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
swdevhd = NULL;
}
mtx_unlock(&sw_dev_mtx);
return (blk);
}
static bool
swp_pager_isondev(daddr_t blk, struct swdevt *sp)
{
return (blk >= sp->sw_first && blk < sp->sw_end);
}
static void
swp_pager_strategy(struct buf *bp)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (swp_pager_isondev(bp->b_blkno, sp)) {
mtx_unlock(&sw_dev_mtx);
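/*
 * If the device supports unmapped I/O, hand it the
 * pages without a KVA mapping; otherwise map them
 * into the pbuf's KVA first.
 */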
if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
unmapped_buf_allowed) {
bp->b_data = unmapped_buf;
bp->b_offset = 0;
} else {
pmap_qenter((vm_offset_t)bp->b_data,
&bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
}
sp->sw_strategy(bp, sp);
return;
}
}
panic("Swapdev not found");
}
/*
* SWP_PAGER_FREESWAPSPACE() - free raw swap space
*
* This routine returns the specified swap blocks back to the bitmap.
*
* This routine may not sleep.
*/
static void
swp_pager_freeswapspace(daddr_t blk, daddr_t npages)
{
struct swdevt *sp;
if (npages == 0)
return;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (swp_pager_isondev(blk, sp)) {
sp->sw_used -= npages;
/*
* If we are attempting to stop swapping on
* this device, we don't want to mark any
* blocks free lest they be reused.
*/
if ((sp->sw_flags & SW_CLOSING) == 0) {
blist_free(sp->sw_blist, blk - sp->sw_first,
npages);
swap_pager_avail += npages;
swp_sizecheck();
}
mtx_unlock(&sw_dev_mtx);
return;
}
}
panic("Swapdev not found");
}
/*
* SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats
*/
static int
sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
{
struct sbuf sbuf;
struct swdevt *sp;
const char *devname;
int error;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (vn_isdisk(sp->sw_vp))
devname = devtoname(sp->sw_vp->v_rdev);
else
devname = "[file]";
sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
blist_stats(sp->sw_blist, &sbuf);
}
mtx_unlock(&sw_dev_mtx);
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
* This is a globally accessible routine.
*
* This routine removes swapblk assignments from swap metadata.
*
* The external callers of this routine typically have already destroyed
* or renamed vm_page_t's associated with this range in the object so
* we should be ok.
*
* The object must be locked.
*/
void
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
swp_pager_meta_free(object, start, size);
}
/*
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
*
* Assigns swap blocks to the specified range within the object. The
* swap blocks are not zeroed. Any previous swap assignment is destroyed.
*
* Returns 0 on success, -1 on failure.
*/
int
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
daddr_t addr, blk, n_free, s_free;
int i, j, n;
swp_pager_init_freerange(&s_free, &n_free);
VM_OBJECT_WLOCK(object);
for (i = 0; i < size; i += n) {
n = size - i;
blk = swp_pager_getswapspace(&n);
if (blk == SWAPBLK_NONE) {
swp_pager_meta_free(object, start, i);
VM_OBJECT_WUNLOCK(object);
return (-1);
}
for (j = 0; j < n; ++j) {
addr = swp_pager_meta_build(object,
start + i + j, blk + j);
if (addr != SWAPBLK_NONE)
swp_pager_update_freerange(&s_free, &n_free,
addr);
}
}
swp_pager_freeswapspace(s_free, n_free);
VM_OBJECT_WUNLOCK(object);
return (0);
}
static bool
swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject,
vm_pindex_t pindex, daddr_t addr)
{
daddr_t dstaddr;
KASSERT(srcobject->type == OBJT_SWAP,
("%s: Srcobject not swappable", __func__));
if (dstobject->type == OBJT_SWAP &&
swp_pager_meta_lookup(dstobject, pindex) != SWAPBLK_NONE) {
/* Caller should destroy the source block. */
return (false);
}
/*
* Destination has no swapblk and is not resident, transfer source.
* swp_pager_meta_build() can sleep.
*/
VM_OBJECT_WUNLOCK(srcobject);
dstaddr = swp_pager_meta_build(dstobject, pindex, addr);
KASSERT(dstaddr == SWAPBLK_NONE,
("Unexpected destination swapblk"));
VM_OBJECT_WLOCK(srcobject);
return (true);
}
/*
* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
* and destroy the source.
*
* Copy any valid swapblks from the source to the destination. In
* cases where both the source and destination have a valid swapblk,
* we keep the destination's.
*
* This routine is allowed to sleep. It may sleep allocating metadata
* indirectly through swp_pager_meta_build().
*
* The source object contains no vm_page_t's (which is just as well)
*
* The source object is of type OBJT_SWAP.
*
* The source and destination objects must be locked.
* Both object locks may temporarily be released.
*/
void
swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
vm_pindex_t offset, int destroysource)
{
VM_OBJECT_ASSERT_WLOCKED(srcobject);
VM_OBJECT_ASSERT_WLOCKED(dstobject);
/*
* If destroysource is set, we remove the source object from the
* swap_pager internal queue now.
*/
if (destroysource && (srcobject->flags & OBJ_ANON) == 0 &&
srcobject->handle != NULL) {
VM_OBJECT_WUNLOCK(srcobject);
VM_OBJECT_WUNLOCK(dstobject);
sx_xlock(&sw_alloc_sx);
TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
pager_object_list);
sx_xunlock(&sw_alloc_sx);
VM_OBJECT_WLOCK(dstobject);
VM_OBJECT_WLOCK(srcobject);
}
/*
* Transfer source to destination.
*/
swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size);
/*
* Free left over swap blocks in source.
*
* We have to revert the type to OBJT_DEFAULT so we do not accidentally
* double-remove the object from the swap queues.
*/
if (destroysource) {
swp_pager_meta_free_all(srcobject);
/*
* Reverting the type is not necessary, the caller is going
* to destroy srcobject directly, but I'm doing it here
* for consistency since we've removed the object from its
* queues.
*/
srcobject->type = OBJT_DEFAULT;
}
}
/*
* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
* the requested page.
*
* We determine whether good backing store exists for the requested
* page and return TRUE if it does, FALSE if it doesn't.
*
* If TRUE, we also try to determine how much valid, contiguous backing
* store exists before and after the requested page.
*/
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
{
daddr_t blk, blk0;
int i;
VM_OBJECT_ASSERT_LOCKED(object);
KASSERT(object->type == OBJT_SWAP,
("%s: object not swappable", __func__));
/*
* do we have good backing store at the requested index?
*/
blk0 = swp_pager_meta_lookup(object, pindex);
if (blk0 == SWAPBLK_NONE) {
if (before)
*before = 0;
if (after)
*after = 0;
return (FALSE);
}
/*
* find backwards-looking contiguous good backing store
*/
if (before != NULL) {
for (i = 1; i < SWB_NPAGES; i++) {
if (i > pindex)
break;
blk = swp_pager_meta_lookup(object, pindex - i);
if (blk != blk0 - i)
break;
}
*before = i - 1;
}
/*
* find forward-looking contiguous good backing store
*/
if (after != NULL) {
for (i = 1; i < SWB_NPAGES; i++) {
blk = swp_pager_meta_lookup(object, pindex + i);
if (blk != blk0 + i)
break;
}
*after = i - 1;
}
return (TRUE);
}
/*
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
*
* This removes any associated swap backing store, whether valid or
* not, from the page.
*
* This routine is typically called when a page is made dirty, at
* which point any associated swap can be freed. MADV_FREE also
* calls us in a special-case situation.
*
* NOTE!!! If the page is clean and the swap was valid, the caller
* should make the page dirty before calling this routine. This routine
* does NOT change the m->dirty status of the page. Also: MADV_FREE
* depends on it.
*
* This routine may not sleep.
*
* The object containing the page may be locked.
*/
static void
swap_pager_unswapped(vm_page_t m)
{
struct swblk *sb;
vm_object_t obj;
/*
* Handle enqueuing deferred frees first. If we do not have the
* object lock we wait for the page daemon to clear the space.
*/
obj = m->object;
if (!VM_OBJECT_WOWNED(obj)) {
VM_PAGE_OBJECT_BUSY_ASSERT(m);
/*
* The caller is responsible for synchronization but we
* will harmlessly handle races. This is typically provided
* by only calling unswapped() when a page transitions from
* clean to dirty.
*/
if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
PGA_SWAP_SPACE) {
vm_page_aflag_set(m, PGA_SWAP_FREE);
counter_u64_add(swap_free_deferred, 1);
}
return;
}
if ((m->a.flags & PGA_SWAP_FREE) != 0)
counter_u64_add(swap_free_completed, 1);
vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
/*
* The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
KASSERT(m->object->type == OBJT_SWAP,
("Free object not swappable"));
sb = SWAP_PCTRIE_LOOKUP(&m->object->un_pager.swp.swp_blks,
rounddown(m->pindex, SWAP_META_PAGES));
if (sb == NULL)
return;
if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE)
return;
swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1);
sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
swp_pager_free_empty_swblk(m->object, sb);
}
/*
* swap_pager_getpages() - bring pages in from swap
*
* Attempt to page in the pages in array "ma" of length "count". The
* caller may optionally specify that additional pages preceding and
* succeeding the specified range be paged in. The number of such pages
* is returned in the "rbehind" and "rahead" parameters, and they will
* be in the inactive queue upon return.
*
* The pages in "ma" must be busied and will remain busied upon return.
*/
static int
swap_pager_getpages_locked(vm_object_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead)
{
struct buf *bp;
vm_page_t bm, mpred, msucc, p;
vm_pindex_t pindex;
daddr_t blk;
int i, maxahead, maxbehind, reqcount;
VM_OBJECT_ASSERT_WLOCKED(object);
reqcount = count;
KASSERT(object->type == OBJT_SWAP,
("%s: object not swappable", __func__));
if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) {
VM_OBJECT_WUNLOCK(object);
return (VM_PAGER_FAIL);
}
KASSERT(reqcount - 1 <= maxahead,
("page count %d extends beyond swap block", reqcount));
/*
* Do not transfer any pages other than those that are xbusied
* when running during a split or collapse operation. This
* prevents clustering from re-creating pages which are being
* moved into another object.
*/
if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
maxahead = reqcount - 1;
maxbehind = 0;
}
/*
* Clip the readahead and readbehind ranges to exclude resident pages.
*/
if (rahead != NULL) {
*rahead = imin(*rahead, maxahead - (reqcount - 1));
pindex = ma[reqcount - 1]->pindex;
msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
*rahead = msucc->pindex - pindex - 1;
}
if (rbehind != NULL) {
*rbehind = imin(*rbehind, maxbehind);
pindex = ma[0]->pindex;
mpred = TAILQ_PREV(ma[0], pglist, listq);
if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
*rbehind = pindex - mpred->pindex - 1;
}
bm = ma[0];
for (i = 0; i < count; i++)
ma[i]->oflags |= VPO_SWAPINPROG;
/*
* Allocate readahead and readbehind pages.
*/
if (rbehind != NULL) {
for (i = 1; i <= *rbehind; i++) {
p = vm_page_alloc(object, ma[0]->pindex - i,
VM_ALLOC_NORMAL);
if (p == NULL)
break;
p->oflags |= VPO_SWAPINPROG;
bm = p;
}
*rbehind = i - 1;
}
if (rahead != NULL) {
for (i = 0; i < *rahead; i++) {
p = vm_page_alloc(object,
ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
if (p == NULL)
break;
p->oflags |= VPO_SWAPINPROG;
}
*rahead = i;
}
if (rbehind != NULL)
count += *rbehind;
if (rahead != NULL)
count += *rahead;
vm_object_pip_add(object, count);
pindex = bm->pindex;
blk = swp_pager_meta_lookup(object, pindex);
KASSERT(blk != SWAPBLK_NONE,
("no swap block containing %p(%jx)", object, (uintmax_t)pindex));
VM_OBJECT_WUNLOCK(object);
bp = uma_zalloc(swrbuf_zone, M_WAITOK);
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
/* Pages cannot leave the object while busy. */
for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
MPASS(p->pindex == bm->pindex + i);
bp->b_pages[i] = p;
}
bp->b_flags |= B_PAGING;
bp->b_iocmd = BIO_READ;
bp->b_iodone = swp_pager_async_iodone;
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
bp->b_blkno = blk;
bp->b_bcount = PAGE_SIZE * count;
bp->b_bufsize = PAGE_SIZE * count;
bp->b_npages = count;
bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
bp->b_pgafter = rahead != NULL ? *rahead : 0;
VM_CNT_INC(v_swapin);
VM_CNT_ADD(v_swappgsin, count);
/*
* perform the I/O. NOTE!!! bp cannot be considered valid after
* this point because we automatically release it on completion.
* Instead, we look at the one page we are interested in which we
* still hold a lock on even through the I/O completion.
*
* The other pages in our ma[] array are also released on completion,
* so we cannot assume they are valid anymore either.
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
/*
* Wait for the pages we want to complete. VPO_SWAPINPROG is always
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
* is set in the metadata for each page in the request.
*/
VM_OBJECT_WLOCK(object);
/* This could be implemented more efficiently with aflags */
while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
ma[0]->oflags |= VPO_SWAPSLEEP;
VM_CNT_INC(v_intrans);
if (VM_OBJECT_SLEEP(object, &object->handle, PSWP,
"swread", hz * 20)) {
printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
}
}
VM_OBJECT_WUNLOCK(object);
/*
* If we had an unrecoverable read error pages will not be valid.
*/
for (i = 0; i < reqcount; i++)
if (ma[i]->valid != VM_PAGE_BITS_ALL)
return (VM_PAGER_ERROR);
return (VM_PAGER_OK);
/*
* A final note: in a low swap situation, we cannot deallocate swap
* and mark a page dirty here because the caller is likely to mark
* the page clean when we return, causing the page to possibly revert
* to all-zeros later.
*/
}
static int
swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead)
{
VM_OBJECT_WLOCK(object);
return (swap_pager_getpages_locked(object, ma, count, rbehind, rahead));
}
/*
* swap_pager_getpages_async():
*
* Right now this is an emulation of asynchronous operation on top of
* swap_pager_getpages().
*/
static int
swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
{
int r, error;
r = swap_pager_getpages(object, ma, count, rbehind, rahead);
switch (r) {
case VM_PAGER_OK:
error = 0;
break;
case VM_PAGER_ERROR:
error = EIO;
break;
case VM_PAGER_FAIL:
error = EINVAL;
break;
default:
panic("unhandled swap_pager_getpages() error %d", r);
}
(iodone)(arg, ma, count, error);
return (r);
}
/*
* swap_pager_putpages:
*
* Assign swap (if necessary) and initiate I/O on the specified pages.
*
* We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
* are automatically converted to SWAP objects.
*
* In a low memory situation we may block in VOP_STRATEGY(), but the new
* vm_page reservation system coupled with properly written VFS devices
* should ensure that no low-memory deadlock occurs. This is an area
* which needs work.
*
* The parent has N vm_object_pip_add() references prior to
* calling us and will remove references for rtvals[] that are
* not set to VM_PAGER_PEND. We need to remove the rest on I/O
* completion.
*
* The parent has soft-busy'd the pages it passes us and will unbusy
* those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
* We need to unbusy the rest on I/O completion.
*/
static void
swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
int flags, int *rtvals)
{
struct buf *bp;
daddr_t addr, blk, n_free, s_free;
vm_page_t mreq;
int i, j, n;
bool async;
KASSERT(count == 0 || ma[0]->object == object,
("%s: object mismatch %p/%p",
__func__, object, ma[0]->object));
/*
* Step 1
*
* Turn object into OBJT_SWAP. Force sync if not a pageout process.
*/
if (object->type != OBJT_SWAP) {
addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE);
KASSERT(addr == SWAPBLK_NONE,
("unexpected object swap block"));
}
VM_OBJECT_WUNLOCK(object);
async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0;
swp_pager_init_freerange(&s_free, &n_free);
/*
* Step 2
*
* Assign swap blocks and issue I/O. We reallocate swap on the fly.
* The page is left dirty until the pageout operation completes
* successfully.
*/
for (i = 0; i < count; i += n) {
/* Maximum I/O size is limited by maximum swap block size. */
n = min(count - i, nsw_cluster_max);
if (async) {
mtx_lock(&swbuf_mtx);
while (nsw_wcount_async == 0)
msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
"swbufa", 0);
nsw_wcount_async--;
mtx_unlock(&swbuf_mtx);
}
/* Get a block of swap of size up to size n. */
VM_OBJECT_WLOCK(object);
blk = swp_pager_getswapspace(&n);
if (blk == SWAPBLK_NONE) {
VM_OBJECT_WUNLOCK(object);
mtx_lock(&swbuf_mtx);
if (++nsw_wcount_async == 1)
wakeup(&nsw_wcount_async);
mtx_unlock(&swbuf_mtx);
for (j = 0; j < n; ++j)
rtvals[i + j] = VM_PAGER_FAIL;
continue;
}
for (j = 0; j < n; ++j) {
mreq = ma[i + j];
vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
addr = swp_pager_meta_build(mreq->object, mreq->pindex,
blk + j);
if (addr != SWAPBLK_NONE)
swp_pager_update_freerange(&s_free, &n_free,
addr);
MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
mreq->oflags |= VPO_SWAPINPROG;
}
VM_OBJECT_WUNLOCK(object);
bp = uma_zalloc(swwbuf_zone, M_WAITOK);
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
if (async)
- bp->b_flags = B_ASYNC;
+ bp->b_flags |= B_ASYNC;
bp->b_flags |= B_PAGING;
bp->b_iocmd = BIO_WRITE;
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
bp->b_bcount = PAGE_SIZE * n;
bp->b_bufsize = PAGE_SIZE * n;
bp->b_blkno = blk;
for (j = 0; j < n; j++)
bp->b_pages[j] = ma[i + j];
bp->b_npages = n;
/*
* Must set dirty range for NFS to work.
*/
bp->b_dirtyoff = 0;
bp->b_dirtyend = bp->b_bcount;
VM_CNT_INC(v_swapout);
VM_CNT_ADD(v_swappgsout, bp->b_npages);
/*
* We unconditionally set rtvals[] to VM_PAGER_PEND so that we
* can call the async completion routine at the end of a
* synchronous I/O operation. Otherwise, our caller would
* perform duplicate unbusy and wakeup operations on the page
* and object, respectively.
*/
for (j = 0; j < n; j++)
rtvals[i + j] = VM_PAGER_PEND;
/*
* asynchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy.
*/
if (async) {
bp->b_iodone = swp_pager_async_iodone;
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
continue;
}
/*
* synchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy.
*/
bp->b_iodone = bdone;
swp_pager_strategy(bp);
/*
* Wait for the sync I/O to complete.
*/
bwait(bp, PVM, "swwrt");
/*
* Now that we are through with the bp, we can call the
* normal async completion, which frees everything up.
*/
swp_pager_async_iodone(bp);
}
swp_pager_freeswapspace(s_free, n_free);
VM_OBJECT_WLOCK(object);
}
/*
* swp_pager_async_iodone:
*
* Completion routine for asynchronous reads and writes from/to swap.
* Also called manually by synchronous code to finish up a bp.
*
* This routine may not sleep.
*/
static void
swp_pager_async_iodone(struct buf *bp)
{
int i;
vm_object_t object = NULL;
/*
* Report error - unless we ran out of memory, in which case
* we've already logged it in swapgeom_strategy().
*/
if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) {
printf(
"swap_pager: I/O error - %s failed; blkno %ld, "
"size %ld, error %d\n",
((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
(long)bp->b_blkno,
(long)bp->b_bcount,
bp->b_error
);
}
/*
* remove the mapping for kernel virtual
*/
if (buf_mapped(bp))
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
else
bp->b_data = bp->b_kvabase;
if (bp->b_npages) {
object = bp->b_pages[0]->object;
VM_OBJECT_WLOCK(object);
}
/*
* cleanup pages. If an error occurs writing to swap, we are in
* very serious trouble. If it happens to be a disk error, though,
* we may be able to recover by reassigning the swap later on. So
* in this case we remove the m->swapblk assignment for the page
* but do not free it in the rlist. The erroneous block(s) are thus
* never reallocated as swap. Redirty the page and continue.
*/
for (i = 0; i < bp->b_npages; ++i) {
vm_page_t m = bp->b_pages[i];
m->oflags &= ~VPO_SWAPINPROG;
if (m->oflags & VPO_SWAPSLEEP) {
m->oflags &= ~VPO_SWAPSLEEP;
wakeup(&object->handle);
}
/* We always have space after I/O, successful or not. */
vm_page_aflag_set(m, PGA_SWAP_SPACE);
if (bp->b_ioflags & BIO_ERROR) {
/*
* If an error occurs I'd love to throw the swapblk
* away without freeing it back to swapspace, so it
* can never be used again. But I can't from an
* interrupt.
*/
if (bp->b_iocmd == BIO_READ) {
/*
* NOTE: for reads, m->dirty will probably
* be overridden by the original caller of
* getpages so don't play cute tricks here.
*/
vm_page_invalid(m);
} else {
/*
* If a write error occurs, reactivate page
* so it doesn't clog the inactive list,
* then finish the I/O.
*/
MPASS(m->dirty == VM_PAGE_BITS_ALL);
/* PQ_UNSWAPPABLE? */
vm_page_activate(m);
vm_page_sunbusy(m);
}
} else if (bp->b_iocmd == BIO_READ) {
/*
* NOTE: for reads, m->dirty will probably be
* overridden by the original caller of getpages so
* we cannot set them in order to free the underlying
* swap in a low-swap situation. I don't think we'd
* want to do that anyway, but it was an optimization
* that existed in the old swapper for a time before
* it got ripped out due to precisely this problem.
*/
KASSERT(!pmap_page_is_mapped(m),
("swp_pager_async_iodone: page %p is mapped", m));
KASSERT(m->dirty == 0,
("swp_pager_async_iodone: page %p is dirty", m));
vm_page_valid(m);
if (i < bp->b_pgbefore ||
i >= bp->b_npages - bp->b_pgafter)
vm_page_readahead_finish(m);
} else {
/*
* For write success, clear the dirty
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiters up ).
* A page is only written to swap after a period of
* inactivity. Therefore, we do not expect it to be
* reused.
*/
KASSERT(!pmap_page_is_write_mapped(m),
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
vm_page_deactivate_noreuse(m);
vm_page_sunbusy(m);
}
}
/*
* adjust pip. NOTE: the original parent may still have its own
* pip refs on the object.
*/
if (object != NULL) {
vm_object_pip_wakeupn(object, bp->b_npages);
VM_OBJECT_WUNLOCK(object);
}
/*
* swapdev_strategy() manually sets b_vp and b_bufobj before calling
* bstrategy(). Set them back to NULL now we're done with it, or we'll
* trigger a KASSERT in relpbuf().
*/
if (bp->b_vp) {
bp->b_vp = NULL;
bp->b_bufobj = NULL;
}
/*
* release the physical I/O buffer
*/
if (bp->b_flags & B_ASYNC) {
mtx_lock(&swbuf_mtx);
if (++nsw_wcount_async == 1)
wakeup(&nsw_wcount_async);
mtx_unlock(&swbuf_mtx);
}
uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
}
int
swap_pager_nswapdev(void)
{
return (nswapdev);
}
static void
swp_pager_force_dirty(vm_page_t m)
{
vm_page_dirty(m);
swap_pager_unswapped(m);
vm_page_launder(m);
}
/*
* swap_pager_swapoff_object:
*
* Page in all of the pages that have been paged out for an object
* to a swap device.
*/
static void
swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object)
{
struct swblk *sb;
vm_page_t m;
vm_pindex_t pi;
daddr_t blk;
int i, nv, rahead, rv;
KASSERT(object->type == OBJT_SWAP,
("%s: Object not swappable", __func__));
for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
&object->un_pager.swp.swp_blks, pi)) != NULL; ) {
if ((object->flags & OBJ_DEAD) != 0) {
/*
* Make sure that pending writes finish before
* returning.
*/
vm_object_pip_wait(object, "swpoff");
swp_pager_meta_free_all(object);
break;
}
for (i = 0; i < SWAP_META_PAGES; i++) {
/*
* Count the number of contiguous valid blocks.
*/
for (nv = 0; nv < SWAP_META_PAGES - i; nv++) {
blk = sb->d[i + nv];
if (!swp_pager_isondev(blk, sp) ||
blk == SWAPBLK_NONE)
break;
}
if (nv == 0)
continue;
/*
* Look for a page corresponding to the first
* valid block and ensure that any pending paging
* operations on it are complete. If the page is valid,
* mark it dirty and free the swap block. Try to batch
* this operation since it may cause sb to be freed,
* meaning that we must restart the scan. Avoid busying
* valid pages since we may block forever on kernel
* stack pages.
*/
m = vm_page_lookup(object, sb->p + i);
if (m == NULL) {
m = vm_page_alloc(object, sb->p + i,
VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
if (m == NULL)
break;
} else {
if ((m->oflags & VPO_SWAPINPROG) != 0) {
m->oflags |= VPO_SWAPSLEEP;
VM_OBJECT_SLEEP(object, &object->handle,
PSWP, "swpoff", 0);
break;
}
if (vm_page_all_valid(m)) {
do {
swp_pager_force_dirty(m);
} while (--nv > 0 &&
(m = vm_page_next(m)) != NULL &&
vm_page_all_valid(m) &&
(m->oflags & VPO_SWAPINPROG) == 0);
break;
}
if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL))
break;
}
vm_object_pip_add(object, 1);
rahead = SWAP_META_PAGES;
rv = swap_pager_getpages_locked(object, &m, 1, NULL,
&rahead);
if (rv != VM_PAGER_OK)
panic("%s: read from swap failed: %d",
__func__, rv);
vm_object_pip_wakeupn(object, 1);
VM_OBJECT_WLOCK(object);
vm_page_xunbusy(m);
/*
* The object lock was dropped so we must restart the
* scan of this swap block. Pages paged in during this
* iteration will be marked dirty in a future iteration.
*/
break;
}
if (i == SWAP_META_PAGES)
pi = sb->p + SWAP_META_PAGES;
}
}
/*
* swap_pager_swapoff:
*
* Page in all of the pages that have been paged out to the
* given device. The corresponding blocks in the bitmap must be
* marked as allocated and the device must be flagged SW_CLOSING.
* There may be no processes swapped out to the device.
*
* This routine may block.
*/
static void
swap_pager_swapoff(struct swdevt *sp)
{
vm_object_t object;
int retries;
sx_assert(&swdev_syscall_lock, SA_XLOCKED);
retries = 0;
full_rescan:
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list) {
if (object->type != OBJT_SWAP)
continue;
mtx_unlock(&vm_object_list_mtx);
/* Depends on type-stability. */
VM_OBJECT_WLOCK(object);
/*
* Dead objects are eventually terminated on their own.
*/
if ((object->flags & OBJ_DEAD) != 0)
goto next_obj;
/*
* Sync with fences placed after pctrie
* initialization. We must not access pctrie below
* unless we checked that our object is swap and not
* dead.
*/
atomic_thread_fence_acq();
if (object->type != OBJT_SWAP)
goto next_obj;
swap_pager_swapoff_object(sp, object);
next_obj:
VM_OBJECT_WUNLOCK(object);
mtx_lock(&vm_object_list_mtx);
}
mtx_unlock(&vm_object_list_mtx);
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
* removed, so we will miss their pages and need to
* make another pass. We have marked this device as
* SW_CLOSING, so the activity should finish soon.
*/
retries++;
if (retries > 100) {
panic("swapoff: failed to locate %d swap blocks",
sp->sw_used);
}
pause("swpoff", hz / 20);
goto full_rescan;
}
EVENTHANDLER_INVOKE(swapoff, sp);
}
/************************************************************************
* SWAP META DATA *
************************************************************************
*
* These routines manipulate the swap metadata stored in the
* OBJT_SWAP object.
*
* Swap metadata is kept in a per-object radix trie (pctrie) rooted at
* un_pager.swp.swp_blks rather than directly in page structures; each
* swblk node covers SWAP_META_PAGES consecutive page indices.
*/
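/*
 * Layout sketch (for illustration): each swblk node covers
 * SWAP_META_PAGES consecutive page indices, so a page index splits
 * into a trie key and a slot as
 *
 *	rdpi  = rounddown(pindex, SWAP_META_PAGES);	(trie lookup key)
 *	modpi = pindex % SWAP_META_PAGES;		(slot in sb->d[])
 *
 * e.g., if SWAP_META_PAGES were 16, pindex 70 would live in the swblk
 * keyed at 64, slot 6.
 */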
/*
* SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free?
*/
static bool
swp_pager_swblk_empty(struct swblk *sb, int start, int limit)
{
int i;
MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES);
for (i = start; i < limit; i++) {
if (sb->d[i] != SWAPBLK_NONE)
return (false);
}
return (true);
}
/*
* SWP_PAGER_FREE_EMPTY_SWBLK() - free the swblk if it maps no blocks
*
* Nothing is done if the swblk still maps at least one swap block.
*/
static void
swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb)
{
if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
uma_zfree(swblk_zone, sb);
}
}
/*
* SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
*
* We first convert the object to a swap object if it is a default
* object.
*
* The specified swapblk is added to the object's swap metadata. If
* the swapblk is not valid, it is freed instead. Any previously
* assigned swapblk is returned.
*/
static daddr_t
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
struct swblk *sb, *sb1;
vm_pindex_t modpi, rdpi;
daddr_t prev_swapblk;
int error, i;
VM_OBJECT_ASSERT_WLOCKED(object);
/*
* Convert default object to swap object if necessary
*/
if (object->type != OBJT_SWAP) {
pctrie_init(&object->un_pager.swp.swp_blks);
/*
* Ensure that swap_pager_swapoff()'s iteration over
* object_list does not see a garbage pctrie.
*/
atomic_thread_fence_rel();
object->type = OBJT_SWAP;
object->un_pager.swp.writemappings = 0;
KASSERT((object->flags & OBJ_ANON) != 0 ||
object->handle == NULL,
("default pager %p with handle %p",
object, object->handle));
}
rdpi = rounddown(pindex, SWAP_META_PAGES);
sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
if (sb == NULL) {
if (swapblk == SWAPBLK_NONE)
return (SWAPBLK_NONE);
for (;;) {
sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
pageproc ? M_USE_RESERVE : 0));
if (sb != NULL) {
sb->p = rdpi;
for (i = 0; i < SWAP_META_PAGES; i++)
sb->d[i] = SWAPBLK_NONE;
if (atomic_cmpset_int(&swblk_zone_exhausted,
1, 0))
printf("swblk zone ok\n");
break;
}
VM_OBJECT_WUNLOCK(object);
if (uma_zone_exhausted(swblk_zone)) {
if (atomic_cmpset_int(&swblk_zone_exhausted,
0, 1))
printf("swap blk zone exhausted, "
"increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
pause("swzonxb", 10);
} else
uma_zwait(swblk_zone);
VM_OBJECT_WLOCK(object);
sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
rdpi);
if (sb != NULL)
/*
* Somebody swapped out a nearby page,
* allocating swblk at the rdpi index,
* while we dropped the object lock.
*/
goto allocated;
}
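/*
 * Insert the freshly allocated swblk into the per-object trie.
 * The only failure mode of SWAP_PCTRIE_INSERT() here is a pctrie
 * node allocation failure, which is handled like the swblk_zone
 * exhaustion above: drop the object lock, wait (or trigger the
 * swap-zone OOM path), then re-check for a concurrent insertion.
 */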
for (;;) {
error = SWAP_PCTRIE_INSERT(
&object->un_pager.swp.swp_blks, sb);
if (error == 0) {
if (atomic_cmpset_int(&swpctrie_zone_exhausted,
1, 0))
printf("swpctrie zone ok\n");
break;
}
VM_OBJECT_WUNLOCK(object);
if (uma_zone_exhausted(swpctrie_zone)) {
if (atomic_cmpset_int(&swpctrie_zone_exhausted,
0, 1))
printf("swap pctrie zone exhausted, "
"increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
pause("swzonxp", 10);
} else
uma_zwait(swpctrie_zone);
VM_OBJECT_WLOCK(object);
sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
rdpi);
if (sb1 != NULL) {
uma_zfree(swblk_zone, sb);
sb = sb1;
goto allocated;
}
}
}
allocated:
MPASS(sb->p == rdpi);
modpi = pindex % SWAP_META_PAGES;
/* Return prior contents of metadata. */
prev_swapblk = sb->d[modpi];
/* Enter block into metadata. */
sb->d[modpi] = swapblk;
/*
* Free the swblk if we end up with an empty page run.
*/
if (swapblk == SWAPBLK_NONE)
swp_pager_free_empty_swblk(object, sb);
return (prev_swapblk);
}
/*
* SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap
* metadata, or transfer it into dstobject.
*
* This routine will free swap metadata structures as they are cleaned
* out.
*/
static void
swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
vm_pindex_t pindex, vm_pindex_t count)
{
struct swblk *sb;
daddr_t n_free, s_free;
vm_pindex_t offset, last;
int i, limit, start;
VM_OBJECT_ASSERT_WLOCKED(srcobject);
if (srcobject->type != OBJT_SWAP || count == 0)
return;
swp_pager_init_freerange(&s_free, &n_free);
offset = pindex;
last = pindex + count;
for (;;) {
sb = SWAP_PCTRIE_LOOKUP_GE(&srcobject->un_pager.swp.swp_blks,
rounddown(pindex, SWAP_META_PAGES));
if (sb == NULL || sb->p >= last)
break;
start = pindex > sb->p ? pindex - sb->p : 0;
limit = last - sb->p < SWAP_META_PAGES ? last - sb->p :
SWAP_META_PAGES;
for (i = start; i < limit; i++) {
if (sb->d[i] == SWAPBLK_NONE)
continue;
if (dstobject == NULL ||
!swp_pager_xfer_source(srcobject, dstobject,
sb->p + i - offset, sb->d[i])) {
swp_pager_update_freerange(&s_free, &n_free,
sb->d[i]);
}
sb->d[i] = SWAPBLK_NONE;
}
pindex = sb->p + SWAP_META_PAGES;
if (swp_pager_swblk_empty(sb, 0, start) &&
swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
SWAP_PCTRIE_REMOVE(&srcobject->un_pager.swp.swp_blks,
sb->p);
uma_zfree(swblk_zone, sb);
}
}
swp_pager_freeswapspace(s_free, n_free);
}
/*
* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
*
* The requested range of blocks is freed, with any associated swap
* returned to the swap bitmap.
*
* This routine will free swap metadata structures as they are cleaned
* out. This routine does *NOT* operate on swap metadata associated
* with resident pages.
*/
static void
swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
{
swp_pager_meta_transfer(object, NULL, pindex, count);
}
/*
* SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
*
* This routine locates and destroys all swap metadata associated with
* an object.
*/
static void
swp_pager_meta_free_all(vm_object_t object)
{
struct swblk *sb;
daddr_t n_free, s_free;
vm_pindex_t pindex;
int i;
VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type != OBJT_SWAP)
return;
swp_pager_init_freerange(&s_free, &n_free);
for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
&object->un_pager.swp.swp_blks, pindex)) != NULL;) {
pindex = sb->p + SWAP_META_PAGES;
for (i = 0; i < SWAP_META_PAGES; i++) {
if (sb->d[i] == SWAPBLK_NONE)
continue;
swp_pager_update_freerange(&s_free, &n_free, sb->d[i]);
}
SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
uma_zfree(swblk_zone, sb);
}
swp_pager_freeswapspace(s_free, n_free);
}
/*
* SWP_PAGER_META_LOOKUP() - look up a swapblk assignment in the swap
*	meta data.
*
* Returns the swapblk assigned to the given page index, or
* SWAPBLK_NONE if no such assignment exists.
*
* When acting on a busy resident page and paging is in progress, we
* have to wait until paging is complete but otherwise can act on the
* busy page.
*/
static daddr_t
swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex)
{
struct swblk *sb;
VM_OBJECT_ASSERT_LOCKED(object);
/*
* The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
KASSERT(object->type == OBJT_SWAP,
("Lookup object not swappable"));
sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
rounddown(pindex, SWAP_META_PAGES));
if (sb == NULL)
return (SWAPBLK_NONE);
return (sb->d[pindex % SWAP_META_PAGES]);
}
/*
* Returns the least page index which is greater than or equal to the
* parameter pindex and for which there is a swap block allocated.
* Returns object's size if the object's type is not swap or if there
* are no allocated swap blocks for the object after the requested
* pindex.
*/
vm_pindex_t
swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
{
struct swblk *sb;
int i;
VM_OBJECT_ASSERT_LOCKED(object);
if (object->type != OBJT_SWAP)
return (object->size);
sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
rounddown(pindex, SWAP_META_PAGES));
if (sb == NULL)
return (object->size);
if (sb->p < pindex) {
for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
if (sb->d[i] != SWAPBLK_NONE)
return (sb->p + i);
}
sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
roundup(pindex, SWAP_META_PAGES));
if (sb == NULL)
return (object->size);
}
for (i = 0; i < SWAP_META_PAGES; i++) {
if (sb->d[i] != SWAPBLK_NONE)
return (sb->p + i);
}
/*
* We get here if a swblk is present in the trie but it
* doesn't map any blocks.
*/
MPASS(0);
return (object->size);
}
/*
* System call swapon(name) enables swapping on device name,
* which must be in the swdevsw. Return EBUSY
* if already swapping on this device.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapon_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
sys_swapon(struct thread *td, struct swapon_args *uap)
{
struct vattr attr;
struct vnode *vp;
struct nameidata nd;
int error;
error = priv_check(td, PRIV_SWAPON);
if (error)
return (error);
sx_xlock(&swdev_syscall_lock);
/*
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
if (swblk_zone == NULL) {
error = ENOMEM;
goto done;
}
NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->name, td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vn_isdisk_error(vp, &error)) {
error = swapongeom(vp);
} else if (vp->v_type == VREG &&
(vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
(error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
/*
* Allow direct swapping to NFS regular files in the same
* way that nfs_mountroot() sets up diskless swapping.
*/
error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
}
if (error)
vrele(vp);
done:
sx_xunlock(&swdev_syscall_lock);
return (error);
}
/*
* Check that the total amount of swap currently configured does not
* exceed half the theoretical maximum. If it does, print a warning
* message.
*/
static void
swapon_check_swzone(void)
{
/* recommend using no more than half that amount */
if (swap_total > swap_maxpages / 2) {
printf("warning: total configured swap (%lu pages) "
"exceeds maximum recommended amount (%lu pages).\n",
swap_total, swap_maxpages / 2);
printf("warning: increase kern.maxswzone "
"or reduce amount of swap.\n");
}
}
static void
swaponsomething(struct vnode *vp, void *id, u_long nblks,
sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
{
struct swdevt *sp, *tsp;
daddr_t dvbase;
/*
* nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
* First chop nblks off to page-align it, then convert.
*
* sw->sw_nblks is in page-sized chunks now too.
*/
nblks &= ~(ctodb(1) - 1);
nblks = dbtoc(nblks);
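/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and
 * DEV_BSIZE == 512): ctodb(1) == 8, so a device reporting 1000003
 * DEV_BSIZE blocks is first trimmed to 1000000 blocks and then
 * converted by dbtoc() to 125000 page-sized swap blocks.
 */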
sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
sp->sw_blist = blist_create(nblks, M_WAITOK);
sp->sw_vp = vp;
sp->sw_id = id;
sp->sw_dev = dev;
sp->sw_nblks = nblks;
sp->sw_used = 0;
sp->sw_strategy = strategy;
sp->sw_close = close;
sp->sw_flags = flags;
/*
* Do not free the first blocks in order to avoid overwriting
* any BSD label at the front of the partition.
*/
blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE),
nblks - howmany(BBSIZE, PAGE_SIZE));
dvbase = 0;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(tsp, &swtailq, sw_list) {
if (tsp->sw_end >= dvbase) {
/*
* We put one uncovered page between the devices
* in order to definitively prevent any cross-device
* I/O requests
*/
dvbase = tsp->sw_end + 1;
}
}
sp->sw_first = dvbase;
sp->sw_end = dvbase + nblks;
TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
nswapdev++;
swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE);
swap_total += nblks;
swapon_check_swzone();
swp_sizecheck();
mtx_unlock(&sw_dev_mtx);
EVENTHANDLER_INVOKE(swapon, sp);
}
/*
* SYSCALL: swapoff(devname)
*
* Disable swapping on the given device.
*
* XXX: Badly designed system call: it should use a device index
* rather than filename as specification. We keep sw_vp around
* only to make this work.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapoff_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
sys_swapoff(struct thread *td, struct swapoff_args *uap)
{
struct vnode *vp;
struct nameidata nd;
struct swdevt *sp;
int error;
error = priv_check(td, PRIV_SWAPOFF);
if (error)
return (error);
sx_xlock(&swdev_syscall_lock);
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_vp == vp)
break;
}
mtx_unlock(&sw_dev_mtx);
if (sp == NULL) {
error = EINVAL;
goto done;
}
error = swapoff_one(sp, td->td_ucred);
done:
sx_xunlock(&swdev_syscall_lock);
return (error);
}
static int
swapoff_one(struct swdevt *sp, struct ucred *cred)
{
u_long nblks;
#ifdef MAC
int error;
#endif
sx_assert(&swdev_syscall_lock, SA_XLOCKED);
#ifdef MAC
(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_system_check_swapoff(cred, sp->sw_vp);
(void) VOP_UNLOCK(sp->sw_vp);
if (error != 0)
return (error);
#endif
nblks = sp->sw_nblks;
/*
* We can turn off this swap device safely only if the
* available virtual memory in the system will fit the amount
* of data we will have to page back in, plus an epsilon so
* the system doesn't become critically low on swap space.
*/
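/*
 * Arithmetic sketch (values hypothetical): with 200000 free pages,
 * 300000 pages still available on the remaining swap devices, and a
 * 450000-page device being removed, 200000 + 300000 >= 450000 +
 * nswap_lowat holds whenever the low watermark is below 50000 pages,
 * so the swapoff is allowed to proceed.
 */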
if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
return (ENOMEM);
/*
* Prevent further allocations on this device.
*/
mtx_lock(&sw_dev_mtx);
sp->sw_flags |= SW_CLOSING;
swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
swap_total -= nblks;
mtx_unlock(&sw_dev_mtx);
/*
* Page in the contents of the device and close it.
*/
swap_pager_swapoff(sp);
sp->sw_close(curthread, sp);
mtx_lock(&sw_dev_mtx);
sp->sw_id = NULL;
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
if (nswapdev == 0) {
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
blist_destroy(sp->sw_blist);
free(sp, M_VMPGDATA);
return (0);
}
void
swapoff_all(void)
{
struct swdevt *sp, *spt;
const char *devname;
int error;
sx_xlock(&swdev_syscall_lock);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
mtx_unlock(&sw_dev_mtx);
if (vn_isdisk(sp->sw_vp))
devname = devtoname(sp->sw_vp->v_rdev);
else
devname = "[file]";
error = swapoff_one(sp, thread0.td_ucred);
if (error != 0) {
printf("Cannot remove swap device %s (error=%d), "
"skipping.\n", devname, error);
} else if (bootverbose) {
printf("Swap device %s removed.\n", devname);
}
mtx_lock(&sw_dev_mtx);
}
mtx_unlock(&sw_dev_mtx);
sx_xunlock(&swdev_syscall_lock);
}
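/*
 * Report the total and used swap space in pages.  The pages reserved
 * at the front of each device for a possible disk label (never freed
 * into the blist by swaponsomething()) are excluded from the "used"
 * figure by subtracting nswapdev * howmany(BBSIZE, PAGE_SIZE).
 */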
void
swap_pager_status(int *total, int *used)
{
*total = swap_total;
*used = swap_total - swap_pager_avail -
nswapdev * howmany(BBSIZE, PAGE_SIZE);
}
int
swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
{
struct swdevt *sp;
const char *tmp_devname;
int error, n;
n = 0;
error = ENOENT;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (n != name) {
n++;
continue;
}
xs->xsw_version = XSWDEV_VERSION;
xs->xsw_dev = sp->sw_dev;
xs->xsw_flags = sp->sw_flags;
xs->xsw_nblks = sp->sw_nblks;
xs->xsw_used = sp->sw_used;
if (devname != NULL) {
if (vn_isdisk(sp->sw_vp))
tmp_devname = devtoname(sp->sw_vp->v_rdev);
else
tmp_devname = "[file]";
strncpy(devname, tmp_devname, len);
}
error = 0;
break;
}
mtx_unlock(&sw_dev_mtx);
return (error);
}
#if defined(COMPAT_FREEBSD11)
#define XSWDEV_VERSION_11 1
struct xswdev11 {
u_int xsw_version;
uint32_t xsw_dev;
int xsw_flags;
int xsw_nblks;
int xsw_used;
};
#endif
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
struct xswdev32 {
u_int xsw_version;
u_int xsw_dev1, xsw_dev2;
int xsw_flags;
int xsw_nblks;
int xsw_used;
};
#endif
static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
struct xswdev xs;
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
struct xswdev32 xs32;
#endif
#if defined(COMPAT_FREEBSD11)
struct xswdev11 xs11;
#endif
int error;
if (arg2 != 1) /* name length */
return (EINVAL);
error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
if (error != 0)
return (error);
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
if (req->oldlen == sizeof(xs32)) {
xs32.xsw_version = XSWDEV_VERSION;
xs32.xsw_dev1 = xs.xsw_dev;
xs32.xsw_dev2 = xs.xsw_dev >> 32;
xs32.xsw_flags = xs.xsw_flags;
xs32.xsw_nblks = xs.xsw_nblks;
xs32.xsw_used = xs.xsw_used;
error = SYSCTL_OUT(req, &xs32, sizeof(xs32));
return (error);
}
#endif
#if defined(COMPAT_FREEBSD11)
if (req->oldlen == sizeof(xs11)) {
xs11.xsw_version = XSWDEV_VERSION_11;
xs11.xsw_dev = xs.xsw_dev; /* truncation */
xs11.xsw_flags = xs.xsw_flags;
xs11.xsw_nblks = xs.xsw_nblks;
xs11.xsw_used = xs.xsw_used;
error = SYSCTL_OUT(req, &xs11, sizeof(xs11));
return (error);
}
#endif
error = SYSCTL_OUT(req, &xs, sizeof(xs));
return (error);
}
SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
"Number of swap devices");
SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_vm_swap_info,
"Swap statistics by device");
/*
* Count the approximate swap usage in pages for a vmspace.  Swap
* blocks that are shadowed or not yet copied on write are not counted.
* The map must be locked.
*/
long
vmspace_swap_count(struct vmspace *vmspace)
{
vm_map_t map;
vm_map_entry_t cur;
vm_object_t object;
struct swblk *sb;
vm_pindex_t e, pi;
long count;
int i;
map = &vmspace->vm_map;
count = 0;
VM_MAP_ENTRY_FOREACH(cur, map) {
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
continue;
object = cur->object.vm_object;
if (object == NULL || object->type != OBJT_SWAP)
continue;
VM_OBJECT_RLOCK(object);
if (object->type != OBJT_SWAP)
goto unlock;
pi = OFF_TO_IDX(cur->offset);
e = pi + OFF_TO_IDX(cur->end - cur->start);
for (;; pi = sb->p + SWAP_META_PAGES) {
sb = SWAP_PCTRIE_LOOKUP_GE(
&object->un_pager.swp.swp_blks, pi);
if (sb == NULL || sb->p >= e)
break;
for (i = 0; i < SWAP_META_PAGES; i++) {
if (sb->p + i < e &&
sb->d[i] != SWAPBLK_NONE)
count++;
}
}
unlock:
VM_OBJECT_RUNLOCK(object);
}
return (count);
}
/*
* GEOM backend
*
* Swapping onto disk devices.
*
*/
static g_orphan_t swapgeom_orphan;
static struct g_class g_swap_class = {
.name = "SWAP",
.version = G_VERSION,
.orphan = swapgeom_orphan,
};
DECLARE_GEOM_CLASS(g_swap_class, g_class);
static void
swapgeom_close_ev(void *arg, int flags)
{
struct g_consumer *cp;
cp = arg;
g_access(cp, -1, -1, 0);
g_detach(cp);
g_destroy_consumer(cp);
}
/*
* Add a reference to the g_consumer for an inflight transaction.
*/
static void
swapgeom_acquire(struct g_consumer *cp)
{
mtx_assert(&sw_dev_mtx, MA_OWNED);
cp->index++;
}
/*
* Remove a reference from the g_consumer. Post a close event if all
* references go away, since the function might be called from the
* biodone context.
*/
static void
swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
{
mtx_assert(&sw_dev_mtx, MA_OWNED);
cp->index--;
if (cp->index == 0) {
if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
sp->sw_id = NULL;
}
}
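/*
 * Reference sketch: cp->index starts at 1 in swapongeom_locked()
 * ("plus one for being active"), is incremented by swapgeom_acquire()
 * for every in-flight bio and decremented by swapgeom_release() from
 * swapgeom_done().  The base reference is dropped by swapgeom_orphan();
 * whichever path brings the count to zero posts (or runs)
 * swapgeom_close_ev() to detach and destroy the consumer.
 */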
static void
swapgeom_done(struct bio *bp2)
{
struct swdevt *sp;
struct buf *bp;
struct g_consumer *cp;
bp = bp2->bio_caller2;
cp = bp2->bio_from;
bp->b_ioflags = bp2->bio_flags;
if (bp2->bio_error)
bp->b_ioflags |= BIO_ERROR;
bp->b_resid = bp->b_bcount - bp2->bio_completed;
bp->b_error = bp2->bio_error;
bp->b_caller1 = NULL;
bufdone(bp);
sp = bp2->bio_caller1;
mtx_lock(&sw_dev_mtx);
swapgeom_release(cp, sp);
mtx_unlock(&sw_dev_mtx);
g_destroy_bio(bp2);
}
static void
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
{
struct bio *bio;
struct g_consumer *cp;
mtx_lock(&sw_dev_mtx);
cp = sp->sw_id;
if (cp == NULL) {
mtx_unlock(&sw_dev_mtx);
bp->b_error = ENXIO;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
swapgeom_acquire(cp);
mtx_unlock(&sw_dev_mtx);
if (bp->b_iocmd == BIO_WRITE)
bio = g_new_bio();
else
bio = g_alloc_bio();
if (bio == NULL) {
mtx_lock(&sw_dev_mtx);
swapgeom_release(cp, sp);
mtx_unlock(&sw_dev_mtx);
bp->b_error = ENOMEM;
bp->b_ioflags |= BIO_ERROR;
printf("swap_pager: cannot allocate bio\n");
bufdone(bp);
return;
}
bp->b_caller1 = bio;
bio->bio_caller1 = sp;
bio->bio_caller2 = bp;
bio->bio_cmd = bp->b_iocmd;
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
bio->bio_length = bp->b_bcount;
bio->bio_done = swapgeom_done;
if (!buf_mapped(bp)) {
bio->bio_ma = bp->b_pages;
bio->bio_data = unmapped_buf;
bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
bio->bio_ma_n = bp->b_npages;
bio->bio_flags |= BIO_UNMAPPED;
} else {
bio->bio_data = bp->b_data;
bio->bio_ma = NULL;
}
g_io_request(bio, cp);
return;
}
static void
swapgeom_orphan(struct g_consumer *cp)
{
struct swdevt *sp;
int destroy;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_id == cp) {
sp->sw_flags |= SW_CLOSING;
break;
}
}
/*
* Drop the reference we were created with. Do it directly, since we're in a
* special context where we don't have to queue the call to
* swapgeom_close_ev().
*/
cp->index--;
destroy = ((sp != NULL) && (cp->index == 0));
if (destroy)
sp->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
if (destroy)
swapgeom_close_ev(cp, 0);
}
static void
swapgeom_close(struct thread *td, struct swdevt *sw)
{
struct g_consumer *cp;
mtx_lock(&sw_dev_mtx);
cp = sw->sw_id;
sw->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
/*
* swapgeom_close() may be called from the biodone context,
* where we cannot perform topology changes. Delegate the
* work to the events thread.
*/
if (cp != NULL)
g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
}
static int
swapongeom_locked(struct cdev *dev, struct vnode *vp)
{
struct g_provider *pp;
struct g_consumer *cp;
static struct g_geom *gp;
struct swdevt *sp;
u_long nblks;
int error;
pp = g_dev_getprovider(dev);
if (pp == NULL)
return (ENODEV);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
cp = sp->sw_id;
if (cp != NULL && cp->provider == pp) {
mtx_unlock(&sw_dev_mtx);
return (EBUSY);
}
}
mtx_unlock(&sw_dev_mtx);
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap");
cp = g_new_consumer(gp);
cp->index = 1; /* Number of active I/Os, plus one for being active. */
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
/*
* XXX: Every time you think you can improve the margin for
* footshooting, somebody depends on the ability to do so:
* savecore(8) wants to write to our swapdev so we cannot
* set an exclusive count :-(
*/
error = g_access(cp, 1, 1, 0);
if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (error);
}
nblks = pp->mediasize / DEV_BSIZE;
swaponsomething(vp, cp, nblks, swapgeom_strategy,
swapgeom_close, dev2udev(dev),
(pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
return (0);
}
static int
swapongeom(struct vnode *vp)
{
int error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) {
error = ENOENT;
} else {
g_topology_lock();
error = swapongeom_locked(vp->v_rdev, vp);
g_topology_unlock();
}
VOP_UNLOCK(vp);
return (error);
}
/*
* VNODE backend
*
* This is used mainly for network filesystem (read: probably only tested
* with NFS) swapfiles.
*
*/
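/*
 * Translate a swap-pager buf into vnode I/O: convert the
 * device-relative page number in b_blkno to DEV_BSIZE blocks, point
 * the buf at the swap vnode's bufobj (moving write accounting over
 * for BIO_WRITE), set b_iooffset and call bstrategy().
 */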
static void
swapdev_strategy(struct buf *bp, struct swdevt *sp)
{
struct vnode *vp2;
bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
vp2 = sp->sw_id;
vhold(vp2);
if (bp->b_iocmd == BIO_WRITE) {
if (bp->b_bufobj)
bufobj_wdrop(bp->b_bufobj);
bufobj_wref(&vp2->v_bufobj);
}
if (bp->b_bufobj != &vp2->v_bufobj)
bp->b_bufobj = &vp2->v_bufobj;
bp->b_vp = vp2;
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
return;
}
static void
swapdev_close(struct thread *td, struct swdevt *sp)
{
VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
vrele(sp->sw_vp);
}
static int
swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
{
struct swdevt *sp;
int error;
if (nblks == 0)
return (ENXIO);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_id == vp) {
mtx_unlock(&sw_dev_mtx);
return (EBUSY);
}
}
mtx_unlock(&sw_dev_mtx);
(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_system_check_swapon(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
(void) VOP_UNLOCK(vp);
if (error)
return (error);
swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
NODEV, 0);
return (0);
}
static int
sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
{
int error, new, n;
new = nsw_wcount_async_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (new > nswbuf / 2 || new < 1)
return (EINVAL);
mtx_lock(&swbuf_mtx);
while (nsw_wcount_async_max != new) {
/*
* Adjust difference. If the current async count is too low,
* we will need to squeeze our update in slowly. Sleep with a
* higher priority than getpbuf() to finish faster.
*/
n = new - nsw_wcount_async_max;
if (nsw_wcount_async + n >= 0) {
nsw_wcount_async += n;
nsw_wcount_async_max += n;
wakeup(&nsw_wcount_async);
} else {
nsw_wcount_async_max -= nsw_wcount_async;
nsw_wcount_async = 0;
msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
"swpsysctl", 0);
}
}
mtx_unlock(&swbuf_mtx);
return (0);
}
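/*
 * Maintain un_pager.swp.writemappings, the number of bytes of this
 * object currently mapped for write.  Only non-anonymous swap objects
 * use this accounting, as the assertions below require.
 */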
static void
swap_pager_update_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
VM_OBJECT_WLOCK(object);
KASSERT((object->flags & OBJ_ANON) == 0,
("Splittable object with writecount"));
object->un_pager.swp.writemappings += (vm_ooffset_t)end - start;
VM_OBJECT_WUNLOCK(object);
}
static void
swap_pager_release_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
VM_OBJECT_WLOCK(object);
KASSERT((object->flags & OBJ_ANON) == 0,
("Splittable object with writecount"));
object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start;
VM_OBJECT_WUNLOCK(object);
}
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 81bcfb6b58b2..da7b1f1d2d8e 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1,2080 +1,2079 @@
/*-
* SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
*
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Page fault handling module.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_reserv.h>
#define PFBAK 4
#define PFFOR 4
#define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT)
-#define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
#define VM_FAULT_DONTNEED_MIN 1048576
struct faultstate {
/* Fault parameters. */
vm_offset_t vaddr;
vm_page_t *m_hold;
vm_prot_t fault_type;
vm_prot_t prot;
int fault_flags;
int oom;
boolean_t wired;
/* Page reference for cow. */
vm_page_t m_cow;
/* Current object. */
vm_object_t object;
vm_pindex_t pindex;
vm_page_t m;
/* Top-level map object. */
vm_object_t first_object;
vm_pindex_t first_pindex;
vm_page_t first_m;
/* Map state. */
vm_map_t map;
vm_map_entry_t entry;
int map_generation;
bool lookup_still_valid;
/* Vnode if locked. */
struct vnode *vp;
};
static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
int ahead);
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
int backward, int forward, bool obj_locked);
static int vm_pfault_oom_attempts = 3;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
&vm_pfault_oom_attempts, 0,
"Number of page allocation attempts in page fault handler before it "
"triggers OOM handling");
static int vm_pfault_oom_wait = 10;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
&vm_pfault_oom_wait, 0,
"Number of seconds to wait for free pages before retrying "
"the page fault handler");
static inline void
fault_page_release(vm_page_t *mp)
{
vm_page_t m;
m = *mp;
if (m != NULL) {
/*
* We are likely to loop around again and attempt to busy
* this page. Deactivating it leaves it available for
* pageout while optimizing fault restarts.
*/
vm_page_deactivate(m);
vm_page_xunbusy(m);
*mp = NULL;
}
}
static inline void
fault_page_free(vm_page_t *mp)
{
vm_page_t m;
m = *mp;
if (m != NULL) {
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_wired(m))
vm_page_free(m);
else
vm_page_xunbusy(m);
*mp = NULL;
}
}
static inline void
unlock_map(struct faultstate *fs)
{
if (fs->lookup_still_valid) {
vm_map_lookup_done(fs->map, fs->entry);
fs->lookup_still_valid = false;
}
}
static void
unlock_vp(struct faultstate *fs)
{
if (fs->vp != NULL) {
vput(fs->vp);
fs->vp = NULL;
}
}
static void
fault_deallocate(struct faultstate *fs)
{
fault_page_release(&fs->m_cow);
fault_page_release(&fs->m);
vm_object_pip_wakeup(fs->object);
if (fs->object != fs->first_object) {
VM_OBJECT_WLOCK(fs->first_object);
fault_page_free(&fs->first_m);
VM_OBJECT_WUNLOCK(fs->first_object);
vm_object_pip_wakeup(fs->first_object);
}
vm_object_deallocate(fs->first_object);
unlock_map(fs);
unlock_vp(fs);
}
static void
unlock_and_deallocate(struct faultstate *fs)
{
VM_OBJECT_WUNLOCK(fs->object);
fault_deallocate(fs);
}
static void
vm_fault_dirty(struct faultstate *fs, vm_page_t m)
{
bool need_dirty;
if (((fs->prot & VM_PROT_WRITE) == 0 &&
(fs->fault_flags & VM_FAULT_DIRTY) == 0) ||
(m->oflags & VPO_UNMANAGED) != 0)
return;
VM_PAGE_OBJECT_BUSY_ASSERT(m);
need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 &&
(fs->fault_flags & VM_FAULT_WIRE) == 0) ||
(fs->fault_flags & VM_FAULT_DIRTY) != 0;
vm_object_set_writeable_dirty(m->object);
/*
* If the fault is a write, we know that this page is being
* written NOW so dirty it explicitly to save on
* pmap_is_modified() calls later.
*
* Also, since the page is now dirty, we can possibly tell
* the pager to release any swap backing the page.
*/
if (need_dirty && vm_page_set_dirty(m) == 0) {
/*
* If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
* if the page is already dirty to prevent data written with
* the expectation of being synced from not being synced.
* Likewise if this entry does not request NOSYNC then make
* sure the page isn't marked NOSYNC. Applications sharing
* data should use the same flags to avoid ping ponging.
*/
if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0)
vm_page_aflag_set(m, PGA_NOSYNC);
else
vm_page_aflag_clear(m, PGA_NOSYNC);
}
}
/*
* Unlocks fs.first_object and fs.map on success.
*/
static int
vm_fault_soft_fast(struct faultstate *fs)
{
vm_page_t m, m_map;
#if VM_NRESERVLEVEL > 0
vm_page_t m_super;
int flags;
#endif
int psind, rv;
vm_offset_t vaddr;
MPASS(fs->vp == NULL);
vaddr = fs->vaddr;
vm_object_busy(fs->first_object);
m = vm_page_lookup(fs->first_object, fs->first_pindex);
/* A busy page can be mapped for read|execute access. */
if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 &&
vm_page_busied(m)) || !vm_page_all_valid(m)) {
rv = KERN_FAILURE;
goto out;
}
m_map = m;
psind = 0;
#if VM_NRESERVLEVEL > 0
if ((m->flags & PG_FICTITIOUS) == 0 &&
(m_super = vm_reserv_to_superpage(m)) != NULL &&
rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
(vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
(pagesizes[m_super->psind] - 1)) && !fs->wired &&
pmap_ps_enabled(fs->map->pmap)) {
flags = PS_ALL_VALID;
if ((fs->prot & VM_PROT_WRITE) != 0) {
/*
* Create a superpage mapping allowing write access
* only if none of the constituent pages are busy and
* all of them are already dirty (except possibly for
* the page that was faulted on).
*/
flags |= PS_NONE_BUSY;
if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
flags |= PS_ALL_DIRTY;
}
if (vm_page_ps_test(m_super, flags, m)) {
m_map = m_super;
psind = m_super->psind;
vaddr = rounddown2(vaddr, pagesizes[psind]);
/* Preset the modified bit for dirty superpages. */
if ((flags & PS_ALL_DIRTY) != 0)
fs->fault_type |= VM_PROT_WRITE;
}
}
#endif
rv = pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type |
PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind);
if (rv != KERN_SUCCESS)
goto out;
if (fs->m_hold != NULL) {
(*fs->m_hold) = m;
vm_page_wire(m);
}
if (psind == 0 && !fs->wired)
vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
VM_OBJECT_RUNLOCK(fs->first_object);
vm_fault_dirty(fs, m);
vm_map_lookup_done(fs->map, fs->entry);
curthread->td_ru.ru_minflt++;
out:
vm_object_unbusy(fs->first_object);
return (rv);
}
static void
vm_fault_restore_map_lock(struct faultstate *fs)
{
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
if (!vm_map_trylock_read(fs->map)) {
VM_OBJECT_WUNLOCK(fs->first_object);
vm_map_lock_read(fs->map);
VM_OBJECT_WLOCK(fs->first_object);
}
fs->lookup_still_valid = true;
}
static void
vm_fault_populate_check_page(vm_page_t m)
{
/*
* Check each page to ensure that the pager is obeying the
* interface: the page must be installed in the object, fully
* valid, and exclusively busied.
*/
MPASS(m != NULL);
MPASS(vm_page_all_valid(m));
MPASS(vm_page_xbusied(m));
}
static void
vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
vm_pindex_t last)
{
vm_page_t m;
vm_pindex_t pidx;
VM_OBJECT_ASSERT_WLOCKED(object);
MPASS(first <= last);
for (pidx = first, m = vm_page_lookup(object, pidx);
pidx <= last; pidx++, m = vm_page_next(m)) {
vm_fault_populate_check_page(m);
vm_page_deactivate(m);
vm_page_xunbusy(m);
}
}
static int
vm_fault_populate(struct faultstate *fs)
{
vm_offset_t vaddr;
vm_page_t m;
vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
int bdry_idx, i, npages, psind, rv;
MPASS(fs->object == fs->first_object);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
MPASS(fs->first_object->backing_object == NULL);
MPASS(fs->lookup_still_valid);
pager_first = OFF_TO_IDX(fs->entry->offset);
pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
unlock_map(fs);
unlock_vp(fs);
/*
* Call the pager (driver) populate() method.
*
* There is no guarantee that the method will be called again
* if the current fault is for read, and a future fault is
* for write. Report the entry's maximum allowed protection
* to the driver.
*/
rv = vm_pager_populate(fs->first_object, fs->first_pindex,
fs->fault_type, fs->entry->max_protection, &pager_first,
&pager_last);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
if (rv == VM_PAGER_BAD) {
/*
* VM_PAGER_BAD is the backdoor for a pager to request
* normal fault handling.
*/
vm_fault_restore_map_lock(fs);
if (fs->map->timestamp != fs->map_generation)
return (KERN_RESTART);
return (KERN_NOT_RECEIVER);
}
if (rv != VM_PAGER_OK)
return (KERN_FAILURE); /* AKA SIGSEGV */
/* Ensure that the driver is obeying the interface. */
MPASS(pager_first <= pager_last);
MPASS(fs->first_pindex <= pager_last);
MPASS(fs->first_pindex >= pager_first);
MPASS(pager_last < fs->first_object->size);
vm_fault_restore_map_lock(fs);
bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
if (fs->map->timestamp != fs->map_generation) {
if (bdry_idx == 0) {
vm_fault_populate_cleanup(fs->first_object, pager_first,
pager_last);
} else {
m = vm_page_lookup(fs->first_object, pager_first);
if (m != fs->m)
vm_page_xunbusy(m);
}
return (KERN_RESTART);
}
/*
* The map is unchanged after our last unlock. Process the fault.
*
* First, the special case of largepage mappings, where
* populate only busies the first page in superpage run.
*/
if (bdry_idx != 0) {
KASSERT(PMAP_HAS_LARGEPAGES,
("missing pmap support for large pages"));
m = vm_page_lookup(fs->first_object, pager_first);
vm_fault_populate_check_page(m);
VM_OBJECT_WUNLOCK(fs->first_object);
vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
fs->entry->offset;
/* assert alignment for entry */
KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
(uintmax_t)fs->entry->start, (uintmax_t)pager_first,
(uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
("unaligned superpage m %p %#jx", m,
(uintmax_t)VM_PAGE_TO_PHYS(m)));
rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) |
PMAP_ENTER_LARGEPAGE, bdry_idx);
VM_OBJECT_WLOCK(fs->first_object);
vm_page_xunbusy(m);
if ((fs->fault_flags & VM_FAULT_WIRE) != 0) {
for (i = 0; i < atop(pagesizes[bdry_idx]); i++)
vm_page_wire(m + i);
}
if (fs->m_hold != NULL) {
*fs->m_hold = m + (fs->first_pindex - pager_first);
vm_page_wire(*fs->m_hold);
}
goto out;
}
/*
* The range [pager_first, pager_last] that is given to the
* pager is only a hint. The pager may populate any range
* within the object that includes the requested page index.
* In case the pager expanded the range, clip it to fit into
* the map entry.
*/
map_first = OFF_TO_IDX(fs->entry->offset);
if (map_first > pager_first) {
vm_fault_populate_cleanup(fs->first_object, pager_first,
map_first - 1);
pager_first = map_first;
}
map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
if (map_last < pager_last) {
vm_fault_populate_cleanup(fs->first_object, map_last + 1,
pager_last);
pager_last = map_last;
}
for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
pidx <= pager_last;
pidx += npages, m = vm_page_next(&m[npages - 1])) {
vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
#if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
__ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \
defined(__powerpc64__)
psind = m->psind;
if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
!pmap_ps_enabled(fs->map->pmap) || fs->wired))
psind = 0;
#else
psind = 0;
#endif
npages = atop(pagesizes[psind]);
for (i = 0; i < npages; i++) {
vm_fault_populate_check_page(&m[i]);
vm_fault_dirty(fs, &m[i]);
}
VM_OBJECT_WUNLOCK(fs->first_object);
rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type |
(fs->wired ? PMAP_ENTER_WIRED : 0), psind);
#if defined(__amd64__)
if (psind > 0 && rv == KERN_FAILURE) {
for (i = 0; i < npages; i++) {
rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
&m[i], fs->prot, fs->fault_type |
(fs->wired ? PMAP_ENTER_WIRED : 0), 0);
MPASS(rv == KERN_SUCCESS);
}
}
#else
MPASS(rv == KERN_SUCCESS);
#endif
VM_OBJECT_WLOCK(fs->first_object);
for (i = 0; i < npages; i++) {
if ((fs->fault_flags & VM_FAULT_WIRE) != 0)
vm_page_wire(&m[i]);
else
vm_page_activate(&m[i]);
if (fs->m_hold != NULL && m[i].pindex == fs->first_pindex) {
(*fs->m_hold) = &m[i];
vm_page_wire(&m[i]);
}
vm_page_xunbusy(&m[i]);
}
}
out:
curthread->td_ru.ru_majflt++;
return (KERN_SUCCESS);
}
static int prot_fault_translation;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
&prot_fault_translation, 0,
"Control signal to deliver on protection fault");
/* compat definition to keep common code for signal translation */
#define UCODE_PAGEFLT 12
#ifdef T_PAGEFLT
_Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
#endif
/*
* vm_fault_trap:
*
* Handle a page fault occurring at the given address,
* requiring the given permissions, in the map specified.
* If successful, the page is inserted into the
* associated physical map.
*
* NOTE: the given address should be truncated to the
* proper page address.
*
* KERN_SUCCESS is returned if the page fault is handled; otherwise,
* a standard error specifying why the fault is fatal is returned.
*
* The map in question must be referenced, and remains so.
* Caller may hold no locks.
*/
int
vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, int *signo, int *ucode)
{
int result;
MPASS(signo == NULL || ucode != NULL);
#ifdef KTRACE
if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
ktrfault(vaddr, fault_type);
#endif
result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags,
NULL);
KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE ||
result == KERN_INVALID_ADDRESS ||
result == KERN_RESOURCE_SHORTAGE ||
result == KERN_PROTECTION_FAILURE ||
result == KERN_OUT_OF_BOUNDS,
("Unexpected Mach error %d from vm_fault()", result));
#ifdef KTRACE
if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
ktrfaultend(result);
#endif
if (result != KERN_SUCCESS && signo != NULL) {
switch (result) {
case KERN_FAILURE:
case KERN_INVALID_ADDRESS:
*signo = SIGSEGV;
*ucode = SEGV_MAPERR;
break;
case KERN_RESOURCE_SHORTAGE:
*signo = SIGBUS;
*ucode = BUS_OOMERR;
break;
case KERN_OUT_OF_BOUNDS:
*signo = SIGBUS;
*ucode = BUS_OBJERR;
break;
case KERN_PROTECTION_FAILURE:
if (prot_fault_translation == 0) {
/*
* Autodetect. This check also covers
* the images without the ABI-tag ELF
* note.
*/
if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
curproc->p_osrel >= P_OSREL_SIGSEGV) {
*signo = SIGSEGV;
*ucode = SEGV_ACCERR;
} else {
*signo = SIGBUS;
*ucode = UCODE_PAGEFLT;
}
} else if (prot_fault_translation == 1) {
/* Always compat mode. */
*signo = SIGBUS;
*ucode = UCODE_PAGEFLT;
} else {
/* Always SIGSEGV mode. */
*signo = SIGSEGV;
*ucode = SEGV_ACCERR;
}
break;
default:
KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
result));
break;
}
}
return (result);
}
static int
vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
{
struct vnode *vp;
int error, locked;
if (fs->object->type != OBJT_VNODE)
return (KERN_SUCCESS);
vp = fs->object->handle;
if (vp == fs->vp) {
ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
return (KERN_SUCCESS);
}
/*
* Perform an unlock in case the desired vnode changed while
* the map was unlocked during a retry.
*/
unlock_vp(fs);
locked = VOP_ISLOCKED(vp);
if (locked != LK_EXCLUSIVE)
locked = LK_SHARED;
/*
* We must not sleep acquiring the vnode lock while we have
* the page exclusive busied or the object's
* paging-in-progress count incremented. Otherwise, we could
* deadlock.
*/
error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT);
if (error == 0) {
fs->vp = vp;
return (KERN_SUCCESS);
}
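/*
 * The non-blocking vget() failed.  Hold the vnode, release all fault
 * state so that we do not sleep with a busy page or a
 * paging-in-progress reference, acquire the vnode lock for real, and
 * return KERN_RESOURCE_SHORTAGE so the caller restarts the fault.
 */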
vhold(vp);
if (objlocked)
unlock_and_deallocate(fs);
else
fault_deallocate(fs);
error = vget(vp, locked | LK_RETRY | LK_CANRECURSE);
vdrop(vp);
fs->vp = vp;
KASSERT(error == 0, ("vm_fault: vget failed %d", error));
return (KERN_RESOURCE_SHORTAGE);
}
/*
* Calculate the desired readahead. Handle drop-behind.
*
* Returns the number of readahead blocks to pass to the pager.
*/
static int
vm_fault_readahead(struct faultstate *fs)
{
int era, nera;
u_char behavior;
KASSERT(fs->lookup_still_valid, ("map unlocked"));
era = fs->entry->read_ahead;
behavior = vm_map_entry_behavior(fs->entry);
if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
nera = 0;
} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
nera = VM_FAULT_READ_AHEAD_MAX;
if (fs->vaddr == fs->entry->next_read)
vm_fault_dontneed(fs, fs->vaddr, nera);
} else if (fs->vaddr == fs->entry->next_read) {
/*
* This is a sequential fault. Arithmetically
* increase the requested number of pages in
* the read-ahead window. The requested
* number of pages is "# of sequential faults
* x (read ahead min + 1) + read ahead min"
*/
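/*
 * Growth example (illustrative, taking VM_FAULT_READ_AHEAD_MIN == 7):
 * successive sequential faults yield read-ahead windows of 7, then
 * 7 + 7 + 1 = 15, then 7 + 15 + 1 = 23 pages, and so on, until the
 * window is clamped at VM_FAULT_READ_AHEAD_MAX.
 */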
nera = VM_FAULT_READ_AHEAD_MIN;
if (era > 0) {
nera += era + 1;
if (nera > VM_FAULT_READ_AHEAD_MAX)
nera = VM_FAULT_READ_AHEAD_MAX;
}
if (era == VM_FAULT_READ_AHEAD_MAX)
vm_fault_dontneed(fs, fs->vaddr, nera);
} else {
/*
* This is a non-sequential fault.
*/
nera = 0;
}
if (era != nera) {
/*
* A read lock on the map suffices to update
* the read ahead count safely.
*/
fs->entry->read_ahead = nera;
}
return (nera);
}
static int
vm_fault_lookup(struct faultstate *fs)
{
int result;
KASSERT(!fs->lookup_still_valid,
("vm_fault_lookup: Map already locked."));
result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type |
VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object,
&fs->first_pindex, &fs->prot, &fs->wired);
if (result != KERN_SUCCESS) {
unlock_vp(fs);
return (result);
}
fs->map_generation = fs->map->timestamp;
if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
panic("%s: fault on nofault entry, addr: %#lx",
__func__, (u_long)fs->vaddr);
}
if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION &&
fs->entry->wiring_thread != curthread) {
vm_map_unlock_read(fs->map);
vm_map_lock(fs->map);
if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) &&
(fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
unlock_vp(fs);
fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
vm_map_unlock_and_wait(fs->map, 0);
} else
vm_map_unlock(fs->map);
return (KERN_RESOURCE_SHORTAGE);
}
MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0);
if (fs->wired)
fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY);
else
KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0,
("!fs->wired && VM_FAULT_WIRE"));
fs->lookup_still_valid = true;
return (KERN_SUCCESS);
}
static int
vm_fault_relookup(struct faultstate *fs)
{
vm_object_t retry_object;
vm_pindex_t retry_pindex;
vm_prot_t retry_prot;
int result;
if (!vm_map_trylock_read(fs->map))
return (KERN_RESTART);
fs->lookup_still_valid = true;
if (fs->map->timestamp == fs->map_generation)
return (KERN_SUCCESS);
result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type,
&fs->entry, &retry_object, &retry_pindex, &retry_prot,
&fs->wired);
if (result != KERN_SUCCESS) {
/*
* If retry of map lookup would have blocked then
* retry fault from start.
*/
if (result == KERN_FAILURE)
return (KERN_RESTART);
return (result);
}
if (retry_object != fs->first_object ||
retry_pindex != fs->first_pindex)
return (KERN_RESTART);
/*
* Check whether the protection has changed or the object has
* been copied while we left the map unlocked. Changing from
* read to write permission is OK - we leave the page
* write-protected, and catch the write fault. Changing from
* write to read permission means that we can't mark the page
* write-enabled after all.
*/
fs->prot &= retry_prot;
fs->fault_type &= retry_prot;
if (fs->prot == 0)
return (KERN_RESTART);
/* Reassert because wired may have changed. */
KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0,
("!wired && VM_FAULT_WIRE"));
return (KERN_SUCCESS);
}
static void
vm_fault_cow(struct faultstate *fs)
{
bool is_first_object_locked;
/*
* This allows pages to be virtually copied from a backing_object
* into the first_object, where the backing object has no other
* refs to it, and cannot gain any more refs. Instead of a bcopy,
* we just move the page from the backing object to the first
* object. Note that we must mark the page dirty in the first
* object so that it will go out to swap when needed.
*/
is_first_object_locked = false;
if (
/*
* Only one shadow object and no other refs.
*/
fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
/*
* No other ways to look the object up
*/
fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
/*
* We don't chase down the shadow chain and we can acquire locks.
*/
(is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
fs->object == fs->first_object->backing_object &&
VM_OBJECT_TRYWLOCK(fs->object)) {
/*
* Remove but keep xbusy for replace. fs->m is moved into
* fs->first_object and left busy while fs->first_m is
* conditionally freed.
*/
vm_page_remove_xbusy(fs->m);
vm_page_replace(fs->m, fs->first_object, fs->first_pindex,
fs->first_m);
vm_page_dirty(fs->m);
#if VM_NRESERVLEVEL > 0
/*
* Rename the reservation.
*/
vm_reserv_rename(fs->m, fs->first_object, fs->object,
OFF_TO_IDX(fs->first_object->backing_object_offset));
#endif
VM_OBJECT_WUNLOCK(fs->object);
VM_OBJECT_WUNLOCK(fs->first_object);
fs->first_m = fs->m;
fs->m = NULL;
VM_CNT_INC(v_cow_optim);
} else {
if (is_first_object_locked)
VM_OBJECT_WUNLOCK(fs->first_object);
/*
* Oh, well, let's copy it.
*/
pmap_copy_page(fs->m, fs->first_m);
vm_page_valid(fs->first_m);
if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) {
vm_page_wire(fs->first_m);
vm_page_unwire(fs->m, PQ_INACTIVE);
}
/*
* Save the cow page to be released after
* pmap_enter is complete.
*/
fs->m_cow = fs->m;
fs->m = NULL;
}
/*
* fs->object != fs->first_object due to above
* conditional
*/
vm_object_pip_wakeup(fs->object);
/*
* Only use the new page below...
*/
fs->object = fs->first_object;
fs->pindex = fs->first_pindex;
fs->m = fs->first_m;
VM_CNT_INC(v_cow_faults);
curthread->td_cow++;
}
static bool
vm_fault_next(struct faultstate *fs)
{
vm_object_t next_object;
/*
* The requested page does not exist at this object/
* offset. Remove the invalid page from the object,
* waking up anyone waiting for it, and continue on to
* the next object. However, if this is the top-level
* object, we must leave the busy page in place to
* prevent another process from rushing past us, and
* inserting the page in that object at the same time
* that we are.
*/
if (fs->object == fs->first_object) {
fs->first_m = fs->m;
fs->m = NULL;
} else
fault_page_free(&fs->m);
/*
* Move on to the next object. Lock the next object before
* unlocking the current one.
*/
VM_OBJECT_ASSERT_WLOCKED(fs->object);
next_object = fs->object->backing_object;
if (next_object == NULL)
return (false);
MPASS(fs->first_m != NULL);
KASSERT(fs->object != next_object, ("object loop %p", next_object));
VM_OBJECT_WLOCK(next_object);
vm_object_pip_add(next_object, 1);
if (fs->object != fs->first_object)
vm_object_pip_wakeup(fs->object);
fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset);
VM_OBJECT_WUNLOCK(fs->object);
fs->object = next_object;
return (true);
}
static void
vm_fault_zerofill(struct faultstate *fs)
{
/*
* If there's no object left, fill the page in the top
* object with zeros.
*/
if (fs->object != fs->first_object) {
vm_object_pip_wakeup(fs->object);
fs->object = fs->first_object;
fs->pindex = fs->first_pindex;
}
MPASS(fs->first_m != NULL);
MPASS(fs->m == NULL);
fs->m = fs->first_m;
fs->first_m = NULL;
/*
* Zero the page if necessary and mark it valid.
*/
if ((fs->m->flags & PG_ZERO) == 0) {
pmap_zero_page(fs->m);
} else {
VM_CNT_INC(v_ozfod);
}
VM_CNT_INC(v_zfod);
vm_page_valid(fs->m);
}
/*
* Allocate a page directly or via the object populate method.
*/
static int
vm_fault_allocate(struct faultstate *fs)
{
struct domainset *dset;
int alloc_req;
int rv;
if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) {
rv = vm_fault_lock_vnode(fs, true);
MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE);
if (rv == KERN_RESOURCE_SHORTAGE)
return (rv);
}
if (fs->pindex >= fs->object->size)
return (KERN_OUT_OF_BOUNDS);
if (fs->object == fs->first_object &&
(fs->first_object->flags & OBJ_POPULATE) != 0 &&
fs->first_object->shadow_count == 0) {
rv = vm_fault_populate(fs);
switch (rv) {
case KERN_SUCCESS:
case KERN_FAILURE:
case KERN_RESTART:
return (rv);
case KERN_NOT_RECEIVER:
/*
* Pager's populate() method
* returned VM_PAGER_BAD.
*/
break;
default:
panic("inconsistent return codes");
}
}
/*
* Allocate a new page for this object/offset pair.
*
* Unlocked read of the p_flag is harmless. At worst, the P_KILLED
* might not be observed there, and allocation can fail, causing
* restart and new reading of the p_flag.
*/
dset = fs->object->domain.dr_policy;
if (dset == NULL)
dset = curthread->td_domain.dr_policy;
if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex);
#endif
alloc_req = P_KILLED(curproc) ?
VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
if (fs->object->type != OBJT_VNODE &&
fs->object->backing_object == NULL)
alloc_req |= VM_ALLOC_ZERO;
fs->m = vm_page_alloc(fs->object, fs->pindex, alloc_req);
}
if (fs->m == NULL) {
unlock_and_deallocate(fs);
if (vm_pfault_oom_attempts < 0 ||
fs->oom < vm_pfault_oom_attempts) {
fs->oom++;
vm_waitpfault(dset, vm_pfault_oom_wait * hz);
} else {
if (bootverbose)
printf(
"proc %d (%s) failed to alloc page on fault, starting OOM\n",
curproc->p_pid, curproc->p_comm);
vm_pageout_oom(VM_OOM_MEM_PF);
fs->oom = 0;
}
return (KERN_RESOURCE_SHORTAGE);
}
fs->oom = 0;
return (KERN_NOT_RECEIVER);
}
/*
* Call the pager to retrieve the page if there is a chance
* that the pager has it, and potentially retrieve additional
* pages at the same time.
*/
static int
vm_fault_getpages(struct faultstate *fs, int nera, int *behindp, int *aheadp)
{
vm_offset_t e_end, e_start;
int ahead, behind, cluster_offset, rv;
u_char behavior;
/*
* Prepare for unlocking the map. Save the map
* entry's start and end addresses, which are used to
* optimize the size of the pager operation below.
* Even if the map entry's addresses change after
* unlocking the map, using the saved addresses is
* safe.
*/
e_start = fs->entry->start;
e_end = fs->entry->end;
behavior = vm_map_entry_behavior(fs->entry);
/*
* Release the map lock before locking the vnode or
* sleeping in the pager. (If the current object has
* a shadow, then an earlier iteration of this loop
* may have already unlocked the map.)
*/
unlock_map(fs);
rv = vm_fault_lock_vnode(fs, false);
MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE);
if (rv == KERN_RESOURCE_SHORTAGE)
return (rv);
KASSERT(fs->vp == NULL || !fs->map->system_map,
("vm_fault: vnode-backed object mapped by system map"));
/*
* Page in the requested page and hint the pager
* that it may bring in surrounding pages.
*/
if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
P_KILLED(curproc)) {
behind = 0;
ahead = 0;
} else {
/* Is this a sequential fault? */
if (nera > 0) {
behind = 0;
ahead = nera;
} else {
/*
* Request a cluster of pages that is
* aligned to a VM_FAULT_READ_DEFAULT
* page offset boundary within the
* object. Alignment to a page offset
* boundary is more likely to coincide
* with the underlying file system
* block than alignment to a virtual
* address boundary.
*/
cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT;
behind = ulmin(cluster_offset,
atop(fs->vaddr - e_start));
ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset;
}
ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1);
}
*behindp = behind;
*aheadp = ahead;
rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp);
if (rv == VM_PAGER_OK)
return (KERN_SUCCESS);
if (rv == VM_PAGER_ERROR)
printf("vm_fault: pager read error, pid %d (%s)\n",
curproc->p_pid, curproc->p_comm);
/*
* If an I/O error occurred or the requested page was
* outside the range of the pager, clean up and return
* an error.
*/
if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD)
return (KERN_OUT_OF_BOUNDS);
return (KERN_NOT_RECEIVER);
}
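/*
 * Editor's illustrative sketch, not part of the original diff: a minimal
 * user-space model of the read-ahead cluster sizing performed above.  The
 * page size, window size, and helper names are assumptions made for
 * illustration; the caller is expected to pass a fault address that lies
 * within [e_start, e_end).
 */
#include <stddef.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT	12	/* assume 4 KiB pages */
#define MODEL_READ_DEFAULT	16	/* pages per read cluster */

static inline size_t
model_min(size_t a, size_t b)
{
	return (a < b ? a : b);
}

/*
 * Compute how many pages to read behind and ahead of the faulting page so
 * that the cluster is aligned to a MODEL_READ_DEFAULT page-index boundary
 * within the object and clipped to the map entry's bounds.
 */
static void
model_fault_cluster(uint64_t pindex, uintptr_t vaddr, uintptr_t e_start,
    uintptr_t e_end, int *behindp, int *aheadp)
{
	size_t ahead, behind, cluster_offset;

	cluster_offset = (size_t)(pindex % MODEL_READ_DEFAULT);
	behind = model_min(cluster_offset,
	    (vaddr - e_start) >> MODEL_PAGE_SHIFT);
	ahead = MODEL_READ_DEFAULT - 1 - cluster_offset;
	ahead = model_min(ahead, ((e_end - vaddr) >> MODEL_PAGE_SHIFT) - 1);
	*behindp = (int)behind;
	*aheadp = (int)ahead;
}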
/*
* Wait/Retry if the page is busy. We have to do this if the page is
* either exclusive or shared busy because the vm_pager may be using
* read busy for pageouts (and even pageins if it is the vnode pager),
* and we could end up trying to pagein and pageout the same page
* simultaneously.
*
* We can theoretically allow the busy case on a read fault if the page
* is marked valid, but since such pages are typically already pmap'd,
* putting that special case in might be more effort than it is worth.
* We cannot under any circumstances mess around with a shared busied
* page except, perhaps, to pmap it.
*/
static void
vm_fault_busy_sleep(struct faultstate *fs)
{
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
* likely to reclaim it.
*/
vm_page_aflag_set(fs->m, PGA_REFERENCED);
if (fs->object != fs->first_object) {
fault_page_release(&fs->first_m);
vm_object_pip_wakeup(fs->first_object);
}
vm_object_pip_wakeup(fs->object);
unlock_map(fs);
if (fs->m == vm_page_lookup(fs->object, fs->pindex))
vm_page_busy_sleep(fs->m, "vmpfw", false);
else
VM_OBJECT_WUNLOCK(fs->object);
VM_CNT_INC(v_intrans);
vm_object_deallocate(fs->first_object);
}
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
struct faultstate fs;
int ahead, behind, faultcount;
int nera, result, rv;
bool dead, hardfault;
VM_CNT_INC(v_vm_faults);
if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
return (KERN_PROTECTION_FAILURE);
fs.vp = NULL;
fs.vaddr = vaddr;
fs.m_hold = m_hold;
fs.fault_flags = fault_flags;
fs.map = map;
fs.lookup_still_valid = false;
fs.oom = 0;
faultcount = 0;
nera = -1;
hardfault = false;
RetryFault:
fs.fault_type = fault_type;
/*
* Find the backing store object and offset into it to begin the
* search.
*/
result = vm_fault_lookup(&fs);
if (result != KERN_SUCCESS) {
if (result == KERN_RESOURCE_SHORTAGE)
goto RetryFault;
return (result);
}
/*
* Try to avoid lock contention on the top-level object through
* special-case handling of some types of page faults, specifically,
* those that are mapping an existing page from the top-level object.
* Under this condition, a read lock on the object suffices, allowing
* multiple page faults of a similar type to run in parallel.
*/
if (fs.vp == NULL /* avoid locked vnode leak */ &&
(fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 &&
(fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) {
VM_OBJECT_RLOCK(fs.first_object);
rv = vm_fault_soft_fast(&fs);
if (rv == KERN_SUCCESS)
return (rv);
if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
VM_OBJECT_RUNLOCK(fs.first_object);
VM_OBJECT_WLOCK(fs.first_object);
}
} else {
VM_OBJECT_WLOCK(fs.first_object);
}
/*
* Make a reference to this object to prevent its disposal while we
* are messing with it. Once we have the reference, the map is free
* to be diddled. Since objects reference their shadows (and copies),
* they will stay around as well.
*
* Bump the paging-in-progress count to prevent size changes (e.g.
* truncation operations) during I/O.
*/
vm_object_reference_locked(fs.first_object);
vm_object_pip_add(fs.first_object, 1);
fs.m_cow = fs.m = fs.first_m = NULL;
/*
* Search for the page at object/offset.
*/
fs.object = fs.first_object;
fs.pindex = fs.first_pindex;
if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) {
rv = vm_fault_allocate(&fs);
switch (rv) {
case KERN_RESTART:
unlock_and_deallocate(&fs);
/* FALLTHROUGH */
case KERN_RESOURCE_SHORTAGE:
goto RetryFault;
case KERN_SUCCESS:
case KERN_FAILURE:
case KERN_OUT_OF_BOUNDS:
unlock_and_deallocate(&fs);
return (rv);
case KERN_NOT_RECEIVER:
break;
default:
panic("vm_fault: Unhandled rv %d", rv);
}
}
while (TRUE) {
KASSERT(fs.m == NULL,
("page still set %p at loop start", fs.m));
/*
* If the object is marked for imminent termination,
* we retry here, since the collapse pass has raced
* with us. Otherwise, if we see a terminally dead
* object, return failure.
*/
if ((fs.object->flags & OBJ_DEAD) != 0) {
dead = fs.object->type == OBJT_DEAD;
unlock_and_deallocate(&fs);
if (dead)
return (KERN_PROTECTION_FAILURE);
pause("vmf_de", 1);
goto RetryFault;
}
/*
* See if page is resident
*/
fs.m = vm_page_lookup(fs.object, fs.pindex);
if (fs.m != NULL) {
if (vm_page_tryxbusy(fs.m) == 0) {
vm_fault_busy_sleep(&fs);
goto RetryFault;
}
/*
* The page is marked busy for other processes and the
* pagedaemon. If it is still completely valid, we
* are done.
*/
if (vm_page_all_valid(fs.m)) {
VM_OBJECT_WUNLOCK(fs.object);
break; /* break to PAGE HAS BEEN FOUND. */
}
}
VM_OBJECT_ASSERT_WLOCKED(fs.object);
/*
* Page is not resident. If the pager might contain the page
* or this is the beginning of the search, allocate a new
* page. (Default objects are zero-fill, so there is no real
* pager for them.)
*/
if (fs.m == NULL && (fs.object->type != OBJT_DEFAULT ||
fs.object == fs.first_object)) {
rv = vm_fault_allocate(&fs);
switch (rv) {
case KERN_RESTART:
unlock_and_deallocate(&fs);
/* FALLTHROUGH */
case KERN_RESOURCE_SHORTAGE:
goto RetryFault;
case KERN_SUCCESS:
case KERN_FAILURE:
case KERN_OUT_OF_BOUNDS:
unlock_and_deallocate(&fs);
return (rv);
case KERN_NOT_RECEIVER:
break;
default:
panic("vm_fault: Unhandled rv %d", rv);
}
}
/*
* Default objects have no pager so no exclusive busy exists
* to protect this page in the chain. Skip to the next
* object without dropping the lock to preserve atomicity of
* shadow faults.
*/
if (fs.object->type != OBJT_DEFAULT) {
/*
* At this point, we have either allocated a new page
* or found an existing page that is only partially
* valid.
*
* We hold a reference on the current object and the
* page is exclusive busied. The exclusive busy
* prevents simultaneous faults and collapses while
* the object lock is dropped.
*/
VM_OBJECT_WUNLOCK(fs.object);
/*
* If the pager for the current object might have
* the page, then determine the number of additional
* pages to read and potentially reprioritize
* previously read pages for earlier reclamation.
* These operations should only be performed once per
* page fault. Even if the current pager doesn't
* have the page, the number of additional pages to
* read will apply to subsequent objects in the
* shadow chain.
*/
if (nera == -1 && !P_KILLED(curproc))
nera = vm_fault_readahead(&fs);
rv = vm_fault_getpages(&fs, nera, &behind, &ahead);
if (rv == KERN_SUCCESS) {
faultcount = behind + 1 + ahead;
hardfault = true;
break; /* break to PAGE HAS BEEN FOUND. */
}
if (rv == KERN_RESOURCE_SHORTAGE)
goto RetryFault;
VM_OBJECT_WLOCK(fs.object);
if (rv == KERN_OUT_OF_BOUNDS) {
fault_page_free(&fs.m);
unlock_and_deallocate(&fs);
return (rv);
}
}
/*
* The page was not found in the current object. Try to
* traverse into a backing object or zero fill if none is
* found.
*/
if (vm_fault_next(&fs))
continue;
if ((fs.fault_flags & VM_FAULT_NOFILL) != 0) {
if (fs.first_object == fs.object)
fault_page_free(&fs.first_m);
unlock_and_deallocate(&fs);
return (KERN_OUT_OF_BOUNDS);
}
VM_OBJECT_WUNLOCK(fs.object);
vm_fault_zerofill(&fs);
/* Don't try to prefault neighboring pages. */
faultcount = 1;
break; /* break to PAGE HAS BEEN FOUND. */
}
/*
* PAGE HAS BEEN FOUND. A valid page has been found and exclusively
* busied. The object lock must no longer be held.
*/
vm_page_assert_xbusied(fs.m);
VM_OBJECT_ASSERT_UNLOCKED(fs.object);
/*
* If the page is being written, but isn't already owned by the
* top-level object, we have to copy it into a new page owned by the
* top-level object.
*/
if (fs.object != fs.first_object) {
/*
* We only really need to copy if we want to write it.
*/
if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
vm_fault_cow(&fs);
/*
* We only try to prefault read-only mappings to the
* neighboring pages when this copy-on-write fault is
* a hard fault. In other cases, trying to prefault
* is typically wasted effort.
*/
if (faultcount == 0)
faultcount = 1;
} else {
fs.prot &= ~VM_PROT_WRITE;
}
}
/*
* We must verify that the maps have not changed since our last
* lookup.
*/
if (!fs.lookup_still_valid) {
result = vm_fault_relookup(&fs);
if (result != KERN_SUCCESS) {
fault_deallocate(&fs);
if (result == KERN_RESTART)
goto RetryFault;
return (result);
}
}
VM_OBJECT_ASSERT_UNLOCKED(fs.object);
/*
* If the page was filled by a pager, save the virtual address that
* should be faulted on next under a sequential access pattern to the
* map entry. A read lock on the map suffices to update this address
* safely.
*/
if (hardfault)
fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
/*
* Page must be completely valid or it is not fit to
* map into user space. vm_pager_get_pages() ensures this.
*/
vm_page_assert_xbusied(fs.m);
KASSERT(vm_page_all_valid(fs.m),
("vm_fault: page %p partially invalid", fs.m));
vm_fault_dirty(&fs, fs.m);
/*
* Put this page into the physical map. We had to do the unlock above
* because pmap_enter() may sleep. We don't put the page
* back on the active queue until later so that the pageout daemon
* won't find it (yet).
*/
pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot,
fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0);
if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 &&
fs.wired == 0)
vm_fault_prefault(&fs, vaddr,
faultcount > 0 ? behind : PFBAK,
faultcount > 0 ? ahead : PFFOR, false);
/*
* If the page is not wired down, then put it where the pageout daemon
* can find it.
*/
if ((fs.fault_flags & VM_FAULT_WIRE) != 0)
vm_page_wire(fs.m);
else
vm_page_activate(fs.m);
if (fs.m_hold != NULL) {
(*fs.m_hold) = fs.m;
vm_page_wire(fs.m);
}
vm_page_xunbusy(fs.m);
fs.m = NULL;
/*
* Unlock everything, and return
*/
fault_deallocate(&fs);
if (hardfault) {
VM_CNT_INC(v_io_faults);
curthread->td_ru.ru_majflt++;
#ifdef RACCT
if (racct_enable && fs.object->type == OBJT_VNODE) {
PROC_LOCK(curproc);
if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
racct_add_force(curproc, RACCT_WRITEBPS,
PAGE_SIZE + behind * PAGE_SIZE);
racct_add_force(curproc, RACCT_WRITEIOPS, 1);
} else {
racct_add_force(curproc, RACCT_READBPS,
PAGE_SIZE + ahead * PAGE_SIZE);
racct_add_force(curproc, RACCT_READIOPS, 1);
}
PROC_UNLOCK(curproc);
}
#endif
} else
curthread->td_ru.ru_minflt++;
return (KERN_SUCCESS);
}
/*
* Speed up the reclamation of pages that precede the faulting pindex within
* the first object of the shadow chain. Essentially, perform the equivalent
* of madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
* the faulting pindex by the cluster size when the pages read by vm_fault()
* cross a cluster-size boundary. The cluster size is the greater of the
* smallest superpage size and VM_FAULT_DONTNEED_MIN.
*
* When "fs->first_object" is a shadow object, the pages in the backing object
* that precede the faulting pindex are deactivated by vm_fault(). So, this
* function must only be concerned with pages in the first object.
*/
static void
vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
{
vm_map_entry_t entry;
vm_object_t first_object, object;
vm_offset_t end, start;
vm_page_t m, m_next;
vm_pindex_t pend, pstart;
vm_size_t size;
object = fs->object;
VM_OBJECT_ASSERT_UNLOCKED(object);
first_object = fs->first_object;
/* Neither fictitious nor unmanaged pages can be reclaimed. */
if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
VM_OBJECT_RLOCK(first_object);
size = VM_FAULT_DONTNEED_MIN;
if (MAXPAGESIZES > 1 && size < pagesizes[1])
size = pagesizes[1];
end = rounddown2(vaddr, size);
if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
(entry = fs->entry)->start < end) {
if (end - entry->start < size)
start = entry->start;
else
start = end - size;
pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
pstart = OFF_TO_IDX(entry->offset) + atop(start -
entry->start);
m_next = vm_page_find_least(first_object, pstart);
pend = OFF_TO_IDX(entry->offset) + atop(end -
entry->start);
while ((m = m_next) != NULL && m->pindex < pend) {
m_next = TAILQ_NEXT(m, listq);
if (!vm_page_all_valid(m) ||
vm_page_busied(m))
continue;
/*
* Don't clear PGA_REFERENCED, since it would
* likely represent a reference by a different
* process.
*
* Typically, at this point, prefetched pages
* are still in the inactive queue. Only
* pages that triggered page faults are in the
* active queue. The test for whether the page
* is in the inactive queue is racy; in the
* worst case we will requeue the page
* unnecessarily.
*/
if (!vm_page_inactive(m))
vm_page_deactivate(m);
}
}
VM_OBJECT_RUNLOCK(first_object);
}
}
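/*
 * Editor's illustrative sketch, not part of the original diff: a user-space
 * model of the cluster-boundary test that gates the deactivation above.
 * The page size is assumed, and the additional requirement that the map
 * entry start before the cluster boundary is omitted for brevity.
 */
#include <stdbool.h>
#include <stdint.h>

#define MODEL_PAGE_SIZE	4096u	/* assume 4 KiB pages */

/* Round "va" down to a multiple of the power-of-two "size". */
static inline uintptr_t
model_rounddown2(uintptr_t va, uintptr_t size)
{
	return (va & ~(size - 1));
}

/*
 * Return true when the faulting page plus "ahead" read-ahead pages reach
 * the end of the enclosing cluster, i.e. the fault has just crossed a
 * cluster-size boundary and the preceding cluster may be deactivated.
 */
static bool
model_crossed_cluster(uintptr_t vaddr, uintptr_t cluster_size, int ahead)
{
	uintptr_t end;

	end = model_rounddown2(vaddr, cluster_size);
	return (vaddr - end >= cluster_size - MODEL_PAGE_SIZE -
	    (uintptr_t)ahead * MODEL_PAGE_SIZE);
}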
/*
* vm_fault_prefault provides a quick way of clustering
* pagefaults into a process's address space. It is a "cousin"
* of vm_map_pmap_enter, except it runs at page fault time instead
* of mmap time.
*/
static void
vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
int backward, int forward, bool obj_locked)
{
pmap_t pmap;
vm_map_entry_t entry;
vm_object_t backing_object, lobject;
vm_offset_t addr, starta;
vm_pindex_t pindex;
vm_page_t m;
int i;
pmap = fs->map->pmap;
if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
return;
entry = fs->entry;
if (addra < backward * PAGE_SIZE) {
starta = entry->start;
} else {
starta = addra - backward * PAGE_SIZE;
if (starta < entry->start)
starta = entry->start;
}
/*
* Generate the sequence of virtual addresses that are candidates for
* prefaulting in an outward spiral from the faulting virtual address,
* "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra
* + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
* If the candidate address doesn't have a backing physical page, then
* the loop immediately terminates.
*/
for (i = 0; i < 2 * imax(backward, forward); i++) {
addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
PAGE_SIZE);
if (addr > addra + forward * PAGE_SIZE)
addr = 0;
if (addr < starta || addr >= entry->end)
continue;
if (!pmap_is_prefaultable(pmap, addr))
continue;
pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
lobject = entry->object.vm_object;
if (!obj_locked)
VM_OBJECT_RLOCK(lobject);
while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
lobject->type == OBJT_DEFAULT &&
(backing_object = lobject->backing_object) != NULL) {
KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
0, ("vm_fault_prefault: unaligned object offset"));
pindex += lobject->backing_object_offset >> PAGE_SHIFT;
VM_OBJECT_RLOCK(backing_object);
if (!obj_locked || lobject != entry->object.vm_object)
VM_OBJECT_RUNLOCK(lobject);
lobject = backing_object;
}
if (m == NULL) {
if (!obj_locked || lobject != entry->object.vm_object)
VM_OBJECT_RUNLOCK(lobject);
break;
}
if (vm_page_all_valid(m) &&
(m->flags & PG_FICTITIOUS) == 0)
pmap_enter_quick(pmap, addr, m, entry->protection);
if (!obj_locked || lobject != entry->object.vm_object)
VM_OBJECT_RUNLOCK(lobject);
}
}
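/*
 * Editor's illustrative sketch, not part of the original diff: a runnable
 * user-space reduction of the outward-spiral candidate generation above.
 * The page size, window sizes, and faulting address are assumptions, and
 * the clipping of candidates to the map entry is omitted.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE	4096

int
main(void)
{
	uintptr_t addr, addra;
	int backward, forward, i, imax;

	addra = 0x100000;	/* hypothetical faulting address */
	backward = forward = 3;
	imax = backward > forward ? backward : forward;
	for (i = 0; i < 2 * imax; i++) {
		/* addra - 1 page, addra + 1 page, addra - 2 pages, ... */
		addr = addra + ((i >> 1) + 1) *
		    ((i & 1) == 0 ? -MODEL_PAGE_SIZE : MODEL_PAGE_SIZE);
		printf("candidate %d: %#lx\n", i, (unsigned long)addr);
	}
	return (0);
}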
/*
* Hold each of the physical pages that are mapped by the specified range of
* virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
* and allow the specified types of access, "prot". If all of the implied
* pages are successfully held, then the number of held pages is returned
* together with pointers to those pages in the array "ma". However, if any
* of the pages cannot be held, -1 is returned.
*/
int
vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count)
{
vm_offset_t end, va;
vm_page_t *mp;
int count;
boolean_t pmap_failed;
if (len == 0)
return (0);
end = round_page(addr + len);
addr = trunc_page(addr);
if (!vm_map_range_valid(map, addr, end))
return (-1);
if (atop(end - addr) > max_count)
panic("vm_fault_quick_hold_pages: count > max_count");
count = atop(end - addr);
/*
* Most likely, the physical pages are resident in the pmap, so it is
* faster to try pmap_extract_and_hold() first.
*/
pmap_failed = FALSE;
for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
*mp = pmap_extract_and_hold(map->pmap, va, prot);
if (*mp == NULL)
pmap_failed = TRUE;
else if ((prot & VM_PROT_WRITE) != 0 &&
(*mp)->dirty != VM_PAGE_BITS_ALL) {
/*
* Explicitly dirty the physical page. Otherwise, the
* caller's changes may go unnoticed because they are
* performed through an unmanaged mapping or by a DMA
* operation.
*
* The object lock is not held here.
* See vm_page_clear_dirty_mask().
*/
vm_page_dirty(*mp);
}
}
if (pmap_failed) {
/*
* One or more pages could not be held by the pmap. Either no
* page was mapped at the specified virtual address or that
* mapping had insufficient permissions. Attempt to fault in
* and hold these pages.
*
* If vm_fault_disable_pagefaults() was called,
* i.e., TDP_NOFAULTING is set, we must not sleep nor
* acquire MD VM locks, which means we must not call
* vm_fault(). Some (out-of-tree) callers already mark
* too wide a code area with vm_fault_disable_pagefaults();
* such callers should use the VM_PROT_QUICK_NOFAULT flag to
* request the proper behaviour explicitly.
*/
if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
(curthread->td_pflags & TDP_NOFAULTING) != 0)
goto error;
for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
if (*mp == NULL && vm_fault(map, va, prot,
VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
goto error;
}
return (count);
error:
for (mp = ma; mp < ma + count; mp++)
if (*mp != NULL)
vm_page_unwire(*mp, PQ_INACTIVE);
return (-1);
}
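/*
 * Editor's illustrative sketch, not part of the original diff: a
 * hypothetical in-kernel caller of vm_fault_quick_hold_pages() that wires
 * the pages backing a small user buffer around some operation and then
 * releases them the same way the error path above does.  The fixed array
 * size and the use of EFAULT are assumptions for illustration; the usual
 * vm headers are presumed to be included, and "len" must span at most
 * nitems(ma) pages or the function above will panic.
 */
static int
hold_user_buffer_example(vm_offset_t uaddr, vm_size_t len)
{
	vm_page_t ma[8];
	int count, i;

	count = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
	    uaddr, len, VM_PROT_READ | VM_PROT_WRITE, ma, nitems(ma));
	if (count == -1)
		return (EFAULT);

	/* ... access the held (wired) pages here ... */

	for (i = 0; i < count; i++)
		vm_page_unwire(ma[i], PQ_INACTIVE);
	return (0);
}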
/*
* Routine:
* vm_fault_copy_entry
* Function:
* Create new shadow object backing dst_entry with private copy of
* all underlying pages. When src_entry is equal to dst_entry,
* function implements COW for wired-down map entry. Otherwise,
* it forks wired entry into dst_map.
*
* In/out conditions:
* The source and destination maps must be locked for write.
* The source map entry must be wired down (or be a sharing map
* entry corresponding to a main map entry that is wired down).
*/
void
vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
vm_ooffset_t *fork_charge)
{
vm_object_t backing_object, dst_object, object, src_object;
vm_pindex_t dst_pindex, pindex, src_pindex;
vm_prot_t access, prot;
vm_offset_t vaddr;
vm_page_t dst_m;
vm_page_t src_m;
boolean_t upgrade;
#ifdef lint
src_map++;
#endif /* lint */
upgrade = src_entry == dst_entry;
access = prot = dst_entry->protection;
src_object = src_entry->object.vm_object;
src_pindex = OFF_TO_IDX(src_entry->offset);
if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
dst_object = src_object;
vm_object_reference(dst_object);
} else {
/*
* Create the top-level object for the destination entry.
* Doesn't actually shadow anything - we copy the pages
* directly.
*/
dst_object = vm_object_allocate_anon(atop(dst_entry->end -
dst_entry->start), NULL, NULL, 0);
#if VM_NRESERVLEVEL > 0
dst_object->flags |= OBJ_COLORED;
dst_object->pg_color = atop(dst_entry->start);
#endif
dst_object->domain = src_object->domain;
dst_object->charge = dst_entry->end - dst_entry->start;
}
VM_OBJECT_WLOCK(dst_object);
KASSERT(upgrade || dst_entry->object.vm_object == NULL,
("vm_fault_copy_entry: vm_object not NULL"));
if (src_object != dst_object) {
dst_entry->object.vm_object = dst_object;
dst_entry->offset = 0;
dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC;
}
if (fork_charge != NULL) {
KASSERT(dst_entry->cred == NULL,
("vm_fault_copy_entry: leaked swp charge"));
dst_object->cred = curthread->td_ucred;
crhold(dst_object->cred);
*fork_charge += dst_object->charge;
} else if ((dst_object->type == OBJT_DEFAULT ||
dst_object->type == OBJT_SWAP) &&
dst_object->cred == NULL) {
KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
dst_entry));
dst_object->cred = dst_entry->cred;
dst_entry->cred = NULL;
}
/*
* If not an upgrade, then enter the mappings in the pmap as
* read and/or execute accesses. Otherwise, enter them as
* write accesses.
*
* A writeable large page mapping is only created if all of
* the constituent small page mappings are modified. Marking
* PTEs as modified on inception allows promotion to happen
* without taking a potentially large number of soft faults.
*/
if (!upgrade)
access &= ~VM_PROT_WRITE;
/*
* Loop through all of the virtual pages within the entry's
* range, copying each page from the source object to the
* destination object. Since the source is wired, those pages
* must exist. In contrast, the destination is pageable.
* Since the destination object doesn't share any backing storage
* with the source object, all of its pages must be dirtied,
* regardless of whether they can be written.
*/
for (vaddr = dst_entry->start, dst_pindex = 0;
vaddr < dst_entry->end;
vaddr += PAGE_SIZE, dst_pindex++) {
again:
/*
* Find the page in the source object, and copy it in.
* Because the source is wired down, the page will be
* in memory.
*/
if (src_object != dst_object)
VM_OBJECT_RLOCK(src_object);
object = src_object;
pindex = src_pindex + dst_pindex;
while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
(backing_object = object->backing_object) != NULL) {
/*
* Unless the source mapping is read-only or
* it is presently being upgraded from
* read-only, the first object in the shadow
* chain should provide all of the pages. In
* other words, this loop body should never be
* executed when the source mapping is already
* read/write.
*/
KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
upgrade,
("vm_fault_copy_entry: main object missing page"));
VM_OBJECT_RLOCK(backing_object);
pindex += OFF_TO_IDX(object->backing_object_offset);
if (object != dst_object)
VM_OBJECT_RUNLOCK(object);
object = backing_object;
}
KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
if (object != dst_object) {
/*
* Allocate a page in the destination object.
*/
dst_m = vm_page_alloc(dst_object, (src_object ==
dst_object ? src_pindex : 0) + dst_pindex,
VM_ALLOC_NORMAL);
if (dst_m == NULL) {
VM_OBJECT_WUNLOCK(dst_object);
VM_OBJECT_RUNLOCK(object);
vm_wait(dst_object);
VM_OBJECT_WLOCK(dst_object);
goto again;
}
pmap_copy_page(src_m, dst_m);
VM_OBJECT_RUNLOCK(object);
dst_m->dirty = dst_m->valid = src_m->valid;
} else {
dst_m = src_m;
if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0)
goto again;
if (dst_m->pindex >= dst_object->size) {
/*
* We are upgrading. The index can fall
* out of bounds if the object type is
* vnode and the file was truncated.
*/
vm_page_xunbusy(dst_m);
break;
}
}
VM_OBJECT_WUNLOCK(dst_object);
/*
* Enter it in the pmap. If a wired, copy-on-write
* mapping is being replaced by a write-enabled
* mapping, then wire that new mapping.
*
* The page can be invalid if the user called
* msync(MS_INVALIDATE) or truncated the backing vnode
* or shared memory object. In this case, do not
* insert it into pmap, but still do the copy so that
* all copies of the wired map entry have similar
* backing pages.
*/
if (vm_page_all_valid(dst_m)) {
pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
}
/*
* Mark it no longer busy, and put it on the active list.
*/
VM_OBJECT_WLOCK(dst_object);
if (upgrade) {
if (src_m != dst_m) {
vm_page_unwire(src_m, PQ_INACTIVE);
vm_page_wire(dst_m);
} else {
KASSERT(vm_page_wired(dst_m),
("dst_m %p is not wired", dst_m));
}
} else {
vm_page_activate(dst_m);
}
vm_page_xunbusy(dst_m);
}
VM_OBJECT_WUNLOCK(dst_object);
if (upgrade) {
dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
vm_object_deallocate(src_object);
}
}
/*
* Block entry into the machine-independent layer's page fault handler by
* the calling thread. Subsequent calls to vm_fault() by that thread will
* return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of
* spurious page faults.
*/
int
vm_fault_disable_pagefaults(void)
{
return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
}
void
vm_fault_enable_pagefaults(int save)
{
curthread_pflags_restore(save);
}
diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c
index 7884eb19eef9..f17f81264eb6 100644
--- a/sys/vm/vm_init.c
+++ b/sys/vm/vm_init.c
@@ -1,260 +1,260 @@
/*-
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_init.c 8.1 (Berkeley) 6/11/93
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Initialize the Virtual Memory subsystem.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/domainset.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/selinfo.h>
#include <sys/smp.h>
#include <sys/pipe.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
extern void uma_startup1(vm_offset_t);
long physmem;
/*
* System initialization
*/
static void vm_mem_init(void *);
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL);
/*
* vm_init initializes the virtual memory system.
* This is done only by the first cpu up.
*/
static void
vm_mem_init(void *dummy)
{
/*
* Initialize static domainsets, used by various allocators.
*/
domainset_init();
/*
* Initialize resident memory structures. From here on, all physical
* memory is accounted for, and we use only virtual addresses.
*/
vm_set_page_size();
virtual_avail = vm_page_startup(virtual_avail);
/*
* Set an initial domain policy for thread0 so that allocations
* can work.
*/
domainset_zero();
/* Bootstrap the kernel memory allocator. */
uma_startup1(virtual_avail);
/*
* Initialize other VM packages
*/
vmem_startup();
vm_object_init();
vm_map_startup();
kmem_init(virtual_avail, virtual_end);
kmem_init_zero_region();
pmap_init();
vm_pager_init();
}
void
vm_ksubmap_init(struct kva_md_info *kmi)
{
vm_offset_t firstaddr;
caddr_t v;
vm_size_t size = 0;
long physmem_est;
vm_offset_t minaddr;
vm_offset_t maxaddr;
/*
* Allocate space for system data structures.
* The first available kernel virtual address is in "v".
* As pages of kernel virtual memory are allocated, "v" is incremented.
* As pages of memory are allocated and cleared,
* "firstaddr" is incremented.
*/
/*
* Make two passes. The first pass calculates how much memory is
* needed and allocates it. The second pass assigns virtual
* addresses to the various data structures.
*/
firstaddr = 0;
again:
v = (caddr_t)firstaddr;
/*
* Discount the physical memory larger than the size of kernel_map
* to avoid eating up all of KVA space.
*/
physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) -
vm_map_min(kernel_map)));
v = kern_vfs_bio_buffer_alloc(v, physmem_est);
/*
* End of first pass, size has been calculated so allocate memory
*/
if (firstaddr == 0) {
size = (vm_size_t)v;
#ifdef VM_FREELIST_DMA32
/*
* Try to protect 32-bit DMAable memory from the largest
* early allocation of wired memory.
*/
firstaddr = kmem_alloc_attr(size, M_ZERO | M_NOWAIT,
(vm_paddr_t)1 << 32, ~(vm_paddr_t)0, VM_MEMATTR_DEFAULT);
if (firstaddr == 0)
#endif
firstaddr = kmem_malloc(size, M_ZERO | M_WAITOK);
if (firstaddr == 0)
panic("startup: no room for tables");
goto again;
}
/*
* End of second pass, addresses have been assigned
*/
if ((vm_size_t)((char *)v - firstaddr) != size)
panic("startup: table size inconsistency");
/*
* Allocate the clean map to hold all of I/O virtual memory.
*/
- size = (long)nbuf * BKVASIZE + (long)bio_transient_maxcnt * MAXPHYS;
+ size = (long)nbuf * BKVASIZE + (long)bio_transient_maxcnt * maxphys;
kmi->clean_sva = firstaddr = kva_alloc(size);
kmi->clean_eva = firstaddr + size;
/*
* Allocate the buffer arena.
*
* Enable the quantum cache if we have more than 4 cpus. This
* avoids lock contention at the expense of some fragmentation.
*/
size = (long)nbuf * BKVASIZE;
kmi->buffer_sva = firstaddr;
kmi->buffer_eva = kmi->buffer_sva + size;
vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0);
firstaddr += size;
/*
* And optionally transient bio space.
*/
if (bio_transient_maxcnt != 0) {
- size = (long)bio_transient_maxcnt * MAXPHYS;
+ size = (long)bio_transient_maxcnt * maxphys;
vmem_init(transient_arena, "transient arena",
firstaddr, size, PAGE_SIZE, 0, 0);
firstaddr += size;
}
if (firstaddr != kmi->clean_eva)
panic("Clean map calculation incorrect");
/*
* Allocate the pageable submaps. We may cache an exec map entry per
* CPU, so we therefore need to reserve space for at least ncpu+1
* entries to avoid deadlock. The exec map is also used by some image
* activators, so we leave a fixed number of pages for their use.
*/
#ifdef __LP64__
exec_map_entries = 8 * mp_ncpus;
#else
exec_map_entries = 2 * mp_ncpus + 4;
#endif
exec_map_entry_size = round_page(PATH_MAX + ARG_MAX);
kmem_subinit(exec_map, kernel_map, &minaddr, &maxaddr,
exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, false);
kmem_subinit(pipe_map, kernel_map, &minaddr, &maxaddr, maxpipekva,
false);
}
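/*
 * Editor's illustrative sketch, not part of the original diff: the two-pass
 * sizing idiom used by vm_ksubmap_init() above, reduced to a user-space
 * helper.  Pass one runs with the cursor at zero purely to measure the
 * total size; pass two reruns the same code with the real base address to
 * place each table.  Names are assumptions for illustration.
 */
#include <stdint.h>
#include <stdlib.h>

static void *
two_pass_alloc(size_t a_len, size_t b_len, void **table_a, void **table_b)
{
	uintptr_t firstaddr, v;
	size_t size;
	void *base = NULL;

	firstaddr = 0;
again:
	v = firstaddr;
	*table_a = (void *)v;	/* meaningless on pass 1, real on pass 2 */
	v += a_len;
	*table_b = (void *)v;
	v += b_len;
	if (firstaddr == 0) {
		/* End of pass 1: "v" now holds the total size needed. */
		size = (size_t)v;
		base = malloc(size);
		if (base == NULL)
			return (NULL);
		firstaddr = (uintptr_t)base;
		goto again;
	}
	return (base);
}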
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 47c4d949c811..349bb4815762 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -1,529 +1,529 @@
/*-
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.h 8.9 (Berkeley) 5/17/95
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
/*
* Virtual memory map module definitions.
*/
#ifndef _VM_MAP_
#define _VM_MAP_
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/_mutex.h>
/*
* Types defined:
*
* vm_map_t the high-level address map data structure.
* vm_map_entry_t an entry in an address map.
*/
typedef u_char vm_flags_t;
typedef u_int vm_eflags_t;
/*
* Objects which live in maps may be either VM objects, or
* another map (called a "sharing map") which denotes read-write
* sharing with other maps.
*/
union vm_map_object {
struct vm_object *vm_object; /* object object */
struct vm_map *sub_map; /* belongs to another map */
};
/*
* Address map entries consist of start and end addresses,
* a VM object (or sharing map) and offset into that object,
* and user-exported inheritance and protection information.
* Also included is control information for virtual copy operations.
*/
struct vm_map_entry {
struct vm_map_entry *left; /* left child or previous entry */
struct vm_map_entry *right; /* right child or next entry */
vm_offset_t start; /* start address */
vm_offset_t end; /* end address */
vm_offset_t next_read; /* vaddr of the next sequential read */
vm_size_t max_free; /* max free space in subtree */
union vm_map_object object; /* object I point to */
vm_ooffset_t offset; /* offset into object */
vm_eflags_t eflags; /* map entry flags */
vm_prot_t protection; /* protection code */
vm_prot_t max_protection; /* maximum protection */
vm_inherit_t inheritance; /* inheritance */
uint8_t read_ahead; /* pages in the read-ahead window */
int wired_count; /* can be paged if = 0 */
struct ucred *cred; /* tmp storage for creator ref */
struct thread *wiring_thread;
};
#define MAP_ENTRY_NOSYNC 0x00000001
#define MAP_ENTRY_IS_SUB_MAP 0x00000002
#define MAP_ENTRY_COW 0x00000004
#define MAP_ENTRY_NEEDS_COPY 0x00000008
#define MAP_ENTRY_NOFAULT 0x00000010
#define MAP_ENTRY_USER_WIRED 0x00000020
#define MAP_ENTRY_BEHAV_NORMAL 0x00000000 /* default behavior */
#define MAP_ENTRY_BEHAV_SEQUENTIAL 0x00000040 /* expect sequential
access */
#define MAP_ENTRY_BEHAV_RANDOM 0x00000080 /* expect random
access */
#define MAP_ENTRY_BEHAV_RESERVED 0x000000c0 /* future use */
#define MAP_ENTRY_BEHAV_MASK 0x000000c0
#define MAP_ENTRY_IN_TRANSITION 0x00000100 /* entry being
changed */
#define MAP_ENTRY_NEEDS_WAKEUP 0x00000200 /* waiters in
transition */
#define MAP_ENTRY_NOCOREDUMP 0x00000400 /* don't include in
a core */
#define MAP_ENTRY_VN_EXEC 0x00000800 /* text vnode mapping */
#define MAP_ENTRY_GROWS_DOWN 0x00001000 /* top-down stacks */
#define MAP_ENTRY_GROWS_UP 0x00002000 /* bottom-up stacks */
#define MAP_ENTRY_WIRE_SKIPPED 0x00004000
#define MAP_ENTRY_WRITECNT 0x00008000 /* tracked writeable
mapping */
#define MAP_ENTRY_GUARD 0x00010000
#define MAP_ENTRY_STACK_GAP_DN 0x00020000
#define MAP_ENTRY_STACK_GAP_UP 0x00040000
#define MAP_ENTRY_HEADER 0x00080000
#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000
#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20
#ifdef _KERNEL
static __inline u_char
vm_map_entry_behavior(vm_map_entry_t entry)
{
return (entry->eflags & MAP_ENTRY_BEHAV_MASK);
}
static __inline int
vm_map_entry_user_wired_count(vm_map_entry_t entry)
{
if (entry->eflags & MAP_ENTRY_USER_WIRED)
return (1);
return (0);
}
static __inline int
vm_map_entry_system_wired_count(vm_map_entry_t entry)
{
return (entry->wired_count - vm_map_entry_user_wired_count(entry));
}
#endif /* _KERNEL */
/*
* A map is a set of map entries. These map entries are
* organized as a threaded binary search tree. Both structures
* are ordered based upon the start and end addresses contained
* within each map entry. The largest gap between an entry in a
* subtree and one of its neighbors is saved in the max_free
* field, and that field is updated when the tree is
* restructured.
*
* Sleator and Tarjan's top-down splay algorithm is employed to
* control height imbalance in the binary search tree.
*
* The map's min offset value is stored in map->header.end, and
* its max offset value is stored in map->header.start. These
* values act as sentinels for any forward or backward address
* scan of the list. The right and left fields of the map
* header point to the first and list map entries. The map
* header has a special value for the eflags field,
* MAP_ENTRY_HEADER, that is set initially, is never changed,
* and prevents an eflags match of the header with any other map
* entry.
*
* List of locks
* (c) const until freed
*/
struct vm_map {
struct vm_map_entry header; /* List of entries */
struct sx lock; /* Lock for map data */
struct mtx system_mtx;
int nentries; /* Number of entries */
vm_size_t size; /* virtual size */
u_int timestamp; /* Version number */
u_char needs_wakeup;
u_char system_map; /* (c) Am I a system map? */
vm_flags_t flags; /* flags for this vm_map */
vm_map_entry_t root; /* Root of a binary search tree */
pmap_t pmap; /* (c) Physical map */
vm_offset_t anon_loc;
int busy;
#ifdef DIAGNOSTIC
int nupdates;
#endif
};
/*
* vm_flags_t values
*/
#define MAP_WIREFUTURE 0x01 /* wire all future pages */
#define MAP_BUSY_WAKEUP 0x02
#define MAP_IS_SUB_MAP 0x04 /* has parent */
#define MAP_ASLR 0x08 /* enabled ASLR */
#define MAP_ASLR_IGNSTART 0x10
#define MAP_REPLENISH 0x20
#ifdef _KERNEL
#if defined(KLD_MODULE) && !defined(KLD_TIED)
#define vm_map_max(map) vm_map_max_KBI((map))
#define vm_map_min(map) vm_map_min_KBI((map))
#define vm_map_pmap(map) vm_map_pmap_KBI((map))
#define vm_map_range_valid(map, start, end) \
vm_map_range_valid_KBI((map), (start), (end))
#else
static __inline vm_offset_t
vm_map_max(const struct vm_map *map)
{
return (map->header.start);
}
static __inline vm_offset_t
vm_map_min(const struct vm_map *map)
{
return (map->header.end);
}
static __inline pmap_t
vm_map_pmap(vm_map_t map)
{
return (map->pmap);
}
static __inline void
vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear)
{
map->flags = (map->flags | set) & ~clear;
}
static inline bool
vm_map_range_valid(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
if (end < start)
return (false);
if (start < vm_map_min(map) || end > vm_map_max(map))
return (false);
return (true);
}
#endif /* KLD_MODULE */
#endif /* _KERNEL */
/*
* Shareable process virtual address space.
*
* List of locks
* (c) const until freed
*/
struct vmspace {
struct vm_map vm_map; /* VM address map */
struct shmmap_state *vm_shm; /* SYS5 shared memory private data XXX */
segsz_t vm_swrss; /* resident set size before last swap */
segsz_t vm_tsize; /* text size (pages) XXX */
segsz_t vm_dsize; /* data size (pages) XXX */
segsz_t vm_ssize; /* stack size (pages) */
caddr_t vm_taddr; /* (c) user virtual address of text */
caddr_t vm_daddr; /* (c) user virtual address of data */
caddr_t vm_maxsaddr; /* user VA at max stack growth */
u_int vm_refcnt; /* number of references */
/*
* Keep the PMAP last, so that CPU-specific variations of that
* structure on a single architecture don't result in offset
* variations of the machine-independent fields in the vmspace.
*/
struct pmap vm_pmap; /* private physical map */
};
#ifdef _KERNEL
static __inline pmap_t
vmspace_pmap(struct vmspace *vmspace)
{
return &vmspace->vm_pmap;
}
#endif /* _KERNEL */
#ifdef _KERNEL
/*
* Macros: vm_map_lock, etc.
* Function:
* Perform locking on the data portion of a map. Note that
* these macros mimic procedure calls returning void. The
* semicolon is supplied by the user of these macros, not
* by the macros themselves. The macros can safely be used
* as unbraced elements in a higher level statement.
*/
void _vm_map_lock(vm_map_t map, const char *file, int line);
void _vm_map_unlock(vm_map_t map, const char *file, int line);
int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line);
void _vm_map_lock_read(vm_map_t map, const char *file, int line);
void _vm_map_unlock_read(vm_map_t map, const char *file, int line);
int _vm_map_trylock(vm_map_t map, const char *file, int line);
int _vm_map_trylock_read(vm_map_t map, const char *file, int line);
int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line);
void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line);
int vm_map_locked(vm_map_t map);
void vm_map_wakeup(vm_map_t map);
void vm_map_busy(vm_map_t map);
void vm_map_unbusy(vm_map_t map);
void vm_map_wait_busy(vm_map_t map);
vm_offset_t vm_map_max_KBI(const struct vm_map *map);
vm_offset_t vm_map_min_KBI(const struct vm_map *map);
pmap_t vm_map_pmap_KBI(vm_map_t map);
bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end);
#define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE)
#define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
#define vm_map_unlock_and_wait(map, timo) \
_vm_map_unlock_and_wait(map, timo, LOCK_FILE, LOCK_LINE)
#define vm_map_lock_read(map) _vm_map_lock_read(map, LOCK_FILE, LOCK_LINE)
#define vm_map_unlock_read(map) _vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE)
#define vm_map_trylock(map) _vm_map_trylock(map, LOCK_FILE, LOCK_LINE)
#define vm_map_trylock_read(map) \
_vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE)
#define vm_map_lock_upgrade(map) \
_vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE)
#define vm_map_lock_downgrade(map) \
_vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE)
long vmspace_resident_count(struct vmspace *vmspace);
#endif /* _KERNEL */
/*
* Copy-on-write flags for vm_map operations
*/
#define MAP_INHERIT_SHARE 0x00000001
#define MAP_COPY_ON_WRITE 0x00000002
#define MAP_NOFAULT 0x00000004
#define MAP_PREFAULT 0x00000008
#define MAP_PREFAULT_PARTIAL 0x00000010
#define MAP_DISABLE_SYNCER 0x00000020
#define MAP_CHECK_EXCL 0x00000040
#define MAP_CREATE_GUARD 0x00000080
#define MAP_DISABLE_COREDUMP 0x00000100
#define MAP_PREFAULT_MADVISE 0x00000200 /* from (user) madvise request */
#define MAP_WRITECOUNT 0x00000400
#define MAP_REMAP 0x00000800
#define MAP_STACK_GROWS_DOWN 0x00001000
#define MAP_STACK_GROWS_UP 0x00002000
#define MAP_ACC_CHARGED 0x00004000
#define MAP_ACC_NO_CHARGE 0x00008000
#define MAP_CREATE_STACK_GAP_UP 0x00010000
#define MAP_CREATE_STACK_GAP_DN 0x00020000
#define MAP_VN_EXEC 0x00040000
#define MAP_SPLIT_BOUNDARY_MASK 0x00180000
#define MAP_SPLIT_BOUNDARY_SHIFT 19
/*
* vm_fault option flags
*/
#define VM_FAULT_NORMAL 0x00 /* Nothing special */
#define VM_FAULT_WIRE 0x01 /* Wire the mapped page */
#define VM_FAULT_DIRTY 0x02 /* Dirty the page; use w/VM_PROT_COPY */
#define VM_FAULT_NOFILL 0x04 /* Fail if the pager doesn't have a copy */
/*
* Initially, mappings are slightly sequential. The maximum window size must
* account for the map entry's "read_ahead" field being defined as a uint8_t.
*/
#define VM_FAULT_READ_AHEAD_MIN 7
#define VM_FAULT_READ_AHEAD_INIT 15
-#define VM_FAULT_READ_AHEAD_MAX min(atop(MAXPHYS) - 1, UINT8_MAX)
+#define VM_FAULT_READ_AHEAD_MAX min(atop(maxphys) - 1, UINT8_MAX)
/*
* The following "find_space" options are supported by vm_map_find().
*
* For VMFS_ALIGNED_SPACE, the desired alignment is specified to
* the macro argument as log base 2 of the desired alignment.
*/
#define VMFS_NO_SPACE 0 /* don't find; use the given range */
#define VMFS_ANY_SPACE 1 /* find a range with any alignment */
#define VMFS_OPTIMAL_SPACE 2 /* find a range with optimal alignment*/
#define VMFS_SUPER_SPACE 3 /* find a superpage-aligned range */
#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find a range with fixed alignment */
/*
* vm_map_wire and vm_map_unwire option flags
*/
#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */
#define VM_MAP_WIRE_USER 1 /* wiring in a user map */
#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */
#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */
#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */
typedef int vm_map_entry_reader(void *token, vm_map_entry_t addr,
vm_map_entry_t dest);
#ifndef _KERNEL
/*
* Find the successor of a map_entry, using a reader to dereference pointers.
* '*clone' is a copy of a vm_map entry. 'reader' is used to copy a map entry
* at some address into '*clone'. Change *clone to a copy of the next map
* entry, and return the address of that entry, or NULL if copying has failed.
*
* This function is made available to user-space code that needs to traverse
* map entries.
*/
static inline vm_map_entry_t
vm_map_entry_read_succ(void *token, struct vm_map_entry *const clone,
vm_map_entry_reader reader)
{
vm_map_entry_t after, backup;
vm_offset_t start;
after = clone->right;
start = clone->start;
if (!reader(token, after, clone))
return (NULL);
backup = clone->left;
if (!reader(token, backup, clone))
return (NULL);
if (clone->start > start) {
do {
after = backup;
backup = clone->left;
if (!reader(token, backup, clone))
return (NULL);
} while (clone->start != start);
}
if (!reader(token, after, clone))
return (NULL);
return (after);
}
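/*
 * Editor's illustrative sketch, not part of the original diff: a
 * hypothetical reader callback for vm_map_entry_read_succ() built on
 * libkvm.  The kvm_t handle passed as the token and the lack of error
 * reporting are assumptions made for illustration.
 */
#include <sys/types.h>
#include <kvm.h>

static inline int
kvm_map_entry_reader(void *token, vm_map_entry_t addr, vm_map_entry_t dest)
{
	kvm_t *kd = token;

	/* Copy the kernel's map entry at "addr" into the caller's buffer. */
	return (kvm_read(kd, (unsigned long)addr, dest, sizeof(*dest)) ==
	    (ssize_t)sizeof(*dest));
}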
#endif /* ! _KERNEL */
#ifdef _KERNEL
boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
vm_offset_t max_addr, vm_offset_t alignment);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t);
void vm_map_init(vm_map_t, pmap_t, vm_offset_t, vm_offset_t);
int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int);
int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
vm_pindex_t *, vm_prot_t *, boolean_t *);
int vm_map_lookup_locked(vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
vm_pindex_t *, vm_prot_t *, boolean_t *);
void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
static inline vm_map_entry_t
vm_map_entry_first(vm_map_t map)
{
return (map->header.right);
}
static inline vm_map_entry_t
vm_map_entry_succ(vm_map_entry_t entry)
{
vm_map_entry_t after;
after = entry->right;
if (after->left->start > entry->start) {
do
after = after->left;
while (after->left != entry);
}
return (after);
}
#define VM_MAP_ENTRY_FOREACH(it, map) \
for ((it) = vm_map_entry_first(map); \
(it) != &(map)->header; \
(it) = vm_map_entry_succ(it))
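/*
 * Editor's illustrative sketch, not part of the original diff: a
 * hypothetical walk over a map with VM_MAP_ENTRY_FOREACH under the read
 * lock declared above.  The wired-entry count computed here is just an
 * example policy.
 */
static inline int
vm_map_count_wired_entries_example(vm_map_t map)
{
	vm_map_entry_t entry;
	int count;

	count = 0;
	vm_map_lock_read(map);
	VM_MAP_ENTRY_FOREACH(entry, map) {
		if (entry->wired_count != 0)
			count++;
	}
	vm_map_unlock_read(map);
	return (count);
}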
int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev,
vm_map_entry_t entry);
void vm_map_startup (void);
int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
int flags);
int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags);
int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end,
int flags);
long vmspace_swap_count(struct vmspace *vmspace);
void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add);
#endif /* _KERNEL */
#endif /* _VM_MAP_ */
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index e3073efe8bd3..2a30a2073b36 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -1,498 +1,499 @@
/*-
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Paging space routine stubs. Emulates a matchmaker-like interface
* for builtin pagers.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/ucred.h>
#include <sys/malloc.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
uma_zone_t pbuf_zone;
static int pbuf_init(void *, int, int);
static int pbuf_ctor(void *, int, void *, int);
static void pbuf_dtor(void *, int, void *);
static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static void dead_pager_dealloc(vm_object_t);
static int
dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind,
int *rahead)
{
return (VM_PAGER_FAIL);
}
static vm_object_t
dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t off, struct ucred *cred)
{
return (NULL);
}
static void
dead_pager_putpages(vm_object_t object, vm_page_t *m, int count,
int flags, int *rtvals)
{
int i;
for (i = 0; i < count; i++)
rtvals[i] = VM_PAGER_AGAIN;
}
static boolean_t
dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next)
{
if (prev != NULL)
*prev = 0;
if (next != NULL)
*next = 0;
return (FALSE);
}
static void
dead_pager_dealloc(vm_object_t object)
{
}
static struct pagerops deadpagerops = {
.pgo_alloc = dead_pager_alloc,
.pgo_dealloc = dead_pager_dealloc,
.pgo_getpages = dead_pager_getpages,
.pgo_putpages = dead_pager_putpages,
.pgo_haspage = dead_pager_haspage,
};
struct pagerops *pagertab[] = {
&defaultpagerops, /* OBJT_DEFAULT */
&swappagerops, /* OBJT_SWAP */
&vnodepagerops, /* OBJT_VNODE */
&devicepagerops, /* OBJT_DEVICE */
&physpagerops, /* OBJT_PHYS */
&deadpagerops, /* OBJT_DEAD */
&sgpagerops, /* OBJT_SG */
&mgtdevicepagerops, /* OBJT_MGTDEVICE */
};
void
vm_pager_init(void)
{
struct pagerops **pgops;
/*
* Initialize known pagers
*/
for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++)
if ((*pgops)->pgo_init != NULL)
(*(*pgops)->pgo_init)();
}
static int nswbuf_max;
void
vm_pager_bufferinit(void)
{
/* Main zone for paging bufs. */
- pbuf_zone = uma_zcreate("pbuf", sizeof(struct buf),
+ pbuf_zone = uma_zcreate("pbuf",
+ sizeof(struct buf) + PBUF_PAGES * sizeof(vm_page_t),
pbuf_ctor, pbuf_dtor, pbuf_init, NULL, UMA_ALIGN_CACHE,
UMA_ZONE_NOFREE);
/* A few systems may still use this zone directly, so it needs a limit. */
nswbuf_max += uma_zone_set_max(pbuf_zone, NSWBUF_MIN);
}
uma_zone_t
pbuf_zsecond_create(const char *name, int max)
{
uma_zone_t zone;
zone = uma_zsecond_create(name, pbuf_ctor, pbuf_dtor, NULL, NULL,
pbuf_zone);
/*
 * uma_prealloc() rounds up to items per slab. If we preallocated
 * immediately on every pbuf_zsecond_create(), we could accumulate too
 * large a difference between the hard limit and the preallocated items,
 * which means wasted memory.
*/
if (nswbuf_max > 0)
nswbuf_max += uma_zone_set_max(zone, max);
else
uma_prealloc(pbuf_zone, uma_zone_set_max(zone, max));
return (zone);
}
static void
pbuf_prealloc(void *arg __unused)
{
uma_prealloc(pbuf_zone, nswbuf_max);
nswbuf_max = -1;
}
SYSINIT(pbuf, SI_SUB_KTHREAD_BUF, SI_ORDER_ANY, pbuf_prealloc, NULL);
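To make the accounting above concrete, here is a minimal sketch of how a pager subsystem would consume this interface; "examplepager"/"examplepbuf" and the limit of 32 are hypothetical, but the calls mirror the vnode pager later in this review (pbuf_zsecond_create(), then uma_zalloc()/uma_zfree() against the returned zone), and the sketch assumes the kernel headers already included in this file.

/*
 * Sketch: a hypothetical pager creates a capped secondary pbuf zone at
 * boot and borrows/returns a pbuf around each I/O.
 */
static uma_zone_t example_pbuf_zone;

static void
example_pager_init(void *dummy)
{

	/* Cap this consumer at 32 pbufs; the cap is counted in nswbuf_max. */
	example_pbuf_zone = pbuf_zsecond_create("examplepbuf", 32);
}
SYSINIT(example_pager, SI_SUB_CPU, SI_ORDER_ANY, example_pager_init, NULL);

static void
example_pager_io(void)
{
	struct buf *bp;

	bp = uma_zalloc(example_pbuf_zone, M_WAITOK); /* ctor hands it back locked */
	/* ... fill in b_data, b_blkno, b_bcount and issue the I/O ... */
	uma_zfree(example_pbuf_zone, bp);	/* dtor drops creds and unlocks */
}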
/*
* Allocate an instance of a pager of the given type.
* Size, protection and offset parameters are passed in for pagers that
* need to perform page-level validation (e.g. the device pager).
*/
vm_object_t
vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t off, struct ucred *cred)
{
vm_object_t ret;
struct pagerops *ops;
ops = pagertab[type];
if (ops)
ret = (*ops->pgo_alloc)(handle, size, prot, off, cred);
else
ret = NULL;
return (ret);
}
/*
* The object must be locked.
*/
void
vm_pager_deallocate(vm_object_t object)
{
VM_OBJECT_ASSERT_WLOCKED(object);
(*pagertab[object->type]->pgo_dealloc) (object);
}
static void
vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count)
{
#ifdef INVARIANTS
/*
* All pages must be consecutive, busied, not mapped, not fully valid,
 * not dirty, and belong to the proper object. Some pages may be the
 * bogus page, but the first and last pages must be real ones.
*/
VM_OBJECT_ASSERT_UNLOCKED(object);
VM_OBJECT_ASSERT_PAGING(object);
KASSERT(count > 0, ("%s: 0 count", __func__));
for (int i = 0 ; i < count; i++) {
if (m[i] == bogus_page) {
KASSERT(i != 0 && i != count - 1,
("%s: page %d is the bogus page", __func__, i));
continue;
}
vm_page_assert_xbusied(m[i]);
KASSERT(!pmap_page_is_mapped(m[i]),
("%s: page %p is mapped", __func__, m[i]));
KASSERT(m[i]->valid != VM_PAGE_BITS_ALL,
("%s: request for a valid page %p", __func__, m[i]));
KASSERT(m[i]->dirty == 0,
("%s: page %p is dirty", __func__, m[i]));
KASSERT(m[i]->object == object,
("%s: wrong object %p/%p", __func__, object, m[i]->object));
KASSERT(m[i]->pindex == m[0]->pindex + i,
("%s: page %p isn't consecutive", __func__, m[i]));
}
#endif
}
/*
* Page in the pages for the object using its associated pager.
* The requested page must be fully valid on successful return.
*/
int
vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
int *rahead)
{
#ifdef INVARIANTS
vm_pindex_t pindex = m[0]->pindex;
#endif
int r;
vm_pager_assert_in(object, m, count);
r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind,
rahead);
if (r != VM_PAGER_OK)
return (r);
for (int i = 0; i < count; i++) {
/*
* If pager has replaced a page, assert that it had
* updated the array.
*/
#ifdef INVARIANTS
VM_OBJECT_RLOCK(object);
KASSERT(m[i] == vm_page_lookup(object, pindex++),
("%s: mismatch page %p pindex %ju", __func__,
m[i], (uintmax_t )pindex - 1));
VM_OBJECT_RUNLOCK(object);
#endif
/*
* Zero out partially filled data.
*/
if (m[i]->valid != VM_PAGE_BITS_ALL)
vm_page_zero_invalid(m[i], TRUE);
}
return (VM_PAGER_OK);
}
int
vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
{
vm_pager_assert_in(object, m, count);
return ((*pagertab[object->type]->pgo_getpages_async)(object, m,
count, rbehind, rahead, iodone, arg));
}
/*
* vm_pager_put_pages() - inline, see vm/vm_pager.h
* vm_pager_has_page() - inline, see vm/vm_pager.h
*/
/*
* Search the specified pager object list for an object with the
* specified handle. If an object with the specified handle is found,
* increase its reference count and return it. Otherwise, return NULL.
*
* The pager object list must be locked.
*/
vm_object_t
vm_pager_object_lookup(struct pagerlst *pg_list, void *handle)
{
vm_object_t object;
TAILQ_FOREACH(object, pg_list, pager_object_list) {
if (object->handle == handle) {
VM_OBJECT_WLOCK(object);
if ((object->flags & OBJ_DEAD) == 0) {
vm_object_reference_locked(object);
VM_OBJECT_WUNLOCK(object);
break;
}
VM_OBJECT_WUNLOCK(object);
}
}
return (object);
}
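As a usage illustration for the lookup above, a hedged sketch: a pager that keeps its objects on a private pagerlst guarded by a mutex can resolve a handle as follows. The list, lock, and function names are hypothetical; the extra reference returned by vm_pager_object_lookup() is eventually dropped by the caller with vm_object_deallocate().

/* Hypothetical per-pager bookkeeping (sketch only). */
static struct pagerlst example_pager_object_list =
    TAILQ_HEAD_INITIALIZER(example_pager_object_list);
static struct mtx example_pager_mtx;
MTX_SYSINIT(example_pager_mtx, &example_pager_mtx, "example pager", MTX_DEF);

static vm_object_t
example_pager_find(void *handle)
{
	vm_object_t object;

	mtx_lock(&example_pager_mtx);	/* the object list must be locked */
	object = vm_pager_object_lookup(&example_pager_object_list, handle);
	mtx_unlock(&example_pager_mtx);

	/* Referenced object or NULL; drop the reference later with
	 * vm_object_deallocate(). */
	return (object);
}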
static int
pbuf_ctor(void *mem, int size, void *arg, int flags)
{
struct buf *bp = mem;
bp->b_vp = NULL;
bp->b_bufobj = NULL;
/* copied from initpbuf() */
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */
bp->b_data = bp->b_kvabase;
bp->b_xflags = 0;
- bp->b_flags = 0;
+ bp->b_flags = B_MAXPHYS;
bp->b_ioflags = 0;
bp->b_iodone = NULL;
bp->b_error = 0;
BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
return (0);
}
static void
pbuf_dtor(void *mem, int size, void *arg)
{
struct buf *bp = mem;
if (bp->b_rcred != NOCRED) {
crfree(bp->b_rcred);
bp->b_rcred = NOCRED;
}
if (bp->b_wcred != NOCRED) {
crfree(bp->b_wcred);
bp->b_wcred = NOCRED;
}
BUF_UNLOCK(bp);
}
static int
pbuf_init(void *mem, int size, int flags)
{
struct buf *bp = mem;
- bp->b_kvabase = (void *)kva_alloc(MAXPHYS);
+ bp->b_kvabase = (void *)kva_alloc(ptoa(PBUF_PAGES));
if (bp->b_kvabase == NULL)
return (ENOMEM);
- bp->b_kvasize = MAXPHYS;
+ bp->b_kvasize = ptoa(PBUF_PAGES);
BUF_LOCKINIT(bp);
LIST_INIT(&bp->b_dep);
bp->b_rcred = bp->b_wcred = NOCRED;
bp->b_xflags = 0;
return (0);
}
/*
* Associate a p-buffer with a vnode.
*
* Also sets B_PAGING flag to indicate that vnode is not fully associated
* with the buffer. i.e. the bp has not been linked into the vnode or
* ref-counted.
*/
void
pbgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
KASSERT(bp->b_bufobj == NULL, ("pbgetvp: not free (bufobj)"));
bp->b_vp = vp;
bp->b_flags |= B_PAGING;
bp->b_bufobj = &vp->v_bufobj;
}
/*
 * Associate a p-buffer with a bufobj.
 *
 * Also sets the B_PAGING flag to indicate that no vnode is fully associated
 * with the buffer, i.e. the bp has not been linked into a vnode or
 * ref-counted.
*/
void
pbgetbo(struct bufobj *bo, struct buf *bp)
{
KASSERT(bp->b_vp == NULL, ("pbgetbo: not free (vnode)"));
KASSERT(bp->b_bufobj == NULL, ("pbgetbo: not free (bufobj)"));
bp->b_flags |= B_PAGING;
bp->b_bufobj = bo;
}
/*
* Disassociate a p-buffer from a vnode.
*/
void
pbrelvp(struct buf *bp)
{
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj"));
KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
("pbrelvp: pager buf on vnode list."));
bp->b_vp = NULL;
bp->b_bufobj = NULL;
bp->b_flags &= ~B_PAGING;
}
/*
* Disassociate a p-buffer from a bufobj.
*/
void
pbrelbo(struct buf *bp)
{
KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode"));
KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj"));
KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
("pbrelbo: pager buf on vnode list."));
bp->b_bufobj = NULL;
bp->b_flags &= ~B_PAGING;
}
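Taken together, the pbget*/pbrel* helpers above bracket the usual synchronous pbuf I/O pattern. A minimal sketch, reusing the hypothetical example_pbuf_zone from the earlier sketch and modeled on vnode_pager_input_smlfs() later in this review; credential and runningbufspace accounting are elided here, see that function for the complete sequence.

/* Sketch: synchronous read of one block through a pbuf. */
static int
example_pbuf_read(struct bufobj *bo, daddr_t blkno, caddr_t data, long size)
{
	struct buf *bp;
	int error;

	bp = uma_zalloc(example_pbuf_zone, M_WAITOK);
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = bdone;
	bp->b_data = data;
	bp->b_blkno = blkno;
	pbgetbo(bo, bp);			/* associate with the bufobj */
	bp->b_bcount = bp->b_bufsize = size;
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);
	bwait(bp, PVM, "exrd");			/* sleep until bdone() */
	error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0;
	pbrelbo(bp);				/* disassociate */
	uma_zfree(example_pbuf_zone, bp);
	return (error);
}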
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index a8f92eac59e3..45a5a5654806 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -1,248 +1,254 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
* $FreeBSD$
*/
/*
* Pager routine interface definition.
*/
#ifndef _VM_PAGER_
#define _VM_PAGER_
#include <sys/queue.h>
TAILQ_HEAD(pagerlst, vm_object);
typedef void pgo_init_t(void);
typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t,
struct ucred *);
typedef void pgo_dealloc_t(vm_object_t);
typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *);
typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int);
typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *,
pgo_getpages_iodone_t, void *);
typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *);
typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *);
typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t,
vm_pindex_t *, vm_pindex_t *);
typedef void pgo_pageunswapped_t(vm_page_t);
typedef void pgo_writecount_t(vm_object_t, vm_offset_t, vm_offset_t);
struct pagerops {
pgo_init_t *pgo_init; /* Initialize pager. */
pgo_alloc_t *pgo_alloc; /* Allocate pager. */
pgo_dealloc_t *pgo_dealloc; /* Disassociate. */
pgo_getpages_t *pgo_getpages; /* Get (read) page. */
pgo_getpages_async_t *pgo_getpages_async; /* Get page asyncly. */
pgo_putpages_t *pgo_putpages; /* Put (write) page. */
pgo_haspage_t *pgo_haspage; /* Query page. */
pgo_populate_t *pgo_populate; /* Bulk spec pagein. */
pgo_pageunswapped_t *pgo_pageunswapped;
/* Operations for specialized writecount handling */
pgo_writecount_t *pgo_update_writecount;
pgo_writecount_t *pgo_release_writecount;
};
extern struct pagerops defaultpagerops;
extern struct pagerops swappagerops;
extern struct pagerops vnodepagerops;
extern struct pagerops devicepagerops;
extern struct pagerops physpagerops;
extern struct pagerops sgpagerops;
extern struct pagerops mgtdevicepagerops;
/*
* get/put return values
* OK operation was successful
* BAD specified data was out of the accepted range
* FAIL specified data was in range, but doesn't exist
* PEND operations was initiated but not completed
* ERROR error while accessing data that is in range and exists
* AGAIN temporary resource shortage prevented operation from happening
*/
#define VM_PAGER_OK 0
#define VM_PAGER_BAD 1
#define VM_PAGER_FAIL 2
#define VM_PAGER_PEND 3
#define VM_PAGER_ERROR 4
#define VM_PAGER_AGAIN 5
#define VM_PAGER_PUT_SYNC 0x0001
#define VM_PAGER_PUT_INVAL 0x0002
#define VM_PAGER_PUT_NOREUSE 0x0004
#define VM_PAGER_CLUSTER_OK 0x0008
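As a hedged illustration of how a consumer might interpret the get/put return codes documented above, a hypothetical errno-style translation (illustrative only, not a mapping defined by the VM system):

/* Illustrative only: fold pager return codes into errno-style values. */
static int
example_pager_errno(int rv)
{

	switch (rv) {
	case VM_PAGER_OK:
	case VM_PAGER_PEND:	/* I/O was started; completion comes later */
		return (0);
	case VM_PAGER_FAIL:	/* in range, but nothing backs the data */
		return (ENOENT);
	case VM_PAGER_AGAIN:	/* temporary resource shortage; retry */
		return (EAGAIN);
	case VM_PAGER_BAD:	/* out of the accepted range */
	case VM_PAGER_ERROR:
	default:
		return (EIO);
	}
}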
#ifdef _KERNEL
extern struct pagerops *pagertab[];
extern struct mtx_padalign pbuf_mtx;
+/*
+ * Number of pages that a pbuf can store in its b_pages array.
+ * It is +1 to allow for an unaligned data buffer of maxphys size.
+ */
+#define PBUF_PAGES (atop(maxphys) + 1)
+
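A quick worked example of the +1, as a standalone userland sketch that assumes 4 KB pages and a 1 MB maxphys purely for illustration: a transfer of maxphys bytes whose start address is not page aligned straddles one more page than an aligned transfer, so the pbuf needs room for atop(maxphys) + 1 page pointers.

#include <stdio.h>

#define EX_PAGE_SIZE	4096UL			/* assumed page size */
#define EX_MAXPHYS	(1024UL * 1024UL)	/* assumed maxphys (1 MB) */

int
main(void)
{
	unsigned long start = 0x1800;		/* not page aligned */
	unsigned long end = start + EX_MAXPHYS;
	unsigned long first_pg = start / EX_PAGE_SIZE;
	unsigned long last_pg = (end - 1) / EX_PAGE_SIZE;

	/* Prints 256 pages for the aligned case and 257 for the unaligned one. */
	printf("aligned:   %lu pages\n", EX_MAXPHYS / EX_PAGE_SIZE);
	printf("unaligned: %lu pages\n", last_pg - first_pg + 1);
	return (0);
}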
vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
void vm_pager_bufferinit(void);
void vm_pager_deallocate(vm_object_t);
int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *);
int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *,
pgo_getpages_iodone_t, void *);
void vm_pager_init(void);
vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
static __inline void
vm_pager_put_pages(
vm_object_t object,
vm_page_t *m,
int count,
int flags,
int *rtvals
) {
VM_OBJECT_ASSERT_WLOCKED(object);
(*pagertab[object->type]->pgo_putpages)
(object, m, count, flags, rtvals);
}
/*
* vm_pager_haspage
*
* Check to see if an object's pager has the requested page. The
* object's pager will also set before and after to give the caller
* some idea of the number of pages before and after the requested
* page can be I/O'd efficiently.
*
* The object must be locked.
*/
static __inline boolean_t
vm_pager_has_page(
vm_object_t object,
vm_pindex_t offset,
int *before,
int *after
) {
boolean_t ret;
VM_OBJECT_ASSERT_LOCKED(object);
ret = (*pagertab[object->type]->pgo_haspage)
(object, offset, before, after);
return (ret);
}
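A small hedged sketch of using this query (the wrapper name is hypothetical): the caller takes the object lock, asks whether the page is backed, and learns how many neighbouring pages could be transferred in the same I/O.

/* Sketch: probe backing store for a page and its clusterable neighbours. */
static boolean_t
example_probe(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
{
	boolean_t backed;

	VM_OBJECT_RLOCK(object);
	backed = vm_pager_has_page(object, pindex, before, after);
	VM_OBJECT_RUNLOCK(object);
	return (backed);
}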
static __inline int
vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
MPASS((object->flags & OBJ_POPULATE) != 0);
MPASS(pidx < object->size);
MPASS(blockcount_read(&object->paging_in_progress) > 0);
return ((*pagertab[object->type]->pgo_populate)(object, pidx,
fault_type, max_prot, first, last));
}
/*
* vm_pager_page_unswapped
*
* Destroy swap associated with the page.
*
* XXX: A much better name would be "vm_pager_page_dirtied()"
* XXX: It is not obvious if this could be profitably used by any
* XXX: pagers besides the swap_pager or if it should even be a
* XXX: generic pager_op in the first place.
*/
static __inline void
vm_pager_page_unswapped(vm_page_t m)
{
if (pagertab[m->object->type]->pgo_pageunswapped)
(*pagertab[m->object->type]->pgo_pageunswapped)(m);
}
static __inline void
vm_pager_update_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
if (pagertab[object->type]->pgo_update_writecount)
pagertab[object->type]->pgo_update_writecount(object, start,
end);
}
static __inline void
vm_pager_release_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
if (pagertab[object->type]->pgo_release_writecount)
pagertab[object->type]->pgo_release_writecount(object, start,
end);
}
struct cdev_pager_ops {
int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset,
int prot, vm_page_t *mres);
int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
vm_pindex_t *last);
int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color);
void (*cdev_pg_dtor)(void *handle);
};
vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp,
struct cdev_pager_ops *ops, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred);
vm_object_t cdev_pager_lookup(void *handle);
void cdev_pager_free_page(vm_object_t object, vm_page_t m);
struct phys_pager_ops {
int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count,
int *rbehind, int *rahead);
int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
vm_pindex_t *last);
boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex,
int *before, int *after);
void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred);
void (*phys_pg_dtor)(vm_object_t vm_obj);
};
extern struct phys_pager_ops default_phys_pg_ops;
vm_object_t phys_pager_allocate(void *handle, struct phys_pager_ops *ops,
void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff,
struct ucred *cred);
#endif /* _KERNEL */
#endif /* _VM_PAGER_ */
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 0c67a3785ea2..e75c6fb6b5d7 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -1,1603 +1,1604 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 1993, 1994 John S. Dyson
* Copyright (c) 1995, David Greenman
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
*/
/*
* Page to/from files (vnodes).
*/
/*
* TODO:
* Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
* greatly re-simplify the vnode_pager.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/conf.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/domainset.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
daddr_t *rtaddress, int *run);
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
int *, vop_getpages_iodone_t, void *);
static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *cred);
static int vnode_pager_generic_getpages_done(struct buf *);
static void vnode_pager_generic_getpages_done_async(struct buf *);
static void vnode_pager_update_writecount(vm_object_t, vm_offset_t,
vm_offset_t);
static void vnode_pager_release_writecount(vm_object_t, vm_offset_t,
vm_offset_t);
struct pagerops vnodepagerops = {
.pgo_alloc = vnode_pager_alloc,
.pgo_dealloc = vnode_pager_dealloc,
.pgo_getpages = vnode_pager_getpages,
.pgo_getpages_async = vnode_pager_getpages_async,
.pgo_putpages = vnode_pager_putpages,
.pgo_haspage = vnode_pager_haspage,
.pgo_update_writecount = vnode_pager_update_writecount,
.pgo_release_writecount = vnode_pager_release_writecount,
};
static struct domainset *vnode_domainset = NULL;
SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset,
CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RW, &vnode_domainset, 0,
sysctl_handle_domainset, "A", "Default vnode NUMA policy");
static int nvnpbufs;
SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&nvnpbufs, 0, "number of physical buffers allocated for vnode pager");
static uma_zone_t vnode_pbuf_zone;
static void
vnode_pager_init(void *dummy)
{
#ifdef __LP64__
nvnpbufs = nswbuf * 2;
#else
nvnpbufs = nswbuf / 2;
#endif
TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs);
vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs);
}
SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL);
/* Create the VM system backing object for this vnode */
int
vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
{
vm_object_t object;
vm_ooffset_t size = isize;
struct vattr va;
bool last;
if (!vn_isdisk(vp) && vn_canvmio(vp) == FALSE)
return (0);
object = vp->v_object;
if (object != NULL)
return (0);
if (size == 0) {
if (vn_isdisk(vp)) {
size = IDX_TO_OFF(INT_MAX);
} else {
if (VOP_GETATTR(vp, &va, td->td_ucred))
return (0);
size = va.va_size;
}
}
object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred);
/*
* Dereference the reference we just created. This assumes
* that the object is associated with the vp. We still have
* to serialize with vnode_pager_dealloc() for the last
* potential reference.
*/
VM_OBJECT_RLOCK(object);
last = refcount_release(&object->ref_count);
VM_OBJECT_RUNLOCK(object);
if (last)
vrele(vp);
KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
return (0);
}
void
vnode_destroy_vobject(struct vnode *vp)
{
struct vm_object *obj;
obj = vp->v_object;
if (obj == NULL || obj->handle != vp)
return;
ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
VM_OBJECT_WLOCK(obj);
MPASS(obj->type == OBJT_VNODE);
umtx_shm_object_terminated(obj);
if (obj->ref_count == 0) {
KASSERT((obj->flags & OBJ_DEAD) == 0,
("vnode_destroy_vobject: Terminating dead object"));
vm_object_set_flag(obj, OBJ_DEAD);
/*
* Clean pages and flush buffers.
*/
vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
VM_OBJECT_WUNLOCK(obj);
vinvalbuf(vp, V_SAVE, 0, 0);
BO_LOCK(&vp->v_bufobj);
vp->v_bufobj.bo_flag |= BO_DEAD;
BO_UNLOCK(&vp->v_bufobj);
VM_OBJECT_WLOCK(obj);
vm_object_terminate(obj);
} else {
/*
* Woe to the process that tries to page now :-).
*/
vm_pager_deallocate(obj);
VM_OBJECT_WUNLOCK(obj);
}
KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
}
/*
* Allocate (or lookup) pager for a vnode.
* Handle is a vnode pointer.
*/
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset, struct ucred *cred)
{
vm_object_t object;
struct vnode *vp;
/*
* Pageout to vnode, no can do yet.
*/
if (handle == NULL)
return (NULL);
vp = (struct vnode *)handle;
ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
VNPASS(vp->v_usecount > 0, vp);
retry:
object = vp->v_object;
if (object == NULL) {
/*
* Add an object of the appropriate size
*/
object = vm_object_allocate(OBJT_VNODE,
OFF_TO_IDX(round_page(size)));
object->un_pager.vnp.vnp_size = size;
object->un_pager.vnp.writemappings = 0;
object->domain.dr_policy = vnode_domainset;
object->handle = handle;
if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) {
VM_OBJECT_WLOCK(object);
vm_object_set_flag(object, OBJ_SIZEVNLOCK);
VM_OBJECT_WUNLOCK(object);
}
VI_LOCK(vp);
if (vp->v_object != NULL) {
/*
* Object has been created while we were allocating.
*/
VI_UNLOCK(vp);
VM_OBJECT_WLOCK(object);
KASSERT(object->ref_count == 1,
("leaked ref %p %d", object, object->ref_count));
object->type = OBJT_DEAD;
refcount_init(&object->ref_count, 0);
VM_OBJECT_WUNLOCK(object);
vm_object_destroy(object);
goto retry;
}
vp->v_object = object;
VI_UNLOCK(vp);
vrefact(vp);
} else {
vm_object_reference(object);
#if VM_NRESERVLEVEL > 0
if ((object->flags & OBJ_COLORED) == 0) {
VM_OBJECT_WLOCK(object);
vm_object_color(object, 0);
VM_OBJECT_WUNLOCK(object);
}
#endif
}
return (object);
}
/*
* The object must be locked.
*/
static void
vnode_pager_dealloc(vm_object_t object)
{
struct vnode *vp;
int refs;
vp = object->handle;
if (vp == NULL)
panic("vnode_pager_dealloc: pager already dealloced");
VM_OBJECT_ASSERT_WLOCKED(object);
vm_object_pip_wait(object, "vnpdea");
refs = object->ref_count;
object->handle = NULL;
object->type = OBJT_DEAD;
ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
if (object->un_pager.vnp.writemappings > 0) {
object->un_pager.vnp.writemappings = 0;
VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
__func__, vp, vp->v_writecount);
}
vp->v_object = NULL;
VI_LOCK(vp);
/*
* vm_map_entry_set_vnode_text() cannot reach this vnode by
* following object->handle. Clear all text references now.
* This also clears the transient references from
* kern_execve(), which is fine because dead_vnodeops uses nop
* for VOP_UNSET_TEXT().
*/
if (vp->v_writecount < 0)
vp->v_writecount = 0;
VI_UNLOCK(vp);
VM_OBJECT_WUNLOCK(object);
if (refs > 0)
vunref(vp);
VM_OBJECT_WLOCK(object);
}
static boolean_t
vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
{
struct vnode *vp = object->handle;
daddr_t bn;
uintptr_t lockstate;
int err;
daddr_t reqblock;
int poff;
int bsize;
int pagesperblock, blocksperpage;
VM_OBJECT_ASSERT_LOCKED(object);
/*
* If no vp or vp is doomed or marked transparent to VM, we do not
* have the page.
*/
if (vp == NULL || VN_IS_DOOMED(vp))
return FALSE;
/*
* If the offset is beyond end of file we do
* not have the page.
*/
if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
return FALSE;
bsize = vp->v_mount->mnt_stat.f_iosize;
pagesperblock = bsize / PAGE_SIZE;
blocksperpage = 0;
if (pagesperblock > 0) {
reqblock = pindex / pagesperblock;
} else {
blocksperpage = (PAGE_SIZE / bsize);
reqblock = pindex * blocksperpage;
}
lockstate = VM_OBJECT_DROP(object);
err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
VM_OBJECT_PICKUP(object, lockstate);
if (err)
return TRUE;
if (bn == -1)
return FALSE;
if (pagesperblock > 0) {
poff = pindex - (reqblock * pagesperblock);
if (before) {
*before *= pagesperblock;
*before += poff;
}
if (after) {
/*
* The BMAP vop can report a partial block in the
* 'after', but must not report blocks after EOF.
* Assert the latter, and truncate 'after' in case
* of the former.
*/
KASSERT((reqblock + *after) * pagesperblock <
roundup2(object->size, pagesperblock),
("%s: reqblock %jd after %d size %ju", __func__,
(intmax_t )reqblock, *after,
(uintmax_t )object->size));
*after *= pagesperblock;
*after += pagesperblock - (poff + 1);
if (pindex + *after >= object->size)
*after = object->size - 1 - pindex;
}
} else {
if (before) {
*before /= blocksperpage;
}
if (after) {
*after /= blocksperpage;
}
}
return TRUE;
}
/*
* Lets the VM system know about a change in size for a file.
* We adjust our own internal size and flush any cached pages in
* the associated object that are affected by the size change.
*
* Note: this routine may be invoked as a result of a pager put
* operation (possibly at object termination time), so we must be careful.
*/
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
vm_object_t object;
vm_page_t m;
vm_pindex_t nobjsize;
if ((object = vp->v_object) == NULL)
return;
#ifdef DEBUG_VFS_LOCKS
{
struct mount *mp;
mp = vp->v_mount;
if (mp != NULL && (mp->mnt_kern_flag & MNTK_VMSETSIZE_BUG) == 0)
assert_vop_elocked(vp,
"vnode_pager_setsize and not locked vnode");
}
#endif
VM_OBJECT_WLOCK(object);
if (object->type == OBJT_DEAD) {
VM_OBJECT_WUNLOCK(object);
return;
}
KASSERT(object->type == OBJT_VNODE,
("not vnode-backed object %p", object));
if (nsize == object->un_pager.vnp.vnp_size) {
/*
* Hasn't changed size
*/
VM_OBJECT_WUNLOCK(object);
return;
}
nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
if (nsize < object->un_pager.vnp.vnp_size) {
/*
* File has shrunk. Toss any cached pages beyond the new EOF.
*/
if (nobjsize < object->size)
vm_object_page_remove(object, nobjsize, object->size,
0);
/*
* this gets rid of garbage at the end of a page that is now
* only partially backed by the vnode.
*
* XXX for some reason (I don't know yet), if we take a
* completely invalid page and mark it partially valid
* it can screw up NFS reads, so we don't allow the case.
*/
if (!(nsize & PAGE_MASK))
goto out;
m = vm_page_grab(object, OFF_TO_IDX(nsize), VM_ALLOC_NOCREAT);
if (m == NULL)
goto out;
if (!vm_page_none_valid(m)) {
int base = (int)nsize & PAGE_MASK;
int size = PAGE_SIZE - base;
/*
* Clear out partial-page garbage in case
* the page has been mapped.
*/
pmap_zero_page_area(m, base, size);
/*
* Update the valid bits to reflect the blocks that
* have been zeroed. Some of these valid bits may
* have already been set.
*/
vm_page_set_valid_range(m, base, size);
/*
* Round "base" to the next block boundary so that the
* dirty bit for a partially zeroed block is not
* cleared.
*/
base = roundup2(base, DEV_BSIZE);
/*
* Clear out partial-page dirty bits.
*
* note that we do not clear out the valid
* bits. This would prevent bogus_page
* replacement from working properly.
*/
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
}
vm_page_xunbusy(m);
}
out:
#if defined(__powerpc__) && !defined(__powerpc64__)
object->un_pager.vnp.vnp_size = nsize;
#else
atomic_store_64(&object->un_pager.vnp.vnp_size, nsize);
#endif
object->size = nobjsize;
VM_OBJECT_WUNLOCK(object);
}
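A hedged sketch of the caller side (the filesystem name and its truncation entry point are hypothetical): after a filesystem changes a file's length on disk, it notifies the VM layer so that cached pages beyond the new EOF are discarded and the partial last page is sanitized as described above.

/* Hypothetical filesystem truncation path (sketch). */
static int
examplefs_truncate(struct vnode *vp, off_t length)
{
	int error;

	error = 0;
	/* ... update the on-disk inode size to 'length' ... */
	if (error == 0)
		vnode_pager_setsize(vp, length);
	return (error);
}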
/*
* calculate the linear (byte) disk address of specified virtual
* file address
*/
static int
vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
int *run)
{
int bsize;
int err;
daddr_t vblock;
daddr_t voffset;
if (VN_IS_DOOMED(vp))
return -1;
bsize = vp->v_mount->mnt_stat.f_iosize;
vblock = address / bsize;
voffset = address % bsize;
err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
if (err == 0) {
if (*rtaddress != -1)
*rtaddress += voffset / DEV_BSIZE;
if (run) {
*run += 1;
*run *= bsize / PAGE_SIZE;
*run -= voffset / PAGE_SIZE;
}
}
return (err);
}
/*
* small block filesystem vnode pager input
*/
static int
vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
{
struct vnode *vp;
struct bufobj *bo;
struct buf *bp;
struct sf_buf *sf;
daddr_t fileaddr;
vm_offset_t bsize;
vm_page_bits_t bits;
int error, i;
error = 0;
vp = object->handle;
if (VN_IS_DOOMED(vp))
return VM_PAGER_BAD;
bsize = vp->v_mount->mnt_stat.f_iosize;
VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
sf = sf_buf_alloc(m, 0);
for (i = 0; i < PAGE_SIZE / bsize; i++) {
vm_ooffset_t address;
bits = vm_page_bits(i * bsize, bsize);
if (m->valid & bits)
continue;
address = IDX_TO_OFF(m->pindex) + i * bsize;
if (address >= object->un_pager.vnp.vnp_size) {
fileaddr = -1;
} else {
error = vnode_pager_addr(vp, address, &fileaddr, NULL);
if (error)
break;
}
if (fileaddr != -1) {
bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
/* build a minimal buffer header */
bp->b_iocmd = BIO_READ;
bp->b_iodone = bdone;
KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
bp->b_rcred = crhold(curthread->td_ucred);
bp->b_wcred = crhold(curthread->td_ucred);
bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
bp->b_blkno = fileaddr;
pbgetbo(bo, bp);
bp->b_vp = vp;
bp->b_bcount = bsize;
bp->b_bufsize = bsize;
bp->b_runningbufspace = bp->b_bufsize;
atomic_add_long(&runningbufspace, bp->b_runningbufspace);
/* do the input */
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
bwait(bp, PVM, "vnsrd");
if ((bp->b_ioflags & BIO_ERROR) != 0) {
KASSERT(bp->b_error != 0,
("%s: buf error but b_error == 0\n", __func__));
error = bp->b_error;
}
/*
* free the buffer header back to the swap buffer pool
*/
bp->b_vp = NULL;
pbrelbo(bp);
uma_zfree(vnode_pbuf_zone, bp);
if (error)
break;
} else
bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
KASSERT((m->dirty & bits) == 0,
("vnode_pager_input_smlfs: page %p is dirty", m));
vm_page_bits_set(m, &m->valid, bits);
}
sf_buf_free(sf);
if (error) {
return VM_PAGER_ERROR;
}
return VM_PAGER_OK;
}
/*
* old style vnode pager input routine
*/
static int
vnode_pager_input_old(vm_object_t object, vm_page_t m)
{
struct uio auio;
struct iovec aiov;
int error;
int size;
struct sf_buf *sf;
struct vnode *vp;
VM_OBJECT_ASSERT_WLOCKED(object);
error = 0;
/*
* Return failure if beyond current EOF
*/
if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
return VM_PAGER_BAD;
} else {
size = PAGE_SIZE;
if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
vp = object->handle;
VM_OBJECT_WUNLOCK(object);
/*
* Allocate a kernel virtual address and initialize so that
* we can use VOP_READ/WRITE routines.
*/
sf = sf_buf_alloc(m, 0);
aiov.iov_base = (caddr_t)sf_buf_kva(sf);
aiov.iov_len = size;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = IDX_TO_OFF(m->pindex);
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_READ;
auio.uio_resid = size;
auio.uio_td = curthread;
error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
if (!error) {
int count = size - auio.uio_resid;
if (count == 0)
error = EINVAL;
else if (count != PAGE_SIZE)
bzero((caddr_t)sf_buf_kva(sf) + count,
PAGE_SIZE - count);
}
sf_buf_free(sf);
VM_OBJECT_WLOCK(object);
}
KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
if (!error)
vm_page_valid(m);
return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}
/*
* generic vnode pager input routine
*/
/*
 * Local media VFS's that do not implement their own VOP_GETPAGES
 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
 * to preserve the previous behaviour.
*
* All other FS's should use the bypass to get to the local media
* backing vp's VOP_GETPAGES.
*/
static int
vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
int *rahead)
{
struct vnode *vp;
int rtval;
/* Handle is stable with paging in progress. */
vp = object->handle;
rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: FS getpages not implemented\n"));
return rtval;
}
static int
vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
{
struct vnode *vp;
int rtval;
vp = object->handle;
rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: FS getpages_async not implemented\n"));
return (rtval);
}
/*
* The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
* local filesystems, where partially valid pages can only occur at
* the end of file.
*/
int
vnode_pager_local_getpages(struct vop_getpages_args *ap)
{
return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
ap->a_rbehind, ap->a_rahead, NULL, NULL));
}
int
vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
{
int error;
error = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
if (error != 0 && ap->a_iodone != NULL)
ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
return (error);
}
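To make the comment above concrete, a hedged sketch of the filesystem side ("examplefs" and its vop table are hypothetical): a local-media filesystem with no special paging needs can point its vnode operations at the local wrappers, which forward to vnode_pager_generic_getpages().

/* Hypothetical local-media filesystem glue (sketch). */
static struct vop_vector examplefs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	/* ... the filesystem's other VOPs ... */
};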
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_GETPAGES.
*/
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
{
vm_object_t object;
struct bufobj *bo;
struct buf *bp;
off_t foff;
#ifdef INVARIANTS
off_t blkno0;
#endif
int bsize, pagesperblock;
int error, before, after, rbehind, rahead, poff, i;
int bytecount, secmask;
KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
("%s does not support devices", __func__));
if (VN_IS_DOOMED(vp))
return (VM_PAGER_BAD);
object = vp->v_object;
foff = IDX_TO_OFF(m[0]->pindex);
bsize = vp->v_mount->mnt_stat.f_iosize;
pagesperblock = bsize / PAGE_SIZE;
KASSERT(foff < object->un_pager.vnp.vnp_size,
("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
- KASSERT(count <= nitems(bp->b_pages),
+ KASSERT(count <= atop(maxphys),
("%s: requested %d pages", __func__, count));
/*
* The last page has valid blocks. Invalid part can only
* exist at the end of file, and the page is made fully valid
* by zeroing in vm_pager_get_pages().
*/
if (!vm_page_none_valid(m[count - 1]) && --count == 0) {
if (iodone != NULL)
iodone(arg, m, 1, 0);
return (VM_PAGER_OK);
}
bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
+ MPASS((bp->b_flags & B_MAXPHYS) != 0);
/*
* Get the underlying device blocks for the file with VOP_BMAP().
* If the file system doesn't support VOP_BMAP, use old way of
* getting pages via VOP_READ.
*/
error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
if (error == EOPNOTSUPP) {
uma_zfree(vnode_pbuf_zone, bp);
VM_OBJECT_WLOCK(object);
for (i = 0; i < count; i++) {
VM_CNT_INC(v_vnodein);
VM_CNT_INC(v_vnodepgsin);
error = vnode_pager_input_old(object, m[i]);
if (error)
break;
}
VM_OBJECT_WUNLOCK(object);
return (error);
} else if (error != 0) {
uma_zfree(vnode_pbuf_zone, bp);
return (VM_PAGER_ERROR);
}
/*
* If the file system supports BMAP, but blocksize is smaller
* than a page size, then use special small filesystem code.
*/
if (pagesperblock == 0) {
uma_zfree(vnode_pbuf_zone, bp);
for (i = 0; i < count; i++) {
VM_CNT_INC(v_vnodein);
VM_CNT_INC(v_vnodepgsin);
error = vnode_pager_input_smlfs(object, m[i]);
if (error)
break;
}
return (error);
}
/*
* A sparse file can be encountered only for a single page request,
* which may not be preceded by call to vm_pager_haspage().
*/
if (bp->b_blkno == -1) {
KASSERT(count == 1,
("%s: array[%d] request to a sparse file %p", __func__,
count, vp));
uma_zfree(vnode_pbuf_zone, bp);
pmap_zero_page(m[0]);
KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
__func__, m[0]));
vm_page_valid(m[0]);
return (VM_PAGER_OK);
}
#ifdef INVARIANTS
blkno0 = bp->b_blkno;
#endif
bp->b_blkno += (foff % bsize) / DEV_BSIZE;
/* Recalculate blocks available after/before to pages. */
poff = (foff % bsize) / PAGE_SIZE;
before *= pagesperblock;
before += poff;
after *= pagesperblock;
after += pagesperblock - (poff + 1);
if (m[0]->pindex + after >= object->size)
after = object->size - 1 - m[0]->pindex;
KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
__func__, count, after + 1));
after -= count - 1;
/* Trim requested rbehind/rahead to possible values. */
rbehind = a_rbehind ? *a_rbehind : 0;
rahead = a_rahead ? *a_rahead : 0;
rbehind = min(rbehind, before);
rbehind = min(rbehind, m[0]->pindex);
rahead = min(rahead, after);
rahead = min(rahead, object->size - m[count - 1]->pindex);
/*
* Check that total amount of pages fit into buf. Trim rbehind and
* rahead evenly if not.
*/
- if (rbehind + rahead + count > nitems(bp->b_pages)) {
+ if (rbehind + rahead + count > atop(maxphys)) {
int trim, sum;
- trim = rbehind + rahead + count - nitems(bp->b_pages) + 1;
+ trim = rbehind + rahead + count - atop(maxphys) + 1;
sum = rbehind + rahead;
if (rbehind == before) {
/* Roundup rbehind trim to block size. */
rbehind -= roundup(trim * rbehind / sum, pagesperblock);
if (rbehind < 0)
rbehind = 0;
} else
rbehind -= trim * rbehind / sum;
rahead -= trim * rahead / sum;
}
- KASSERT(rbehind + rahead + count <= nitems(bp->b_pages),
- ("%s: behind %d ahead %d count %d", __func__,
- rbehind, rahead, count));
+ KASSERT(rbehind + rahead + count <= atop(maxphys),
+ ("%s: behind %d ahead %d count %d maxphys %lu", __func__,
+ rbehind, rahead, count, maxphys));
/*
* Fill in the bp->b_pages[] array with requested and optional
* read behind or read ahead pages. Read behind pages are looked
* up in a backward direction, down to a first cached page. Same
* for read ahead pages, but there is no need to shift the array
* in case of encountering a cached page.
*/
i = bp->b_npages = 0;
if (rbehind) {
vm_pindex_t startpindex, tpindex;
vm_page_t p;
VM_OBJECT_WLOCK(object);
startpindex = m[0]->pindex - rbehind;
if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
p->pindex >= startpindex)
startpindex = p->pindex + 1;
/* tpindex is unsigned; beware of numeric underflow. */
for (tpindex = m[0]->pindex - 1;
tpindex >= startpindex && tpindex < m[0]->pindex;
tpindex--, i++) {
p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
if (p == NULL) {
/* Shift the array. */
for (int j = 0; j < i; j++)
bp->b_pages[j] = bp->b_pages[j +
tpindex + 1 - startpindex];
break;
}
bp->b_pages[tpindex - startpindex] = p;
}
bp->b_pgbefore = i;
bp->b_npages += i;
bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
} else
bp->b_pgbefore = 0;
/* Requested pages. */
for (int j = 0; j < count; j++, i++)
bp->b_pages[i] = m[j];
bp->b_npages += count;
if (rahead) {
vm_pindex_t endpindex, tpindex;
vm_page_t p;
if (!VM_OBJECT_WOWNED(object))
VM_OBJECT_WLOCK(object);
endpindex = m[count - 1]->pindex + rahead + 1;
if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
p->pindex < endpindex)
endpindex = p->pindex;
if (endpindex > object->size)
endpindex = object->size;
for (tpindex = m[count - 1]->pindex + 1;
tpindex < endpindex; i++, tpindex++) {
p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
if (p == NULL)
break;
bp->b_pages[i] = p;
}
bp->b_pgafter = i - bp->b_npages;
bp->b_npages = i;
} else
bp->b_pgafter = 0;
if (VM_OBJECT_WOWNED(object))
VM_OBJECT_WUNLOCK(object);
/* Report back actual behind/ahead read. */
if (a_rbehind)
*a_rbehind = bp->b_pgbefore;
if (a_rahead)
*a_rahead = bp->b_pgafter;
#ifdef INVARIANTS
- KASSERT(bp->b_npages <= nitems(bp->b_pages),
+ KASSERT(bp->b_npages <= atop(maxphys),
("%s: buf %p overflowed", __func__, bp));
for (int j = 1, prev = 0; j < bp->b_npages; j++) {
if (bp->b_pages[j] == bogus_page)
continue;
KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex ==
j - prev, ("%s: pages array not consecutive, bp %p",
__func__, bp));
prev = j;
}
#endif
/*
* Recalculate first offset and bytecount with regards to read behind.
* Truncate bytecount to vnode real size and round up physical size
* for real devices.
*/
foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
bytecount = bp->b_npages << PAGE_SHIFT;
if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
bytecount = object->un_pager.vnp.vnp_size - foff;
secmask = bo->bo_bsize - 1;
KASSERT(secmask < PAGE_SIZE && secmask > 0,
("%s: sector size %d too large", __func__, secmask + 1));
bytecount = (bytecount + secmask) & ~secmask;
/*
* And map the pages to be read into the kva, if the filesystem
* requires mapped buffers.
*/
if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
unmapped_buf_allowed) {
bp->b_data = unmapped_buf;
bp->b_offset = 0;
} else {
bp->b_data = bp->b_kvabase;
pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
}
/* Build a minimal buffer header. */
bp->b_iocmd = BIO_READ;
KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
bp->b_rcred = crhold(curthread->td_ucred);
bp->b_wcred = crhold(curthread->td_ucred);
pbgetbo(bo, bp);
bp->b_vp = vp;
bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
bp->b_iooffset = dbtob(bp->b_blkno);
KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) ==
(blkno0 - bp->b_blkno) * DEV_BSIZE +
IDX_TO_OFF(m[0]->pindex) % bsize,
("wrong offsets bsize %d m[0] %ju b_pages[0] %ju "
"blkno0 %ju b_blkno %ju", bsize,
(uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex,
(uintmax_t)blkno0, (uintmax_t)bp->b_blkno));
atomic_add_long(&runningbufspace, bp->b_runningbufspace);
VM_CNT_INC(v_vnodein);
VM_CNT_ADD(v_vnodepgsin, bp->b_npages);
if (iodone != NULL) { /* async */
bp->b_pgiodone = iodone;
bp->b_caller1 = arg;
bp->b_iodone = vnode_pager_generic_getpages_done_async;
bp->b_flags |= B_ASYNC;
BUF_KERNPROC(bp);
bstrategy(bp);
return (VM_PAGER_OK);
} else {
bp->b_iodone = bdone;
bstrategy(bp);
bwait(bp, PVM, "vnread");
error = vnode_pager_generic_getpages_done(bp);
for (i = 0; i < bp->b_npages; i++)
bp->b_pages[i] = NULL;
bp->b_vp = NULL;
pbrelbo(bp);
uma_zfree(vnode_pbuf_zone, bp);
return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
}
}
static void
vnode_pager_generic_getpages_done_async(struct buf *bp)
{
int error;
error = vnode_pager_generic_getpages_done(bp);
/* Run the iodone upon the requested range. */
bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore,
bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
for (int i = 0; i < bp->b_npages; i++)
bp->b_pages[i] = NULL;
bp->b_vp = NULL;
pbrelbo(bp);
uma_zfree(vnode_pbuf_zone, bp);
}
static int
vnode_pager_generic_getpages_done(struct buf *bp)
{
vm_object_t object;
off_t tfoff, nextoff;
int i, error;
KASSERT((bp->b_ioflags & BIO_ERROR) == 0 || bp->b_error != 0,
("%s: buf error but b_error == 0\n", __func__));
error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0;
object = bp->b_vp->v_object;
if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
if (!buf_mapped(bp)) {
bp->b_data = bp->b_kvabase;
pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
bp->b_npages);
}
bzero(bp->b_data + bp->b_bcount,
PAGE_SIZE * bp->b_npages - bp->b_bcount);
}
if (buf_mapped(bp)) {
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
bp->b_data = unmapped_buf;
}
/*
* If the read failed, we must free any read ahead/behind pages here.
* The requested pages are freed by the caller (for sync requests)
* or by the bp->b_pgiodone callback (for async requests).
*/
if (error != 0) {
VM_OBJECT_WLOCK(object);
for (i = 0; i < bp->b_pgbefore; i++)
vm_page_free_invalid(bp->b_pages[i]);
for (i = bp->b_npages - bp->b_pgafter; i < bp->b_npages; i++)
vm_page_free_invalid(bp->b_pages[i]);
VM_OBJECT_WUNLOCK(object);
return (error);
}
/* Read lock to protect size. */
VM_OBJECT_RLOCK(object);
for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
i < bp->b_npages; i++, tfoff = nextoff) {
vm_page_t mt;
nextoff = tfoff + PAGE_SIZE;
mt = bp->b_pages[i];
if (mt == bogus_page)
continue;
if (nextoff <= object->un_pager.vnp.vnp_size) {
/*
* Read filled up entire page.
*/
vm_page_valid(mt);
KASSERT(mt->dirty == 0,
("%s: page %p is dirty", __func__, mt));
KASSERT(!pmap_page_is_mapped(mt),
("%s: page %p is mapped", __func__, mt));
} else {
/*
* Read did not fill up entire page.
*
* Currently we do not set the entire page valid,
* we just try to clear the piece that we couldn't
* read.
*/
vm_page_set_valid_range(mt, 0,
object->un_pager.vnp.vnp_size - tfoff);
KASSERT((mt->dirty & vm_page_bits(0,
object->un_pager.vnp.vnp_size - tfoff)) == 0,
("%s: page %p is dirty", __func__, mt));
}
if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
vm_page_readahead_finish(mt);
}
VM_OBJECT_RUNLOCK(object);
return (error);
}
/*
* EOPNOTSUPP is no longer legal. For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to preserve the previous behaviour.
*
* All other FS's should use the bypass to get to the local media
* backing vp's VOP_PUTPAGES.
*/
static void
vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
int flags, int *rtvals)
{
int rtval;
struct vnode *vp;
int bytes = count * PAGE_SIZE;
/*
* Force synchronous operation if we are extremely low on memory
* to prevent a low-memory deadlock. VOP operations often need to
* allocate more memory to initiate the I/O ( i.e. do a BMAP
* operation ). The swapper handles the case by limiting the amount
* of asynchronous I/O, but that sort of solution doesn't scale well
* for the vnode pager without a lot of work.
*
* Also, the backing vnode's iodone routine may not wake the pageout
 * daemon up. This should probably be addressed. XXX
*/
if (vm_page_count_min())
flags |= VM_PAGER_PUT_SYNC;
/*
* Call device-specific putpages function
*/
vp = object->handle;
VM_OBJECT_WUNLOCK(object);
rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: stale FS putpages\n"));
VM_OBJECT_WLOCK(object);
}
static int
vn_off2bidx(vm_ooffset_t offset)
{
return ((offset & PAGE_MASK) / DEV_BSIZE);
}
static bool
vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
{
KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
offset < IDX_TO_OFF(m->pindex + 1),
("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
(uintmax_t)offset));
return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
}
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_PUTPAGES.
*
* This is typically called indirectly via the pageout daemon and
* clustering has already typically occurred, so in general we ask the
* underlying filesystem to write the data out asynchronously rather
 * than delayed.
*/
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
int flags, int *rtvals)
{
vm_object_t object;
vm_page_t m;
vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
struct uio auio;
struct iovec aiov;
off_t prev_resid, wrsz;
int count, error, i, maxsize, ncount, pgoff, ppscheck;
bool in_hole;
static struct timeval lastfail;
static int curfail;
object = vp->v_object;
count = bytecount / PAGE_SIZE;
for (i = 0; i < count; i++)
rtvals[i] = VM_PAGER_ERROR;
if ((int64_t)ma[0]->pindex < 0) {
printf("vnode_pager_generic_putpages: "
"attempt to write meta-data 0x%jx(%lx)\n",
(uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
rtvals[0] = VM_PAGER_BAD;
return (VM_PAGER_BAD);
}
maxsize = count * PAGE_SIZE;
ncount = count;
poffset = IDX_TO_OFF(ma[0]->pindex);
/*
 * If the page-aligned write is larger than the actual file we
* have to invalidate pages occurring beyond the file EOF. However,
 * there is an edge case where a file may not be page-aligned and
* the last page is partially invalid. In this case the filesystem
* may not properly clear the dirty bits for the entire page (which
* could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
* With the page busied we are free to fix up the dirty bits here.
*
* We do not under any circumstances truncate the valid bits, as
* this will screw up bogus page replacement.
*/
VM_OBJECT_RLOCK(object);
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > poffset) {
maxsize = object->un_pager.vnp.vnp_size - poffset;
ncount = btoc(maxsize);
if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
pgoff = roundup2(pgoff, DEV_BSIZE);
/*
* If the page is busy and the following
* conditions hold, then the page's dirty
* field cannot be concurrently changed by a
* pmap operation.
*/
m = ma[ncount - 1];
vm_page_assert_sbusied(m);
KASSERT(!pmap_page_is_write_mapped(m),
("vnode_pager_generic_putpages: page %p is not read-only", m));
MPASS(m->dirty != 0);
vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
pgoff);
}
} else {
maxsize = 0;
ncount = 0;
}
for (i = ncount; i < count; i++)
rtvals[i] = VM_PAGER_BAD;
}
VM_OBJECT_RUNLOCK(object);
auio.uio_iov = &aiov;
auio.uio_segflg = UIO_NOCOPY;
auio.uio_rw = UIO_WRITE;
auio.uio_td = NULL;
maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
for (prev_offset = poffset; prev_offset < maxblksz;) {
/* Skip clean blocks. */
for (in_hole = true; in_hole && prev_offset < maxblksz;) {
m = ma[OFF_TO_IDX(prev_offset - poffset)];
for (i = vn_off2bidx(prev_offset);
i < sizeof(vm_page_bits_t) * NBBY &&
prev_offset < maxblksz; i++) {
if (vn_dirty_blk(m, prev_offset)) {
in_hole = false;
break;
}
prev_offset += DEV_BSIZE;
}
}
if (in_hole)
goto write_done;
/* Find longest run of dirty blocks. */
for (next_offset = prev_offset; next_offset < maxblksz;) {
m = ma[OFF_TO_IDX(next_offset - poffset)];
for (i = vn_off2bidx(next_offset);
i < sizeof(vm_page_bits_t) * NBBY &&
next_offset < maxblksz; i++) {
if (!vn_dirty_blk(m, next_offset))
goto start_write;
next_offset += DEV_BSIZE;
}
}
start_write:
if (next_offset > poffset + maxsize)
next_offset = poffset + maxsize;
/*
* Getting here requires finding a dirty block in the
* 'skip clean blocks' loop.
*/
MPASS(prev_offset < next_offset);
aiov.iov_base = NULL;
auio.uio_iovcnt = 1;
auio.uio_offset = prev_offset;
prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
prev_offset;
error = VOP_WRITE(vp, &auio,
vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
wrsz = prev_resid - auio.uio_resid;
if (wrsz == 0) {
if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
vn_printf(vp, "vnode_pager_putpages: "
"zero-length write at %ju resid %zd\n",
auio.uio_offset, auio.uio_resid);
}
break;
}
/* Adjust the starting offset for next iteration. */
prev_offset += wrsz;
MPASS(auio.uio_offset == prev_offset);
ppscheck = 0;
if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
&curfail, 1)) != 0)
vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
error);
if (auio.uio_resid != 0 && (ppscheck != 0 ||
ppsratecheck(&lastfail, &curfail, 1) != 0))
vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
"at %ju\n", auio.uio_resid,
(uintmax_t)ma[0]->pindex);
if (error != 0 || auio.uio_resid != 0)
break;
}
write_done:
/* Mark completely processed pages. */
for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
rtvals[i] = VM_PAGER_OK;
/* Mark partial EOF page. */
if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
rtvals[i++] = VM_PAGER_OK;
/* Unwritten pages in range, free bonus if the page is clean. */
for (; i < ncount; i++)
rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
VM_CNT_ADD(v_vnodepgsout, i);
VM_CNT_INC(v_vnodeout);
return (rtvals[0]);
}
int
vnode_pager_putpages_ioflags(int pager_flags)
{
int ioflags;
/*
* Pageouts are already clustered, use IO_ASYNC to force a
 * bawrite() rather than a bdwrite() to prevent paging I/O
* from saturating the buffer cache. Dummy-up the sequential
* heuristic to cause large ranges to cluster. If neither
 * IO_SYNC nor IO_ASYNC is set, the system decides how to
* cluster.
*/
ioflags = IO_VMIO;
if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
ioflags |= IO_SYNC;
else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
ioflags |= IO_ASYNC;
ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
ioflags |= IO_SEQMAX << IO_SEQSHIFT;
return (ioflags);
}
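A short illustration of the resulting flag combinations; the values below follow directly from the function above and assume nothing beyond it.

/*
 * Examples derived from vnode_pager_putpages_ioflags():
 *
 *   pager_flags == 0
 *	-> IO_VMIO | IO_ASYNC | (IO_SEQMAX << IO_SEQSHIFT)
 *   pager_flags == VM_PAGER_CLUSTER_OK
 *	-> IO_VMIO | (IO_SEQMAX << IO_SEQSHIFT)		(system chooses)
 *   pager_flags == VM_PAGER_PUT_SYNC
 *	-> IO_VMIO | IO_SYNC | (IO_SEQMAX << IO_SEQSHIFT)
 *   pager_flags == VM_PAGER_PUT_INVAL
 *	-> IO_VMIO | IO_SYNC | IO_INVAL | (IO_SEQMAX << IO_SEQSHIFT)
 */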
/*
* vnode_pager_undirty_pages().
*
* A helper to mark pages as clean after pageout that was possibly
* done with a short write. The lpos argument specifies the page run
* length in bytes, and the written argument specifies how many bytes
* were actually written. eof is the offset past the last valid byte
* in the vnode using the absolute file position of the first byte in
* the run as the base from which it is computed.
*/
void
vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
int lpos)
{
vm_object_t obj;
int i, pos, pos_devb;
if (written == 0 && eof >= lpos)
return;
obj = ma[0]->object;
for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
if (pos < trunc_page(written)) {
rtvals[i] = VM_PAGER_OK;
vm_page_undirty(ma[i]);
} else {
/* Partially written page. */
rtvals[i] = VM_PAGER_AGAIN;
vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
}
}
if (eof >= lpos) /* avoid truncation */
return;
for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
if (pos != trunc_page(pos)) {
/*
* The page contains the last valid byte in
* the vnode, mark the rest of the page as
* clean, potentially making the whole page
* clean.
*/
pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
pos_devb);
/*
* If the page was cleaned, report the pageout
* on it as successful. msync() no longer
* needs to write out the page, endlessly
* creating write requests and dirty buffers.
*/
if (ma[i]->dirty == 0)
rtvals[i] = VM_PAGER_OK;
pos = round_page(pos);
} else {
/* vm_pageout_flush() clears dirty */
rtvals[i] = VM_PAGER_BAD;
pos += PAGE_SIZE;
}
}
}
static void
vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
struct vnode *vp;
vm_ooffset_t old_wm;
VM_OBJECT_WLOCK(object);
if (object->type != OBJT_VNODE) {
VM_OBJECT_WUNLOCK(object);
return;
}
old_wm = object->un_pager.vnp.writemappings;
object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
vp = object->handle;
if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
ASSERT_VOP_LOCKED(vp, "v_writecount inc");
VOP_ADD_WRITECOUNT_CHECKED(vp, 1);
CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
__func__, vp, vp->v_writecount);
} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
ASSERT_VOP_LOCKED(vp, "v_writecount dec");
VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
__func__, vp, vp->v_writecount);
}
VM_OBJECT_WUNLOCK(object);
}
static void
vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end)
{
struct vnode *vp;
struct mount *mp;
vm_offset_t inc;
VM_OBJECT_WLOCK(object);
/*
* First, recheck the object type to account for the race when
* the vnode is reclaimed.
*/
if (object->type != OBJT_VNODE) {
VM_OBJECT_WUNLOCK(object);
return;
}
/*
* Optimize for the case when writemappings is not going to
* zero.
*/
inc = end - start;
if (object->un_pager.vnp.writemappings != inc) {
object->un_pager.vnp.writemappings -= inc;
VM_OBJECT_WUNLOCK(object);
return;
}
vp = object->handle;
vhold(vp);
VM_OBJECT_WUNLOCK(object);
mp = NULL;
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Decrement the object's writemappings, by swapping the start
* and end arguments for vnode_pager_update_writecount(). If
* there was not a race with vnode reclaimation, then the
* vnode's v_writecount is decremented.
*/
vnode_pager_update_writecount(object, end, start);
VOP_UNLOCK(vp);
vdrop(vp);
if (mp != NULL)
vn_finished_write(mp);
}
